/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2017 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#define INCLUDE_STRING
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "params.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest.h"
#include "selftest-rtl.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC:
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type {
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};

struct aarch64_address_info {
  enum aarch64_address_type type;
  rtx base;
  rtx offset;
  int shift;
  enum aarch64_symbol_type symbol_type;
};

struct simd_immediate_info
{
  rtx value;
  int shift;
  int element_width;
  bool mvn;
  bool msl;
};

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
						 const unsigned char *sel);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};

/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
    {
      0, /* hi  */
      0, /* si  */
      0, /* di  */
      0, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_addrcost_table cortexa57_addrcost_table =
{
    {
      1, /* hi  */
      0, /* si  */
      0, /* di  */
      1, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
    {
      0, /* hi  */
      0, /* si  */
      0, /* di  */
      2, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
    {
      1, /* hi  */
      0, /* si  */
      0, /* di  */
      1, /* ti  */
    },
  1, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
    {
      1, /* hi  */
      0, /* si  */
      0, /* di  */
      1, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
    {
      1, /* hi  */
      1, /* si  */
      1, /* di  */
      2, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_regmove_cost generic_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
  9, /* GP2FP  */
  9, /* FP2GP  */
  1 /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
  2, /* GP2GP  */
  2, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  8, /* GP2FP  */
  8, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  2, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  6, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  8, /* GP2FP  */
  8, /* FP2GP  */
  4 /* FP2FP  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  10, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Cortex-A57.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for X-Gene 1.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  6, /* scalar_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  6, /* vec_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  2, /* Predictable.  */
  2 /* Unpredictable.  */
};

/* Branch costs for Cortex-A57.  */
static const struct cpu_branch_cost cortexa57_branch_cost =
{
  1, /* Predictable.  */
  3 /* Unpredictable.  */
};

/* Branch costs for Vulcan.  */
static const struct cpu_branch_cost thunderx2t99_branch_cost =
{
  1, /* Predictable.  */
  3 /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_NONE, /* sqrt  */
  AARCH64_APPROX_NONE /* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_ALL, /* sqrt  */
  AARCH64_APPROX_ALL /* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_NONE, /* sqrt  */
  AARCH64_APPROX_ALL /* recip_sqrt  */
};

static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  2, /* issue_rate  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  8, /* function_align.  */
  8, /* jump_align.  */
  4, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  1, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  8, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  2, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  8, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  8, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  8, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  2, /* issue_rate.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  8, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  4, /* function_align.  */
  4, /* jump_align.  */
  4, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  48, /* max_case_values.  */
  64, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  6, /* memmov_cost  */
  2, /* issue_rate  */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8, /* function_align.  */
  8, /* jump_align.  */
  8, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags.  */
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  6, /* memmov_cost  */
  4, /* issue_rate  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  16, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  0, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  4, /* issue_rate  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16, /* function_align.  */
  8, /* jump_align.  */
  16, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  64, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &thunderx2t99_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  AARCH64_FUSE_NOTHING, /* fusible_ops.  */
  16, /* function_align.  */
  8, /* jump_align.  */
  16, /* loop_align.  */
  3, /* int_reassoc_width.  */
  2, /* fp_reassoc_width.  */
  2, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  0, /* max_case_values.  */
  64, /* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
};

/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};

/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}

void
aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
{
  const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
  if (TARGET_GENERAL_REGS_ONLY)
    error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
  else
    error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
   the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
   used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
   cost (in this case the best class is the lowest cost one).  Using ALL_REGS
   irrespective of its cost results in bad allocations with many redundant
   int<->FP moves which are expensive on various cores.
   To avoid this we don't allow ALL_REGS as the allocno class, but force a
   decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
   isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
   Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  enum machine_mode mode;

  if (allocno_class != ALL_REGS)
    return allocno_class;

  if (best_class != ALL_REGS)
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}

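/* Return the number of divisions by the same divisor that must occur before
   replacing the division with a multiplication by the reciprocal becomes
   worthwhile, as selected by the current tuning parameters (single-precision
   for 4-byte units, double-precision otherwise).  */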
static unsigned int
aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

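/* Return the reassociation width to use for operations in MODE, taken from
   the current tuning parameters: vector, integer or floating-point width as
   appropriate, and 1 otherwise.  */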
static int
aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
			     enum machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  if (FLOAT_MODE_P (mode))
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}

/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}

/* Return TRUE if MODE is any of the large INT modes.  */
static bool
aarch64_vect_struct_mode_p (machine_mode mode)
{
  return mode == OImode || mode == CImode || mode == XImode;
}

/* Return TRUE if MODE is any of the vector modes.  */
static bool
aarch64_vector_mode_p (machine_mode mode)
{
  return aarch64_vector_mode_supported_p (mode)
	 || aarch64_vect_struct_mode_p (mode);
}

/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}

/* Implement HARD_REGNO_NREGS.  */

int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
    default:
      return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
    }
  gcc_unreachable ();
}

/* Implement HARD_REGNO_MODE_OK.  */

int
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
    return 1;

  if (FP_REGNUM_P (regno))
    {
      if (aarch64_vect_struct_mode_p (mode))
	return
	  (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
      else
	return 1;
    }

  return 0;
}

/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
				     machine_mode mode)
{
  /* Handle modes that fit within single registers.  */
  if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
    {
      if (GET_MODE_SIZE (mode) >= 4)
	return mode;
      else
	return SImode;
    }
  /* Fall back to generic for multi-reg and very large modes.  */
  else
    return choose_hard_reg_mode (regno, nregs, false);
}

/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}

/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}

/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  return cc_reg;
}

/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  rtx sym, addend;

  if (GET_CODE (addr) == CONST)
    {
      split_const (addr, &sym, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}

/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add   dest, x1, :lo12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp
   mrs  tp, tpidr_el0
   add  dest, dest, tp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/

static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but we
	       are using the page base as GOT base, the first page may be
	       wasted, in the worst scenario, there is only 28K space
	       for GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate an initialization
	       insn for every global access, and allow CSE to remove all
	       redundant ones.

	       The final instruction sequence will look like the following
	       when accessing multiple global variables:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern changed, above code which calculate mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	rtx insn;
	rtx mem;
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    else
	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_TLSGD:
      {
	rtx_insn *insns;
	machine_mode mode = GET_MODE (dest);
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	start_sequence ();
	if (TARGET_ILP32)
	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	else
	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();
	end_sequence ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
	return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
      return;

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}

/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}

/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
	    }
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
	    }
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

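/* Return true if a 128-bit move from SRC to DST needs to be split; a direct
   FP-register to FP-register copy does not.  */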
bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}

/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
    {
      rtx (*gen) (rtx, rtx, rtx);

      switch (src_mode)
	{
	case V8QImode:
	  gen = gen_aarch64_simd_combinev8qi;
	  break;
	case V4HImode:
	  gen = gen_aarch64_simd_combinev4hi;
	  break;
	case V2SImode:
	  gen = gen_aarch64_simd_combinev2si;
	  break;
	case V4HFmode:
	  gen = gen_aarch64_simd_combinev4hf;
	  break;
	case V2SFmode:
	  gen = gen_aarch64_simd_combinev2sf;
	  break;
	case DImode:
	  gen = gen_aarch64_simd_combinedi;
	  break;
	case DFmode:
	  gen = gen_aarch64_simd_combinedf;
	  break;
	default:
	  gcc_unreachable ();
	}

      emit_insn (gen (dst, src1, src2));
      return;
    }
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      rtx (*gen) (rtx, rtx);

      gcc_assert (VECTOR_MODE_P (src_mode));

      switch (src_mode)
	{
	case V16QImode:
	  gen = gen_aarch64_split_simd_movv16qi;
	  break;
	case V8HImode:
	  gen = gen_aarch64_split_simd_movv8hi;
	  break;
	case V4SImode:
	  gen = gen_aarch64_split_simd_movv4si;
	  break;
	case V2DImode:
	  gen = gen_aarch64_split_simd_movv2di;
	  break;
	case V8HFmode:
	  gen = gen_aarch64_split_simd_movv8hf;
	  break;
	case V4SFmode:
	  gen = gen_aarch64_split_simd_movv4sf;
	  break;
	case V2DFmode:
	  gen = gen_aarch64_split_simd_movv2df;
	  break;
	default:
	  gcc_unreachable ();
	}

      emit_insn (gen (dst, src));
      return;
    }
}

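/* Return true if the constant X (of mode XMODE) is equal to the constant Y
   (of mode YMODE) zero-extended to XMODE.  */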
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}

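/* Put VALUE into a register of mode MODE, creating a new pseudo if that is
   possible and reusing the scratch rtx X otherwise.  */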
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      x = aarch64_emit_move (x, value);
      return x;
    }
}

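/* Return an rtx for REG + OFFSET in mode MODE.  If OFFSET is not a valid
   plus-immediate, it is first loaded into a register, using TEMP as a
   scratch.  */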
static rtx
aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
{
  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
    {
      rtx high;
      /* Load the full offset into a register.  This
	 might be improvable in the future.  */
      high = GEN_INT (offset);
      offset = 0;
      high = aarch64_force_temporary (mode, temp, high);
      reg = aarch64_force_temporary (mode, temp,
				     gen_rtx_PLUS (mode, high, reg));
    }
  return plus_constant (mode, reg, offset);
}

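/* Build the integer constant IMM in register DEST of mode MODE using mov,
   movk and bitmask-immediate instructions, and return the number of
   instructions used.  Instructions are only emitted when GENERATE is true.  */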
static int
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
				machine_mode mode)
{
  int i;
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;
  int num_insns;

  val = INTVAL (imm);

  if (aarch64_move_imm (val, mode))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, imm));
      return 1;
    }

  if ((val >> 32) == 0 || mode == SImode)
    {
      if (generate)
	{
	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
	  if (mode == SImode)
	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	  else
	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	}
      return 2;
    }

  /* Remaining cases are all for DImode.  */

  mask = 0xffff;
  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
    {
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
	 For a 64-bit bitmask try whether changing 16 bits to all ones or
	 zeroes creates a valid bitmask.  To check any repeated bitmask,
	 try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
	{
	  val2 = val & ~mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val | mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val2 & ~mask;
	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	}
      if (i != 64)
	{
	  if (generate)
	    {
	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
					 GEN_INT ((val >> i) & 0xffff)));
	    }
	  return 2;
	}
    }

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
     otherwise skip zero bits.  */

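  /* Illustrative example (not from the original source): the DImode constant
     0x1234000056780000 has two all-zero 16-bit chunks, so it is built as
	mov  dest, 0x56780000
	movk dest, 0x1234, lsl 48
     i.e. one mov plus one movk, and num_insns is 2.  */
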
  num_insns = 1;
  mask = 0xffff;
  val2 = one_match > zero_match ? ~val : val;
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

  if (generate)
    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
					   ? (val | ~(mask << i))
					   : (val & (mask << i)))));
  for (i += 16; i < 64; i += 16)
    {
      if ((val2 & (mask << i)) == 0)
	continue;
      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));
      num_insns ++;
    }

  return num_insns;
}

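/* Expand a move of the constant or symbolic operand IMM into register DEST,
   handling symbol, label and constant-pool references as well as plain
   integer immediates.  */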
void
aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
  machine_mode mode = GET_MODE (dest);

  gcc_assert (mode == SImode || mode == DImode);

  /* Check on what type of symbol it is.  */
  if (GET_CODE (imm) == SYMBOL_REF
      || GET_CODE (imm) == LABEL_REF
      || GET_CODE (imm) == CONST)
    {
      rtx mem, base, offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
	 before we start classifying the symbol.  */
      split_const (imm, &base, &offset);

      sty = aarch64_classify_symbol (base, offset);
      switch (sty)
	{
	case SYMBOL_FORCE_TO_MEM:
	  if (offset != const0_rtx
	      && targetm.cannot_force_const_mem (mode, imm))
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (mode, dest, base);
	      base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
	      aarch64_emit_move (dest, base);
	      return;
	    }

	  mem = force_const_mem (ptr_mode, imm);
	  gcc_assert (mem);

	  /* If we aren't generating PC relative literals, then
	     we need to expand the literal pool access carefully.
	     This is something that needs to be done in a number
	     of places, so could well live as a separate function.  */
	  if (!aarch64_pcrelative_literal_loads)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = gen_reg_rtx (ptr_mode);
	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
	      mem = gen_rtx_MEM (ptr_mode, base);
	    }

	  if (mode != ptr_mode)
	    mem = gen_rtx_ZERO_EXTEND (mode, mem);

	  emit_insn (gen_rtx_SET (dest, mem));

	  return;

	case SYMBOL_SMALL_TLSGD:
	case SYMBOL_SMALL_TLSDESC:
	case SYMBOL_SMALL_TLSIE:
	case SYMBOL_SMALL_GOT_28K:
	case SYMBOL_SMALL_GOT_4G:
	case SYMBOL_TINY_GOT:
	case SYMBOL_TINY_TLSIE:
	  if (offset != const0_rtx)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (mode, dest, base);
	      base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
	      aarch64_emit_move (dest, base);
	      return;
	    }
	  /* FALLTHRU */

	case SYMBOL_SMALL_ABSOLUTE:
	case SYMBOL_TINY_ABSOLUTE:
	case SYMBOL_TLSLE12:
	case SYMBOL_TLSLE24:
	case SYMBOL_TLSLE32:
	case SYMBOL_TLSLE48:
	  aarch64_load_symref_appropriately (dest, imm, sty);
	  return;

	default:
	  gcc_unreachable ();
	}
    }

  if (!CONST_INT_P (imm))
    {
      if (GET_CODE (imm) == HIGH)
	emit_insn (gen_rtx_SET (dest, imm));
      else
	{
	  rtx mem = force_const_mem (mode, imm);
	  gcc_assert (mem);
	  emit_insn (gen_rtx_SET (dest, mem));
	}

      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
}

/* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
   temporary value if necessary.  FRAME_RELATED_P should be true if
   the RTX_FRAME_RELATED flag should be set and CFA adjustments added
   to the generated instructions.  If SCRATCHREG is known to hold
   abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
   immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */
static void
aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
			       HOST_WIDE_INT delta, bool frame_related_p,
			       bool emit_move_imm)
{
  HOST_WIDE_INT mdelta = abs_hwi (delta);
  rtx this_rtx = gen_rtx_REG (mode, regnum);
  rtx_insn *insn;

  if (!mdelta)
    return;

  /* Single instruction adjustment.  */
  if (aarch64_uimm12_shift (mdelta))
    {
      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
     Only do this if mdelta is not a 16-bit move as adjusting using a move
     is better.  */
  if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
    {
      HOST_WIDE_INT low_off = mdelta & 0xfff;

      low_off = delta < 0 ? -low_off : low_off;
      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit a move immediate if required and an addition/subtraction.  */
  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
  if (emit_move_imm)
    aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
  insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
			      : gen_add2_insn (this_rtx, scratch_rtx));
  if (frame_related_p)
    {
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      rtx adj = plus_constant (mode, this_rtx, delta);
      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
    }
}

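/* Convenience wrappers around aarch64_add_constant_internal: adjust an
   arbitrary register, or add/subtract DELTA to/from the stack pointer.  */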
static inline void
aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
		      HOST_WIDE_INT delta)
{
  aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
}

static inline void
aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
{
  aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
				 true, emit_move_imm);
}

static inline void
aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
{
  aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
				 frame_related_p, true);
}

2050 static bool
2051 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2052 tree exp ATTRIBUTE_UNUSED)
2054 /* Currently, always true. */
2055 return true;
2058 /* Implement TARGET_PASS_BY_REFERENCE. */
2060 static bool
2061 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2062 machine_mode mode,
2063 const_tree type,
2064 bool named ATTRIBUTE_UNUSED)
2066 HOST_WIDE_INT size;
2067 machine_mode dummymode;
2068 int nregs;
2070 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2071 size = (mode == BLKmode && type)
2072 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2074 /* Aggregates are passed by reference based on their size. */
2075 if (type && AGGREGATE_TYPE_P (type))
2077 size = int_size_in_bytes (type);
2080 /* Variable-sized arguments are always passed by reference. */
2081 if (size < 0)
2082 return true;
2084 /* Can this be a candidate to be passed in fp/simd register(s)? */
2085 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2086 &dummymode, &nregs,
2087 NULL))
2088 return false;
2090 /* Arguments which are variable sized or larger than 2 registers are
2091 passed by reference unless they are a homogeneous floating-point
2092 aggregate. */
2093 return size > 2 * UNITS_PER_WORD;
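/* Worked example (illustrative only): a 24-byte structure of three 64-bit
   integers exceeds 2 * UNITS_PER_WORD and is not an HFA/HVA, so it is passed
   by reference; a 32-byte homogeneous aggregate of four doubles is an
   HFA/HVA candidate and is still passed by value in SIMD/FP registers.  */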
2096 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2097 static bool
2098 aarch64_return_in_msb (const_tree valtype)
2100 machine_mode dummy_mode;
2101 int dummy_int;
2103 /* Never happens in little-endian mode. */
2104 if (!BYTES_BIG_ENDIAN)
2105 return false;
2107 /* Only composite types smaller than or equal to 16 bytes can
2108 be potentially returned in registers. */
2109 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2110 || int_size_in_bytes (valtype) <= 0
2111 || int_size_in_bytes (valtype) > 16)
2112 return false;
2114 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2115 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2116 is always passed/returned in the least significant bits of fp/simd
2117 register(s). */
2118 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2119 &dummy_mode, &dummy_int, NULL))
2120 return false;
2122 return true;
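/* For instance (illustrative, big-endian only): a 12-byte structure of three
   ints is a composite of no more than 16 bytes and not an HFA/HVA, so its
   value is returned at the most significant end of the return registers; on
   little-endian targets this function always returns false.  */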
2125 /* Implement TARGET_FUNCTION_VALUE.
2126 Define how to find the value returned by a function. */
2128 static rtx
2129 aarch64_function_value (const_tree type, const_tree func,
2130 bool outgoing ATTRIBUTE_UNUSED)
2132 machine_mode mode;
2133 int unsignedp;
2134 int count;
2135 machine_mode ag_mode;
2137 mode = TYPE_MODE (type);
2138 if (INTEGRAL_TYPE_P (type))
2139 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2141 if (aarch64_return_in_msb (type))
2143 HOST_WIDE_INT size = int_size_in_bytes (type);
2145 if (size % UNITS_PER_WORD != 0)
2147 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2148 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2152 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2153 &ag_mode, &count, NULL))
2155 if (!aarch64_composite_type_p (type, mode))
2157 gcc_assert (count == 1 && mode == ag_mode);
2158 return gen_rtx_REG (mode, V0_REGNUM);
2160 else
2162 int i;
2163 rtx par;
2165 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2166 for (i = 0; i < count; i++)
2168 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2169 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2170 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2171 XVECEXP (par, 0, i) = tmp;
2173 return par;
2176 else
2177 return gen_rtx_REG (mode, R0_REGNUM);
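/* Illustrative example (not from the original sources): for an HFA such as
   struct { double x; double y; }, count == 2 and ag_mode == DFmode, so the
   code above builds a PARALLEL of (reg:DF V0) at byte offset 0 and
   (reg:DF V1) at byte offset 8; a plain scalar is simply returned in R0.  */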
2180 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2181 Return true if REGNO is the number of a hard register in which the values
2182 of called function may come back. */
2184 static bool
2185 aarch64_function_value_regno_p (const unsigned int regno)
2187 /* Maximum of 16 bytes can be returned in the general registers. Examples
2188 of 16-byte return values are: 128-bit integers and 16-byte small
2189 structures (excluding homogeneous floating-point aggregates). */
2190 if (regno == R0_REGNUM || regno == R1_REGNUM)
2191 return true;
2193 /* Up to four fp/simd registers can return a function value, e.g. a
2194 homogeneous floating-point aggregate having four members. */
2195 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2196 return TARGET_FLOAT;
2198 return false;
2201 /* Implement TARGET_RETURN_IN_MEMORY.
2203 If the type T of the result of a function is such that
2204 void func (T arg)
2205 would require that arg be passed as a value in a register (or set of
2206 registers) according to the parameter passing rules, then the result
2207 is returned in the same registers as would be used for such an
2208 argument. */
2210 static bool
2211 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2213 HOST_WIDE_INT size;
2214 machine_mode ag_mode;
2215 int count;
2217 if (!AGGREGATE_TYPE_P (type)
2218 && TREE_CODE (type) != COMPLEX_TYPE
2219 && TREE_CODE (type) != VECTOR_TYPE)
2220 /* Simple scalar types are always returned in registers. */
2221 return false;
2223 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2224 type,
2225 &ag_mode,
2226 &count,
2227 NULL))
2228 return false;
2230 /* Types larger than 2 registers are returned in memory. */
2231 size = int_size_in_bytes (type);
2232 return (size < 0 || size > 2 * UNITS_PER_WORD);
2235 static bool
2236 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2237 const_tree type, int *nregs)
2239 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2240 return aarch64_vfp_is_call_or_return_candidate (mode,
2241 type,
2242 &pcum->aapcs_vfp_rmode,
2243 nregs,
2244 NULL);
2247 /* Given MODE and TYPE of a function argument, return the alignment in
2248 bits. The idea is to suppress any stronger alignment requested by
2249 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2250 This is a helper function for local use only. */
2252 static unsigned int
2253 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2255 if (!type)
2256 return GET_MODE_ALIGNMENT (mode);
2257 if (integer_zerop (TYPE_SIZE (type)))
2258 return 0;
2260 gcc_assert (TYPE_MODE (type) == mode);
2262 if (!AGGREGATE_TYPE_P (type))
2263 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2265 if (TREE_CODE (type) == ARRAY_TYPE)
2266 return TYPE_ALIGN (TREE_TYPE (type));
2268 unsigned int alignment = 0;
2270 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2271 alignment = std::max (alignment, DECL_ALIGN (field));
2273 return alignment;
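/* Example (illustrative): for struct { char c; int i; } the loop above takes
   the maximum DECL_ALIGN over the fields, giving 32 bits regardless of any
   stronger alignment the user requested on the struct itself; an array
   argument instead uses the alignment of its element type.  */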
2276 /* Layout a function argument according to the AAPCS64 rules. The rule
2277 numbers refer to the rule numbers in the AAPCS64. */
2279 static void
2280 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2281 const_tree type,
2282 bool named ATTRIBUTE_UNUSED)
2284 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2285 int ncrn, nvrn, nregs;
2286 bool allocate_ncrn, allocate_nvrn;
2287 HOST_WIDE_INT size;
2289 /* We need to do this once per argument. */
2290 if (pcum->aapcs_arg_processed)
2291 return;
2293 pcum->aapcs_arg_processed = true;
2295 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2296 size
2297 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2298 UNITS_PER_WORD);
2300 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2301 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2302 mode,
2303 type,
2304 &nregs);
2306 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2307 The following code thus handles passing by SIMD/FP registers first. */
2309 nvrn = pcum->aapcs_nvrn;
2311 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2312 and homogeneous short-vector aggregates (HVA). */
2313 if (allocate_nvrn)
2315 if (!TARGET_FLOAT)
2316 aarch64_err_no_fpadvsimd (mode, "argument");
2318 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2320 pcum->aapcs_nextnvrn = nvrn + nregs;
2321 if (!aarch64_composite_type_p (type, mode))
2323 gcc_assert (nregs == 1);
2324 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2326 else
2328 rtx par;
2329 int i;
2330 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2331 for (i = 0; i < nregs; i++)
2333 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2334 V0_REGNUM + nvrn + i);
2335 tmp = gen_rtx_EXPR_LIST
2336 (VOIDmode, tmp,
2337 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2338 XVECEXP (par, 0, i) = tmp;
2340 pcum->aapcs_reg = par;
2342 return;
2344 else
2346 /* C.3 NSRN is set to 8. */
2347 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2348 goto on_stack;
2352 ncrn = pcum->aapcs_ncrn;
2353 nregs = size / UNITS_PER_WORD;
2355 /* C6 - C9, though the sign- and zero-extension semantics are
2356 handled elsewhere. This is the case where the argument fits
2357 entirely in general registers. */
2358 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2360 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2362 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2364 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2365 rounded up to the next even number. */
2366 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2368 ++ncrn;
2369 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2371 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2372 A reg is still generated for it, but the caller should be smart
2373 enough not to use it. */
2374 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2376 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2378 else
2380 rtx par;
2381 int i;
2383 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2384 for (i = 0; i < nregs; i++)
2386 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2387 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2388 GEN_INT (i * UNITS_PER_WORD));
2389 XVECEXP (par, 0, i) = tmp;
2391 pcum->aapcs_reg = par;
2394 pcum->aapcs_nextncrn = ncrn + nregs;
2395 return;
2398 /* C.11 */
2399 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2401 /* The argument is passed on stack; record the needed number of words for
2402 this argument and align the total size if necessary. */
2403 on_stack:
2404 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2405 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2406 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2407 16 / UNITS_PER_WORD);
2408 return;
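/* Worked example (illustrative): a 16-byte aggregate with 16-byte alignment
   needs nregs == 2, so if NGRN is odd, rule C.8 above bumps it so that the
   pair starts in an even-numbered X register.  If not enough core registers
   remain, the argument goes on the stack and, because of its 16-byte
   alignment, the accumulated stack size is first rounded up to a 16-byte
   boundary.  */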
2411 /* Implement TARGET_FUNCTION_ARG. */
2413 static rtx
2414 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2415 const_tree type, bool named)
2417 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2418 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2420 if (mode == VOIDmode)
2421 return NULL_RTX;
2423 aarch64_layout_arg (pcum_v, mode, type, named);
2424 return pcum->aapcs_reg;
2427 void
2428 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2429 const_tree fntype ATTRIBUTE_UNUSED,
2430 rtx libname ATTRIBUTE_UNUSED,
2431 const_tree fndecl ATTRIBUTE_UNUSED,
2432 unsigned n_named ATTRIBUTE_UNUSED)
2434 pcum->aapcs_ncrn = 0;
2435 pcum->aapcs_nvrn = 0;
2436 pcum->aapcs_nextncrn = 0;
2437 pcum->aapcs_nextnvrn = 0;
2438 pcum->pcs_variant = ARM_PCS_AAPCS64;
2439 pcum->aapcs_reg = NULL_RTX;
2440 pcum->aapcs_arg_processed = false;
2441 pcum->aapcs_stack_words = 0;
2442 pcum->aapcs_stack_size = 0;
2444 if (!TARGET_FLOAT
2445 && fndecl && TREE_PUBLIC (fndecl)
2446 && fntype && fntype != error_mark_node)
2448 const_tree type = TREE_TYPE (fntype);
2449 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2450 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2451 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2452 &mode, &nregs, NULL))
2453 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2455 return;
2458 static void
2459 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2460 machine_mode mode,
2461 const_tree type,
2462 bool named)
2464 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2465 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2467 aarch64_layout_arg (pcum_v, mode, type, named);
2468 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2469 != (pcum->aapcs_stack_words != 0));
2470 pcum->aapcs_arg_processed = false;
2471 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2472 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2473 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2474 pcum->aapcs_stack_words = 0;
2475 pcum->aapcs_reg = NULL_RTX;
2479 bool
2480 aarch64_function_arg_regno_p (unsigned regno)
2482 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2483 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2486 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2487 PARM_BOUNDARY bits of alignment, but will be given anything up
2488 to STACK_BOUNDARY bits if the type requires it. This makes sure
2489 that both before and after the layout of each argument, the Next
2490 Stacked Argument Address (NSAA) will have a minimum alignment of
2491 8 bytes. */
2493 static unsigned int
2494 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2496 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2498 if (alignment < PARM_BOUNDARY)
2499 alignment = PARM_BOUNDARY;
2500 if (alignment > STACK_BOUNDARY)
2501 alignment = STACK_BOUNDARY;
2502 return alignment;
2505 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2507 Return true if an argument passed on the stack should be padded upwards,
2508 i.e. if the least-significant byte of the stack slot has useful data.
2510 Small aggregate types are placed in the lowest memory address.
2512 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2514 bool
2515 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2517 /* On little-endian targets, the least significant byte of every stack
2518 argument is passed at the lowest byte address of the stack slot. */
2519 if (!BYTES_BIG_ENDIAN)
2520 return true;
2522 /* Otherwise, integral, floating-point and pointer types are padded downward:
2523 the least significant byte of a stack argument is passed at the highest
2524 byte address of the stack slot. */
2525 if (type
2526 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2527 || POINTER_TYPE_P (type))
2528 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2529 return false;
2531 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2532 return true;
2535 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2537 It specifies padding for the last (may also be the only)
2538 element of a block move between registers and memory. Assuming
2539 the block is in memory, padding upward means that the last
2540 element is padded after its most significant byte, while with
2541 downward padding the last element is padded on its least
2542 significant byte side.
2544 Small aggregates and small complex types are always padded
2545 upwards.
2547 We don't need to worry about homogeneous floating-point or
2548 short-vector aggregates; their move is not affected by the
2549 padding direction determined here. Regardless of endianness,
2550 each element of such an aggregate is put in the least
2551 significant bits of a fp/simd register.
2553 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2554 register has useful data, and return the opposite if the most
2555 significant byte does. */
2557 bool
2558 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2559 bool first ATTRIBUTE_UNUSED)
2562 /* Small composite types are always padded upward. */
2563 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2565 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2566 : GET_MODE_SIZE (mode));
2567 if (size < 2 * UNITS_PER_WORD)
2568 return true;
2571 /* Otherwise, use the default padding. */
2572 return !BYTES_BIG_ENDIAN;
2575 static machine_mode
2576 aarch64_libgcc_cmp_return_mode (void)
2578 return SImode;
2581 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2583 /* We use the 12-bit shifted immediate arithmetic instructions so values
2584 must be a multiple of (1 << 12), i.e. 4096. */
2585 #define ARITH_FACTOR 4096
2587 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2588 #error Cannot use simple address calculation for stack probing
2589 #endif
2591 /* The pair of scratch registers used for stack probing. */
2592 #define PROBE_STACK_FIRST_REG 9
2593 #define PROBE_STACK_SECOND_REG 10
2595 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2596 inclusive. These are offsets from the current stack pointer. */
2598 static void
2599 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2601 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2603 /* See the same assertion on PROBE_INTERVAL above. */
2604 gcc_assert ((first % ARITH_FACTOR) == 0);
2606 /* See if we have a constant small number of probes to generate. If so,
2607 that's the easy case. */
2608 if (size <= PROBE_INTERVAL)
2610 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2612 emit_set_insn (reg1,
2613 plus_constant (ptr_mode,
2614 stack_pointer_rtx, -(first + base)));
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2618 /* The run-time loop is made up of 8 insns in the generic case while the
2619 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2620 else if (size <= 4 * PROBE_INTERVAL)
2622 HOST_WIDE_INT i, rem;
2624 emit_set_insn (reg1,
2625 plus_constant (ptr_mode,
2626 stack_pointer_rtx,
2627 -(first + PROBE_INTERVAL)));
2628 emit_stack_probe (reg1);
2630 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2631 it exceeds SIZE. If only two probes are needed, this will not
2632 generate any code. Then probe at FIRST + SIZE. */
2633 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2637 emit_stack_probe (reg1);
2640 rem = size - (i - PROBE_INTERVAL);
2641 if (rem > 256)
2643 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2645 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2646 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2648 else
2649 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2652 /* Otherwise, do the same as above, but in a loop. Note that we must be
2653 extra careful with variables wrapping around because we might be at
2654 the very top (or the very bottom) of the address space and we have
2655 to be able to handle this case properly; in particular, we use an
2656 equality test for the loop condition. */
2657 else
2659 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2661 /* Step 1: round SIZE to the previous multiple of the interval. */
2663 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2666 /* Step 2: compute initial and final value of the loop counter. */
2668 /* TEST_ADDR = SP + FIRST. */
2669 emit_set_insn (reg1,
2670 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2672 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2673 emit_set_insn (reg2,
2674 plus_constant (ptr_mode, stack_pointer_rtx,
2675 -(first + rounded_size)));
2678 /* Step 3: the loop
2682 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2683 probe at TEST_ADDR
2685 while (TEST_ADDR != LAST_ADDR)
2687 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2688 until it is equal to ROUNDED_SIZE. */
2690 if (ptr_mode == DImode)
2691 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2692 else
2693 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2696 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2697 that SIZE is equal to ROUNDED_SIZE. */
2699 if (size != rounded_size)
2701 HOST_WIDE_INT rem = size - rounded_size;
2703 if (rem > 256)
2705 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2707 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2708 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2710 else
2711 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2715 /* Make sure nothing is scheduled before we are done. */
2716 emit_insn (gen_blockage ());
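/* Illustrative example: with PROBE_INTERVAL == 4096, FIRST == 0 and
   SIZE == 3 * 4096, the middle case above emits probes at offsets -4096,
   -8192 and -12288 from the incoming stack pointer, using reg1 as the
   running probe address and without needing the run-time loop.  */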
2719 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2720 absolute addresses. */
2722 const char *
2723 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2725 static int labelno = 0;
2726 char loop_lab[32];
2727 rtx xops[2];
2729 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2731 /* Loop. */
2732 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2734 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2735 xops[0] = reg1;
2736 xops[1] = GEN_INT (PROBE_INTERVAL);
2737 output_asm_insn ("sub\t%0, %0, %1", xops);
2739 /* Probe at TEST_ADDR. */
2740 output_asm_insn ("str\txzr, [%0]", xops);
2742 /* Test if TEST_ADDR == LAST_ADDR. */
2743 xops[1] = reg2;
2744 output_asm_insn ("cmp\t%0, %1", xops);
2746 /* Branch. */
2747 fputs ("\tb.ne\t", asm_out_file);
2748 assemble_name_raw (asm_out_file, loop_lab);
2749 fputc ('\n', asm_out_file);
2751 return "";
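/* The emitted loop looks roughly like this (illustrative, with x9/x10 being
   the PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG scratch registers):
	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0  */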
2754 static bool
2755 aarch64_frame_pointer_required (void)
2757 /* In aarch64_override_options_after_change
2758 flag_omit_leaf_frame_pointer turns off the frame pointer by
2759 default. Turn it back on now if we've not got a leaf
2760 function. */
2761 if (flag_omit_leaf_frame_pointer
2762 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2763 return true;
2765 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2766 if (crtl->calls_eh_return)
2767 return true;
2769 return false;
2772 /* Mark the registers that need to be saved by the callee and calculate
2773 the size of the callee-saved registers area and frame record (both FP
2774 and LR may be omitted). */
2775 static void
2776 aarch64_layout_frame (void)
2778 HOST_WIDE_INT offset = 0;
2779 int regno, last_fp_reg = INVALID_REGNUM;
2781 if (reload_completed && cfun->machine->frame.laid_out)
2782 return;
2784 #define SLOT_NOT_REQUIRED (-2)
2785 #define SLOT_REQUIRED (-1)
2787 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2788 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2790 /* First mark all the registers that really need to be saved... */
2791 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2792 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2794 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2795 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2797 /* ... that includes the eh data registers (if needed)... */
2798 if (crtl->calls_eh_return)
2799 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2800 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2801 = SLOT_REQUIRED;
2803 /* ... and any callee saved register that dataflow says is live. */
2804 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2805 if (df_regs_ever_live_p (regno)
2806 && (regno == R30_REGNUM
2807 || !call_used_regs[regno]))
2808 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2810 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2811 if (df_regs_ever_live_p (regno)
2812 && !call_used_regs[regno])
2814 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2815 last_fp_reg = regno;
2818 if (frame_pointer_needed)
2820 /* FP and LR are placed in the linkage record. */
2821 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2822 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2823 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2824 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2825 offset += 2 * UNITS_PER_WORD;
2828 /* Now assign stack slots for them. */
2829 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2830 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2832 cfun->machine->frame.reg_offset[regno] = offset;
2833 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2834 cfun->machine->frame.wb_candidate1 = regno;
2835 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2836 cfun->machine->frame.wb_candidate2 = regno;
2837 offset += UNITS_PER_WORD;
2840 HOST_WIDE_INT max_int_offset = offset;
2841 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2842 bool has_align_gap = offset != max_int_offset;
2844 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2845 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2847 /* If there is an alignment gap between integer and fp callee-saves,
2848 allocate the last fp register to it if possible. */
2849 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2851 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2852 break;
2855 cfun->machine->frame.reg_offset[regno] = offset;
2856 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2857 cfun->machine->frame.wb_candidate1 = regno;
2858 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2859 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2860 cfun->machine->frame.wb_candidate2 = regno;
2861 offset += UNITS_PER_WORD;
2864 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2866 cfun->machine->frame.saved_regs_size = offset;
2868 HOST_WIDE_INT varargs_and_saved_regs_size
2869 = offset + cfun->machine->frame.saved_varargs_size;
2871 cfun->machine->frame.hard_fp_offset
2872 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2873 STACK_BOUNDARY / BITS_PER_UNIT);
2875 cfun->machine->frame.frame_size
2876 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2877 + crtl->outgoing_args_size,
2878 STACK_BOUNDARY / BITS_PER_UNIT);
2880 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2882 cfun->machine->frame.initial_adjust = 0;
2883 cfun->machine->frame.final_adjust = 0;
2884 cfun->machine->frame.callee_adjust = 0;
2885 cfun->machine->frame.callee_offset = 0;
2887 HOST_WIDE_INT max_push_offset = 0;
2888 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2889 max_push_offset = 512;
2890 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2891 max_push_offset = 256;
2893 if (cfun->machine->frame.frame_size < max_push_offset
2894 && crtl->outgoing_args_size == 0)
2896 /* Simple, small frame with no outgoing arguments:
2897 stp reg1, reg2, [sp, -frame_size]!
2898 stp reg3, reg4, [sp, 16] */
2899 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2901 else if ((crtl->outgoing_args_size
2902 + cfun->machine->frame.saved_regs_size < 512)
2903 && !(cfun->calls_alloca
2904 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2906 /* Frame with small outgoing arguments:
2907 sub sp, sp, frame_size
2908 stp reg1, reg2, [sp, outgoing_args_size]
2909 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2910 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2911 cfun->machine->frame.callee_offset
2912 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2914 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2916 /* Frame with large outgoing arguments but a small local area:
2917 stp reg1, reg2, [sp, -hard_fp_offset]!
2918 stp reg3, reg4, [sp, 16]
2919 sub sp, sp, outgoing_args_size */
2920 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2921 cfun->machine->frame.final_adjust
2922 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2924 else if (!frame_pointer_needed
2925 && varargs_and_saved_regs_size < max_push_offset)
2927 /* Frame with large local area and outgoing arguments (this pushes the
2928 callee-saves first, followed by the locals and outgoing area):
2929 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2930 stp reg3, reg4, [sp, 16]
2931 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2932 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2933 cfun->machine->frame.final_adjust
2934 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2935 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2936 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2938 else
2940 /* Frame with large local area and outgoing arguments using frame pointer:
2941 sub sp, sp, hard_fp_offset
2942 stp x29, x30, [sp, 0]
2943 add x29, sp, 0
2944 stp reg3, reg4, [sp, 16]
2945 sub sp, sp, outgoing_args_size */
2946 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2947 cfun->machine->frame.final_adjust
2948 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2951 cfun->machine->frame.laid_out = true;
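/* Worked example (illustrative): a small function that only needs to save
   x19 and x20 and has no locals or outgoing arguments ends up with
   frame_size == 16, which is below max_push_offset, so the first case above
   applies and the whole frame is allocated by the callee-save push:
	stp	x19, x20, [sp, -16]!
   A frame with a large local area and a frame pointer instead falls into the
   final case: sub sp, save x29/x30, set up x29, then a final sub for the
   outgoing argument area.  */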
2954 /* Return true if the register REGNO is saved on entry to
2955 the current function. */
2957 static bool
2958 aarch64_register_saved_on_entry (int regno)
2960 return cfun->machine->frame.reg_offset[regno] >= 0;
2963 /* Return the next register, from REGNO up to LIMIT, that the callee
2964 needs to save. */
2966 static unsigned
2967 aarch64_next_callee_save (unsigned regno, unsigned limit)
2969 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2970 regno++;
2971 return regno;
2974 /* Push the register number REGNO of mode MODE to the stack with write-back
2975 adjusting the stack by ADJUSTMENT. */
2977 static void
2978 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2979 HOST_WIDE_INT adjustment)
2981 rtx base_rtx = stack_pointer_rtx;
2982 rtx insn, reg, mem;
2984 reg = gen_rtx_REG (mode, regno);
2985 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2986 plus_constant (Pmode, base_rtx, -adjustment));
2987 mem = gen_rtx_MEM (mode, mem);
2989 insn = emit_move_insn (mem, reg);
2990 RTX_FRAME_RELATED_P (insn) = 1;
2993 /* Generate and return an instruction to store the pair of registers
2994 REG and REG2 of mode MODE to location BASE with write-back adjusting
2995 the stack location BASE by ADJUSTMENT. */
2997 static rtx
2998 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2999 HOST_WIDE_INT adjustment)
3001 switch (mode)
3003 case DImode:
3004 return gen_storewb_pairdi_di (base, base, reg, reg2,
3005 GEN_INT (-adjustment),
3006 GEN_INT (UNITS_PER_WORD - adjustment));
3007 case DFmode:
3008 return gen_storewb_pairdf_di (base, base, reg, reg2,
3009 GEN_INT (-adjustment),
3010 GEN_INT (UNITS_PER_WORD - adjustment));
3011 default:
3012 gcc_unreachable ();
3016 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3017 stack pointer by ADJUSTMENT. */
3019 static void
3020 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3022 rtx_insn *insn;
3023 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3025 if (regno2 == INVALID_REGNUM)
3026 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3028 rtx reg1 = gen_rtx_REG (mode, regno1);
3029 rtx reg2 = gen_rtx_REG (mode, regno2);
3031 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3032 reg2, adjustment));
3033 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3034 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3035 RTX_FRAME_RELATED_P (insn) = 1;
3038 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3039 adjusting it by ADJUSTMENT afterwards. */
3041 static rtx
3042 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3043 HOST_WIDE_INT adjustment)
3045 switch (mode)
3047 case DImode:
3048 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3049 GEN_INT (UNITS_PER_WORD));
3050 case DFmode:
3051 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3052 GEN_INT (UNITS_PER_WORD));
3053 default:
3054 gcc_unreachable ();
3058 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3059 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3060 into CFI_OPS. */
3062 static void
3063 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3064 rtx *cfi_ops)
3066 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3067 rtx reg1 = gen_rtx_REG (mode, regno1);
3069 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3071 if (regno2 == INVALID_REGNUM)
3073 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3074 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3075 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3077 else
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3081 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3082 reg2, adjustment));
3086 /* Generate and return a store pair instruction of mode MODE to store
3087 register REG1 to MEM1 and register REG2 to MEM2. */
3089 static rtx
3090 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3091 rtx reg2)
3093 switch (mode)
3095 case DImode:
3096 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3098 case DFmode:
3099 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3101 default:
3102 gcc_unreachable ();
3106 /* Generate and return a load pair instruction of mode MODE to load register
3107 REG1 from MEM1 and register REG2 from MEM2. */
3109 static rtx
3110 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3111 rtx mem2)
3113 switch (mode)
3115 case DImode:
3116 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3118 case DFmode:
3119 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3121 default:
3122 gcc_unreachable ();
3126 /* Return TRUE if return address signing should be enabled for the current
3127 function, otherwise return FALSE. */
3129 bool
3130 aarch64_return_address_signing_enabled (void)
3132 /* This function should only be called after the frame has been laid out. */
3133 gcc_assert (cfun->machine->frame.laid_out);
3135 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
3136 function if its LR is pushed onto the stack. */
3137 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3138 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3139 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3142 /* Emit code to save the callee-saved registers from register number START
3143 to LIMIT to the stack at the location starting at offset START_OFFSET,
3144 skipping any write-back candidates if SKIP_WB is true. */
3146 static void
3147 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3148 unsigned start, unsigned limit, bool skip_wb)
3150 rtx_insn *insn;
3151 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3152 ? gen_frame_mem : gen_rtx_MEM);
3153 unsigned regno;
3154 unsigned regno2;
3156 for (regno = aarch64_next_callee_save (start, limit);
3157 regno <= limit;
3158 regno = aarch64_next_callee_save (regno + 1, limit))
3160 rtx reg, mem;
3161 HOST_WIDE_INT offset;
3163 if (skip_wb
3164 && (regno == cfun->machine->frame.wb_candidate1
3165 || regno == cfun->machine->frame.wb_candidate2))
3166 continue;
3168 if (cfun->machine->reg_is_wrapped_separately[regno])
3169 continue;
3171 reg = gen_rtx_REG (mode, regno);
3172 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3173 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3174 offset));
3176 regno2 = aarch64_next_callee_save (regno + 1, limit);
3178 if (regno2 <= limit
3179 && !cfun->machine->reg_is_wrapped_separately[regno2]
3180 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3181 == cfun->machine->frame.reg_offset[regno2]))
3184 rtx reg2 = gen_rtx_REG (mode, regno2);
3185 rtx mem2;
3187 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3188 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3189 offset));
3190 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3191 reg2));
3193 /* The first part of a frame-related parallel insn is
3194 always assumed to be relevant to the frame
3195 calculations; subsequent parts are only
3196 frame-related if explicitly marked. */
3197 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3198 regno = regno2;
3200 else
3201 insn = emit_move_insn (mem, reg);
3203 RTX_FRAME_RELATED_P (insn) = 1;
3207 /* Emit code to restore the callee registers of mode MODE from register
3208 number START up to and including LIMIT. Restore from the stack offset
3209 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3210 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3212 static void
3213 aarch64_restore_callee_saves (machine_mode mode,
3214 HOST_WIDE_INT start_offset, unsigned start,
3215 unsigned limit, bool skip_wb, rtx *cfi_ops)
3217 rtx base_rtx = stack_pointer_rtx;
3218 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3219 ? gen_frame_mem : gen_rtx_MEM);
3220 unsigned regno;
3221 unsigned regno2;
3222 HOST_WIDE_INT offset;
3224 for (regno = aarch64_next_callee_save (start, limit);
3225 regno <= limit;
3226 regno = aarch64_next_callee_save (regno + 1, limit))
3228 if (cfun->machine->reg_is_wrapped_separately[regno])
3229 continue;
3231 rtx reg, mem;
3233 if (skip_wb
3234 && (regno == cfun->machine->frame.wb_candidate1
3235 || regno == cfun->machine->frame.wb_candidate2))
3236 continue;
3238 reg = gen_rtx_REG (mode, regno);
3239 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3240 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3242 regno2 = aarch64_next_callee_save (regno + 1, limit);
3244 if (regno2 <= limit
3245 && !cfun->machine->reg_is_wrapped_separately[regno2]
3246 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3247 == cfun->machine->frame.reg_offset[regno2]))
3249 rtx reg2 = gen_rtx_REG (mode, regno2);
3250 rtx mem2;
3252 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3253 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3254 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3256 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3257 regno = regno2;
3259 else
3260 emit_move_insn (reg, mem);
3261 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3265 static inline bool
3266 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3267 HOST_WIDE_INT offset)
3269 return offset >= -256 && offset < 256;
3272 static inline bool
3273 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3275 return (offset >= 0
3276 && offset < 4096 * GET_MODE_SIZE (mode)
3277 && offset % GET_MODE_SIZE (mode) == 0);
3280 bool
3281 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3283 return (offset >= -64 * GET_MODE_SIZE (mode)
3284 && offset < 64 * GET_MODE_SIZE (mode)
3285 && offset % GET_MODE_SIZE (mode) == 0);
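/* For DImode (8-byte) accesses, for example, the three predicates above
   accept byte offsets in [-256, 255] (unscaled 9-bit), multiples of 8 in
   [0, 32760] (unsigned scaled 12-bit) and multiples of 8 in [-512, 504]
   (signed scaled 7-bit, as used by load/store pair).  */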
3288 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3290 static sbitmap
3291 aarch64_get_separate_components (void)
3293 aarch64_layout_frame ();
3295 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3296 bitmap_clear (components);
3298 /* The registers we need to save to the frame. */
3299 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3300 if (aarch64_register_saved_on_entry (regno))
3302 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3303 if (!frame_pointer_needed)
3304 offset += cfun->machine->frame.frame_size
3305 - cfun->machine->frame.hard_fp_offset;
3306 /* Check that we can access the stack slot of the register with one
3307 direct load with no adjustments needed. */
3308 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3309 bitmap_set_bit (components, regno);
3312 /* Don't mess with the hard frame pointer. */
3313 if (frame_pointer_needed)
3314 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3316 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3317 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3318 /* If aarch64_layout_frame has chosen registers to store/restore with
3319 writeback don't interfere with them to avoid having to output explicit
3320 stack adjustment instructions. */
3321 if (reg2 != INVALID_REGNUM)
3322 bitmap_clear_bit (components, reg2);
3323 if (reg1 != INVALID_REGNUM)
3324 bitmap_clear_bit (components, reg1);
3326 bitmap_clear_bit (components, LR_REGNUM);
3327 bitmap_clear_bit (components, SP_REGNUM);
3329 return components;
3332 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3334 static sbitmap
3335 aarch64_components_for_bb (basic_block bb)
3337 bitmap in = DF_LIVE_IN (bb);
3338 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3339 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3341 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3342 bitmap_clear (components);
3344 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3345 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3346 if ((!call_used_regs[regno])
3347 && (bitmap_bit_p (in, regno)
3348 || bitmap_bit_p (gen, regno)
3349 || bitmap_bit_p (kill, regno)))
3350 bitmap_set_bit (components, regno);
3352 return components;
3355 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3356 Nothing to do for aarch64. */
3358 static void
3359 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3363 /* Return the next set bit in BMP from START onwards. Return the total number
3364 of bits in BMP if no set bit is found at or after START. */
3366 static unsigned int
3367 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3369 unsigned int nbits = SBITMAP_SIZE (bmp);
3370 if (start == nbits)
3371 return start;
3373 gcc_assert (start < nbits);
3374 for (unsigned int i = start; i < nbits; i++)
3375 if (bitmap_bit_p (bmp, i))
3376 return i;
3378 return nbits;
3381 /* Do the work for aarch64_emit_prologue_components and
3382 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3383 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3384 for these components or the epilogue sequence. That is, it determines
3385 whether we should emit stores or loads and what kind of CFA notes to attach
3386 to the insns. Otherwise the logic for the two sequences is very
3387 similar. */
3389 static void
3390 aarch64_process_components (sbitmap components, bool prologue_p)
3392 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3393 ? HARD_FRAME_POINTER_REGNUM
3394 : STACK_POINTER_REGNUM);
3396 unsigned last_regno = SBITMAP_SIZE (components);
3397 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3398 rtx_insn *insn = NULL;
3400 while (regno != last_regno)
3402 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3403 so DFmode for the vector registers is enough. */
3404 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3405 rtx reg = gen_rtx_REG (mode, regno);
3406 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3407 if (!frame_pointer_needed)
3408 offset += cfun->machine->frame.frame_size
3409 - cfun->machine->frame.hard_fp_offset;
3410 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3411 rtx mem = gen_frame_mem (mode, addr);
3413 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3414 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3415 /* No more registers to handle after REGNO.
3416 Emit a single save/restore and exit. */
3417 if (regno2 == last_regno)
3419 insn = emit_insn (set);
3420 RTX_FRAME_RELATED_P (insn) = 1;
3421 if (prologue_p)
3422 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3423 else
3424 add_reg_note (insn, REG_CFA_RESTORE, reg);
3425 break;
3428 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3429 /* The next register is not of the same class or its offset is not
3430 mergeable with the current one into a pair. */
3431 if (!satisfies_constraint_Ump (mem)
3432 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3433 || (offset2 - cfun->machine->frame.reg_offset[regno])
3434 != GET_MODE_SIZE (mode))
3436 insn = emit_insn (set);
3437 RTX_FRAME_RELATED_P (insn) = 1;
3438 if (prologue_p)
3439 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3440 else
3441 add_reg_note (insn, REG_CFA_RESTORE, reg);
3443 regno = regno2;
3444 continue;
3447 /* REGNO2 can be saved/restored in a pair with REGNO. */
3448 rtx reg2 = gen_rtx_REG (mode, regno2);
3449 if (!frame_pointer_needed)
3450 offset2 += cfun->machine->frame.frame_size
3451 - cfun->machine->frame.hard_fp_offset;
3452 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3453 rtx mem2 = gen_frame_mem (mode, addr2);
3454 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3455 : gen_rtx_SET (reg2, mem2);
3457 if (prologue_p)
3458 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3459 else
3460 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3462 RTX_FRAME_RELATED_P (insn) = 1;
3463 if (prologue_p)
3465 add_reg_note (insn, REG_CFA_OFFSET, set);
3466 add_reg_note (insn, REG_CFA_OFFSET, set2);
3468 else
3470 add_reg_note (insn, REG_CFA_RESTORE, reg);
3471 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3474 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3478 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3480 static void
3481 aarch64_emit_prologue_components (sbitmap components)
3483 aarch64_process_components (components, true);
3486 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3488 static void
3489 aarch64_emit_epilogue_components (sbitmap components)
3491 aarch64_process_components (components, false);
3494 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3496 static void
3497 aarch64_set_handled_components (sbitmap components)
3499 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3500 if (bitmap_bit_p (components, regno))
3501 cfun->machine->reg_is_wrapped_separately[regno] = true;
3504 /* AArch64 stack frames generated by this compiler look like:
3506 +-------------------------------+
3508 | incoming stack arguments |
3510 +-------------------------------+
3511 | | <-- incoming stack pointer (aligned)
3512 | callee-allocated save area |
3513 | for register varargs |
3515 +-------------------------------+
3516 | local variables | <-- frame_pointer_rtx
3518 +-------------------------------+
3519 | padding0 | \
3520 +-------------------------------+ |
3521 | callee-saved registers | | frame.saved_regs_size
3522 +-------------------------------+ |
3523 | LR' | |
3524 +-------------------------------+ |
3525 | FP' | / <- hard_frame_pointer_rtx (aligned)
3526 +-------------------------------+
3527 | dynamic allocation |
3528 +-------------------------------+
3529 | padding |
3530 +-------------------------------+
3531 | outgoing stack arguments | <-- arg_pointer
3533 +-------------------------------+
3534 | | <-- stack_pointer_rtx (aligned)
3536 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3537 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3538 unchanged. */
3540 /* Generate the prologue instructions for entry into a function.
3541 Establish the stack frame by decreasing the stack pointer with a
3542 properly calculated size and, if necessary, create a frame record
3543 filled with the values of LR and previous frame pointer. The
3544 current FP is also set up if it is in use. */
3546 void
3547 aarch64_expand_prologue (void)
3549 aarch64_layout_frame ();
3551 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3552 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3553 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3554 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3555 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3556 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3557 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3558 rtx_insn *insn;
3560 /* Sign return address for functions. */
3561 if (aarch64_return_address_signing_enabled ())
3563 insn = emit_insn (gen_pacisp ());
3564 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3565 RTX_FRAME_RELATED_P (insn) = 1;
3568 if (flag_stack_usage_info)
3569 current_function_static_stack_size = frame_size;
3571 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3573 if (crtl->is_leaf && !cfun->calls_alloca)
3575 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3576 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3577 frame_size - STACK_CHECK_PROTECT);
3579 else if (frame_size > 0)
3580 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3583 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3585 if (callee_adjust != 0)
3586 aarch64_push_regs (reg1, reg2, callee_adjust);
3588 if (frame_pointer_needed)
3590 if (callee_adjust == 0)
3591 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3592 R30_REGNUM, false);
3593 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3594 stack_pointer_rtx,
3595 GEN_INT (callee_offset)));
3596 RTX_FRAME_RELATED_P (insn) = 1;
3597 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3600 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3601 callee_adjust != 0 || frame_pointer_needed);
3602 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3603 callee_adjust != 0 || frame_pointer_needed);
3604 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3607 /* Return TRUE if we can use a simple_return insn.
3609 This function checks whether the callee-saved register area is empty, which
3610 means no restore actions are needed. The pro_and_epilogue pass uses
3611 this to check whether the shrink-wrapping optimization is feasible. */
3613 bool
3614 aarch64_use_return_insn_p (void)
3616 if (!reload_completed)
3617 return false;
3619 if (crtl->profile)
3620 return false;
3622 aarch64_layout_frame ();
3624 return cfun->machine->frame.frame_size == 0;
3627 /* Generate the epilogue instructions for returning from a function.
3628 This is almost exactly the reverse of the prologue sequence, except
3629 that we need to insert barriers to avoid scheduling loads that read
3630 from a deallocated stack, and we optimize the unwind records by
3631 emitting them all together if possible. */
3632 void
3633 aarch64_expand_epilogue (bool for_sibcall)
3635 aarch64_layout_frame ();
3637 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3638 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3639 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3640 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3641 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3642 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3643 rtx cfi_ops = NULL;
3644 rtx_insn *insn;
3646 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3647 bool need_barrier_p = (get_frame_size ()
3648 + cfun->machine->frame.saved_varargs_size) != 0;
3650 /* Emit a barrier to prevent loads from a deallocated stack. */
3651 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3652 || crtl->calls_eh_return)
3654 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3655 need_barrier_p = false;
3658 /* Restore the stack pointer from the frame pointer if it may not
3659 be the same as the stack pointer. */
3660 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3662 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3663 hard_frame_pointer_rtx,
3664 GEN_INT (-callee_offset)));
3665 /* If writeback is used when restoring callee-saves, the CFA
3666 is restored on the instruction doing the writeback. */
3667 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3669 else
3670 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3672 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3673 callee_adjust != 0, &cfi_ops);
3674 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3675 callee_adjust != 0, &cfi_ops);
3677 if (need_barrier_p)
3678 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3680 if (callee_adjust != 0)
3681 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3683 if (callee_adjust != 0 || initial_adjust > 65536)
3685 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3686 insn = get_last_insn ();
3687 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3688 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3689 RTX_FRAME_RELATED_P (insn) = 1;
3690 cfi_ops = NULL;
3693 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3695 if (cfi_ops)
3697 /* Emit delayed restores and reset the CFA to be SP. */
3698 insn = get_last_insn ();
3699 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3700 REG_NOTES (insn) = cfi_ops;
3701 RTX_FRAME_RELATED_P (insn) = 1;
3704 /* We prefer to emit the combined return/authenticate instruction RETAA;
3705 however, there are three cases in which we must instead emit an explicit
3706 authentication instruction.
3708 1) Sibcalls don't return in a normal way, so if we're about to call one
3709 we must authenticate.
3711 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3712 generating code for !TARGET_ARMV8_3 we can't use it and must
3713 explicitly authenticate.
3715 3) On an eh_return path we make extra stack adjustments to update the
3716 canonical frame address to be the exception handler's CFA. We want
3717 to authenticate using the CFA of the function which calls eh_return. */
3719 if (aarch64_return_address_signing_enabled ()
3720 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3722 insn = emit_insn (gen_autisp ());
3723 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3724 RTX_FRAME_RELATED_P (insn) = 1;
3727 /* Stack adjustment for exception handler. */
3728 if (crtl->calls_eh_return)
3730 /* We need to unwind the stack by the offset computed by
3731 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3732 to be SP; letting the CFA move during this adjustment
3733 is just as correct as retaining the CFA from the body
3734 of the function. Therefore, do nothing special. */
3735 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3738 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3739 if (!for_sibcall)
3740 emit_jump_insn (ret_rtx);
3743 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3744 normally or return to a previous frame after unwinding.
3746 An EH return uses a single shared return sequence. The epilogue is
3747 exactly like a normal epilogue except that it has an extra input
3748 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3749 that must be applied after the frame has been destroyed. An extra label
3750 is inserted before the epilogue which initializes this register to zero,
3751 and this is the entry point for a normal return.
3753 An actual EH return updates the return address, initializes the stack
3754 adjustment and jumps directly into the epilogue (bypassing the zeroing
3755 of the adjustment). Since the return address is typically saved on the
3756 stack when a function makes a call, the saved LR must be updated outside
3757 the epilogue.
3759 This poses problems as the store is generated well before the epilogue,
3760 so the offset of LR is not known yet. Also optimizations will remove the
3761 store as it appears dead, even after the epilogue is generated (as the
3762 base or offset for loading LR is different in many cases).
3764 To avoid these problems this implementation forces the frame pointer
3765 in eh_return functions so that the location of LR is fixed and known early.
3766 It also marks the store volatile, so no optimization is permitted to
3767 remove the store. */
3768 rtx
3769 aarch64_eh_return_handler_rtx (void)
3771 rtx tmp = gen_frame_mem (Pmode,
3772 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3774 /* Mark the store volatile, so no optimization is permitted to remove it. */
3775 MEM_VOLATILE_P (tmp) = true;
3776 return tmp;
3779 /* Output code to add DELTA to the first argument, and then jump
3780 to FUNCTION. Used for C++ multiple inheritance. */
3781 static void
3782 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3783 HOST_WIDE_INT delta,
3784 HOST_WIDE_INT vcall_offset,
3785 tree function)
3787 /* The this pointer is always in x0. Note that this differs from
3788 Arm, where the this pointer may be bumped to r1 if r0 is required
3789 to return a pointer to an aggregate. On AArch64 a result value
3790 pointer will be in x8. */
3791 int this_regno = R0_REGNUM;
3792 rtx this_rtx, temp0, temp1, addr, funexp;
3793 rtx_insn *insn;
3795 reload_completed = 1;
3796 emit_note (NOTE_INSN_PROLOGUE_END);
3798 if (vcall_offset == 0)
3799 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3800 else
3802 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3804 this_rtx = gen_rtx_REG (Pmode, this_regno);
3805 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3806 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3808 addr = this_rtx;
3809 if (delta != 0)
3811 if (delta >= -256 && delta < 256)
3812 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3813 plus_constant (Pmode, this_rtx, delta));
3814 else
3815 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3818 if (Pmode == ptr_mode)
3819 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3820 else
3821 aarch64_emit_move (temp0,
3822 gen_rtx_ZERO_EXTEND (Pmode,
3823 gen_rtx_MEM (ptr_mode, addr)));
3825 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3826 addr = plus_constant (Pmode, temp0, vcall_offset);
3827 else
3829 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3830 Pmode);
3831 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3834 if (Pmode == ptr_mode)
3835 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3836 else
3837 aarch64_emit_move (temp1,
3838 gen_rtx_SIGN_EXTEND (Pmode,
3839 gen_rtx_MEM (ptr_mode, addr)));
3841 emit_insn (gen_add2_insn (this_rtx, temp1));
3844 /* Generate a tail call to the target function. */
3845 if (!TREE_USED (function))
3847 assemble_external (function);
3848 TREE_USED (function) = 1;
3850 funexp = XEXP (DECL_RTL (function), 0);
3851 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3852 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3853 SIBLING_CALL_P (insn) = 1;
3855 insn = get_insns ();
3856 shorten_branches (insn);
3857 final_start_function (insn, file, 1);
3858 final (insn, file, 1);
3859 final_end_function ();
3861 /* Stop pretending to be a post-reload pass. */
3862 reload_completed = 0;
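/* For illustration: a thunk with DELTA == 16 and no vcall offset boils
   down to bumping the this pointer and tail-calling the target, roughly

     add  x0, x0, 16
     b    <target>

   while a nonzero VCALL_OFFSET additionally loads the vtable pointer
   through x0 and adds the contents of the vcall slot before the branch.  */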
3865 static bool
3866 aarch64_tls_referenced_p (rtx x)
3868 if (!TARGET_HAVE_TLS)
3869 return false;
3870 subrtx_iterator::array_type array;
3871 FOR_EACH_SUBRTX (iter, array, x, ALL)
3873 const_rtx x = *iter;
3874 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3875 return true;
3876 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3877 TLS offsets, not real symbol references. */
3878 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3879 iter.skip_subrtxes ();
3881 return false;
3885 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3886 a left shift of 0 or 12 bits. */
3887 bool
3888 aarch64_uimm12_shift (HOST_WIDE_INT val)
3890 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3891 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
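/* For example, 0xabc and 0xabc000 both satisfy this test (they fit in the
   12-bit ADD/SUB immediate field with LSL #0 or LSL #12 respectively),
   whereas 0xabc00 does not, since its set bits straddle the two fields.  */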
3896 /* Return true if val is an immediate that can be loaded into a
3897 register by a MOVZ instruction. */
3898 static bool
3899 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3901 if (GET_MODE_SIZE (mode) > 4)
3903 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3904 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3905 return 1;
3907 else
3909 /* Ignore sign extension. */
3910 val &= (HOST_WIDE_INT) 0xffffffff;
3912 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3913 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
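/* For example, 0x12340000 and 0x5678000000000000 are both accepted for
   DImode (a single 16-bit chunk at a 16-bit-aligned position), while
   0x12345 is not, and loading it would instead need a MOVZ/MOVK pair or a
   bitmask immediate.  For SImode, any bits above bit 31 are ignored first.  */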
3916 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3918 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3920 0x0000000100000001ull,
3921 0x0001000100010001ull,
3922 0x0101010101010101ull,
3923 0x1111111111111111ull,
3924 0x5555555555555555ull,
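/* Multiplying a pattern by one of these replicates it across the register;
   for instance 0x42 * 0x0101010101010101 == 0x4242424242424242, the 8-bit
   pattern repeated eight times.  */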
3928 /* Return true if val is a valid bitmask immediate. */
3930 bool
3931 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3933 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3934 int bits;
3936 /* Check for a single sequence of one bits and return quickly if so.
3937 The special cases of all ones and all zeroes return false. */
3938 val = (unsigned HOST_WIDE_INT) val_in;
3939 tmp = val + (val & -val);
3941 if (tmp == (tmp & -tmp))
3942 return (val + 1) > 1;
3944 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3945 if (mode == SImode)
3946 val = (val << 32) | (val & 0xffffffff);
3948 /* Invert if the immediate doesn't start with a zero bit - this means we
3949 only need to search for sequences of one bits. */
3950 if (val & 1)
3951 val = ~val;
3953 /* Find the first set bit and set tmp to val with the first sequence of one
3954 bits removed. Return success if there is a single sequence of ones. */
3955 first_one = val & -val;
3956 tmp = val & (val + first_one);
3958 if (tmp == 0)
3959 return true;
3961 /* Find the next set bit and compute the difference in bit position. */
3962 next_one = tmp & -tmp;
3963 bits = clz_hwi (first_one) - clz_hwi (next_one);
3964 mask = val ^ tmp;
3966 /* Check the bit position difference is a power of 2, and that the first
3967 sequence of one bits fits within 'bits' bits. */
3968 if ((mask >> bits) != 0 || bits != (bits & -bits))
3969 return false;
3971 /* Check the sequence of one bits is repeated 64/bits times. */
3972 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
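/* Examples: 0x00ff00ff00ff00ff (a 16-bit element repeated four times),
   0x0003fffc (a single run of ones) and 0x5555555555555555 are all valid
   bitmask immediates, while 0, ~0 and (for DImode) 0x00000000ff00ff00,
   whose two runs do not form a repeating element, are not.  */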
3975 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3976 Assumed precondition: VAL_IN is not zero. */
3978 unsigned HOST_WIDE_INT
3979 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3981 int lowest_bit_set = ctz_hwi (val_in);
3982 int highest_bit_set = floor_log2 (val_in);
3983 gcc_assert (val_in != 0);
3985 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3986 (HOST_WIDE_INT_1U << lowest_bit_set));
3989 /* Create constant where bits outside of lowest bit set to highest bit set
3990 are set to 1. */
3992 unsigned HOST_WIDE_INT
3993 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
3995 return val_in | ~aarch64_and_split_imm1 (val_in);
3998 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4000 bool
4001 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4003 if (aarch64_bitmask_imm (val_in, mode))
4004 return false;
4006 if (aarch64_move_imm (val_in, mode))
4007 return false;
4009 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4011 return aarch64_bitmask_imm (imm2, mode);
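/* Worked example: for val_in == 0x00ff0f00, imm1 is 0x00ffff00 (ones from
   the lowest to the highest set bit) and imm2 is 0xffffffffffff0fff, and
   both are valid bitmask immediates, so an AND with 0x00ff0f00 can be
   split into two AND instructions even though 0x00ff0f00 itself cannot be
   encoded directly.  */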
4014 /* Return true if val is an immediate that can be loaded into a
4015 register in a single instruction. */
4016 bool
4017 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4019 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4020 return 1;
4021 return aarch64_bitmask_imm (val, mode);
4024 static bool
4025 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4027 rtx base, offset;
4029 if (GET_CODE (x) == HIGH)
4030 return true;
4032 split_const (x, &base, &offset);
4033 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4035 if (aarch64_classify_symbol (base, offset)
4036 != SYMBOL_FORCE_TO_MEM)
4037 return true;
4038 else
4039 /* Avoid generating a 64-bit relocation in ILP32; leave
4040 to aarch64_expand_mov_immediate to handle it properly. */
4041 return mode != ptr_mode;
4044 return aarch64_tls_referenced_p (x);
4047 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4048 The expansion for a table switch is quite expensive due to the number
4049 of instructions, the table lookup and the hard-to-predict indirect jump.
4050 When optimizing for speed at -O3 or higher, use the per-core tuning if
4051 set, otherwise use tables for more than 16 cases as a tradeoff between
4052 size and performance. When optimizing for size, use the default setting. */
4054 static unsigned int
4055 aarch64_case_values_threshold (void)
4057 /* Use the specified limit for the number of cases before using jump
4058 tables at higher optimization levels. */
4059 if (optimize > 2
4060 && selected_cpu->tune->max_case_values != 0)
4061 return selected_cpu->tune->max_case_values;
4062 else
4063 return optimize_size ? default_case_values_threshold () : 17;
4066 /* Return true if register REGNO is a valid index register.
4067 STRICT_P is true if REG_OK_STRICT is in effect. */
4069 bool
4070 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4072 if (!HARD_REGISTER_NUM_P (regno))
4074 if (!strict_p)
4075 return true;
4077 if (!reg_renumber)
4078 return false;
4080 regno = reg_renumber[regno];
4082 return GP_REGNUM_P (regno);
4085 /* Return true if register REGNO is a valid base register for mode MODE.
4086 STRICT_P is true if REG_OK_STRICT is in effect. */
4088 bool
4089 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4091 if (!HARD_REGISTER_NUM_P (regno))
4093 if (!strict_p)
4094 return true;
4096 if (!reg_renumber)
4097 return false;
4099 regno = reg_renumber[regno];
4102 /* The fake registers will be eliminated to either the stack or
4103 hard frame pointer, both of which are usually valid base registers.
4104 Reload deals with the cases where the eliminated form isn't valid. */
4105 return (GP_REGNUM_P (regno)
4106 || regno == SP_REGNUM
4107 || regno == FRAME_POINTER_REGNUM
4108 || regno == ARG_POINTER_REGNUM);
4111 /* Return true if X is a valid base register for mode MODE.
4112 STRICT_P is true if REG_OK_STRICT is in effect. */
4114 static bool
4115 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4117 if (!strict_p && GET_CODE (x) == SUBREG)
4118 x = SUBREG_REG (x);
4120 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4123 /* Return true if address offset is a valid index. If it is, fill in INFO
4124 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4126 static bool
4127 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4128 machine_mode mode, bool strict_p)
4130 enum aarch64_address_type type;
4131 rtx index;
4132 int shift;
4134 /* (reg:P) */
4135 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4136 && GET_MODE (x) == Pmode)
4138 type = ADDRESS_REG_REG;
4139 index = x;
4140 shift = 0;
4142 /* (sign_extend:DI (reg:SI)) */
4143 else if ((GET_CODE (x) == SIGN_EXTEND
4144 || GET_CODE (x) == ZERO_EXTEND)
4145 && GET_MODE (x) == DImode
4146 && GET_MODE (XEXP (x, 0)) == SImode)
4148 type = (GET_CODE (x) == SIGN_EXTEND)
4149 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4150 index = XEXP (x, 0);
4151 shift = 0;
4153 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4154 else if (GET_CODE (x) == MULT
4155 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4156 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4157 && GET_MODE (XEXP (x, 0)) == DImode
4158 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4159 && CONST_INT_P (XEXP (x, 1)))
4161 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4162 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4163 index = XEXP (XEXP (x, 0), 0);
4164 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4166 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4167 else if (GET_CODE (x) == ASHIFT
4168 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4169 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4170 && GET_MODE (XEXP (x, 0)) == DImode
4171 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4172 && CONST_INT_P (XEXP (x, 1)))
4174 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4175 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4176 index = XEXP (XEXP (x, 0), 0);
4177 shift = INTVAL (XEXP (x, 1));
4179 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4180 else if ((GET_CODE (x) == SIGN_EXTRACT
4181 || GET_CODE (x) == ZERO_EXTRACT)
4182 && GET_MODE (x) == DImode
4183 && GET_CODE (XEXP (x, 0)) == MULT
4184 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4185 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4187 type = (GET_CODE (x) == SIGN_EXTRACT)
4188 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4189 index = XEXP (XEXP (x, 0), 0);
4190 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4191 if (INTVAL (XEXP (x, 1)) != 32 + shift
4192 || INTVAL (XEXP (x, 2)) != 0)
4193 shift = -1;
4195 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4196 (const_int 0xffffffff<<shift)) */
4197 else if (GET_CODE (x) == AND
4198 && GET_MODE (x) == DImode
4199 && GET_CODE (XEXP (x, 0)) == MULT
4200 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4201 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4202 && CONST_INT_P (XEXP (x, 1)))
4204 type = ADDRESS_REG_UXTW;
4205 index = XEXP (XEXP (x, 0), 0);
4206 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4207 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4208 shift = -1;
4210 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4211 else if ((GET_CODE (x) == SIGN_EXTRACT
4212 || GET_CODE (x) == ZERO_EXTRACT)
4213 && GET_MODE (x) == DImode
4214 && GET_CODE (XEXP (x, 0)) == ASHIFT
4215 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4216 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4218 type = (GET_CODE (x) == SIGN_EXTRACT)
4219 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4220 index = XEXP (XEXP (x, 0), 0);
4221 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4222 if (INTVAL (XEXP (x, 1)) != 32 + shift
4223 || INTVAL (XEXP (x, 2)) != 0)
4224 shift = -1;
4226 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4227 (const_int 0xffffffff<<shift)) */
4228 else if (GET_CODE (x) == AND
4229 && GET_MODE (x) == DImode
4230 && GET_CODE (XEXP (x, 0)) == ASHIFT
4231 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4232 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4233 && CONST_INT_P (XEXP (x, 1)))
4235 type = ADDRESS_REG_UXTW;
4236 index = XEXP (XEXP (x, 0), 0);
4237 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4238 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4239 shift = -1;
4241 /* (mult:P (reg:P) (const_int scale)) */
4242 else if (GET_CODE (x) == MULT
4243 && GET_MODE (x) == Pmode
4244 && GET_MODE (XEXP (x, 0)) == Pmode
4245 && CONST_INT_P (XEXP (x, 1)))
4247 type = ADDRESS_REG_REG;
4248 index = XEXP (x, 0);
4249 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4251 /* (ashift:P (reg:P) (const_int shift)) */
4252 else if (GET_CODE (x) == ASHIFT
4253 && GET_MODE (x) == Pmode
4254 && GET_MODE (XEXP (x, 0)) == Pmode
4255 && CONST_INT_P (XEXP (x, 1)))
4257 type = ADDRESS_REG_REG;
4258 index = XEXP (x, 0);
4259 shift = INTVAL (XEXP (x, 1));
4261 else
4262 return false;
4264 if (GET_CODE (index) == SUBREG)
4265 index = SUBREG_REG (index);
4267 if ((shift == 0 ||
4268 (shift > 0 && shift <= 3
4269 && (1 << shift) == GET_MODE_SIZE (mode)))
4270 && REG_P (index)
4271 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4273 info->type = type;
4274 info->offset = index;
4275 info->shift = shift;
4276 return true;
4279 return false;
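/* For example, with a DImode access the index expression (register numbers
   purely illustrative)

     (ashift:DI (sign_extend:DI (reg:SI w1)) (const_int 3))

   is classified as ADDRESS_REG_SXTW with shift 3 (the shift matches the
   8-byte access size), which is later printed as something like
   [xN, w1, sxtw 3].  */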
4282 /* Return true if MODE is one of the modes for which we
4283 support LDP/STP operations. */
4285 static bool
4286 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4288 return mode == SImode || mode == DImode
4289 || mode == SFmode || mode == DFmode
4290 || (aarch64_vector_mode_supported_p (mode)
4291 && GET_MODE_SIZE (mode) == 8);
4294 /* Return true if REGNO is a virtual pointer register, or an eliminable
4295 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4296 include stack_pointer or hard_frame_pointer. */
4297 static bool
4298 virt_or_elim_regno_p (unsigned regno)
4300 return ((regno >= FIRST_VIRTUAL_REGISTER
4301 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4302 || regno == FRAME_POINTER_REGNUM
4303 || regno == ARG_POINTER_REGNUM);
4306 /* Return true if X is a valid address for machine mode MODE. If it is,
4307 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4308 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4310 static bool
4311 aarch64_classify_address (struct aarch64_address_info *info,
4312 rtx x, machine_mode mode,
4313 RTX_CODE outer_code, bool strict_p)
4315 enum rtx_code code = GET_CODE (x);
4316 rtx op0, op1;
4318 /* On BE, we use load/store pair for all large int mode load/stores.
4319 TI/TFmode may also use a load/store pair. */
4320 bool load_store_pair_p = (outer_code == PARALLEL
4321 || mode == TImode
4322 || mode == TFmode
4323 || (BYTES_BIG_ENDIAN
4324 && aarch64_vect_struct_mode_p (mode)));
4326 bool allow_reg_index_p =
4327 !load_store_pair_p
4328 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4329 && !aarch64_vect_struct_mode_p (mode);
4331 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4332 REG addressing. */
4333 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4334 && (code != POST_INC && code != REG))
4335 return false;
4337 switch (code)
4339 case REG:
4340 case SUBREG:
4341 info->type = ADDRESS_REG_IMM;
4342 info->base = x;
4343 info->offset = const0_rtx;
4344 return aarch64_base_register_rtx_p (x, strict_p);
4346 case PLUS:
4347 op0 = XEXP (x, 0);
4348 op1 = XEXP (x, 1);
4350 if (! strict_p
4351 && REG_P (op0)
4352 && virt_or_elim_regno_p (REGNO (op0))
4353 && CONST_INT_P (op1))
4355 info->type = ADDRESS_REG_IMM;
4356 info->base = op0;
4357 info->offset = op1;
4359 return true;
4362 if (GET_MODE_SIZE (mode) != 0
4363 && CONST_INT_P (op1)
4364 && aarch64_base_register_rtx_p (op0, strict_p))
4366 HOST_WIDE_INT offset = INTVAL (op1);
4368 info->type = ADDRESS_REG_IMM;
4369 info->base = op0;
4370 info->offset = op1;
4372 /* TImode and TFmode values are allowed in both pairs of X
4373 registers and individual Q registers. The available
4374 address modes are:
4375 X,X: 7-bit signed scaled offset
4376 Q: 9-bit signed offset
4377 We conservatively require an offset representable in either mode.
4378 When performing the check for pairs of X registers i.e. LDP/STP
4379 pass down DImode since that is the natural size of the LDP/STP
4380 instruction memory accesses. */
4381 if (mode == TImode || mode == TFmode)
4382 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4383 && (offset_9bit_signed_unscaled_p (mode, offset)
4384 || offset_12bit_unsigned_scaled_p (mode, offset)));
4386 /* A 7-bit offset check because OImode will emit an ldp/stp
4387 instruction (only big endian will get here).
4388 For ldp/stp instructions, the offset is scaled for the size of a
4389 single element of the pair. */
4390 if (mode == OImode)
4391 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4393 /* Three 9/12-bit offset checks because CImode will emit three
4394 ldr/str instructions (only big endian will get here). */
4395 if (mode == CImode)
4396 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4397 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4398 || offset_12bit_unsigned_scaled_p (V16QImode,
4399 offset + 32)));
4401 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4402 instructions (only big endian will get here). */
4403 if (mode == XImode)
4404 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4405 && aarch64_offset_7bit_signed_scaled_p (TImode,
4406 offset + 32));
4408 if (load_store_pair_p)
4409 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4410 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4411 else
4412 return (offset_9bit_signed_unscaled_p (mode, offset)
4413 || offset_12bit_unsigned_scaled_p (mode, offset));
4416 if (allow_reg_index_p)
4418 /* Look for base + (scaled/extended) index register. */
4419 if (aarch64_base_register_rtx_p (op0, strict_p)
4420 && aarch64_classify_index (info, op1, mode, strict_p))
4422 info->base = op0;
4423 return true;
4425 if (aarch64_base_register_rtx_p (op1, strict_p)
4426 && aarch64_classify_index (info, op0, mode, strict_p))
4428 info->base = op1;
4429 return true;
4433 return false;
4435 case POST_INC:
4436 case POST_DEC:
4437 case PRE_INC:
4438 case PRE_DEC:
4439 info->type = ADDRESS_REG_WB;
4440 info->base = XEXP (x, 0);
4441 info->offset = NULL_RTX;
4442 return aarch64_base_register_rtx_p (info->base, strict_p);
4444 case POST_MODIFY:
4445 case PRE_MODIFY:
4446 info->type = ADDRESS_REG_WB;
4447 info->base = XEXP (x, 0);
4448 if (GET_CODE (XEXP (x, 1)) == PLUS
4449 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4450 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4451 && aarch64_base_register_rtx_p (info->base, strict_p))
4453 HOST_WIDE_INT offset;
4454 info->offset = XEXP (XEXP (x, 1), 1);
4455 offset = INTVAL (info->offset);
4457 /* TImode and TFmode values are allowed in both pairs of X
4458 registers and individual Q registers. The available
4459 address modes are:
4460 X,X: 7-bit signed scaled offset
4461 Q: 9-bit signed offset
4462 We conservatively require an offset representable in either mode.  */
4464 if (mode == TImode || mode == TFmode)
4465 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4466 && offset_9bit_signed_unscaled_p (mode, offset));
4468 if (load_store_pair_p)
4469 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4470 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4471 else
4472 return offset_9bit_signed_unscaled_p (mode, offset);
4474 return false;
4476 case CONST:
4477 case SYMBOL_REF:
4478 case LABEL_REF:
4479 /* load literal: pc-relative constant pool entry. Only supported
4480 for SI mode or larger. */
4481 info->type = ADDRESS_SYMBOLIC;
4483 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4485 rtx sym, addend;
4487 split_const (x, &sym, &addend);
4488 return ((GET_CODE (sym) == LABEL_REF
4489 || (GET_CODE (sym) == SYMBOL_REF
4490 && CONSTANT_POOL_ADDRESS_P (sym)
4491 && aarch64_pcrelative_literal_loads)));
4493 return false;
4495 case LO_SUM:
4496 info->type = ADDRESS_LO_SUM;
4497 info->base = XEXP (x, 0);
4498 info->offset = XEXP (x, 1);
4499 if (allow_reg_index_p
4500 && aarch64_base_register_rtx_p (info->base, strict_p))
4502 rtx sym, offs;
4503 split_const (info->offset, &sym, &offs);
4504 if (GET_CODE (sym) == SYMBOL_REF
4505 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4507 /* The symbol and offset must be aligned to the access size. */
4508 unsigned int align;
4509 unsigned int ref_size;
4511 if (CONSTANT_POOL_ADDRESS_P (sym))
4512 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4513 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4515 tree exp = SYMBOL_REF_DECL (sym);
4516 align = TYPE_ALIGN (TREE_TYPE (exp));
4517 align = CONSTANT_ALIGNMENT (exp, align);
4519 else if (SYMBOL_REF_DECL (sym))
4520 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4521 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4522 && SYMBOL_REF_BLOCK (sym) != NULL)
4523 align = SYMBOL_REF_BLOCK (sym)->alignment;
4524 else
4525 align = BITS_PER_UNIT;
4527 ref_size = GET_MODE_SIZE (mode);
4528 if (ref_size == 0)
4529 ref_size = GET_MODE_SIZE (DImode);
4531 return ((INTVAL (offs) & (ref_size - 1)) == 0
4532 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4535 return false;
4537 default:
4538 return false;
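/* A few examples of the classification (register numbers illustrative):
   (reg:DI x1) and (plus:DI (reg:DI x1) (const_int 16)) are ADDRESS_REG_IMM;
   (plus:DI (reg:DI x1) (mult:DI (reg:DI x2) (const_int 8))) for a DImode
   access is ADDRESS_REG_REG with shift 3; (post_inc:DI (reg:DI x1)) is
   ADDRESS_REG_WB; and a LO_SUM of a base register and a small-absolute
   symbol is ADDRESS_LO_SUM, printed as [x1, #:lo12:sym].  */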
4542 bool
4543 aarch64_symbolic_address_p (rtx x)
4545 rtx offset;
4547 split_const (x, &x, &offset);
4548 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4551 /* Classify the base of symbolic expression X. */
4553 enum aarch64_symbol_type
4554 aarch64_classify_symbolic_expression (rtx x)
4556 rtx offset;
4558 split_const (x, &x, &offset);
4559 return aarch64_classify_symbol (x, offset);
4563 /* Return TRUE if X is a legitimate address for accessing memory in
4564 mode MODE. */
4565 static bool
4566 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4568 struct aarch64_address_info addr;
4570 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4573 /* Return TRUE if X is a legitimate address for accessing memory in
4574 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4575 pair operation. */
4576 bool
4577 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4578 RTX_CODE outer_code, bool strict_p)
4580 struct aarch64_address_info addr;
4582 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4585 /* Split an out-of-range address displacement into a base and offset.
4586 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4587 to increase opportunities for sharing the base address across different access sizes.
4588 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4589 static bool
4590 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4592 HOST_WIDE_INT offset = INTVAL (*disp);
4593 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4595 if (mode == TImode || mode == TFmode
4596 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4597 base = (offset + 0x100) & ~0x1ff;
4599 *off = GEN_INT (base);
4600 *disp = GEN_INT (offset - base);
4601 return true;
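/* Worked example: a 4-byte access at offset 0x4204 is split into an anchor
   of 0x4000 plus a residual offset of 0x204, while a 1-byte access at
   0x1234 splits into 0x1000 + 0x234; an unaligned or TImode/TFmode offset
   such as 0x2345 instead uses the signed 9-bit form, 0x2400 + (-0xbb).  */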
4604 /* Return TRUE if rtx X is immediate constant 0.0 */
4605 bool
4606 aarch64_float_const_zero_rtx_p (rtx x)
4608 if (GET_MODE (x) == VOIDmode)
4609 return false;
4611 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4612 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4613 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4616 /* Return the fixed registers used for condition codes. */
4618 static bool
4619 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4621 *p1 = CC_REGNUM;
4622 *p2 = INVALID_REGNUM;
4623 return true;
4626 /* Emit call insn with PAT and do aarch64-specific handling. */
4628 void
4629 aarch64_emit_call_insn (rtx pat)
4631 rtx insn = emit_call_insn (pat);
4633 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4634 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4635 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4638 machine_mode
4639 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4641 /* All floating point compares return CCFP if it is an equality
4642 comparison, and CCFPE otherwise. */
4643 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4645 switch (code)
4647 case EQ:
4648 case NE:
4649 case UNORDERED:
4650 case ORDERED:
4651 case UNLT:
4652 case UNLE:
4653 case UNGT:
4654 case UNGE:
4655 case UNEQ:
4656 case LTGT:
4657 return CCFPmode;
4659 case LT:
4660 case LE:
4661 case GT:
4662 case GE:
4663 return CCFPEmode;
4665 default:
4666 gcc_unreachable ();
4670 /* Equality comparisons of short modes against zero can be performed
4671 using the TST instruction with the appropriate bitmask. */
4672 if (y == const0_rtx && REG_P (x)
4673 && (code == EQ || code == NE)
4674 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4675 return CC_NZmode;
4677 /* Similarly, comparisons of zero_extends from shorter modes can
4678 be performed using an ANDS with an immediate mask. */
4679 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4680 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4681 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4682 && (code == EQ || code == NE))
4683 return CC_NZmode;
4685 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4686 && y == const0_rtx
4687 && (code == EQ || code == NE || code == LT || code == GE)
4688 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4689 || GET_CODE (x) == NEG
4690 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4691 && CONST_INT_P (XEXP (x, 2)))))
4692 return CC_NZmode;
4694 /* A compare with a shifted operand. Because of canonicalization,
4695 the comparison will have to be swapped when we emit the assembly
4696 code. */
4697 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4698 && (REG_P (y) || GET_CODE (y) == SUBREG)
4699 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4700 || GET_CODE (x) == LSHIFTRT
4701 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4702 return CC_SWPmode;
4704 /* Similarly for a negated operand, but we can only do this for
4705 equalities. */
4706 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4707 && (REG_P (y) || GET_CODE (y) == SUBREG)
4708 && (code == EQ || code == NE)
4709 && GET_CODE (x) == NEG)
4710 return CC_Zmode;
4712 /* A test for unsigned overflow. */
4713 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4714 && code == NE
4715 && GET_CODE (x) == PLUS
4716 && GET_CODE (y) == ZERO_EXTEND)
4717 return CC_Cmode;
4719 /* For everything else, return CCmode. */
4720 return CCmode;
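/* For illustration: an integer compare of two registers gets plain CCmode;
   comparing (ashift:DI (reg) (const_int 2)) against a register gets
   CC_SWPmode because the operands must be swapped when the compare is
   output; comparing a PLUS against zero for EQ/NE/LT/GE gets CC_NZmode so
   the flag-setting adds form can be used; and floating-point EQ/NE use
   CCFPmode (fcmp) while LT/LE/GT/GE use CCFPEmode (fcmpe, which signals
   on NaNs).  */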
4723 static int
4724 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4727 aarch64_get_condition_code (rtx x)
4729 machine_mode mode = GET_MODE (XEXP (x, 0));
4730 enum rtx_code comp_code = GET_CODE (x);
4732 if (GET_MODE_CLASS (mode) != MODE_CC)
4733 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4734 return aarch64_get_condition_code_1 (mode, comp_code);
4737 static int
4738 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4740 switch (mode)
4742 case CCFPmode:
4743 case CCFPEmode:
4744 switch (comp_code)
4746 case GE: return AARCH64_GE;
4747 case GT: return AARCH64_GT;
4748 case LE: return AARCH64_LS;
4749 case LT: return AARCH64_MI;
4750 case NE: return AARCH64_NE;
4751 case EQ: return AARCH64_EQ;
4752 case ORDERED: return AARCH64_VC;
4753 case UNORDERED: return AARCH64_VS;
4754 case UNLT: return AARCH64_LT;
4755 case UNLE: return AARCH64_LE;
4756 case UNGT: return AARCH64_HI;
4757 case UNGE: return AARCH64_PL;
4758 default: return -1;
4760 break;
4762 case CCmode:
4763 switch (comp_code)
4765 case NE: return AARCH64_NE;
4766 case EQ: return AARCH64_EQ;
4767 case GE: return AARCH64_GE;
4768 case GT: return AARCH64_GT;
4769 case LE: return AARCH64_LE;
4770 case LT: return AARCH64_LT;
4771 case GEU: return AARCH64_CS;
4772 case GTU: return AARCH64_HI;
4773 case LEU: return AARCH64_LS;
4774 case LTU: return AARCH64_CC;
4775 default: return -1;
4777 break;
4779 case CC_SWPmode:
4780 switch (comp_code)
4782 case NE: return AARCH64_NE;
4783 case EQ: return AARCH64_EQ;
4784 case GE: return AARCH64_LE;
4785 case GT: return AARCH64_LT;
4786 case LE: return AARCH64_GE;
4787 case LT: return AARCH64_GT;
4788 case GEU: return AARCH64_LS;
4789 case GTU: return AARCH64_CC;
4790 case LEU: return AARCH64_CS;
4791 case LTU: return AARCH64_HI;
4792 default: return -1;
4794 break;
4796 case CC_NZmode:
4797 switch (comp_code)
4799 case NE: return AARCH64_NE;
4800 case EQ: return AARCH64_EQ;
4801 case GE: return AARCH64_PL;
4802 case LT: return AARCH64_MI;
4803 default: return -1;
4805 break;
4807 case CC_Zmode:
4808 switch (comp_code)
4810 case NE: return AARCH64_NE;
4811 case EQ: return AARCH64_EQ;
4812 default: return -1;
4814 break;
4816 case CC_Cmode:
4817 switch (comp_code)
4819 case NE: return AARCH64_CS;
4820 case EQ: return AARCH64_CC;
4821 default: return -1;
4823 break;
4825 default:
4826 return -1;
4829 return -1;
4832 bool
4833 aarch64_const_vec_all_same_in_range_p (rtx x,
4834 HOST_WIDE_INT minval,
4835 HOST_WIDE_INT maxval)
4837 HOST_WIDE_INT firstval;
4838 int count, i;
4840 if (GET_CODE (x) != CONST_VECTOR
4841 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4842 return false;
4844 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4845 if (firstval < minval || firstval > maxval)
4846 return false;
4848 count = CONST_VECTOR_NUNITS (x);
4849 for (i = 1; i < count; i++)
4850 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4851 return false;
4853 return true;
4856 bool
4857 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4859 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4863 /* N Z C V. */
4864 #define AARCH64_CC_V 1
4865 #define AARCH64_CC_C (1 << 1)
4866 #define AARCH64_CC_Z (1 << 2)
4867 #define AARCH64_CC_N (1 << 3)
4869 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4870 static const int aarch64_nzcv_codes[] =
4872 0, /* EQ, Z == 1. */
4873 AARCH64_CC_Z, /* NE, Z == 0. */
4874 0, /* CS, C == 1. */
4875 AARCH64_CC_C, /* CC, C == 0. */
4876 0, /* MI, N == 1. */
4877 AARCH64_CC_N, /* PL, N == 0. */
4878 0, /* VS, V == 1. */
4879 AARCH64_CC_V, /* VC, V == 0. */
4880 0, /* HI, C == 1 && Z == 0. */
4881 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4882 AARCH64_CC_V, /* GE, N == V. */
4883 0, /* LT, N != V. */
4884 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4885 0, /* LE, !(Z == 0 && N == V). */
4886 0, /* AL, Any. */
4887 0 /* NV, Any. */
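/* The 'k' output modifier below prints aarch64_nzcv_codes[cond] for a
   conditional compare; the bit weights are N=8, Z=4, C=2, V=1, so for
   example the GE entry prints 1 (V set, making N != V) and the NE entry
   prints 4 (Z set).  Each entry appears to encode a flag state under which
   the named condition is false.  */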
4890 static void
4891 aarch64_print_operand (FILE *f, rtx x, int code)
4893 switch (code)
4895 /* An integer or symbol address without a preceding # sign. */
4896 case 'c':
4897 switch (GET_CODE (x))
4899 case CONST_INT:
4900 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4901 break;
4903 case SYMBOL_REF:
4904 output_addr_const (f, x);
4905 break;
4907 case CONST:
4908 if (GET_CODE (XEXP (x, 0)) == PLUS
4909 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4911 output_addr_const (f, x);
4912 break;
4914 /* Fall through. */
4916 default:
4917 output_operand_lossage ("Unsupported operand for code '%c'", code);
4919 break;
4921 case 'e':
4922 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4924 int n;
4926 if (!CONST_INT_P (x)
4927 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4929 output_operand_lossage ("invalid operand for '%%%c'", code);
4930 return;
4933 switch (n)
4935 case 3:
4936 fputc ('b', f);
4937 break;
4938 case 4:
4939 fputc ('h', f);
4940 break;
4941 case 5:
4942 fputc ('w', f);
4943 break;
4944 default:
4945 output_operand_lossage ("invalid operand for '%%%c'", code);
4946 return;
4949 break;
4951 case 'p':
4953 int n;
4955 /* Print N such that 2^N == X. */
4956 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4958 output_operand_lossage ("invalid operand for '%%%c'", code);
4959 return;
4962 asm_fprintf (f, "%d", n);
4964 break;
4966 case 'P':
4967 /* Print the number of non-zero bits in X (a const_int). */
4968 if (!CONST_INT_P (x))
4970 output_operand_lossage ("invalid operand for '%%%c'", code);
4971 return;
4974 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4975 break;
4977 case 'H':
4978 /* Print the higher numbered register of a pair (TImode) of regs. */
4979 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4981 output_operand_lossage ("invalid operand for '%%%c'", code);
4982 return;
4985 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4986 break;
4988 case 'M':
4989 case 'm':
4991 int cond_code;
4992 /* Print a condition (eq, ne, etc) or its inverse. */
4994 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4995 if (x == const_true_rtx)
4997 if (code == 'M')
4998 fputs ("nv", f);
4999 return;
5002 if (!COMPARISON_P (x))
5004 output_operand_lossage ("invalid operand for '%%%c'", code);
5005 return;
5008 cond_code = aarch64_get_condition_code (x);
5009 gcc_assert (cond_code >= 0);
5010 if (code == 'M')
5011 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5012 fputs (aarch64_condition_codes[cond_code], f);
5014 break;
5016 case 'b':
5017 case 'h':
5018 case 's':
5019 case 'd':
5020 case 'q':
5021 /* Print a scalar FP/SIMD register name. */
5022 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5024 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5025 return;
5027 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5028 break;
5030 case 'S':
5031 case 'T':
5032 case 'U':
5033 case 'V':
5034 /* Print the first FP/SIMD register name in a list. */
5035 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5037 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5038 return;
5040 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5041 break;
5043 case 'R':
5044 /* Print a scalar FP/SIMD register name + 1. */
5045 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5047 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5048 return;
5050 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5051 break;
5053 case 'X':
5054 /* Print bottom 16 bits of integer constant in hex. */
5055 if (!CONST_INT_P (x))
5057 output_operand_lossage ("invalid operand for '%%%c'", code);
5058 return;
5060 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5061 break;
5063 case 'w':
5064 case 'x':
5065 /* Print a general register name or the zero register (32-bit or
5066 64-bit). */
5067 if (x == const0_rtx
5068 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5070 asm_fprintf (f, "%czr", code);
5071 break;
5074 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5076 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5077 break;
5080 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5082 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5083 break;
5086 /* Fall through */
5088 case 0:
5089 /* Print a normal operand, if it's a general register, then we
5090 assume DImode. */
5091 if (x == NULL)
5093 output_operand_lossage ("missing operand");
5094 return;
5097 switch (GET_CODE (x))
5099 case REG:
5100 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5101 break;
5103 case MEM:
5104 output_address (GET_MODE (x), XEXP (x, 0));
5105 break;
5107 case CONST:
5108 case LABEL_REF:
5109 case SYMBOL_REF:
5110 output_addr_const (asm_out_file, x);
5111 break;
5113 case CONST_INT:
5114 asm_fprintf (f, "%wd", INTVAL (x));
5115 break;
5117 case CONST_VECTOR:
5118 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5120 gcc_assert (
5121 aarch64_const_vec_all_same_in_range_p (x,
5122 HOST_WIDE_INT_MIN,
5123 HOST_WIDE_INT_MAX));
5124 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5126 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5128 fputc ('0', f);
5130 else
5131 gcc_unreachable ();
5132 break;
5134 case CONST_DOUBLE:
5135 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5136 be getting CONST_DOUBLEs holding integers. */
5137 gcc_assert (GET_MODE (x) != VOIDmode);
5138 if (aarch64_float_const_zero_rtx_p (x))
5140 fputc ('0', f);
5141 break;
5143 else if (aarch64_float_const_representable_p (x))
5145 #define buf_size 20
5146 char float_buf[buf_size] = {'\0'};
5147 real_to_decimal_for_mode (float_buf,
5148 CONST_DOUBLE_REAL_VALUE (x),
5149 buf_size, buf_size,
5150 1, GET_MODE (x));
5151 asm_fprintf (asm_out_file, "%s", float_buf);
5152 break;
5153 #undef buf_size
5155 output_operand_lossage ("invalid constant");
5156 return;
5157 default:
5158 output_operand_lossage ("invalid operand");
5159 return;
5161 break;
5163 case 'A':
5164 if (GET_CODE (x) == HIGH)
5165 x = XEXP (x, 0);
5167 switch (aarch64_classify_symbolic_expression (x))
5169 case SYMBOL_SMALL_GOT_4G:
5170 asm_fprintf (asm_out_file, ":got:");
5171 break;
5173 case SYMBOL_SMALL_TLSGD:
5174 asm_fprintf (asm_out_file, ":tlsgd:");
5175 break;
5177 case SYMBOL_SMALL_TLSDESC:
5178 asm_fprintf (asm_out_file, ":tlsdesc:");
5179 break;
5181 case SYMBOL_SMALL_TLSIE:
5182 asm_fprintf (asm_out_file, ":gottprel:");
5183 break;
5185 case SYMBOL_TLSLE24:
5186 asm_fprintf (asm_out_file, ":tprel:");
5187 break;
5189 case SYMBOL_TINY_GOT:
5190 gcc_unreachable ();
5191 break;
5193 default:
5194 break;
5196 output_addr_const (asm_out_file, x);
5197 break;
5199 case 'L':
5200 switch (aarch64_classify_symbolic_expression (x))
5202 case SYMBOL_SMALL_GOT_4G:
5203 asm_fprintf (asm_out_file, ":lo12:");
5204 break;
5206 case SYMBOL_SMALL_TLSGD:
5207 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5208 break;
5210 case SYMBOL_SMALL_TLSDESC:
5211 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5212 break;
5214 case SYMBOL_SMALL_TLSIE:
5215 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5216 break;
5218 case SYMBOL_TLSLE12:
5219 asm_fprintf (asm_out_file, ":tprel_lo12:");
5220 break;
5222 case SYMBOL_TLSLE24:
5223 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5224 break;
5226 case SYMBOL_TINY_GOT:
5227 asm_fprintf (asm_out_file, ":got:");
5228 break;
5230 case SYMBOL_TINY_TLSIE:
5231 asm_fprintf (asm_out_file, ":gottprel:");
5232 break;
5234 default:
5235 break;
5237 output_addr_const (asm_out_file, x);
5238 break;
5240 case 'G':
5242 switch (aarch64_classify_symbolic_expression (x))
5244 case SYMBOL_TLSLE24:
5245 asm_fprintf (asm_out_file, ":tprel_hi12:");
5246 break;
5247 default:
5248 break;
5250 output_addr_const (asm_out_file, x);
5251 break;
5253 case 'k':
5255 HOST_WIDE_INT cond_code;
5256 /* Print nzcv. */
5258 if (!CONST_INT_P (x))
5260 output_operand_lossage ("invalid operand for '%%%c'", code);
5261 return;
5264 cond_code = INTVAL (x);
5265 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5266 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5268 break;
5270 default:
5271 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5272 return;
5276 static void
5277 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5279 struct aarch64_address_info addr;
5281 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5282 switch (addr.type)
5284 case ADDRESS_REG_IMM:
5285 if (addr.offset == const0_rtx)
5286 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5287 else
5288 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5289 INTVAL (addr.offset));
5290 return;
5292 case ADDRESS_REG_REG:
5293 if (addr.shift == 0)
5294 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5295 reg_names [REGNO (addr.offset)]);
5296 else
5297 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5298 reg_names [REGNO (addr.offset)], addr.shift);
5299 return;
5301 case ADDRESS_REG_UXTW:
5302 if (addr.shift == 0)
5303 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5304 REGNO (addr.offset) - R0_REGNUM);
5305 else
5306 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5307 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5308 return;
5310 case ADDRESS_REG_SXTW:
5311 if (addr.shift == 0)
5312 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5313 REGNO (addr.offset) - R0_REGNUM);
5314 else
5315 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5316 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5317 return;
5319 case ADDRESS_REG_WB:
5320 switch (GET_CODE (x))
5322 case PRE_INC:
5323 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5324 GET_MODE_SIZE (mode));
5325 return;
5326 case POST_INC:
5327 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5328 GET_MODE_SIZE (mode));
5329 return;
5330 case PRE_DEC:
5331 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5332 GET_MODE_SIZE (mode));
5333 return;
5334 case POST_DEC:
5335 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5336 GET_MODE_SIZE (mode));
5337 return;
5338 case PRE_MODIFY:
5339 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5340 INTVAL (addr.offset));
5341 return;
5342 case POST_MODIFY:
5343 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5344 INTVAL (addr.offset));
5345 return;
5346 default:
5347 break;
5349 break;
5351 case ADDRESS_LO_SUM:
5352 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5353 output_addr_const (f, addr.offset);
5354 asm_fprintf (f, "]");
5355 return;
5357 case ADDRESS_SYMBOLIC:
5358 break;
5361 output_addr_const (f, x);
5364 bool
5365 aarch64_label_mentioned_p (rtx x)
5367 const char *fmt;
5368 int i;
5370 if (GET_CODE (x) == LABEL_REF)
5371 return true;
5373 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5374 referencing instruction, but they are constant offsets, not
5375 symbols. */
5376 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5377 return false;
5379 fmt = GET_RTX_FORMAT (GET_CODE (x));
5380 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5382 if (fmt[i] == 'E')
5384 int j;
5386 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5387 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5388 return 1;
5390 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5391 return 1;
5394 return 0;
5397 /* Implement REGNO_REG_CLASS. */
5399 enum reg_class
5400 aarch64_regno_regclass (unsigned regno)
5402 if (GP_REGNUM_P (regno))
5403 return GENERAL_REGS;
5405 if (regno == SP_REGNUM)
5406 return STACK_REG;
5408 if (regno == FRAME_POINTER_REGNUM
5409 || regno == ARG_POINTER_REGNUM)
5410 return POINTER_REGS;
5412 if (FP_REGNUM_P (regno))
5413 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5415 return NO_REGS;
5418 static rtx
5419 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5421 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5422 where mask is selected by alignment and size of the offset.
5423 We try to pick as large a range for the offset as possible to
5424 maximize the chance of a CSE. However, for aligned addresses
5425 we limit the range to 4k so that structures with different sized
5426 elements are likely to use the same base. We need to be careful
5427 not to split a CONST for some forms of address expression, otherwise
5428 it will generate sub-optimal code. */
5430 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5432 rtx base = XEXP (x, 0);
5433 rtx offset_rtx = XEXP (x, 1);
5434 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5436 if (GET_CODE (base) == PLUS)
5438 rtx op0 = XEXP (base, 0);
5439 rtx op1 = XEXP (base, 1);
5441 /* Force any scaling into a temp for CSE. */
5442 op0 = force_reg (Pmode, op0);
5443 op1 = force_reg (Pmode, op1);
5445 /* Let the pointer register be in op0. */
5446 if (REG_POINTER (op1))
5447 std::swap (op0, op1);
5449 /* If the pointer is virtual or frame related, then we know that
5450 virtual register instantiation or register elimination is going
5451 to apply a second constant. We want the two constants folded
5452 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5453 if (virt_or_elim_regno_p (REGNO (op0)))
5455 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5456 NULL_RTX, true, OPTAB_DIRECT);
5457 return gen_rtx_PLUS (Pmode, base, op1);
5460 /* Otherwise, in order to encourage CSE (and thence loop strength
5461 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5462 base = expand_binop (Pmode, add_optab, op0, op1,
5463 NULL_RTX, true, OPTAB_DIRECT);
5464 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5467 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5468 HOST_WIDE_INT base_offset;
5469 if (GET_MODE_SIZE (mode) > 16)
5470 base_offset = (offset + 0x400) & ~0x7f0;
5471 /* For offsets that aren't a multiple of the access size, the limit is
5472 -256...255. */
5473 else if (offset & (GET_MODE_SIZE (mode) - 1))
5475 base_offset = (offset + 0x100) & ~0x1ff;
5477 /* BLKmode typically uses LDP of X-registers. */
5478 if (mode == BLKmode)
5479 base_offset = (offset + 512) & ~0x3ff;
5481 /* Small negative offsets are supported. */
5482 else if (IN_RANGE (offset, -256, 0))
5483 base_offset = 0;
5484 else if (mode == TImode || mode == TFmode)
5485 base_offset = (offset + 0x100) & ~0x1ff;
5486 /* Use 12-bit offset by access size. */
5487 else
5488 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5490 if (base_offset != 0)
5492 base = plus_constant (Pmode, base, base_offset);
5493 base = force_operand (base, NULL_RTX);
5494 return plus_constant (Pmode, base, offset - base_offset);
5498 return x;
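/* Worked example: for a 4-byte access to x1 + 0x4010 (out of range for a
   scaled 12-bit offset, whose limit is 0x3ffc), the code above rewrites the
   address as (x1 + 0x4000) + 0x10, so the 0x4000 anchor can be CSEd across
   neighbouring accesses and the residual 0x10 fits the LDR/STR offset.  */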
5501 /* Return the reload icode required for a constant pool in mode. */
5502 static enum insn_code
5503 aarch64_constant_pool_reload_icode (machine_mode mode)
5505 switch (mode)
5507 case SFmode:
5508 return CODE_FOR_aarch64_reload_movcpsfdi;
5510 case DFmode:
5511 return CODE_FOR_aarch64_reload_movcpdfdi;
5513 case TFmode:
5514 return CODE_FOR_aarch64_reload_movcptfdi;
5516 case V8QImode:
5517 return CODE_FOR_aarch64_reload_movcpv8qidi;
5519 case V16QImode:
5520 return CODE_FOR_aarch64_reload_movcpv16qidi;
5522 case V4HImode:
5523 return CODE_FOR_aarch64_reload_movcpv4hidi;
5525 case V8HImode:
5526 return CODE_FOR_aarch64_reload_movcpv8hidi;
5528 case V2SImode:
5529 return CODE_FOR_aarch64_reload_movcpv2sidi;
5531 case V4SImode:
5532 return CODE_FOR_aarch64_reload_movcpv4sidi;
5534 case V2DImode:
5535 return CODE_FOR_aarch64_reload_movcpv2didi;
5537 case V2DFmode:
5538 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5540 default:
5541 gcc_unreachable ();
5544 gcc_unreachable ();
5546 static reg_class_t
5547 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5548 reg_class_t rclass,
5549 machine_mode mode,
5550 secondary_reload_info *sri)
5553 /* If we have to disable direct literal pool loads and stores because the
5554 function is too big, then we need a scratch register. */
5555 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5556 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5557 || targetm.vector_mode_supported_p (GET_MODE (x)))
5558 && !aarch64_pcrelative_literal_loads)
5560 sri->icode = aarch64_constant_pool_reload_icode (mode);
5561 return NO_REGS;
5564 /* Without the TARGET_SIMD instructions we cannot move a Q register
5565 to a Q register directly. We need a scratch. */
5566 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5567 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5568 && reg_class_subset_p (rclass, FP_REGS))
5570 if (mode == TFmode)
5571 sri->icode = CODE_FOR_aarch64_reload_movtf;
5572 else if (mode == TImode)
5573 sri->icode = CODE_FOR_aarch64_reload_movti;
5574 return NO_REGS;
5577 /* A TFmode or TImode memory access should be handled via an FP_REGS
5578 because AArch64 has richer addressing modes for LDR/STR instructions
5579 than LDP/STP instructions. */
5580 if (TARGET_FLOAT && rclass == GENERAL_REGS
5581 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5582 return FP_REGS;
5584 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5585 return GENERAL_REGS;
5587 return NO_REGS;
5590 static bool
5591 aarch64_can_eliminate (const int from, const int to)
5593 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5594 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5596 if (frame_pointer_needed)
5598 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5599 return true;
5600 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5601 return false;
5602 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5603 && !cfun->calls_alloca)
5604 return true;
5605 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5606 return true;
5608 return false;
5610 else
5612 /* If we decided that we didn't need a leaf frame pointer but then used
5613 LR in the function, then we'll want a frame pointer after all, so
5614 prevent this elimination to ensure a frame pointer is used. */
5615 if (to == STACK_POINTER_REGNUM
5616 && flag_omit_leaf_frame_pointer
5617 && df_regs_ever_live_p (LR_REGNUM))
5618 return false;
5621 return true;
5624 HOST_WIDE_INT
5625 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5627 aarch64_layout_frame ();
5629 if (to == HARD_FRAME_POINTER_REGNUM)
5631 if (from == ARG_POINTER_REGNUM)
5632 return cfun->machine->frame.hard_fp_offset;
5634 if (from == FRAME_POINTER_REGNUM)
5635 return cfun->machine->frame.hard_fp_offset
5636 - cfun->machine->frame.locals_offset;
5639 if (to == STACK_POINTER_REGNUM)
5641 if (from == FRAME_POINTER_REGNUM)
5642 return cfun->machine->frame.frame_size
5643 - cfun->machine->frame.locals_offset;
5646 return cfun->machine->frame.frame_size;
5649 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5650 previous frame. */
5653 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5655 if (count != 0)
5656 return const0_rtx;
5657 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5661 static void
5662 aarch64_asm_trampoline_template (FILE *f)
5664 if (TARGET_ILP32)
5666 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5667 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5669 else
5671 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5672 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5674 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5675 assemble_aligned_integer (4, const0_rtx);
5676 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5677 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5680 static void
5681 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5683 rtx fnaddr, mem, a_tramp;
5684 const int tramp_code_sz = 16;
5686 /* No need to copy the trailing D-words; we fill those in below. */
5687 emit_block_move (m_tramp, assemble_trampoline_template (),
5688 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5689 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5690 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5691 if (GET_MODE (fnaddr) != ptr_mode)
5692 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5693 emit_move_insn (mem, fnaddr);
5695 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5696 emit_move_insn (mem, chain_value);
5698 /* XXX We should really define a "clear_cache" pattern and use
5699 gen_clear_cache(). */
5700 a_tramp = XEXP (m_tramp, 0);
5701 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5702 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5703 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5704 ptr_mode);
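/* The resulting LP64 trampoline is 16 bytes of code followed by two
   pointer-sized slots that aarch64_trampoline_init fills in at run time,
   roughly:

     ldr  x17, .+16        // loads the target function address
     ldr  x18, .+20        // loads the static chain value
     br   x17
     .word 0               // pads the code block out to 16 bytes
     <8-byte slot: target function address>
     <8-byte slot: static chain value>  */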
5707 static unsigned char
5708 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5710 switch (regclass)
5712 case CALLER_SAVE_REGS:
5713 case POINTER_REGS:
5714 case GENERAL_REGS:
5715 case ALL_REGS:
5716 case FP_REGS:
5717 case FP_LO_REGS:
5718 return
5719 aarch64_vector_mode_p (mode)
5720 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5721 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5722 case STACK_REG:
5723 return 1;
5725 case NO_REGS:
5726 return 0;
5728 default:
5729 break;
5731 gcc_unreachable ();
5734 static reg_class_t
5735 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5737 if (regclass == POINTER_REGS)
5738 return GENERAL_REGS;
5740 if (regclass == STACK_REG)
5742 if (REG_P(x)
5743 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5744 return regclass;
5746 return NO_REGS;
5749 /* If it's an integer immediate that MOVI can't handle, then
5750 FP_REGS is not an option, so we return NO_REGS instead. */
5751 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5752 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5753 return NO_REGS;
5755 /* Register elimination can result in a request for
5756 SP+constant->FP_REGS. We cannot support such operations, which
5757 use SP as source and an FP_REG as destination, so reject them
5758 outright here. */
5759 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5761 rtx lhs = XEXP (x, 0);
5763 /* Look through a possible SUBREG introduced by ILP32. */
5764 if (GET_CODE (lhs) == SUBREG)
5765 lhs = SUBREG_REG (lhs);
5767 gcc_assert (REG_P (lhs));
5768 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5769 POINTER_REGS));
5770 return NO_REGS;
5773 return regclass;
5776 void
5777 aarch64_asm_output_labelref (FILE* f, const char *name)
5779 asm_fprintf (f, "%U%s", name);
5782 static void
5783 aarch64_elf_asm_constructor (rtx symbol, int priority)
5785 if (priority == DEFAULT_INIT_PRIORITY)
5786 default_ctor_section_asm_out_constructor (symbol, priority);
5787 else
5789 section *s;
5790 /* Priority is known to be in the range [0, 65535], so 18 bytes
5791 would be enough, but the compiler might not know that. To avoid
5792 a -Wformat-truncation false positive, use a larger size. */
5793 char buf[23];
5794 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5795 s = get_section (buf, SECTION_WRITE, NULL);
5796 switch_to_section (s);
5797 assemble_align (POINTER_SIZE);
5798 assemble_aligned_integer (POINTER_BYTES, symbol);
5802 static void
5803 aarch64_elf_asm_destructor (rtx symbol, int priority)
5805 if (priority == DEFAULT_INIT_PRIORITY)
5806 default_dtor_section_asm_out_destructor (symbol, priority);
5807 else
5809 section *s;
5810 /* Priority is known to be in the range [0, 65535], so 18 bytes
5811 would be enough, but the compiler might not know that. To avoid
5812 a -Wformat-truncation false positive, use a larger size. */
5813 char buf[23];
5814 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5815 s = get_section (buf, SECTION_WRITE, NULL);
5816 switch_to_section (s);
5817 assemble_align (POINTER_SIZE);
5818 assemble_aligned_integer (POINTER_BYTES, symbol);
5822 const char*
5823 aarch64_output_casesi (rtx *operands)
5825 char buf[100];
5826 char label[100];
5827 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5828 int index;
5829 static const char *const patterns[4][2] =
5832 "ldrb\t%w3, [%0,%w1,uxtw]",
5833 "add\t%3, %4, %w3, sxtb #2"
5836 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5837 "add\t%3, %4, %w3, sxth #2"
5840 "ldr\t%w3, [%0,%w1,uxtw #2]",
5841 "add\t%3, %4, %w3, sxtw #2"
5843 /* We assume that DImode is only generated when not optimizing and
5844 that we don't really need 64-bit address offsets. That would
5845 imply an object file with 8GB of code in a single function! */
5847 "ldr\t%w3, [%0,%w1,uxtw #2]",
5848 "add\t%3, %4, %w3, sxtw #2"
5852 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5854 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5856 gcc_assert (index >= 0 && index <= 3);
5858 /* Need to implement table size reduction, by changing the code below. */
5859 output_asm_insn (patterns[index][0], operands);
5860 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5861 snprintf (buf, sizeof (buf),
5862 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5863 output_asm_insn (buf, operands);
5864 output_asm_insn (patterns[index][1], operands);
5865 output_asm_insn ("br\t%3", operands);
5866 assemble_label (asm_out_file, label);
5867 return "";
5871 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5872 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5873 operator. */
5876 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5878 if (shift >= 0 && shift <= 3)
5880 int size;
5881 for (size = 8; size <= 32; size *= 2)
5883 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5884 if (mask == bits << shift)
5885 return size;
5888 return 0;
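/* Worked example: aarch64_uxt_size (2, 0x3fc) returns 8, because
   0xff << 2 == 0x3fc, i.e. the mask selects exactly one byte shifted left
   by two, which suits an extended-register operand of the form UXTB with
   LSL #2.  aarch64_uxt_size (1, 0xff) returns 0, since 0xff is not 0xff,
   0xffff or 0xffffffff shifted left by one.  */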
5891 /* Constant pools are per-function only when PC-relative
5892 literal loads are enabled or we are using the large memory
5893 model. */
5895 static inline bool
5896 aarch64_can_use_per_function_literal_pools_p (void)
5898 return (aarch64_pcrelative_literal_loads
5899 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5902 static bool
5903 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5905 /* FIXME: In an ideal world this would work similarly
5906 to the logic in aarch64_select_rtx_section, but this
5907 breaks bootstrap in gccgo.  For now we work around
5908 this by returning false here. */
5909 return false;
5912 /* Select appropriate section for constants depending
5913 on where we place literal pools. */
5915 static section *
5916 aarch64_select_rtx_section (machine_mode mode,
5917 rtx x,
5918 unsigned HOST_WIDE_INT align)
5920 if (aarch64_can_use_per_function_literal_pools_p ())
5921 return function_section (current_function_decl);
5923 return default_elf_select_rtx_section (mode, x, align);
5926 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5927 void
5928 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5929 HOST_WIDE_INT offset)
5931 /* When using per-function literal pools, we must ensure that any code
5932 section is aligned to the minimal instruction length, lest we get
5933 errors from the assembler about unaligned instructions. */
5934 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5935 ASM_OUTPUT_ALIGN (f, 2);
5938 /* Costs. */
5940 /* Helper function for rtx cost calculation. Strip a shift expression
5941 from X. Returns the inner operand if successful, or the original
5942 expression on failure. */
5943 static rtx
5944 aarch64_strip_shift (rtx x)
5946 rtx op = x;
5948 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5949 we can convert both to ROR during final output. */
5950 if ((GET_CODE (op) == ASHIFT
5951 || GET_CODE (op) == ASHIFTRT
5952 || GET_CODE (op) == LSHIFTRT
5953 || GET_CODE (op) == ROTATERT
5954 || GET_CODE (op) == ROTATE)
5955 && CONST_INT_P (XEXP (op, 1)))
5956 return XEXP (op, 0);
5958 if (GET_CODE (op) == MULT
5959 && CONST_INT_P (XEXP (op, 1))
5960 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5961 return XEXP (op, 0);
5963 return x;
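/* For example, (ashift (reg x1) (const_int 3)) and the equivalent
   (mult (reg x1) (const_int 8)) both strip to (reg x1), whereas
   (mult (reg x1) (const_int 12)) is returned unchanged because 12 is not
   a power of two.  */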
5966 /* Helper function for rtx cost calculation. Strip an extend
5967 expression from X. Returns the inner operand if successful, or the
5968 original expression on failure. We deal with a number of possible
5969 canonicalization variations here. */
5970 static rtx
5971 aarch64_strip_extend (rtx x)
5973 rtx op = x;
5975 /* Zero and sign extraction of a widened value. */
5976 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5977 && XEXP (op, 2) == const0_rtx
5978 && GET_CODE (XEXP (op, 0)) == MULT
5979 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5980 XEXP (op, 1)))
5981 return XEXP (XEXP (op, 0), 0);
5983 /* It can also be represented (for zero-extend) as an AND with an
5984 immediate. */
5985 if (GET_CODE (op) == AND
5986 && GET_CODE (XEXP (op, 0)) == MULT
5987 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5988 && CONST_INT_P (XEXP (op, 1))
5989 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5990 INTVAL (XEXP (op, 1))) != 0)
5991 return XEXP (XEXP (op, 0), 0);
5993 /* Now handle extended register, as this may also have an optional
5994 left shift by 1..4. */
5995 if (GET_CODE (op) == ASHIFT
5996 && CONST_INT_P (XEXP (op, 1))
5997 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5998 op = XEXP (op, 0);
6000 if (GET_CODE (op) == ZERO_EXTEND
6001 || GET_CODE (op) == SIGN_EXTEND)
6002 op = XEXP (op, 0);
6004 if (op != x)
6005 return op;
6007 return x;
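/* For example, (ashift (zero_extend (reg w1)) (const_int 2)) strips to
   (reg w1): the left shift by a constant in the range 1..4 is looked
   through first and then the extension itself.  A bare
   (zero_extend (reg w1)) likewise strips to (reg w1).  */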
6010 /* Return true iff CODE is a shift supported in combination
6011 with arithmetic instructions. */
6013 static bool
6014 aarch64_shift_p (enum rtx_code code)
6016 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6019 /* Helper function for rtx cost calculation. Calculate the cost of
6020 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6021 Return the calculated cost of the expression, recursing manually into
6022 operands where needed. */
6024 static int
6025 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6027 rtx op0, op1;
6028 const struct cpu_cost_table *extra_cost
6029 = aarch64_tune_params.insn_extra_cost;
6030 int cost = 0;
6031 bool compound_p = (outer == PLUS || outer == MINUS);
6032 machine_mode mode = GET_MODE (x);
6034 gcc_checking_assert (code == MULT);
6036 op0 = XEXP (x, 0);
6037 op1 = XEXP (x, 1);
6039 if (VECTOR_MODE_P (mode))
6040 mode = GET_MODE_INNER (mode);
6042 /* Integer multiply/fma. */
6043 if (GET_MODE_CLASS (mode) == MODE_INT)
6045 /* The multiply will be canonicalized as a shift, so cost it as such. */
6046 if (aarch64_shift_p (GET_CODE (x))
6047 || (CONST_INT_P (op1)
6048 && exact_log2 (INTVAL (op1)) > 0))
6050 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6051 || GET_CODE (op0) == SIGN_EXTEND;
6052 if (speed)
6054 if (compound_p)
6056 if (REG_P (op1))
6057 /* ARITH + shift-by-register. */
6058 cost += extra_cost->alu.arith_shift_reg;
6059 else if (is_extend)
6060 /* ARITH + extended register. We don't have a cost field
6061 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6062 cost += extra_cost->alu.extend_arith;
6063 else
6064 /* ARITH + shift-by-immediate. */
6065 cost += extra_cost->alu.arith_shift;
6067 else
6068 /* LSL (immediate). */
6069 cost += extra_cost->alu.shift;
6072 /* Strip extends as we will have costed them in the case above. */
6073 if (is_extend)
6074 op0 = aarch64_strip_extend (op0);
6076 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6078 return cost;
6081 /* MNEG or [US]MNEGL.  Extract the NEG operand, mark the operation as a
6082 compound, and let the cases below handle it.  After all, MNEG is a
6083 special-case alias of MSUB. */
6084 if (GET_CODE (op0) == NEG)
6086 op0 = XEXP (op0, 0);
6087 compound_p = true;
6090 /* Integer multiplies or FMAs have zero/sign extending variants. */
6091 if ((GET_CODE (op0) == ZERO_EXTEND
6092 && GET_CODE (op1) == ZERO_EXTEND)
6093 || (GET_CODE (op0) == SIGN_EXTEND
6094 && GET_CODE (op1) == SIGN_EXTEND))
6096 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6097 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6099 if (speed)
6101 if (compound_p)
6102 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6103 cost += extra_cost->mult[0].extend_add;
6104 else
6105 /* MUL/SMULL/UMULL. */
6106 cost += extra_cost->mult[0].extend;
6109 return cost;
6112 /* This is either an integer multiply or a MADD. In both cases
6113 we want to recurse and cost the operands. */
6114 cost += rtx_cost (op0, mode, MULT, 0, speed);
6115 cost += rtx_cost (op1, mode, MULT, 1, speed);
6117 if (speed)
6119 if (compound_p)
6120 /* MADD/MSUB. */
6121 cost += extra_cost->mult[mode == DImode].add;
6122 else
6123 /* MUL. */
6124 cost += extra_cost->mult[mode == DImode].simple;
6127 return cost;
6129 else
6131 if (speed)
6133 /* Floating-point FMA/FMUL can also support negations of the
6134 operands, unless the rounding mode is upward or downward in
6135 which case FNMUL differs from FMUL with operand negation. */
6136 bool neg0 = GET_CODE (op0) == NEG;
6137 bool neg1 = GET_CODE (op1) == NEG;
6138 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6140 if (neg0)
6141 op0 = XEXP (op0, 0);
6142 if (neg1)
6143 op1 = XEXP (op1, 0);
6146 if (compound_p)
6147 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6148 cost += extra_cost->fp[mode == DFmode].fma;
6149 else
6150 /* FMUL/FNMUL. */
6151 cost += extra_cost->fp[mode == DFmode].mult;
6154 cost += rtx_cost (op0, mode, MULT, 0, speed);
6155 cost += rtx_cost (op1, mode, MULT, 1, speed);
6156 return cost;
6160 static int
6161 aarch64_address_cost (rtx x,
6162 machine_mode mode,
6163 addr_space_t as ATTRIBUTE_UNUSED,
6164 bool speed)
6166 enum rtx_code c = GET_CODE (x);
6167 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6168 struct aarch64_address_info info;
6169 int cost = 0;
6170 info.shift = 0;
6172 if (!aarch64_classify_address (&info, x, mode, c, false))
6174 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6176 /* This is a CONST or SYMBOL ref which will be split
6177 in a different way depending on the code model in use.
6178 Cost it through the generic infrastructure. */
6179 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6180 /* Divide through by the cost of one instruction to
6181 bring it to the same units as the address costs. */
6182 cost_symbol_ref /= COSTS_N_INSNS (1);
6183 /* The cost is then the cost of preparing the address,
6184 followed by an immediate (possibly 0) offset. */
6185 return cost_symbol_ref + addr_cost->imm_offset;
6187 else
6189 /* This is most likely a jump table from a case
6190 statement. */
6191 return addr_cost->register_offset;
6195 switch (info.type)
6197 case ADDRESS_LO_SUM:
6198 case ADDRESS_SYMBOLIC:
6199 case ADDRESS_REG_IMM:
6200 cost += addr_cost->imm_offset;
6201 break;
6203 case ADDRESS_REG_WB:
6204 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6205 cost += addr_cost->pre_modify;
6206 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6207 cost += addr_cost->post_modify;
6208 else
6209 gcc_unreachable ();
6211 break;
6213 case ADDRESS_REG_REG:
6214 cost += addr_cost->register_offset;
6215 break;
6217 case ADDRESS_REG_SXTW:
6218 cost += addr_cost->register_sextend;
6219 break;
6221 case ADDRESS_REG_UXTW:
6222 cost += addr_cost->register_zextend;
6223 break;
6225 default:
6226 gcc_unreachable ();
6230 if (info.shift > 0)
6232 /* For the sake of calculating the cost of the shifted register
6233 component, we can treat same sized modes in the same way. */
6234 switch (GET_MODE_BITSIZE (mode))
6236 case 16:
6237 cost += addr_cost->addr_scale_costs.hi;
6238 break;
6240 case 32:
6241 cost += addr_cost->addr_scale_costs.si;
6242 break;
6244 case 64:
6245 cost += addr_cost->addr_scale_costs.di;
6246 break;
6248 /* We can't tell, or this is a 128-bit vector. */
6249 default:
6250 cost += addr_cost->addr_scale_costs.ti;
6251 break;
6255 return cost;
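/* As a rough example, an SImode access through an address of the form
   [x0, w1, sxtw #2] is classified as ADDRESS_REG_SXTW with a shift of 2,
   so it is costed as addr_cost->register_sextend plus
   addr_cost->addr_scale_costs.si.  */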
6258 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6259 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6260 to be taken. */
6263 aarch64_branch_cost (bool speed_p, bool predictable_p)
6265 /* When optimizing for speed, use the cost of unpredictable branches. */
6266 const struct cpu_branch_cost *branch_costs =
6267 aarch64_tune_params.branch_costs;
6269 if (!speed_p || predictable_p)
6270 return branch_costs->predictable;
6271 else
6272 return branch_costs->unpredictable;
6275 /* Return true if the RTX X in mode MODE is a zero or sign extract
6276 usable in an ADD or SUB (extended register) instruction. */
6277 static bool
6278 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6280 /* Catch add with a sign extract.
6281 This is add_<optab><mode>_multp2. */
6282 if (GET_CODE (x) == SIGN_EXTRACT
6283 || GET_CODE (x) == ZERO_EXTRACT)
6285 rtx op0 = XEXP (x, 0);
6286 rtx op1 = XEXP (x, 1);
6287 rtx op2 = XEXP (x, 2);
6289 if (GET_CODE (op0) == MULT
6290 && CONST_INT_P (op1)
6291 && op2 == const0_rtx
6292 && CONST_INT_P (XEXP (op0, 1))
6293 && aarch64_is_extend_from_extract (mode,
6294 XEXP (op0, 1),
6295 op1))
6297 return true;
6300 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6301 No shift. */
6302 else if (GET_CODE (x) == SIGN_EXTEND
6303 || GET_CODE (x) == ZERO_EXTEND)
6304 return REG_P (XEXP (x, 0));
6306 return false;
6309 static bool
6310 aarch64_frint_unspec_p (unsigned int u)
6312 switch (u)
6314 case UNSPEC_FRINTZ:
6315 case UNSPEC_FRINTP:
6316 case UNSPEC_FRINTM:
6317 case UNSPEC_FRINTA:
6318 case UNSPEC_FRINTN:
6319 case UNSPEC_FRINTX:
6320 case UNSPEC_FRINTI:
6321 return true;
6323 default:
6324 return false;
6328 /* Return true iff X is an rtx that will match an extr instruction
6329 i.e. as described in the *extr<mode>5_insn family of patterns.
6330 OP0 and OP1 will be set to the operands of the shifts involved
6331 on success and will be NULL_RTX otherwise. */
6333 static bool
6334 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6336 rtx op0, op1;
6337 machine_mode mode = GET_MODE (x);
6339 *res_op0 = NULL_RTX;
6340 *res_op1 = NULL_RTX;
6342 if (GET_CODE (x) != IOR)
6343 return false;
6345 op0 = XEXP (x, 0);
6346 op1 = XEXP (x, 1);
6348 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6349 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6351 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6352 if (GET_CODE (op1) == ASHIFT)
6353 std::swap (op0, op1);
6355 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6356 return false;
6358 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6359 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6361 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6362 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6364 *res_op0 = XEXP (op0, 0);
6365 *res_op1 = XEXP (op1, 0);
6366 return true;
6370 return false;
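/* For example, in DImode
     (ior (ashift (reg x0) (const_int 48))
	  (lshiftrt (reg x1) (const_int 16)))
   matches because 48 + 16 == 64; *RES_OP0 is set to (reg x0) and
   *RES_OP1 to (reg x1), and the whole expression can be implemented as a
   single EXTR whose immediate is the right-shift amount (16 here).  */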
6373 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6374 storing it in *COST. Result is true if the total cost of the operation
6375 has now been calculated. */
6376 static bool
6377 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6379 rtx inner;
6380 rtx comparator;
6381 enum rtx_code cmpcode;
6383 if (COMPARISON_P (op0))
6385 inner = XEXP (op0, 0);
6386 comparator = XEXP (op0, 1);
6387 cmpcode = GET_CODE (op0);
6389 else
6391 inner = op0;
6392 comparator = const0_rtx;
6393 cmpcode = NE;
6396 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6398 /* Conditional branch. */
6399 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6400 return true;
6401 else
6403 if (cmpcode == NE || cmpcode == EQ)
6405 if (comparator == const0_rtx)
6407 /* TBZ/TBNZ/CBZ/CBNZ. */
6408 if (GET_CODE (inner) == ZERO_EXTRACT)
6409 /* TBZ/TBNZ. */
6410 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6411 ZERO_EXTRACT, 0, speed);
6412 else
6413 /* CBZ/CBNZ. */
6414 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6416 return true;
6419 else if (cmpcode == LT || cmpcode == GE)
6421 /* TBZ/TBNZ. */
6422 if (comparator == const0_rtx)
6423 return true;
6427 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6429 /* CCMP. */
6430 if (GET_CODE (op1) == COMPARE)
6432 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6433 if (XEXP (op1, 1) == const0_rtx)
6434 *cost += 1;
6435 if (speed)
6437 machine_mode mode = GET_MODE (XEXP (op1, 0));
6438 const struct cpu_cost_table *extra_cost
6439 = aarch64_tune_params.insn_extra_cost;
6441 if (GET_MODE_CLASS (mode) == MODE_INT)
6442 *cost += extra_cost->alu.arith;
6443 else
6444 *cost += extra_cost->fp[mode == DFmode].compare;
6446 return true;
6449 /* It's a conditional operation based on the status flags,
6450 so it must be some flavor of CSEL. */
6452 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6453 if (GET_CODE (op1) == NEG
6454 || GET_CODE (op1) == NOT
6455 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6456 op1 = XEXP (op1, 0);
6457 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6459 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6460 op1 = XEXP (op1, 0);
6461 op2 = XEXP (op2, 0);
6464 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6465 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6466 return true;
6469 /* We don't know what this is; cost all operands. */
6470 return false;
6473 /* Check whether X is a bitfield operation of the form shift + extend that
6474 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6475 operand to which the bitfield operation is applied. Otherwise return
6476 NULL_RTX. */
6478 static rtx
6479 aarch64_extend_bitfield_pattern_p (rtx x)
6481 rtx_code outer_code = GET_CODE (x);
6482 machine_mode outer_mode = GET_MODE (x);
6484 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6485 && outer_mode != SImode && outer_mode != DImode)
6486 return NULL_RTX;
6488 rtx inner = XEXP (x, 0);
6489 rtx_code inner_code = GET_CODE (inner);
6490 machine_mode inner_mode = GET_MODE (inner);
6491 rtx op = NULL_RTX;
6493 switch (inner_code)
6495 case ASHIFT:
6496 if (CONST_INT_P (XEXP (inner, 1))
6497 && (inner_mode == QImode || inner_mode == HImode))
6498 op = XEXP (inner, 0);
6499 break;
6500 case LSHIFTRT:
6501 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6502 && (inner_mode == QImode || inner_mode == HImode))
6503 op = XEXP (inner, 0);
6504 break;
6505 case ASHIFTRT:
6506 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6507 && (inner_mode == QImode || inner_mode == HImode))
6508 op = XEXP (inner, 0);
6509 break;
6510 default:
6511 break;
6514 return op;
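/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int 3)))
   returns (reg:HI R): the shift-plus-zero-extend pair maps onto a single
   UBFX, so only the inner register needs to be costed.  The analogous
   ashift/sign_extend combination maps onto SBFIZ.  */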
6517 /* Return true if the mask and a shift amount from an RTX of the form
6518 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6519 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6521 bool
6522 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6524 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6525 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6526 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6527 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
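/* Worked example: in SImode, MASK == 0x7f8 with SHFT_AMNT == 3 is valid:
   0x7f8 >> 3 == 0xff, 0xff + 1 is a power of two (so the shifted-down mask
   is a contiguous run of ones starting at bit 0), and the low three bits
   of the mask are clear.  The combination corresponds to a UBFIZ with
   lsb 3 and width 8.  */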
6530 /* Calculate the cost of calculating X, storing it in *COST. Result
6531 is true if the total cost of the operation has now been calculated. */
6532 static bool
6533 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6534 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6536 rtx op0, op1, op2;
6537 const struct cpu_cost_table *extra_cost
6538 = aarch64_tune_params.insn_extra_cost;
6539 int code = GET_CODE (x);
6541 /* By default, assume that everything has equivalent cost to the
6542 cheapest instruction. Any additional costs are applied as a delta
6543 above this default. */
6544 *cost = COSTS_N_INSNS (1);
6546 switch (code)
6548 case SET:
6549 /* The cost depends entirely on the operands to SET. */
6550 *cost = 0;
6551 op0 = SET_DEST (x);
6552 op1 = SET_SRC (x);
6554 switch (GET_CODE (op0))
6556 case MEM:
6557 if (speed)
6559 rtx address = XEXP (op0, 0);
6560 if (VECTOR_MODE_P (mode))
6561 *cost += extra_cost->ldst.storev;
6562 else if (GET_MODE_CLASS (mode) == MODE_INT)
6563 *cost += extra_cost->ldst.store;
6564 else if (mode == SFmode)
6565 *cost += extra_cost->ldst.storef;
6566 else if (mode == DFmode)
6567 *cost += extra_cost->ldst.stored;
6569 *cost +=
6570 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6571 0, speed));
6574 *cost += rtx_cost (op1, mode, SET, 1, speed);
6575 return true;
6577 case SUBREG:
6578 if (! REG_P (SUBREG_REG (op0)))
6579 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6581 /* Fall through. */
6582 case REG:
6583 /* The cost is one per vector-register copied. */
6584 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6586 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6587 / GET_MODE_SIZE (V4SImode);
6588 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6590 /* const0_rtx is in general free, but we will use an
6591 instruction to set a register to 0. */
6592 else if (REG_P (op1) || op1 == const0_rtx)
6594 /* The cost is 1 per register copied. */
6595 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6596 / UNITS_PER_WORD;
6597 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6599 else
6600 /* Cost is just the cost of the RHS of the set. */
6601 *cost += rtx_cost (op1, mode, SET, 1, speed);
6602 return true;
6604 case ZERO_EXTRACT:
6605 case SIGN_EXTRACT:
6606 /* Bit-field insertion. Strip any redundant widening of
6607 the RHS to meet the width of the target. */
6608 if (GET_CODE (op1) == SUBREG)
6609 op1 = SUBREG_REG (op1);
6610 if ((GET_CODE (op1) == ZERO_EXTEND
6611 || GET_CODE (op1) == SIGN_EXTEND)
6612 && CONST_INT_P (XEXP (op0, 1))
6613 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6614 >= INTVAL (XEXP (op0, 1))))
6615 op1 = XEXP (op1, 0);
6617 if (CONST_INT_P (op1))
6619 /* MOV immediate is assumed to always be cheap. */
6620 *cost = COSTS_N_INSNS (1);
6622 else
6624 /* BFM. */
6625 if (speed)
6626 *cost += extra_cost->alu.bfi;
6627 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6630 return true;
6632 default:
6633 /* We can't make sense of this, assume default cost. */
6634 *cost = COSTS_N_INSNS (1);
6635 return false;
6637 return false;
6639 case CONST_INT:
6640 /* If an instruction can incorporate a constant within the
6641 instruction, the instruction's expression avoids calling
6642 rtx_cost() on the constant. If rtx_cost() is called on a
6643 constant, then it is usually because the constant must be
6644 moved into a register by one or more instructions.
6646 The exception is constant 0, which can be expressed
6647 as XZR/WZR and is therefore free. The exception to this is
6648 if we have (set (reg) (const0_rtx)) in which case we must cost
6649 the move. However, we can catch that when we cost the SET, so
6650 we don't need to consider that here. */
6651 if (x == const0_rtx)
6652 *cost = 0;
6653 else
6655 /* To a first approximation, the cost of building any other
6656 constant is proportional to the number of instructions
6657 required to build that constant.  This is true whether we
6658 are compiling for SPEED or otherwise. */
6659 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6660 (NULL_RTX, x, false, mode));
6662 return true;
6664 case CONST_DOUBLE:
6665 if (speed)
6667 /* mov[df,sf]_aarch64. */
6668 if (aarch64_float_const_representable_p (x))
6669 /* FMOV (scalar immediate). */
6670 *cost += extra_cost->fp[mode == DFmode].fpconst;
6671 else if (!aarch64_float_const_zero_rtx_p (x))
6673 /* This will be a load from memory. */
6674 if (mode == DFmode)
6675 *cost += extra_cost->ldst.loadd;
6676 else
6677 *cost += extra_cost->ldst.loadf;
6679 else
6680 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6681 or MOV v0.s[0], wzr - neither of which is modeled by the
6682 cost tables. Just use the default cost. */
6687 return true;
6689 case MEM:
6690 if (speed)
6692 /* For loads we want the base cost of a load, plus an
6693 approximation for the additional cost of the addressing
6694 mode. */
6695 rtx address = XEXP (x, 0);
6696 if (VECTOR_MODE_P (mode))
6697 *cost += extra_cost->ldst.loadv;
6698 else if (GET_MODE_CLASS (mode) == MODE_INT)
6699 *cost += extra_cost->ldst.load;
6700 else if (mode == SFmode)
6701 *cost += extra_cost->ldst.loadf;
6702 else if (mode == DFmode)
6703 *cost += extra_cost->ldst.loadd;
6705 *cost +=
6706 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6707 0, speed));
6710 return true;
6712 case NEG:
6713 op0 = XEXP (x, 0);
6715 if (VECTOR_MODE_P (mode))
6717 if (speed)
6719 /* FNEG. */
6720 *cost += extra_cost->vect.alu;
6722 return false;
6725 if (GET_MODE_CLASS (mode) == MODE_INT)
6727 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6728 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6730 /* CSETM. */
6731 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6732 return true;
6735 /* Cost this as SUB wzr, X. */
6736 op0 = CONST0_RTX (mode);
6737 op1 = XEXP (x, 0);
6738 goto cost_minus;
6741 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6743 /* Support (neg(fma...)) as a single instruction only if
6744 sign of zeros is unimportant. This matches the decision
6745 making in aarch64.md. */
6746 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6748 /* FNMADD. */
6749 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6750 return true;
6752 if (GET_CODE (op0) == MULT)
6754 /* FNMUL. */
6755 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6756 return true;
6758 if (speed)
6759 /* FNEG. */
6760 *cost += extra_cost->fp[mode == DFmode].neg;
6761 return false;
6764 return false;
6766 case CLRSB:
6767 case CLZ:
6768 if (speed)
6770 if (VECTOR_MODE_P (mode))
6771 *cost += extra_cost->vect.alu;
6772 else
6773 *cost += extra_cost->alu.clz;
6776 return false;
6778 case COMPARE:
6779 op0 = XEXP (x, 0);
6780 op1 = XEXP (x, 1);
6782 if (op1 == const0_rtx
6783 && GET_CODE (op0) == AND)
6785 x = op0;
6786 mode = GET_MODE (op0);
6787 goto cost_logic;
6790 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6792 /* TODO: A write to the CC flags possibly costs extra, this
6793 needs encoding in the cost tables. */
6795 mode = GET_MODE (op0);
6796 /* ANDS. */
6797 if (GET_CODE (op0) == AND)
6799 x = op0;
6800 goto cost_logic;
6803 if (GET_CODE (op0) == PLUS)
6805 /* ADDS (and CMN alias). */
6806 x = op0;
6807 goto cost_plus;
6810 if (GET_CODE (op0) == MINUS)
6812 /* SUBS. */
6813 x = op0;
6814 goto cost_minus;
6817 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6818 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6819 && CONST_INT_P (XEXP (op0, 2)))
6821 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6822 Handle it here directly rather than going to cost_logic
6823 since we know the immediate generated for the TST is valid
6824 so we can avoid creating an intermediate rtx for it only
6825 for costing purposes. */
6826 if (speed)
6827 *cost += extra_cost->alu.logical;
6829 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6830 ZERO_EXTRACT, 0, speed);
6831 return true;
6834 if (GET_CODE (op1) == NEG)
6836 /* CMN. */
6837 if (speed)
6838 *cost += extra_cost->alu.arith;
6840 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6841 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6842 return true;
6845 /* CMP.
6847 Compare can freely swap the order of operands, and
6848 canonicalization puts the more complex operation first.
6849 But the integer MINUS logic expects the shift/extend
6850 operation in op1. */
6851 if (! (REG_P (op0)
6852 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6854 op0 = XEXP (x, 1);
6855 op1 = XEXP (x, 0);
6857 goto cost_minus;
6860 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6862 /* FCMP. */
6863 if (speed)
6864 *cost += extra_cost->fp[mode == DFmode].compare;
6866 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6868 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6869 /* FCMP supports constant 0.0 for no extra cost. */
6870 return true;
6872 return false;
6875 if (VECTOR_MODE_P (mode))
6877 /* Vector compare. */
6878 if (speed)
6879 *cost += extra_cost->vect.alu;
6881 if (aarch64_float_const_zero_rtx_p (op1))
6883 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6884 cost. */
6885 return true;
6887 return false;
6889 return false;
6891 case MINUS:
6893 op0 = XEXP (x, 0);
6894 op1 = XEXP (x, 1);
6896 cost_minus:
6897 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6899 /* Detect valid immediates. */
6900 if ((GET_MODE_CLASS (mode) == MODE_INT
6901 || (GET_MODE_CLASS (mode) == MODE_CC
6902 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6903 && CONST_INT_P (op1)
6904 && aarch64_uimm12_shift (INTVAL (op1)))
6906 if (speed)
6907 /* SUB(S) (immediate). */
6908 *cost += extra_cost->alu.arith;
6909 return true;
6912 /* Look for SUB (extended register). */
6913 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6915 if (speed)
6916 *cost += extra_cost->alu.extend_arith;
6918 op1 = aarch64_strip_extend (op1);
6919 *cost += rtx_cost (op1, VOIDmode,
6920 (enum rtx_code) GET_CODE (op1), 0, speed);
6921 return true;
6924 rtx new_op1 = aarch64_strip_extend (op1);
6926 /* Cost this as an FMA-alike operation. */
6927 if ((GET_CODE (new_op1) == MULT
6928 || aarch64_shift_p (GET_CODE (new_op1)))
6929 && code != COMPARE)
6931 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6932 (enum rtx_code) code,
6933 speed);
6934 return true;
6937 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6939 if (speed)
6941 if (VECTOR_MODE_P (mode))
6943 /* Vector SUB. */
6944 *cost += extra_cost->vect.alu;
6946 else if (GET_MODE_CLASS (mode) == MODE_INT)
6948 /* SUB(S). */
6949 *cost += extra_cost->alu.arith;
6951 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6953 /* FSUB. */
6954 *cost += extra_cost->fp[mode == DFmode].addsub;
6957 return true;
6960 case PLUS:
6962 rtx new_op0;
6964 op0 = XEXP (x, 0);
6965 op1 = XEXP (x, 1);
6967 cost_plus:
6968 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6969 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6971 /* CSINC. */
6972 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6973 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6974 return true;
6977 if (GET_MODE_CLASS (mode) == MODE_INT
6978 && CONST_INT_P (op1)
6979 && aarch64_uimm12_shift (INTVAL (op1)))
6981 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6983 if (speed)
6984 /* ADD (immediate). */
6985 *cost += extra_cost->alu.arith;
6986 return true;
6989 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6991 /* Look for ADD (extended register). */
6992 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6994 if (speed)
6995 *cost += extra_cost->alu.extend_arith;
6997 op0 = aarch64_strip_extend (op0);
6998 *cost += rtx_cost (op0, VOIDmode,
6999 (enum rtx_code) GET_CODE (op0), 0, speed);
7000 return true;
7003 /* Strip any extend, leave shifts behind as we will
7004 cost them through mult_cost. */
7005 new_op0 = aarch64_strip_extend (op0);
7007 if (GET_CODE (new_op0) == MULT
7008 || aarch64_shift_p (GET_CODE (new_op0)))
7010 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7011 speed);
7012 return true;
7015 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7017 if (speed)
7019 if (VECTOR_MODE_P (mode))
7021 /* Vector ADD. */
7022 *cost += extra_cost->vect.alu;
7024 else if (GET_MODE_CLASS (mode) == MODE_INT)
7026 /* ADD. */
7027 *cost += extra_cost->alu.arith;
7029 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7031 /* FADD. */
7032 *cost += extra_cost->fp[mode == DFmode].addsub;
7035 return true;
7038 case BSWAP:
7039 *cost = COSTS_N_INSNS (1);
7041 if (speed)
7043 if (VECTOR_MODE_P (mode))
7044 *cost += extra_cost->vect.alu;
7045 else
7046 *cost += extra_cost->alu.rev;
7048 return false;
7050 case IOR:
7051 if (aarch_rev16_p (x))
7053 *cost = COSTS_N_INSNS (1);
7055 if (speed)
7057 if (VECTOR_MODE_P (mode))
7058 *cost += extra_cost->vect.alu;
7059 else
7060 *cost += extra_cost->alu.rev;
7062 return true;
7065 if (aarch64_extr_rtx_p (x, &op0, &op1))
7067 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7068 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7069 if (speed)
7070 *cost += extra_cost->alu.shift;
7072 return true;
7074 /* Fall through. */
7075 case XOR:
7076 case AND:
7077 cost_logic:
7078 op0 = XEXP (x, 0);
7079 op1 = XEXP (x, 1);
7081 if (VECTOR_MODE_P (mode))
7083 if (speed)
7084 *cost += extra_cost->vect.alu;
7085 return true;
7088 if (code == AND
7089 && GET_CODE (op0) == MULT
7090 && CONST_INT_P (XEXP (op0, 1))
7091 && CONST_INT_P (op1)
7092 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7093 INTVAL (op1)) != 0)
7095 /* This is a UBFM/SBFM. */
7096 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7097 if (speed)
7098 *cost += extra_cost->alu.bfx;
7099 return true;
7102 if (GET_MODE_CLASS (mode) == MODE_INT)
7104 if (CONST_INT_P (op1))
7106 /* We have a mask + shift version of a UBFIZ
7107 i.e. the *andim_ashift<mode>_bfiz pattern. */
7108 if (GET_CODE (op0) == ASHIFT
7109 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7110 XEXP (op0, 1)))
7112 *cost += rtx_cost (XEXP (op0, 0), mode,
7113 (enum rtx_code) code, 0, speed);
7114 if (speed)
7115 *cost += extra_cost->alu.bfx;
7117 return true;
7119 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7121 /* We possibly get the immediate for free; this is not
7122 modelled. */
7123 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7124 if (speed)
7125 *cost += extra_cost->alu.logical;
7127 return true;
7130 else
7132 rtx new_op0 = op0;
7134 /* Handle ORN, EON, or BIC. */
7135 if (GET_CODE (op0) == NOT)
7136 op0 = XEXP (op0, 0);
7138 new_op0 = aarch64_strip_shift (op0);
7140 /* If we had a shift on op0 then this is a logical-shift-
7141 by-register/immediate operation. Otherwise, this is just
7142 a logical operation. */
7143 if (speed)
7145 if (new_op0 != op0)
7147 /* Shift by immediate. */
7148 if (CONST_INT_P (XEXP (op0, 1)))
7149 *cost += extra_cost->alu.log_shift;
7150 else
7151 *cost += extra_cost->alu.log_shift_reg;
7153 else
7154 *cost += extra_cost->alu.logical;
7157 /* In both cases we want to cost both operands. */
7158 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7159 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7161 return true;
7164 return false;
7166 case NOT:
7167 x = XEXP (x, 0);
7168 op0 = aarch64_strip_shift (x);
7170 if (VECTOR_MODE_P (mode))
7172 /* Vector NOT. */
7173 *cost += extra_cost->vect.alu;
7174 return false;
7177 /* MVN-shifted-reg. */
7178 if (op0 != x)
7180 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7182 if (speed)
7183 *cost += extra_cost->alu.log_shift;
7185 return true;
7187 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7188 Handle the second form here taking care that 'a' in the above can
7189 be a shift. */
7190 else if (GET_CODE (op0) == XOR)
7192 rtx newop0 = XEXP (op0, 0);
7193 rtx newop1 = XEXP (op0, 1);
7194 rtx op0_stripped = aarch64_strip_shift (newop0);
7196 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7197 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7199 if (speed)
7201 if (op0_stripped != newop0)
7202 *cost += extra_cost->alu.log_shift;
7203 else
7204 *cost += extra_cost->alu.logical;
7207 return true;
7209 /* MVN. */
7210 if (speed)
7211 *cost += extra_cost->alu.logical;
7213 return false;
7215 case ZERO_EXTEND:
7217 op0 = XEXP (x, 0);
7218 /* If a value is written in SI mode, then zero extended to DI
7219 mode, the operation will in general be free as a write to
7220 a 'w' register implicitly zeroes the upper bits of an 'x'
7221 register. However, if this is
7223 (set (reg) (zero_extend (reg)))
7225 we must cost the explicit register move. */
7226 if (mode == DImode
7227 && GET_MODE (op0) == SImode
7228 && outer == SET)
7230 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7232 /* If OP_COST is non-zero, then the cost of the zero extend
7233 is effectively the cost of the inner operation. Otherwise
7234 we have a MOV instruction and we take the cost from the MOV
7235 itself. This is true independently of whether we are
7236 optimizing for space or time. */
7237 if (op_cost)
7238 *cost = op_cost;
7240 return true;
7242 else if (MEM_P (op0))
7244 /* All loads can zero extend to any size for free. */
7245 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7246 return true;
7249 op0 = aarch64_extend_bitfield_pattern_p (x);
7250 if (op0)
7252 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7253 if (speed)
7254 *cost += extra_cost->alu.bfx;
7255 return true;
7258 if (speed)
7260 if (VECTOR_MODE_P (mode))
7262 /* UMOV. */
7263 *cost += extra_cost->vect.alu;
7265 else
7267 /* We generate an AND instead of UXTB/UXTH. */
7268 *cost += extra_cost->alu.logical;
7271 return false;
7273 case SIGN_EXTEND:
7274 if (MEM_P (XEXP (x, 0)))
7276 /* LDRSH. */
7277 if (speed)
7279 rtx address = XEXP (XEXP (x, 0), 0);
7280 *cost += extra_cost->ldst.load_sign_extend;
7282 *cost +=
7283 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7284 0, speed));
7286 return true;
7289 op0 = aarch64_extend_bitfield_pattern_p (x);
7290 if (op0)
7292 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7293 if (speed)
7294 *cost += extra_cost->alu.bfx;
7295 return true;
7298 if (speed)
7300 if (VECTOR_MODE_P (mode))
7301 *cost += extra_cost->vect.alu;
7302 else
7303 *cost += extra_cost->alu.extend;
7305 return false;
7307 case ASHIFT:
7308 op0 = XEXP (x, 0);
7309 op1 = XEXP (x, 1);
7311 if (CONST_INT_P (op1))
7313 if (speed)
7315 if (VECTOR_MODE_P (mode))
7317 /* Vector shift (immediate). */
7318 *cost += extra_cost->vect.alu;
7320 else
7322 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
7323 aliases. */
7324 *cost += extra_cost->alu.shift;
7328 /* We can incorporate zero/sign extend for free. */
7329 if (GET_CODE (op0) == ZERO_EXTEND
7330 || GET_CODE (op0) == SIGN_EXTEND)
7331 op0 = XEXP (op0, 0);
7333 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7334 return true;
7336 else
7338 if (speed)
7340 if (VECTOR_MODE_P (mode))
7342 /* Vector shift (register). */
7343 *cost += extra_cost->vect.alu;
7345 else
7347 /* LSLV. */
7348 *cost += extra_cost->alu.shift_reg;
7351 return false; /* All arguments need to be in registers. */
7354 case ROTATE:
7355 case ROTATERT:
7356 case LSHIFTRT:
7357 case ASHIFTRT:
7358 op0 = XEXP (x, 0);
7359 op1 = XEXP (x, 1);
7361 if (CONST_INT_P (op1))
7363 /* ASR (immediate) and friends. */
7364 if (speed)
7366 if (VECTOR_MODE_P (mode))
7367 *cost += extra_cost->vect.alu;
7368 else
7369 *cost += extra_cost->alu.shift;
7372 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7373 return true;
7375 else
7378 /* ASR (register) and friends. */
7379 if (speed)
7381 if (VECTOR_MODE_P (mode))
7382 *cost += extra_cost->vect.alu;
7383 else
7384 *cost += extra_cost->alu.shift_reg;
7386 return false; /* All arguments need to be in registers. */
7389 case SYMBOL_REF:
7391 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7392 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7394 /* LDR. */
7395 if (speed)
7396 *cost += extra_cost->ldst.load;
7398 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7399 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7401 /* ADRP, followed by ADD. */
7402 *cost += COSTS_N_INSNS (1);
7403 if (speed)
7404 *cost += 2 * extra_cost->alu.arith;
7406 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7407 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7409 /* ADR. */
7410 if (speed)
7411 *cost += extra_cost->alu.arith;
7414 if (flag_pic)
7416 /* One extra load instruction, after accessing the GOT. */
7417 *cost += COSTS_N_INSNS (1);
7418 if (speed)
7419 *cost += extra_cost->ldst.load;
7421 return true;
7423 case HIGH:
7424 case LO_SUM:
7425 /* ADRP/ADD (immediate). */
7426 if (speed)
7427 *cost += extra_cost->alu.arith;
7428 return true;
7430 case ZERO_EXTRACT:
7431 case SIGN_EXTRACT:
7432 /* UBFX/SBFX. */
7433 if (speed)
7435 if (VECTOR_MODE_P (mode))
7436 *cost += extra_cost->vect.alu;
7437 else
7438 *cost += extra_cost->alu.bfx;
7441 /* We can trust that the immediates used will be correct (there
7442 are no by-register forms), so we need only cost op0. */
7443 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7444 return true;
7446 case MULT:
7447 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7448 /* aarch64_rtx_mult_cost always handles recursion to its
7449 operands. */
7450 return true;
7452 case MOD:
7453 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
7454 ANDs and a CSNEG.  Assume here that CSNEG costs the same as
7455 an unconditional negate.  This case should only ever be reached through
7456 the set_smod_pow2_cheap check in expmed.c. */
7457 if (CONST_INT_P (XEXP (x, 1))
7458 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7459 && (mode == SImode || mode == DImode))
7461 /* We expand to 4 instructions. Reset the baseline. */
7462 *cost = COSTS_N_INSNS (4);
7464 if (speed)
7465 *cost += 2 * extra_cost->alu.logical
7466 + 2 * extra_cost->alu.arith;
7468 return true;
7471 /* Fall-through. */
7472 case UMOD:
7473 if (speed)
7475 if (VECTOR_MODE_P (mode))
7476 *cost += extra_cost->vect.alu;
7477 else if (GET_MODE_CLASS (mode) == MODE_INT)
7478 *cost += (extra_cost->mult[mode == DImode].add
7479 + extra_cost->mult[mode == DImode].idiv);
7480 else if (mode == DFmode)
7481 *cost += (extra_cost->fp[1].mult
7482 + extra_cost->fp[1].div);
7483 else if (mode == SFmode)
7484 *cost += (extra_cost->fp[0].mult
7485 + extra_cost->fp[0].div);
7487 return false; /* All arguments need to be in registers. */
7489 case DIV:
7490 case UDIV:
7491 case SQRT:
7492 if (speed)
7494 if (VECTOR_MODE_P (mode))
7495 *cost += extra_cost->vect.alu;
7496 else if (GET_MODE_CLASS (mode) == MODE_INT)
7497 /* There is no integer SQRT, so only DIV and UDIV can get
7498 here. */
7499 *cost += extra_cost->mult[mode == DImode].idiv;
7500 else
7501 *cost += extra_cost->fp[mode == DFmode].div;
7503 return false; /* All arguments need to be in registers. */
7505 case IF_THEN_ELSE:
7506 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7507 XEXP (x, 2), cost, speed);
7509 case EQ:
7510 case NE:
7511 case GT:
7512 case GTU:
7513 case LT:
7514 case LTU:
7515 case GE:
7516 case GEU:
7517 case LE:
7518 case LEU:
7520 return false; /* All arguments must be in registers. */
7522 case FMA:
7523 op0 = XEXP (x, 0);
7524 op1 = XEXP (x, 1);
7525 op2 = XEXP (x, 2);
7527 if (speed)
7529 if (VECTOR_MODE_P (mode))
7530 *cost += extra_cost->vect.alu;
7531 else
7532 *cost += extra_cost->fp[mode == DFmode].fma;
7535 /* FMSUB, FNMADD, and FNMSUB are free. */
7536 if (GET_CODE (op0) == NEG)
7537 op0 = XEXP (op0, 0);
7539 if (GET_CODE (op2) == NEG)
7540 op2 = XEXP (op2, 0);
7542 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7543 and the by-element operand as operand 0. */
7544 if (GET_CODE (op1) == NEG)
7545 op1 = XEXP (op1, 0);
7547 /* Catch vector-by-element operations. The by-element operand can
7548 either be (vec_duplicate (vec_select (x))) or just
7549 (vec_select (x)), depending on whether we are multiplying by
7550 a vector or a scalar.
7552 Canonicalization is not very good in these cases: FMA4 will put the
7553 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7554 if (GET_CODE (op0) == VEC_DUPLICATE)
7555 op0 = XEXP (op0, 0);
7556 else if (GET_CODE (op1) == VEC_DUPLICATE)
7557 op1 = XEXP (op1, 0);
7559 if (GET_CODE (op0) == VEC_SELECT)
7560 op0 = XEXP (op0, 0);
7561 else if (GET_CODE (op1) == VEC_SELECT)
7562 op1 = XEXP (op1, 0);
7564 /* If the remaining parameters are not registers,
7565 get the cost to put them into registers. */
7566 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7567 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7568 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7569 return true;
7571 case FLOAT:
7572 case UNSIGNED_FLOAT:
7573 if (speed)
7574 *cost += extra_cost->fp[mode == DFmode].fromint;
7575 return false;
7577 case FLOAT_EXTEND:
7578 if (speed)
7580 if (VECTOR_MODE_P (mode))
7582 /* Vector widen. */
7583 *cost += extra_cost->vect.alu;
7585 else
7586 *cost += extra_cost->fp[mode == DFmode].widen;
7588 return false;
7590 case FLOAT_TRUNCATE:
7591 if (speed)
7593 if (VECTOR_MODE_P (mode))
7595 /* Vector conversion. */
7596 *cost += extra_cost->vect.alu;
7598 else
7599 *cost += extra_cost->fp[mode == DFmode].narrow;
7601 return false;
7603 case FIX:
7604 case UNSIGNED_FIX:
7605 x = XEXP (x, 0);
7606 /* Strip the rounding part. They will all be implemented
7607 by the fcvt* family of instructions anyway. */
7608 if (GET_CODE (x) == UNSPEC)
7610 unsigned int uns_code = XINT (x, 1);
7612 if (uns_code == UNSPEC_FRINTA
7613 || uns_code == UNSPEC_FRINTM
7614 || uns_code == UNSPEC_FRINTN
7615 || uns_code == UNSPEC_FRINTP
7616 || uns_code == UNSPEC_FRINTZ)
7617 x = XVECEXP (x, 0, 0);
7620 if (speed)
7622 if (VECTOR_MODE_P (mode))
7623 *cost += extra_cost->vect.alu;
7624 else
7625 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7628 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7629 fixed-point fcvt. */
7630 if (GET_CODE (x) == MULT
7631 && ((VECTOR_MODE_P (mode)
7632 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7633 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7635 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7636 0, speed);
7637 return true;
7640 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7641 return true;
7643 case ABS:
7644 if (VECTOR_MODE_P (mode))
7646 /* ABS (vector). */
7647 if (speed)
7648 *cost += extra_cost->vect.alu;
7650 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7652 op0 = XEXP (x, 0);
7654 /* FABD, which is analogous to FADD. */
7655 if (GET_CODE (op0) == MINUS)
7657 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7658 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7659 if (speed)
7660 *cost += extra_cost->fp[mode == DFmode].addsub;
7662 return true;
7664 /* Simple FABS is analogous to FNEG. */
7665 if (speed)
7666 *cost += extra_cost->fp[mode == DFmode].neg;
7668 else
7670 /* Integer ABS will either be split into
7671 two arithmetic instructions, or will be an ABS
7672 (scalar), which we don't model. */
7673 *cost = COSTS_N_INSNS (2);
7674 if (speed)
7675 *cost += 2 * extra_cost->alu.arith;
7677 return false;
7679 case SMAX:
7680 case SMIN:
7681 if (speed)
7683 if (VECTOR_MODE_P (mode))
7684 *cost += extra_cost->vect.alu;
7685 else
7687 /* FMAXNM/FMINNM/FMAX/FMIN.
7688 TODO: This may not be accurate for all implementations, but
7689 we do not model this in the cost tables. */
7690 *cost += extra_cost->fp[mode == DFmode].addsub;
7693 return false;
7695 case UNSPEC:
7696 /* The floating point round to integer frint* instructions. */
7697 if (aarch64_frint_unspec_p (XINT (x, 1)))
7699 if (speed)
7700 *cost += extra_cost->fp[mode == DFmode].roundint;
7702 return false;
7705 if (XINT (x, 1) == UNSPEC_RBIT)
7707 if (speed)
7708 *cost += extra_cost->alu.rev;
7710 return false;
7712 break;
7714 case TRUNCATE:
7716 /* Decompose <su>muldi3_highpart. */
7717 if (/* (truncate:DI */
7718 mode == DImode
7719 /* (lshiftrt:TI */
7720 && GET_MODE (XEXP (x, 0)) == TImode
7721 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7722 /* (mult:TI */
7723 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7724 /* (ANY_EXTEND:TI (reg:DI))
7725 (ANY_EXTEND:TI (reg:DI))) */
7726 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7727 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7728 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7729 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7730 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7731 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7732 /* (const_int 64) */
7733 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7734 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7736 /* UMULH/SMULH. */
7737 if (speed)
7738 *cost += extra_cost->mult[mode == DImode].extend;
7739 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7740 mode, MULT, 0, speed);
7741 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7742 mode, MULT, 1, speed);
7743 return true;
7746 /* Fall through. */
7747 default:
7748 break;
7751 if (dump_file
7752 && flag_aarch64_verbose_cost)
7753 fprintf (dump_file,
7754 "\nFailed to cost RTX. Assuming default cost.\n");
7756 return true;
7759 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
7760 calculated for X. This cost is stored in *COST. Returns true
7761 if the total cost of X was calculated. */
7762 static bool
7763 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7764 int param, int *cost, bool speed)
7766 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7768 if (dump_file
7769 && flag_aarch64_verbose_cost)
7771 print_rtl_single (dump_file, x);
7772 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7773 speed ? "Hot" : "Cold",
7774 *cost, result ? "final" : "partial");
7777 return result;
7780 static int
7781 aarch64_register_move_cost (machine_mode mode,
7782 reg_class_t from_i, reg_class_t to_i)
7784 enum reg_class from = (enum reg_class) from_i;
7785 enum reg_class to = (enum reg_class) to_i;
7786 const struct cpu_regmove_cost *regmove_cost
7787 = aarch64_tune_params.regmove_cost;
7789 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7790 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7791 to = GENERAL_REGS;
7793 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7794 from = GENERAL_REGS;
7796 /* The cost of moving between a GPR and the stack is the same as GP2GP. */
7797 if ((from == GENERAL_REGS && to == STACK_REG)
7798 || (to == GENERAL_REGS && from == STACK_REG))
7799 return regmove_cost->GP2GP;
7801 /* To/From the stack register, we move via the gprs. */
7802 if (to == STACK_REG || from == STACK_REG)
7803 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7804 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7806 if (GET_MODE_SIZE (mode) == 16)
7808 /* 128-bit operations on general registers require 2 instructions. */
7809 if (from == GENERAL_REGS && to == GENERAL_REGS)
7810 return regmove_cost->GP2GP * 2;
7811 else if (from == GENERAL_REGS)
7812 return regmove_cost->GP2FP * 2;
7813 else if (to == GENERAL_REGS)
7814 return regmove_cost->FP2GP * 2;
7816 /* When AdvSIMD instructions are disabled it is not possible to move
7817 a 128-bit value directly between Q registers. This is handled in
7818 secondary reload. A general register is used as a scratch to move
7819 the upper DI value and the lower DI value is moved directly,
7820 hence the cost is the sum of three moves. */
7821 if (! TARGET_SIMD)
7822 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7824 return regmove_cost->FP2FP;
7827 if (from == GENERAL_REGS && to == GENERAL_REGS)
7828 return regmove_cost->GP2GP;
7829 else if (from == GENERAL_REGS)
7830 return regmove_cost->GP2FP;
7831 else if (to == GENERAL_REGS)
7832 return regmove_cost->FP2GP;
7834 return regmove_cost->FP2FP;
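/* For example, moving a 16-byte value from GENERAL_REGS to FP_REGS is
   costed as GP2FP * 2, since it takes two 64-bit transfers; and with
   !TARGET_SIMD a 16-byte FP-to-FP copy is costed as
   GP2FP + FP2GP + FP2FP, reflecting the scratch general register used by
   the secondary reload described above.  */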
7837 static int
7838 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7839 reg_class_t rclass ATTRIBUTE_UNUSED,
7840 bool in ATTRIBUTE_UNUSED)
7842 return aarch64_tune_params.memmov_cost;
7845 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7846 to optimize 1.0/sqrt. */
7848 static bool
7849 use_rsqrt_p (machine_mode mode)
7851 return (!flag_trapping_math
7852 && flag_unsafe_math_optimizations
7853 && ((aarch64_tune_params.approx_modes->recip_sqrt
7854 & AARCH64_APPROX_MODE (mode))
7855 || flag_mrecip_low_precision_sqrt));
7858 /* Function to decide when to use the approximate reciprocal square root
7859 builtin. */
7861 static tree
7862 aarch64_builtin_reciprocal (tree fndecl)
7864 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7866 if (!use_rsqrt_p (mode))
7867 return NULL_TREE;
7868 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7871 typedef rtx (*rsqrte_type) (rtx, rtx);
7873 /* Select reciprocal square root initial estimate insn depending on machine
7874 mode. */
7876 static rsqrte_type
7877 get_rsqrte_type (machine_mode mode)
7879 switch (mode)
7881 case DFmode: return gen_aarch64_rsqrtedf;
7882 case SFmode: return gen_aarch64_rsqrtesf;
7883 case V2DFmode: return gen_aarch64_rsqrtev2df;
7884 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7885 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7886 default: gcc_unreachable ();
7890 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7892 /* Select reciprocal square root series step insn depending on machine mode. */
7894 static rsqrts_type
7895 get_rsqrts_type (machine_mode mode)
7897 switch (mode)
7899 case DFmode: return gen_aarch64_rsqrtsdf;
7900 case SFmode: return gen_aarch64_rsqrtssf;
7901 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7902 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7903 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7904 default: gcc_unreachable ();
7908 /* Emit instruction sequence to compute either the approximate square root
7909 or its approximate reciprocal, depending on the flag RECP, and return
7910 whether the sequence was emitted or not. */
7912 bool
7913 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7915 machine_mode mode = GET_MODE (dst);
7917 if (GET_MODE_INNER (mode) == HFmode)
7918 return false;
7920 machine_mode mmsk = mode_for_vector
7921 (int_mode_for_mode (GET_MODE_INNER (mode)),
7922 GET_MODE_NUNITS (mode));
7923 bool use_approx_sqrt_p = (!recp
7924 && (flag_mlow_precision_sqrt
7925 || (aarch64_tune_params.approx_modes->sqrt
7926 & AARCH64_APPROX_MODE (mode))));
7927 bool use_approx_rsqrt_p = (recp
7928 && (flag_mrecip_low_precision_sqrt
7929 || (aarch64_tune_params.approx_modes->recip_sqrt
7930 & AARCH64_APPROX_MODE (mode))));
7932 if (!flag_finite_math_only
7933 || flag_trapping_math
7934 || !flag_unsafe_math_optimizations
7935 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7936 || optimize_function_for_size_p (cfun))
7937 return false;
7939 rtx xmsk = gen_reg_rtx (mmsk);
7940 if (!recp)
7941 /* When calculating the approximate square root, compare the argument with
7942 0.0 and create a mask. */
7943 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7944 CONST0_RTX (mode)))));
7946 /* Estimate the approximate reciprocal square root. */
7947 rtx xdst = gen_reg_rtx (mode);
7948 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7950 /* Iterate over the series twice for SF and thrice for DF. */
7951 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7953 /* Optionally iterate over the series one time fewer for faster performance,
7954 at the cost of some accuracy. */
7955 if ((recp && flag_mrecip_low_precision_sqrt)
7956 || (!recp && flag_mlow_precision_sqrt))
7957 iterations--;
7959 /* Iterate over the series to calculate the approximate reciprocal square
7960 root. */
7961 rtx x1 = gen_reg_rtx (mode);
7962 while (iterations--)
7964 rtx x2 = gen_reg_rtx (mode);
7965 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7967 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7969 if (iterations > 0)
7970 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7973 if (!recp)
7975 /* Qualify the approximate reciprocal square root when the argument is
7976 0.0 by squashing the intermediate result to 0.0. */
7977 rtx xtmp = gen_reg_rtx (mmsk);
7978 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7979 gen_rtx_SUBREG (mmsk, xdst, 0)));
7980 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7982 /* Calculate the approximate square root. */
7983 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7986 /* Finalize the approximation. */
7987 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7989 return true;
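/* The series iteration above is a standard Newton-Raphson refinement of
   the FRSQRTE estimate x0 of 1/sqrt(d): each FRSQRTS step yields
   (3 - d * x * x) / 2 and the next estimate is x times that value, so two
   steps are used for SFmode and three for DFmode.  When the square root
   rather than its reciprocal is wanted, the (masked-for-zero) result is
   finally multiplied by d, since sqrt(d) == d * (1/sqrt(d)).  */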
7992 typedef rtx (*recpe_type) (rtx, rtx);
7994 /* Select reciprocal initial estimate insn depending on machine mode. */
7996 static recpe_type
7997 get_recpe_type (machine_mode mode)
7999 switch (mode)
8001 case SFmode: return (gen_aarch64_frecpesf);
8002 case V2SFmode: return (gen_aarch64_frecpev2sf);
8003 case V4SFmode: return (gen_aarch64_frecpev4sf);
8004 case DFmode: return (gen_aarch64_frecpedf);
8005 case V2DFmode: return (gen_aarch64_frecpev2df);
8006 default: gcc_unreachable ();
8010 typedef rtx (*recps_type) (rtx, rtx, rtx);
8012 /* Select reciprocal series step insn depending on machine mode. */
8014 static recps_type
8015 get_recps_type (machine_mode mode)
8017 switch (mode)
8019 case SFmode: return (gen_aarch64_frecpssf);
8020 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8021 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8022 case DFmode: return (gen_aarch64_frecpsdf);
8023 case V2DFmode: return (gen_aarch64_frecpsv2df);
8024 default: gcc_unreachable ();
8028 /* Emit the instruction sequence to compute the approximation for the division
8029 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8031 bool
8032 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8034 machine_mode mode = GET_MODE (quo);
8036 if (GET_MODE_INNER (mode) == HFmode)
8037 return false;
8039 bool use_approx_division_p = (flag_mlow_precision_div
8040 || (aarch64_tune_params.approx_modes->division
8041 & AARCH64_APPROX_MODE (mode)));
8043 if (!flag_finite_math_only
8044 || flag_trapping_math
8045 || !flag_unsafe_math_optimizations
8046 || optimize_function_for_size_p (cfun)
8047 || !use_approx_division_p)
8048 return false;
8050 /* Estimate the approximate reciprocal. */
8051 rtx xrcp = gen_reg_rtx (mode);
8052 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8054 /* Iterate over the series twice for SF and thrice for DF. */
8055 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8057 /* Optionally iterate over the series one time fewer for faster performance,
8058 at the cost of some accuracy. */
8059 if (flag_mlow_precision_div)
8060 iterations--;
8062 /* Iterate over the series to calculate the approximate reciprocal. */
8063 rtx xtmp = gen_reg_rtx (mode);
8064 while (iterations--)
8066 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8068 if (iterations > 0)
8069 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8072 if (num != CONST1_RTX (mode))
8074 /* As the approximate reciprocal of DEN is already calculated, only
8075 calculate the approximate division when NUM is not 1.0. */
8076 rtx xnum = force_reg (mode, num);
8077 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8080 /* Finalize the approximation. */
8081 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8082 return true;
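/* A minimal scalar model of the reciprocal refinement above (an illustrative
   sketch with a made-up helper name, not the RTL this function emits):
   the FRECPS step computes 2 - a*b, so each pass performs
       x <- x * (2 - den * x),
   the Newton-Raphson update that converges towards 1/den; the quotient is
   then obtained as num * (1/den).  */
static double
aarch64_model_recip_step (double den, double x)
{
  double step = 2.0 - den * x;   /* FRECPS (xtmp, xrcp, den).  */
  return x * step;               /* MULT (xrcp, xtmp).         */
}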
8085 /* Return the number of instructions that can be issued per cycle. */
8086 static int
8087 aarch64_sched_issue_rate (void)
8089 return aarch64_tune_params.issue_rate;
8092 static int
8093 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8095 int issue_rate = aarch64_sched_issue_rate ();
8097 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8101 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8102 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8103 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8105 static int
8106 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8107 int ready_index)
8109 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8113 /* Vectorizer cost model target hooks. */
8115 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8116 static int
8117 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8118 tree vectype,
8119 int misalign ATTRIBUTE_UNUSED)
8121 unsigned elements;
8123 switch (type_of_cost)
8125 case scalar_stmt:
8126 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
8128 case scalar_load:
8129 return aarch64_tune_params.vec_costs->scalar_load_cost;
8131 case scalar_store:
8132 return aarch64_tune_params.vec_costs->scalar_store_cost;
8134 case vector_stmt:
8135 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8137 case vector_load:
8138 return aarch64_tune_params.vec_costs->vec_align_load_cost;
8140 case vector_store:
8141 return aarch64_tune_params.vec_costs->vec_store_cost;
8143 case vec_to_scalar:
8144 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
8146 case scalar_to_vec:
8147 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
8149 case unaligned_load:
8150 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
8152 case unaligned_store:
8153 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
8155 case cond_branch_taken:
8156 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
8158 case cond_branch_not_taken:
8159 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
8161 case vec_perm:
8162 return aarch64_tune_params.vec_costs->vec_permute_cost;
8164 case vec_promote_demote:
8165 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8167 case vec_construct:
8168 elements = TYPE_VECTOR_SUBPARTS (vectype);
8169 return elements / 2 + 1;
8171 default:
8172 gcc_unreachable ();
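/* As a worked example of the vec_construct heuristic above: building a
   4-element vector is costed at 4 / 2 + 1 = 3 statements, and an
   8-element vector at 8 / 2 + 1 = 5.  */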
8176 /* Implement targetm.vectorize.add_stmt_cost. */
8177 static unsigned
8178 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8179 struct _stmt_vec_info *stmt_info, int misalign,
8180 enum vect_cost_model_location where)
8182 unsigned *cost = (unsigned *) data;
8183 unsigned retval = 0;
8185 if (flag_vect_cost_model)
8187 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8188 int stmt_cost =
8189 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8191 /* Statements in an inner loop relative to the loop being
8192 vectorized are weighted more heavily. The value here is
8193 arbitrary and could potentially be improved with analysis. */
8194 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8195 count *= 50; /* FIXME */
8197 retval = (unsigned) (count * stmt_cost);
8198 cost[where] += retval;
8201 return retval;
8204 static void initialize_aarch64_code_model (struct gcc_options *);
8206 /* Parse the TO_PARSE string and put the architecture struct that it
8207 selects into RES and the architectural features into ISA_FLAGS.
8208 Return an aarch64_parse_opt_result describing the parse result.
8209 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8211 static enum aarch64_parse_opt_result
8212 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8213 unsigned long *isa_flags)
8215 char *ext;
8216 const struct processor *arch;
8217 char *str = (char *) alloca (strlen (to_parse) + 1);
8218 size_t len;
8220 strcpy (str, to_parse);
8222 ext = strchr (str, '+');
8224 if (ext != NULL)
8225 len = ext - str;
8226 else
8227 len = strlen (str);
8229 if (len == 0)
8230 return AARCH64_PARSE_MISSING_ARG;
8233 /* Loop through the list of supported ARCHes to find a match. */
8234 for (arch = all_architectures; arch->name != NULL; arch++)
8236 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8238 unsigned long isa_temp = arch->flags;
8240 if (ext != NULL)
8242 /* TO_PARSE string contains at least one extension. */
8243 enum aarch64_parse_opt_result ext_res
8244 = aarch64_parse_extension (ext, &isa_temp);
8246 if (ext_res != AARCH64_PARSE_OK)
8247 return ext_res;
8249 /* Extension parsing was successful. Confirm the result
8250 arch and ISA flags. */
8251 *res = arch;
8252 *isa_flags = isa_temp;
8253 return AARCH64_PARSE_OK;
8257 /* ARCH name not found in list. */
8258 return AARCH64_PARSE_INVALID_ARG;
8261 /* Parse the TO_PARSE string and put the result tuning in RES and the
8262 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8263 describing the parse result. If there is an error parsing, RES and
8264 ISA_FLAGS are left unchanged. */
8266 static enum aarch64_parse_opt_result
8267 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8268 unsigned long *isa_flags)
8270 char *ext;
8271 const struct processor *cpu;
8272 char *str = (char *) alloca (strlen (to_parse) + 1);
8273 size_t len;
8275 strcpy (str, to_parse);
8277 ext = strchr (str, '+');
8279 if (ext != NULL)
8280 len = ext - str;
8281 else
8282 len = strlen (str);
8284 if (len == 0)
8285 return AARCH64_PARSE_MISSING_ARG;
8288 /* Loop through the list of supported CPUs to find a match. */
8289 for (cpu = all_cores; cpu->name != NULL; cpu++)
8291 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8293 unsigned long isa_temp = cpu->flags;
8296 if (ext != NULL)
8298 /* TO_PARSE string contains at least one extension. */
8299 enum aarch64_parse_opt_result ext_res
8300 = aarch64_parse_extension (ext, &isa_temp);
8302 if (ext_res != AARCH64_PARSE_OK)
8303 return ext_res;
8305 /* Extension parsing was successful. Confirm the result
8306 cpu and ISA flags. */
8307 *res = cpu;
8308 *isa_flags = isa_temp;
8309 return AARCH64_PARSE_OK;
8313 /* CPU name not found in list. */
8314 return AARCH64_PARSE_INVALID_ARG;
8317 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8318 Return an aarch64_parse_opt_result describing the parse result.
8319 If the parsing fails the RES does not change. */
8321 static enum aarch64_parse_opt_result
8322 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8324 const struct processor *cpu;
8325 char *str = (char *) alloca (strlen (to_parse) + 1);
8327 strcpy (str, to_parse);
8329 /* Loop through the list of supported CPUs to find a match. */
8330 for (cpu = all_cores; cpu->name != NULL; cpu++)
8332 if (strcmp (cpu->name, str) == 0)
8334 *res = cpu;
8335 return AARCH64_PARSE_OK;
8339 /* CPU name not found in list. */
8340 return AARCH64_PARSE_INVALID_ARG;
8343 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8344 described in FLAG. If it is, return the index bit for that fusion type.
8345 If not, error (printing OPTION_NAME) and return zero. */
8347 static unsigned int
8348 aarch64_parse_one_option_token (const char *token,
8349 size_t length,
8350 const struct aarch64_flag_desc *flag,
8351 const char *option_name)
8353 for (; flag->name != NULL; flag++)
8355 if (length == strlen (flag->name)
8356 && !strncmp (flag->name, token, length))
8357 return flag->flag;
8360 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8361 return 0;
8364 /* Parse OPTION which is a comma-separated list of flags to enable.
8365 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8366 default state we inherit from the CPU tuning structures. OPTION_NAME
8367 gives the top-level option we are parsing in the -moverride string,
8368 for use in error messages. */
8370 static unsigned int
8371 aarch64_parse_boolean_options (const char *option,
8372 const struct aarch64_flag_desc *flags,
8373 unsigned int initial_state,
8374 const char *option_name)
8376 const char separator = '.';
8377 const char* specs = option;
8378 const char* ntoken = option;
8379 unsigned int found_flags = initial_state;
8381 while ((ntoken = strchr (specs, separator)))
8383 size_t token_length = ntoken - specs;
8384 unsigned token_ops = aarch64_parse_one_option_token (specs,
8385 token_length,
8386 flags,
8387 option_name);
8388 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8389 in the token stream, reset the supported operations. So:
8391 adrp+add.cmp+branch.none.adrp+add
8393 would have the result of turning on only adrp+add fusion. */
8394 if (!token_ops)
8395 found_flags = 0;
8397 found_flags |= token_ops;
8398 specs = ++ntoken;
8401 /* The string ended with a trailing separator; report it as ill-formed. */
8402 if (!(*specs))
8404 error ("%s string ill-formed\n", option_name);
8405 return 0;
8408 /* We still have one more token to parse. */
8409 size_t token_length = strlen (specs);
8410 unsigned token_ops = aarch64_parse_one_option_token (specs,
8411 token_length,
8412 flags,
8413 option_name);
8414 if (!token_ops)
8415 found_flags = 0;
8417 found_flags |= token_ops;
8418 return found_flags;
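/* Worked example of the parsing above, using the fusion names from the
   comment in the loop: "adrp+add.cmp+branch.none.adrp+add" first
   accumulates adrp+add | cmp+branch, is reset to zero by "none", and
   ends with only adrp+add enabled.  */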
8421 /* Support for overriding instruction fusion. */
8423 static void
8424 aarch64_parse_fuse_string (const char *fuse_string,
8425 struct tune_params *tune)
8427 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8428 aarch64_fusible_pairs,
8429 tune->fusible_ops,
8430 "fuse=");
8433 /* Support for overriding other tuning flags. */
8435 static void
8436 aarch64_parse_tune_string (const char *tune_string,
8437 struct tune_params *tune)
8439 tune->extra_tuning_flags
8440 = aarch64_parse_boolean_options (tune_string,
8441 aarch64_tuning_flags,
8442 tune->extra_tuning_flags,
8443 "tune=");
8446 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8447 we understand. If it is, extract the option string and hand it off to
8448 the appropriate function. */
8450 void
8451 aarch64_parse_one_override_token (const char* token,
8452 size_t length,
8453 struct tune_params *tune)
8455 const struct aarch64_tuning_override_function *fn
8456 = aarch64_tuning_override_functions;
8458 const char *option_part = strchr (token, '=');
8459 if (!option_part)
8461 error ("tuning string missing in option (%s)", token);
8462 return;
8465 /* Get the length of the option name. */
8466 length = option_part - token;
8467 /* Skip the '=' to get to the option string. */
8468 option_part++;
8470 for (; fn->name != NULL; fn++)
8472 if (!strncmp (fn->name, token, length))
8474 fn->parse_override (option_part, tune);
8475 return;
8479 error ("unknown tuning option (%s)", token);
8480 return;
8483 /* Choose a default TLS size if none was given and clamp it to what the selected code model allows. */
8485 static void
8486 initialize_aarch64_tls_size (struct gcc_options *opts)
8488 if (aarch64_tls_size == 0)
8489 aarch64_tls_size = 24;
8491 switch (opts->x_aarch64_cmodel_var)
8493 case AARCH64_CMODEL_TINY:
8494 /* Both the default and the maximum TLS size allowed under tiny are 1M,
8495 which needs two instructions to address, so we clamp the size to 24. */
8496 if (aarch64_tls_size > 24)
8497 aarch64_tls_size = 24;
8498 break;
8499 case AARCH64_CMODEL_SMALL:
8500 /* The maximum TLS size allowed under small is 4G. */
8501 if (aarch64_tls_size > 32)
8502 aarch64_tls_size = 32;
8503 break;
8504 case AARCH64_CMODEL_LARGE:
8505 /* The maximum TLS size allowed under large is 16E.
8506 FIXME: 16E would need 64 bits, but we only support a 48-bit offset now. */
8507 if (aarch64_tls_size > 48)
8508 aarch64_tls_size = 48;
8509 break;
8510 default:
8511 gcc_unreachable ();
8514 return;
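/* For example: an explicit -mtls-size=48 is clamped to 32 under the small
   code model, and anything above 24 is reduced to 24 under the tiny code
   model.  */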
8517 /* Parse STRING looking for options in the format:
8518 string :: option:string
8519 option :: name=substring
8520 name :: {a-z}
8521 substring :: defined by option. */
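/* For example, an -moverride string such as
       fuse=adrp+add.cmp+branch
   contains a single option; several options are separated by ':', and each
   one is handed to aarch64_parse_one_override_token above.  (The fusion
   names here are the ones used as an example earlier in this file.)  */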
8523 static void
8524 aarch64_parse_override_string (const char* input_string,
8525 struct tune_params* tune)
8527 const char separator = ':';
8528 size_t string_length = strlen (input_string) + 1;
8529 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8530 char *string = string_root;
8531 strncpy (string, input_string, string_length);
8532 string[string_length - 1] = '\0';
8534 char* ntoken = string;
8536 while ((ntoken = strchr (string, separator)))
8538 size_t token_length = ntoken - string;
8539 /* Make this substring look like a string. */
8540 *ntoken = '\0';
8541 aarch64_parse_one_override_token (string, token_length, tune);
8542 string = ++ntoken;
8545 /* One last option to parse. */
8546 aarch64_parse_one_override_token (string, strlen (string), tune);
8547 free (string_root);
8551 static void
8552 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8554 /* The logic here is that if we are disabling all frame pointer generation
8555 then we do not need to disable leaf frame pointer generation as a
8556 separate operation. But if we are *only* disabling leaf frame pointer
8557 generation then we set flag_omit_frame_pointer to true, but in
8558 aarch64_frame_pointer_required we return false only for leaf functions.
8560 PR 70044: We have to be careful about being called multiple times for the
8561 same function. Once we have decided to set flag_omit_frame_pointer just
8562 so that we can omit leaf frame pointers, we must then not interpret a
8563 second call as meaning that all frame pointer generation should be
8564 omitted. We do this by setting flag_omit_frame_pointer to a special,
8565 non-zero value. */
8566 if (opts->x_flag_omit_frame_pointer == 2)
8567 opts->x_flag_omit_frame_pointer = 0;
8569 if (opts->x_flag_omit_frame_pointer)
8570 opts->x_flag_omit_leaf_frame_pointer = false;
8571 else if (opts->x_flag_omit_leaf_frame_pointer)
8572 opts->x_flag_omit_frame_pointer = 2;
8574 /* If not optimizing for size, set the default
8575 alignment to what the target wants. */
8576 if (!opts->x_optimize_size)
8578 if (opts->x_align_loops <= 0)
8579 opts->x_align_loops = aarch64_tune_params.loop_align;
8580 if (opts->x_align_jumps <= 0)
8581 opts->x_align_jumps = aarch64_tune_params.jump_align;
8582 if (opts->x_align_functions <= 0)
8583 opts->x_align_functions = aarch64_tune_params.function_align;
8586 /* We default to no pc-relative literal loads. */
8588 aarch64_pcrelative_literal_loads = false;
8590 /* If -mpc-relative-literal-loads is set on the command line, this
8591 implies that the user asked for PC relative literal loads. */
8592 if (opts->x_pcrelative_literal_loads == 1)
8593 aarch64_pcrelative_literal_loads = true;
8595 /* This is PR70113. When building the Linux kernel with
8596 CONFIG_ARM64_ERRATUM_843419, support for relocations
8597 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8598 removed from the kernel to avoid loading objects with possibly
8599 offending sequences. Without -mpc-relative-literal-loads we would
8600 generate such relocations, preventing the kernel build from
8601 succeeding. */
8602 if (opts->x_pcrelative_literal_loads == 2
8603 && TARGET_FIX_ERR_A53_843419)
8604 aarch64_pcrelative_literal_loads = true;
8606 /* In the tiny memory model it makes no sense to disallow PC relative
8607 literal pool loads. */
8608 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8609 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8610 aarch64_pcrelative_literal_loads = true;
8612 /* When enabling the lower precision Newton series for the square root, also
8613 enable it for the reciprocal square root, since the latter is an
8614 intermediary step for the former. */
8615 if (flag_mlow_precision_sqrt)
8616 flag_mrecip_low_precision_sqrt = true;
8619 /* 'Unpack' the internal tuning structs and update the options
8620 in OPTS. The caller must have set up selected_tune and selected_arch
8621 as all the other target-specific codegen decisions are
8622 derived from them. */
8624 void
8625 aarch64_override_options_internal (struct gcc_options *opts)
8627 aarch64_tune_flags = selected_tune->flags;
8628 aarch64_tune = selected_tune->sched_core;
8629 /* Make a copy of the tuning parameters attached to the core, which
8630 we may later overwrite. */
8631 aarch64_tune_params = *(selected_tune->tune);
8632 aarch64_architecture_version = selected_arch->architecture_version;
8634 if (opts->x_aarch64_override_tune_string)
8635 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8636 &aarch64_tune_params);
8638 /* This target defaults to strict volatile bitfields. */
8639 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8640 opts->x_flag_strict_volatile_bitfields = 1;
8642 initialize_aarch64_code_model (opts);
8643 initialize_aarch64_tls_size (opts);
8645 int queue_depth = 0;
8646 switch (aarch64_tune_params.autoprefetcher_model)
8648 case tune_params::AUTOPREFETCHER_OFF:
8649 queue_depth = -1;
8650 break;
8651 case tune_params::AUTOPREFETCHER_WEAK:
8652 queue_depth = 0;
8653 break;
8654 case tune_params::AUTOPREFETCHER_STRONG:
8655 queue_depth = max_insn_queue_index + 1;
8656 break;
8657 default:
8658 gcc_unreachable ();
8661 /* We don't mind passing in global_options_set here as we don't use
8662 the *options_set structs anyway. */
8663 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8664 queue_depth,
8665 opts->x_param_values,
8666 global_options_set.x_param_values);
8668 /* Set the L1 cache line size. */
8669 if (selected_cpu->tune->cache_line_size != 0)
8670 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8671 selected_cpu->tune->cache_line_size,
8672 opts->x_param_values,
8673 global_options_set.x_param_values);
8675 aarch64_override_options_after_change_1 (opts);
8678 /* Print a hint with a suggestion for a core or architecture name that
8679 most closely resembles what the user passed in STR. ARCH is true if
8680 the user is asking for an architecture name. ARCH is false if the user
8681 is asking for a core name. */
8683 static void
8684 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8686 auto_vec<const char *> candidates;
8687 const struct processor *entry = arch ? all_architectures : all_cores;
8688 for (; entry->name != NULL; entry++)
8689 candidates.safe_push (entry->name);
8690 char *s;
8691 const char *hint = candidates_list_and_hint (str, s, candidates);
8692 if (hint)
8693 inform (input_location, "valid arguments are: %s;"
8694 " did you mean %qs?", s, hint);
8695 XDELETEVEC (s);
8698 /* Print a hint with a suggestion for a core name that most closely resembles
8699 what the user passed in STR. */
8701 inline static void
8702 aarch64_print_hint_for_core (const char *str)
8704 aarch64_print_hint_for_core_or_arch (str, false);
8707 /* Print a hint with a suggestion for an architecture name that most closely
8708 resembles what the user passed in STR. */
8710 inline static void
8711 aarch64_print_hint_for_arch (const char *str)
8713 aarch64_print_hint_for_core_or_arch (str, true);
8716 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8717 specified in STR and throw errors if appropriate. Put the results, if
8718 they are valid, in RES and ISA_FLAGS. Return whether the option is
8719 valid. */
8721 static bool
8722 aarch64_validate_mcpu (const char *str, const struct processor **res,
8723 unsigned long *isa_flags)
8725 enum aarch64_parse_opt_result parse_res
8726 = aarch64_parse_cpu (str, res, isa_flags);
8728 if (parse_res == AARCH64_PARSE_OK)
8729 return true;
8731 switch (parse_res)
8733 case AARCH64_PARSE_MISSING_ARG:
8734 error ("missing cpu name in -mcpu=%qs", str);
8735 break;
8736 case AARCH64_PARSE_INVALID_ARG:
8737 error ("unknown value %qs for -mcpu", str);
8738 aarch64_print_hint_for_core (str);
8739 break;
8740 case AARCH64_PARSE_INVALID_FEATURE:
8741 error ("invalid feature modifier in -mcpu=%qs", str);
8742 break;
8743 default:
8744 gcc_unreachable ();
8747 return false;
8750 /* Validate a command-line -march option. Parse the arch and extensions
8751 (if any) specified in STR and throw errors if appropriate. Put the
8752 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8753 option is valid. */
8755 static bool
8756 aarch64_validate_march (const char *str, const struct processor **res,
8757 unsigned long *isa_flags)
8759 enum aarch64_parse_opt_result parse_res
8760 = aarch64_parse_arch (str, res, isa_flags);
8762 if (parse_res == AARCH64_PARSE_OK)
8763 return true;
8765 switch (parse_res)
8767 case AARCH64_PARSE_MISSING_ARG:
8768 error ("missing arch name in -march=%qs", str);
8769 break;
8770 case AARCH64_PARSE_INVALID_ARG:
8771 error ("unknown value %qs for -march", str);
8772 aarch64_print_hint_for_arch (str);
8773 break;
8774 case AARCH64_PARSE_INVALID_FEATURE:
8775 error ("invalid feature modifier in -march=%qs", str);
8776 break;
8777 default:
8778 gcc_unreachable ();
8781 return false;
8784 /* Validate a command-line -mtune option. Parse the cpu
8785 specified in STR and throw errors if appropriate. Put the
8786 result, if it is valid, in RES. Return whether the option is
8787 valid. */
8789 static bool
8790 aarch64_validate_mtune (const char *str, const struct processor **res)
8792 enum aarch64_parse_opt_result parse_res
8793 = aarch64_parse_tune (str, res);
8795 if (parse_res == AARCH64_PARSE_OK)
8796 return true;
8798 switch (parse_res)
8800 case AARCH64_PARSE_MISSING_ARG:
8801 error ("missing cpu name in -mtune=%qs", str);
8802 break;
8803 case AARCH64_PARSE_INVALID_ARG:
8804 error ("unknown value %qs for -mtune", str);
8805 aarch64_print_hint_for_core (str);
8806 break;
8807 default:
8808 gcc_unreachable ();
8810 return false;
8813 /* Return the CPU corresponding to the enum CPU.
8814 If it doesn't specify a cpu, return the default. */
8816 static const struct processor *
8817 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8819 if (cpu != aarch64_none)
8820 return &all_cores[cpu];
8822 /* The & 0x3f is to extract the bottom 6 bits that encode the
8823 default cpu as selected by the --with-cpu GCC configure option
8824 in config.gcc.
8825 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8826 flags mechanism should be reworked to make it more sane. */
8827 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
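/* So TARGET_CPU_DEFAULT is assumed to pack the configure-time default cpu
   index in its low 6 bits, with the remaining bits holding its default ISA
   flags (compare the ">> 6" in aarch64_override_options below).  */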
8830 /* Return the architecture corresponding to the enum ARCH.
8831 If it doesn't specify a valid architecture, return the default. */
8833 static const struct processor *
8834 aarch64_get_arch (enum aarch64_arch arch)
8836 if (arch != aarch64_no_arch)
8837 return &all_architectures[arch];
8839 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8841 return &all_architectures[cpu->arch];
8844 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8845 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8846 tuning structs. In particular it must set selected_tune and
8847 aarch64_isa_flags that define the available ISA features and tuning
8848 decisions. It must also set selected_arch as this will be used to
8849 output the .arch asm tags for each function. */
8851 static void
8852 aarch64_override_options (void)
8854 unsigned long cpu_isa = 0;
8855 unsigned long arch_isa = 0;
8856 aarch64_isa_flags = 0;
8858 bool valid_cpu = true;
8859 bool valid_tune = true;
8860 bool valid_arch = true;
8862 selected_cpu = NULL;
8863 selected_arch = NULL;
8864 selected_tune = NULL;
8866 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8867 If either of -march or -mtune is given, they override their
8868 respective component of -mcpu. */
8869 if (aarch64_cpu_string)
8870 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8871 &cpu_isa);
8873 if (aarch64_arch_string)
8874 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8875 &arch_isa);
8877 if (aarch64_tune_string)
8878 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8880 /* If the user did not specify a processor, choose the default
8881 one for them. This will be the CPU set during configuration using
8882 --with-cpu, otherwise it is "generic". */
8883 if (!selected_cpu)
8885 if (selected_arch)
8887 selected_cpu = &all_cores[selected_arch->ident];
8888 aarch64_isa_flags = arch_isa;
8889 explicit_arch = selected_arch->arch;
8891 else
8893 /* Get default configure-time CPU. */
8894 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8895 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8898 if (selected_tune)
8899 explicit_tune_core = selected_tune->ident;
8901 /* If both -mcpu and -march are specified, check that they are architecturally
8902 compatible; warn if they're not and prefer the -march ISA flags. */
8903 else if (selected_arch)
8905 if (selected_arch->arch != selected_cpu->arch)
8907 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8908 all_architectures[selected_cpu->arch].name,
8909 selected_arch->name);
8911 aarch64_isa_flags = arch_isa;
8912 explicit_arch = selected_arch->arch;
8913 explicit_tune_core = selected_tune ? selected_tune->ident
8914 : selected_cpu->ident;
8916 else
8918 /* -mcpu but no -march. */
8919 aarch64_isa_flags = cpu_isa;
8920 explicit_tune_core = selected_tune ? selected_tune->ident
8921 : selected_cpu->ident;
8922 gcc_assert (selected_cpu);
8923 selected_arch = &all_architectures[selected_cpu->arch];
8924 explicit_arch = selected_arch->arch;
8927 /* Set the arch as well, as we will need it when outputting
8928 the .arch directive in assembly. */
8929 if (!selected_arch)
8931 gcc_assert (selected_cpu);
8932 selected_arch = &all_architectures[selected_cpu->arch];
8935 if (!selected_tune)
8936 selected_tune = selected_cpu;
8938 #ifndef HAVE_AS_MABI_OPTION
8939 /* The compiler may have been configured with 2.23.* binutils, which does
8940 not have support for ILP32. */
8941 if (TARGET_ILP32)
8942 error ("Assembler does not support -mabi=ilp32");
8943 #endif
8945 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
8946 sorry ("Return address signing is only supported for -mabi=lp64");
8948 /* Make sure we properly set up the explicit options. */
8949 if ((aarch64_cpu_string && valid_cpu)
8950 || (aarch64_tune_string && valid_tune))
8951 gcc_assert (explicit_tune_core != aarch64_none);
8953 if ((aarch64_cpu_string && valid_cpu)
8954 || (aarch64_arch_string && valid_arch))
8955 gcc_assert (explicit_arch != aarch64_no_arch);
8957 aarch64_override_options_internal (&global_options);
8959 /* Save these options as the default ones in case we push and pop them later
8960 while processing functions with potential target attributes. */
8961 target_option_default_node = target_option_current_node
8962 = build_target_option_node (&global_options);
8965 /* Implement targetm.override_options_after_change. */
8967 static void
8968 aarch64_override_options_after_change (void)
8970 aarch64_override_options_after_change_1 (&global_options);
8973 static struct machine_function *
8974 aarch64_init_machine_status (void)
8976 struct machine_function *machine;
8977 machine = ggc_cleared_alloc<machine_function> ();
8978 return machine;
8981 void
8982 aarch64_init_expanders (void)
8984 init_machine_status = aarch64_init_machine_status;
8987 /* Set aarch64_cmodel from the requested code model, selecting the PIC variants when -fpic/-fPIC is in effect. */
8988 static void
8989 initialize_aarch64_code_model (struct gcc_options *opts)
8991 if (opts->x_flag_pic)
8993 switch (opts->x_aarch64_cmodel_var)
8995 case AARCH64_CMODEL_TINY:
8996 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8997 break;
8998 case AARCH64_CMODEL_SMALL:
8999 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9000 aarch64_cmodel = (flag_pic == 2
9001 ? AARCH64_CMODEL_SMALL_PIC
9002 : AARCH64_CMODEL_SMALL_SPIC);
9003 #else
9004 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9005 #endif
9006 break;
9007 case AARCH64_CMODEL_LARGE:
9008 sorry ("code model %qs with -f%s", "large",
9009 opts->x_flag_pic > 1 ? "PIC" : "pic");
9010 break;
9011 default:
9012 gcc_unreachable ();
9015 else
9016 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9019 /* Implement TARGET_OPTION_SAVE. */
9021 static void
9022 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9024 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9027 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9028 using the information saved in PTR. */
9030 static void
9031 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9033 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9034 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9035 opts->x_explicit_arch = ptr->x_explicit_arch;
9036 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9037 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9039 aarch64_override_options_internal (opts);
9042 /* Implement TARGET_OPTION_PRINT. */
9044 static void
9045 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9047 const struct processor *cpu
9048 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9049 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9050 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9051 std::string extension
9052 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9054 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9055 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9056 arch->name, extension.c_str ());
9059 static GTY(()) tree aarch64_previous_fndecl;
9061 void
9062 aarch64_reset_previous_fndecl (void)
9064 aarch64_previous_fndecl = NULL;
9067 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9068 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9069 make sure optab availability predicates are recomputed when necessary. */
9071 void
9072 aarch64_save_restore_target_globals (tree new_tree)
9074 if (TREE_TARGET_GLOBALS (new_tree))
9075 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9076 else if (new_tree == target_option_default_node)
9077 restore_target_globals (&default_target_globals);
9078 else
9079 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9082 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9083 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9084 of the function, if such exists. This function may be called multiple
9085 times on a single function so use aarch64_previous_fndecl to avoid
9086 setting up identical state. */
9088 static void
9089 aarch64_set_current_function (tree fndecl)
9091 if (!fndecl || fndecl == aarch64_previous_fndecl)
9092 return;
9094 tree old_tree = (aarch64_previous_fndecl
9095 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9096 : NULL_TREE);
9098 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9100 /* If current function has no attributes but the previous one did,
9101 use the default node. */
9102 if (!new_tree && old_tree)
9103 new_tree = target_option_default_node;
9105 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9106 the default have been handled by aarch64_save_restore_target_globals from
9107 aarch64_pragma_target_parse. */
9108 if (old_tree == new_tree)
9109 return;
9111 aarch64_previous_fndecl = fndecl;
9113 /* First set the target options. */
9114 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9116 aarch64_save_restore_target_globals (new_tree);
9119 /* Enum describing the various ways we can handle attributes.
9120 In many cases we can reuse the generic option handling machinery. */
9122 enum aarch64_attr_opt_type
9124 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9125 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9126 aarch64_attr_enum, /* Attribute sets an enum variable. */
9127 aarch64_attr_custom /* Attribute requires a custom handling function. */
9130 /* All the information needed to handle a target attribute.
9131 NAME is the name of the attribute.
9132 ATTR_TYPE specifies the type of behavior of the attribute as described
9133 in the definition of enum aarch64_attr_opt_type.
9134 ALLOW_NEG is true if the attribute supports a "no-" form.
9135 HANDLER is the function that takes the attribute string and whether
9136 it is a pragma or attribute and handles the option. It is needed only
9137 when the ATTR_TYPE is aarch64_attr_custom.
9138 OPT_NUM is the enum specifying the option that the attribute modifies.
9139 This is needed for attributes that mirror the behavior of a command-line
9140 option, that is, one whose ATTR_TYPE is aarch64_attr_mask, aarch64_attr_bool or
9141 aarch64_attr_enum. */
9143 struct aarch64_attribute_info
9145 const char *name;
9146 enum aarch64_attr_opt_type attr_type;
9147 bool allow_neg;
9148 bool (*handler) (const char *, const char *);
9149 enum opt_code opt_num;
9152 /* Handle the ARCH_STR argument to the arch= target attribute.
9153 PRAGMA_OR_ATTR is used in potential error messages. */
9155 static bool
9156 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9158 const struct processor *tmp_arch = NULL;
9159 enum aarch64_parse_opt_result parse_res
9160 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9162 if (parse_res == AARCH64_PARSE_OK)
9164 gcc_assert (tmp_arch);
9165 selected_arch = tmp_arch;
9166 explicit_arch = selected_arch->arch;
9167 return true;
9170 switch (parse_res)
9172 case AARCH64_PARSE_MISSING_ARG:
9173 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9174 break;
9175 case AARCH64_PARSE_INVALID_ARG:
9176 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9177 aarch64_print_hint_for_arch (str);
9178 break;
9179 case AARCH64_PARSE_INVALID_FEATURE:
9180 error ("invalid feature modifier %qs for 'arch' target %s",
9181 str, pragma_or_attr);
9182 break;
9183 default:
9184 gcc_unreachable ();
9187 return false;
9190 /* Handle the argument CPU_STR to the cpu= target attribute.
9191 PRAGMA_OR_ATTR is used in potential error messages. */
9193 static bool
9194 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9196 const struct processor *tmp_cpu = NULL;
9197 enum aarch64_parse_opt_result parse_res
9198 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9200 if (parse_res == AARCH64_PARSE_OK)
9202 gcc_assert (tmp_cpu);
9203 selected_tune = tmp_cpu;
9204 explicit_tune_core = selected_tune->ident;
9206 selected_arch = &all_architectures[tmp_cpu->arch];
9207 explicit_arch = selected_arch->arch;
9208 return true;
9211 switch (parse_res)
9213 case AARCH64_PARSE_MISSING_ARG:
9214 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9215 break;
9216 case AARCH64_PARSE_INVALID_ARG:
9217 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9218 aarch64_print_hint_for_core (str);
9219 break;
9220 case AARCH64_PARSE_INVALID_FEATURE:
9221 error ("invalid feature modifier %qs for 'cpu' target %s",
9222 str, pragma_or_attr);
9223 break;
9224 default:
9225 gcc_unreachable ();
9228 return false;
9231 /* Handle the argument STR to the tune= target attribute.
9232 PRAGMA_OR_ATTR is used in potential error messages. */
9234 static bool
9235 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9237 const struct processor *tmp_tune = NULL;
9238 enum aarch64_parse_opt_result parse_res
9239 = aarch64_parse_tune (str, &tmp_tune);
9241 if (parse_res == AARCH64_PARSE_OK)
9243 gcc_assert (tmp_tune);
9244 selected_tune = tmp_tune;
9245 explicit_tune_core = selected_tune->ident;
9246 return true;
9249 switch (parse_res)
9251 case AARCH64_PARSE_INVALID_ARG:
9252 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9253 aarch64_print_hint_for_core (str);
9254 break;
9255 default:
9256 gcc_unreachable ();
9259 return false;
9262 /* Parse an architecture extensions target attribute string specified in STR.
9263 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9264 if successful. Update aarch64_isa_flags to reflect the ISA features
9265 modified.
9266 PRAGMA_OR_ATTR is used in potential error messages. */
9268 static bool
9269 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9271 enum aarch64_parse_opt_result parse_res;
9272 unsigned long isa_flags = aarch64_isa_flags;
9274 /* We allow "+nothing" in the beginning to clear out all architectural
9275 features if the user wants to handpick specific features. */
9276 if (strncmp ("+nothing", str, 8) == 0)
9278 isa_flags = 0;
9279 str += 8;
9282 parse_res = aarch64_parse_extension (str, &isa_flags);
9284 if (parse_res == AARCH64_PARSE_OK)
9286 aarch64_isa_flags = isa_flags;
9287 return true;
9290 switch (parse_res)
9292 case AARCH64_PARSE_MISSING_ARG:
9293 error ("missing feature modifier in target %s %qs",
9294 pragma_or_attr, str);
9295 break;
9297 case AARCH64_PARSE_INVALID_FEATURE:
9298 error ("invalid feature modifier in target %s %qs",
9299 pragma_or_attr, str);
9300 break;
9302 default:
9303 gcc_unreachable ();
9306 return false;
9309 /* The target attributes that we support. On top of these we also support just
9310 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9311 handled explicitly in aarch64_process_one_target_attr. */
9313 static const struct aarch64_attribute_info aarch64_attributes[] =
9315 { "general-regs-only", aarch64_attr_mask, false, NULL,
9316 OPT_mgeneral_regs_only },
9317 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9318 OPT_mfix_cortex_a53_835769 },
9319 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9320 OPT_mfix_cortex_a53_843419 },
9321 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9322 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9323 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9324 OPT_momit_leaf_frame_pointer },
9325 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9326 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9327 OPT_march_ },
9328 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9329 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9330 OPT_mtune_ },
9331 { "sign-return-address", aarch64_attr_enum, false, NULL,
9332 OPT_msign_return_address_ },
9333 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
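/* Illustrative uses of entries in the table above (the cpu value is just an
   example of a name taken from all_cores):
     __attribute__ ((target ("cpu=cortex-a57")))              custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  negated boolean
     __attribute__ ((target ("+crc")))                        bare ISA extension  */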
9336 /* Parse ARG_STR which contains the definition of one target attribute.
9337 Show appropriate errors if any or return true if the attribute is valid.
9338 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9339 we're processing a target attribute or pragma. */
9341 static bool
9342 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9344 bool invert = false;
9346 size_t len = strlen (arg_str);
9348 if (len == 0)
9350 error ("malformed target %s", pragma_or_attr);
9351 return false;
9354 char *str_to_check = (char *) alloca (len + 1);
9355 strcpy (str_to_check, arg_str);
9357 /* Skip leading whitespace. */
9358 while (*str_to_check == ' ' || *str_to_check == '\t')
9359 str_to_check++;
9361 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9362 It is easier to detect and handle it explicitly here rather than going
9363 through the machinery for the rest of the target attributes in this
9364 function. */
9365 if (*str_to_check == '+')
9366 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9368 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9370 invert = true;
9371 str_to_check += 3;
9373 char *arg = strchr (str_to_check, '=');
9375 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9376 and point ARG to "foo". */
9377 if (arg)
9379 *arg = '\0';
9380 arg++;
9382 const struct aarch64_attribute_info *p_attr;
9383 bool found = false;
9384 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9386 /* If the names don't match up, or the user has given an argument
9387 to an attribute that doesn't accept one, or didn't give an argument
9388 to an attribute that expects one, fail to match. */
9389 if (strcmp (str_to_check, p_attr->name) != 0)
9390 continue;
9392 found = true;
9393 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9394 || p_attr->attr_type == aarch64_attr_enum;
9396 if (attr_need_arg_p ^ (arg != NULL))
9398 error ("target %s %qs does not accept an argument",
9399 pragma_or_attr, str_to_check);
9400 return false;
9403 /* If the name matches but the attribute does not allow "no-" versions
9404 then we can't match. */
9405 if (invert && !p_attr->allow_neg)
9407 error ("target %s %qs does not allow a negated form",
9408 pragma_or_attr, str_to_check);
9409 return false;
9412 switch (p_attr->attr_type)
9414 /* Has a custom handler registered.
9415 For example, cpu=, arch=, tune=. */
9416 case aarch64_attr_custom:
9417 gcc_assert (p_attr->handler);
9418 if (!p_attr->handler (arg, pragma_or_attr))
9419 return false;
9420 break;
9422 /* Either set or unset a boolean option. */
9423 case aarch64_attr_bool:
9425 struct cl_decoded_option decoded;
9427 generate_option (p_attr->opt_num, NULL, !invert,
9428 CL_TARGET, &decoded);
9429 aarch64_handle_option (&global_options, &global_options_set,
9430 &decoded, input_location);
9431 break;
9433 /* Set or unset a bit in the target_flags. aarch64_handle_option
9434 should know what mask to apply given the option number. */
9435 case aarch64_attr_mask:
9437 struct cl_decoded_option decoded;
9438 /* We only need to specify the option number.
9439 aarch64_handle_option will know which mask to apply. */
9440 decoded.opt_index = p_attr->opt_num;
9441 decoded.value = !invert;
9442 aarch64_handle_option (&global_options, &global_options_set,
9443 &decoded, input_location);
9444 break;
9446 /* Use the option setting machinery to set an option to an enum. */
9447 case aarch64_attr_enum:
9449 gcc_assert (arg);
9450 bool valid;
9451 int value;
9452 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9453 &value, CL_TARGET);
9454 if (valid)
9456 set_option (&global_options, NULL, p_attr->opt_num, value,
9457 NULL, DK_UNSPECIFIED, input_location,
9458 global_dc);
9460 else
9462 error ("target %s %s=%s is not valid",
9463 pragma_or_attr, str_to_check, arg);
9465 break;
9467 default:
9468 gcc_unreachable ();
9472 /* If we reached here we either have found an attribute and validated
9473 it or didn't match any. If we matched an attribute but its arguments
9474 were malformed we will have returned false already. */
9475 return found;
9478 /* Count how many times the character C appears in
9479 NULL-terminated string STR. */
9481 static unsigned int
9482 num_occurences_in_str (char c, char *str)
9484 unsigned int res = 0;
9485 while (*str != '\0')
9487 if (*str == c)
9488 res++;
9490 str++;
9493 return res;
9496 /* Parse the tree in ARGS that contains the target attribute information
9497 and update the global target options space. PRAGMA_OR_ATTR is a string
9498 to be used in error messages, specifying whether this is processing
9499 a target attribute or a target pragma. */
9501 bool
9502 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9504 if (TREE_CODE (args) == TREE_LIST)
9508 tree head = TREE_VALUE (args);
9509 if (head)
9511 if (!aarch64_process_target_attr (head, pragma_or_attr))
9512 return false;
9514 args = TREE_CHAIN (args);
9515 } while (args);
9517 return true;
9519 /* We expect to find a string to parse. */
9520 gcc_assert (TREE_CODE (args) == STRING_CST);
9522 size_t len = strlen (TREE_STRING_POINTER (args));
9523 char *str_to_check = (char *) alloca (len + 1);
9524 strcpy (str_to_check, TREE_STRING_POINTER (args));
9526 if (len == 0)
9528 error ("malformed target %s value", pragma_or_attr);
9529 return false;
9532 /* Used to catch empty entries between commas, i.e.
9533 attribute ((target ("attr1,,attr2"))). */
9534 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9536 /* Handle multiple target attributes separated by ','. */
9537 char *token = strtok (str_to_check, ",");
9539 unsigned int num_attrs = 0;
9540 while (token)
9542 num_attrs++;
9543 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9545 error ("target %s %qs is invalid", pragma_or_attr, token);
9546 return false;
9549 token = strtok (NULL, ",");
9552 if (num_attrs != num_commas + 1)
9554 error ("malformed target %s list %qs",
9555 pragma_or_attr, TREE_STRING_POINTER (args));
9556 return false;
9559 return true;
9562 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9563 process attribute ((target ("..."))). */
9565 static bool
9566 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9568 struct cl_target_option cur_target;
9569 bool ret;
9570 tree old_optimize;
9571 tree new_target, new_optimize;
9572 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9574 /* If what we're processing is the current pragma string then the
9575 target option node is already stored in target_option_current_node
9576 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9577 having to re-parse the string. This is especially useful to keep
9578 arm_neon.h compile times down since that header contains a lot
9579 of intrinsics enclosed in pragmas. */
9580 if (!existing_target && args == current_target_pragma)
9582 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9583 return true;
9585 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9587 old_optimize = build_optimization_node (&global_options);
9588 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9590 /* If the function changed the optimization levels as well as setting
9591 target options, start with the optimizations specified. */
9592 if (func_optimize && func_optimize != old_optimize)
9593 cl_optimization_restore (&global_options,
9594 TREE_OPTIMIZATION (func_optimize));
9596 /* Save the current target options to restore at the end. */
9597 cl_target_option_save (&cur_target, &global_options);
9599 /* If fndecl already has some target attributes applied to it, unpack
9600 them so that we add this attribute on top of them, rather than
9601 overwriting them. */
9602 if (existing_target)
9604 struct cl_target_option *existing_options
9605 = TREE_TARGET_OPTION (existing_target);
9607 if (existing_options)
9608 cl_target_option_restore (&global_options, existing_options);
9610 else
9611 cl_target_option_restore (&global_options,
9612 TREE_TARGET_OPTION (target_option_current_node));
9615 ret = aarch64_process_target_attr (args, "attribute");
9617 /* Set up any additional state. */
9618 if (ret)
9620 aarch64_override_options_internal (&global_options);
9621 /* Initialize SIMD builtins if we haven't already.
9622 Set current_target_pragma to NULL for the duration so that
9623 the builtin initialization code doesn't try to tag the functions
9624 being built with the attributes specified by any current pragma, thus
9625 going into an infinite recursion. */
9626 if (TARGET_SIMD)
9628 tree saved_current_target_pragma = current_target_pragma;
9629 current_target_pragma = NULL;
9630 aarch64_init_simd_builtins ();
9631 current_target_pragma = saved_current_target_pragma;
9633 new_target = build_target_option_node (&global_options);
9635 else
9636 new_target = NULL;
9638 new_optimize = build_optimization_node (&global_options);
9640 if (fndecl && ret)
9642 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9644 if (old_optimize != new_optimize)
9645 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9648 cl_target_option_restore (&global_options, &cur_target);
9650 if (old_optimize != new_optimize)
9651 cl_optimization_restore (&global_options,
9652 TREE_OPTIMIZATION (old_optimize));
9653 return ret;
9656 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9657 tri-bool options (yes, no, don't care) and the default value is
9658 DEF, determine whether to reject inlining. */
9660 static bool
9661 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9662 int dont_care, int def)
9664 /* If the callee doesn't care, always allow inlining. */
9665 if (callee == dont_care)
9666 return true;
9668 /* If the caller doesn't care, always allow inlining. */
9669 if (caller == dont_care)
9670 return true;
9672 /* Otherwise, allow inlining if either the callee and caller values
9673 agree, or if the callee is using the default value. */
9674 return (callee == caller || callee == def);
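/* Put another way: inlining is rejected only when both caller and callee
   specify an explicit value, the two values differ, and the callee's value
   is not the default DEF.  */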
9677 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9678 to inline CALLEE into CALLER based on target-specific info.
9679 Make sure that the caller and callee have compatible architectural
9680 features. Then go through the other possible target attributes
9681 and see if they can block inlining. Try not to reject always_inline
9682 callees unless they are incompatible architecturally. */
9684 static bool
9685 aarch64_can_inline_p (tree caller, tree callee)
9687 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9688 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9690 /* If callee has no option attributes, then it is ok to inline. */
9691 if (!callee_tree)
9692 return true;
9694 struct cl_target_option *caller_opts
9695 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9696 : target_option_default_node);
9698 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9701 /* Callee's ISA flags should be a subset of the caller's. */
9702 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9703 != callee_opts->x_aarch64_isa_flags)
9704 return false;
9706 /* Allow non-strict aligned functions inlining into strict
9707 aligned ones. */
9708 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9709 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9710 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9711 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9712 return false;
9714 bool always_inline = lookup_attribute ("always_inline",
9715 DECL_ATTRIBUTES (callee));
9717 /* If the architectural features match up and the callee is always_inline
9718 then the other attributes don't matter. */
9719 if (always_inline)
9720 return true;
9722 if (caller_opts->x_aarch64_cmodel_var
9723 != callee_opts->x_aarch64_cmodel_var)
9724 return false;
9726 if (caller_opts->x_aarch64_tls_dialect
9727 != callee_opts->x_aarch64_tls_dialect)
9728 return false;
9730 /* Honour explicit requests to workaround errata. */
9731 if (!aarch64_tribools_ok_for_inlining_p (
9732 caller_opts->x_aarch64_fix_a53_err835769,
9733 callee_opts->x_aarch64_fix_a53_err835769,
9734 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9735 return false;
9737 if (!aarch64_tribools_ok_for_inlining_p (
9738 caller_opts->x_aarch64_fix_a53_err843419,
9739 callee_opts->x_aarch64_fix_a53_err843419,
9740 2, TARGET_FIX_ERR_A53_843419))
9741 return false;
9743 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9744 caller and callee and they don't match up, reject inlining. */
9745 if (!aarch64_tribools_ok_for_inlining_p (
9746 caller_opts->x_flag_omit_leaf_frame_pointer,
9747 callee_opts->x_flag_omit_leaf_frame_pointer,
9748 2, 1))
9749 return false;
9751 /* If the callee has specific tuning overrides, respect them. */
9752 if (callee_opts->x_aarch64_override_tune_string != NULL
9753 && caller_opts->x_aarch64_override_tune_string == NULL)
9754 return false;
9756 /* If the user specified tuning override strings for the
9757 caller and callee and they don't match up, reject inlining.
9758 We just do a string compare here; we don't analyze the meaning
9759 of the string, as it would be too costly for little gain. */
9760 if (callee_opts->x_aarch64_override_tune_string
9761 && caller_opts->x_aarch64_override_tune_string
9762 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9763 caller_opts->x_aarch64_override_tune_string) != 0))
9764 return false;
9766 return true;
9769 /* Return true if SYMBOL_REF X binds locally. */
9771 static bool
9772 aarch64_symbol_binds_local_p (const_rtx x)
9774 return (SYMBOL_REF_DECL (x)
9775 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9776 : SYMBOL_REF_LOCAL_P (x));
9779 /* Return true if SYMBOL_REF X is thread-local. */
9780 static bool
9781 aarch64_tls_symbol_p (rtx x)
9783 if (! TARGET_HAVE_TLS)
9784 return false;
9786 if (GET_CODE (x) != SYMBOL_REF)
9787 return false;
9789 return SYMBOL_REF_TLS_MODEL (x) != 0;
9792 /* Classify a TLS symbol into one of the TLS kinds. */
9793 enum aarch64_symbol_type
9794 aarch64_classify_tls_symbol (rtx x)
9796 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9798 switch (tls_kind)
9800 case TLS_MODEL_GLOBAL_DYNAMIC:
9801 case TLS_MODEL_LOCAL_DYNAMIC:
9802 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9804 case TLS_MODEL_INITIAL_EXEC:
9805 switch (aarch64_cmodel)
9807 case AARCH64_CMODEL_TINY:
9808 case AARCH64_CMODEL_TINY_PIC:
9809 return SYMBOL_TINY_TLSIE;
9810 default:
9811 return SYMBOL_SMALL_TLSIE;
9814 case TLS_MODEL_LOCAL_EXEC:
9815 if (aarch64_tls_size == 12)
9816 return SYMBOL_TLSLE12;
9817 else if (aarch64_tls_size == 24)
9818 return SYMBOL_TLSLE24;
9819 else if (aarch64_tls_size == 32)
9820 return SYMBOL_TLSLE32;
9821 else if (aarch64_tls_size == 48)
9822 return SYMBOL_TLSLE48;
9823 else
9824 gcc_unreachable ();
9826 case TLS_MODEL_EMULATED:
9827 case TLS_MODEL_NONE:
9828 return SYMBOL_FORCE_TO_MEM;
9830 default:
9831 gcc_unreachable ();
9835 /* Return the method that should be used to access SYMBOL_REF or
9836 LABEL_REF X. */
9838 enum aarch64_symbol_type
9839 aarch64_classify_symbol (rtx x, rtx offset)
9841 if (GET_CODE (x) == LABEL_REF)
9843 switch (aarch64_cmodel)
9845 case AARCH64_CMODEL_LARGE:
9846 return SYMBOL_FORCE_TO_MEM;
9848 case AARCH64_CMODEL_TINY_PIC:
9849 case AARCH64_CMODEL_TINY:
9850 return SYMBOL_TINY_ABSOLUTE;
9852 case AARCH64_CMODEL_SMALL_SPIC:
9853 case AARCH64_CMODEL_SMALL_PIC:
9854 case AARCH64_CMODEL_SMALL:
9855 return SYMBOL_SMALL_ABSOLUTE;
9857 default:
9858 gcc_unreachable ();
9862 if (GET_CODE (x) == SYMBOL_REF)
9864 if (aarch64_tls_symbol_p (x))
9865 return aarch64_classify_tls_symbol (x);
9867 switch (aarch64_cmodel)
9869 case AARCH64_CMODEL_TINY:
9870 /* When we retrieve symbol + offset address, we have to make sure
9871 the offset does not cause overflow of the final address. But
9872 we have no way of knowing the address of symbol at compile time
9873 so we can't accurately say if the distance between the PC and
9874 symbol + offset is outside the addressable range of +/-1M in the
9875 TINY code model. So we rely on images not being greater than
9876 1M, cap the offset at 1M, and require anything beyond 1M to be
9877 loaded using an alternative mechanism. Furthermore, if the
9878 symbol is a weak reference to something that isn't known to
9879 resolve to a symbol in this module, then force to memory. */
9880 if ((SYMBOL_REF_WEAK (x)
9881 && !aarch64_symbol_binds_local_p (x))
9882 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9883 return SYMBOL_FORCE_TO_MEM;
9884 return SYMBOL_TINY_ABSOLUTE;
9886 case AARCH64_CMODEL_SMALL:
9887 /* Same reasoning as the tiny code model, but the offset cap here is
9888 4G. */
9889 if ((SYMBOL_REF_WEAK (x)
9890 && !aarch64_symbol_binds_local_p (x))
9891 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9892 HOST_WIDE_INT_C (4294967264)))
9893 return SYMBOL_FORCE_TO_MEM;
9894 return SYMBOL_SMALL_ABSOLUTE;
9896 case AARCH64_CMODEL_TINY_PIC:
9897 if (!aarch64_symbol_binds_local_p (x))
9898 return SYMBOL_TINY_GOT;
9899 return SYMBOL_TINY_ABSOLUTE;
9901 case AARCH64_CMODEL_SMALL_SPIC:
9902 case AARCH64_CMODEL_SMALL_PIC:
9903 if (!aarch64_symbol_binds_local_p (x))
9904 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9905 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9906 return SYMBOL_SMALL_ABSOLUTE;
9908 case AARCH64_CMODEL_LARGE:
9909 /* This is alright even in PIC code as the constant
9910 pool reference is always PC relative and within
9911 the same translation unit. */
9912 if (CONSTANT_POOL_ADDRESS_P (x))
9913 return SYMBOL_SMALL_ABSOLUTE;
9914 else
9915 return SYMBOL_FORCE_TO_MEM;
9917 default:
9918 gcc_unreachable ();
9922 /* By default push everything into the constant pool. */
9923 return SYMBOL_FORCE_TO_MEM;
9926 bool
9927 aarch64_constant_address_p (rtx x)
9929 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9932 bool
9933 aarch64_legitimate_pic_operand_p (rtx x)
9935 if (GET_CODE (x) == SYMBOL_REF
9936 || (GET_CODE (x) == CONST
9937 && GET_CODE (XEXP (x, 0)) == PLUS
9938 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9939 return false;
9941 return true;
9944 /* Return true if X holds either the floating-point constant +0.0 or a
9945 constant representable in quarter-precision (8-bit immediate) form. */
9946 static bool
9947 aarch64_valid_floating_const (machine_mode mode, rtx x)
9949 if (!CONST_DOUBLE_P (x))
9950 return false;
9952 if (aarch64_float_const_zero_rtx_p (x))
9953 return true;
9955 /* Beyond +0.0 (handled above), we only handle SFmode and DFmode
constants; for TFmode only 0.0 can be moved directly. */
9956 if (!(mode == SFmode || mode == DFmode))
9957 return false;
9959 return aarch64_float_const_representable_p (x);
9962 static bool
9963 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9965 /* Do not allow vector struct mode constants. We could support
9966 0 and -1 easily, but they need support in aarch64-simd.md. */
9967 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9968 return false;
9970 /* This could probably go away because
9971 we now decompose CONST_INTs according to expand_mov_immediate. */
9972 if ((GET_CODE (x) == CONST_VECTOR
9973 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9974 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9975 return !targetm.cannot_force_const_mem (mode, x);
9977 if (GET_CODE (x) == HIGH
9978 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9979 return true;
9981 return aarch64_constant_address_p (x);
9985 aarch64_load_tp (rtx target)
9987 if (!target
9988 || GET_MODE (target) != Pmode
9989 || !register_operand (target, Pmode))
9990 target = gen_reg_rtx (Pmode);
9992 /* Can return in any reg. */
9993 emit_insn (gen_aarch64_load_tp_hard (target));
9994 return target;
9997 /* On AAPCS systems, this is the "struct __va_list". */
9998 static GTY(()) tree va_list_type;
10000 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10001 Return the type to use as __builtin_va_list.
10003 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10005 struct __va_list
10007 void *__stack;
10008 void *__gr_top;
10009 void *__vr_top;
10010 int __gr_offs;
10011 int __vr_offs;
10012 }; */
10014 static tree
10015 aarch64_build_builtin_va_list (void)
10017 tree va_list_name;
10018 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10020 /* Create the type. */
10021 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10022 /* Give it the required name. */
10023 va_list_name = build_decl (BUILTINS_LOCATION,
10024 TYPE_DECL,
10025 get_identifier ("__va_list"),
10026 va_list_type);
10027 DECL_ARTIFICIAL (va_list_name) = 1;
10028 TYPE_NAME (va_list_type) = va_list_name;
10029 TYPE_STUB_DECL (va_list_type) = va_list_name;
10031 /* Create the fields. */
10032 f_stack = build_decl (BUILTINS_LOCATION,
10033 FIELD_DECL, get_identifier ("__stack"),
10034 ptr_type_node);
10035 f_grtop = build_decl (BUILTINS_LOCATION,
10036 FIELD_DECL, get_identifier ("__gr_top"),
10037 ptr_type_node);
10038 f_vrtop = build_decl (BUILTINS_LOCATION,
10039 FIELD_DECL, get_identifier ("__vr_top"),
10040 ptr_type_node);
10041 f_groff = build_decl (BUILTINS_LOCATION,
10042 FIELD_DECL, get_identifier ("__gr_offs"),
10043 integer_type_node);
10044 f_vroff = build_decl (BUILTINS_LOCATION,
10045 FIELD_DECL, get_identifier ("__vr_offs"),
10046 integer_type_node);
10048 /* Tell the tree-stdarg pass about our internal offset fields.
10049 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10050 purposes, to identify whether the code is updating the va_list internal
10051 offset fields in an irregular way. */
10052 va_list_gpr_counter_field = f_groff;
10053 va_list_fpr_counter_field = f_vroff;
10055 DECL_ARTIFICIAL (f_stack) = 1;
10056 DECL_ARTIFICIAL (f_grtop) = 1;
10057 DECL_ARTIFICIAL (f_vrtop) = 1;
10058 DECL_ARTIFICIAL (f_groff) = 1;
10059 DECL_ARTIFICIAL (f_vroff) = 1;
10061 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10062 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10063 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10064 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10065 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10067 TYPE_FIELDS (va_list_type) = f_stack;
10068 DECL_CHAIN (f_stack) = f_grtop;
10069 DECL_CHAIN (f_grtop) = f_vrtop;
10070 DECL_CHAIN (f_vrtop) = f_groff;
10071 DECL_CHAIN (f_groff) = f_vroff;
10073 /* Compute its layout. */
10074 layout_type (va_list_type);
10076 return va_list_type;
10079 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10080 static void
10081 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10083 const CUMULATIVE_ARGS *cum;
10084 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10085 tree stack, grtop, vrtop, groff, vroff;
10086 tree t;
10087 int gr_save_area_size = cfun->va_list_gpr_size;
10088 int vr_save_area_size = cfun->va_list_fpr_size;
10089 int vr_offset;
10091 cum = &crtl->args.info;
10092 if (cfun->va_list_gpr_size)
10093 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10094 cfun->va_list_gpr_size);
10095 if (cfun->va_list_fpr_size)
10096 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10097 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10099 if (!TARGET_FLOAT)
10101 gcc_assert (cum->aapcs_nvrn == 0);
10102 vr_save_area_size = 0;
10105 f_stack = TYPE_FIELDS (va_list_type_node);
10106 f_grtop = DECL_CHAIN (f_stack);
10107 f_vrtop = DECL_CHAIN (f_grtop);
10108 f_groff = DECL_CHAIN (f_vrtop);
10109 f_vroff = DECL_CHAIN (f_groff);
10111 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10112 NULL_TREE);
10113 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10114 NULL_TREE);
10115 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10116 NULL_TREE);
10117 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10118 NULL_TREE);
10119 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10120 NULL_TREE);
10122 /* Emit code to initialize STACK, which points to the next varargs stack
10123 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10124 by named arguments. STACK is 8-byte aligned. */
10125 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10126 if (cum->aapcs_stack_size > 0)
10127 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10128 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10129 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10131 /* Emit code to initialize GRTOP, the top of the GR save area.
10132 virtual_incoming_args_rtx should have been 16 byte aligned. */
10133 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10134 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10135 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10137 /* Emit code to initialize VRTOP, the top of the VR save area.
10138 This address is gr_save_area_bytes below GRTOP, rounded
10139 down to the next 16-byte boundary. */
10140 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10141 vr_offset = ROUND_UP (gr_save_area_size,
10142 STACK_BOUNDARY / BITS_PER_UNIT);
10144 if (vr_offset)
10145 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10146 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10147 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10149 /* Emit code to initialize GROFF, the offset from GRTOP of the
10150 next GPR argument. */
10151 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10152 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10153 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10155 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10156 of the next VR argument. */
10157 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10158 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10159 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
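/* Illustrative pseudo-C sketch of the initialisation emitted above (names
   are informal; "VIA" stands for virtual_incoming_args_rtx):

     ap.__stack   = VIA + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = VIA;
     ap.__vr_top  = VIA - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */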
10162 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10164 static tree
10165 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10166 gimple_seq *post_p ATTRIBUTE_UNUSED)
10168 tree addr;
10169 bool indirect_p;
10170 bool is_ha; /* is HFA or HVA. */
10171 bool dw_align; /* double-word align. */
10172 machine_mode ag_mode = VOIDmode;
10173 int nregs;
10174 machine_mode mode;
10176 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10177 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10178 HOST_WIDE_INT size, rsize, adjust, align;
10179 tree t, u, cond1, cond2;
10181 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10182 if (indirect_p)
10183 type = build_pointer_type (type);
10185 mode = TYPE_MODE (type);
10187 f_stack = TYPE_FIELDS (va_list_type_node);
10188 f_grtop = DECL_CHAIN (f_stack);
10189 f_vrtop = DECL_CHAIN (f_grtop);
10190 f_groff = DECL_CHAIN (f_vrtop);
10191 f_vroff = DECL_CHAIN (f_groff);
10193 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10194 f_stack, NULL_TREE);
10195 size = int_size_in_bytes (type);
10196 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10198 dw_align = false;
10199 adjust = 0;
10200 if (aarch64_vfp_is_call_or_return_candidate (mode,
10201 type,
10202 &ag_mode,
10203 &nregs,
10204 &is_ha))
10206 /* TYPE passed in fp/simd registers. */
10207 if (!TARGET_FLOAT)
10208 aarch64_err_no_fpadvsimd (mode, "varargs");
10210 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10211 unshare_expr (valist), f_vrtop, NULL_TREE);
10212 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10213 unshare_expr (valist), f_vroff, NULL_TREE);
10215 rsize = nregs * UNITS_PER_VREG;
10217 if (is_ha)
10219 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10220 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10222 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10223 && size < UNITS_PER_VREG)
10225 adjust = UNITS_PER_VREG - size;
10228 else
10230 /* TYPE passed in general registers. */
10231 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10232 unshare_expr (valist), f_grtop, NULL_TREE);
10233 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10234 unshare_expr (valist), f_groff, NULL_TREE);
10235 rsize = ROUND_UP (size, UNITS_PER_WORD);
10236 nregs = rsize / UNITS_PER_WORD;
10238 if (align > 8)
10239 dw_align = true;
10241 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10242 && size < UNITS_PER_WORD)
10244 adjust = UNITS_PER_WORD - size;
10248 /* Get a local temporary for the field value. */
10249 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10251 /* Emit code to branch if off >= 0. */
10252 t = build2 (GE_EXPR, boolean_type_node, off,
10253 build_int_cst (TREE_TYPE (off), 0));
10254 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10256 if (dw_align)
10258 /* Emit: offs = (offs + 15) & -16. */
10259 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10260 build_int_cst (TREE_TYPE (off), 15));
10261 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10262 build_int_cst (TREE_TYPE (off), -16));
10263 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10265 else
10266 roundup = NULL;
10268 /* Update ap.__[g|v]r_offs */
10269 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10270 build_int_cst (TREE_TYPE (off), rsize));
10271 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10273 /* String up. */
10274 if (roundup)
10275 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10277 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10278 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10279 build_int_cst (TREE_TYPE (f_off), 0));
10280 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10282 /* String up: make sure the assignment happens before the use. */
10283 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10284 COND_EXPR_ELSE (cond1) = t;
10286 /* Prepare the trees handling the argument that is passed on the stack;
10287 the top level node will store in ON_STACK. */
10288 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10289 if (align > 8)
10291 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10292 t = fold_convert (intDI_type_node, arg);
10293 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10294 build_int_cst (TREE_TYPE (t), 15));
10295 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10296 build_int_cst (TREE_TYPE (t), -16));
10297 t = fold_convert (TREE_TYPE (arg), t);
10298 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10300 else
10301 roundup = NULL;
10302 /* Advance ap.__stack */
10303 t = fold_convert (intDI_type_node, arg);
10304 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10305 build_int_cst (TREE_TYPE (t), size + 7));
10306 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10307 build_int_cst (TREE_TYPE (t), -8));
10308 t = fold_convert (TREE_TYPE (arg), t);
10309 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10310 /* String up roundup and advance. */
10311 if (roundup)
10312 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10313 /* String up with arg */
10314 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10315 /* Big-endianness related address adjustment. */
10316 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10317 && size < UNITS_PER_WORD)
10319 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10320 size_int (UNITS_PER_WORD - size));
10321 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10324 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10325 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10327 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10328 t = off;
10329 if (adjust)
10330 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10331 build_int_cst (TREE_TYPE (off), adjust));
10333 t = fold_convert (sizetype, t);
10334 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10336 if (is_ha)
10338 /* type ha; // treat as "struct {ftype field[n];}"
10339 ... [computing offs]
10340 for (i = 0; i < nregs; ++i, offs += 16)
10341 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10342 return ha; */
10343 int i;
10344 tree tmp_ha, field_t, field_ptr_t;
10346 /* Declare a local variable. */
10347 tmp_ha = create_tmp_var_raw (type, "ha");
10348 gimple_add_tmp_var (tmp_ha);
10350 /* Establish the base type. */
10351 switch (ag_mode)
10353 case SFmode:
10354 field_t = float_type_node;
10355 field_ptr_t = float_ptr_type_node;
10356 break;
10357 case DFmode:
10358 field_t = double_type_node;
10359 field_ptr_t = double_ptr_type_node;
10360 break;
10361 case TFmode:
10362 field_t = long_double_type_node;
10363 field_ptr_t = long_double_ptr_type_node;
10364 break;
10365 case HFmode:
10366 field_t = aarch64_fp16_type_node;
10367 field_ptr_t = aarch64_fp16_ptr_type_node;
10368 break;
10369 case V2SImode:
10370 case V4SImode:
10372 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10373 field_t = build_vector_type_for_mode (innertype, ag_mode);
10374 field_ptr_t = build_pointer_type (field_t);
10376 break;
10377 default:
10378 gcc_assert (0);
10381 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10382 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10383 addr = t;
10384 t = fold_convert (field_ptr_t, addr);
10385 t = build2 (MODIFY_EXPR, field_t,
10386 build1 (INDIRECT_REF, field_t, tmp_ha),
10387 build1 (INDIRECT_REF, field_t, t));
10389 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10390 for (i = 1; i < nregs; ++i)
10392 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10393 u = fold_convert (field_ptr_t, addr);
10394 u = build2 (MODIFY_EXPR, field_t,
10395 build2 (MEM_REF, field_t, tmp_ha,
10396 build_int_cst (field_ptr_t,
10397 (i *
10398 int_size_in_bytes (field_t)))),
10399 build1 (INDIRECT_REF, field_t, u));
10400 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10403 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10404 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10407 COND_EXPR_ELSE (cond2) = t;
10408 addr = fold_convert (build_pointer_type (type), cond1);
10409 addr = build_va_arg_indirect_ref (addr);
10411 if (indirect_p)
10412 addr = build_va_arg_indirect_ref (addr);
10414 return addr;
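/* Illustrative pseudo-C sketch of the tree built above for an argument
   taken from the general registers (names are informal; the FP/SIMD path
   is analogous, using __vr_top/__vr_offs, 16-byte units and, for
   homogeneous aggregates, the element-by-element copy above):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                  // register save area already used up
     if (alignof (type) > 8)
       off = (off + 15) & -16;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                  // this argument did not fit
     arg = *(type *) (ap.__gr_top + off + be_adjust);
     ...
   on_stack:
     // round ap.__stack if over-aligned, read the argument, then advance
     // ap.__stack by the size rounded up to 8 bytes.  */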
10417 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10419 static void
10420 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10421 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10422 int no_rtl)
10424 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10425 CUMULATIVE_ARGS local_cum;
10426 int gr_saved = cfun->va_list_gpr_size;
10427 int vr_saved = cfun->va_list_fpr_size;
10429 /* The caller has advanced CUM up to, but not beyond, the last named
10430 argument. Advance a local copy of CUM past the last "real" named
10431 argument, to find out how many registers are left over. */
10432 local_cum = *cum;
10433 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10435 /* Find out how many registers we need to save.
10436 Honor the tree-stdarg analysis results. */
10437 if (cfun->va_list_gpr_size)
10438 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10439 cfun->va_list_gpr_size / UNITS_PER_WORD);
10440 if (cfun->va_list_fpr_size)
10441 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10442 cfun->va_list_fpr_size / UNITS_PER_VREG);
10444 if (!TARGET_FLOAT)
10446 gcc_assert (local_cum.aapcs_nvrn == 0);
10447 vr_saved = 0;
10450 if (!no_rtl)
10452 if (gr_saved > 0)
10454 rtx ptr, mem;
10456 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10457 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10458 - gr_saved * UNITS_PER_WORD);
10459 mem = gen_frame_mem (BLKmode, ptr);
10460 set_mem_alias_set (mem, get_varargs_alias_set ());
10462 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10463 mem, gr_saved);
10465 if (vr_saved > 0)
10467 /* We can't use move_block_from_reg, because it will use
10468 the wrong mode, storing D regs only. */
10469 machine_mode mode = TImode;
10470 int off, i, vr_start;
10472 /* Set OFF to the offset from virtual_incoming_args_rtx of
10473 the first vector register. The VR save area lies below
10474 the GR one, and is aligned to 16 bytes. */
10475 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10476 STACK_BOUNDARY / BITS_PER_UNIT);
10477 off -= vr_saved * UNITS_PER_VREG;
10479 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10480 for (i = 0; i < vr_saved; ++i)
10482 rtx ptr, mem;
10484 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10485 mem = gen_frame_mem (mode, ptr);
10486 set_mem_alias_set (mem, get_varargs_alias_set ());
10487 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10488 off += UNITS_PER_VREG;
10493 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10494 any complication of having crtl->args.pretend_args_size changed. */
10495 cfun->machine->frame.saved_varargs_size
10496 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10497 STACK_BOUNDARY / BITS_PER_UNIT)
10498 + vr_saved * UNITS_PER_VREG);
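/* Rough picture of the save area laid out above (illustrative only; "VIA"
   stands for virtual_incoming_args_rtx and the areas may be trimmed by the
   tree-stdarg analysis):

       VIA ............................. anonymous args passed on the stack
       VIA - gr_saved*8 ................ GR save area (X registers), top at VIA
       VIA - ROUND_UP (gr_saved*8, 16)
           - vr_saved*16 ............... VR save area (Q registers), 16-byte
                                         aligned, ending at the rounded GR base

   The __gr_top/__vr_top fields set up in aarch64_expand_builtin_va_start
   point at the top of the corresponding area.  */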
10501 static void
10502 aarch64_conditional_register_usage (void)
10504 int i;
10505 if (!TARGET_FLOAT)
10507 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10509 fixed_regs[i] = 1;
10510 call_used_regs[i] = 1;
10515 /* Walk down the type tree of TYPE counting consecutive base elements.
10516 If *MODEP is VOIDmode, then set it to the first valid floating point
10517 type. If a non-floating point type is found, or if a floating point
10518 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10519 otherwise return the count in the sub-tree. */
10520 static int
10521 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10523 machine_mode mode;
10524 HOST_WIDE_INT size;
10526 switch (TREE_CODE (type))
10528 case REAL_TYPE:
10529 mode = TYPE_MODE (type);
10530 if (mode != DFmode && mode != SFmode
10531 && mode != TFmode && mode != HFmode)
10532 return -1;
10534 if (*modep == VOIDmode)
10535 *modep = mode;
10537 if (*modep == mode)
10538 return 1;
10540 break;
10542 case COMPLEX_TYPE:
10543 mode = TYPE_MODE (TREE_TYPE (type));
10544 if (mode != DFmode && mode != SFmode
10545 && mode != TFmode && mode != HFmode)
10546 return -1;
10548 if (*modep == VOIDmode)
10549 *modep = mode;
10551 if (*modep == mode)
10552 return 2;
10554 break;
10556 case VECTOR_TYPE:
10557 /* Use V2SImode and V4SImode as representatives of all 64-bit
10558 and 128-bit vector types. */
10559 size = int_size_in_bytes (type);
10560 switch (size)
10562 case 8:
10563 mode = V2SImode;
10564 break;
10565 case 16:
10566 mode = V4SImode;
10567 break;
10568 default:
10569 return -1;
10572 if (*modep == VOIDmode)
10573 *modep = mode;
10575 /* Vector modes are considered to be opaque: two vectors are
10576 equivalent for the purposes of being homogeneous aggregates
10577 if they are the same size. */
10578 if (*modep == mode)
10579 return 1;
10581 break;
10583 case ARRAY_TYPE:
10585 int count;
10586 tree index = TYPE_DOMAIN (type);
10588 /* Can't handle incomplete types or sizes that are not
10589 fixed. */
10590 if (!COMPLETE_TYPE_P (type)
10591 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10592 return -1;
10594 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10595 if (count == -1
10596 || !index
10597 || !TYPE_MAX_VALUE (index)
10598 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10599 || !TYPE_MIN_VALUE (index)
10600 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10601 || count < 0)
10602 return -1;
10604 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10605 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10607 /* There must be no padding. */
10608 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10609 return -1;
10611 return count;
10614 case RECORD_TYPE:
10616 int count = 0;
10617 int sub_count;
10618 tree field;
10620 /* Can't handle incomplete types or sizes that are not
10621 fixed. */
10622 if (!COMPLETE_TYPE_P (type)
10623 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10624 return -1;
10626 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10628 if (TREE_CODE (field) != FIELD_DECL)
10629 continue;
10631 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10632 if (sub_count < 0)
10633 return -1;
10634 count += sub_count;
10637 /* There must be no padding. */
10638 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10639 return -1;
10641 return count;
10644 case UNION_TYPE:
10645 case QUAL_UNION_TYPE:
10647 /* These aren't very interesting except in a degenerate case. */
10648 int count = 0;
10649 int sub_count;
10650 tree field;
10652 /* Can't handle incomplete types or sizes that are not
10653 fixed. */
10654 if (!COMPLETE_TYPE_P (type)
10655 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10656 return -1;
10658 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10660 if (TREE_CODE (field) != FIELD_DECL)
10661 continue;
10663 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10664 if (sub_count < 0)
10665 return -1;
10666 count = count > sub_count ? count : sub_count;
10669 /* There must be no padding. */
10670 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10671 return -1;
10673 return count;
10676 default:
10677 break;
10680 return -1;
10683 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10684 type as described in AAPCS64 \S 4.1.2.
10686 See the comment above aarch64_composite_type_p for the notes on MODE. */
10688 static bool
10689 aarch64_short_vector_p (const_tree type,
10690 machine_mode mode)
10692 HOST_WIDE_INT size = -1;
10694 if (type && TREE_CODE (type) == VECTOR_TYPE)
10695 size = int_size_in_bytes (type);
10696 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10697 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10698 size = GET_MODE_SIZE (mode);
10700 return (size == 8 || size == 16);
10703 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10704 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10705 array types. The C99 floating-point complex types are also considered
10706 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10707 types, which are GCC extensions and out of the scope of AAPCS64, are
10708 treated as composite types here as well.
10710 Note that MODE itself is not sufficient in determining whether a type
10711 is such a composite type or not. This is because
10712 stor-layout.c:compute_record_mode may have already changed the MODE
10713 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10714 structure with only one field may have its MODE set to the mode of the
10715 field. Also an integer mode whose size matches the size of the
10716 RECORD_TYPE type may be used in place of the original mode
10717 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10718 solely relied on. */
10720 static bool
10721 aarch64_composite_type_p (const_tree type,
10722 machine_mode mode)
10724 if (aarch64_short_vector_p (type, mode))
10725 return false;
10727 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10728 return true;
10730 if (mode == BLKmode
10731 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10732 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10733 return true;
10735 return false;
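/* Informal examples (illustrative only): "struct { float f; }" may be given
   SFmode by stor-layout but is still composite; "_Complex double" is
   composite via the MODE_COMPLEX_FLOAT check; a plain "double" or an
   8/16-byte short vector is not.  */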
10738 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10739 shall be passed or returned in simd/fp register(s) (providing these
10740 parameter passing registers are available).
10742 Upon successful return, *COUNT returns the number of needed registers,
10743 *BASE_MODE returns the mode of the individual register and when IS_HA
10744 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10745 floating-point aggregate or a homogeneous short-vector aggregate. */
10747 static bool
10748 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10749 const_tree type,
10750 machine_mode *base_mode,
10751 int *count,
10752 bool *is_ha)
10754 machine_mode new_mode = VOIDmode;
10755 bool composite_p = aarch64_composite_type_p (type, mode);
10757 if (is_ha != NULL) *is_ha = false;
10759 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10760 || aarch64_short_vector_p (type, mode))
10762 *count = 1;
10763 new_mode = mode;
10765 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10767 if (is_ha != NULL) *is_ha = true;
10768 *count = 2;
10769 new_mode = GET_MODE_INNER (mode);
10771 else if (type && composite_p)
10773 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10775 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10777 if (is_ha != NULL) *is_ha = true;
10778 *count = ag_count;
10780 else
10781 return false;
10783 else
10784 return false;
10786 *base_mode = new_mode;
10787 return true;
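/* Informal examples of the classification (illustrative only):
     struct { float x, y, z; }     -> HFA, *count = 3, *base_mode = SFmode
     _Complex double               -> HFA, *count = 2, *base_mode = DFmode
     struct { float32x4_t a, b; }  -> HVA, *count = 2, *base_mode = V4SImode
                                      (vectors are matched by size only)
     struct { double d[5]; }       -> rejected, more than HA_MAX_NUM_FLDS
     struct { float f; int i; }    -> rejected, mixed base types  */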
10790 /* Implement TARGET_STRUCT_VALUE_RTX. */
10792 static rtx
10793 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10794 int incoming ATTRIBUTE_UNUSED)
10796 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10799 /* Implements target hook vector_mode_supported_p. */
10800 static bool
10801 aarch64_vector_mode_supported_p (machine_mode mode)
10803 if (TARGET_SIMD
10804 && (mode == V4SImode || mode == V8HImode
10805 || mode == V16QImode || mode == V2DImode
10806 || mode == V2SImode || mode == V4HImode
10807 || mode == V8QImode || mode == V2SFmode
10808 || mode == V4SFmode || mode == V2DFmode
10809 || mode == V4HFmode || mode == V8HFmode
10810 || mode == V1DFmode))
10811 return true;
10813 return false;
10816 /* Return appropriate SIMD container
10817 for MODE within a vector of WIDTH bits. */
10818 static machine_mode
10819 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10821 gcc_assert (width == 64 || width == 128);
10822 if (TARGET_SIMD)
10824 if (width == 128)
10825 switch (mode)
10827 case DFmode:
10828 return V2DFmode;
10829 case SFmode:
10830 return V4SFmode;
10831 case SImode:
10832 return V4SImode;
10833 case HImode:
10834 return V8HImode;
10835 case QImode:
10836 return V16QImode;
10837 case DImode:
10838 return V2DImode;
10839 default:
10840 break;
10842 else
10843 switch (mode)
10845 case SFmode:
10846 return V2SFmode;
10847 case SImode:
10848 return V2SImode;
10849 case HImode:
10850 return V4HImode;
10851 case QImode:
10852 return V8QImode;
10853 default:
10854 break;
10857 return word_mode;
10860 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10861 static machine_mode
10862 aarch64_preferred_simd_mode (machine_mode mode)
10864 return aarch64_simd_container_mode (mode, 128);
10867 /* Return the bitmask of possible vector sizes for the vectorizer
10868 to iterate over. */
10869 static unsigned int
10870 aarch64_autovectorize_vector_sizes (void)
10872 return (16 | 8);
10875 /* Implement TARGET_MANGLE_TYPE. */
10877 static const char *
10878 aarch64_mangle_type (const_tree type)
10880 /* The AArch64 ABI documents say that "__va_list" has to be
10881 mangled as if it is in the "std" namespace. */
10882 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10883 return "St9__va_list";
10885 /* Half-precision float. */
10886 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10887 return "Dh";
10889 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10890 builtin types. */
10891 if (TYPE_NAME (type) != NULL)
10892 return aarch64_mangle_builtin_type (type);
10894 /* Use the default mangling. */
10895 return NULL;
10899 /* Return true if the rtx_insn contains a MEM RTX somewhere
10900 in it. */
10902 static bool
10903 has_memory_op (rtx_insn *mem_insn)
10905 subrtx_iterator::array_type array;
10906 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10907 if (MEM_P (*iter))
10908 return true;
10910 return false;
10913 /* Find the first rtx_insn before insn that will generate an assembly
10914 instruction. */
10916 static rtx_insn *
10917 aarch64_prev_real_insn (rtx_insn *insn)
10919 if (!insn)
10920 return NULL;
10924 insn = prev_real_insn (insn);
10926 while (insn && recog_memoized (insn) < 0);
10928 return insn;
10931 static bool
10932 is_madd_op (enum attr_type t1)
10934 unsigned int i;
10935 /* A number of these may be AArch32 only. */
10936 enum attr_type mlatypes[] = {
10937 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10938 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10939 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10942 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10944 if (t1 == mlatypes[i])
10945 return true;
10948 return false;
10951 /* Check if there is a register dependency between a load and the insn
10952 for which we hold recog_data. */
10954 static bool
10955 dep_between_memop_and_curr (rtx memop)
10957 rtx load_reg;
10958 int opno;
10960 gcc_assert (GET_CODE (memop) == SET);
10962 if (!REG_P (SET_DEST (memop)))
10963 return false;
10965 load_reg = SET_DEST (memop);
10966 for (opno = 1; opno < recog_data.n_operands; opno++)
10968 rtx operand = recog_data.operand[opno];
10969 if (REG_P (operand)
10970 && reg_overlap_mentioned_p (load_reg, operand))
10971 return true;
10974 return false;
10978 /* When working around the Cortex-A53 erratum 835769,
10979 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10980 instruction and has a preceding memory instruction such that a NOP
10981 should be inserted between them. */
10983 bool
10984 aarch64_madd_needs_nop (rtx_insn* insn)
10986 enum attr_type attr_type;
10987 rtx_insn *prev;
10988 rtx body;
10990 if (!TARGET_FIX_ERR_A53_835769)
10991 return false;
10993 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10994 return false;
10996 attr_type = get_attr_type (insn);
10997 if (!is_madd_op (attr_type))
10998 return false;
11000 prev = aarch64_prev_real_insn (insn);
11001 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11002 Restore recog state to INSN to avoid state corruption. */
11003 extract_constrain_insn_cached (insn);
11005 if (!prev || !has_memory_op (prev))
11006 return false;
11008 body = single_set (prev);
11010 /* If the previous insn is a memory op and there is no dependency between
11011 it and the DImode madd, emit a NOP between them. If body is NULL then we
11012 have a complex memory operation, probably a load/store pair.
11013 Be conservative for now and emit a NOP. */
11014 if (GET_MODE (recog_data.operand[0]) == DImode
11015 && (!body || !dep_between_memop_and_curr (body)))
11016 return true;
11018 return false;
11023 /* Implement FINAL_PRESCAN_INSN. */
11025 void
11026 aarch64_final_prescan_insn (rtx_insn *insn)
11028 if (aarch64_madd_needs_nop (insn))
11029 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
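/* Illustrative example of the workaround (assembly approximate): with
   -mfix-cortex-a53-835769, a sequence such as

       ldr   x1, [x2]
       madd  x0, x3, x4, x0

   is emitted with a NOP between the two instructions, since a 64-bit
   multiply-accumulate immediately following a memory access can trigger
   erratum 835769 on affected Cortex-A53 parts.  */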
11033 /* Return the equivalent letter for size. */
11034 static char
11035 sizetochar (int size)
11037 switch (size)
11039 case 64: return 'd';
11040 case 32: return 's';
11041 case 16: return 'h';
11042 case 8 : return 'b';
11043 default: gcc_unreachable ();
11047 /* Return true iff x is a uniform vector of floating-point
11048 constants, and the constant can be represented in
11049 quarter-precision form.  Note that since aarch64_float_const_representable_p
11050 rejects both +0.0 and -0.0, this function rejects them as well. */
11051 static bool
11052 aarch64_vect_float_const_representable_p (rtx x)
11054 rtx elt;
11055 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11056 && const_vec_duplicate_p (x, &elt)
11057 && aarch64_float_const_representable_p (elt));
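/* Informally, the representable constants are those of the form
   (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and -3 <= r <= 4, i.e. the
   values encodable in the 8-bit AdvSIMD/FP immediate field.  For example
   1.0, 0.5, 2.5 and 31.0 qualify, while 0.1, 3.3 and 0.0 do not (zero is
   handled separately, e.g. via aarch64_simd_imm_zero_p in the caller).  */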
11060 /* Return true for valid and false for invalid. */
11061 bool
11062 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11063 struct simd_immediate_info *info)
11065 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11066 matches = 1; \
11067 for (i = 0; i < idx; i += (STRIDE)) \
11068 if (!(TEST)) \
11069 matches = 0; \
11070 if (matches) \
11072 immtype = (CLASS); \
11073 elsize = (ELSIZE); \
11074 eshift = (SHIFT); \
11075 emvn = (NEG); \
11076 break; \
11079 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11080 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11081 unsigned char bytes[16];
11082 int immtype = -1, matches;
11083 unsigned int invmask = inverse ? 0xff : 0;
11084 int eshift, emvn;
11086 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11088 if (! (aarch64_simd_imm_zero_p (op, mode)
11089 || aarch64_vect_float_const_representable_p (op)))
11090 return false;
11092 if (info)
11094 info->value = CONST_VECTOR_ELT (op, 0);
11095 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11096 info->mvn = false;
11097 info->shift = 0;
11100 return true;
11103 /* Splat vector constant out into a byte vector. */
11104 for (i = 0; i < n_elts; i++)
11106 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11107 it must be laid out in the vector register in reverse order. */
11108 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11109 unsigned HOST_WIDE_INT elpart;
11111 gcc_assert (CONST_INT_P (el));
11112 elpart = INTVAL (el);
11114 for (unsigned int byte = 0; byte < innersize; byte++)
11116 bytes[idx++] = (elpart & 0xff) ^ invmask;
11117 elpart >>= BITS_PER_UNIT;
11122 /* Sanity check. */
11123 gcc_assert (idx == GET_MODE_SIZE (mode));
11127 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11128 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11130 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11131 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11133 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11134 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11136 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11137 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11139 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11141 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11143 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11144 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11146 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11147 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11149 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11150 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11152 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11153 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11155 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11157 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11159 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11160 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11162 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11163 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11165 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11166 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11168 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11169 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11171 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11173 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11174 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11176 while (0);
11178 if (immtype == -1)
11179 return false;
11181 if (info)
11183 info->element_width = elsize;
11184 info->mvn = emvn != 0;
11185 info->shift = eshift;
11187 unsigned HOST_WIDE_INT imm = 0;
11189 if (immtype >= 12 && immtype <= 15)
11190 info->msl = true;
11192 /* Un-invert bytes of recognized vector, if necessary. */
11193 if (invmask != 0)
11194 for (i = 0; i < idx; i++)
11195 bytes[i] ^= invmask;
11197 if (immtype == 17)
11199 /* FIXME: Broken on 32-bit H_W_I hosts. */
11200 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11202 for (i = 0; i < 8; i++)
11203 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11204 << (i * BITS_PER_UNIT);
11207 info->value = GEN_INT (imm);
11209 else
11211 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11212 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11214 /* Construct 'abcdefgh' because the assembler cannot handle
11215 generic constants. */
11216 if (info->mvn)
11217 imm = ~imm;
11218 imm = (imm >> info->shift) & 0xff;
11219 info->value = GEN_INT (imm);
11223 return true;
11224 #undef CHECK
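/* A few informal examples of immediates accepted above (illustrative only):
   a V4SImode duplicate of 0x00ab0000 matches the shifted-byte pattern
   (value 0xab, shift 16, emitted as MOVI); a duplicate of 0xffffff73
   matches the inverted form (MVNI of 0x8c); and a V2DImode constant whose
   bytes are all either 0x00 or 0xff matches the 64-bit "abcdefgh" case.  */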
11227 /* Check whether immediate shift constants are within range. */
11228 bool
11229 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11231 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11232 if (left)
11233 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11234 else
11235 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11238 /* Return true if X is a uniform vector where all elements
11239 are either the floating-point constant 0.0 or the
11240 integer constant 0. */
11241 bool
11242 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11244 return x == CONST0_RTX (mode);
11248 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11249 operation of width WIDTH at bit position POS. */
11252 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11254 gcc_assert (CONST_INT_P (width));
11255 gcc_assert (CONST_INT_P (pos));
11257 unsigned HOST_WIDE_INT mask
11258 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11259 return GEN_INT (mask << UINTVAL (pos));
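/* For example, WIDTH = 8 and POS = 16 give
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. a mask selecting bits 16..23.  */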
11262 bool
11263 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11265 HOST_WIDE_INT imm = INTVAL (x);
11266 int i;
11268 for (i = 0; i < 8; i++)
11270 unsigned int byte = imm & 0xff;
11271 if (byte != 0xff && byte != 0)
11272 return false;
11273 imm >>= 8;
11276 return true;
11279 bool
11280 aarch64_mov_operand_p (rtx x, machine_mode mode)
11282 if (GET_CODE (x) == HIGH
11283 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11284 return true;
11286 if (CONST_INT_P (x))
11287 return true;
11289 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11290 return true;
11292 return aarch64_classify_symbolic_expression (x)
11293 == SYMBOL_TINY_ABSOLUTE;
11296 /* Return a const_int vector of VAL. */
11298 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11300 int nunits = GET_MODE_NUNITS (mode);
11301 rtvec v = rtvec_alloc (nunits);
11302 int i;
11304 rtx cache = GEN_INT (val);
11306 for (i=0; i < nunits; i++)
11307 RTVEC_ELT (v, i) = cache;
11309 return gen_rtx_CONST_VECTOR (mode, v);
11312 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11314 bool
11315 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11317 machine_mode vmode;
11319 gcc_assert (!VECTOR_MODE_P (mode));
11320 vmode = aarch64_preferred_simd_mode (mode);
11321 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11322 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11325 /* Construct and return a PARALLEL RTX vector with elements numbering the
11326 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11327 the vector - from the perspective of the architecture. This does not
11328 line up with GCC's perspective on lane numbers, so we end up with
11329 different masks depending on our target endianness.  The diagram
11330 below may help. We must draw the distinction when building masks
11331 which select one half of the vector. An instruction selecting
11332 architectural low-lanes for a big-endian target must be described using
11333 a mask selecting GCC high-lanes.
11335 Big-Endian Little-Endian
11337 GCC 0 1 2 3 3 2 1 0
11338 | x | x | x | x | | x | x | x | x |
11339 Architecture 3 2 1 0 3 2 1 0
11341 Low Mask: { 2, 3 } { 0, 1 }
11342 High Mask: { 0, 1 } { 2, 3 }
11346 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11348 int nunits = GET_MODE_NUNITS (mode);
11349 rtvec v = rtvec_alloc (nunits / 2);
11350 int high_base = nunits / 2;
11351 int low_base = 0;
11352 int base;
11353 rtx t1;
11354 int i;
11356 if (BYTES_BIG_ENDIAN)
11357 base = high ? low_base : high_base;
11358 else
11359 base = high ? high_base : low_base;
11361 for (i = 0; i < nunits / 2; i++)
11362 RTVEC_ELT (v, i) = GEN_INT (base + i);
11364 t1 = gen_rtx_PARALLEL (mode, v);
11365 return t1;
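/* For example, for V4SImode this yields (illustrative):
     little-endian:  HIGH -> (parallel [2 3]),  LOW -> (parallel [0 1])
     big-endian:     HIGH -> (parallel [0 1]),  LOW -> (parallel [2 3])
   which matches the diagram above.  */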
11368 /* Check OP for validity as a PARALLEL RTX vector with elements
11369 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11370 from the perspective of the architecture. See the diagram above
11371 aarch64_simd_vect_par_cnst_half for more details. */
11373 bool
11374 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11375 bool high)
11377 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11378 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11379 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11380 int i = 0;
11382 if (!VECTOR_MODE_P (mode))
11383 return false;
11385 if (count_op != count_ideal)
11386 return false;
11388 for (i = 0; i < count_ideal; i++)
11390 rtx elt_op = XVECEXP (op, 0, i);
11391 rtx elt_ideal = XVECEXP (ideal, 0, i);
11393 if (!CONST_INT_P (elt_op)
11394 || INTVAL (elt_ideal) != INTVAL (elt_op))
11395 return false;
11397 return true;
11400 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11401 HIGH (exclusive). */
11402 void
11403 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11404 const_tree exp)
11406 HOST_WIDE_INT lane;
11407 gcc_assert (CONST_INT_P (operand));
11408 lane = INTVAL (operand);
11410 if (lane < low || lane >= high)
11412 if (exp)
11413 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11414 else
11415 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11419 /* Return TRUE if OP is a valid vector addressing mode. */
11420 bool
11421 aarch64_simd_mem_operand_p (rtx op)
11423 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11424 || REG_P (XEXP (op, 0)));
11427 /* Emit a register copy from operand to operand, taking care not to
11428 early-clobber source registers in the process.
11430 COUNT is the number of components into which the copy needs to be
11431 decomposed. */
11432 void
11433 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11434 unsigned int count)
11436 unsigned int i;
11437 int rdest = REGNO (operands[0]);
11438 int rsrc = REGNO (operands[1]);
11440 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11441 || rdest < rsrc)
11442 for (i = 0; i < count; i++)
11443 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11444 gen_rtx_REG (mode, rsrc + i));
11445 else
11446 for (i = 0; i < count; i++)
11447 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11448 gen_rtx_REG (mode, rsrc + count - i - 1));
11451 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11452 one of the VSTRUCT modes: OI, CI, or XI. */
11454 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11456 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11459 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11460 alignment of a vector to 128 bits. */
11461 static HOST_WIDE_INT
11462 aarch64_simd_vector_alignment (const_tree type)
11464 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11465 return MIN (align, 128);
11468 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11469 static bool
11470 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11472 if (is_packed)
11473 return false;
11475 /* We guarantee alignment for vectors up to 128-bits. */
11476 if (tree_int_cst_compare (TYPE_SIZE (type),
11477 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11478 return false;
11480 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11481 return true;
11484 /* Return true if the vector misalignment factor is supported by the
11485 target. */
11486 static bool
11487 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11488 const_tree type, int misalignment,
11489 bool is_packed)
11491 if (TARGET_SIMD && STRICT_ALIGNMENT)
11493 /* Return if movmisalign pattern is not supported for this mode. */
11494 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11495 return false;
11497 if (misalignment == -1)
11499 /* Misalignment factor is unknown at compile time but we know
11500 it's word aligned. */
11501 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11503 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11505 if (element_size != 64)
11506 return true;
11508 return false;
11511 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11512 is_packed);
11515 /* If VALS is a vector constant that can be loaded into a register
11516 using DUP, generate instructions to do so and return an RTX to
11517 assign to the register. Otherwise return NULL_RTX. */
11518 static rtx
11519 aarch64_simd_dup_constant (rtx vals)
11521 machine_mode mode = GET_MODE (vals);
11522 machine_mode inner_mode = GET_MODE_INNER (mode);
11523 rtx x;
11525 if (!const_vec_duplicate_p (vals, &x))
11526 return NULL_RTX;
11528 /* We can load this constant by using DUP and a constant in a
11529 single ARM register. This will be cheaper than a vector
11530 load. */
11531 x = copy_to_mode_reg (inner_mode, x);
11532 return gen_rtx_VEC_DUPLICATE (mode, x);
11536 /* Generate code to load VALS, which is a PARALLEL containing only
11537 constants (for vec_init) or CONST_VECTOR, efficiently into a
11538 register. Returns an RTX to copy into the register, or NULL_RTX
11539 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11540 static rtx
11541 aarch64_simd_make_constant (rtx vals)
11543 machine_mode mode = GET_MODE (vals);
11544 rtx const_dup;
11545 rtx const_vec = NULL_RTX;
11546 int n_elts = GET_MODE_NUNITS (mode);
11547 int n_const = 0;
11548 int i;
11550 if (GET_CODE (vals) == CONST_VECTOR)
11551 const_vec = vals;
11552 else if (GET_CODE (vals) == PARALLEL)
11554 /* A CONST_VECTOR must contain only CONST_INTs and
11555 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11556 Only store valid constants in a CONST_VECTOR. */
11557 for (i = 0; i < n_elts; ++i)
11559 rtx x = XVECEXP (vals, 0, i);
11560 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11561 n_const++;
11563 if (n_const == n_elts)
11564 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11566 else
11567 gcc_unreachable ();
11569 if (const_vec != NULL_RTX
11570 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11571 /* Load using MOVI/MVNI. */
11572 return const_vec;
11573 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11574 /* Loaded using DUP. */
11575 return const_dup;
11576 else if (const_vec != NULL_RTX)
11577 /* Load from constant pool.  We cannot take advantage of single-cycle
11578 LD1 because we need a PC-relative addressing mode. */
11579 return const_vec;
11580 else
11581 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11582 We cannot construct an initializer. */
11583 return NULL_RTX;
11586 /* Expand a vector initialisation sequence, such that TARGET is
11587 initialised to contain VALS. */
11589 void
11590 aarch64_expand_vector_init (rtx target, rtx vals)
11592 machine_mode mode = GET_MODE (target);
11593 machine_mode inner_mode = GET_MODE_INNER (mode);
11594 /* The number of vector elements. */
11595 int n_elts = GET_MODE_NUNITS (mode);
11596 /* The number of vector elements which are not constant. */
11597 int n_var = 0;
11598 rtx any_const = NULL_RTX;
11599 /* The first element of vals. */
11600 rtx v0 = XVECEXP (vals, 0, 0);
11601 bool all_same = true;
11603 /* Count the number of variable elements to initialise. */
11604 for (int i = 0; i < n_elts; ++i)
11606 rtx x = XVECEXP (vals, 0, i);
11607 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11608 ++n_var;
11609 else
11610 any_const = x;
11612 all_same &= rtx_equal_p (x, v0);
11615 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11616 how best to handle this. */
11617 if (n_var == 0)
11619 rtx constant = aarch64_simd_make_constant (vals);
11620 if (constant != NULL_RTX)
11622 emit_move_insn (target, constant);
11623 return;
11627 /* Splat a single non-constant element if we can. */
11628 if (all_same)
11630 rtx x = copy_to_mode_reg (inner_mode, v0);
11631 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11632 return;
11635 /* Initialise a vector which is part-variable. We want to first try
11636 to build those lanes which are constant in the most efficient way we
11637 can. */
11638 if (n_var != n_elts)
11640 rtx copy = copy_rtx (vals);
11642 /* Load constant part of vector. We really don't care what goes into the
11643 parts we will overwrite, but we're more likely to be able to load the
11644 constant efficiently if it has fewer, larger, repeating parts
11645 (see aarch64_simd_valid_immediate). */
11646 for (int i = 0; i < n_elts; i++)
11648 rtx x = XVECEXP (vals, 0, i);
11649 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11650 continue;
11651 rtx subst = any_const;
11652 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11654 /* Look in the copied vector, as more elements are const. */
11655 rtx test = XVECEXP (copy, 0, i ^ bit);
11656 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11658 subst = test;
11659 break;
11662 XVECEXP (copy, 0, i) = subst;
11664 aarch64_expand_vector_init (target, copy);
11667 /* Insert the variable lanes directly. */
11669 enum insn_code icode = optab_handler (vec_set_optab, mode);
11670 gcc_assert (icode != CODE_FOR_nothing);
11672 for (int i = 0; i < n_elts; i++)
11674 rtx x = XVECEXP (vals, 0, i);
11675 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11676 continue;
11677 x = copy_to_mode_reg (inner_mode, x);
11678 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
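/* Illustrative example (approximate): initialising a V4SImode vector
   { x, 1, 2, 3 }, where only x is non-constant, first loads the constant
   { 2, 1, 2, 3 } -- the variable lane is temporarily filled from a nearby
   constant lane so the constant part stays cheap to materialise -- and
   then inserts x into lane 0 through the vec_set pattern (an INS).  */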
11682 static unsigned HOST_WIDE_INT
11683 aarch64_shift_truncation_mask (machine_mode mode)
11685 return
11686 (!SHIFT_COUNT_TRUNCATED
11687 || aarch64_vector_mode_supported_p (mode)
11688 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11691 /* Select a format to encode pointers in exception handling data. */
11693 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11695 int type;
11696 switch (aarch64_cmodel)
11698 case AARCH64_CMODEL_TINY:
11699 case AARCH64_CMODEL_TINY_PIC:
11700 case AARCH64_CMODEL_SMALL:
11701 case AARCH64_CMODEL_SMALL_PIC:
11702 case AARCH64_CMODEL_SMALL_SPIC:
11703 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11704 for everything. */
11705 type = DW_EH_PE_sdata4;
11706 break;
11707 default:
11708 /* No assumptions here. 8-byte relocs required. */
11709 type = DW_EH_PE_sdata8;
11710 break;
11712 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11715 /* The last .arch and .tune assembly strings that we printed. */
11716 static std::string aarch64_last_printed_arch_string;
11717 static std::string aarch64_last_printed_tune_string;
11719 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11720 by the function fndecl. */
11722 void
11723 aarch64_declare_function_name (FILE *stream, const char* name,
11724 tree fndecl)
11726 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11728 struct cl_target_option *targ_options;
11729 if (target_parts)
11730 targ_options = TREE_TARGET_OPTION (target_parts);
11731 else
11732 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11733 gcc_assert (targ_options);
11735 const struct processor *this_arch
11736 = aarch64_get_arch (targ_options->x_explicit_arch);
11738 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11739 std::string extension
11740 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11741 this_arch->flags);
11742 /* Only update the assembler .arch string if it is distinct from the last
11743 such string we printed. */
11744 std::string to_print = this_arch->name + extension;
11745 if (to_print != aarch64_last_printed_arch_string)
11747 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11748 aarch64_last_printed_arch_string = to_print;
11751 /* Print the cpu name we're tuning for in the comments; it might be
11752 useful to readers of the generated asm.  Do it only when it changes
11753 from function to function and verbose assembly is requested. */
11754 const struct processor *this_tune
11755 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11757 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11759 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11760 this_tune->name);
11761 aarch64_last_printed_tune_string = this_tune->name;
11764 /* Don't forget the type directive for ELF. */
11765 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11766 ASM_OUTPUT_LABEL (stream, name);
11769 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11771 static void
11772 aarch64_start_file (void)
11774 struct cl_target_option *default_options
11775 = TREE_TARGET_OPTION (target_option_default_node);
11777 const struct processor *default_arch
11778 = aarch64_get_arch (default_options->x_explicit_arch);
11779 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11780 std::string extension
11781 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11782 default_arch->flags);
11784 aarch64_last_printed_arch_string = default_arch->name + extension;
11785 aarch64_last_printed_tune_string = "";
11786 asm_fprintf (asm_out_file, "\t.arch %s\n",
11787 aarch64_last_printed_arch_string.c_str ());
11789 default_file_start ();
11792 /* Emit load exclusive. */
11794 static void
11795 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11796 rtx mem, rtx model_rtx)
11798 rtx (*gen) (rtx, rtx, rtx);
11800 switch (mode)
11802 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11803 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11804 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11805 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11806 default:
11807 gcc_unreachable ();
11810 emit_insn (gen (rval, mem, model_rtx));
11813 /* Emit store exclusive. */
11815 static void
11816 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11817 rtx rval, rtx mem, rtx model_rtx)
11819 rtx (*gen) (rtx, rtx, rtx, rtx);
11821 switch (mode)
11823 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11824 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11825 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11826 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11827 default:
11828 gcc_unreachable ();
11831 emit_insn (gen (bval, rval, mem, model_rtx));
11834 /* Emit the jump pattern INSN and mark it as very unlikely to be taken. */
11836 static void
11837 aarch64_emit_unlikely_jump (rtx insn)
11839 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11841 rtx_insn *jump = emit_jump_insn (insn);
11842 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11845 /* Expand a compare and swap pattern. */
11847 void
11848 aarch64_expand_compare_and_swap (rtx operands[])
11850 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11851 machine_mode mode, cmp_mode;
11852 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11853 int idx;
11854 gen_cas_fn gen;
11855 const gen_cas_fn split_cas[] =
11857 gen_aarch64_compare_and_swapqi,
11858 gen_aarch64_compare_and_swaphi,
11859 gen_aarch64_compare_and_swapsi,
11860 gen_aarch64_compare_and_swapdi
11862 const gen_cas_fn atomic_cas[] =
11864 gen_aarch64_compare_and_swapqi_lse,
11865 gen_aarch64_compare_and_swaphi_lse,
11866 gen_aarch64_compare_and_swapsi_lse,
11867 gen_aarch64_compare_and_swapdi_lse
11870 bval = operands[0];
11871 rval = operands[1];
11872 mem = operands[2];
11873 oldval = operands[3];
11874 newval = operands[4];
11875 is_weak = operands[5];
11876 mod_s = operands[6];
11877 mod_f = operands[7];
11878 mode = GET_MODE (mem);
11879 cmp_mode = mode;
11881 /* Normally the succ memory model must be stronger than fail, but in the
11882 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11883 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11885 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11886 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11887 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11889 switch (mode)
11891 case QImode:
11892 case HImode:
11893 /* For short modes, we're going to perform the comparison in SImode,
11894 so do the zero-extension now. */
11895 cmp_mode = SImode;
11896 rval = gen_reg_rtx (SImode);
11897 oldval = convert_modes (SImode, mode, oldval, true);
11898 /* Fall through. */
11900 case SImode:
11901 case DImode:
11902 /* Force the value into a register if needed. */
11903 if (!aarch64_plus_operand (oldval, mode))
11904 oldval = force_reg (cmp_mode, oldval);
11905 break;
11907 default:
11908 gcc_unreachable ();
11911 switch (mode)
11913 case QImode: idx = 0; break;
11914 case HImode: idx = 1; break;
11915 case SImode: idx = 2; break;
11916 case DImode: idx = 3; break;
11917 default:
11918 gcc_unreachable ();
11920 if (TARGET_LSE)
11921 gen = atomic_cas[idx];
11922 else
11923 gen = split_cas[idx];
11925 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11927 if (mode == QImode || mode == HImode)
11928 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11930 x = gen_rtx_REG (CCmode, CC_REGNUM);
11931 x = gen_rtx_EQ (SImode, x, const0_rtx);
11932 emit_insn (gen_rtx_SET (bval, x));
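/* To sketch the two expansion strategies used above: with TARGET_LSE the
   selected pattern should collapse into a single CAS-family instruction
   (e.g. CASAL under a SEQ_CST model), while the non-LSE pattern is kept as
   one insn and later split by aarch64_split_compare_and_swap below into a
   load-exclusive / store-exclusive retry loop.  */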
11935 /* Test whether the target supports using an atomic load-operate
11936 instruction for operation CODE.  Returns FALSE if the operation
11937 isn't supported by the architecture (in particular, if the LSE
11938 instructions are not available).  */
11941 bool
11942 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11944 if (!TARGET_LSE)
11945 return false;
11947 switch (code)
11949 case SET:
11950 case AND:
11951 case IOR:
11952 case XOR:
11953 case MINUS:
11954 case PLUS:
11955 return true;
11956 default:
11957 return false;
11961 /* Emit a barrier appropriate for memory model MODEL at the end of a
11962 sequence implementing an atomic operation. */
11964 static void
11965 aarch64_emit_post_barrier (enum memmodel model)
11967 const enum memmodel base_model = memmodel_base (model);
11969 if (is_mm_sync (model)
11970 && (base_model == MEMMODEL_ACQUIRE
11971 || base_model == MEMMODEL_ACQ_REL
11972 || base_model == MEMMODEL_SEQ_CST))
11974 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11978 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11979 for the data in memory. EXPECTED is the value expected to be in memory.
11980 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11981 is the memory ordering to use. */
11983 void
11984 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11985 rtx expected, rtx desired,
11986 rtx model)
11988 rtx (*gen) (rtx, rtx, rtx, rtx);
11989 machine_mode mode;
11991 mode = GET_MODE (mem);
11993 switch (mode)
11995 case QImode: gen = gen_aarch64_atomic_casqi; break;
11996 case HImode: gen = gen_aarch64_atomic_cashi; break;
11997 case SImode: gen = gen_aarch64_atomic_cassi; break;
11998 case DImode: gen = gen_aarch64_atomic_casdi; break;
11999 default:
12000 gcc_unreachable ();
12003 /* Move the expected value into the CAS destination register. */
12004 emit_insn (gen_rtx_SET (rval, expected));
12006 /* Emit the CAS. */
12007 emit_insn (gen (rval, mem, desired, model));
12009 /* Compare the expected value with the value loaded by the CAS, to establish
12010 whether the swap was made. */
12011 aarch64_gen_compare_reg (EQ, rval, expected);
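/* As an illustration, for SImode with an acquire-release model the sequence
   built above amounts to roughly
       mov   w_rval, w_expected
       casal w_rval, w_desired, [x_mem]
       cmp   w_rval, w_expected
   leaving the condition flags set for the caller.  */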
12014 /* Split a compare and swap pattern. */
12016 void
12017 aarch64_split_compare_and_swap (rtx operands[])
12019 rtx rval, mem, oldval, newval, scratch;
12020 machine_mode mode;
12021 bool is_weak;
12022 rtx_code_label *label1, *label2;
12023 rtx x, cond;
12024 enum memmodel model;
12025 rtx model_rtx;
12027 rval = operands[0];
12028 mem = operands[1];
12029 oldval = operands[2];
12030 newval = operands[3];
12031 is_weak = (operands[4] != const0_rtx);
12032 model_rtx = operands[5];
12033 scratch = operands[7];
12034 mode = GET_MODE (mem);
12035 model = memmodel_from_int (INTVAL (model_rtx));
12037 label1 = NULL;
12038 if (!is_weak)
12040 label1 = gen_label_rtx ();
12041 emit_label (label1);
12043 label2 = gen_label_rtx ();
12045 /* The initial load can be relaxed for a __sync operation since a final
12046 barrier will be emitted to stop code hoisting. */
12047 if (is_mm_sync (model))
12048 aarch64_emit_load_exclusive (mode, rval, mem,
12049 GEN_INT (MEMMODEL_RELAXED));
12050 else
12051 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12053 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12054 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12055 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12056 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12057 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12059 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12061 if (!is_weak)
12063 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12064 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12065 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12066 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12068 else
12070 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12071 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12072 emit_insn (gen_rtx_SET (cond, x));
12075 emit_label (label2);
12077 /* Emit any final barrier needed for a __sync operation. */
12078 if (is_mm_sync (model))
12079 aarch64_emit_post_barrier (model);
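/* The strong (!is_weak) form of the loop built above looks roughly like
       .Lretry:
         ldaxr  w_rval, [x_mem]
         cmp    w_rval, w_oldval
         bne    .Ldone
         stlxr  w_tmp, w_newval, [x_mem]
         cbnz   w_tmp, .Lretry
       .Ldone:
   with the exact load/store-exclusive variants chosen from the model.  */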
12082 /* Emit a BIC instruction. */
12084 static void
12085 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12087 rtx shift_rtx = GEN_INT (shift);
12088 rtx (*gen) (rtx, rtx, rtx, rtx);
12090 switch (mode)
12092 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12093 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12094 default:
12095 gcc_unreachable ();
12098 emit_insn (gen (dst, s2, shift_rtx, s1));
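/* I.e. this computes DST = S1 & ~(S2 >> SHIFT), which is expected to match
   a single BIC (shifted register) instruction.  */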
12101 /* Emit an atomic swap. */
12103 static void
12104 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12105 rtx mem, rtx model)
12107 rtx (*gen) (rtx, rtx, rtx, rtx);
12109 switch (mode)
12111 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12112 case HImode: gen = gen_aarch64_atomic_swphi; break;
12113 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12114 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12115 default:
12116 gcc_unreachable ();
12119 emit_insn (gen (dst, mem, value, model));
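/* With LSE this should become a single SWP-family instruction, e.g.
       swpal  w_value, w_dst, [x_mem]
   for a SImode sequentially consistent exchange.  */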
12122 /* Operations supported by aarch64_emit_atomic_load_op. */
12124 enum aarch64_atomic_load_op_code
12126 AARCH64_LDOP_PLUS, /* A + B */
12127 AARCH64_LDOP_XOR, /* A ^ B */
12128 AARCH64_LDOP_OR, /* A | B */
12129 AARCH64_LDOP_BIC /* A & ~B */
12132 /* Emit an atomic load-operate. */
12134 static void
12135 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12136 machine_mode mode, rtx dst, rtx src,
12137 rtx mem, rtx model)
12139 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12140 const aarch64_atomic_load_op_fn plus[] =
12142 gen_aarch64_atomic_loadaddqi,
12143 gen_aarch64_atomic_loadaddhi,
12144 gen_aarch64_atomic_loadaddsi,
12145 gen_aarch64_atomic_loadadddi
12147 const aarch64_atomic_load_op_fn eor[] =
12149 gen_aarch64_atomic_loadeorqi,
12150 gen_aarch64_atomic_loadeorhi,
12151 gen_aarch64_atomic_loadeorsi,
12152 gen_aarch64_atomic_loadeordi
12154 const aarch64_atomic_load_op_fn ior[] =
12156 gen_aarch64_atomic_loadsetqi,
12157 gen_aarch64_atomic_loadsethi,
12158 gen_aarch64_atomic_loadsetsi,
12159 gen_aarch64_atomic_loadsetdi
12161 const aarch64_atomic_load_op_fn bic[] =
12163 gen_aarch64_atomic_loadclrqi,
12164 gen_aarch64_atomic_loadclrhi,
12165 gen_aarch64_atomic_loadclrsi,
12166 gen_aarch64_atomic_loadclrdi
12168 aarch64_atomic_load_op_fn gen;
12169 int idx = 0;
12171 switch (mode)
12173 case QImode: idx = 0; break;
12174 case HImode: idx = 1; break;
12175 case SImode: idx = 2; break;
12176 case DImode: idx = 3; break;
12177 default:
12178 gcc_unreachable ();
12181 switch (code)
12183 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12184 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12185 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12186 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12187 default:
12188 gcc_unreachable ();
12191 emit_insn (gen (dst, mem, src, model));
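/* The four tables above correspond directly to the LSE LDADD, LDEOR, LDSET
   and LDCLR instruction families; the memory model then selects between the
   plain, A (acquire), L (release) and AL (acquire-release) forms, e.g.
   LDADD vs. LDADDAL.  */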
12194 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12195 location to store the data read from memory. OUT_RESULT is the location to
12196 store the result of the operation. MEM is the memory location to read and
12197 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12198 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12199 be NULL. */
12201 void
12202 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12203 rtx mem, rtx value, rtx model_rtx)
12205 machine_mode mode = GET_MODE (mem);
12206 machine_mode wmode = (mode == DImode ? DImode : SImode);
12207 const bool short_mode = (mode < SImode);
12208 aarch64_atomic_load_op_code ldop_code;
12209 rtx src;
12210 rtx x;
12212 if (out_data)
12213 out_data = gen_lowpart (mode, out_data);
12215 if (out_result)
12216 out_result = gen_lowpart (mode, out_result);
12218 /* Make sure the value is in a register, putting it into a destination
12219 register if it needs to be manipulated. */
12220 if (!register_operand (value, mode)
12221 || code == AND || code == MINUS)
12223 src = out_result ? out_result : out_data;
12224 emit_move_insn (src, gen_lowpart (mode, value));
12226 else
12227 src = value;
12228 gcc_assert (register_operand (src, mode));
12230 /* Preprocess the data for the operation as necessary. If the operation is
12231 a SET then emit a swap instruction and finish. */
12232 switch (code)
12234 case SET:
12235 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12236 return;
12238 case MINUS:
12239 /* Negate the value and treat it as a PLUS. */
12241 rtx neg_src;
12243 /* Resize the value if necessary. */
12244 if (short_mode)
12245 src = gen_lowpart (wmode, src);
12247 neg_src = gen_rtx_NEG (wmode, src);
12248 emit_insn (gen_rtx_SET (src, neg_src));
12250 if (short_mode)
12251 src = gen_lowpart (mode, src);
12253 /* Fall-through. */
12254 case PLUS:
12255 ldop_code = AARCH64_LDOP_PLUS;
12256 break;
12258 case IOR:
12259 ldop_code = AARCH64_LDOP_OR;
12260 break;
12262 case XOR:
12263 ldop_code = AARCH64_LDOP_XOR;
12264 break;
12266 case AND:
12268 rtx not_src;
12270 /* Resize the value if necessary. */
12271 if (short_mode)
12272 src = gen_lowpart (wmode, src);
12274 not_src = gen_rtx_NOT (wmode, src);
12275 emit_insn (gen_rtx_SET (src, not_src));
12277 if (short_mode)
12278 src = gen_lowpart (mode, src);
12280 ldop_code = AARCH64_LDOP_BIC;
12281 break;
12283 default:
12284 /* The operation can't be done with atomic instructions. */
12285 gcc_unreachable ();
12288 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12290 /* If necessary, calculate the data in memory after the update by redoing the
12291 operation from values in registers. */
12292 if (!out_result)
12293 return;
12295 if (short_mode)
12297 src = gen_lowpart (wmode, src);
12298 out_data = gen_lowpart (wmode, out_data);
12299 out_result = gen_lowpart (wmode, out_result);
12302 x = NULL_RTX;
12304 switch (code)
12306 case MINUS:
12307 case PLUS:
12308 x = gen_rtx_PLUS (wmode, out_data, src);
12309 break;
12310 case IOR:
12311 x = gen_rtx_IOR (wmode, out_data, src);
12312 break;
12313 case XOR:
12314 x = gen_rtx_XOR (wmode, out_data, src);
12315 break;
12316 case AND:
12317 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12318 return;
12319 default:
12320 gcc_unreachable ();
12323 emit_set_insn (out_result, x);
12325 return;
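/* A worked example: __atomic_fetch_and on a 32-bit value is handled above
   by inverting SRC (a single MVN), issuing LDCLR to obtain the old value,
   and, if the updated value is also wanted, recomputing it afterwards with
   the BIC helper, since LDCLR itself only returns the old contents.  */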
12328 /* Split an atomic operation. */
12330 void
12331 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12332 rtx value, rtx model_rtx, rtx cond)
12334 machine_mode mode = GET_MODE (mem);
12335 machine_mode wmode = (mode == DImode ? DImode : SImode);
12336 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12337 const bool is_sync = is_mm_sync (model);
12338 rtx_code_label *label;
12339 rtx x;
12341 /* Split the atomic operation into a sequence. */
12342 label = gen_label_rtx ();
12343 emit_label (label);
12345 if (new_out)
12346 new_out = gen_lowpart (wmode, new_out);
12347 if (old_out)
12348 old_out = gen_lowpart (wmode, old_out);
12349 else
12350 old_out = new_out;
12351 value = simplify_gen_subreg (wmode, value, mode, 0);
12353 /* The initial load can be relaxed for a __sync operation since a final
12354 barrier will be emitted to stop code hoisting. */
12355 if (is_sync)
12356 aarch64_emit_load_exclusive (mode, old_out, mem,
12357 GEN_INT (MEMMODEL_RELAXED));
12358 else
12359 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12361 switch (code)
12363 case SET:
12364 new_out = value;
12365 break;
12367 case NOT:
12368 x = gen_rtx_AND (wmode, old_out, value);
12369 emit_insn (gen_rtx_SET (new_out, x));
12370 x = gen_rtx_NOT (wmode, new_out);
12371 emit_insn (gen_rtx_SET (new_out, x));
12372 break;
12374 case MINUS:
12375 if (CONST_INT_P (value))
12377 value = GEN_INT (-INTVAL (value));
12378 code = PLUS;
12380 /* Fall through. */
12382 default:
12383 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12384 emit_insn (gen_rtx_SET (new_out, x));
12385 break;
12388 aarch64_emit_store_exclusive (mode, cond, mem,
12389 gen_lowpart (mode, new_out), model_rtx);
12391 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12392 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12393 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12394 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12396 /* Emit any final barrier needed for a __sync operation. */
12397 if (is_sync)
12398 aarch64_emit_post_barrier (model);
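/* Without LSE an atomic add, for instance, ends up as the classic
   load/store-exclusive loop built above, roughly
       .Lretry:
         ldxr  w_old, [x_mem]
         add   w_new, w_old, w_value
         stlxr w_tmp, w_new, [x_mem]
         cbnz  w_tmp, .Lretry
   with the acquire/release variants chosen from MODEL_RTX.  */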
12401 static void
12402 aarch64_init_libfuncs (void)
12404 /* Half-precision float operations. The compiler handles all operations
12405 with NULL libfuncs by converting to SFmode. */
12407 /* Conversions. */
12408 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12409 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12411 /* Arithmetic. */
12412 set_optab_libfunc (add_optab, HFmode, NULL);
12413 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12414 set_optab_libfunc (smul_optab, HFmode, NULL);
12415 set_optab_libfunc (neg_optab, HFmode, NULL);
12416 set_optab_libfunc (sub_optab, HFmode, NULL);
12418 /* Comparisons. */
12419 set_optab_libfunc (eq_optab, HFmode, NULL);
12420 set_optab_libfunc (ne_optab, HFmode, NULL);
12421 set_optab_libfunc (lt_optab, HFmode, NULL);
12422 set_optab_libfunc (le_optab, HFmode, NULL);
12423 set_optab_libfunc (ge_optab, HFmode, NULL);
12424 set_optab_libfunc (gt_optab, HFmode, NULL);
12425 set_optab_libfunc (unord_optab, HFmode, NULL);
12428 /* Target hook for c_mode_for_suffix. */
12429 static machine_mode
12430 aarch64_c_mode_for_suffix (char suffix)
12432 if (suffix == 'q')
12433 return TFmode;
12435 return VOIDmode;
12438 /* We can only represent floating point constants which will fit in
12439 "quarter-precision" values. These values are characterised by
12440 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12443 (-1)^s * (n/16) * 2^r
12445 Where:
12446 's' is the sign bit.
12447 'n' is an integer in the range 16 <= n <= 31.
12448 'r' is an integer in the range -3 <= r <= 4. */
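/* So, for example, 0.125 = (16/16) * 2^-3 and 31.0 = (31/16) * 2^4 are the
   smallest and largest positive representable values, 1.0 and 0.5 are
   representable, while 0.1 has no such encoding.  */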
12450 /* Return true iff X can be represented by a quarter-precision
12451 floating point immediate operand.  Note, we cannot represent 0.0. */
12452 bool
12453 aarch64_float_const_representable_p (rtx x)
12455 /* This represents our current view of how many bits
12456 make up the mantissa. */
12457 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12458 int exponent;
12459 unsigned HOST_WIDE_INT mantissa, mask;
12460 REAL_VALUE_TYPE r, m;
12461 bool fail;
12463 if (!CONST_DOUBLE_P (x))
12464 return false;
12466 /* We don't support HFmode constants yet. */
12467 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12468 return false;
12470 r = *CONST_DOUBLE_REAL_VALUE (x);
12472 /* We cannot represent infinities, NaNs or +/-zero. We won't
12473 know if we have +zero until we analyse the mantissa, but we
12474 can reject the other invalid values. */
12475 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12476 || REAL_VALUE_MINUS_ZERO (r))
12477 return false;
12479 /* Extract exponent. */
12480 r = real_value_abs (&r);
12481 exponent = REAL_EXP (&r);
12483 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12484 highest (sign) bit, with a fixed binary point at bit point_pos.
12485 m1 holds the low part of the mantissa, m2 the high part.
12486 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12487 bits for the mantissa, this can fail (low bits will be lost). */
12488 real_ldexp (&m, &r, point_pos - exponent);
12489 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12491 /* If the low part of the mantissa has bits set we cannot represent
12492 the value. */
12493 if (w.elt (0) != 0)
12494 return false;
12495 /* We have rejected the lower HOST_WIDE_INT, so update our
12496 understanding of how many bits lie in the mantissa and
12497 look only at the high HOST_WIDE_INT. */
12498 mantissa = w.elt (1);
12499 point_pos -= HOST_BITS_PER_WIDE_INT;
12501 /* We can only represent values with a mantissa of the form 1.xxxx. */
12502 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12503 if ((mantissa & mask) != 0)
12504 return false;
12506 /* Having filtered unrepresentable values, we may now remove all
12507 but the highest 5 bits. */
12508 mantissa >>= point_pos - 5;
12510 /* We cannot represent the value 0.0, so reject it. This is handled
12511 elsewhere. */
12512 if (mantissa == 0)
12513 return false;
12515 /* Then, as bit 4 is always set, we can mask it off, leaving
12516 the mantissa in the range [0, 15]. */
12517 mantissa &= ~(1 << 4);
12518 gcc_assert (mantissa <= 15);
12520 /* GCC internally does not use IEEE754-like encoding (where normalized
12521 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12522 Our mantissa values are shifted 4 places to the left relative to
12523 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12524 by 5 places to correct for GCC's representation. */
12525 exponent = 5 - exponent;
12527 return (exponent >= 0 && exponent <= 7);
12530 char*
12531 aarch64_output_simd_mov_immediate (rtx const_vector,
12532 machine_mode mode,
12533 unsigned width)
12535 bool is_valid;
12536 static char templ[40];
12537 const char *mnemonic;
12538 const char *shift_op;
12539 unsigned int lane_count = 0;
12540 char element_char;
12542 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12544 /* This will return true to show CONST_VECTOR is legal for use as an
12545 AdvSIMD MOVI (or, implicitly, MVNI) immediate. It will
12546 also update INFO to show how the immediate should be generated. */
12547 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12548 gcc_assert (is_valid);
12550 element_char = sizetochar (info.element_width);
12551 lane_count = width / info.element_width;
12553 mode = GET_MODE_INNER (mode);
12554 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12556 gcc_assert (info.shift == 0 && ! info.mvn);
12557 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12558 move immediate path. */
12559 if (aarch64_float_const_zero_rtx_p (info.value))
12560 info.value = GEN_INT (0);
12561 else
12563 const unsigned int buf_size = 20;
12564 char float_buf[buf_size] = {'\0'};
12565 real_to_decimal_for_mode (float_buf,
12566 CONST_DOUBLE_REAL_VALUE (info.value),
12567 buf_size, buf_size, 1, mode);
12569 if (lane_count == 1)
12570 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12571 else
12572 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12573 lane_count, element_char, float_buf);
12574 return templ;
12578 mnemonic = info.mvn ? "mvni" : "movi";
12579 shift_op = info.msl ? "msl" : "lsl";
12581 gcc_assert (CONST_INT_P (info.value));
12582 if (lane_count == 1)
12583 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12584 mnemonic, UINTVAL (info.value));
12585 else if (info.shift)
12586 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12587 ", %s %d", mnemonic, lane_count, element_char,
12588 UINTVAL (info.value), shift_op, info.shift);
12589 else
12590 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12591 mnemonic, lane_count, element_char, UINTVAL (info.value));
12592 return templ;
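/* Typical strings produced by the function above look roughly like
       movi  v0.4s, 0x1, lsl 8
       mvni  v0.8h, 0x2
       fmov  v0.2d, 1.0e+0
   depending on the element width, shift and mvn/msl flags in INFO.  */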
12595 char*
12596 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12597 machine_mode mode)
12599 machine_mode vmode;
12601 gcc_assert (!VECTOR_MODE_P (mode));
12602 vmode = aarch64_simd_container_mode (mode, 64);
12603 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12604 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12607 /* Split operands into moves from op[1] + op[2] into op[0]. */
12609 void
12610 aarch64_split_combinev16qi (rtx operands[3])
12612 unsigned int dest = REGNO (operands[0]);
12613 unsigned int src1 = REGNO (operands[1]);
12614 unsigned int src2 = REGNO (operands[2]);
12615 machine_mode halfmode = GET_MODE (operands[1]);
12616 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12617 rtx destlo, desthi;
12619 gcc_assert (halfmode == V16QImode);
12621 if (src1 == dest && src2 == dest + halfregs)
12623 /* No-op move. Can't split to nothing; emit something. */
12624 emit_note (NOTE_INSN_DELETED);
12625 return;
12628 /* Preserve register attributes for variable tracking. */
12629 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12630 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12631 GET_MODE_SIZE (halfmode));
12633 /* Special case of reversed high/low parts. */
12634 if (reg_overlap_mentioned_p (operands[2], destlo)
12635 && reg_overlap_mentioned_p (operands[1], desthi))
12637 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12638 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12639 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12641 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12643 /* Try to avoid unnecessary moves if part of the result
12644 is in the right place already. */
12645 if (src1 != dest)
12646 emit_move_insn (destlo, operands[1]);
12647 if (src2 != dest + halfregs)
12648 emit_move_insn (desthi, operands[2]);
12650 else
12652 if (src2 != dest + halfregs)
12653 emit_move_insn (desthi, operands[2]);
12654 if (src1 != dest)
12655 emit_move_insn (destlo, operands[1]);
12659 /* vec_perm support. */
12661 #define MAX_VECT_LEN 16
12663 struct expand_vec_perm_d
12665 rtx target, op0, op1;
12666 unsigned char perm[MAX_VECT_LEN];
12667 machine_mode vmode;
12668 unsigned char nelt;
12669 bool one_vector_p;
12670 bool testing_p;
12673 /* Generate a variable permutation. */
12675 static void
12676 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12678 machine_mode vmode = GET_MODE (target);
12679 bool one_vector_p = rtx_equal_p (op0, op1);
12681 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12682 gcc_checking_assert (GET_MODE (op0) == vmode);
12683 gcc_checking_assert (GET_MODE (op1) == vmode);
12684 gcc_checking_assert (GET_MODE (sel) == vmode);
12685 gcc_checking_assert (TARGET_SIMD);
12687 if (one_vector_p)
12689 if (vmode == V8QImode)
12691 /* Expand the argument to a V16QI mode by duplicating it. */
12692 rtx pair = gen_reg_rtx (V16QImode);
12693 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12694 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12696 else
12698 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12701 else
12703 rtx pair;
12705 if (vmode == V8QImode)
12707 pair = gen_reg_rtx (V16QImode);
12708 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12709 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12711 else
12713 pair = gen_reg_rtx (OImode);
12714 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12715 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12720 void
12721 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12723 machine_mode vmode = GET_MODE (target);
12724 unsigned int nelt = GET_MODE_NUNITS (vmode);
12725 bool one_vector_p = rtx_equal_p (op0, op1);
12726 rtx mask;
12728 /* The TBL instruction does not use a modulo index, so we must take care
12729 of that ourselves. */
12730 mask = aarch64_simd_gen_const_vector_dup (vmode,
12731 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12732 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12734 /* For big-endian, we also need to reverse the index within the vector
12735 (but not which vector). */
12736 if (BYTES_BIG_ENDIAN)
12738 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12739 if (!one_vector_p)
12740 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12741 sel = expand_simple_binop (vmode, XOR, sel, mask,
12742 NULL, 0, OPTAB_LIB_WIDEN);
12744 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12747 /* Recognize patterns suitable for the TRN instructions. */
12748 static bool
12749 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12751 unsigned int i, odd, mask, nelt = d->nelt;
12752 rtx out, in0, in1, x;
12753 rtx (*gen) (rtx, rtx, rtx);
12754 machine_mode vmode = d->vmode;
12756 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12757 return false;
12759 /* Note that these are little-endian tests.
12760 We correct for big-endian later. */
12761 if (d->perm[0] == 0)
12762 odd = 0;
12763 else if (d->perm[0] == 1)
12764 odd = 1;
12765 else
12766 return false;
12767 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12769 for (i = 0; i < nelt; i += 2)
12771 if (d->perm[i] != i + odd)
12772 return false;
12773 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12774 return false;
12777 /* Success! */
12778 if (d->testing_p)
12779 return true;
12781 in0 = d->op0;
12782 in1 = d->op1;
12783 if (BYTES_BIG_ENDIAN)
12785 x = in0, in0 = in1, in1 = x;
12786 odd = !odd;
12788 out = d->target;
12790 if (odd)
12792 switch (vmode)
12794 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12795 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12796 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12797 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12798 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12799 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12800 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12801 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12802 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12803 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12804 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12805 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12806 default:
12807 return false;
12810 else
12812 switch (vmode)
12814 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12815 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12816 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12817 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12818 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12819 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12820 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12821 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12822 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12823 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12824 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12825 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12826 default:
12827 return false;
12831 emit_insn (gen (out, in0, in1));
12832 return true;
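/* E.g. for V4SImode a permute selector of {0, 4, 2, 6} is matched by the
   function above as TRN1 and {1, 5, 3, 7} as TRN2 (before the big-endian
   correction).  */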
12835 /* Recognize patterns suitable for the UZP instructions. */
12836 static bool
12837 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12839 unsigned int i, odd, mask, nelt = d->nelt;
12840 rtx out, in0, in1, x;
12841 rtx (*gen) (rtx, rtx, rtx);
12842 machine_mode vmode = d->vmode;
12844 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12845 return false;
12847 /* Note that these are little-endian tests.
12848 We correct for big-endian later. */
12849 if (d->perm[0] == 0)
12850 odd = 0;
12851 else if (d->perm[0] == 1)
12852 odd = 1;
12853 else
12854 return false;
12855 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12857 for (i = 0; i < nelt; i++)
12859 unsigned elt = (i * 2 + odd) & mask;
12860 if (d->perm[i] != elt)
12861 return false;
12864 /* Success! */
12865 if (d->testing_p)
12866 return true;
12868 in0 = d->op0;
12869 in1 = d->op1;
12870 if (BYTES_BIG_ENDIAN)
12872 x = in0, in0 = in1, in1 = x;
12873 odd = !odd;
12875 out = d->target;
12877 if (odd)
12879 switch (vmode)
12881 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12882 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12883 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12884 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12885 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12886 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12887 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12888 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12889 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12890 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12891 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12892 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12893 default:
12894 return false;
12897 else
12899 switch (vmode)
12901 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12902 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12903 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12904 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12905 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12906 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12907 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12908 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12909 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12910 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12911 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12912 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12913 default:
12914 return false;
12918 emit_insn (gen (out, in0, in1));
12919 return true;
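/* E.g. for V4SImode a selector of {0, 2, 4, 6} is matched by the function
   above as UZP1 and {1, 3, 5, 7} as UZP2 (before the big-endian
   correction).  */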
12922 /* Recognize patterns suitable for the ZIP instructions. */
12923 static bool
12924 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12926 unsigned int i, high, mask, nelt = d->nelt;
12927 rtx out, in0, in1, x;
12928 rtx (*gen) (rtx, rtx, rtx);
12929 machine_mode vmode = d->vmode;
12931 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12932 return false;
12934 /* Note that these are little-endian tests.
12935 We correct for big-endian later. */
12936 high = nelt / 2;
12937 if (d->perm[0] == high)
12938 /* Do Nothing. */
12940 else if (d->perm[0] == 0)
12941 high = 0;
12942 else
12943 return false;
12944 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12946 for (i = 0; i < nelt / 2; i++)
12948 unsigned elt = (i + high) & mask;
12949 if (d->perm[i * 2] != elt)
12950 return false;
12951 elt = (elt + nelt) & mask;
12952 if (d->perm[i * 2 + 1] != elt)
12953 return false;
12956 /* Success! */
12957 if (d->testing_p)
12958 return true;
12960 in0 = d->op0;
12961 in1 = d->op1;
12962 if (BYTES_BIG_ENDIAN)
12964 x = in0, in0 = in1, in1 = x;
12965 high = !high;
12967 out = d->target;
12969 if (high)
12971 switch (vmode)
12973 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12974 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12975 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12976 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12977 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12978 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12979 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12980 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12981 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12982 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12983 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12984 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12985 default:
12986 return false;
12989 else
12991 switch (vmode)
12993 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12994 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12995 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12996 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12997 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12998 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12999 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13000 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13001 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13002 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13003 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13004 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13005 default:
13006 return false;
13010 emit_insn (gen (out, in0, in1));
13011 return true;
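/* E.g. for V4SImode a selector of {0, 4, 1, 5} is matched by the function
   above as ZIP1 and {2, 6, 3, 7} as ZIP2 (before the big-endian
   correction).  */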
13014 /* Recognize patterns for the EXT insn. */
13016 static bool
13017 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13019 unsigned int i, nelt = d->nelt;
13020 rtx (*gen) (rtx, rtx, rtx, rtx);
13021 rtx offset;
13023 unsigned int location = d->perm[0]; /* Always < nelt. */
13025 /* Check if the extracted indices are increasing by one. */
13026 for (i = 1; i < nelt; i++)
13028 unsigned int required = location + i;
13029 if (d->one_vector_p)
13031 /* We'll pass the same vector in twice, so allow indices to wrap. */
13032 required &= (nelt - 1);
13034 if (d->perm[i] != required)
13035 return false;
13038 switch (d->vmode)
13040 case V16QImode: gen = gen_aarch64_extv16qi; break;
13041 case V8QImode: gen = gen_aarch64_extv8qi; break;
13042 case V4HImode: gen = gen_aarch64_extv4hi; break;
13043 case V8HImode: gen = gen_aarch64_extv8hi; break;
13044 case V2SImode: gen = gen_aarch64_extv2si; break;
13045 case V4SImode: gen = gen_aarch64_extv4si; break;
13046 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13047 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13048 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13049 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13050 case V2DImode: gen = gen_aarch64_extv2di; break;
13051 case V2DFmode: gen = gen_aarch64_extv2df; break;
13052 default:
13053 return false;
13056 /* Success! */
13057 if (d->testing_p)
13058 return true;
13060 /* The case where (location == 0) is a no-op for both big- and little-endian,
13061 and is removed by the mid-end at optimization levels -O1 and higher. */
13063 if (BYTES_BIG_ENDIAN && (location != 0))
13065 /* After setup, we want the high elements of the first vector (stored
13066 at the LSB end of the register), and the low elements of the second
13067 vector (stored at the MSB end of the register). So swap. */
13068 std::swap (d->op0, d->op1);
13069 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13070 location = nelt - location;
13073 offset = GEN_INT (location);
13074 emit_insn (gen (d->target, d->op0, d->op1, offset));
13075 return true;
13078 /* Recognize patterns for the REV insns. */
13080 static bool
13081 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13083 unsigned int i, j, diff, nelt = d->nelt;
13084 rtx (*gen) (rtx, rtx);
13086 if (!d->one_vector_p)
13087 return false;
13089 diff = d->perm[0];
13090 switch (diff)
13092 case 7:
13093 switch (d->vmode)
13095 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13096 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13097 default:
13098 return false;
13100 break;
13101 case 3:
13102 switch (d->vmode)
13104 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13105 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13106 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13107 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13108 default:
13109 return false;
13111 break;
13112 case 1:
13113 switch (d->vmode)
13115 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13116 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13117 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13118 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13119 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13120 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13121 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13122 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13123 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13124 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13125 default:
13126 return false;
13128 break;
13129 default:
13130 return false;
13133 for (i = 0; i < nelt ; i += diff + 1)
13134 for (j = 0; j <= diff; j += 1)
13136 /* This is guaranteed to be true as the value of diff
13137 is one of 7, 3 or 1, and we should have enough elements in the
13138 queue to generate this. Getting a vector mask with a
13139 value of diff other than these values implies that
13140 something is wrong by the time we get here. */
13141 gcc_assert (i + j < nelt);
13142 if (d->perm[i + j] != i + diff - j)
13143 return false;
13146 /* Success! */
13147 if (d->testing_p)
13148 return true;
13150 emit_insn (gen (d->target, d->op0));
13151 return true;
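/* E.g. a V8QImode selector of {7,6,5,4,3,2,1,0} (diff == 7) is matched
   above as REV64, and {1,0,3,2,5,4,7,6} (diff == 1) as REV16.  */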
13154 static bool
13155 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13157 rtx (*gen) (rtx, rtx, rtx);
13158 rtx out = d->target;
13159 rtx in0;
13160 machine_mode vmode = d->vmode;
13161 unsigned int i, elt, nelt = d->nelt;
13162 rtx lane;
13164 elt = d->perm[0];
13165 for (i = 1; i < nelt; i++)
13167 if (elt != d->perm[i])
13168 return false;
13171 /* The generic preparation in aarch64_expand_vec_perm_const_1
13172 swaps the operand order and the permute indices if it finds
13173 d->perm[0] to be in the second operand. Thus, we can always
13174 use d->op0 and need not do any extra arithmetic to get the
13175 correct lane number. */
13176 in0 = d->op0;
13177 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13179 switch (vmode)
13181 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13182 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13183 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13184 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13185 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13186 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13187 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13188 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13189 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13190 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13191 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13192 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13193 default:
13194 return false;
13197 emit_insn (gen (out, in0, lane));
13198 return true;
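/* E.g. a V4SImode selector of {3, 3, 3, 3} is handled above as a single
   DUP (element) instruction broadcasting lane 3 of the input vector.  */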
13201 static bool
13202 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13204 rtx rperm[MAX_VECT_LEN], sel;
13205 machine_mode vmode = d->vmode;
13206 unsigned int i, nelt = d->nelt;
13208 if (d->testing_p)
13209 return true;
13211 /* Generic code will try constant permutation twice: once with the
13212 original mode and again with the elements lowered to QImode.
13213 So wait and don't do the selector expansion ourselves. */
13214 if (vmode != V8QImode && vmode != V16QImode)
13215 return false;
13217 for (i = 0; i < nelt; ++i)
13219 int nunits = GET_MODE_NUNITS (vmode);
13221 /* If big-endian and two vectors we end up with a weird mixed-endian
13222 mode on NEON. Reverse the index within each word but not the word
13223 itself. */
13224 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13225 : d->perm[i]);
13227 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13228 sel = force_reg (vmode, sel);
13230 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13231 return true;
13234 static bool
13235 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13237 /* The pattern matching functions above are written to look for a small
13238 number to begin the sequence (0, 1, N/2). If we begin with an index
13239 from the second operand, we can swap the operands. */
13240 if (d->perm[0] >= d->nelt)
13242 unsigned i, nelt = d->nelt;
13244 gcc_assert (nelt == (nelt & -nelt));
13245 for (i = 0; i < nelt; ++i)
13246 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13248 std::swap (d->op0, d->op1);
13251 if (TARGET_SIMD)
13253 if (aarch64_evpc_rev (d))
13254 return true;
13255 else if (aarch64_evpc_ext (d))
13256 return true;
13257 else if (aarch64_evpc_dup (d))
13258 return true;
13259 else if (aarch64_evpc_zip (d))
13260 return true;
13261 else if (aarch64_evpc_uzp (d))
13262 return true;
13263 else if (aarch64_evpc_trn (d))
13264 return true;
13265 return aarch64_evpc_tbl (d);
13267 return false;
13270 /* Expand a vec_perm_const pattern. */
13272 bool
13273 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13275 struct expand_vec_perm_d d;
13276 int i, nelt, which;
13278 d.target = target;
13279 d.op0 = op0;
13280 d.op1 = op1;
13282 d.vmode = GET_MODE (target);
13283 gcc_assert (VECTOR_MODE_P (d.vmode));
13284 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13285 d.testing_p = false;
13287 for (i = which = 0; i < nelt; ++i)
13289 rtx e = XVECEXP (sel, 0, i);
13290 int ei = INTVAL (e) & (2 * nelt - 1);
13291 which |= (ei < nelt ? 1 : 2);
13292 d.perm[i] = ei;
13295 switch (which)
13297 default:
13298 gcc_unreachable ();
13300 case 3:
13301 d.one_vector_p = false;
13302 if (!rtx_equal_p (op0, op1))
13303 break;
13305 /* The elements of PERM do not suggest that only the first operand
13306 is used, but both operands are identical. Allow easier matching
13307 of the permutation by folding the permutation into the single
13308 input vector. */
13309 /* Fall Through. */
13310 case 2:
13311 for (i = 0; i < nelt; ++i)
13312 d.perm[i] &= nelt - 1;
13313 d.op0 = op1;
13314 d.one_vector_p = true;
13315 break;
13317 case 1:
13318 d.op1 = op0;
13319 d.one_vector_p = true;
13320 break;
13323 return aarch64_expand_vec_perm_const_1 (&d);
13326 static bool
13327 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13328 const unsigned char *sel)
13330 struct expand_vec_perm_d d;
13331 unsigned int i, nelt, which;
13332 bool ret;
13334 d.vmode = vmode;
13335 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13336 d.testing_p = true;
13337 memcpy (d.perm, sel, nelt);
13339 /* Calculate whether all elements are in one vector. */
13340 for (i = which = 0; i < nelt; ++i)
13342 unsigned char e = d.perm[i];
13343 gcc_assert (e < 2 * nelt);
13344 which |= (e < nelt ? 1 : 2);
13347 /* If all elements are from the second vector, reindex as if from the
13348 first vector. */
13349 if (which == 2)
13350 for (i = 0; i < nelt; ++i)
13351 d.perm[i] -= nelt;
13353 /* Check whether the mask can be applied to a single vector. */
13354 d.one_vector_p = (which != 3);
13356 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13357 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13358 if (!d.one_vector_p)
13359 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13361 start_sequence ();
13362 ret = aarch64_expand_vec_perm_const_1 (&d);
13363 end_sequence ();
13365 return ret;
13368 rtx
13369 aarch64_reverse_mask (enum machine_mode mode)
13371 /* We have to reverse each vector because we don't have
13372 a permuted load that can reverse-load according to ABI rules. */
13373 rtx mask;
13374 rtvec v = rtvec_alloc (16);
13375 int i, j;
13376 int nunits = GET_MODE_NUNITS (mode);
13377 int usize = GET_MODE_UNIT_SIZE (mode);
13379 gcc_assert (BYTES_BIG_ENDIAN);
13380 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13382 for (i = 0; i < nunits; i++)
13383 for (j = 0; j < usize; j++)
13384 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13385 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13386 return force_reg (V16QImode, mask);
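/* E.g. for a mode with 4-byte elements the byte-index mask built above is
   { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }, reversing the bytes within
   each element but not the elements themselves.  */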
13389 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13390 However due to issues with register allocation it is preferable to avoid
13391 tying integer scalar and FP scalar modes. Executing integer operations
13392 in general registers is better than treating them as scalar vector
13393 operations. This reduces latency and avoids redundant int<->FP moves.
13394 So tie modes if they are either the same class, or vector modes with
13395 other vector modes, vector structs or any scalar mode.
13398 bool
13399 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13401 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13402 return true;
13404 /* We specifically want to allow elements of "structure" modes to
13405 be tieable to the structure. This more general condition allows
13406 other rarer situations too. */
13407 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13408 return true;
13410 /* Also allow any scalar modes with vectors. */
13411 if (aarch64_vector_mode_supported_p (mode1)
13412 || aarch64_vector_mode_supported_p (mode2))
13413 return true;
13415 return false;
13418 /* Return a new RTX holding the result of moving POINTER forward by
13419 AMOUNT bytes. */
13421 static rtx
13422 aarch64_move_pointer (rtx pointer, int amount)
13424 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13426 return adjust_automodify_address (pointer, GET_MODE (pointer),
13427 next, amount);
13430 /* Return a new RTX holding the result of moving POINTER forward by the
13431 size of the mode it points to. */
13433 static rtx
13434 aarch64_progress_pointer (rtx pointer)
13436 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13438 return aarch64_move_pointer (pointer, amount);
13441 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13442 MODE bytes. */
13444 static void
13445 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13446 machine_mode mode)
13448 rtx reg = gen_reg_rtx (mode);
13450 /* "Cast" the pointers to the correct mode. */
13451 *src = adjust_address (*src, mode, 0);
13452 *dst = adjust_address (*dst, mode, 0);
13453 /* Emit the memcpy. */
13454 emit_move_insn (reg, *src);
13455 emit_move_insn (*dst, reg);
13456 /* Move the pointers forward. */
13457 *src = aarch64_progress_pointer (*src);
13458 *dst = aarch64_progress_pointer (*dst);
13461 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13462 we succeed, otherwise return false. */
13464 bool
13465 aarch64_expand_movmem (rtx *operands)
13467 unsigned int n;
13468 rtx dst = operands[0];
13469 rtx src = operands[1];
13470 rtx base;
13471 bool speed_p = !optimize_function_for_size_p (cfun);
13473 /* When optimizing for size, give a better estimate of the length of a
13474 memcpy call, but use the default otherwise. */
13475 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13477 /* We can't do anything smart if the amount to copy is not constant. */
13478 if (!CONST_INT_P (operands[2]))
13479 return false;
13481 n = UINTVAL (operands[2]);
13483 /* Try to keep the number of instructions low. For cases below 16 bytes we
13484 need to make at most two moves. For cases above 16 bytes it will be one
13485 move for each 16 byte chunk, then at most two additional moves. */
13486 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13487 return false;
13489 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13490 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13492 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13493 src = adjust_automodify_address (src, VOIDmode, base, 0);
13495 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13496 1-byte chunk. */
13497 if (n < 4)
13499 if (n >= 2)
13501 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13502 n -= 2;
13505 if (n == 1)
13506 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13508 return true;
13511 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13512 4-byte chunk, partially overlapping with the previously copied chunk. */
13513 if (n < 8)
13515 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13516 n -= 4;
13517 if (n > 0)
13519 int move = n - 4;
13521 src = aarch64_move_pointer (src, move);
13522 dst = aarch64_move_pointer (dst, move);
13523 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13525 return true;
13528 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13529 them, then (if applicable) an 8-byte chunk. */
13530 while (n >= 8)
13532 if (n / 16)
13534 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13535 n -= 16;
13537 else
13539 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13540 n -= 8;
13544 /* Finish the final bytes of the copy. We can always do this in one
13545 instruction. We either copy the exact amount we need, or partially
13546 overlap with the previous chunk we copied and copy 4 or 8 bytes. */
13547 if (n == 0)
13548 return true;
13549 else if (n == 1)
13550 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13551 else if (n == 2)
13552 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13553 else if (n == 4)
13554 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13555 else
13557 if (n == 3)
13559 src = aarch64_move_pointer (src, -1);
13560 dst = aarch64_move_pointer (dst, -1);
13561 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13563 else
13565 int move = n - 8;
13567 src = aarch64_move_pointer (src, move);
13568 dst = aarch64_move_pointer (dst, move);
13569 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13573 return true;
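/* As a worked example, a 35-byte constant-size copy proceeds above as two
   16-byte (TImode) copies covering bytes 0-31, after which n == 3, so the
   pointers are moved back by one byte and a final overlapping 4-byte
   (SImode) copy finishes bytes 31-34.  */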
13576 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13577 SImode stores. Handle the case when the constant has identical
13578 bottom and top halves. This is beneficial when the two stores can be
13579 merged into an STP and we avoid synthesising potentially expensive
13580 immediates twice. Return true if such a split is possible. */
13582 bool
13583 aarch64_split_dimode_const_store (rtx dst, rtx src)
13585 rtx lo = gen_lowpart (SImode, src);
13586 rtx hi = gen_highpart_mode (SImode, DImode, src);
13588 bool size_p = optimize_function_for_size_p (cfun);
13590 if (!rtx_equal_p (lo, hi))
13591 return false;
13593 unsigned int orig_cost
13594 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13595 unsigned int lo_cost
13596 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13598 /* We want to transform:
13599 MOV x1, 49370
13600 MOVK x1, 0x140, lsl 16
13601 MOVK x1, 0xc0da, lsl 32
13602 MOVK x1, 0x140, lsl 48
13603 STR x1, [x0]
13604 into:
13605 MOV w1, 49370
13606 MOVK w1, 0x140, lsl 16
13607 STP w1, w1, [x0]
13608 So we want to perform this only when we save two instructions
13609 or more. When optimizing for size, however, accept any code size
13610 savings we can. */
13611 if (size_p && orig_cost <= lo_cost)
13612 return false;
13614 if (!size_p
13615 && (orig_cost <= lo_cost + 1))
13616 return false;
13618 rtx mem_lo = adjust_address (dst, SImode, 0);
13619 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13620 return false;
13622 rtx tmp_reg = gen_reg_rtx (SImode);
13623 aarch64_expand_mov_immediate (tmp_reg, lo);
13624 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13625 /* Don't emit an explicit store pair as this may not be always profitable.
13626 Let the sched-fusion logic decide whether to merge them. */
13627 emit_move_insn (mem_lo, tmp_reg);
13628 emit_move_insn (mem_hi, tmp_reg);
13630 return true;
13633 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13635 static unsigned HOST_WIDE_INT
13636 aarch64_asan_shadow_offset (void)
13638 return (HOST_WIDE_INT_1 << 36);
13641 static bool
13642 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13643 unsigned int align,
13644 enum by_pieces_operation op,
13645 bool speed_p)
13647 /* STORE_BY_PIECES can be used when copying a constant string, but
13648 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13649 For now we always fail this and let the move_by_pieces code copy
13650 the string from read-only memory. */
13651 if (op == STORE_BY_PIECES)
13652 return false;
13654 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13657 static rtx
13658 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13659 int code, tree treeop0, tree treeop1)
13661 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13662 rtx op0, op1;
13663 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13664 insn_code icode;
13665 struct expand_operand ops[4];
13667 start_sequence ();
13668 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13670 op_mode = GET_MODE (op0);
13671 if (op_mode == VOIDmode)
13672 op_mode = GET_MODE (op1);
13674 switch (op_mode)
13676 case QImode:
13677 case HImode:
13678 case SImode:
13679 cmp_mode = SImode;
13680 icode = CODE_FOR_cmpsi;
13681 break;
13683 case DImode:
13684 cmp_mode = DImode;
13685 icode = CODE_FOR_cmpdi;
13686 break;
13688 case SFmode:
13689 cmp_mode = SFmode;
13690 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13691 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13692 break;
13694 case DFmode:
13695 cmp_mode = DFmode;
13696 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13697 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13698 break;
13700 default:
13701 end_sequence ();
13702 return NULL_RTX;
13705 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13706 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13707 if (!op0 || !op1)
13709 end_sequence ();
13710 return NULL_RTX;
13712 *prep_seq = get_insns ();
13713 end_sequence ();
13715 create_fixed_operand (&ops[0], op0);
13716 create_fixed_operand (&ops[1], op1);
13718 start_sequence ();
13719 if (!maybe_expand_insn (icode, 2, ops))
13721 end_sequence ();
13722 return NULL_RTX;
13724 *gen_seq = get_insns ();
13725 end_sequence ();
13727 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13728 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13731 static rtx
13732 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13733 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13735 rtx op0, op1, target;
13736 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13737 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13738 insn_code icode;
13739 struct expand_operand ops[6];
13740 int aarch64_cond;
13742 push_to_sequence (*prep_seq);
13743 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13745 op_mode = GET_MODE (op0);
13746 if (op_mode == VOIDmode)
13747 op_mode = GET_MODE (op1);
13749 switch (op_mode)
13751 case QImode:
13752 case HImode:
13753 case SImode:
13754 cmp_mode = SImode;
13755 icode = CODE_FOR_ccmpsi;
13756 break;
13758 case DImode:
13759 cmp_mode = DImode;
13760 icode = CODE_FOR_ccmpdi;
13761 break;
13763 case SFmode:
13764 cmp_mode = SFmode;
13765 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13766 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13767 break;
13769 case DFmode:
13770 cmp_mode = DFmode;
13771 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13772 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13773 break;
13775 default:
13776 end_sequence ();
13777 return NULL_RTX;
13780 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13781 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13782 if (!op0 || !op1)
13784 end_sequence ();
13785 return NULL_RTX;
13787 *prep_seq = get_insns ();
13788 end_sequence ();
13790 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13791 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13793 if (bit_code != AND)
13795 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13796 GET_MODE (XEXP (prev, 0))),
13797 VOIDmode, XEXP (prev, 0), const0_rtx);
13798 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13801 create_fixed_operand (&ops[0], XEXP (prev, 0));
13802 create_fixed_operand (&ops[1], target);
13803 create_fixed_operand (&ops[2], op0);
13804 create_fixed_operand (&ops[3], op1);
13805 create_fixed_operand (&ops[4], prev);
13806 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13808 push_to_sequence (*gen_seq);
13809 if (!maybe_expand_insn (icode, 6, ops))
13811 end_sequence ();
13812 return NULL_RTX;
13815 *gen_seq = get_insns ();
13816 end_sequence ();
13818 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13821 #undef TARGET_GEN_CCMP_FIRST
13822 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13824 #undef TARGET_GEN_CCMP_NEXT
13825 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13827 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13828 instruction fusion of some sort. */
13830 static bool
13831 aarch64_macro_fusion_p (void)
13833 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13837 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13838 should be kept together during scheduling. */
13840 static bool
13841 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13843 rtx set_dest;
13844 rtx prev_set = single_set (prev);
13845 rtx curr_set = single_set (curr);
13846 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13847 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13849 if (!aarch64_macro_fusion_p ())
13850 return false;
13852 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13854 /* We are trying to match:
13855 prev (mov) == (set (reg r0) (const_int imm16))
13856 curr (movk) == (set (zero_extract (reg r0)
13857 (const_int 16)
13858 (const_int 16))
13859 (const_int imm16_1)) */
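/* An illustrative asm rendering of the pattern above (register and
   immediate names are placeholders, not taken from the sources):
     mov  x0, #imm16
     movk x0, #imm16_1, lsl #16
   Keeping such a pair adjacent lets cores that implement
   AARCH64_FUSE_MOV_MOVK fuse them.  */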
13861 set_dest = SET_DEST (curr_set);
13863 if (GET_CODE (set_dest) == ZERO_EXTRACT
13864 && CONST_INT_P (SET_SRC (curr_set))
13865 && CONST_INT_P (SET_SRC (prev_set))
13866 && CONST_INT_P (XEXP (set_dest, 2))
13867 && INTVAL (XEXP (set_dest, 2)) == 16
13868 && REG_P (XEXP (set_dest, 0))
13869 && REG_P (SET_DEST (prev_set))
13870 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13872 return true;
13876 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13879 /* We're trying to match:
13880 prev (adrp) == (set (reg r1)
13881 (high (symbol_ref ("SYM"))))
13882 curr (add) == (set (reg r0)
13883 (lo_sum (reg r1)
13884 (symbol_ref ("SYM"))))
13885 Note that r0 need not necessarily be the same as r1, especially
13886 during pre-regalloc scheduling. */
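/* An illustrative asm rendering of the pattern above (register names
   are placeholders):
     adrp x1, SYM
     add  x0, x1, :lo12:SYM  */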
13888 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13889 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13891 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13892 && REG_P (XEXP (SET_SRC (curr_set), 0))
13893 && REGNO (XEXP (SET_SRC (curr_set), 0))
13894 == REGNO (SET_DEST (prev_set))
13895 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13896 XEXP (SET_SRC (curr_set), 1)))
13897 return true;
13901 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13904 /* We're trying to match:
13905 prev (movk) == (set (zero_extract (reg r0)
13906 (const_int 16)
13907 (const_int 32))
13908 (const_int imm16_1))
13909 curr (movk) == (set (zero_extract (reg r0)
13910 (const_int 16)
13911 (const_int 48))
13912 (const_int imm16_2)) */
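/* An illustrative asm rendering of the pattern above (register and
   immediate names are placeholders):
     movk x0, #imm16_1, lsl #32
     movk x0, #imm16_2, lsl #48  */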
13914 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13915 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13916 && REG_P (XEXP (SET_DEST (prev_set), 0))
13917 && REG_P (XEXP (SET_DEST (curr_set), 0))
13918 && REGNO (XEXP (SET_DEST (prev_set), 0))
13919 == REGNO (XEXP (SET_DEST (curr_set), 0))
13920 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13921 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13922 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13923 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13924 && CONST_INT_P (SET_SRC (prev_set))
13925 && CONST_INT_P (SET_SRC (curr_set)))
13926 return true;
13929 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13931 /* We're trying to match:
13932 prev (adrp) == (set (reg r0)
13933 (high (symbol_ref ("SYM"))))
13934 curr (ldr) == (set (reg r1)
13935 (mem (lo_sum (reg r0)
13936 (symbol_ref ("SYM")))))
13938 curr (ldr) == (set (reg r1)
13939 (zero_extend (mem
13940 (lo_sum (reg r0)
13941 (symbol_ref ("SYM")))))) */
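/* An illustrative asm rendering of the first pattern above (register
   names are placeholders):
     adrp x0, SYM
     ldr  w1, [x0, :lo12:SYM]  */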
13942 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13943 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13945 rtx curr_src = SET_SRC (curr_set);
13947 if (GET_CODE (curr_src) == ZERO_EXTEND)
13948 curr_src = XEXP (curr_src, 0);
13950 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13951 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13952 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13953 == REGNO (SET_DEST (prev_set))
13954 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13955 XEXP (SET_SRC (prev_set), 0)))
13956 return true;
13960 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13961 && aarch_crypto_can_dual_issue (prev, curr))
13962 return true;
13964 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13965 && any_condjump_p (curr))
13967 enum attr_type prev_type = get_attr_type (prev);
13969 /* FIXME: this misses some instructions which are considered simple
13970 arithmetic for ThunderX. Simple shifts are missed here. */
13971 if (prev_type == TYPE_ALUS_SREG
13972 || prev_type == TYPE_ALUS_IMM
13973 || prev_type == TYPE_LOGICS_REG
13974 || prev_type == TYPE_LOGICS_IMM)
13975 return true;
13978 return false;
13981 /* Return true iff the instruction fusion described by OP is enabled. */
13983 bool
13984 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13986 return (aarch64_tune_params.fusible_ops & op) != 0;
13989 /* If MEM is in the form of [base+offset], extract the two parts of the
13990 address and set BASE and OFFSET accordingly; otherwise return false
13991 after clearing BASE and OFFSET. */
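/* For example (illustration only): a MEM whose address is
   (plus:DI (reg:DI x1) (const_int 8)) yields *BASE = (reg:DI x1) and
   *OFFSET = (const_int 8); a bare register address yields
   *OFFSET = const0_rtx; any other form clears both and returns false.  */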
13993 bool
13994 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13996 rtx addr;
13998 gcc_assert (MEM_P (mem));
14000 addr = XEXP (mem, 0);
14002 if (REG_P (addr))
14004 *base = addr;
14005 *offset = const0_rtx;
14006 return true;
14009 if (GET_CODE (addr) == PLUS
14010 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14012 *base = XEXP (addr, 0);
14013 *offset = XEXP (addr, 1);
14014 return true;
14017 *base = NULL_RTX;
14018 *offset = NULL_RTX;
14020 return false;
14023 /* Types for scheduling fusion. */
14024 enum sched_fusion_type
14026 SCHED_FUSION_NONE = 0,
14027 SCHED_FUSION_LD_SIGN_EXTEND,
14028 SCHED_FUSION_LD_ZERO_EXTEND,
14029 SCHED_FUSION_LD,
14030 SCHED_FUSION_ST,
14031 SCHED_FUSION_NUM
14034 /* If INSN is a load or store whose address is in the form of
14035 [base+offset], extract the two parts into BASE and OFFSET. Return the
14036 scheduling fusion type of this INSN. */
14038 static enum sched_fusion_type
14039 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14041 rtx x, dest, src;
14042 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14044 gcc_assert (INSN_P (insn));
14045 x = PATTERN (insn);
14046 if (GET_CODE (x) != SET)
14047 return SCHED_FUSION_NONE;
14049 src = SET_SRC (x);
14050 dest = SET_DEST (x);
14052 machine_mode dest_mode = GET_MODE (dest);
14054 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14055 return SCHED_FUSION_NONE;
14057 if (GET_CODE (src) == SIGN_EXTEND)
14059 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14060 src = XEXP (src, 0);
14061 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14062 return SCHED_FUSION_NONE;
14064 else if (GET_CODE (src) == ZERO_EXTEND)
14066 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14067 src = XEXP (src, 0);
14068 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14069 return SCHED_FUSION_NONE;
14072 if (GET_CODE (src) == MEM && REG_P (dest))
14073 extract_base_offset_in_addr (src, base, offset);
14074 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14076 fusion = SCHED_FUSION_ST;
14077 extract_base_offset_in_addr (dest, base, offset);
14079 else
14080 return SCHED_FUSION_NONE;
14082 if (*base == NULL_RTX || *offset == NULL_RTX)
14083 fusion = SCHED_FUSION_NONE;
14085 return fusion;
14088 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14090 Currently we only support fusing ldr or str instructions, so FUSION_PRI
14091 and PRI are only calculated for these instructions. For other instructions,
14092 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14093 types of instruction fusion can be added by returning different priorities.
14095 It's important that irrelevant instructions get the largest FUSION_PRI. */
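/* A worked example of the calculation below (illustrative only): for a
   load classified as SCHED_FUSION_LD with base register B and a
   non-negative offset OFF, TMP starts as MAX_PRI - 1 and we compute
     FUSION_PRI = TMP - SCHED_FUSION_LD * FIRST_PSEUDO_REGISTER - REGNO (B)
     PRI        = TMP / 2 - (OFF & 0xfffff)
   so loads from the same base register share a FUSION_PRI and, within
   that group, the insn with the smaller offset gets the higher PRI.  */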
14097 static void
14098 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14099 int *fusion_pri, int *pri)
14101 int tmp, off_val;
14102 rtx base, offset;
14103 enum sched_fusion_type fusion;
14105 gcc_assert (INSN_P (insn));
14107 tmp = max_pri - 1;
14108 fusion = fusion_load_store (insn, &base, &offset);
14109 if (fusion == SCHED_FUSION_NONE)
14111 *pri = tmp;
14112 *fusion_pri = tmp;
14113 return;
14116 /* Set FUSION_PRI according to fusion type and base register. */
14117 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14119 /* Calculate PRI. */
14120 tmp /= 2;
14122 /* INSN with smaller offset goes first. */
14123 off_val = (int)(INTVAL (offset));
14124 if (off_val >= 0)
14125 tmp -= (off_val & 0xfffff);
14126 else
14127 tmp += ((- off_val) & 0xfffff);
14129 *pri = tmp;
14130 return;
14133 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14134 Adjust priority of sha1h instructions so they are scheduled before
14135 other SHA1 instructions. */
14137 static int
14138 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14140 rtx x = PATTERN (insn);
14142 if (GET_CODE (x) == SET)
14144 x = SET_SRC (x);
14146 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14147 return priority + 10;
14150 return priority;
14153 /* Given OPERANDS of consecutive load/store, check if we can merge
14154 them into ldp/stp. LOAD is true if they are load instructions.
14155 MODE is the mode of memory operands. */
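/* For illustration (not part of the original comment): a pair such as
     ldr w0, [x2, 4]
     ldr w1, [x2, 8]
   passes the checks below (same base, consecutive SImode offsets,
   distinct destinations of the same register class) and is a candidate
   for
     ldp w0, w1, [x2, 4]  */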
14157 bool
14158 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14159 enum machine_mode mode)
14161 HOST_WIDE_INT offval_1, offval_2, msize;
14162 enum reg_class rclass_1, rclass_2;
14163 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14165 if (load)
14167 mem_1 = operands[1];
14168 mem_2 = operands[3];
14169 reg_1 = operands[0];
14170 reg_2 = operands[2];
14171 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14172 if (REGNO (reg_1) == REGNO (reg_2))
14173 return false;
14175 else
14177 mem_1 = operands[0];
14178 mem_2 = operands[2];
14179 reg_1 = operands[1];
14180 reg_2 = operands[3];
14183 /* The mems cannot be volatile. */
14184 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14185 return false;
14187 /* If we have SImode and slow unaligned ldp,
14188 check that the alignment is at least 8 bytes. */
14189 if (mode == SImode
14190 && (aarch64_tune_params.extra_tuning_flags
14191 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14192 && !optimize_size
14193 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14194 return false;
14196 /* Check if the addresses are in the form of [base+offset]. */
14197 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14198 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14199 return false;
14200 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14201 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14202 return false;
14204 /* Check if the bases are the same. */
14205 if (!rtx_equal_p (base_1, base_2))
14206 return false;
14208 offval_1 = INTVAL (offset_1);
14209 offval_2 = INTVAL (offset_2);
14210 msize = GET_MODE_SIZE (mode);
14211 /* Check if the offsets are consecutive. */
14212 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14213 return false;
14215 /* Check if the addresses are clobbered by load. */
14216 if (load)
14218 if (reg_mentioned_p (reg_1, mem_1))
14219 return false;
14221 /* In increasing order, the last load can clobber the address. */
14222 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14223 return false;
14226 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14227 rclass_1 = FP_REGS;
14228 else
14229 rclass_1 = GENERAL_REGS;
14231 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14232 rclass_2 = FP_REGS;
14233 else
14234 rclass_2 = GENERAL_REGS;
14236 /* Check if the registers are of the same class. */
14237 if (rclass_1 != rclass_2)
14238 return false;
14240 return true;
14243 /* Given OPERANDS of consecutive load/store, check if we can merge
14244 them into ldp/stp by adjusting the offset. LOAD is true if they
14245 are load instructions. MODE is the mode of memory operands.
14247 Given the following consecutive stores:
14249 str w1, [xb, 0x100]
14250 str w1, [xb, 0x104]
14251 str w1, [xb, 0x108]
14252 str w1, [xb, 0x10c]
14254 Though the offsets are out of the range supported by stp, we can
14255 still pair them after adjusting the offset, like:
14257 add scratch, xb, 0x100
14258 stp w1, w1, [scratch]
14259 stp w1, w1, [scratch, 0x8]
14261 The peephole patterns detecting this opportunity should guarantee
14262 the scratch register is available. */
14264 bool
14265 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14266 enum machine_mode mode)
14268 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14269 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14270 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14271 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14273 if (load)
14275 reg_1 = operands[0];
14276 mem_1 = operands[1];
14277 reg_2 = operands[2];
14278 mem_2 = operands[3];
14279 reg_3 = operands[4];
14280 mem_3 = operands[5];
14281 reg_4 = operands[6];
14282 mem_4 = operands[7];
14283 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14284 && REG_P (reg_3) && REG_P (reg_4));
14285 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14286 return false;
14288 else
14290 mem_1 = operands[0];
14291 reg_1 = operands[1];
14292 mem_2 = operands[2];
14293 reg_2 = operands[3];
14294 mem_3 = operands[4];
14295 reg_3 = operands[5];
14296 mem_4 = operands[6];
14297 reg_4 = operands[7];
14299 /* Skip if the memory operand is by itself valid for ldp/stp. */
14300 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14301 return false;
14303 /* The mems cannot be volatile. */
14304 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14305 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14306 return false;
14308 /* Check if the addresses are in the form of [base+offset]. */
14309 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14310 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14311 return false;
14312 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14313 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14314 return false;
14315 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14316 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14317 return false;
14318 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14319 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14320 return false;
14322 /* Check if the bases are the same. */
14323 if (!rtx_equal_p (base_1, base_2)
14324 || !rtx_equal_p (base_2, base_3)
14325 || !rtx_equal_p (base_3, base_4))
14326 return false;
14328 offval_1 = INTVAL (offset_1);
14329 offval_2 = INTVAL (offset_2);
14330 offval_3 = INTVAL (offset_3);
14331 offval_4 = INTVAL (offset_4);
14332 msize = GET_MODE_SIZE (mode);
14333 /* Check if the offsets are consecutive. */
14334 if ((offval_1 != (offval_2 + msize)
14335 || offval_1 != (offval_3 + msize * 2)
14336 || offval_1 != (offval_4 + msize * 3))
14337 && (offval_4 != (offval_3 + msize)
14338 || offval_4 != (offval_2 + msize * 2)
14339 || offval_4 != (offval_1 + msize * 3)))
14340 return false;
14342 /* Check if the addresses are clobbered by load. */
14343 if (load)
14345 if (reg_mentioned_p (reg_1, mem_1)
14346 || reg_mentioned_p (reg_2, mem_2)
14347 || reg_mentioned_p (reg_3, mem_3))
14348 return false;
14350 /* In increasing order, the last load can clobber the address. */
14351 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14352 return false;
14355 /* If we have SImode and slow unaligned ldp,
14356 check that the alignment is at least 8 bytes. */
14357 if (mode == SImode
14358 && (aarch64_tune_params.extra_tuning_flags
14359 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14360 && !optimize_size
14361 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14362 return false;
14364 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14365 rclass_1 = FP_REGS;
14366 else
14367 rclass_1 = GENERAL_REGS;
14369 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14370 rclass_2 = FP_REGS;
14371 else
14372 rclass_2 = GENERAL_REGS;
14374 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14375 rclass_3 = FP_REGS;
14376 else
14377 rclass_3 = GENERAL_REGS;
14379 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14380 rclass_4 = FP_REGS;
14381 else
14382 rclass_4 = GENERAL_REGS;
14384 /* Check if the registers are of the same class. */
14385 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14386 return false;
14388 return true;
14391 /* Given OPERANDS of consecutive load/store, this function pairs them
14392 into ldp/stp after adjusting the offset. It depends on the fact
14393 that addresses of load/store instructions are in increasing order.
14394 MODE is the mode of memory operands. CODE is the rtl operator
14395 which should be applied to all memory operands; it is SIGN_EXTEND,
14396 ZERO_EXTEND or UNKNOWN. */
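/* A worked example of the offset adjustment below (illustrative only):
   for SImode, MSIZE is 4 and STP_OFF_LIMIT is 4 * 0x40 = 0x100.  A
   first offset of 0x100 gives ABS_OFF = 0x100, NEW_OFF = 0 and
   ADJ_OFF = 0x100, so the scratch register is set to BASE + 0x100 and
   the two pairs use offsets 0 and 8 from it, matching the str/stp
   example before aarch64_operands_adjust_ok_for_ldpstp.  */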
14398 bool
14399 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14400 enum machine_mode mode, RTX_CODE code)
14402 rtx base, offset, t1, t2;
14403 rtx mem_1, mem_2, mem_3, mem_4;
14404 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14406 if (load)
14408 mem_1 = operands[1];
14409 mem_2 = operands[3];
14410 mem_3 = operands[5];
14411 mem_4 = operands[7];
14413 else
14415 mem_1 = operands[0];
14416 mem_2 = operands[2];
14417 mem_3 = operands[4];
14418 mem_4 = operands[6];
14419 gcc_assert (code == UNKNOWN);
14422 extract_base_offset_in_addr (mem_1, &base, &offset);
14423 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14425 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14426 msize = GET_MODE_SIZE (mode);
14427 stp_off_limit = msize * 0x40;
14428 off_val = INTVAL (offset);
14429 abs_off = (off_val < 0) ? -off_val : off_val;
14430 new_off = abs_off % stp_off_limit;
14431 adj_off = abs_off - new_off;
14433 /* Further adjust to make sure all offsets are OK. */
14434 if ((new_off + msize * 2) >= stp_off_limit)
14436 adj_off += stp_off_limit;
14437 new_off -= stp_off_limit;
14440 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14441 if (adj_off >= 0x1000)
14442 return false;
14444 if (off_val < 0)
14446 adj_off = -adj_off;
14447 new_off = -new_off;
14450 /* Create new memory references. */
14451 mem_1 = change_address (mem_1, VOIDmode,
14452 plus_constant (DImode, operands[8], new_off));
14454 /* Check if the adjusted address is OK for ldp/stp. */
14455 if (!aarch64_mem_pair_operand (mem_1, mode))
14456 return false;
14458 msize = GET_MODE_SIZE (mode);
14459 mem_2 = change_address (mem_2, VOIDmode,
14460 plus_constant (DImode,
14461 operands[8],
14462 new_off + msize));
14463 mem_3 = change_address (mem_3, VOIDmode,
14464 plus_constant (DImode,
14465 operands[8],
14466 new_off + msize * 2));
14467 mem_4 = change_address (mem_4, VOIDmode,
14468 plus_constant (DImode,
14469 operands[8],
14470 new_off + msize * 3));
14472 if (code == ZERO_EXTEND)
14474 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14475 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14476 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14477 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14479 else if (code == SIGN_EXTEND)
14481 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14482 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14483 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14484 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14487 if (load)
14489 operands[1] = mem_1;
14490 operands[3] = mem_2;
14491 operands[5] = mem_3;
14492 operands[7] = mem_4;
14494 else
14496 operands[0] = mem_1;
14497 operands[2] = mem_2;
14498 operands[4] = mem_3;
14499 operands[6] = mem_4;
14502 /* Emit adjusting instruction. */
14503 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14504 /* Emit ldp/stp instructions. */
14505 t1 = gen_rtx_SET (operands[0], operands[1]);
14506 t2 = gen_rtx_SET (operands[2], operands[3]);
14507 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14508 t1 = gen_rtx_SET (operands[4], operands[5]);
14509 t2 = gen_rtx_SET (operands[6], operands[7]);
14510 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14511 return true;
14514 /* Return true if a pseudo register should be created and used to hold
14515 the GOT address for PIC code. */
14517 bool
14518 aarch64_use_pseudo_pic_reg (void)
14520 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14523 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14525 static int
14526 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14528 switch (XINT (x, 1))
14530 case UNSPEC_GOTSMALLPIC:
14531 case UNSPEC_GOTSMALLPIC28K:
14532 case UNSPEC_GOTTINYPIC:
14533 return 0;
14534 default:
14535 break;
14538 return default_unspec_may_trap_p (x, flags);
14542 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14543 return the log2 of that value. Otherwise return -1. */
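/* For example (illustration only): 8.0 gives 3 and 1.0 gives 0, while
   -2.0, 0.5 and 3.0 all give -1 (negative, non-integer and not a power
   of 2, respectively).  */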
14545 int
14546 aarch64_fpconst_pow_of_2 (rtx x)
14548 const REAL_VALUE_TYPE *r;
14550 if (!CONST_DOUBLE_P (x))
14551 return -1;
14553 r = CONST_DOUBLE_REAL_VALUE (x);
14555 if (REAL_VALUE_NEGATIVE (*r)
14556 || REAL_VALUE_ISNAN (*r)
14557 || REAL_VALUE_ISINF (*r)
14558 || !real_isinteger (r, DFmode))
14559 return -1;
14561 return exact_log2 (real_to_integer (r));
14564 /* If X is a vector of equal CONST_DOUBLE values and that value is
14565 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14567 int
14568 aarch64_vec_fpconst_pow_of_2 (rtx x)
14570 if (GET_CODE (x) != CONST_VECTOR)
14571 return -1;
14573 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14574 return -1;
14576 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14577 if (firstval <= 0)
14578 return -1;
14580 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14581 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14582 return -1;
14584 return firstval;
14587 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14588 to float.
14590 __fp16 always promotes through this hook.
14591 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14592 through the generic excess precision logic rather than here. */
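/* For example (illustration only): given __fp16 a, b; the sum a + b is
   computed in float and converted back to __fp16 only if the result is
   assigned to one, whereas _Float16 operands are left to the excess
   precision logic mentioned above.  */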
14594 static tree
14595 aarch64_promoted_type (const_tree t)
14597 if (SCALAR_FLOAT_TYPE_P (t)
14598 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14599 return float_type_node;
14601 return NULL_TREE;
14604 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14606 static bool
14607 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14608 optimization_type opt_type)
14610 switch (op)
14612 case rsqrt_optab:
14613 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14615 default:
14616 return true;
14620 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14621 if MODE is HFmode, and punt to the generic implementation otherwise. */
14623 static bool
14624 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14626 return (mode == HFmode
14627 ? true
14628 : default_libgcc_floating_mode_supported_p (mode));
14631 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14632 if MODE is HFmode, and punt to the generic implementation otherwise. */
14634 static bool
14635 aarch64_scalar_mode_supported_p (machine_mode mode)
14637 return (mode == HFmode
14638 ? true
14639 : default_scalar_mode_supported_p (mode));
14642 /* Set the value of FLT_EVAL_METHOD.
14643 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14645 0: evaluate all operations and constants, whose semantic type has at
14646 most the range and precision of type float, to the range and
14647 precision of float; evaluate all other operations and constants to
14648 the range and precision of the semantic type;
14650 N, where _FloatN is a supported interchange floating type
14651 evaluate all operations and constants, whose semantic type has at
14652 most the range and precision of _FloatN type, to the range and
14653 precision of the _FloatN type; evaluate all other operations and
14654 constants to the range and precision of the semantic type;
14656 If we have the ARMv8.2-A extensions then we support _Float16 in native
14657 precision, so we should set this to 16. Otherwise, we support the type,
14658 but want to evaluate expressions in float precision, so set this to
14659 0. */
14661 static enum flt_eval_method
14662 aarch64_excess_precision (enum excess_precision_type type)
14664 switch (type)
14666 case EXCESS_PRECISION_TYPE_FAST:
14667 case EXCESS_PRECISION_TYPE_STANDARD:
14668 /* We can calculate either in 16-bit range and precision or
14669 32-bit range and precision. Make that decision based on whether
14670 we have native support for the ARMv8.2-A 16-bit floating-point
14671 instructions or not. */
14672 return (TARGET_FP_F16INST
14673 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14674 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14675 case EXCESS_PRECISION_TYPE_IMPLICIT:
14676 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14677 default:
14678 gcc_unreachable ();
14680 return FLT_EVAL_METHOD_UNPREDICTABLE;
14683 /* Target-specific selftests. */
14685 #if CHECKING_P
14687 namespace selftest {
14689 /* Selftest for the RTL loader.
14690 Verify that the RTL loader copes with a dump from
14691 print_rtx_function. This is essentially just a test that class
14692 function_reader can handle a real dump, but it also verifies
14693 that lookup_reg_by_dump_name correctly handles hard regs.
14694 The presence of hard reg names in the dump means that the test is
14695 target-specific, hence it is in this file. */
14697 static void
14698 aarch64_test_loading_full_dump ()
14700 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14702 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14704 rtx_insn *insn_1 = get_insn_by_uid (1);
14705 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14707 rtx_insn *insn_15 = get_insn_by_uid (15);
14708 ASSERT_EQ (INSN, GET_CODE (insn_15));
14709 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14711 /* Verify crtl->return_rtx. */
14712 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14713 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14714 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14717 /* Run all target-specific selftests. */
14719 static void
14720 aarch64_run_selftests (void)
14722 aarch64_test_loading_full_dump ();
14725 } // namespace selftest
14727 #endif /* #if CHECKING_P */
14729 #undef TARGET_ADDRESS_COST
14730 #define TARGET_ADDRESS_COST aarch64_address_cost
14732 /* This hook determines whether unnamed bitfields affect the alignment
14733 of the containing structure. The hook returns true if the structure
14734 should inherit the alignment requirements of an unnamed bitfield's
14735 type. */
14736 #undef TARGET_ALIGN_ANON_BITFIELD
14737 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
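/* For example (illustrative only): with this hook returning true, a
   struct such as
     struct { char c; long long : 1; };
   inherits the 8-byte alignment of the unnamed bit-field's declared
   type.  */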
14739 #undef TARGET_ASM_ALIGNED_DI_OP
14740 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14742 #undef TARGET_ASM_ALIGNED_HI_OP
14743 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14745 #undef TARGET_ASM_ALIGNED_SI_OP
14746 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14748 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14749 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14750 hook_bool_const_tree_hwi_hwi_const_tree_true
14752 #undef TARGET_ASM_FILE_START
14753 #define TARGET_ASM_FILE_START aarch64_start_file
14755 #undef TARGET_ASM_OUTPUT_MI_THUNK
14756 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14758 #undef TARGET_ASM_SELECT_RTX_SECTION
14759 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14761 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14762 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14764 #undef TARGET_BUILD_BUILTIN_VA_LIST
14765 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14767 #undef TARGET_CALLEE_COPIES
14768 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14770 #undef TARGET_CAN_ELIMINATE
14771 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14773 #undef TARGET_CAN_INLINE_P
14774 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14776 #undef TARGET_CANNOT_FORCE_CONST_MEM
14777 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14779 #undef TARGET_CASE_VALUES_THRESHOLD
14780 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14782 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14783 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14785 /* Only the least significant bit is used for initialization guard
14786 variables. */
14787 #undef TARGET_CXX_GUARD_MASK_BIT
14788 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14790 #undef TARGET_C_MODE_FOR_SUFFIX
14791 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14793 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14794 #undef TARGET_DEFAULT_TARGET_FLAGS
14795 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14796 #endif
14798 #undef TARGET_CLASS_MAX_NREGS
14799 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14801 #undef TARGET_BUILTIN_DECL
14802 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14804 #undef TARGET_BUILTIN_RECIPROCAL
14805 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14807 #undef TARGET_C_EXCESS_PRECISION
14808 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14810 #undef TARGET_EXPAND_BUILTIN
14811 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14813 #undef TARGET_EXPAND_BUILTIN_VA_START
14814 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14816 #undef TARGET_FOLD_BUILTIN
14817 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14819 #undef TARGET_FUNCTION_ARG
14820 #define TARGET_FUNCTION_ARG aarch64_function_arg
14822 #undef TARGET_FUNCTION_ARG_ADVANCE
14823 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14825 #undef TARGET_FUNCTION_ARG_BOUNDARY
14826 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14828 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14829 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14831 #undef TARGET_FUNCTION_VALUE
14832 #define TARGET_FUNCTION_VALUE aarch64_function_value
14834 #undef TARGET_FUNCTION_VALUE_REGNO_P
14835 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14837 #undef TARGET_FRAME_POINTER_REQUIRED
14838 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14840 #undef TARGET_GIMPLE_FOLD_BUILTIN
14841 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14843 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14844 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14846 #undef TARGET_INIT_BUILTINS
14847 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14849 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14850 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14851 aarch64_ira_change_pseudo_allocno_class
14853 #undef TARGET_LEGITIMATE_ADDRESS_P
14854 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14856 #undef TARGET_LEGITIMATE_CONSTANT_P
14857 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14859 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14860 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14861 aarch64_legitimize_address_displacement
14863 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14864 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14866 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14867 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14868 aarch64_libgcc_floating_mode_supported_p
14870 #undef TARGET_MANGLE_TYPE
14871 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14873 #undef TARGET_MEMORY_MOVE_COST
14874 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14876 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14877 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14879 #undef TARGET_MUST_PASS_IN_STACK
14880 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14882 /* This target hook should return true if accesses to volatile bitfields
14883 should use the narrowest mode possible. It should return false if these
14884 accesses should use the bitfield container type. */
14885 #undef TARGET_NARROW_VOLATILE_BITFIELD
14886 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
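/* For example (illustrative only): with this hook returning false, a
   volatile int bit-field is accessed through a 32-bit load/store of its
   int container rather than through the narrowest mode that covers the
   field.  */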
14888 #undef TARGET_OPTION_OVERRIDE
14889 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14891 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14892 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14893 aarch64_override_options_after_change
14895 #undef TARGET_OPTION_SAVE
14896 #define TARGET_OPTION_SAVE aarch64_option_save
14898 #undef TARGET_OPTION_RESTORE
14899 #define TARGET_OPTION_RESTORE aarch64_option_restore
14901 #undef TARGET_OPTION_PRINT
14902 #define TARGET_OPTION_PRINT aarch64_option_print
14904 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14905 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14907 #undef TARGET_SET_CURRENT_FUNCTION
14908 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14910 #undef TARGET_PASS_BY_REFERENCE
14911 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14913 #undef TARGET_PREFERRED_RELOAD_CLASS
14914 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14916 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14917 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14919 #undef TARGET_PROMOTED_TYPE
14920 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14922 #undef TARGET_SECONDARY_RELOAD
14923 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14925 #undef TARGET_SHIFT_TRUNCATION_MASK
14926 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14928 #undef TARGET_SETUP_INCOMING_VARARGS
14929 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14931 #undef TARGET_STRUCT_VALUE_RTX
14932 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14934 #undef TARGET_REGISTER_MOVE_COST
14935 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14937 #undef TARGET_RETURN_IN_MEMORY
14938 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14940 #undef TARGET_RETURN_IN_MSB
14941 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14943 #undef TARGET_RTX_COSTS
14944 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14946 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14947 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14949 #undef TARGET_SCHED_ISSUE_RATE
14950 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14952 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14953 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14954 aarch64_sched_first_cycle_multipass_dfa_lookahead
14956 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14957 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14958 aarch64_first_cycle_multipass_dfa_lookahead_guard
14960 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14961 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14962 aarch64_get_separate_components
14964 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14965 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14966 aarch64_components_for_bb
14968 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14969 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14970 aarch64_disqualify_components
14972 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14973 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14974 aarch64_emit_prologue_components
14976 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14977 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14978 aarch64_emit_epilogue_components
14980 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14981 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14982 aarch64_set_handled_components
14984 #undef TARGET_TRAMPOLINE_INIT
14985 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14987 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14988 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14990 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14991 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14993 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
14994 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
14995 aarch64_builtin_support_vector_misalignment
14997 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14998 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15000 #undef TARGET_VECTORIZE_ADD_STMT_COST
15001 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15003 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15004 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15005 aarch64_builtin_vectorization_cost
15007 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15008 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15010 #undef TARGET_VECTORIZE_BUILTINS
15011 #define TARGET_VECTORIZE_BUILTINS
15013 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15014 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15015 aarch64_builtin_vectorized_function
15017 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15018 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15019 aarch64_autovectorize_vector_sizes
15021 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15022 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15023 aarch64_atomic_assign_expand_fenv
15025 /* Section anchor support. */
15027 #undef TARGET_MIN_ANCHOR_OFFSET
15028 #define TARGET_MIN_ANCHOR_OFFSET -256
15030 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15031 byte offset; we can do much more for larger data types, but have no way
15032 to determine the size of the access. We assume accesses are aligned. */
15033 #undef TARGET_MAX_ANCHOR_OFFSET
15034 #define TARGET_MAX_ANCHOR_OFFSET 4095
15036 #undef TARGET_VECTOR_ALIGNMENT
15037 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15039 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15040 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15041 aarch64_simd_vector_alignment_reachable
15043 /* vec_perm support. */
15045 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15046 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15047 aarch64_vectorize_vec_perm_const_ok
15049 #undef TARGET_INIT_LIBFUNCS
15050 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15052 #undef TARGET_FIXED_CONDITION_CODE_REGS
15053 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15055 #undef TARGET_FLAGS_REGNUM
15056 #define TARGET_FLAGS_REGNUM CC_REGNUM
15058 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15059 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15061 #undef TARGET_ASAN_SHADOW_OFFSET
15062 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15064 #undef TARGET_LEGITIMIZE_ADDRESS
15065 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15067 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15068 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15069 aarch64_use_by_pieces_infrastructure_p
15071 #undef TARGET_CAN_USE_DOLOOP_P
15072 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15074 #undef TARGET_SCHED_ADJUST_PRIORITY
15075 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15077 #undef TARGET_SCHED_MACRO_FUSION_P
15078 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15080 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15081 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15083 #undef TARGET_SCHED_FUSION_PRIORITY
15084 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15086 #undef TARGET_UNSPEC_MAY_TRAP_P
15087 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15089 #undef TARGET_USE_PSEUDO_PIC_REG
15090 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15092 #undef TARGET_PRINT_OPERAND
15093 #define TARGET_PRINT_OPERAND aarch64_print_operand
15095 #undef TARGET_PRINT_OPERAND_ADDRESS
15096 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15098 #undef TARGET_OPTAB_SUPPORTED_P
15099 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15101 #undef TARGET_OMIT_STRUCT_RETURN_REG
15102 #define TARGET_OMIT_STRUCT_RETURN_REG true
15104 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15105 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15106 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15108 #if CHECKING_P
15109 #undef TARGET_RUN_TARGET_SELFTESTS
15110 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15111 #endif /* #if CHECKING_P */
15113 struct gcc_target targetm = TARGET_INITIALIZER;
15115 #include "gt-aarch64.h"