[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
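/* For example (illustrative), this evaluates to 8 under LP64 and to 4
   under ILP32. */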
76 /* Classifies an address.
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 ADDRESS_SYMBOLIC
97 A constant symbolic address, in the pc-relative literal pool. */
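/* For illustration only, typical assembly forms for these classes are:
   ADDRESS_REG_IMM   ldr x0, [x1, 16]
   ADDRESS_REG_WB    ldr x0, [x1, 16]!  or  ldr x0, [x1], 16
   ADDRESS_REG_REG   ldr x0, [x1, x2, lsl 3]
   ADDRESS_REG_UXTW  ldr x0, [x1, w2, uxtw 3]
   ADDRESS_REG_SXTW  ldr x0, [x1, w2, sxtw 3]
   ADDRESS_LO_SUM    ldr x0, [x1, #:lo12:sym]
   ADDRESS_SYMBOLIC  ldr x0, .Lpool_entry  */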
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
117 struct simd_immediate_info
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
167 const char* name;
168 unsigned int flag;
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table =
196 0, /* hi */
197 0, /* si */
198 0, /* di */
199 0, /* ti */
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_regmove_cost generic_regmove_cost =
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (the actual costs are 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
368 1, /* scalar_stmt_cost */
369 1, /* scalar_load_cost */
370 1, /* scalar_store_cost */
371 1, /* vec_stmt_cost */
372 2, /* vec_permute_cost */
373 1, /* vec_to_scalar_cost */
374 1, /* scalar_to_vec_cost */
375 1, /* vec_align_load_cost */
376 1, /* vec_unalign_load_cost */
377 1, /* vec_unalign_store_cost */
378 1, /* vec_store_cost */
379 3, /* cond_taken_branch_cost */
380 1 /* cond_not_taken_branch_cost */
383 /* ThunderX costs for vector insn classes. */
384 static const struct cpu_vector_cost thunderx_vector_cost =
386 1, /* scalar_stmt_cost */
387 3, /* scalar_load_cost */
388 1, /* scalar_store_cost */
389 4, /* vec_stmt_cost */
390 4, /* vec_permute_cost */
391 2, /* vec_to_scalar_cost */
392 2, /* scalar_to_vec_cost */
393 3, /* vec_align_load_cost */
394 10, /* vec_unalign_load_cost */
395 10, /* vec_unalign_store_cost */
396 1, /* vec_store_cost */
397 3, /* cond_taken_branch_cost */
398 3 /* cond_not_taken_branch_cost */
401 /* Cortex-A57 costs for vector insn classes. */
402 static const struct cpu_vector_cost cortexa57_vector_cost =
404 1, /* scalar_stmt_cost */
405 4, /* scalar_load_cost */
406 1, /* scalar_store_cost */
407 2, /* vec_stmt_cost */
408 3, /* vec_permute_cost */
409 8, /* vec_to_scalar_cost */
410 8, /* scalar_to_vec_cost */
411 4, /* vec_align_load_cost */
412 4, /* vec_unalign_load_cost */
413 1, /* vec_unalign_store_cost */
414 1, /* vec_store_cost */
415 1, /* cond_taken_branch_cost */
416 1 /* cond_not_taken_branch_cost */
419 static const struct cpu_vector_cost exynosm1_vector_cost =
421 1, /* scalar_stmt_cost */
422 5, /* scalar_load_cost */
423 1, /* scalar_store_cost */
424 3, /* vec_stmt_cost */
425 3, /* vec_permute_cost */
426 3, /* vec_to_scalar_cost */
427 3, /* scalar_to_vec_cost */
428 5, /* vec_align_load_cost */
429 5, /* vec_unalign_load_cost */
430 1, /* vec_unalign_store_cost */
431 1, /* vec_store_cost */
432 1, /* cond_taken_branch_cost */
433 1 /* cond_not_taken_branch_cost */
436 /* X-Gene 1 costs for vector insn classes. */
437 static const struct cpu_vector_cost xgene1_vector_cost =
439 1, /* scalar_stmt_cost */
440 5, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 2, /* vec_stmt_cost */
443 2, /* vec_permute_cost */
444 4, /* vec_to_scalar_cost */
445 4, /* scalar_to_vec_cost */
446 10, /* vec_align_load_cost */
447 10, /* vec_unalign_load_cost */
448 2, /* vec_unalign_store_cost */
449 2, /* vec_store_cost */
450 2, /* cond_taken_branch_cost */
451 1 /* cond_not_taken_branch_cost */
454 /* Costs for vector insn classes for Vulcan. */
455 static const struct cpu_vector_cost thunderx2t99_vector_cost =
457 6, /* scalar_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 6, /* vec_stmt_cost */
461 3, /* vec_permute_cost */
462 6, /* vec_to_scalar_cost */
463 5, /* scalar_to_vec_cost */
464 8, /* vec_align_load_cost */
465 8, /* vec_unalign_load_cost */
466 4, /* vec_unalign_store_cost */
467 4, /* vec_store_cost */
468 2, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for branch instructions. */
473 static const struct cpu_branch_cost generic_branch_cost =
475 2, /* Predictable. */
476 2 /* Unpredictable. */
479 /* Branch costs for Cortex-A57. */
480 static const struct cpu_branch_cost cortexa57_branch_cost =
482 1, /* Predictable. */
483 3 /* Unpredictable. */
486 /* Branch costs for Vulcan. */
487 static const struct cpu_branch_cost thunderx2t99_branch_cost =
489 1, /* Predictable. */
490 3 /* Unpredictable. */
493 /* Generic approximation modes. */
494 static const cpu_approx_modes generic_approx_modes =
496 AARCH64_APPROX_NONE, /* division */
497 AARCH64_APPROX_NONE, /* sqrt */
498 AARCH64_APPROX_NONE /* recip_sqrt */
501 /* Approximation modes for Exynos M1. */
502 static const cpu_approx_modes exynosm1_approx_modes =
504 AARCH64_APPROX_NONE, /* division */
505 AARCH64_APPROX_ALL, /* sqrt */
506 AARCH64_APPROX_ALL /* recip_sqrt */
509 /* Approximation modes for X-Gene 1. */
510 static const cpu_approx_modes xgene1_approx_modes =
512 AARCH64_APPROX_NONE, /* division */
513 AARCH64_APPROX_NONE, /* sqrt */
514 AARCH64_APPROX_ALL /* recip_sqrt */
517 static const struct tune_params generic_tunings =
519 &cortexa57_extra_costs,
520 &generic_addrcost_table,
521 &generic_regmove_cost,
522 &generic_vector_cost,
523 &generic_branch_cost,
524 &generic_approx_modes,
525 4, /* memmov_cost */
526 2, /* issue_rate */
527 AARCH64_FUSE_NOTHING, /* fusible_ops */
528 8, /* function_align. */
529 8, /* jump_align. */
530 4, /* loop_align. */
531 2, /* int_reassoc_width. */
532 4, /* fp_reassoc_width. */
533 1, /* vec_reassoc_width. */
534 2, /* min_div_recip_mul_sf. */
535 2, /* min_div_recip_mul_df. */
536 0, /* max_case_values. */
537 0, /* cache_line_size. */
538 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
539 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
542 static const struct tune_params cortexa35_tunings =
544 &cortexa53_extra_costs,
545 &generic_addrcost_table,
546 &cortexa53_regmove_cost,
547 &generic_vector_cost,
548 &cortexa57_branch_cost,
549 &generic_approx_modes,
550 4, /* memmov_cost */
551 1, /* issue_rate */
552 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
553 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
554 16, /* function_align. */
555 8, /* jump_align. */
556 8, /* loop_align. */
557 2, /* int_reassoc_width. */
558 4, /* fp_reassoc_width. */
559 1, /* vec_reassoc_width. */
560 2, /* min_div_recip_mul_sf. */
561 2, /* min_div_recip_mul_df. */
562 0, /* max_case_values. */
563 0, /* cache_line_size. */
564 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
565 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
568 static const struct tune_params cortexa53_tunings =
570 &cortexa53_extra_costs,
571 &generic_addrcost_table,
572 &cortexa53_regmove_cost,
573 &generic_vector_cost,
574 &cortexa57_branch_cost,
575 &generic_approx_modes,
576 4, /* memmov_cost */
577 2, /* issue_rate */
578 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
579 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
580 16, /* function_align. */
581 8, /* jump_align. */
582 8, /* loop_align. */
583 2, /* int_reassoc_width. */
584 4, /* fp_reassoc_width. */
585 1, /* vec_reassoc_width. */
586 2, /* min_div_recip_mul_sf. */
587 2, /* min_div_recip_mul_df. */
588 0, /* max_case_values. */
589 0, /* cache_line_size. */
590 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
591 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
594 static const struct tune_params cortexa57_tunings =
596 &cortexa57_extra_costs,
597 &cortexa57_addrcost_table,
598 &cortexa57_regmove_cost,
599 &cortexa57_vector_cost,
600 &cortexa57_branch_cost,
601 &generic_approx_modes,
602 4, /* memmov_cost */
603 3, /* issue_rate */
604 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
605 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
606 16, /* function_align. */
607 8, /* jump_align. */
608 8, /* loop_align. */
609 2, /* int_reassoc_width. */
610 4, /* fp_reassoc_width. */
611 1, /* vec_reassoc_width. */
612 2, /* min_div_recip_mul_sf. */
613 2, /* min_div_recip_mul_df. */
614 0, /* max_case_values. */
615 0, /* cache_line_size. */
616 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
617 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
620 static const struct tune_params cortexa72_tunings =
622 &cortexa57_extra_costs,
623 &cortexa57_addrcost_table,
624 &cortexa57_regmove_cost,
625 &cortexa57_vector_cost,
626 &cortexa57_branch_cost,
627 &generic_approx_modes,
628 4, /* memmov_cost */
629 3, /* issue_rate */
630 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
631 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
632 16, /* function_align. */
633 8, /* jump_align. */
634 8, /* loop_align. */
635 2, /* int_reassoc_width. */
636 4, /* fp_reassoc_width. */
637 1, /* vec_reassoc_width. */
638 2, /* min_div_recip_mul_sf. */
639 2, /* min_div_recip_mul_df. */
640 0, /* max_case_values. */
641 0, /* cache_line_size. */
642 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
643 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
646 static const struct tune_params cortexa73_tunings =
648 &cortexa57_extra_costs,
649 &cortexa57_addrcost_table,
650 &cortexa57_regmove_cost,
651 &cortexa57_vector_cost,
652 &cortexa57_branch_cost,
653 &generic_approx_modes,
654 4, /* memmov_cost. */
655 2, /* issue_rate. */
656 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
657 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
658 16, /* function_align. */
659 8, /* jump_align. */
660 8, /* loop_align. */
661 2, /* int_reassoc_width. */
662 4, /* fp_reassoc_width. */
663 1, /* vec_reassoc_width. */
664 2, /* min_div_recip_mul_sf. */
665 2, /* min_div_recip_mul_df. */
666 0, /* max_case_values. */
667 0, /* cache_line_size. */
668 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
669 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
672 static const struct tune_params exynosm1_tunings =
674 &exynosm1_extra_costs,
675 &exynosm1_addrcost_table,
676 &exynosm1_regmove_cost,
677 &exynosm1_vector_cost,
678 &generic_branch_cost,
679 &exynosm1_approx_modes,
680 4, /* memmov_cost */
681 3, /* issue_rate */
682 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
683 4, /* function_align. */
684 4, /* jump_align. */
685 4, /* loop_align. */
686 2, /* int_reassoc_width. */
687 4, /* fp_reassoc_width. */
688 1, /* vec_reassoc_width. */
689 2, /* min_div_recip_mul_sf. */
690 2, /* min_div_recip_mul_df. */
691 48, /* max_case_values. */
692 64, /* cache_line_size. */
693 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
694 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
697 static const struct tune_params thunderx_tunings =
699 &thunderx_extra_costs,
700 &generic_addrcost_table,
701 &thunderx_regmove_cost,
702 &thunderx_vector_cost,
703 &generic_branch_cost,
704 &generic_approx_modes,
705 6, /* memmov_cost */
706 2, /* issue_rate */
707 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
708 8, /* function_align. */
709 8, /* jump_align. */
710 8, /* loop_align. */
711 2, /* int_reassoc_width. */
712 4, /* fp_reassoc_width. */
713 1, /* vec_reassoc_width. */
714 2, /* min_div_recip_mul_sf. */
715 2, /* min_div_recip_mul_df. */
716 0, /* max_case_values. */
717 0, /* cache_line_size. */
718 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
722 static const struct tune_params xgene1_tunings =
724 &xgene1_extra_costs,
725 &xgene1_addrcost_table,
726 &xgene1_regmove_cost,
727 &xgene1_vector_cost,
728 &generic_branch_cost,
729 &xgene1_approx_modes,
730 6, /* memmov_cost */
731 4, /* issue_rate */
732 AARCH64_FUSE_NOTHING, /* fusible_ops */
733 16, /* function_align. */
734 8, /* jump_align. */
735 16, /* loop_align. */
736 2, /* int_reassoc_width. */
737 4, /* fp_reassoc_width. */
738 1, /* vec_reassoc_width. */
739 2, /* min_div_recip_mul_sf. */
740 2, /* min_div_recip_mul_df. */
741 0, /* max_case_values. */
742 0, /* cache_line_size. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
747 static const struct tune_params qdf24xx_tunings =
749 &qdf24xx_extra_costs,
750 &qdf24xx_addrcost_table,
751 &qdf24xx_regmove_cost,
752 &generic_vector_cost,
753 &generic_branch_cost,
754 &generic_approx_modes,
755 4, /* memmov_cost */
756 4, /* issue_rate */
757 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
758 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
759 16, /* function_align. */
760 8, /* jump_align. */
761 16, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 64, /* cache_line_size. */
769 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
770 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
773 static const struct tune_params thunderx2t99_tunings =
775 &thunderx2t99_extra_costs,
776 &thunderx2t99_addrcost_table,
777 &thunderx2t99_regmove_cost,
778 &thunderx2t99_vector_cost,
779 &thunderx2t99_branch_cost,
780 &generic_approx_modes,
781 4, /* memmov_cost. */
782 4, /* issue_rate. */
783 AARCH64_FUSE_NOTHING, /* fusible_ops. */
784 16, /* function_align. */
785 8, /* jump_align. */
786 16, /* loop_align. */
787 3, /* int_reassoc_width. */
788 2, /* fp_reassoc_width. */
789 2, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 64, /* cache_line_size. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
798 /* Support for fine-grained override of the tuning structures. */
799 struct aarch64_tuning_override_function
801 const char* name;
802 void (*parse_override)(const char*, struct tune_params*);
805 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
806 static void aarch64_parse_tune_string (const char*, struct tune_params*);
808 static const struct aarch64_tuning_override_function
809 aarch64_tuning_override_functions[] =
811 { "fuse", aarch64_parse_fuse_string },
812 { "tune", aarch64_parse_tune_string },
813 { NULL, NULL }
816 /* A processor implementing AArch64. */
817 struct processor
819 const char *const name;
820 enum aarch64_processor ident;
821 enum aarch64_processor sched_core;
822 enum aarch64_arch arch;
823 unsigned architecture_version;
824 const unsigned long flags;
825 const struct tune_params *const tune;
828 /* Architectures implementing AArch64. */
829 static const struct processor all_architectures[] =
831 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
832 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
833 #include "aarch64-arches.def"
834 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
837 /* Processor cores implementing AArch64. */
838 static const struct processor all_cores[] =
840 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
841 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
842 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
843 FLAGS, &COSTS##_tunings},
844 #include "aarch64-cores.def"
845 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
846 AARCH64_FL_FOR_ARCH8, &generic_tunings},
847 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
851 /* Target specification. These are populated by the -march, -mtune, -mcpu
852 handling code or by target attributes. */
853 static const struct processor *selected_arch;
854 static const struct processor *selected_cpu;
855 static const struct processor *selected_tune;
857 /* The current tuning set. */
858 struct tune_params aarch64_tune_params = generic_tunings;
860 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
862 /* An ISA extension in the co-processor and main instruction set space. */
863 struct aarch64_option_extension
865 const char *const name;
866 const unsigned long flags_on;
867 const unsigned long flags_off;
870 typedef enum aarch64_cond_code
872 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
873 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
874 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
876 aarch64_cc;
878 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
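/* For example, AARCH64_EQ (0) maps to AARCH64_NE (1) and AARCH64_GE (10)
   maps to AARCH64_LT (11); flipping the low bit inverts the condition. */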
880 /* The condition codes of the processor, and the inverse function. */
881 static const char * const aarch64_condition_codes[] =
883 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
884 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
887 /* Generate code to enable conditional branches in functions over 1 MiB. */
888 const char *
889 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
890 const char * branch_format)
892 rtx_code_label * tmp_label = gen_label_rtx ();
893 char label_buf[256];
894 char buffer[128];
895 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
896 CODE_LABEL_NUMBER (tmp_label));
897 const char *label_ptr = targetm.strip_name_encoding (label_buf);
898 rtx dest_label = operands[pos_label];
899 operands[pos_label] = tmp_label;
901 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
902 output_asm_insn (buffer, operands);
904 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
905 operands[pos_label] = dest_label;
906 output_asm_insn (buffer, operands);
907 return "";
910 void
911 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
913 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
914 if (TARGET_GENERAL_REGS_ONLY)
915 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
916 else
917 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
920 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
921 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
922 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
923 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
924 cost (in this case the best class is the lowest cost one). Using ALL_REGS
925 irrespective of its cost results in bad allocations with many redundant
926 int<->FP moves which are expensive on various cores.
927 To avoid this we don't allow ALL_REGS as the allocno class, but force a
928 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
929 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
930 Otherwise set the allocno class depending on the mode.
931 The result of this is that it is no longer inefficient to have a higher
932 memory move cost than the register move cost.
935 static reg_class_t
936 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
937 reg_class_t best_class)
939 enum machine_mode mode;
941 if (allocno_class != ALL_REGS)
942 return allocno_class;
944 if (best_class != ALL_REGS)
945 return best_class;
947 mode = PSEUDO_REGNO_MODE (regno);
948 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
951 static unsigned int
952 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
954 if (GET_MODE_UNIT_SIZE (mode) == 4)
955 return aarch64_tune_params.min_div_recip_mul_sf;
956 return aarch64_tune_params.min_div_recip_mul_df;
959 static int
960 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
961 enum machine_mode mode)
963 if (VECTOR_MODE_P (mode))
964 return aarch64_tune_params.vec_reassoc_width;
965 if (INTEGRAL_MODE_P (mode))
966 return aarch64_tune_params.int_reassoc_width;
967 if (FLOAT_MODE_P (mode))
968 return aarch64_tune_params.fp_reassoc_width;
969 return 1;
972 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
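/* For instance (illustrative), x5 maps to DWARF register 5, the stack
   pointer to 31, and v3 to 67 (AARCH64_DWARF_V0 + 3), matching the
   AArch64 DWARF register numbering. */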
973 unsigned
974 aarch64_dbx_register_number (unsigned regno)
976 if (GP_REGNUM_P (regno))
977 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
978 else if (regno == SP_REGNUM)
979 return AARCH64_DWARF_SP;
980 else if (FP_REGNUM_P (regno))
981 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
983 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
984 equivalent DWARF register. */
985 return DWARF_FRAME_REGISTERS;
988 /* Return TRUE if MODE is any of the large INT modes. */
989 static bool
990 aarch64_vect_struct_mode_p (machine_mode mode)
992 return mode == OImode || mode == CImode || mode == XImode;
995 /* Return TRUE if MODE is any of the vector modes. */
996 static bool
997 aarch64_vector_mode_p (machine_mode mode)
999 return aarch64_vector_mode_supported_p (mode)
1000 || aarch64_vect_struct_mode_p (mode);
1003 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1004 static bool
1005 aarch64_array_mode_supported_p (machine_mode mode,
1006 unsigned HOST_WIDE_INT nelems)
1008 if (TARGET_SIMD
1009 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1010 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1011 && (nelems >= 2 && nelems <= 4))
1012 return true;
1014 return false;
1017 /* Implement HARD_REGNO_NREGS. */
1020 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1022 switch (aarch64_regno_regclass (regno))
1024 case FP_REGS:
1025 case FP_LO_REGS:
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1027 default:
1028 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1030 gcc_unreachable ();
1033 /* Implement HARD_REGNO_MODE_OK. */
1036 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1038 if (GET_MODE_CLASS (mode) == MODE_CC)
1039 return regno == CC_REGNUM;
1041 if (regno == SP_REGNUM)
1042 /* The purpose of comparing with ptr_mode is to support the
1043 global register variable associated with the stack pointer
1044 register via the syntax of asm ("wsp") in ILP32. */
1045 return mode == Pmode || mode == ptr_mode;
1047 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1048 return mode == Pmode;
1050 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1051 return 1;
1053 if (FP_REGNUM_P (regno))
1055 if (aarch64_vect_struct_mode_p (mode))
1056 return
1057 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1058 else
1059 return 1;
1062 return 0;
1065 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1066 machine_mode
1067 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1068 machine_mode mode)
1070 /* Handle modes that fit within single registers. */
1071 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1073 if (GET_MODE_SIZE (mode) >= 4)
1074 return mode;
1075 else
1076 return SImode;
1078 /* Fall back to generic for multi-reg and very large modes. */
1079 else
1080 return choose_hard_reg_mode (regno, nregs, false);
1083 /* Return true if calls to DECL should be treated as
1084 long-calls (i.e. called via a register). */
1085 static bool
1086 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1088 return false;
1091 /* Return true if calls to symbol-ref SYM should be treated as
1092 long-calls (i.e. called via a register). */
1093 bool
1094 aarch64_is_long_call_p (rtx sym)
1096 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1099 /* Return true if calls to symbol-ref SYM should not go through
1100 plt stubs. */
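/* For instance (illustrative), a call to a function marked with
   __attribute__ ((noplt)) under -fPIC is emitted as a GOT load of the
   function address followed by an indirect branch, bypassing the PLT. */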
1102 bool
1103 aarch64_is_noplt_call_p (rtx sym)
1105 const_tree decl = SYMBOL_REF_DECL (sym);
1107 if (flag_pic
1108 && decl
1109 && (!flag_plt
1110 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1111 && !targetm.binds_local_p (decl))
1112 return true;
1114 return false;
1117 /* Return true if the offsets to a zero/sign-extract operation
1118 represent an expression that matches an extend operation. The
1119 operands represent the parameters from
1121 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
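/* For example (illustrative), with MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34, the extract keeps the low 34 bits of (reg * 4),
   which is the same as (extend:DI (reg:SI)) scaled by 4, i.e. an
   extended-register operand with a shift of 2. */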
1122 bool
1123 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1124 rtx extract_imm)
1126 HOST_WIDE_INT mult_val, extract_val;
1128 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1129 return false;
1131 mult_val = INTVAL (mult_imm);
1132 extract_val = INTVAL (extract_imm);
1134 if (extract_val > 8
1135 && extract_val < GET_MODE_BITSIZE (mode)
1136 && exact_log2 (extract_val & ~7) > 0
1137 && (extract_val & 7) <= 4
1138 && mult_val == (1 << (extract_val & 7)))
1139 return true;
1141 return false;
1144 /* Emit an insn that's a simple single-set. Both the operands must be
1145 known to be valid. */
1146 inline static rtx_insn *
1147 emit_set_insn (rtx x, rtx y)
1149 return emit_insn (gen_rtx_SET (x, y));
1152 /* X and Y are two things to compare using CODE. Emit the compare insn and
1153 return the rtx for register 0 in the proper mode. */
1155 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1157 machine_mode mode = SELECT_CC_MODE (code, x, y);
1158 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1160 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1161 return cc_reg;
1164 /* Build the SYMBOL_REF for __tls_get_addr. */
1166 static GTY(()) rtx tls_get_addr_libfunc;
1169 aarch64_tls_get_addr (void)
1171 if (!tls_get_addr_libfunc)
1172 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1173 return tls_get_addr_libfunc;
1176 /* Return the TLS model to use for ADDR. */
1178 static enum tls_model
1179 tls_symbolic_operand_type (rtx addr)
1181 enum tls_model tls_kind = TLS_MODEL_NONE;
1182 rtx sym, addend;
1184 if (GET_CODE (addr) == CONST)
1186 split_const (addr, &sym, &addend);
1187 if (GET_CODE (sym) == SYMBOL_REF)
1188 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1190 else if (GET_CODE (addr) == SYMBOL_REF)
1191 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1193 return tls_kind;
1196 /* We'll allow lo_sums in our legitimate addresses
1197 so that combine can take care of combining addresses where
1198 necessary, but for generation purposes, we'll generate the address
1199 as:
1200 RTL Absolute
1201 tmp = hi (symbol_ref); adrp x1, foo
1202 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1205 PIC TLS
1206 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1207 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1208 bl __tls_get_addr
1211 Load TLS symbol, depending on TLS mechanism and TLS access model.
1213 Global Dynamic - Traditional TLS:
1214 adrp tmp, :tlsgd:imm
1215 add dest, tmp, #:tlsgd_lo12:imm
1216 bl __tls_get_addr
1218 Global Dynamic - TLS Descriptors:
1219 adrp dest, :tlsdesc:imm
1220 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1221 add dest, dest, #:tlsdesc_lo12:imm
1222 blr tmp
1223 mrs tp, tpidr_el0
1224 add dest, dest, tp
1226 Initial Exec:
1227 mrs tp, tpidr_el0
1228 adrp tmp, :gottprel:imm
1229 ldr dest, [tmp, #:gottprel_lo12:imm]
1230 add dest, dest, tp
1232 Local Exec:
1233 mrs tp, tpidr_el0
1234 add t0, tp, #:tprel_hi12:imm, lsl #12
1235 add t0, t0, #:tprel_lo12_nc:imm
1238 static void
1239 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1240 enum aarch64_symbol_type type)
1242 switch (type)
1244 case SYMBOL_SMALL_ABSOLUTE:
1246 /* In ILP32, the mode of dest can be either SImode or DImode. */
1247 rtx tmp_reg = dest;
1248 machine_mode mode = GET_MODE (dest);
1250 gcc_assert (mode == Pmode || mode == ptr_mode);
1252 if (can_create_pseudo_p ())
1253 tmp_reg = gen_reg_rtx (mode);
1255 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1256 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1257 return;
1260 case SYMBOL_TINY_ABSOLUTE:
1261 emit_insn (gen_rtx_SET (dest, imm));
1262 return;
1264 case SYMBOL_SMALL_GOT_28K:
1266 machine_mode mode = GET_MODE (dest);
1267 rtx gp_rtx = pic_offset_table_rtx;
1268 rtx insn;
1269 rtx mem;
1271 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1272 here before rtl expansion. Tree IVOPTs will generate rtl patterns to
1273 compute rtx costs, in which case pic_offset_table_rtx is not
1274 initialized. In that case there is no need to generate the first adrp
1275 instruction, as the final cost for global variable access is
1276 one instruction. */
1277 if (gp_rtx != NULL)
1279 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1280 use the page base as the GOT base, the first page may be wasted;
1281 in the worst case only 28K of space is left for the GOT).
1283 The instruction sequence generated for accessing a global variable is:
1286 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1288 Only one instruction is needed. But we must initialize
1289 pic_offset_table_rtx properly. We generate an initialization insn for
1290 every global access, and allow CSE to remove all redundant ones.
1292 The final instruction sequence will look like the following
1293 for multiple global variable accesses.
1295 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1298 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1299 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1300 ... */
1302 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1303 crtl->uses_pic_offset_table = 1;
1304 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1306 if (mode != GET_MODE (gp_rtx))
1307 gp_rtx = gen_lowpart (mode, gp_rtx);
1311 if (mode == ptr_mode)
1313 if (mode == DImode)
1314 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1315 else
1316 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1318 mem = XVECEXP (SET_SRC (insn), 0, 0);
1320 else
1322 gcc_assert (mode == Pmode);
1324 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1325 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1328 /* The operand is expected to be a MEM. Whenever the related insn
1329 pattern changes, the code above which calculates MEM should be
1330 updated. */
1331 gcc_assert (GET_CODE (mem) == MEM);
1332 MEM_READONLY_P (mem) = 1;
1333 MEM_NOTRAP_P (mem) = 1;
1334 emit_insn (insn);
1335 return;
1338 case SYMBOL_SMALL_GOT_4G:
1340 /* In ILP32, the mode of dest can be either SImode or DImode,
1341 while the got entry is always of SImode size. The mode of
1342 dest depends on how dest is used: if dest is assigned to a
1343 pointer (e.g. stored in memory), it has SImode; it may have
1344 DImode if dest is dereferenced to access the memory.
1345 This is why we have to handle three different ldr_got_small
1346 patterns here (two patterns for ILP32). */
1348 rtx insn;
1349 rtx mem;
1350 rtx tmp_reg = dest;
1351 machine_mode mode = GET_MODE (dest);
1353 if (can_create_pseudo_p ())
1354 tmp_reg = gen_reg_rtx (mode);
1356 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1357 if (mode == ptr_mode)
1359 if (mode == DImode)
1360 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1361 else
1362 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1364 mem = XVECEXP (SET_SRC (insn), 0, 0);
1366 else
1368 gcc_assert (mode == Pmode);
1370 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1371 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1374 gcc_assert (GET_CODE (mem) == MEM);
1375 MEM_READONLY_P (mem) = 1;
1376 MEM_NOTRAP_P (mem) = 1;
1377 emit_insn (insn);
1378 return;
1381 case SYMBOL_SMALL_TLSGD:
1383 rtx_insn *insns;
1384 machine_mode mode = GET_MODE (dest);
1385 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1387 start_sequence ();
1388 if (TARGET_ILP32)
1389 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1390 else
1391 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1392 insns = get_insns ();
1393 end_sequence ();
1395 RTL_CONST_CALL_P (insns) = 1;
1396 emit_libcall_block (insns, dest, result, imm);
1397 return;
1400 case SYMBOL_SMALL_TLSDESC:
1402 machine_mode mode = GET_MODE (dest);
1403 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1404 rtx tp;
1406 gcc_assert (mode == Pmode || mode == ptr_mode);
1408 /* In ILP32, the got entry is always of SImode size. Unlike
1409 small GOT, the dest is fixed at reg 0. */
1410 if (TARGET_ILP32)
1411 emit_insn (gen_tlsdesc_small_si (imm));
1412 else
1413 emit_insn (gen_tlsdesc_small_di (imm));
1414 tp = aarch64_load_tp (NULL);
1416 if (mode != Pmode)
1417 tp = gen_lowpart (mode, tp);
1419 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1420 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1421 return;
1424 case SYMBOL_SMALL_TLSIE:
1426 /* In ILP32, the mode of dest can be either SImode or DImode,
1427 while the got entry is always of SImode size. The mode of
1428 dest depends on how dest is used: if dest is assigned to a
1429 pointer (e.g. stored in memory), it has SImode; it may have
1430 DImode if dest is dereferenced to access the memory.
1431 This is why we have to handle three different tlsie_small
1432 patterns here (two patterns for ILP32). */
1433 machine_mode mode = GET_MODE (dest);
1434 rtx tmp_reg = gen_reg_rtx (mode);
1435 rtx tp = aarch64_load_tp (NULL);
1437 if (mode == ptr_mode)
1439 if (mode == DImode)
1440 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1441 else
1443 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1444 tp = gen_lowpart (mode, tp);
1447 else
1449 gcc_assert (mode == Pmode);
1450 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1453 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1454 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1455 return;
1458 case SYMBOL_TLSLE12:
1459 case SYMBOL_TLSLE24:
1460 case SYMBOL_TLSLE32:
1461 case SYMBOL_TLSLE48:
1463 machine_mode mode = GET_MODE (dest);
1464 rtx tp = aarch64_load_tp (NULL);
1466 if (mode != Pmode)
1467 tp = gen_lowpart (mode, tp);
1469 switch (type)
1471 case SYMBOL_TLSLE12:
1472 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1473 (dest, tp, imm));
1474 break;
1475 case SYMBOL_TLSLE24:
1476 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1477 (dest, tp, imm));
1478 break;
1479 case SYMBOL_TLSLE32:
1480 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1481 (dest, imm));
1482 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1483 (dest, dest, tp));
1484 break;
1485 case SYMBOL_TLSLE48:
1486 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1487 (dest, imm));
1488 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1489 (dest, dest, tp));
1490 break;
1491 default:
1492 gcc_unreachable ();
1495 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1496 return;
1499 case SYMBOL_TINY_GOT:
1500 emit_insn (gen_ldr_got_tiny (dest, imm));
1501 return;
1503 case SYMBOL_TINY_TLSIE:
1505 machine_mode mode = GET_MODE (dest);
1506 rtx tp = aarch64_load_tp (NULL);
1508 if (mode == ptr_mode)
1510 if (mode == DImode)
1511 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1512 else
1514 tp = gen_lowpart (mode, tp);
1515 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1518 else
1520 gcc_assert (mode == Pmode);
1521 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1524 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1525 return;
1528 default:
1529 gcc_unreachable ();
1533 /* Emit a move from SRC to DEST. Assume that the move expanders can
1534 handle all moves if !can_create_pseudo_p (). The distinction is
1535 important because, unlike emit_move_insn, the move expanders know
1536 how to force Pmode objects into the constant pool even when the
1537 constant pool address is not itself legitimate. */
1538 static rtx
1539 aarch64_emit_move (rtx dest, rtx src)
1541 return (can_create_pseudo_p ()
1542 ? emit_move_insn (dest, src)
1543 : emit_move_insn_1 (dest, src));
1546 /* Split a 128-bit move operation into two 64-bit move operations,
1547 taking care to handle partial overlap of register to register
1548 copies. Special cases are needed when moving between GP regs and
1549 FP regs. SRC can be a register, constant or memory; DST a register
1550 or memory. If either operand is memory it must not have any side
1551 effects. */
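/* For instance (illustrative), copying a TImode value from the pair
   {x0, x1} into {x1, x2} overlaps in x1, so the high halves must be
   moved before the low halves; the overlap check below handles this. */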
1552 void
1553 aarch64_split_128bit_move (rtx dst, rtx src)
1555 rtx dst_lo, dst_hi;
1556 rtx src_lo, src_hi;
1558 machine_mode mode = GET_MODE (dst);
1560 gcc_assert (mode == TImode || mode == TFmode);
1561 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1562 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1564 if (REG_P (dst) && REG_P (src))
1566 int src_regno = REGNO (src);
1567 int dst_regno = REGNO (dst);
1569 /* Handle FP <-> GP regs. */
1570 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1572 src_lo = gen_lowpart (word_mode, src);
1573 src_hi = gen_highpart (word_mode, src);
1575 if (mode == TImode)
1577 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1578 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1580 else
1582 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1583 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1585 return;
1587 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1589 dst_lo = gen_lowpart (word_mode, dst);
1590 dst_hi = gen_highpart (word_mode, dst);
1592 if (mode == TImode)
1594 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1595 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1597 else
1599 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1600 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1602 return;
1606 dst_lo = gen_lowpart (word_mode, dst);
1607 dst_hi = gen_highpart (word_mode, dst);
1608 src_lo = gen_lowpart (word_mode, src);
1609 src_hi = gen_highpart_mode (word_mode, mode, src);
1611 /* At most one pairing may overlap. */
1612 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1614 aarch64_emit_move (dst_hi, src_hi);
1615 aarch64_emit_move (dst_lo, src_lo);
1617 else
1619 aarch64_emit_move (dst_lo, src_lo);
1620 aarch64_emit_move (dst_hi, src_hi);
1624 bool
1625 aarch64_split_128bit_move_p (rtx dst, rtx src)
1627 return (! REG_P (src)
1628 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1631 /* Split a complex SIMD combine. */
1633 void
1634 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1636 machine_mode src_mode = GET_MODE (src1);
1637 machine_mode dst_mode = GET_MODE (dst);
1639 gcc_assert (VECTOR_MODE_P (dst_mode));
1641 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1643 rtx (*gen) (rtx, rtx, rtx);
1645 switch (src_mode)
1647 case V8QImode:
1648 gen = gen_aarch64_simd_combinev8qi;
1649 break;
1650 case V4HImode:
1651 gen = gen_aarch64_simd_combinev4hi;
1652 break;
1653 case V2SImode:
1654 gen = gen_aarch64_simd_combinev2si;
1655 break;
1656 case V4HFmode:
1657 gen = gen_aarch64_simd_combinev4hf;
1658 break;
1659 case V2SFmode:
1660 gen = gen_aarch64_simd_combinev2sf;
1661 break;
1662 case DImode:
1663 gen = gen_aarch64_simd_combinedi;
1664 break;
1665 case DFmode:
1666 gen = gen_aarch64_simd_combinedf;
1667 break;
1668 default:
1669 gcc_unreachable ();
1672 emit_insn (gen (dst, src1, src2));
1673 return;
1677 /* Split a complex SIMD move. */
1679 void
1680 aarch64_split_simd_move (rtx dst, rtx src)
1682 machine_mode src_mode = GET_MODE (src);
1683 machine_mode dst_mode = GET_MODE (dst);
1685 gcc_assert (VECTOR_MODE_P (dst_mode));
1687 if (REG_P (dst) && REG_P (src))
1689 rtx (*gen) (rtx, rtx);
1691 gcc_assert (VECTOR_MODE_P (src_mode));
1693 switch (src_mode)
1695 case V16QImode:
1696 gen = gen_aarch64_split_simd_movv16qi;
1697 break;
1698 case V8HImode:
1699 gen = gen_aarch64_split_simd_movv8hi;
1700 break;
1701 case V4SImode:
1702 gen = gen_aarch64_split_simd_movv4si;
1703 break;
1704 case V2DImode:
1705 gen = gen_aarch64_split_simd_movv2di;
1706 break;
1707 case V8HFmode:
1708 gen = gen_aarch64_split_simd_movv8hf;
1709 break;
1710 case V4SFmode:
1711 gen = gen_aarch64_split_simd_movv4sf;
1712 break;
1713 case V2DFmode:
1714 gen = gen_aarch64_split_simd_movv2df;
1715 break;
1716 default:
1717 gcc_unreachable ();
1720 emit_insn (gen (dst, src));
1721 return;
1725 bool
1726 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1727 machine_mode ymode, rtx y)
1729 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1730 gcc_assert (r != NULL);
1731 return rtx_equal_p (x, r);
1735 static rtx
1736 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1738 if (can_create_pseudo_p ())
1739 return force_reg (mode, value);
1740 else
1742 x = aarch64_emit_move (x, value);
1743 return x;
1748 static rtx
1749 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1751 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1753 rtx high;
1754 /* Load the full offset into a register. This
1755 might be improvable in the future. */
1756 high = GEN_INT (offset);
1757 offset = 0;
1758 high = aarch64_force_temporary (mode, temp, high);
1759 reg = aarch64_force_temporary (mode, temp,
1760 gen_rtx_PLUS (mode, high, reg));
1762 return plus_constant (mode, reg, offset);
1765 static int
1766 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1767 machine_mode mode)
1769 int i;
1770 unsigned HOST_WIDE_INT val, val2, mask;
1771 int one_match, zero_match;
1772 int num_insns;
1774 val = INTVAL (imm);
1776 if (aarch64_move_imm (val, mode))
1778 if (generate)
1779 emit_insn (gen_rtx_SET (dest, imm));
1780 return 1;
1783 if ((val >> 32) == 0 || mode == SImode)
1785 if (generate)
1787 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1788 if (mode == SImode)
1789 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1790 GEN_INT ((val >> 16) & 0xffff)));
1791 else
1792 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1793 GEN_INT ((val >> 16) & 0xffff)));
1795 return 2;
1798 /* Remaining cases are all for DImode. */
1800 mask = 0xffff;
1801 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1802 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1803 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1804 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1806 if (zero_match != 2 && one_match != 2)
1808 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1809 For a 64-bit bitmask try whether changing 16 bits to all ones or
1810 zeroes creates a valid bitmask. To check any repeated bitmask,
1811 try using 16 bits from the other 32-bit half of val. */
1813 for (i = 0; i < 64; i += 16, mask <<= 16)
1815 val2 = val & ~mask;
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1817 break;
1818 val2 = val | mask;
1819 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1820 break;
1821 val2 = val2 & ~mask;
1822 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1823 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1824 break;
1826 if (i != 64)
1828 if (generate)
1830 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1831 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1832 GEN_INT ((val >> i) & 0xffff)));
1834 return 2;
1838 /* Generate 2-4 instructions, skipping 16-bit chunks that are all zeroes or
1839 all ones, since those are covered by the initial mov. If one_match >
1840 zero_match, skip the all-ones chunks, otherwise skip the all-zero chunks. */
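  /* For example (illustrative), 0x0000cafe0000beef has two all-zero 16-bit
     chunks, so it is synthesized as
	mov	dest, 0xbeef
	movk	dest, 0xcafe, lsl 32
     skipping the zero chunks entirely. */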
1842 num_insns = 1;
1843 mask = 0xffff;
1844 val2 = one_match > zero_match ? ~val : val;
1845 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1847 if (generate)
1848 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1849 ? (val | ~(mask << i))
1850 : (val & (mask << i)))));
1851 for (i += 16; i < 64; i += 16)
1853 if ((val2 & (mask << i)) == 0)
1854 continue;
1855 if (generate)
1856 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1857 GEN_INT ((val >> i) & 0xffff)));
1858 num_insns ++;
1861 return num_insns;
1865 void
1866 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1868 machine_mode mode = GET_MODE (dest);
1870 gcc_assert (mode == SImode || mode == DImode);
1872 /* Check on what type of symbol it is. */
1873 if (GET_CODE (imm) == SYMBOL_REF
1874 || GET_CODE (imm) == LABEL_REF
1875 || GET_CODE (imm) == CONST)
1877 rtx mem, base, offset;
1878 enum aarch64_symbol_type sty;
1880 /* If we have (const (plus symbol offset)), separate out the offset
1881 before we start classifying the symbol. */
1882 split_const (imm, &base, &offset);
1884 sty = aarch64_classify_symbol (base, offset);
1885 switch (sty)
1887 case SYMBOL_FORCE_TO_MEM:
1888 if (offset != const0_rtx
1889 && targetm.cannot_force_const_mem (mode, imm))
1891 gcc_assert (can_create_pseudo_p ());
1892 base = aarch64_force_temporary (mode, dest, base);
1893 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1894 aarch64_emit_move (dest, base);
1895 return;
1898 mem = force_const_mem (ptr_mode, imm);
1899 gcc_assert (mem);
1901 /* If we aren't generating PC relative literals, then
1902 we need to expand the literal pool access carefully.
1903 This is something that needs to be done in a number
1904 of places, so could well live as a separate function. */
1905 if (!aarch64_pcrelative_literal_loads)
1907 gcc_assert (can_create_pseudo_p ());
1908 base = gen_reg_rtx (ptr_mode);
1909 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1910 mem = gen_rtx_MEM (ptr_mode, base);
1913 if (mode != ptr_mode)
1914 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1916 emit_insn (gen_rtx_SET (dest, mem));
1918 return;
1920 case SYMBOL_SMALL_TLSGD:
1921 case SYMBOL_SMALL_TLSDESC:
1922 case SYMBOL_SMALL_TLSIE:
1923 case SYMBOL_SMALL_GOT_28K:
1924 case SYMBOL_SMALL_GOT_4G:
1925 case SYMBOL_TINY_GOT:
1926 case SYMBOL_TINY_TLSIE:
1927 if (offset != const0_rtx)
1929 gcc_assert(can_create_pseudo_p ());
1930 base = aarch64_force_temporary (mode, dest, base);
1931 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1932 aarch64_emit_move (dest, base);
1933 return;
1935 /* FALLTHRU */
1937 case SYMBOL_SMALL_ABSOLUTE:
1938 case SYMBOL_TINY_ABSOLUTE:
1939 case SYMBOL_TLSLE12:
1940 case SYMBOL_TLSLE24:
1941 case SYMBOL_TLSLE32:
1942 case SYMBOL_TLSLE48:
1943 aarch64_load_symref_appropriately (dest, imm, sty);
1944 return;
1946 default:
1947 gcc_unreachable ();
1951 if (!CONST_INT_P (imm))
1953 if (GET_CODE (imm) == HIGH)
1954 emit_insn (gen_rtx_SET (dest, imm));
1955 else
1957 rtx mem = force_const_mem (mode, imm);
1958 gcc_assert (mem);
1959 emit_insn (gen_rtx_SET (dest, mem));
1962 return;
1965 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1968 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1969 temporary value if necessary. FRAME_RELATED_P should be true if
1970 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1971 to the generated instructions. If SCRATCHREG is known to hold
1972 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1973 immediate again.
1975 Since this function may be used to adjust the stack pointer, we must
1976 ensure that it cannot cause transient stack deallocation (for example
1977 by first incrementing SP and then decrementing when adjusting by a
1978 large immediate). */
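/* As an illustrative example, adjusting SP by -0x123456 is emitted as
	sub	sp, sp, #0x456
	sub	sp, sp, #0x123, lsl #12
   i.e. two decrements, so no intermediate step deallocates stack. */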
1980 static void
1981 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1982 HOST_WIDE_INT delta, bool frame_related_p,
1983 bool emit_move_imm)
1985 HOST_WIDE_INT mdelta = abs_hwi (delta);
1986 rtx this_rtx = gen_rtx_REG (mode, regnum);
1987 rtx_insn *insn;
1989 if (!mdelta)
1990 return;
1992 /* Single instruction adjustment. */
1993 if (aarch64_uimm12_shift (mdelta))
1995 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1996 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1997 return;
2000 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2001 Only do this if mdelta is not a 16-bit move immediate, as adjusting
2002 using a move is better in that case. */
2003 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2005 HOST_WIDE_INT low_off = mdelta & 0xfff;
2007 low_off = delta < 0 ? -low_off : low_off;
2008 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2009 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2010 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2011 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2012 return;
2015 /* Emit a move immediate if required and an addition/subtraction. */
2016 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2017 if (emit_move_imm)
2018 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2019 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2020 : gen_add2_insn (this_rtx, scratch_rtx));
2021 if (frame_related_p)
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 rtx adj = plus_constant (mode, this_rtx, delta);
2025 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2029 static inline void
2030 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2031 HOST_WIDE_INT delta)
2033 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2036 static inline void
2037 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2039 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2040 true, emit_move_imm);
2043 static inline void
2044 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2046 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2047 frame_related_p, true);
2050 static bool
2051 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2052 tree exp ATTRIBUTE_UNUSED)
2054 /* Currently, always true. */
2055 return true;
2058 /* Implement TARGET_PASS_BY_REFERENCE. */
2060 static bool
2061 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2062 machine_mode mode,
2063 const_tree type,
2064 bool named ATTRIBUTE_UNUSED)
2066 HOST_WIDE_INT size;
2067 machine_mode dummymode;
2068 int nregs;
2070 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2071 size = (mode == BLKmode && type)
2072 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2074 /* Aggregates are passed by reference based on their size. */
2075 if (type && AGGREGATE_TYPE_P (type))
2077 size = int_size_in_bytes (type);
2080   /* Variable sized arguments are always passed by reference.  */
2081 if (size < 0)
2082 return true;
2084 /* Can this be a candidate to be passed in fp/simd register(s)? */
2085 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2086 &dummymode, &nregs,
2087 NULL))
2088 return false;
2090   /* Arguments that are variable sized or larger than 2 registers are
2091      passed by reference unless they are a homogeneous floating-point
2092      aggregate.  */
2093 return size > 2 * UNITS_PER_WORD;
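/* Two illustrative (hypothetical) aggregate types showing how the rule
   above applies; they are not used elsewhere in this file:  */
struct aarch64_example_large_agg { long long x[4]; };  /* 32 bytes, not an HFA:
                                                           passed by reference.  */
struct aarch64_example_hfa { double d[4]; };            /* HFA of four doubles:
                                                           passed by value in
                                                           fp/simd registers.  */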
2096 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2097 static bool
2098 aarch64_return_in_msb (const_tree valtype)
2100 machine_mode dummy_mode;
2101 int dummy_int;
2103 /* Never happens in little-endian mode. */
2104 if (!BYTES_BIG_ENDIAN)
2105 return false;
2107   /* Only composite types no larger than 16 bytes can potentially
2108      be returned in registers.  */
2109 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2110 || int_size_in_bytes (valtype) <= 0
2111 || int_size_in_bytes (valtype) > 16)
2112 return false;
2114 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2115 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2116 is always passed/returned in the least significant bits of fp/simd
2117 register(s). */
2118 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2119 &dummy_mode, &dummy_int, NULL))
2120 return false;
2122 return true;
2125 /* Implement TARGET_FUNCTION_VALUE.
2126 Define how to find the value returned by a function. */
2128 static rtx
2129 aarch64_function_value (const_tree type, const_tree func,
2130 bool outgoing ATTRIBUTE_UNUSED)
2132 machine_mode mode;
2133 int unsignedp;
2134 int count;
2135 machine_mode ag_mode;
2137 mode = TYPE_MODE (type);
2138 if (INTEGRAL_TYPE_P (type))
2139 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2141 if (aarch64_return_in_msb (type))
2143 HOST_WIDE_INT size = int_size_in_bytes (type);
2145 if (size % UNITS_PER_WORD != 0)
2147 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2148 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2152 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2153 &ag_mode, &count, NULL))
2155 if (!aarch64_composite_type_p (type, mode))
2157 gcc_assert (count == 1 && mode == ag_mode);
2158 return gen_rtx_REG (mode, V0_REGNUM);
2160 else
2162 int i;
2163 rtx par;
2165 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2166 for (i = 0; i < count; i++)
2168 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2169 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2170 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2171 XVECEXP (par, 0, i) = tmp;
2173 return par;
2176 else
2177 return gen_rtx_REG (mode, R0_REGNUM);
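/* As an illustration of the PARALLEL case above (hypothetical type):
   returning "struct { float a, b; }" gives an HFA of two floats, so the
   value comes back in s0 and s1, expressed as a two-element PARALLEL
   whose EXPR_LIST offsets are 0 and 4.  */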
2180 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2181 Return true if REGNO is the number of a hard register in which the values
2182    of a called function may come back.  */
2184 static bool
2185 aarch64_function_value_regno_p (const unsigned int regno)
2187   /* A maximum of 16 bytes can be returned in the general registers.  Examples
2188      of 16-byte return values are 128-bit integers and 16-byte small
2189      structures (excluding homogeneous floating-point aggregates).  */
2190 if (regno == R0_REGNUM || regno == R1_REGNUM)
2191 return true;
2193 /* Up to four fp/simd registers can return a function value, e.g. a
2194 homogeneous floating-point aggregate having four members. */
2195 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2196 return TARGET_FLOAT;
2198 return false;
2201 /* Implement TARGET_RETURN_IN_MEMORY.
2203 If the type T of the result of a function is such that
2204 void func (T arg)
2205 would require that arg be passed as a value in a register (or set of
2206 registers) according to the parameter passing rules, then the result
2207 is returned in the same registers as would be used for such an
2208 argument. */
2210 static bool
2211 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2213 HOST_WIDE_INT size;
2214 machine_mode ag_mode;
2215 int count;
2217 if (!AGGREGATE_TYPE_P (type)
2218 && TREE_CODE (type) != COMPLEX_TYPE
2219 && TREE_CODE (type) != VECTOR_TYPE)
2220     /* Simple scalar types are always returned in registers.  */
2221 return false;
2223 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2224 type,
2225 &ag_mode,
2226 &count,
2227 NULL))
2228 return false;
2230   /* Types larger than 2 registers are returned in memory.  */
2231 size = int_size_in_bytes (type);
2232 return (size < 0 || size > 2 * UNITS_PER_WORD);
2235 static bool
2236 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2237 const_tree type, int *nregs)
2239 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2240 return aarch64_vfp_is_call_or_return_candidate (mode,
2241 type,
2242 &pcum->aapcs_vfp_rmode,
2243 nregs,
2244 NULL);
2247 /* Given MODE and TYPE of a function argument, return the alignment in
2248 bits. The idea is to suppress any stronger alignment requested by
2249 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2250 This is a helper function for local use only. */
2252 static unsigned int
2253 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2255 if (!type)
2256 return GET_MODE_ALIGNMENT (mode);
2257 if (integer_zerop (TYPE_SIZE (type)))
2258 return 0;
2260 gcc_assert (TYPE_MODE (type) == mode);
2262 if (!AGGREGATE_TYPE_P (type))
2263 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2265 if (TREE_CODE (type) == ARRAY_TYPE)
2266 return TYPE_ALIGN (TREE_TYPE (type));
2268 unsigned int alignment = 0;
2270 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2271 alignment = std::max (alignment, DECL_ALIGN (field));
2273 return alignment;
2276 /* Layout a function argument according to the AAPCS64 rules. The rule
2277 numbers refer to the rule numbers in the AAPCS64. */
2279 static void
2280 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2281 const_tree type,
2282 bool named ATTRIBUTE_UNUSED)
2284 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2285 int ncrn, nvrn, nregs;
2286 bool allocate_ncrn, allocate_nvrn;
2287 HOST_WIDE_INT size;
2289 /* We need to do this once per argument. */
2290 if (pcum->aapcs_arg_processed)
2291 return;
2293 pcum->aapcs_arg_processed = true;
2295 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2296 size
2297 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2298 UNITS_PER_WORD);
2300 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2301 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2302 mode,
2303 type,
2304 &nregs);
2306   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2307 The following code thus handles passing by SIMD/FP registers first. */
2309 nvrn = pcum->aapcs_nvrn;
2311   /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2312      and homogeneous short-vector aggregates (HVA).  */
2313 if (allocate_nvrn)
2315 if (!TARGET_FLOAT)
2316 aarch64_err_no_fpadvsimd (mode, "argument");
2318 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2320 pcum->aapcs_nextnvrn = nvrn + nregs;
2321 if (!aarch64_composite_type_p (type, mode))
2323 gcc_assert (nregs == 1);
2324 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2326 else
2328 rtx par;
2329 int i;
2330 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2331 for (i = 0; i < nregs; i++)
2333 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2334 V0_REGNUM + nvrn + i);
2335 tmp = gen_rtx_EXPR_LIST
2336 (VOIDmode, tmp,
2337 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2338 XVECEXP (par, 0, i) = tmp;
2340 pcum->aapcs_reg = par;
2342 return;
2344 else
2346 /* C.3 NSRN is set to 8. */
2347 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2348 goto on_stack;
2352 ncrn = pcum->aapcs_ncrn;
2353 nregs = size / UNITS_PER_WORD;
2355   /* C.6 - C.9, though the sign and zero extension semantics are
2356      handled elsewhere.  This is the case where the argument fits
2357      entirely in general registers.  */
2358 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2360 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2362 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2364       /* C.8: if the argument has an alignment of 16 then the NGRN is
2365 rounded up to the next even number. */
2366 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2368 ++ncrn;
2369 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2371 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2372 A reg is still generated for it, but the caller should be smart
2373 enough not to use it. */
2374 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2376 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2378 else
2380 rtx par;
2381 int i;
2383 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2384 for (i = 0; i < nregs; i++)
2386 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2387 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2388 GEN_INT (i * UNITS_PER_WORD));
2389 XVECEXP (par, 0, i) = tmp;
2391 pcum->aapcs_reg = par;
2394 pcum->aapcs_nextncrn = ncrn + nregs;
2395 return;
2398 /* C.11 */
2399 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2401 /* The argument is passed on stack; record the needed number of words for
2402 this argument and align the total size if necessary. */
2403 on_stack:
2404 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2405 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2406 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2407 16 / UNITS_PER_WORD);
2408 return;
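/* A concrete illustration of rule C.8 above (hypothetical prototype,
   shown only in this comment):

     void f (int a, __int128 b);

   A is allocated w0, but B has 16-byte alignment, so the NGRN is
   rounded up from 1 to 2 and B is passed in the pair x2/x3, leaving
   x1 unused.  */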
2411 /* Implement TARGET_FUNCTION_ARG. */
2413 static rtx
2414 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2415 const_tree type, bool named)
2417 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2418 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2420 if (mode == VOIDmode)
2421 return NULL_RTX;
2423 aarch64_layout_arg (pcum_v, mode, type, named);
2424 return pcum->aapcs_reg;
2427 void
2428 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2429 const_tree fntype ATTRIBUTE_UNUSED,
2430 rtx libname ATTRIBUTE_UNUSED,
2431 const_tree fndecl ATTRIBUTE_UNUSED,
2432 unsigned n_named ATTRIBUTE_UNUSED)
2434 pcum->aapcs_ncrn = 0;
2435 pcum->aapcs_nvrn = 0;
2436 pcum->aapcs_nextncrn = 0;
2437 pcum->aapcs_nextnvrn = 0;
2438 pcum->pcs_variant = ARM_PCS_AAPCS64;
2439 pcum->aapcs_reg = NULL_RTX;
2440 pcum->aapcs_arg_processed = false;
2441 pcum->aapcs_stack_words = 0;
2442 pcum->aapcs_stack_size = 0;
2444 if (!TARGET_FLOAT
2445 && fndecl && TREE_PUBLIC (fndecl)
2446 && fntype && fntype != error_mark_node)
2448 const_tree type = TREE_TYPE (fntype);
2449 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2450 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2451 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2452 &mode, &nregs, NULL))
2453 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2455 return;
2458 static void
2459 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2460 machine_mode mode,
2461 const_tree type,
2462 bool named)
2464 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2465 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2467 aarch64_layout_arg (pcum_v, mode, type, named);
2468 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2469 != (pcum->aapcs_stack_words != 0));
2470 pcum->aapcs_arg_processed = false;
2471 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2472 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2473 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2474 pcum->aapcs_stack_words = 0;
2475 pcum->aapcs_reg = NULL_RTX;
2479 bool
2480 aarch64_function_arg_regno_p (unsigned regno)
2482 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2483 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2486 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2487 PARM_BOUNDARY bits of alignment, but will be given anything up
2488 to STACK_BOUNDARY bits if the type requires it. This makes sure
2489 that both before and after the layout of each argument, the Next
2490 Stacked Argument Address (NSAA) will have a minimum alignment of
2491 8 bytes. */
2493 static unsigned int
2494 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2496 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2498 if (alignment < PARM_BOUNDARY)
2499 alignment = PARM_BOUNDARY;
2500 if (alignment > STACK_BOUNDARY)
2501 alignment = STACK_BOUNDARY;
2502 return alignment;
2505 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2507 Return true if an argument passed on the stack should be padded upwards,
2508 i.e. if the least-significant byte of the stack slot has useful data.
2510 Small aggregate types are placed in the lowest memory address.
2512 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2514 bool
2515 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2517 /* On little-endian targets, the least significant byte of every stack
2518 argument is passed at the lowest byte address of the stack slot. */
2519 if (!BYTES_BIG_ENDIAN)
2520 return true;
2522 /* Otherwise, integral, floating-point and pointer types are padded downward:
2523 the least significant byte of a stack argument is passed at the highest
2524 byte address of the stack slot. */
2525 if (type
2526 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2527 || POINTER_TYPE_P (type))
2528 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2529 return false;
2531 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2532 return true;
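/* For example (illustrative only): on a big-endian target a lone
   "char" argument that ends up on the stack is an integral type and is
   therefore padded downward, so the byte itself sits at the highest
   address of its 8-byte slot; a 3-byte structure in the same position
   is padded upward and starts at the lowest address of the slot.  */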
2535 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2537    It specifies padding for the last (possibly the only) element of
2538    a block move between registers and memory.  Assuming the block
2539    is in memory, padding upward means that the last element is
2540    padded after its most significant byte, while with downward
2541    padding the last element is padded at its least significant
2542    byte side.
2544 Small aggregates and small complex types are always padded
2545 upwards.
2547 We don't need to worry about homogeneous floating-point or
2548 short-vector aggregates; their move is not affected by the
2549 padding direction determined here. Regardless of endianness,
2550 each element of such an aggregate is put in the least
2551 significant bits of a fp/simd register.
2553 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2554 register has useful data, and return the opposite if the most
2555 significant byte does. */
2557 bool
2558 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2559 bool first ATTRIBUTE_UNUSED)
2562 /* Small composite types are always padded upward. */
2563 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2565 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2566 : GET_MODE_SIZE (mode));
2567 if (size < 2 * UNITS_PER_WORD)
2568 return true;
2571 /* Otherwise, use the default padding. */
2572 return !BYTES_BIG_ENDIAN;
2575 static machine_mode
2576 aarch64_libgcc_cmp_return_mode (void)
2578 return SImode;
2581 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2583 /* We use the 12-bit shifted immediate arithmetic instructions so values
2584    must be a multiple of (1 << 12), i.e. 4096.  */
2585 #define ARITH_FACTOR 4096
2587 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2588 #error Cannot use simple address calculation for stack probing
2589 #endif
2591 /* The pair of scratch registers used for stack probing. */
2592 #define PROBE_STACK_FIRST_REG 9
2593 #define PROBE_STACK_SECOND_REG 10
2595 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2596 inclusive. These are offsets from the current stack pointer. */
2598 static void
2599 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2601 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2603 /* See the same assertion on PROBE_INTERVAL above. */
2604 gcc_assert ((first % ARITH_FACTOR) == 0);
2606 /* See if we have a constant small number of probes to generate. If so,
2607 that's the easy case. */
2608 if (size <= PROBE_INTERVAL)
2610 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2612 emit_set_insn (reg1,
2613 plus_constant (ptr_mode,
2614 stack_pointer_rtx, -(first + base)));
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2618   /* The run-time loop is made up of 8 insns in the generic case while the
2619      compile-time (unrolled) sequence is made up of 4+2*(n-2) insns for n intervals.  */
2620 else if (size <= 4 * PROBE_INTERVAL)
2622 HOST_WIDE_INT i, rem;
2624 emit_set_insn (reg1,
2625 plus_constant (ptr_mode,
2626 stack_pointer_rtx,
2627 -(first + PROBE_INTERVAL)));
2628 emit_stack_probe (reg1);
2630 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2631 it exceeds SIZE. If only two probes are needed, this will not
2632 generate any code. Then probe at FIRST + SIZE. */
2633 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2637 emit_stack_probe (reg1);
2640 rem = size - (i - PROBE_INTERVAL);
2641 if (rem > 256)
2643 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2645 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2646 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2648 else
2649 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2652 /* Otherwise, do the same as above, but in a loop. Note that we must be
2653 extra careful with variables wrapping around because we might be at
2654 the very top (or the very bottom) of the address space and we have
2655 to be able to handle this case properly; in particular, we use an
2656 equality test for the loop condition. */
2657 else
2659 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2661 /* Step 1: round SIZE to the previous multiple of the interval. */
2663 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2666 /* Step 2: compute initial and final value of the loop counter. */
2668 /* TEST_ADDR = SP + FIRST. */
2669 emit_set_insn (reg1,
2670 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2672 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2673 emit_set_insn (reg2,
2674 plus_constant (ptr_mode, stack_pointer_rtx,
2675 -(first + rounded_size)));
2678 /* Step 3: the loop
2682 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2683 probe at TEST_ADDR
2685 while (TEST_ADDR != LAST_ADDR)
2687 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2688 until it is equal to ROUNDED_SIZE. */
2690 if (ptr_mode == DImode)
2691 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2692 else
2693 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2696 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2697 that SIZE is equal to ROUNDED_SIZE. */
2699 if (size != rounded_size)
2701 HOST_WIDE_INT rem = size - rounded_size;
2703 if (rem > 256)
2705 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2707 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2708 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2710 else
2711 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2715 /* Make sure nothing is scheduled before we are done. */
2716 emit_insn (gen_blockage ());
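/* A rough sketch of the first case above, assuming the default 4 KiB
   PROBE_INTERVAL and the hypothetical values FIRST = 4096 and
   SIZE = 2048: BASE is rounded up to 4096, so the emitted code is
   approximately

     sub  x9, sp, #8192
     str  xzr, [x9, #2048]

   i.e. a single probe at SP - (FIRST + SIZE).  */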
2719 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2720 absolute addresses. */
2722 const char *
2723 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2725 static int labelno = 0;
2726 char loop_lab[32];
2727 rtx xops[2];
2729 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2731 /* Loop. */
2732 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2734 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2735 xops[0] = reg1;
2736 xops[1] = GEN_INT (PROBE_INTERVAL);
2737 output_asm_insn ("sub\t%0, %0, %1", xops);
2739 /* Probe at TEST_ADDR. */
2740 output_asm_insn ("str\txzr, [%0]", xops);
2742 /* Test if TEST_ADDR == LAST_ADDR. */
2743 xops[1] = reg2;
2744 output_asm_insn ("cmp\t%0, %1", xops);
2746 /* Branch. */
2747 fputs ("\tb.ne\t", asm_out_file);
2748 assemble_name_raw (asm_out_file, loop_lab);
2749 fputc ('\n', asm_out_file);
2751 return "";
2754 static bool
2755 aarch64_frame_pointer_required (void)
2757 /* In aarch64_override_options_after_change
2758 flag_omit_leaf_frame_pointer turns off the frame pointer by
2759 default. Turn it back on now if we've not got a leaf
2760 function. */
2761 if (flag_omit_leaf_frame_pointer
2762 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2763 return true;
2765 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2766 if (crtl->calls_eh_return)
2767 return true;
2769 return false;
2772 /* Mark the registers that need to be saved by the callee and calculate
2773 the size of the callee-saved registers area and frame record (both FP
2774 and LR may be omitted). */
2775 static void
2776 aarch64_layout_frame (void)
2778 HOST_WIDE_INT offset = 0;
2779 int regno, last_fp_reg = INVALID_REGNUM;
2781 if (reload_completed && cfun->machine->frame.laid_out)
2782 return;
2784 #define SLOT_NOT_REQUIRED (-2)
2785 #define SLOT_REQUIRED (-1)
2787 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2788 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2790 /* First mark all the registers that really need to be saved... */
2791 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2792 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2794 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2795 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2797 /* ... that includes the eh data registers (if needed)... */
2798 if (crtl->calls_eh_return)
2799 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2800 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2801 = SLOT_REQUIRED;
2803 /* ... and any callee saved register that dataflow says is live. */
2804 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2805 if (df_regs_ever_live_p (regno)
2806 && (regno == R30_REGNUM
2807 || !call_used_regs[regno]))
2808 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2810 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2811 if (df_regs_ever_live_p (regno)
2812 && !call_used_regs[regno])
2814 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2815 last_fp_reg = regno;
2818 if (frame_pointer_needed)
2820 /* FP and LR are placed in the linkage record. */
2821 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2822 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2823 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2824 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2825 offset += 2 * UNITS_PER_WORD;
2828 /* Now assign stack slots for them. */
2829 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2830 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2832 cfun->machine->frame.reg_offset[regno] = offset;
2833 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2834 cfun->machine->frame.wb_candidate1 = regno;
2835 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2836 cfun->machine->frame.wb_candidate2 = regno;
2837 offset += UNITS_PER_WORD;
2840 HOST_WIDE_INT max_int_offset = offset;
2841 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2842 bool has_align_gap = offset != max_int_offset;
2844 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2845 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2847 /* If there is an alignment gap between integer and fp callee-saves,
2848 allocate the last fp register to it if possible. */
2849 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2851 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2852 break;
2855 cfun->machine->frame.reg_offset[regno] = offset;
2856 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2857 cfun->machine->frame.wb_candidate1 = regno;
2858 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2859 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2860 cfun->machine->frame.wb_candidate2 = regno;
2861 offset += UNITS_PER_WORD;
2864 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2866 cfun->machine->frame.saved_regs_size = offset;
2868 HOST_WIDE_INT varargs_and_saved_regs_size
2869 = offset + cfun->machine->frame.saved_varargs_size;
2871 cfun->machine->frame.hard_fp_offset
2872 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2873 STACK_BOUNDARY / BITS_PER_UNIT);
2875 cfun->machine->frame.frame_size
2876 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2877 + crtl->outgoing_args_size,
2878 STACK_BOUNDARY / BITS_PER_UNIT);
2880 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2882 cfun->machine->frame.initial_adjust = 0;
2883 cfun->machine->frame.final_adjust = 0;
2884 cfun->machine->frame.callee_adjust = 0;
2885 cfun->machine->frame.callee_offset = 0;
2887 HOST_WIDE_INT max_push_offset = 0;
2888 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2889 max_push_offset = 512;
2890 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2891 max_push_offset = 256;
2893 if (cfun->machine->frame.frame_size < max_push_offset
2894 && crtl->outgoing_args_size == 0)
2896 /* Simple, small frame with no outgoing arguments:
2897 stp reg1, reg2, [sp, -frame_size]!
2898 stp reg3, reg4, [sp, 16] */
2899 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2901 else if ((crtl->outgoing_args_size
2902 + cfun->machine->frame.saved_regs_size < 512)
2903 && !(cfun->calls_alloca
2904 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2906 /* Frame with small outgoing arguments:
2907 sub sp, sp, frame_size
2908 stp reg1, reg2, [sp, outgoing_args_size]
2909 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2910 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2911 cfun->machine->frame.callee_offset
2912 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2914 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2916 /* Frame with large outgoing arguments but a small local area:
2917 stp reg1, reg2, [sp, -hard_fp_offset]!
2918 stp reg3, reg4, [sp, 16]
2919 sub sp, sp, outgoing_args_size */
2920 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2921 cfun->machine->frame.final_adjust
2922 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2924 else if (!frame_pointer_needed
2925 && varargs_and_saved_regs_size < max_push_offset)
2927 /* Frame with large local area and outgoing arguments (this pushes the
2928 callee-saves first, followed by the locals and outgoing area):
2929 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2930 stp reg3, reg4, [sp, 16]
2931 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2932 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2933 cfun->machine->frame.final_adjust
2934 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2935 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2936 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2938 else
2940 /* Frame with large local area and outgoing arguments using frame pointer:
2941 sub sp, sp, hard_fp_offset
2942 stp x29, x30, [sp, 0]
2943 add x29, sp, 0
2944 stp reg3, reg4, [sp, 16]
2945 sub sp, sp, outgoing_args_size */
2946 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2947 cfun->machine->frame.final_adjust
2948 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2951 cfun->machine->frame.laid_out = true;
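/* A worked example of the selection above (illustrative numbers): with
   the frame pointer in use, 16 bytes of locals, no other callee-saves
   and no outgoing arguments, frame_size is 32, which is below
   max_push_offset, so the first case applies and the whole frame is
   allocated by the write-back push

     stp  x29, x30, [sp, #-32]!  */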
2954 /* Return true if the register REGNO is saved on entry to
2955 the current function. */
2957 static bool
2958 aarch64_register_saved_on_entry (int regno)
2960 return cfun->machine->frame.reg_offset[regno] >= 0;
2963 /* Return the next register number at or above REGNO, up to LIMIT, that
2964    the callee needs to save.  */
2966 static unsigned
2967 aarch64_next_callee_save (unsigned regno, unsigned limit)
2969 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2970 regno ++;
2971 return regno;
2974 /* Push the register number REGNO of mode MODE to the stack with write-back
2975 adjusting the stack by ADJUSTMENT. */
2977 static void
2978 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2979 HOST_WIDE_INT adjustment)
2981 rtx base_rtx = stack_pointer_rtx;
2982 rtx insn, reg, mem;
2984 reg = gen_rtx_REG (mode, regno);
2985 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2986 plus_constant (Pmode, base_rtx, -adjustment));
2987 mem = gen_rtx_MEM (mode, mem);
2989 insn = emit_move_insn (mem, reg);
2990 RTX_FRAME_RELATED_P (insn) = 1;
2993 /* Generate and return an instruction to store the pair of registers
2994 REG and REG2 of mode MODE to location BASE with write-back adjusting
2995 the stack location BASE by ADJUSTMENT. */
2997 static rtx
2998 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2999 HOST_WIDE_INT adjustment)
3001 switch (mode)
3003 case DImode:
3004 return gen_storewb_pairdi_di (base, base, reg, reg2,
3005 GEN_INT (-adjustment),
3006 GEN_INT (UNITS_PER_WORD - adjustment));
3007 case DFmode:
3008 return gen_storewb_pairdf_di (base, base, reg, reg2,
3009 GEN_INT (-adjustment),
3010 GEN_INT (UNITS_PER_WORD - adjustment));
3011 default:
3012 gcc_unreachable ();
3016 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3017 stack pointer by ADJUSTMENT. */
3019 static void
3020 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3022 rtx_insn *insn;
3023 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3025 if (regno2 == INVALID_REGNUM)
3026 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3028 rtx reg1 = gen_rtx_REG (mode, regno1);
3029 rtx reg2 = gen_rtx_REG (mode, regno2);
3031 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3032 reg2, adjustment));
3033 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3034 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3035 RTX_FRAME_RELATED_P (insn) = 1;
3038 /* Load the pair of registers REG and REG2 of mode MODE from stack location
3039    BASE, adjusting BASE by ADJUSTMENT afterwards.  */
3041 static rtx
3042 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3043 HOST_WIDE_INT adjustment)
3045 switch (mode)
3047 case DImode:
3048 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3049 GEN_INT (UNITS_PER_WORD));
3050 case DFmode:
3051 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3052 GEN_INT (UNITS_PER_WORD));
3053 default:
3054 gcc_unreachable ();
3058 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3059 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3060 into CFI_OPS. */
3062 static void
3063 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3064 rtx *cfi_ops)
3066 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3067 rtx reg1 = gen_rtx_REG (mode, regno1);
3069 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3071 if (regno2 == INVALID_REGNUM)
3073 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3074 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3075 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3077 else
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3081 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3082 reg2, adjustment));
3086 /* Generate and return a store pair instruction of mode MODE to store
3087 register REG1 to MEM1 and register REG2 to MEM2. */
3089 static rtx
3090 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3091 rtx reg2)
3093 switch (mode)
3095 case DImode:
3096 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3098 case DFmode:
3099 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3101 default:
3102 gcc_unreachable ();
3106 /* Generate and return a load pair instruction of mode MODE to load register
3107 REG1 from MEM1 and register REG2 from MEM2. */
3109 static rtx
3110 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3111 rtx mem2)
3113 switch (mode)
3115 case DImode:
3116 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3118 case DFmode:
3119 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3121 default:
3122 gcc_unreachable ();
3126 /* Emit code to save the callee-saved registers from register number START
3127 to LIMIT to the stack at the location starting at offset START_OFFSET,
3128 skipping any write-back candidates if SKIP_WB is true. */
3130 static void
3131 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3132 unsigned start, unsigned limit, bool skip_wb)
3134 rtx_insn *insn;
3135 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3136 ? gen_frame_mem : gen_rtx_MEM);
3137 unsigned regno;
3138 unsigned regno2;
3140 for (regno = aarch64_next_callee_save (start, limit);
3141 regno <= limit;
3142 regno = aarch64_next_callee_save (regno + 1, limit))
3144 rtx reg, mem;
3145 HOST_WIDE_INT offset;
3147 if (skip_wb
3148 && (regno == cfun->machine->frame.wb_candidate1
3149 || regno == cfun->machine->frame.wb_candidate2))
3150 continue;
3152 if (cfun->machine->reg_is_wrapped_separately[regno])
3153 continue;
3155 reg = gen_rtx_REG (mode, regno);
3156 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3157 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3158 offset));
3160 regno2 = aarch64_next_callee_save (regno + 1, limit);
3162 if (regno2 <= limit
3163 && !cfun->machine->reg_is_wrapped_separately[regno2]
3164 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3165 == cfun->machine->frame.reg_offset[regno2]))
3168 rtx reg2 = gen_rtx_REG (mode, regno2);
3169 rtx mem2;
3171 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3172 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3173 offset));
3174 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3175 reg2));
3177 /* The first part of a frame-related parallel insn is
3178 always assumed to be relevant to the frame
3179         calculations; subsequent parts are only
3180 frame-related if explicitly marked. */
3181 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3182 regno = regno2;
3184 else
3185 insn = emit_move_insn (mem, reg);
3187 RTX_FRAME_RELATED_P (insn) = 1;
3191 /* Emit code to restore the callee-saved registers of mode MODE from register
3192 number START up to and including LIMIT. Restore from the stack offset
3193 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3194 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3196 static void
3197 aarch64_restore_callee_saves (machine_mode mode,
3198 HOST_WIDE_INT start_offset, unsigned start,
3199 unsigned limit, bool skip_wb, rtx *cfi_ops)
3201 rtx base_rtx = stack_pointer_rtx;
3202 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3203 ? gen_frame_mem : gen_rtx_MEM);
3204 unsigned regno;
3205 unsigned regno2;
3206 HOST_WIDE_INT offset;
3208 for (regno = aarch64_next_callee_save (start, limit);
3209 regno <= limit;
3210 regno = aarch64_next_callee_save (regno + 1, limit))
3212 if (cfun->machine->reg_is_wrapped_separately[regno])
3213 continue;
3215 rtx reg, mem;
3217 if (skip_wb
3218 && (regno == cfun->machine->frame.wb_candidate1
3219 || regno == cfun->machine->frame.wb_candidate2))
3220 continue;
3222 reg = gen_rtx_REG (mode, regno);
3223 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3224 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3226 regno2 = aarch64_next_callee_save (regno + 1, limit);
3228 if (regno2 <= limit
3229 && !cfun->machine->reg_is_wrapped_separately[regno2]
3230 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3231 == cfun->machine->frame.reg_offset[regno2]))
3233 rtx reg2 = gen_rtx_REG (mode, regno2);
3234 rtx mem2;
3236 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3237 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3238 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3240 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3241 regno = regno2;
3243 else
3244 emit_move_insn (reg, mem);
3245 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3249 static inline bool
3250 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3251 HOST_WIDE_INT offset)
3253 return offset >= -256 && offset < 256;
3256 static inline bool
3257 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3259 return (offset >= 0
3260 && offset < 4096 * GET_MODE_SIZE (mode)
3261 && offset % GET_MODE_SIZE (mode) == 0);
3264 bool
3265 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3267 return (offset >= -64 * GET_MODE_SIZE (mode)
3268 && offset < 64 * GET_MODE_SIZE (mode)
3269 && offset % GET_MODE_SIZE (mode) == 0);
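/* For instance, with MODE == DImode the 7-bit signed scaled form above
   accepts offsets in [-512, 504] that are multiples of 8 (the LDP/STP
   range), the 12-bit unsigned scaled form accepts [0, 32760] in
   multiples of 8, and the 9-bit unscaled form accepts any offset in
   [-256, 255].  */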
3272 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3274 static sbitmap
3275 aarch64_get_separate_components (void)
3277 aarch64_layout_frame ();
3279 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3280 bitmap_clear (components);
3282 /* The registers we need saved to the frame. */
3283 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3284 if (aarch64_register_saved_on_entry (regno))
3286 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3287 if (!frame_pointer_needed)
3288 offset += cfun->machine->frame.frame_size
3289 - cfun->machine->frame.hard_fp_offset;
3290 /* Check that we can access the stack slot of the register with one
3291 direct load with no adjustments needed. */
3292 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3293 bitmap_set_bit (components, regno);
3296 /* Don't mess with the hard frame pointer. */
3297 if (frame_pointer_needed)
3298 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3300 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3301 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3302 /* If aarch64_layout_frame has chosen registers to store/restore with
3303    writeback, don't interfere with them to avoid having to output explicit
3304 stack adjustment instructions. */
3305 if (reg2 != INVALID_REGNUM)
3306 bitmap_clear_bit (components, reg2);
3307 if (reg1 != INVALID_REGNUM)
3308 bitmap_clear_bit (components, reg1);
3310 bitmap_clear_bit (components, LR_REGNUM);
3311 bitmap_clear_bit (components, SP_REGNUM);
3313 return components;
3316 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3318 static sbitmap
3319 aarch64_components_for_bb (basic_block bb)
3321 bitmap in = DF_LIVE_IN (bb);
3322 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3323 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3325 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3326 bitmap_clear (components);
3328 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3329 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3330 if ((!call_used_regs[regno])
3331 && (bitmap_bit_p (in, regno)
3332 || bitmap_bit_p (gen, regno)
3333 || bitmap_bit_p (kill, regno)))
3334 bitmap_set_bit (components, regno);
3336 return components;
3339 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3340 Nothing to do for aarch64. */
3342 static void
3343 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3347 /* Return the next set bit in BMP from START onwards. Return the total number
3348 of bits in BMP if no set bit is found at or after START. */
3350 static unsigned int
3351 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3353 unsigned int nbits = SBITMAP_SIZE (bmp);
3354 if (start == nbits)
3355 return start;
3357 gcc_assert (start < nbits);
3358 for (unsigned int i = start; i < nbits; i++)
3359 if (bitmap_bit_p (bmp, i))
3360 return i;
3362 return nbits;
3365 /* Do the work for aarch64_emit_prologue_components and
3366 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3367 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3368 for these components or the epilogue sequence. That is, it determines
3369 whether we should emit stores or loads and what kind of CFA notes to attach
3370 to the insns. Otherwise the logic for the two sequences is very
3371 similar. */
3373 static void
3374 aarch64_process_components (sbitmap components, bool prologue_p)
3376 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3377 ? HARD_FRAME_POINTER_REGNUM
3378 : STACK_POINTER_REGNUM);
3380 unsigned last_regno = SBITMAP_SIZE (components);
3381 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3382 rtx_insn *insn = NULL;
3384 while (regno != last_regno)
3386       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3387 so DFmode for the vector registers is enough. */
3388 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3389 rtx reg = gen_rtx_REG (mode, regno);
3390 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3391 if (!frame_pointer_needed)
3392 offset += cfun->machine->frame.frame_size
3393 - cfun->machine->frame.hard_fp_offset;
3394 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3395 rtx mem = gen_frame_mem (mode, addr);
3397 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3398 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3399 /* No more registers to handle after REGNO.
3400 Emit a single save/restore and exit. */
3401 if (regno2 == last_regno)
3403 insn = emit_insn (set);
3404 RTX_FRAME_RELATED_P (insn) = 1;
3405 if (prologue_p)
3406 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3407 else
3408 add_reg_note (insn, REG_CFA_RESTORE, reg);
3409 break;
3412 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3413 /* The next register is not of the same class or its offset is not
3414 mergeable with the current one into a pair. */
3415 if (!satisfies_constraint_Ump (mem)
3416 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3417 || (offset2 - cfun->machine->frame.reg_offset[regno])
3418 != GET_MODE_SIZE (mode))
3420 insn = emit_insn (set);
3421 RTX_FRAME_RELATED_P (insn) = 1;
3422 if (prologue_p)
3423 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3424 else
3425 add_reg_note (insn, REG_CFA_RESTORE, reg);
3427 regno = regno2;
3428 continue;
3431 /* REGNO2 can be saved/restored in a pair with REGNO. */
3432 rtx reg2 = gen_rtx_REG (mode, regno2);
3433 if (!frame_pointer_needed)
3434 offset2 += cfun->machine->frame.frame_size
3435 - cfun->machine->frame.hard_fp_offset;
3436 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3437 rtx mem2 = gen_frame_mem (mode, addr2);
3438 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3439 : gen_rtx_SET (reg2, mem2);
3441 if (prologue_p)
3442 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3443 else
3444 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3446 RTX_FRAME_RELATED_P (insn) = 1;
3447 if (prologue_p)
3449 add_reg_note (insn, REG_CFA_OFFSET, set);
3450 add_reg_note (insn, REG_CFA_OFFSET, set2);
3452 else
3454 add_reg_note (insn, REG_CFA_RESTORE, reg);
3455 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3458 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3462 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3464 static void
3465 aarch64_emit_prologue_components (sbitmap components)
3467 aarch64_process_components (components, true);
3470 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3472 static void
3473 aarch64_emit_epilogue_components (sbitmap components)
3475 aarch64_process_components (components, false);
3478 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3480 static void
3481 aarch64_set_handled_components (sbitmap components)
3483 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3484 if (bitmap_bit_p (components, regno))
3485 cfun->machine->reg_is_wrapped_separately[regno] = true;
3488 /* AArch64 stack frames generated by this compiler look like:
3490 +-------------------------------+
3492 | incoming stack arguments |
3494 +-------------------------------+
3495 | | <-- incoming stack pointer (aligned)
3496 | callee-allocated save area |
3497 | for register varargs |
3499 +-------------------------------+
3500 | local variables | <-- frame_pointer_rtx
3502 +-------------------------------+
3503 | padding0 | \
3504 +-------------------------------+ |
3505 | callee-saved registers | | frame.saved_regs_size
3506 +-------------------------------+ |
3507 | LR' | |
3508 +-------------------------------+ |
3509 | FP' | / <- hard_frame_pointer_rtx (aligned)
3510 +-------------------------------+
3511 | dynamic allocation |
3512 +-------------------------------+
3513 | padding |
3514 +-------------------------------+
3515 | outgoing stack arguments | <-- arg_pointer
3517 +-------------------------------+
3518 | | <-- stack_pointer_rtx (aligned)
3520 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3521 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3522 unchanged. */
3524 /* Generate the prologue instructions for entry into a function.
3525 Establish the stack frame by decreasing the stack pointer with a
3526 properly calculated size and, if necessary, create a frame record
3527 filled with the values of LR and previous frame pointer. The
3528 current FP is also set up if it is in use. */
3530 void
3531 aarch64_expand_prologue (void)
3533 aarch64_layout_frame ();
3535 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3536 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3537 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3538 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3539 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3540 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3541 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3542 rtx_insn *insn;
3544 if (flag_stack_usage_info)
3545 current_function_static_stack_size = frame_size;
3547 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3549 if (crtl->is_leaf && !cfun->calls_alloca)
3551 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3552 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3553 frame_size - STACK_CHECK_PROTECT);
3555 else if (frame_size > 0)
3556 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3559 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3561 if (callee_adjust != 0)
3562 aarch64_push_regs (reg1, reg2, callee_adjust);
3564 if (frame_pointer_needed)
3566 if (callee_adjust == 0)
3567 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3568 R30_REGNUM, false);
3569 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3570 stack_pointer_rtx,
3571 GEN_INT (callee_offset)));
3572 RTX_FRAME_RELATED_P (insn) = 1;
3573 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3576 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3577 callee_adjust != 0 || frame_pointer_needed);
3578 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3579 callee_adjust != 0 || frame_pointer_needed);
3580 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
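/* For illustration (hypothetical frame, not actual compiler output):
   with initial_adjust = 0, callee_adjust = 48, a frame pointer and one
   extra callee-saved pair, the code emitted above is roughly

     stp  x29, x30, [sp, #-48]!
     add  x29, sp, #0
     stp  x19, x20, [sp, #16]

   plus a trailing "sub sp, sp, #final_adjust" when there is an
   outgoing argument area.  */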
3583 /* Return TRUE if we can use a simple_return insn.
3585    This function checks whether the callee-saved stack is empty, which
3586    means no restore actions are needed.  The pro_and_epilogue pass will
3587    use this to check whether the shrink-wrapping optimization is feasible.  */
3589 bool
3590 aarch64_use_return_insn_p (void)
3592 if (!reload_completed)
3593 return false;
3595 if (crtl->profile)
3596 return false;
3598 aarch64_layout_frame ();
3600 return cfun->machine->frame.frame_size == 0;
3603 /* Generate the epilogue instructions for returning from a function.
3604    This is almost exactly the reverse of the prologue sequence, except
3605 that we need to insert barriers to avoid scheduling loads that read
3606 from a deallocated stack, and we optimize the unwind records by
3607 emitting them all together if possible. */
3608 void
3609 aarch64_expand_epilogue (bool for_sibcall)
3611 aarch64_layout_frame ();
3613 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3614 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3615 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3616 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3617 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3618 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3619 rtx cfi_ops = NULL;
3620 rtx_insn *insn;
3622   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
3623 bool need_barrier_p = (get_frame_size ()
3624 + cfun->machine->frame.saved_varargs_size) != 0;
3626 /* Emit a barrier to prevent loads from a deallocated stack. */
3627 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3628 || crtl->calls_eh_return)
3630 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3631 need_barrier_p = false;
3634   /* Restore the stack pointer from the frame pointer if the two might
3635      not be the same.  */
3636 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3638 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3639 hard_frame_pointer_rtx,
3640 GEN_INT (-callee_offset)));
3641 /* If writeback is used when restoring callee-saves, the CFA
3642 is restored on the instruction doing the writeback. */
3643 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3645 else
3646 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3648 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3649 callee_adjust != 0, &cfi_ops);
3650 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3651 callee_adjust != 0, &cfi_ops);
3653 if (need_barrier_p)
3654 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3656 if (callee_adjust != 0)
3657 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3659 if (callee_adjust != 0 || initial_adjust > 65536)
3661 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3662 insn = get_last_insn ();
3663 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3664 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3665 RTX_FRAME_RELATED_P (insn) = 1;
3666 cfi_ops = NULL;
3669 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3671 if (cfi_ops)
3673 /* Emit delayed restores and reset the CFA to be SP. */
3674 insn = get_last_insn ();
3675 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3676 REG_NOTES (insn) = cfi_ops;
3677 RTX_FRAME_RELATED_P (insn) = 1;
3680 /* Stack adjustment for exception handler. */
3681 if (crtl->calls_eh_return)
3683 /* We need to unwind the stack by the offset computed by
3684 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3685 to be SP; letting the CFA move during this adjustment
3686 is just as correct as retaining the CFA from the body
3687 of the function. Therefore, do nothing special. */
3688 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3691 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3692 if (!for_sibcall)
3693 emit_jump_insn (ret_rtx);
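/* Continuing the hypothetical prologue example above, the matching
   epilogue is roughly

     ldp  x19, x20, [sp, #16]
     ldp  x29, x30, [sp], #48
     ret

   with the delayed REG_CFA_RESTORE notes and the new CFA attached to
   the write-back load.  */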
3696 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3697 normally or return to a previous frame after unwinding.
3699 An EH return uses a single shared return sequence. The epilogue is
3700 exactly like a normal epilogue except that it has an extra input
3701 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3702 that must be applied after the frame has been destroyed. An extra label
3703 is inserted before the epilogue which initializes this register to zero,
3704 and this is the entry point for a normal return.
3706 An actual EH return updates the return address, initializes the stack
3707 adjustment and jumps directly into the epilogue (bypassing the zeroing
3708 of the adjustment). Since the return address is typically saved on the
3709 stack when a function makes a call, the saved LR must be updated outside
3710 the epilogue.
3712 This poses problems as the store is generated well before the epilogue,
3713 so the offset of LR is not known yet. Also optimizations will remove the
3714 store as it appears dead, even after the epilogue is generated (as the
3715 base or offset for loading LR is different in many cases).
3717 To avoid these problems this implementation forces the frame pointer
3718 in eh_return functions so that the location of LR is fixed and known early.
3719 It also marks the store volatile, so no optimization is permitted to
3720 remove the store. */
3722 rtx aarch64_eh_return_handler_rtx (void)
3724 rtx tmp = gen_frame_mem (Pmode,
3725 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3727 /* Mark the store volatile, so no optimization is permitted to remove it. */
3728 MEM_VOLATILE_P (tmp) = true;
3729 return tmp;
3732 /* Output code to add DELTA to the first argument, and then jump
3733 to FUNCTION. Used for C++ multiple inheritance. */
3734 static void
3735 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3736 HOST_WIDE_INT delta,
3737 HOST_WIDE_INT vcall_offset,
3738 tree function)
3740 /* The this pointer is always in x0. Note that this differs from
3741      Arm where the this pointer may be bumped to r1 if r0 is required
3742 to return a pointer to an aggregate. On AArch64 a result value
3743 pointer will be in x8. */
3744 int this_regno = R0_REGNUM;
3745 rtx this_rtx, temp0, temp1, addr, funexp;
3746 rtx_insn *insn;
3748 reload_completed = 1;
3749 emit_note (NOTE_INSN_PROLOGUE_END);
3751 if (vcall_offset == 0)
3752 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3753 else
3755 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3757 this_rtx = gen_rtx_REG (Pmode, this_regno);
3758 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3759 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3761 addr = this_rtx;
3762 if (delta != 0)
3764 if (delta >= -256 && delta < 256)
3765 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3766 plus_constant (Pmode, this_rtx, delta));
3767 else
3768 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3771 if (Pmode == ptr_mode)
3772 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3773 else
3774 aarch64_emit_move (temp0,
3775 gen_rtx_ZERO_EXTEND (Pmode,
3776 gen_rtx_MEM (ptr_mode, addr)));
3778 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3779 addr = plus_constant (Pmode, temp0, vcall_offset);
3780 else
3782 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3783 Pmode);
3784 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3787 if (Pmode == ptr_mode)
3788         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3789 else
3790 aarch64_emit_move (temp1,
3791 gen_rtx_SIGN_EXTEND (Pmode,
3792 gen_rtx_MEM (ptr_mode, addr)));
3794 emit_insn (gen_add2_insn (this_rtx, temp1));
3797 /* Generate a tail call to the target function. */
3798 if (!TREE_USED (function))
3800 assemble_external (function);
3801 TREE_USED (function) = 1;
3803 funexp = XEXP (DECL_RTL (function), 0);
3804 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3805 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3806 SIBLING_CALL_P (insn) = 1;
3808 insn = get_insns ();
3809 shorten_branches (insn);
3810 final_start_function (insn, file, 1);
3811 final (insn, file, 1);
3812 final_end_function ();
3814 /* Stop pretending to be a post-reload pass. */
3815 reload_completed = 0;
3818 static bool
3819 aarch64_tls_referenced_p (rtx x)
3821 if (!TARGET_HAVE_TLS)
3822 return false;
3823 subrtx_iterator::array_type array;
3824 FOR_EACH_SUBRTX (iter, array, x, ALL)
3826 const_rtx x = *iter;
3827 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3828 return true;
3829 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3830 TLS offsets, not real symbol references. */
3831 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3832 iter.skip_subrtxes ();
3834 return false;
3838 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3839 a left shift of 0 or 12 bits. */
3840 bool
3841 aarch64_uimm12_shift (HOST_WIDE_INT val)
3843 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3844 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
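/* For illustration (example values only, not taken from any particular
   caller): 0xabc and 0xabc000 are accepted since their set bits fit in
   bits [0:11] or [12:23] respectively, while 0xabc00 is rejected because
   its set bits span both fields.  */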
3849 /* Return true if val is an immediate that can be loaded into a
3850 register by a MOVZ instruction. */
3851 static bool
3852 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3854 if (GET_MODE_SIZE (mode) > 4)
3856 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3857 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3858 return 1;
3860 else
3862 /* Ignore sign extension. */
3863 val &= (HOST_WIDE_INT) 0xffffffff;
3865 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3866 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
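/* For illustration (example values only): in DImode 0xffff, 0xffff0000,
   0xffff00000000 and 0xffff000000000000 are all MOVZ-loadable, since the
   set bits fit in a single 16-bit field at shift 0, 16, 32 or 48, while
   0x12345 is not because it straddles two halfwords.  Values whose
   complement has this form can still be loaded in one instruction via
   MOVN; that case is handled in aarch64_move_imm below.  */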
3869 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3871 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3873 0x0000000100000001ull,
3874 0x0001000100010001ull,
3875 0x0101010101010101ull,
3876 0x1111111111111111ull,
3877 0x5555555555555555ull,
3881 /* Return true if val is a valid bitmask immediate. */
3883 bool
3884 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3886 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3887 int bits;
3889 /* Check for a single sequence of one bits and return quickly if so.
3890 The special cases of all ones and all zeroes return false. */
3891 val = (unsigned HOST_WIDE_INT) val_in;
3892 tmp = val + (val & -val);
3894 if (tmp == (tmp & -tmp))
3895 return (val + 1) > 1;
3897 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3898 if (mode == SImode)
3899 val = (val << 32) | (val & 0xffffffff);
3901 /* Invert if the immediate doesn't start with a zero bit - this means we
3902 only need to search for sequences of one bits. */
3903 if (val & 1)
3904 val = ~val;
3906 /* Find the first set bit and set tmp to val with the first sequence of one
3907 bits removed. Return success if there is a single sequence of ones. */
3908 first_one = val & -val;
3909 tmp = val & (val + first_one);
3911 if (tmp == 0)
3912 return true;
3914 /* Find the next set bit and compute the difference in bit position. */
3915 next_one = tmp & -tmp;
3916 bits = clz_hwi (first_one) - clz_hwi (next_one);
3917 mask = val ^ tmp;
3919 /* Check the bit position difference is a power of 2, and that the first
3920 sequence of one bits fits within 'bits' bits. */
3921 if ((mask >> bits) != 0 || bits != (bits & -bits))
3922 return false;
3924 /* Check the sequence of one bits is repeated 64/bits times. */
3925 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
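/* For illustration (example values only): 0x5555555555555555 (the two-bit
   pattern 01 repeated), 0x00ff00ff00ff00ff (a run of 8 ones repeated every
   16 bits) and 0x3ffffffc (a single contiguous run of ones) are all valid
   bitmask immediates, whereas 0, ~0 and 0x12345 (scattered bits that do
   not form a repeating pattern) are not.  */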
3928 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3929 Assumed precondition: VAL_IN is not zero. */
3931 unsigned HOST_WIDE_INT
3932 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3934 int lowest_bit_set = ctz_hwi (val_in);
3935 int highest_bit_set = floor_log2 (val_in);
3936 gcc_assert (val_in != 0);
3938 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3939 (HOST_WIDE_INT_1U << lowest_bit_set));
3942 /* Create a constant where the bits outside the range from the lowest set
3943 bit to the highest set bit of VAL_IN are set to 1. */
3945 unsigned HOST_WIDE_INT
3946 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
3948 return val_in | ~aarch64_and_split_imm1 (val_in);
3951 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
3953 bool
3954 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
3956 if (aarch64_bitmask_imm (val_in, mode))
3957 return false;
3959 if (aarch64_move_imm (val_in, mode))
3960 return false;
3962 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
3964 return aarch64_bitmask_imm (imm2, mode);
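/* Worked example (illustrative only): VAL_IN == 0x00ff00fe is neither a
   bitmask immediate nor a MOV immediate.  aarch64_and_split_imm1 gives
   0x00fffffe (ones from the lowest to the highest set bit) and
   aarch64_and_split_imm2 gives 0xffffffffffff00ff, which is a rotated
   contiguous run of ones and therefore a valid bitmask immediate.  Since
   imm1 & imm2 == VAL_IN, the single AND can be split into two AND
   instructions using these immediates.  */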
3967 /* Return true if val is an immediate that can be loaded into a
3968 register in a single instruction. */
3969 bool
3970 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3972 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3973 return 1;
3974 return aarch64_bitmask_imm (val, mode);
3977 static bool
3978 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3980 rtx base, offset;
3982 if (GET_CODE (x) == HIGH)
3983 return true;
3985 split_const (x, &base, &offset);
3986 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3988 if (aarch64_classify_symbol (base, offset)
3989 != SYMBOL_FORCE_TO_MEM)
3990 return true;
3991 else
3992 /* Avoid generating a 64-bit relocation in ILP32; leave it
3993 to aarch64_expand_mov_immediate to handle properly. */
3994 return mode != ptr_mode;
3997 return aarch64_tls_referenced_p (x);
4000 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4001 The expansion for a table switch is quite expensive due to the number
4002 of instructions, the table lookup and the hard-to-predict indirect jump.
4003 When optimizing for speed with -O3 enabled, use the per-core tuning if
4004 set, otherwise use tables for > 16 cases as a tradeoff between size and
4005 performance. When optimizing for size, use the default setting. */
4007 static unsigned int
4008 aarch64_case_values_threshold (void)
4010 /* Use the specified limit for the number of cases before using jump
4011 tables at higher optimization levels. */
4012 if (optimize > 2
4013 && selected_cpu->tune->max_case_values != 0)
4014 return selected_cpu->tune->max_case_values;
4015 else
4016 return optimize_size ? default_case_values_threshold () : 17;
4019 /* Return true if register REGNO is a valid index register.
4020 STRICT_P is true if REG_OK_STRICT is in effect. */
4022 bool
4023 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4025 if (!HARD_REGISTER_NUM_P (regno))
4027 if (!strict_p)
4028 return true;
4030 if (!reg_renumber)
4031 return false;
4033 regno = reg_renumber[regno];
4035 return GP_REGNUM_P (regno);
4038 /* Return true if register REGNO is a valid base register for mode MODE.
4039 STRICT_P is true if REG_OK_STRICT is in effect. */
4041 bool
4042 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4044 if (!HARD_REGISTER_NUM_P (regno))
4046 if (!strict_p)
4047 return true;
4049 if (!reg_renumber)
4050 return false;
4052 regno = reg_renumber[regno];
4055 /* The fake registers will be eliminated to either the stack or
4056 hard frame pointer, both of which are usually valid base registers.
4057 Reload deals with the cases where the eliminated form isn't valid. */
4058 return (GP_REGNUM_P (regno)
4059 || regno == SP_REGNUM
4060 || regno == FRAME_POINTER_REGNUM
4061 || regno == ARG_POINTER_REGNUM);
4064 /* Return true if X is a valid base register for mode MODE.
4065 STRICT_P is true if REG_OK_STRICT is in effect. */
4067 static bool
4068 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4070 if (!strict_p && GET_CODE (x) == SUBREG)
4071 x = SUBREG_REG (x);
4073 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4076 /* Return true if address offset is a valid index. If it is, fill in INFO
4077 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4079 static bool
4080 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4081 machine_mode mode, bool strict_p)
4083 enum aarch64_address_type type;
4084 rtx index;
4085 int shift;
4087 /* (reg:P) */
4088 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4089 && GET_MODE (x) == Pmode)
4091 type = ADDRESS_REG_REG;
4092 index = x;
4093 shift = 0;
4095 /* (sign_extend:DI (reg:SI)) */
4096 else if ((GET_CODE (x) == SIGN_EXTEND
4097 || GET_CODE (x) == ZERO_EXTEND)
4098 && GET_MODE (x) == DImode
4099 && GET_MODE (XEXP (x, 0)) == SImode)
4101 type = (GET_CODE (x) == SIGN_EXTEND)
4102 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4103 index = XEXP (x, 0);
4104 shift = 0;
4106 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4107 else if (GET_CODE (x) == MULT
4108 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4109 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4110 && GET_MODE (XEXP (x, 0)) == DImode
4111 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4112 && CONST_INT_P (XEXP (x, 1)))
4114 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4115 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4116 index = XEXP (XEXP (x, 0), 0);
4117 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4119 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4120 else if (GET_CODE (x) == ASHIFT
4121 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4122 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4123 && GET_MODE (XEXP (x, 0)) == DImode
4124 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4125 && CONST_INT_P (XEXP (x, 1)))
4127 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4128 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4129 index = XEXP (XEXP (x, 0), 0);
4130 shift = INTVAL (XEXP (x, 1));
4132 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4133 else if ((GET_CODE (x) == SIGN_EXTRACT
4134 || GET_CODE (x) == ZERO_EXTRACT)
4135 && GET_MODE (x) == DImode
4136 && GET_CODE (XEXP (x, 0)) == MULT
4137 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4138 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4140 type = (GET_CODE (x) == SIGN_EXTRACT)
4141 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4142 index = XEXP (XEXP (x, 0), 0);
4143 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4144 if (INTVAL (XEXP (x, 1)) != 32 + shift
4145 || INTVAL (XEXP (x, 2)) != 0)
4146 shift = -1;
4148 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4149 (const_int 0xffffffff<<shift)) */
4150 else if (GET_CODE (x) == AND
4151 && GET_MODE (x) == DImode
4152 && GET_CODE (XEXP (x, 0)) == MULT
4153 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4154 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4155 && CONST_INT_P (XEXP (x, 1)))
4157 type = ADDRESS_REG_UXTW;
4158 index = XEXP (XEXP (x, 0), 0);
4159 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4160 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4161 shift = -1;
4163 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4164 else if ((GET_CODE (x) == SIGN_EXTRACT
4165 || GET_CODE (x) == ZERO_EXTRACT)
4166 && GET_MODE (x) == DImode
4167 && GET_CODE (XEXP (x, 0)) == ASHIFT
4168 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4169 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4171 type = (GET_CODE (x) == SIGN_EXTRACT)
4172 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4173 index = XEXP (XEXP (x, 0), 0);
4174 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4175 if (INTVAL (XEXP (x, 1)) != 32 + shift
4176 || INTVAL (XEXP (x, 2)) != 0)
4177 shift = -1;
4179 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4180 (const_int 0xffffffff<<shift)) */
4181 else if (GET_CODE (x) == AND
4182 && GET_MODE (x) == DImode
4183 && GET_CODE (XEXP (x, 0)) == ASHIFT
4184 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4185 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4186 && CONST_INT_P (XEXP (x, 1)))
4188 type = ADDRESS_REG_UXTW;
4189 index = XEXP (XEXP (x, 0), 0);
4190 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4191 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4192 shift = -1;
4194 /* (mult:P (reg:P) (const_int scale)) */
4195 else if (GET_CODE (x) == MULT
4196 && GET_MODE (x) == Pmode
4197 && GET_MODE (XEXP (x, 0)) == Pmode
4198 && CONST_INT_P (XEXP (x, 1)))
4200 type = ADDRESS_REG_REG;
4201 index = XEXP (x, 0);
4202 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4204 /* (ashift:P (reg:P) (const_int shift)) */
4205 else if (GET_CODE (x) == ASHIFT
4206 && GET_MODE (x) == Pmode
4207 && GET_MODE (XEXP (x, 0)) == Pmode
4208 && CONST_INT_P (XEXP (x, 1)))
4210 type = ADDRESS_REG_REG;
4211 index = XEXP (x, 0);
4212 shift = INTVAL (XEXP (x, 1));
4214 else
4215 return false;
4217 if (GET_CODE (index) == SUBREG)
4218 index = SUBREG_REG (index);
4220 if ((shift == 0 ||
4221 (shift > 0 && shift <= 3
4222 && (1 << shift) == GET_MODE_SIZE (mode)))
4223 && REG_P (index)
4224 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4226 info->type = type;
4227 info->offset = index;
4228 info->shift = shift;
4229 return true;
4232 return false;
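/* Examples of index forms recognised above (illustrative only, with
   arbitrary register numbers): a plain (reg:DI x1) gives [xN, x1];
   (ashift:DI (reg:DI x1) (const_int 3)) gives [xN, x1, lsl 3] and is only
   valid for 8-byte accesses; (mult:DI (sign_extend:DI (reg:SI w1))
   (const_int 4)) gives [xN, w1, sxtw 2] and is only valid for 4-byte
   accesses, since a nonzero scale must match the access size.  */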
4235 /* Return true if MODE is one of the modes for which we
4236 support LDP/STP operations. */
4238 static bool
4239 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4241 return mode == SImode || mode == DImode
4242 || mode == SFmode || mode == DFmode
4243 || (aarch64_vector_mode_supported_p (mode)
4244 && GET_MODE_SIZE (mode) == 8);
4247 /* Return true if REGNO is a virtual pointer register, or an eliminable
4248 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4249 include stack_pointer or hard_frame_pointer. */
4250 static bool
4251 virt_or_elim_regno_p (unsigned regno)
4253 return ((regno >= FIRST_VIRTUAL_REGISTER
4254 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4255 || regno == FRAME_POINTER_REGNUM
4256 || regno == ARG_POINTER_REGNUM);
4259 /* Return true if X is a valid address for machine mode MODE. If it is,
4260 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4261 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4263 static bool
4264 aarch64_classify_address (struct aarch64_address_info *info,
4265 rtx x, machine_mode mode,
4266 RTX_CODE outer_code, bool strict_p)
4268 enum rtx_code code = GET_CODE (x);
4269 rtx op0, op1;
4271 /* On BE, we use load/store pair for all large int mode load/stores.
4272 TI/TFmode may also use a load/store pair. */
4273 bool load_store_pair_p = (outer_code == PARALLEL
4274 || mode == TImode
4275 || mode == TFmode
4276 || (BYTES_BIG_ENDIAN
4277 && aarch64_vect_struct_mode_p (mode)));
4279 bool allow_reg_index_p =
4280 !load_store_pair_p
4281 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4282 && !aarch64_vect_struct_mode_p (mode);
4284 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4285 REG addressing. */
4286 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4287 && (code != POST_INC && code != REG))
4288 return false;
4290 switch (code)
4292 case REG:
4293 case SUBREG:
4294 info->type = ADDRESS_REG_IMM;
4295 info->base = x;
4296 info->offset = const0_rtx;
4297 return aarch64_base_register_rtx_p (x, strict_p);
4299 case PLUS:
4300 op0 = XEXP (x, 0);
4301 op1 = XEXP (x, 1);
4303 if (! strict_p
4304 && REG_P (op0)
4305 && virt_or_elim_regno_p (REGNO (op0))
4306 && CONST_INT_P (op1))
4308 info->type = ADDRESS_REG_IMM;
4309 info->base = op0;
4310 info->offset = op1;
4312 return true;
4315 if (GET_MODE_SIZE (mode) != 0
4316 && CONST_INT_P (op1)
4317 && aarch64_base_register_rtx_p (op0, strict_p))
4319 HOST_WIDE_INT offset = INTVAL (op1);
4321 info->type = ADDRESS_REG_IMM;
4322 info->base = op0;
4323 info->offset = op1;
4325 /* TImode and TFmode values are allowed in both pairs of X
4326 registers and individual Q registers. The available
4327 address modes are:
4328 X,X: 7-bit signed scaled offset
4329 Q: 9-bit signed offset
4330 We conservatively require an offset representable in either mode.
4331 When performing the check for pairs of X registers i.e. LDP/STP
4332 pass down DImode since that is the natural size of the LDP/STP
4333 instruction memory accesses. */
4334 if (mode == TImode || mode == TFmode)
4335 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4336 && (offset_9bit_signed_unscaled_p (mode, offset)
4337 || offset_12bit_unsigned_scaled_p (mode, offset)));
4339 /* A 7-bit offset check because OImode will emit an ldp/stp
4340 instruction (only big endian will get here).
4341 For ldp/stp instructions, the offset is scaled for the size of a
4342 single element of the pair. */
4343 if (mode == OImode)
4344 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4346 /* Three 9/12-bit offset checks because CImode will emit three
4347 ldr/str instructions (only big endian will get here). */
4348 if (mode == CImode)
4349 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4350 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4351 || offset_12bit_unsigned_scaled_p (V16QImode,
4352 offset + 32)));
4354 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4355 instructions (only big endian will get here). */
4356 if (mode == XImode)
4357 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4358 && aarch64_offset_7bit_signed_scaled_p (TImode,
4359 offset + 32));
4361 if (load_store_pair_p)
4362 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4363 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4364 else
4365 return (offset_9bit_signed_unscaled_p (mode, offset)
4366 || offset_12bit_unsigned_scaled_p (mode, offset));
4369 if (allow_reg_index_p)
4371 /* Look for base + (scaled/extended) index register. */
4372 if (aarch64_base_register_rtx_p (op0, strict_p)
4373 && aarch64_classify_index (info, op1, mode, strict_p))
4375 info->base = op0;
4376 return true;
4378 if (aarch64_base_register_rtx_p (op1, strict_p)
4379 && aarch64_classify_index (info, op0, mode, strict_p))
4381 info->base = op1;
4382 return true;
4386 return false;
4388 case POST_INC:
4389 case POST_DEC:
4390 case PRE_INC:
4391 case PRE_DEC:
4392 info->type = ADDRESS_REG_WB;
4393 info->base = XEXP (x, 0);
4394 info->offset = NULL_RTX;
4395 return aarch64_base_register_rtx_p (info->base, strict_p);
4397 case POST_MODIFY:
4398 case PRE_MODIFY:
4399 info->type = ADDRESS_REG_WB;
4400 info->base = XEXP (x, 0);
4401 if (GET_CODE (XEXP (x, 1)) == PLUS
4402 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4403 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4404 && aarch64_base_register_rtx_p (info->base, strict_p))
4406 HOST_WIDE_INT offset;
4407 info->offset = XEXP (XEXP (x, 1), 1);
4408 offset = INTVAL (info->offset);
4410 /* TImode and TFmode values are allowed in both pairs of X
4411 registers and individual Q registers. The available
4412 address modes are:
4413 X,X: 7-bit signed scaled offset
4414 Q: 9-bit signed offset
4415 We conservatively require an offset representable in either mode. */
4417 if (mode == TImode || mode == TFmode)
4418 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4419 && offset_9bit_signed_unscaled_p (mode, offset));
4421 if (load_store_pair_p)
4422 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4423 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4424 else
4425 return offset_9bit_signed_unscaled_p (mode, offset);
4427 return false;
4429 case CONST:
4430 case SYMBOL_REF:
4431 case LABEL_REF:
4432 /* load literal: pc-relative constant pool entry. Only supported
4433 for SI mode or larger. */
4434 info->type = ADDRESS_SYMBOLIC;
4436 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4438 rtx sym, addend;
4440 split_const (x, &sym, &addend);
4441 return ((GET_CODE (sym) == LABEL_REF
4442 || (GET_CODE (sym) == SYMBOL_REF
4443 && CONSTANT_POOL_ADDRESS_P (sym)
4444 && aarch64_pcrelative_literal_loads)));
4446 return false;
4448 case LO_SUM:
4449 info->type = ADDRESS_LO_SUM;
4450 info->base = XEXP (x, 0);
4451 info->offset = XEXP (x, 1);
4452 if (allow_reg_index_p
4453 && aarch64_base_register_rtx_p (info->base, strict_p))
4455 rtx sym, offs;
4456 split_const (info->offset, &sym, &offs);
4457 if (GET_CODE (sym) == SYMBOL_REF
4458 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4460 /* The symbol and offset must be aligned to the access size. */
4461 unsigned int align;
4462 unsigned int ref_size;
4464 if (CONSTANT_POOL_ADDRESS_P (sym))
4465 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4466 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4468 tree exp = SYMBOL_REF_DECL (sym);
4469 align = TYPE_ALIGN (TREE_TYPE (exp));
4470 align = CONSTANT_ALIGNMENT (exp, align);
4472 else if (SYMBOL_REF_DECL (sym))
4473 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4474 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4475 && SYMBOL_REF_BLOCK (sym) != NULL)
4476 align = SYMBOL_REF_BLOCK (sym)->alignment;
4477 else
4478 align = BITS_PER_UNIT;
4480 ref_size = GET_MODE_SIZE (mode);
4481 if (ref_size == 0)
4482 ref_size = GET_MODE_SIZE (DImode);
4484 return ((INTVAL (offs) & (ref_size - 1)) == 0
4485 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4488 return false;
4490 default:
4491 return false;
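/* Examples of addresses accepted above (illustrative only, "foo" is a
   hypothetical symbol): (reg:DI sp) classifies as ADDRESS_REG_IMM with a
   zero offset ("[sp]"); (plus:DI (reg:DI x0) (const_int 16)) for a DImode
   access classifies as ADDRESS_REG_IMM ("[x0, 16]"); (post_inc:DI
   (reg:DI x1)) classifies as ADDRESS_REG_WB; and (lo_sum:DI (reg:DI x0)
   (symbol_ref "foo")) classifies as ADDRESS_LO_SUM ("[x0, #:lo12:foo]"),
   subject to the symbol classification and alignment checks above.  */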
4495 bool
4496 aarch64_symbolic_address_p (rtx x)
4498 rtx offset;
4500 split_const (x, &x, &offset);
4501 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4504 /* Classify the base of symbolic expression X. */
4506 enum aarch64_symbol_type
4507 aarch64_classify_symbolic_expression (rtx x)
4509 rtx offset;
4511 split_const (x, &x, &offset);
4512 return aarch64_classify_symbol (x, offset);
4516 /* Return TRUE if X is a legitimate address for accessing memory in
4517 mode MODE. */
4518 static bool
4519 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4521 struct aarch64_address_info addr;
4523 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4526 /* Return TRUE if X is a legitimate address for accessing memory in
4527 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4528 pair operation. */
4529 bool
4530 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4531 RTX_CODE outer_code, bool strict_p)
4533 struct aarch64_address_info addr;
4535 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4538 /* Split an out-of-range address displacement into a base and offset.
4539 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4540 to increase opportunities for sharing the base address across accesses of different sizes.
4541 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4542 static bool
4543 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4545 HOST_WIDE_INT offset = INTVAL (*disp);
4546 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4548 if (mode == TImode || mode == TFmode
4549 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4550 base = (offset + 0x100) & ~0x1ff;
4552 *off = GEN_INT (base);
4553 *disp = GEN_INT (offset - base);
4554 return true;
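/* Worked example (illustrative only): for an SImode access with
   displacement 0x12344 the mask is 0x3ffc, so *off becomes 0x10000 and
   *disp becomes 0x2344, which fits the 12-bit scaled immediate range of a
   32-bit LDR/STR.  */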
4557 /* Return TRUE if rtx X is immediate constant 0.0 */
4558 bool
4559 aarch64_float_const_zero_rtx_p (rtx x)
4561 if (GET_MODE (x) == VOIDmode)
4562 return false;
4564 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4565 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4566 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4569 /* Return the fixed registers used for condition codes. */
4571 static bool
4572 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4574 *p1 = CC_REGNUM;
4575 *p2 = INVALID_REGNUM;
4576 return true;
4579 /* Emit call insn with PAT and do aarch64-specific handling. */
4581 void
4582 aarch64_emit_call_insn (rtx pat)
4584 rtx insn = emit_call_insn (pat);
4586 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4587 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4588 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4591 machine_mode
4592 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4594 /* All floating point compares return CCFP if it is an equality
4595 comparison, and CCFPE otherwise. */
4596 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4598 switch (code)
4600 case EQ:
4601 case NE:
4602 case UNORDERED:
4603 case ORDERED:
4604 case UNLT:
4605 case UNLE:
4606 case UNGT:
4607 case UNGE:
4608 case UNEQ:
4609 case LTGT:
4610 return CCFPmode;
4612 case LT:
4613 case LE:
4614 case GT:
4615 case GE:
4616 return CCFPEmode;
4618 default:
4619 gcc_unreachable ();
4623 /* Equality comparisons of short modes against zero can be performed
4624 using the TST instruction with the appropriate bitmask. */
4625 if (y == const0_rtx && REG_P (x)
4626 && (code == EQ || code == NE)
4627 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4628 return CC_NZmode;
4630 /* Similarly, comparisons of zero_extends from shorter modes can
4631 be performed using an ANDS with an immediate mask. */
4632 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4633 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4634 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4635 && (code == EQ || code == NE))
4636 return CC_NZmode;
4638 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4639 && y == const0_rtx
4640 && (code == EQ || code == NE || code == LT || code == GE)
4641 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4642 || GET_CODE (x) == NEG
4643 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4644 && CONST_INT_P (XEXP (x, 2)))))
4645 return CC_NZmode;
4647 /* A compare with a shifted operand. Because of canonicalization,
4648 the comparison will have to be swapped when we emit the assembly
4649 code. */
4650 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4651 && (REG_P (y) || GET_CODE (y) == SUBREG)
4652 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4653 || GET_CODE (x) == LSHIFTRT
4654 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4655 return CC_SWPmode;
4657 /* Similarly for a negated operand, but we can only do this for
4658 equalities. */
4659 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4660 && (REG_P (y) || GET_CODE (y) == SUBREG)
4661 && (code == EQ || code == NE)
4662 && GET_CODE (x) == NEG)
4663 return CC_Zmode;
4665 /* A test for unsigned overflow. */
4666 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4667 && code == NE
4668 && GET_CODE (x) == PLUS
4669 && GET_CODE (y) == ZERO_EXTEND)
4670 return CC_Cmode;
4672 /* For everything else, return CCmode. */
4673 return CCmode;
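/* Illustrative examples of the classification above: comparing
   (plus:DI x y) against zero with EQ, NE, LT or GE gives CC_NZmode so
   that a flag-setting ADDS can be used directly; comparing a shifted
   operand such as (ashift:DI x (const_int 2)) against a plain register
   gives CC_SWPmode because the operands are swapped when the assembly is
   emitted; and an ordered floating-point LT selects CCFPEmode rather than
   CCFPmode.  */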
4676 static int
4677 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4680 aarch64_get_condition_code (rtx x)
4682 machine_mode mode = GET_MODE (XEXP (x, 0));
4683 enum rtx_code comp_code = GET_CODE (x);
4685 if (GET_MODE_CLASS (mode) != MODE_CC)
4686 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4687 return aarch64_get_condition_code_1 (mode, comp_code);
4690 static int
4691 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4693 switch (mode)
4695 case CCFPmode:
4696 case CCFPEmode:
4697 switch (comp_code)
4699 case GE: return AARCH64_GE;
4700 case GT: return AARCH64_GT;
4701 case LE: return AARCH64_LS;
4702 case LT: return AARCH64_MI;
4703 case NE: return AARCH64_NE;
4704 case EQ: return AARCH64_EQ;
4705 case ORDERED: return AARCH64_VC;
4706 case UNORDERED: return AARCH64_VS;
4707 case UNLT: return AARCH64_LT;
4708 case UNLE: return AARCH64_LE;
4709 case UNGT: return AARCH64_HI;
4710 case UNGE: return AARCH64_PL;
4711 default: return -1;
4713 break;
4715 case CCmode:
4716 switch (comp_code)
4718 case NE: return AARCH64_NE;
4719 case EQ: return AARCH64_EQ;
4720 case GE: return AARCH64_GE;
4721 case GT: return AARCH64_GT;
4722 case LE: return AARCH64_LE;
4723 case LT: return AARCH64_LT;
4724 case GEU: return AARCH64_CS;
4725 case GTU: return AARCH64_HI;
4726 case LEU: return AARCH64_LS;
4727 case LTU: return AARCH64_CC;
4728 default: return -1;
4730 break;
4732 case CC_SWPmode:
4733 switch (comp_code)
4735 case NE: return AARCH64_NE;
4736 case EQ: return AARCH64_EQ;
4737 case GE: return AARCH64_LE;
4738 case GT: return AARCH64_LT;
4739 case LE: return AARCH64_GE;
4740 case LT: return AARCH64_GT;
4741 case GEU: return AARCH64_LS;
4742 case GTU: return AARCH64_CC;
4743 case LEU: return AARCH64_CS;
4744 case LTU: return AARCH64_HI;
4745 default: return -1;
4747 break;
4749 case CC_NZmode:
4750 switch (comp_code)
4752 case NE: return AARCH64_NE;
4753 case EQ: return AARCH64_EQ;
4754 case GE: return AARCH64_PL;
4755 case LT: return AARCH64_MI;
4756 default: return -1;
4758 break;
4760 case CC_Zmode:
4761 switch (comp_code)
4763 case NE: return AARCH64_NE;
4764 case EQ: return AARCH64_EQ;
4765 default: return -1;
4767 break;
4769 case CC_Cmode:
4770 switch (comp_code)
4772 case NE: return AARCH64_CS;
4773 case EQ: return AARCH64_CC;
4774 default: return -1;
4776 break;
4778 default:
4779 return -1;
4782 return -1;
4785 bool
4786 aarch64_const_vec_all_same_in_range_p (rtx x,
4787 HOST_WIDE_INT minval,
4788 HOST_WIDE_INT maxval)
4790 HOST_WIDE_INT firstval;
4791 int count, i;
4793 if (GET_CODE (x) != CONST_VECTOR
4794 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4795 return false;
4797 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4798 if (firstval < minval || firstval > maxval)
4799 return false;
4801 count = CONST_VECTOR_NUNITS (x);
4802 for (i = 1; i < count; i++)
4803 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4804 return false;
4806 return true;
4809 bool
4810 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4812 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4816 /* N Z C V. */
4817 #define AARCH64_CC_V 1
4818 #define AARCH64_CC_C (1 << 1)
4819 #define AARCH64_CC_Z (1 << 2)
4820 #define AARCH64_CC_N (1 << 3)
4822 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4823 static const int aarch64_nzcv_codes[] =
4825 0, /* EQ, Z == 1. */
4826 AARCH64_CC_Z, /* NE, Z == 0. */
4827 0, /* CS, C == 1. */
4828 AARCH64_CC_C, /* CC, C == 0. */
4829 0, /* MI, N == 1. */
4830 AARCH64_CC_N, /* PL, N == 0. */
4831 0, /* VS, V == 1. */
4832 AARCH64_CC_V, /* VC, V == 0. */
4833 0, /* HI, C == 1 && Z == 0. */
4834 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4835 AARCH64_CC_V, /* GE, N == V. */
4836 0, /* LT, N != V. */
4837 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4838 0, /* LE, !(Z == 0 && N == V). */
4839 0, /* AL, Any. */
4840 0 /* NV, Any. */
4843 static void
4844 aarch64_print_operand (FILE *f, rtx x, int code)
4846 switch (code)
4848 /* An integer or symbol address without a preceding # sign. */
4849 case 'c':
4850 switch (GET_CODE (x))
4852 case CONST_INT:
4853 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4854 break;
4856 case SYMBOL_REF:
4857 output_addr_const (f, x);
4858 break;
4860 case CONST:
4861 if (GET_CODE (XEXP (x, 0)) == PLUS
4862 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4864 output_addr_const (f, x);
4865 break;
4867 /* Fall through. */
4869 default:
4870 output_operand_lossage ("Unsupported operand for code '%c'", code);
4872 break;
4874 case 'e':
4875 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4877 int n;
4879 if (!CONST_INT_P (x)
4880 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4882 output_operand_lossage ("invalid operand for '%%%c'", code);
4883 return;
4886 switch (n)
4888 case 3:
4889 fputc ('b', f);
4890 break;
4891 case 4:
4892 fputc ('h', f);
4893 break;
4894 case 5:
4895 fputc ('w', f);
4896 break;
4897 default:
4898 output_operand_lossage ("invalid operand for '%%%c'", code);
4899 return;
4902 break;
4904 case 'p':
4906 int n;
4908 /* Print N such that 2^N == X. */
4909 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4911 output_operand_lossage ("invalid operand for '%%%c'", code);
4912 return;
4915 asm_fprintf (f, "%d", n);
4917 break;
4919 case 'P':
4920 /* Print the number of non-zero bits in X (a const_int). */
4921 if (!CONST_INT_P (x))
4923 output_operand_lossage ("invalid operand for '%%%c'", code);
4924 return;
4927 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4928 break;
4930 case 'H':
4931 /* Print the higher numbered register of a pair (TImode) of regs. */
4932 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4934 output_operand_lossage ("invalid operand for '%%%c'", code);
4935 return;
4938 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4939 break;
4941 case 'M':
4942 case 'm':
4944 int cond_code;
4945 /* Print a condition (eq, ne, etc) or its inverse. */
4947 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4948 if (x == const_true_rtx)
4950 if (code == 'M')
4951 fputs ("nv", f);
4952 return;
4955 if (!COMPARISON_P (x))
4957 output_operand_lossage ("invalid operand for '%%%c'", code);
4958 return;
4961 cond_code = aarch64_get_condition_code (x);
4962 gcc_assert (cond_code >= 0);
4963 if (code == 'M')
4964 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4965 fputs (aarch64_condition_codes[cond_code], f);
4967 break;
4969 case 'b':
4970 case 'h':
4971 case 's':
4972 case 'd':
4973 case 'q':
4974 /* Print a scalar FP/SIMD register name. */
4975 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4977 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4978 return;
4980 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4981 break;
4983 case 'S':
4984 case 'T':
4985 case 'U':
4986 case 'V':
4987 /* Print the first FP/SIMD register name in a list. */
4988 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4990 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4991 return;
4993 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4994 break;
4996 case 'R':
4997 /* Print a scalar FP/SIMD register name + 1. */
4998 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5000 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5001 return;
5003 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5004 break;
5006 case 'X':
5007 /* Print bottom 16 bits of integer constant in hex. */
5008 if (!CONST_INT_P (x))
5010 output_operand_lossage ("invalid operand for '%%%c'", code);
5011 return;
5013 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5014 break;
5016 case 'w':
5017 case 'x':
5018 /* Print a general register name or the zero register (32-bit or
5019 64-bit). */
5020 if (x == const0_rtx
5021 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5023 asm_fprintf (f, "%czr", code);
5024 break;
5027 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5029 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5030 break;
5033 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5035 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5036 break;
5039 /* Fall through */
5041 case 0:
5042 /* Print a normal operand, if it's a general register, then we
5043 assume DImode. */
5044 if (x == NULL)
5046 output_operand_lossage ("missing operand");
5047 return;
5050 switch (GET_CODE (x))
5052 case REG:
5053 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5054 break;
5056 case MEM:
5057 output_address (GET_MODE (x), XEXP (x, 0));
5058 break;
5060 case CONST:
5061 case LABEL_REF:
5062 case SYMBOL_REF:
5063 output_addr_const (asm_out_file, x);
5064 break;
5066 case CONST_INT:
5067 asm_fprintf (f, "%wd", INTVAL (x));
5068 break;
5070 case CONST_VECTOR:
5071 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5073 gcc_assert (
5074 aarch64_const_vec_all_same_in_range_p (x,
5075 HOST_WIDE_INT_MIN,
5076 HOST_WIDE_INT_MAX));
5077 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5079 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5081 fputc ('0', f);
5083 else
5084 gcc_unreachable ();
5085 break;
5087 case CONST_DOUBLE:
5088 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5089 be getting CONST_DOUBLEs holding integers. */
5090 gcc_assert (GET_MODE (x) != VOIDmode);
5091 if (aarch64_float_const_zero_rtx_p (x))
5093 fputc ('0', f);
5094 break;
5096 else if (aarch64_float_const_representable_p (x))
5098 #define buf_size 20
5099 char float_buf[buf_size] = {'\0'};
5100 real_to_decimal_for_mode (float_buf,
5101 CONST_DOUBLE_REAL_VALUE (x),
5102 buf_size, buf_size,
5103 1, GET_MODE (x));
5104 asm_fprintf (asm_out_file, "%s", float_buf);
5105 break;
5106 #undef buf_size
5108 output_operand_lossage ("invalid constant");
5109 return;
5110 default:
5111 output_operand_lossage ("invalid operand");
5112 return;
5114 break;
5116 case 'A':
5117 if (GET_CODE (x) == HIGH)
5118 x = XEXP (x, 0);
5120 switch (aarch64_classify_symbolic_expression (x))
5122 case SYMBOL_SMALL_GOT_4G:
5123 asm_fprintf (asm_out_file, ":got:");
5124 break;
5126 case SYMBOL_SMALL_TLSGD:
5127 asm_fprintf (asm_out_file, ":tlsgd:");
5128 break;
5130 case SYMBOL_SMALL_TLSDESC:
5131 asm_fprintf (asm_out_file, ":tlsdesc:");
5132 break;
5134 case SYMBOL_SMALL_TLSIE:
5135 asm_fprintf (asm_out_file, ":gottprel:");
5136 break;
5138 case SYMBOL_TLSLE24:
5139 asm_fprintf (asm_out_file, ":tprel:");
5140 break;
5142 case SYMBOL_TINY_GOT:
5143 gcc_unreachable ();
5144 break;
5146 default:
5147 break;
5149 output_addr_const (asm_out_file, x);
5150 break;
5152 case 'L':
5153 switch (aarch64_classify_symbolic_expression (x))
5155 case SYMBOL_SMALL_GOT_4G:
5156 asm_fprintf (asm_out_file, ":lo12:");
5157 break;
5159 case SYMBOL_SMALL_TLSGD:
5160 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5161 break;
5163 case SYMBOL_SMALL_TLSDESC:
5164 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5165 break;
5167 case SYMBOL_SMALL_TLSIE:
5168 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5169 break;
5171 case SYMBOL_TLSLE12:
5172 asm_fprintf (asm_out_file, ":tprel_lo12:");
5173 break;
5175 case SYMBOL_TLSLE24:
5176 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5177 break;
5179 case SYMBOL_TINY_GOT:
5180 asm_fprintf (asm_out_file, ":got:");
5181 break;
5183 case SYMBOL_TINY_TLSIE:
5184 asm_fprintf (asm_out_file, ":gottprel:");
5185 break;
5187 default:
5188 break;
5190 output_addr_const (asm_out_file, x);
5191 break;
5193 case 'G':
5195 switch (aarch64_classify_symbolic_expression (x))
5197 case SYMBOL_TLSLE24:
5198 asm_fprintf (asm_out_file, ":tprel_hi12:");
5199 break;
5200 default:
5201 break;
5203 output_addr_const (asm_out_file, x);
5204 break;
5206 case 'k':
5208 HOST_WIDE_INT cond_code;
5209 /* Print nzcv. */
5211 if (!CONST_INT_P (x))
5213 output_operand_lossage ("invalid operand for '%%%c'", code);
5214 return;
5217 cond_code = INTVAL (x);
5218 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5219 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5221 break;
5223 default:
5224 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5225 return;
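/* Illustrative uses of the modifiers above, as they might appear in insn
   templates (hypothetical operands): "%w0" prints w3 when operand 0 is
   register x3 and wzr when it is const0_rtx; "%x0" prints the x-register
   form; "%d1" and "%s1" print the scalar FP/SIMD names (d5, s5) of a
   V-register operand; "%H2" prints the higher register of a TImode pair;
   and "%m3"/"%M3" print a condition such as "eq" or its inverse "ne".  */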
5229 static void
5230 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5232 struct aarch64_address_info addr;
5234 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5235 switch (addr.type)
5237 case ADDRESS_REG_IMM:
5238 if (addr.offset == const0_rtx)
5239 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5240 else
5241 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5242 INTVAL (addr.offset));
5243 return;
5245 case ADDRESS_REG_REG:
5246 if (addr.shift == 0)
5247 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5248 reg_names [REGNO (addr.offset)]);
5249 else
5250 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5251 reg_names [REGNO (addr.offset)], addr.shift);
5252 return;
5254 case ADDRESS_REG_UXTW:
5255 if (addr.shift == 0)
5256 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5257 REGNO (addr.offset) - R0_REGNUM);
5258 else
5259 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5260 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5261 return;
5263 case ADDRESS_REG_SXTW:
5264 if (addr.shift == 0)
5265 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5266 REGNO (addr.offset) - R0_REGNUM);
5267 else
5268 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5269 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5270 return;
5272 case ADDRESS_REG_WB:
5273 switch (GET_CODE (x))
5275 case PRE_INC:
5276 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5277 GET_MODE_SIZE (mode));
5278 return;
5279 case POST_INC:
5280 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5281 GET_MODE_SIZE (mode));
5282 return;
5283 case PRE_DEC:
5284 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5285 GET_MODE_SIZE (mode));
5286 return;
5287 case POST_DEC:
5288 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5289 GET_MODE_SIZE (mode));
5290 return;
5291 case PRE_MODIFY:
5292 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5293 INTVAL (addr.offset));
5294 return;
5295 case POST_MODIFY:
5296 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5297 INTVAL (addr.offset));
5298 return;
5299 default:
5300 break;
5302 break;
5304 case ADDRESS_LO_SUM:
5305 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5306 output_addr_const (f, addr.offset);
5307 asm_fprintf (f, "]");
5308 return;
5310 case ADDRESS_SYMBOLIC:
5311 break;
5314 output_addr_const (f, x);
5317 bool
5318 aarch64_label_mentioned_p (rtx x)
5320 const char *fmt;
5321 int i;
5323 if (GET_CODE (x) == LABEL_REF)
5324 return true;
5326 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5327 referencing instruction, but they are constant offsets, not
5328 symbols. */
5329 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5330 return false;
5332 fmt = GET_RTX_FORMAT (GET_CODE (x));
5333 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5335 if (fmt[i] == 'E')
5337 int j;
5339 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5340 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5341 return 1;
5343 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5344 return 1;
5347 return 0;
5350 /* Implement REGNO_REG_CLASS. */
5352 enum reg_class
5353 aarch64_regno_regclass (unsigned regno)
5355 if (GP_REGNUM_P (regno))
5356 return GENERAL_REGS;
5358 if (regno == SP_REGNUM)
5359 return STACK_REG;
5361 if (regno == FRAME_POINTER_REGNUM
5362 || regno == ARG_POINTER_REGNUM)
5363 return POINTER_REGS;
5365 if (FP_REGNUM_P (regno))
5366 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5368 return NO_REGS;
5371 static rtx
5372 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5374 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5375 where mask is selected by alignment and size of the offset.
5376 We try to pick as large a range for the offset as possible to
5377 maximize the chance of a CSE. However, for aligned addresses
5378 we limit the range to 4k so that structures with different sized
5379 elements are likely to use the same base. We need to be careful
5380 not to split a CONST for some forms of address expression, otherwise
5381 it will generate sub-optimal code. */
5383 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5385 rtx base = XEXP (x, 0);
5386 rtx offset_rtx = XEXP (x, 1);
5387 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5389 if (GET_CODE (base) == PLUS)
5391 rtx op0 = XEXP (base, 0);
5392 rtx op1 = XEXP (base, 1);
5394 /* Force any scaling into a temp for CSE. */
5395 op0 = force_reg (Pmode, op0);
5396 op1 = force_reg (Pmode, op1);
5398 /* Let the pointer register be in op0. */
5399 if (REG_POINTER (op1))
5400 std::swap (op0, op1);
5402 /* If the pointer is virtual or frame related, then we know that
5403 virtual register instantiation or register elimination is going
5404 to apply a second constant. We want the two constants folded
5405 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5406 if (virt_or_elim_regno_p (REGNO (op0)))
5408 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5409 NULL_RTX, true, OPTAB_DIRECT);
5410 return gen_rtx_PLUS (Pmode, base, op1);
5413 /* Otherwise, in order to encourage CSE (and thence loop strength
5414 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5415 base = expand_binop (Pmode, add_optab, op0, op1,
5416 NULL_RTX, true, OPTAB_DIRECT);
5417 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5420 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5421 HOST_WIDE_INT base_offset;
5422 if (GET_MODE_SIZE (mode) > 16)
5423 base_offset = (offset + 0x400) & ~0x7f0;
5424 /* For offsets that aren't a multiple of the access size, the limit is
5425 -256...255. */
5426 else if (offset & (GET_MODE_SIZE (mode) - 1))
5428 base_offset = (offset + 0x100) & ~0x1ff;
5430 /* BLKmode typically uses LDP of X-registers. */
5431 if (mode == BLKmode)
5432 base_offset = (offset + 512) & ~0x3ff;
5434 /* Small negative offsets are supported. */
5435 else if (IN_RANGE (offset, -256, 0))
5436 base_offset = 0;
5437 else if (mode == TImode || mode == TFmode)
5438 base_offset = (offset + 0x100) & ~0x1ff;
5439 /* Use a 12-bit offset scaled by the access size. */
5440 else
5441 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5443 if (base_offset != 0)
5445 base = plus_constant (Pmode, base, base_offset);
5446 base = force_operand (base, NULL_RTX);
5447 return plus_constant (Pmode, base, offset - base_offset);
5451 return x;
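/* Worked example (illustrative only): for a DImode access to x1 + 0x13008
   the split above produces base = x1 + 0x10000 and a residual offset of
   0x3008, which fits the 12-bit scaled range of an 8-byte LDR/STR, so the
   adjusted base is likely to be shared by neighbouring accesses.  */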
5454 /* Return the reload icode required for a constant pool in mode. */
5455 static enum insn_code
5456 aarch64_constant_pool_reload_icode (machine_mode mode)
5458 switch (mode)
5460 case SFmode:
5461 return CODE_FOR_aarch64_reload_movcpsfdi;
5463 case DFmode:
5464 return CODE_FOR_aarch64_reload_movcpdfdi;
5466 case TFmode:
5467 return CODE_FOR_aarch64_reload_movcptfdi;
5469 case V8QImode:
5470 return CODE_FOR_aarch64_reload_movcpv8qidi;
5472 case V16QImode:
5473 return CODE_FOR_aarch64_reload_movcpv16qidi;
5475 case V4HImode:
5476 return CODE_FOR_aarch64_reload_movcpv4hidi;
5478 case V8HImode:
5479 return CODE_FOR_aarch64_reload_movcpv8hidi;
5481 case V2SImode:
5482 return CODE_FOR_aarch64_reload_movcpv2sidi;
5484 case V4SImode:
5485 return CODE_FOR_aarch64_reload_movcpv4sidi;
5487 case V2DImode:
5488 return CODE_FOR_aarch64_reload_movcpv2didi;
5490 case V2DFmode:
5491 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5493 default:
5494 gcc_unreachable ();
5497 gcc_unreachable ();
5499 static reg_class_t
5500 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5501 reg_class_t rclass,
5502 machine_mode mode,
5503 secondary_reload_info *sri)
5506 /* If we have to disable direct literal pool loads and stores because the
5507 function is too big, then we need a scratch register. */
5508 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5509 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5510 || targetm.vector_mode_supported_p (GET_MODE (x)))
5511 && !aarch64_pcrelative_literal_loads)
5513 sri->icode = aarch64_constant_pool_reload_icode (mode);
5514 return NO_REGS;
5517 /* Without the TARGET_SIMD instructions we cannot move a Q register
5518 to a Q register directly. We need a scratch. */
5519 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5520 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5521 && reg_class_subset_p (rclass, FP_REGS))
5523 if (mode == TFmode)
5524 sri->icode = CODE_FOR_aarch64_reload_movtf;
5525 else if (mode == TImode)
5526 sri->icode = CODE_FOR_aarch64_reload_movti;
5527 return NO_REGS;
5530 /* A TFmode or TImode memory access should be handled via FP_REGS
5531 because AArch64 has richer addressing modes for LDR/STR instructions
5532 than for LDP/STP instructions. */
5533 if (TARGET_FLOAT && rclass == GENERAL_REGS
5534 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5535 return FP_REGS;
5537 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5538 return GENERAL_REGS;
5540 return NO_REGS;
5543 static bool
5544 aarch64_can_eliminate (const int from, const int to)
5546 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5547 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5549 if (frame_pointer_needed)
5551 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5552 return true;
5553 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5554 return false;
5555 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5556 && !cfun->calls_alloca)
5557 return true;
5558 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5559 return true;
5561 return false;
5563 else
5565 /* If we decided that we didn't need a leaf frame pointer but then used
5566 LR in the function, then we'll want a frame pointer after all, so
5567 prevent this elimination to ensure a frame pointer is used. */
5568 if (to == STACK_POINTER_REGNUM
5569 && flag_omit_leaf_frame_pointer
5570 && df_regs_ever_live_p (LR_REGNUM))
5571 return false;
5574 return true;
5577 HOST_WIDE_INT
5578 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5580 aarch64_layout_frame ();
5582 if (to == HARD_FRAME_POINTER_REGNUM)
5584 if (from == ARG_POINTER_REGNUM)
5585 return cfun->machine->frame.hard_fp_offset;
5587 if (from == FRAME_POINTER_REGNUM)
5588 return cfun->machine->frame.hard_fp_offset
5589 - cfun->machine->frame.locals_offset;
5592 if (to == STACK_POINTER_REGNUM)
5594 if (from == FRAME_POINTER_REGNUM)
5595 return cfun->machine->frame.frame_size
5596 - cfun->machine->frame.locals_offset;
5599 return cfun->machine->frame.frame_size;
5602 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5603 previous frame. */
5606 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5608 if (count != 0)
5609 return const0_rtx;
5610 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5614 static void
5615 aarch64_asm_trampoline_template (FILE *f)
5617 if (TARGET_ILP32)
5619 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5620 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5622 else
5624 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5625 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5627 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5628 assemble_aligned_integer (4, const0_rtx);
5629 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5630 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5633 static void
5634 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5636 rtx fnaddr, mem, a_tramp;
5637 const int tramp_code_sz = 16;
5639 /* Don't need to copy the trailing D-words, we fill those in below. */
5640 emit_block_move (m_tramp, assemble_trampoline_template (),
5641 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5642 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5643 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5644 if (GET_MODE (fnaddr) != ptr_mode)
5645 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5646 emit_move_insn (mem, fnaddr);
5648 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5649 emit_move_insn (mem, chain_value);
5651 /* XXX We should really define a "clear_cache" pattern and use
5652 gen_clear_cache(). */
5653 a_tramp = XEXP (m_tramp, 0);
5654 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5655 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5656 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5657 ptr_mode);
5660 static unsigned char
5661 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5663 switch (regclass)
5665 case CALLER_SAVE_REGS:
5666 case POINTER_REGS:
5667 case GENERAL_REGS:
5668 case ALL_REGS:
5669 case FP_REGS:
5670 case FP_LO_REGS:
5671 return
5672 aarch64_vector_mode_p (mode)
5673 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5674 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5675 case STACK_REG:
5676 return 1;
5678 case NO_REGS:
5679 return 0;
5681 default:
5682 break;
5684 gcc_unreachable ();
5687 static reg_class_t
5688 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5690 if (regclass == POINTER_REGS)
5691 return GENERAL_REGS;
5693 if (regclass == STACK_REG)
5695 if (REG_P(x)
5696 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5697 return regclass;
5699 return NO_REGS;
5702 /* If it's an integer immediate that MOVI can't handle, then
5703 FP_REGS is not an option, so we return NO_REGS instead. */
5704 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5705 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5706 return NO_REGS;
5708 /* Register elimination can result in a request for
5709 SP+constant->FP_REGS. We cannot support such operations, which
5710 use SP as the source and an FP_REG as the destination, so reject
5711 them right now. */
5712 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5714 rtx lhs = XEXP (x, 0);
5716 /* Look through a possible SUBREG introduced by ILP32. */
5717 if (GET_CODE (lhs) == SUBREG)
5718 lhs = SUBREG_REG (lhs);
5720 gcc_assert (REG_P (lhs));
5721 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5722 POINTER_REGS));
5723 return NO_REGS;
5726 return regclass;
5729 void
5730 aarch64_asm_output_labelref (FILE* f, const char *name)
5732 asm_fprintf (f, "%U%s", name);
5735 static void
5736 aarch64_elf_asm_constructor (rtx symbol, int priority)
5738 if (priority == DEFAULT_INIT_PRIORITY)
5739 default_ctor_section_asm_out_constructor (symbol, priority);
5740 else
5742 section *s;
5743 char buf[18];
5744 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5745 s = get_section (buf, SECTION_WRITE, NULL);
5746 switch_to_section (s);
5747 assemble_align (POINTER_SIZE);
5748 assemble_aligned_integer (POINTER_BYTES, symbol);
5752 static void
5753 aarch64_elf_asm_destructor (rtx symbol, int priority)
5755 if (priority == DEFAULT_INIT_PRIORITY)
5756 default_dtor_section_asm_out_destructor (symbol, priority);
5757 else
5759 section *s;
5760 char buf[18];
5761 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5762 s = get_section (buf, SECTION_WRITE, NULL);
5763 switch_to_section (s);
5764 assemble_align (POINTER_SIZE);
5765 assemble_aligned_integer (POINTER_BYTES, symbol);
5769 const char*
5770 aarch64_output_casesi (rtx *operands)
5772 char buf[100];
5773 char label[100];
5774 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5775 int index;
5776 static const char *const patterns[4][2] =
5779 "ldrb\t%w3, [%0,%w1,uxtw]",
5780 "add\t%3, %4, %w3, sxtb #2"
5783 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5784 "add\t%3, %4, %w3, sxth #2"
5787 "ldr\t%w3, [%0,%w1,uxtw #2]",
5788 "add\t%3, %4, %w3, sxtw #2"
5790 /* We assume that DImode is only generated when not optimizing and
5791 that we don't really need 64-bit address offsets. That would
5792 imply an object file with 8GB of code in a single function! */
5794 "ldr\t%w3, [%0,%w1,uxtw #2]",
5795 "add\t%3, %4, %w3, sxtw #2"
5799 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5801 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5803 gcc_assert (index >= 0 && index <= 3);
5805 /* Need to implement table size reduction, by changing the code below. */
5806 output_asm_insn (patterns[index][0], operands);
5807 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5808 snprintf (buf, sizeof (buf),
5809 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5810 output_asm_insn (buf, operands);
5811 output_asm_insn (patterns[index][1], operands);
5812 output_asm_insn ("br\t%3", operands);
5813 assemble_label (asm_out_file, label);
5814 return "";
5818 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5819 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5820 operator. */
5823 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5825 if (shift >= 0 && shift <= 3)
5827 int size;
5828 for (size = 8; size <= 32; size *= 2)
5830 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5831 if (mask == bits << shift)
5832 return size;
5835 return 0;
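/* For illustration (example values only): aarch64_uxt_size (1, 0x1fe)
   returns 8 (a UXTB shifted left by 1), aarch64_uxt_size (2, 0x3fffc)
   returns 16 (UXTH, LSL 2), and aarch64_uxt_size (0, 0xffffffff) returns
   32 (UXTW); shift/mask combinations that do not match one of these
   patterns return 0.  */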
5838 /* Constant pools are per-function only when PC-relative
5839 literal loads are enabled or we are in the large memory
5840 model. */
5842 static inline bool
5843 aarch64_can_use_per_function_literal_pools_p (void)
5845 return (aarch64_pcrelative_literal_loads
5846 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5849 static bool
5850 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5852 /* Fixme: In an ideal world this would work similarly
5853 to the logic in aarch64_select_rtx_section, but this
5854 breaks bootstrap in gccgo. For now we work around
5855 this by returning false here. */
5856 return false;
5859 /* Select appropriate section for constants depending
5860 on where we place literal pools. */
5862 static section *
5863 aarch64_select_rtx_section (machine_mode mode,
5864 rtx x,
5865 unsigned HOST_WIDE_INT align)
5867 if (aarch64_can_use_per_function_literal_pools_p ())
5868 return function_section (current_function_decl);
5870 return default_elf_select_rtx_section (mode, x, align);
5873 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5874 void
5875 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5876 HOST_WIDE_INT offset)
5878 /* When using per-function literal pools, we must ensure that any code
5879 section is aligned to the minimal instruction length, lest we get
5880 errors from the assembler re "unaligned instructions". */
5881 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5882 ASM_OUTPUT_ALIGN (f, 2);
5885 /* Costs. */
5887 /* Helper function for rtx cost calculation. Strip a shift expression
5888 from X. Returns the inner operand if successful, or the original
5889 expression on failure. */
5890 static rtx
5891 aarch64_strip_shift (rtx x)
5893 rtx op = x;
5895 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5896 we can convert both to ROR during final output. */
5897 if ((GET_CODE (op) == ASHIFT
5898 || GET_CODE (op) == ASHIFTRT
5899 || GET_CODE (op) == LSHIFTRT
5900 || GET_CODE (op) == ROTATERT
5901 || GET_CODE (op) == ROTATE)
5902 && CONST_INT_P (XEXP (op, 1)))
5903 return XEXP (op, 0);
5905 if (GET_CODE (op) == MULT
5906 && CONST_INT_P (XEXP (op, 1))
5907 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5908 return XEXP (op, 0);
5910 return x;
5913 /* Helper function for rtx cost calculation. Strip an extend
5914 expression from X. Returns the inner operand if successful, or the
5915 original expression on failure. We deal with a number of possible
5916 canonicalization variations here. */
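/* For example, the zero-extend-as-AND form
   (and (mult (reg:DI x0) (const_int 4)) (const_int 1020)) is stripped to
   (reg:DI x0), since 1020 is 0xff shifted left by two (a UXTB #2 operand).
   Illustrative only; the exact RTL shapes accepted are listed below.  */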
5917 static rtx
5918 aarch64_strip_extend (rtx x)
5920 rtx op = x;
5922 /* Zero and sign extraction of a widened value. */
5923 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5924 && XEXP (op, 2) == const0_rtx
5925 && GET_CODE (XEXP (op, 0)) == MULT
5926 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5927 XEXP (op, 1)))
5928 return XEXP (XEXP (op, 0), 0);
5930 /* It can also be represented (for zero-extend) as an AND with an
5931 immediate. */
5932 if (GET_CODE (op) == AND
5933 && GET_CODE (XEXP (op, 0)) == MULT
5934 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5935 && CONST_INT_P (XEXP (op, 1))
5936 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5937 INTVAL (XEXP (op, 1))) != 0)
5938 return XEXP (XEXP (op, 0), 0);
5940 /* Now handle extended register, as this may also have an optional
5941 left shift by 1..4. */
5942 if (GET_CODE (op) == ASHIFT
5943 && CONST_INT_P (XEXP (op, 1))
5944 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5945 op = XEXP (op, 0);
5947 if (GET_CODE (op) == ZERO_EXTEND
5948 || GET_CODE (op) == SIGN_EXTEND)
5949 op = XEXP (op, 0);
5951 if (op != x)
5952 return op;
5954 return x;
5957 /* Return true iff CODE is a shift supported in combination
5958 with arithmetic instructions. */
5960 static bool
5961 aarch64_shift_p (enum rtx_code code)
5963 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5966 /* Helper function for rtx cost calculation. Calculate the cost of
5967 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5968 Return the calculated cost of the expression, recursing manually into
5969 operands where needed. */
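/* For example, inside a PLUS a (mult X (const_int 8)) is treated as an
   add-with-shift (shift by three) rather than as a multiply-add, since
   that is how it will be canonicalized and emitted.  */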
5971 static int
5972 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5974 rtx op0, op1;
5975 const struct cpu_cost_table *extra_cost
5976 = aarch64_tune_params.insn_extra_cost;
5977 int cost = 0;
5978 bool compound_p = (outer == PLUS || outer == MINUS);
5979 machine_mode mode = GET_MODE (x);
5981 gcc_checking_assert (code == MULT);
5983 op0 = XEXP (x, 0);
5984 op1 = XEXP (x, 1);
5986 if (VECTOR_MODE_P (mode))
5987 mode = GET_MODE_INNER (mode);
5989 /* Integer multiply/fma. */
5990 if (GET_MODE_CLASS (mode) == MODE_INT)
5992 /* The multiply will be canonicalized as a shift; cost it as such. */
5993 if (aarch64_shift_p (GET_CODE (x))
5994 || (CONST_INT_P (op1)
5995 && exact_log2 (INTVAL (op1)) > 0))
5997 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5998 || GET_CODE (op0) == SIGN_EXTEND;
5999 if (speed)
6001 if (compound_p)
6003 if (REG_P (op1))
6004 /* ARITH + shift-by-register. */
6005 cost += extra_cost->alu.arith_shift_reg;
6006 else if (is_extend)
6007 /* ARITH + extended register. We don't have a cost field
6008 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6009 cost += extra_cost->alu.extend_arith;
6010 else
6011 /* ARITH + shift-by-immediate. */
6012 cost += extra_cost->alu.arith_shift;
6014 else
6015 /* LSL (immediate). */
6016 cost += extra_cost->alu.shift;
6019 /* Strip extends as we will have costed them in the case above. */
6020 if (is_extend)
6021 op0 = aarch64_strip_extend (op0);
6023 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6025 return cost;
6028 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6029 compound and let the cases below handle it. After all, MNEG is a
6030 special-case alias of MSUB. */
6031 if (GET_CODE (op0) == NEG)
6033 op0 = XEXP (op0, 0);
6034 compound_p = true;
6037 /* Integer multiplies or FMAs have zero/sign extending variants. */
6038 if ((GET_CODE (op0) == ZERO_EXTEND
6039 && GET_CODE (op1) == ZERO_EXTEND)
6040 || (GET_CODE (op0) == SIGN_EXTEND
6041 && GET_CODE (op1) == SIGN_EXTEND))
6043 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6044 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6046 if (speed)
6048 if (compound_p)
6049 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6050 cost += extra_cost->mult[0].extend_add;
6051 else
6052 /* MUL/SMULL/UMULL. */
6053 cost += extra_cost->mult[0].extend;
6056 return cost;
6059 /* This is either an integer multiply or a MADD. In both cases
6060 we want to recurse and cost the operands. */
6061 cost += rtx_cost (op0, mode, MULT, 0, speed);
6062 cost += rtx_cost (op1, mode, MULT, 1, speed);
6064 if (speed)
6066 if (compound_p)
6067 /* MADD/MSUB. */
6068 cost += extra_cost->mult[mode == DImode].add;
6069 else
6070 /* MUL. */
6071 cost += extra_cost->mult[mode == DImode].simple;
6074 return cost;
6076 else
6078 if (speed)
6080 /* Floating-point FMA/FMUL can also support negations of the
6081 operands, unless the rounding mode is upward or downward in
6082 which case FNMUL is different from FMUL with operand negation. */
6083 bool neg0 = GET_CODE (op0) == NEG;
6084 bool neg1 = GET_CODE (op1) == NEG;
6085 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6087 if (neg0)
6088 op0 = XEXP (op0, 0);
6089 if (neg1)
6090 op1 = XEXP (op1, 0);
6093 if (compound_p)
6094 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6095 cost += extra_cost->fp[mode == DFmode].fma;
6096 else
6097 /* FMUL/FNMUL. */
6098 cost += extra_cost->fp[mode == DFmode].mult;
6101 cost += rtx_cost (op0, mode, MULT, 0, speed);
6102 cost += rtx_cost (op1, mode, MULT, 1, speed);
6103 return cost;
6107 static int
6108 aarch64_address_cost (rtx x,
6109 machine_mode mode,
6110 addr_space_t as ATTRIBUTE_UNUSED,
6111 bool speed)
6113 enum rtx_code c = GET_CODE (x);
6114 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6115 struct aarch64_address_info info;
6116 int cost = 0;
6117 info.shift = 0;
6119 if (!aarch64_classify_address (&info, x, mode, c, false))
6121 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6123 /* This is a CONST or SYMBOL ref which will be split
6124 in a different way depending on the code model in use.
6125 Cost it through the generic infrastructure. */
6126 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6127 /* Divide through by the cost of one instruction to
6128 bring it to the same units as the address costs. */
6129 cost_symbol_ref /= COSTS_N_INSNS (1);
6130 /* The cost is then the cost of preparing the address,
6131 followed by an immediate (possibly 0) offset. */
6132 return cost_symbol_ref + addr_cost->imm_offset;
6134 else
6136 /* This is most likely a jump table from a case
6137 statement. */
6138 return addr_cost->register_offset;
6142 switch (info.type)
6144 case ADDRESS_LO_SUM:
6145 case ADDRESS_SYMBOLIC:
6146 case ADDRESS_REG_IMM:
6147 cost += addr_cost->imm_offset;
6148 break;
6150 case ADDRESS_REG_WB:
6151 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6152 cost += addr_cost->pre_modify;
6153 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6154 cost += addr_cost->post_modify;
6155 else
6156 gcc_unreachable ();
6158 break;
6160 case ADDRESS_REG_REG:
6161 cost += addr_cost->register_offset;
6162 break;
6164 case ADDRESS_REG_SXTW:
6165 cost += addr_cost->register_sextend;
6166 break;
6168 case ADDRESS_REG_UXTW:
6169 cost += addr_cost->register_zextend;
6170 break;
6172 default:
6173 gcc_unreachable ();
6177 if (info.shift > 0)
6179 /* For the sake of calculating the cost of the shifted register
6180 component, we can treat same sized modes in the same way. */
6181 switch (GET_MODE_BITSIZE (mode))
6183 case 16:
6184 cost += addr_cost->addr_scale_costs.hi;
6185 break;
6187 case 32:
6188 cost += addr_cost->addr_scale_costs.si;
6189 break;
6191 case 64:
6192 cost += addr_cost->addr_scale_costs.di;
6193 break;
6195 /* We can't tell, or this is a 128-bit vector. */
6196 default:
6197 cost += addr_cost->addr_scale_costs.ti;
6198 break;
6202 return cost;
6205 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6206 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6207 to be taken. */
6210 aarch64_branch_cost (bool speed_p, bool predictable_p)
6212 /* When optimizing for speed, use the cost of unpredictable branches. */
6213 const struct cpu_branch_cost *branch_costs =
6214 aarch64_tune_params.branch_costs;
6216 if (!speed_p || predictable_p)
6217 return branch_costs->predictable;
6218 else
6219 return branch_costs->unpredictable;
6222 /* Return true if the RTX X in mode MODE is a zero or sign extract
6223 usable in an ADD or SUB (extended register) instruction. */
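/* For example, (zero_extend:DI (reg:SI w1)) matches the plain
   ADD/SUB (extended register) form, while the shifted variant appears
   as a ZERO_EXTRACT/SIGN_EXTRACT of a MULT by a power of two.  */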
6224 static bool
6225 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6227 /* Catch add with a sign extract.
6228 This is add_<optab><mode>_multp2. */
6229 if (GET_CODE (x) == SIGN_EXTRACT
6230 || GET_CODE (x) == ZERO_EXTRACT)
6232 rtx op0 = XEXP (x, 0);
6233 rtx op1 = XEXP (x, 1);
6234 rtx op2 = XEXP (x, 2);
6236 if (GET_CODE (op0) == MULT
6237 && CONST_INT_P (op1)
6238 && op2 == const0_rtx
6239 && CONST_INT_P (XEXP (op0, 1))
6240 && aarch64_is_extend_from_extract (mode,
6241 XEXP (op0, 1),
6242 op1))
6244 return true;
6247 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6248 No shift. */
6249 else if (GET_CODE (x) == SIGN_EXTEND
6250 || GET_CODE (x) == ZERO_EXTEND)
6251 return REG_P (XEXP (x, 0));
6253 return false;
6256 static bool
6257 aarch64_frint_unspec_p (unsigned int u)
6259 switch (u)
6261 case UNSPEC_FRINTZ:
6262 case UNSPEC_FRINTP:
6263 case UNSPEC_FRINTM:
6264 case UNSPEC_FRINTA:
6265 case UNSPEC_FRINTN:
6266 case UNSPEC_FRINTX:
6267 case UNSPEC_FRINTI:
6268 return true;
6270 default:
6271 return false;
6275 /* Return true iff X is an rtx that will match an extr instruction
6276 i.e. as described in the *extr<mode>5_insn family of patterns.
6277 OP0 and OP1 will be set to the operands of the shifts involved
6278 on success and will be NULL_RTX otherwise. */
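/* For example, in DImode
   (ior (ashift X (const_int 16)) (lshiftrt Y (const_int 48)))
   matches, since 16 + 48 == 64; *RES_OP0 is set to X and *RES_OP1 to Y.  */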
6280 static bool
6281 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6283 rtx op0, op1;
6284 machine_mode mode = GET_MODE (x);
6286 *res_op0 = NULL_RTX;
6287 *res_op1 = NULL_RTX;
6289 if (GET_CODE (x) != IOR)
6290 return false;
6292 op0 = XEXP (x, 0);
6293 op1 = XEXP (x, 1);
6295 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6296 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6298 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6299 if (GET_CODE (op1) == ASHIFT)
6300 std::swap (op0, op1);
6302 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6303 return false;
6305 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6306 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6308 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6309 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6311 *res_op0 = XEXP (op0, 0);
6312 *res_op1 = XEXP (op1, 0);
6313 return true;
6317 return false;
6320 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6321 storing it in *COST. Result is true if the total cost of the operation
6322 has now been calculated. */
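/* For example, a conditional branch such as
   (if_then_else (ne (reg X) (const_int 0)) (label_ref L) (pc))
   is recognized as a CBNZ and only the cost of the compared register
   is added.  */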
6323 static bool
6324 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6326 rtx inner;
6327 rtx comparator;
6328 enum rtx_code cmpcode;
6330 if (COMPARISON_P (op0))
6332 inner = XEXP (op0, 0);
6333 comparator = XEXP (op0, 1);
6334 cmpcode = GET_CODE (op0);
6336 else
6338 inner = op0;
6339 comparator = const0_rtx;
6340 cmpcode = NE;
6343 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6345 /* Conditional branch. */
6346 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6347 return true;
6348 else
6350 if (cmpcode == NE || cmpcode == EQ)
6352 if (comparator == const0_rtx)
6354 /* TBZ/TBNZ/CBZ/CBNZ. */
6355 if (GET_CODE (inner) == ZERO_EXTRACT)
6356 /* TBZ/TBNZ. */
6357 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6358 ZERO_EXTRACT, 0, speed);
6359 else
6360 /* CBZ/CBNZ. */
6361 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6363 return true;
6366 else if (cmpcode == LT || cmpcode == GE)
6368 /* TBZ/TBNZ. */
6369 if (comparator == const0_rtx)
6370 return true;
6374 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6376 /* CCMP. */
6377 if (GET_CODE (op1) == COMPARE)
6379 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6380 if (XEXP (op1, 1) == const0_rtx)
6381 *cost += 1;
6382 if (speed)
6384 machine_mode mode = GET_MODE (XEXP (op1, 0));
6385 const struct cpu_cost_table *extra_cost
6386 = aarch64_tune_params.insn_extra_cost;
6388 if (GET_MODE_CLASS (mode) == MODE_INT)
6389 *cost += extra_cost->alu.arith;
6390 else
6391 *cost += extra_cost->fp[mode == DFmode].compare;
6393 return true;
6396 /* It's a conditional operation based on the status flags,
6397 so it must be some flavor of CSEL. */
6399 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6400 if (GET_CODE (op1) == NEG
6401 || GET_CODE (op1) == NOT
6402 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6403 op1 = XEXP (op1, 0);
6404 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6406 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6407 op1 = XEXP (op1, 0);
6408 op2 = XEXP (op2, 0);
6411 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6412 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6413 return true;
6416 /* We don't know what this is, cost all operands. */
6417 return false;
6420 /* Check whether X is a bitfield operation of the form shift + extend that
6421 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6422 operand to which the bitfield operation is applied. Otherwise return
6423 NULL_RTX. */
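/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI X) (const_int 3)))
   maps to a UBFX and the function returns (reg:HI X).  */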
6425 static rtx
6426 aarch64_extend_bitfield_pattern_p (rtx x)
6428 rtx_code outer_code = GET_CODE (x);
6429 machine_mode outer_mode = GET_MODE (x);
6431 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6432 && outer_mode != SImode && outer_mode != DImode)
6433 return NULL_RTX;
6435 rtx inner = XEXP (x, 0);
6436 rtx_code inner_code = GET_CODE (inner);
6437 machine_mode inner_mode = GET_MODE (inner);
6438 rtx op = NULL_RTX;
6440 switch (inner_code)
6442 case ASHIFT:
6443 if (CONST_INT_P (XEXP (inner, 1))
6444 && (inner_mode == QImode || inner_mode == HImode))
6445 op = XEXP (inner, 0);
6446 break;
6447 case LSHIFTRT:
6448 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6449 && (inner_mode == QImode || inner_mode == HImode))
6450 op = XEXP (inner, 0);
6451 break;
6452 case ASHIFTRT:
6453 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6454 && (inner_mode == QImode || inner_mode == HImode))
6455 op = XEXP (inner, 0);
6456 break;
6457 default:
6458 break;
6461 return op;
6464 /* Return true if the mask and a shift amount from an RTX of the form
6465 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6466 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
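/* For example, in SImode a shift amount of 4 and a mask of 0xff0 are
   accepted: (0xff0 >> 4) + 1 is a power of two and no mask bits fall
   below the shift, so the combination forms a UBFIZ with lsb 4 and
   width 8.  */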
6468 bool
6469 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6471 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6472 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6473 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6474 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6477 /* Calculate the cost of calculating X, storing it in *COST. Result
6478 is true if the total cost of the operation has now been calculated. */
6479 static bool
6480 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6481 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6483 rtx op0, op1, op2;
6484 const struct cpu_cost_table *extra_cost
6485 = aarch64_tune_params.insn_extra_cost;
6486 int code = GET_CODE (x);
6488 /* By default, assume that everything has equivalent cost to the
6489 cheapest instruction. Any additional costs are applied as a delta
6490 above this default. */
6491 *cost = COSTS_N_INSNS (1);
6493 switch (code)
6495 case SET:
6496 /* The cost depends entirely on the operands to SET. */
6497 *cost = 0;
6498 op0 = SET_DEST (x);
6499 op1 = SET_SRC (x);
6501 switch (GET_CODE (op0))
6503 case MEM:
6504 if (speed)
6506 rtx address = XEXP (op0, 0);
6507 if (VECTOR_MODE_P (mode))
6508 *cost += extra_cost->ldst.storev;
6509 else if (GET_MODE_CLASS (mode) == MODE_INT)
6510 *cost += extra_cost->ldst.store;
6511 else if (mode == SFmode)
6512 *cost += extra_cost->ldst.storef;
6513 else if (mode == DFmode)
6514 *cost += extra_cost->ldst.stored;
6516 *cost +=
6517 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6518 0, speed));
6521 *cost += rtx_cost (op1, mode, SET, 1, speed);
6522 return true;
6524 case SUBREG:
6525 if (! REG_P (SUBREG_REG (op0)))
6526 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6528 /* Fall through. */
6529 case REG:
6530 /* The cost is one per vector-register copied. */
6531 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6533 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6534 / GET_MODE_SIZE (V4SImode);
6535 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6537 /* const0_rtx is in general free, but we will use an
6538 instruction to set a register to 0. */
6539 else if (REG_P (op1) || op1 == const0_rtx)
6541 /* The cost is 1 per register copied. */
6542 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6543 / UNITS_PER_WORD;
6544 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6546 else
6547 /* Cost is just the cost of the RHS of the set. */
6548 *cost += rtx_cost (op1, mode, SET, 1, speed);
6549 return true;
6551 case ZERO_EXTRACT:
6552 case SIGN_EXTRACT:
6553 /* Bit-field insertion. Strip any redundant widening of
6554 the RHS to meet the width of the target. */
6555 if (GET_CODE (op1) == SUBREG)
6556 op1 = SUBREG_REG (op1);
6557 if ((GET_CODE (op1) == ZERO_EXTEND
6558 || GET_CODE (op1) == SIGN_EXTEND)
6559 && CONST_INT_P (XEXP (op0, 1))
6560 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6561 >= INTVAL (XEXP (op0, 1))))
6562 op1 = XEXP (op1, 0);
6564 if (CONST_INT_P (op1))
6566 /* MOV immediate is assumed to always be cheap. */
6567 *cost = COSTS_N_INSNS (1);
6569 else
6571 /* BFM. */
6572 if (speed)
6573 *cost += extra_cost->alu.bfi;
6574 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6577 return true;
6579 default:
6580 /* We can't make sense of this, assume default cost. */
6581 *cost = COSTS_N_INSNS (1);
6582 return false;
6584 return false;
6586 case CONST_INT:
6587 /* If an instruction can incorporate a constant within the
6588 instruction, the instruction's expression avoids calling
6589 rtx_cost() on the constant. If rtx_cost() is called on a
6590 constant, then it is usually because the constant must be
6591 moved into a register by one or more instructions.
6593 The exception is constant 0, which can be expressed
6594 as XZR/WZR and is therefore free. The exception to this is
6595 if we have (set (reg) (const0_rtx)) in which case we must cost
6596 the move. However, we can catch that when we cost the SET, so
6597 we don't need to consider that here. */
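/* As an illustrative data point (an assumption about the typical
   expansion, not taken from the code below): a 64-bit constant such as
   0x1234567890abcdef generally needs a MOVZ plus three MOVKs, so it is
   costed as four instructions, whereas a small immediate costs one.  */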
6598 if (x == const0_rtx)
6599 *cost = 0;
6600 else
6602 /* To an approximation, building any other constant is
6603 proportionally expensive to the number of instructions
6604 required to build that constant. This is true whether we
6605 are compiling for SPEED or otherwise. */
6606 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6607 (NULL_RTX, x, false, mode));
6609 return true;
6611 case CONST_DOUBLE:
6612 if (speed)
6614 /* mov[df,sf]_aarch64. */
6615 if (aarch64_float_const_representable_p (x))
6616 /* FMOV (scalar immediate). */
6617 *cost += extra_cost->fp[mode == DFmode].fpconst;
6618 else if (!aarch64_float_const_zero_rtx_p (x))
6620 /* This will be a load from memory. */
6621 if (mode == DFmode)
6622 *cost += extra_cost->ldst.loadd;
6623 else
6624 *cost += extra_cost->ldst.loadf;
6626 else
6627 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6628 or MOV v0.s[0], wzr - neither of which is modeled by the
6629 cost tables. Just use the default cost. */
6634 return true;
6636 case MEM:
6637 if (speed)
6639 /* For loads we want the base cost of a load, plus an
6640 approximation for the additional cost of the addressing
6641 mode. */
6642 rtx address = XEXP (x, 0);
6643 if (VECTOR_MODE_P (mode))
6644 *cost += extra_cost->ldst.loadv;
6645 else if (GET_MODE_CLASS (mode) == MODE_INT)
6646 *cost += extra_cost->ldst.load;
6647 else if (mode == SFmode)
6648 *cost += extra_cost->ldst.loadf;
6649 else if (mode == DFmode)
6650 *cost += extra_cost->ldst.loadd;
6652 *cost +=
6653 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6654 0, speed));
6657 return true;
6659 case NEG:
6660 op0 = XEXP (x, 0);
6662 if (VECTOR_MODE_P (mode))
6664 if (speed)
6666 /* FNEG. */
6667 *cost += extra_cost->vect.alu;
6669 return false;
6672 if (GET_MODE_CLASS (mode) == MODE_INT)
6674 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6675 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6677 /* CSETM. */
6678 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6679 return true;
6682 /* Cost this as SUB wzr, X. */
6683 op0 = CONST0_RTX (mode);
6684 op1 = XEXP (x, 0);
6685 goto cost_minus;
6688 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6690 /* Support (neg(fma...)) as a single instruction only if
6691 sign of zeros is unimportant. This matches the decision
6692 making in aarch64.md. */
6693 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6695 /* FNMADD. */
6696 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6697 return true;
6699 if (GET_CODE (op0) == MULT)
6701 /* FNMUL. */
6702 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6703 return true;
6705 if (speed)
6706 /* FNEG. */
6707 *cost += extra_cost->fp[mode == DFmode].neg;
6708 return false;
6711 return false;
6713 case CLRSB:
6714 case CLZ:
6715 if (speed)
6717 if (VECTOR_MODE_P (mode))
6718 *cost += extra_cost->vect.alu;
6719 else
6720 *cost += extra_cost->alu.clz;
6723 return false;
6725 case COMPARE:
6726 op0 = XEXP (x, 0);
6727 op1 = XEXP (x, 1);
6729 if (op1 == const0_rtx
6730 && GET_CODE (op0) == AND)
6732 x = op0;
6733 mode = GET_MODE (op0);
6734 goto cost_logic;
6737 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6739 /* TODO: A write to the CC flags possibly costs extra, this
6740 needs encoding in the cost tables. */
6742 mode = GET_MODE (op0);
6743 /* ANDS. */
6744 if (GET_CODE (op0) == AND)
6746 x = op0;
6747 goto cost_logic;
6750 if (GET_CODE (op0) == PLUS)
6752 /* ADDS (and CMN alias). */
6753 x = op0;
6754 goto cost_plus;
6757 if (GET_CODE (op0) == MINUS)
6759 /* SUBS. */
6760 x = op0;
6761 goto cost_minus;
6764 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6765 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6766 && CONST_INT_P (XEXP (op0, 2)))
6768 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6769 Handle it here directly rather than going to cost_logic
6770 since we know the immediate generated for the TST is valid
6771 so we can avoid creating an intermediate rtx for it only
6772 for costing purposes. */
6773 if (speed)
6774 *cost += extra_cost->alu.logical;
6776 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6777 ZERO_EXTRACT, 0, speed);
6778 return true;
6781 if (GET_CODE (op1) == NEG)
6783 /* CMN. */
6784 if (speed)
6785 *cost += extra_cost->alu.arith;
6787 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6788 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6789 return true;
6792 /* CMP.
6794 Compare can freely swap the order of operands, and
6795 canonicalization puts the more complex operation first.
6796 But the integer MINUS logic expects the shift/extend
6797 operation in op1. */
6798 if (! (REG_P (op0)
6799 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6801 op0 = XEXP (x, 1);
6802 op1 = XEXP (x, 0);
6804 goto cost_minus;
6807 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6809 /* FCMP. */
6810 if (speed)
6811 *cost += extra_cost->fp[mode == DFmode].compare;
6813 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6815 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6816 /* FCMP supports constant 0.0 for no extra cost. */
6817 return true;
6819 return false;
6822 if (VECTOR_MODE_P (mode))
6824 /* Vector compare. */
6825 if (speed)
6826 *cost += extra_cost->vect.alu;
6828 if (aarch64_float_const_zero_rtx_p (op1))
6830 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6831 cost. */
6832 return true;
6834 return false;
6836 return false;
6838 case MINUS:
6840 op0 = XEXP (x, 0);
6841 op1 = XEXP (x, 1);
6843 cost_minus:
6844 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6846 /* Detect valid immediates. */
6847 if ((GET_MODE_CLASS (mode) == MODE_INT
6848 || (GET_MODE_CLASS (mode) == MODE_CC
6849 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6850 && CONST_INT_P (op1)
6851 && aarch64_uimm12_shift (INTVAL (op1)))
6853 if (speed)
6854 /* SUB(S) (immediate). */
6855 *cost += extra_cost->alu.arith;
6856 return true;
6859 /* Look for SUB (extended register). */
6860 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6862 if (speed)
6863 *cost += extra_cost->alu.extend_arith;
6865 op1 = aarch64_strip_extend (op1);
6866 *cost += rtx_cost (op1, VOIDmode,
6867 (enum rtx_code) GET_CODE (op1), 0, speed);
6868 return true;
6871 rtx new_op1 = aarch64_strip_extend (op1);
6873 /* Cost this as an FMA-alike operation. */
6874 if ((GET_CODE (new_op1) == MULT
6875 || aarch64_shift_p (GET_CODE (new_op1)))
6876 && code != COMPARE)
6878 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6879 (enum rtx_code) code,
6880 speed);
6881 return true;
6884 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6886 if (speed)
6888 if (VECTOR_MODE_P (mode))
6890 /* Vector SUB. */
6891 *cost += extra_cost->vect.alu;
6893 else if (GET_MODE_CLASS (mode) == MODE_INT)
6895 /* SUB(S). */
6896 *cost += extra_cost->alu.arith;
6898 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6900 /* FSUB. */
6901 *cost += extra_cost->fp[mode == DFmode].addsub;
6904 return true;
6907 case PLUS:
6909 rtx new_op0;
6911 op0 = XEXP (x, 0);
6912 op1 = XEXP (x, 1);
6914 cost_plus:
6915 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6916 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6918 /* CSINC. */
6919 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6920 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6921 return true;
6924 if (GET_MODE_CLASS (mode) == MODE_INT
6925 && CONST_INT_P (op1)
6926 && aarch64_uimm12_shift (INTVAL (op1)))
6928 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6930 if (speed)
6931 /* ADD (immediate). */
6932 *cost += extra_cost->alu.arith;
6933 return true;
6936 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6938 /* Look for ADD (extended register). */
6939 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6941 if (speed)
6942 *cost += extra_cost->alu.extend_arith;
6944 op0 = aarch64_strip_extend (op0);
6945 *cost += rtx_cost (op0, VOIDmode,
6946 (enum rtx_code) GET_CODE (op0), 0, speed);
6947 return true;
6950 /* Strip any extend, leave shifts behind as we will
6951 cost them through mult_cost. */
6952 new_op0 = aarch64_strip_extend (op0);
6954 if (GET_CODE (new_op0) == MULT
6955 || aarch64_shift_p (GET_CODE (new_op0)))
6957 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6958 speed);
6959 return true;
6962 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6964 if (speed)
6966 if (VECTOR_MODE_P (mode))
6968 /* Vector ADD. */
6969 *cost += extra_cost->vect.alu;
6971 else if (GET_MODE_CLASS (mode) == MODE_INT)
6973 /* ADD. */
6974 *cost += extra_cost->alu.arith;
6976 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6978 /* FADD. */
6979 *cost += extra_cost->fp[mode == DFmode].addsub;
6982 return true;
6985 case BSWAP:
6986 *cost = COSTS_N_INSNS (1);
6988 if (speed)
6990 if (VECTOR_MODE_P (mode))
6991 *cost += extra_cost->vect.alu;
6992 else
6993 *cost += extra_cost->alu.rev;
6995 return false;
6997 case IOR:
6998 if (aarch_rev16_p (x))
7000 *cost = COSTS_N_INSNS (1);
7002 if (speed)
7004 if (VECTOR_MODE_P (mode))
7005 *cost += extra_cost->vect.alu;
7006 else
7007 *cost += extra_cost->alu.rev;
7009 return true;
7012 if (aarch64_extr_rtx_p (x, &op0, &op1))
7014 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7015 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7016 if (speed)
7017 *cost += extra_cost->alu.shift;
7019 return true;
7021 /* Fall through. */
7022 case XOR:
7023 case AND:
7024 cost_logic:
7025 op0 = XEXP (x, 0);
7026 op1 = XEXP (x, 1);
7028 if (VECTOR_MODE_P (mode))
7030 if (speed)
7031 *cost += extra_cost->vect.alu;
7032 return true;
7035 if (code == AND
7036 && GET_CODE (op0) == MULT
7037 && CONST_INT_P (XEXP (op0, 1))
7038 && CONST_INT_P (op1)
7039 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7040 INTVAL (op1)) != 0)
7042 /* This is a UBFM/SBFM. */
7043 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7044 if (speed)
7045 *cost += extra_cost->alu.bfx;
7046 return true;
7049 if (GET_MODE_CLASS (mode) == MODE_INT)
7051 if (CONST_INT_P (op1))
7053 /* We have a mask + shift version of a UBFIZ
7054 i.e. the *andim_ashift<mode>_bfiz pattern. */
7055 if (GET_CODE (op0) == ASHIFT
7056 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7057 XEXP (op0, 1)))
7059 *cost += rtx_cost (XEXP (op0, 0), mode,
7060 (enum rtx_code) code, 0, speed);
7061 if (speed)
7062 *cost += extra_cost->alu.bfx;
7064 return true;
7066 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7068 /* We may get the immediate for free; this is not
7069 modelled. */
7070 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7071 if (speed)
7072 *cost += extra_cost->alu.logical;
7074 return true;
7077 else
7079 rtx new_op0 = op0;
7081 /* Handle ORN, EON, or BIC. */
7082 if (GET_CODE (op0) == NOT)
7083 op0 = XEXP (op0, 0);
7085 new_op0 = aarch64_strip_shift (op0);
7087 /* If we had a shift on op0 then this is a logical-shift-
7088 by-register/immediate operation. Otherwise, this is just
7089 a logical operation. */
7090 if (speed)
7092 if (new_op0 != op0)
7094 /* Shift by immediate. */
7095 if (CONST_INT_P (XEXP (op0, 1)))
7096 *cost += extra_cost->alu.log_shift;
7097 else
7098 *cost += extra_cost->alu.log_shift_reg;
7100 else
7101 *cost += extra_cost->alu.logical;
7104 /* In both cases we want to cost both operands. */
7105 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7106 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7108 return true;
7111 return false;
7113 case NOT:
7114 x = XEXP (x, 0);
7115 op0 = aarch64_strip_shift (x);
7117 if (VECTOR_MODE_P (mode))
7119 /* Vector NOT. */
7120 *cost += extra_cost->vect.alu;
7121 return false;
7124 /* MVN-shifted-reg. */
7125 if (op0 != x)
7127 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7129 if (speed)
7130 *cost += extra_cost->alu.log_shift;
7132 return true;
7134 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7135 Handle the second form here taking care that 'a' in the above can
7136 be a shift. */
7137 else if (GET_CODE (op0) == XOR)
7139 rtx newop0 = XEXP (op0, 0);
7140 rtx newop1 = XEXP (op0, 1);
7141 rtx op0_stripped = aarch64_strip_shift (newop0);
7143 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7144 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7146 if (speed)
7148 if (op0_stripped != newop0)
7149 *cost += extra_cost->alu.log_shift;
7150 else
7151 *cost += extra_cost->alu.logical;
7154 return true;
7156 /* MVN. */
7157 if (speed)
7158 *cost += extra_cost->alu.logical;
7160 return false;
7162 case ZERO_EXTEND:
7164 op0 = XEXP (x, 0);
7165 /* If a value is written in SI mode, then zero extended to DI
7166 mode, the operation will in general be free as a write to
7167 a 'w' register implicitly zeroes the upper bits of an 'x'
7168 register. However, if this is
7170 (set (reg) (zero_extend (reg)))
7172 we must cost the explicit register move. */
7173 if (mode == DImode
7174 && GET_MODE (op0) == SImode
7175 && outer == SET)
7177 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7179 /* If OP_COST is non-zero, then the cost of the zero extend
7180 is effectively the cost of the inner operation. Otherwise
7181 we have a MOV instruction and we take the cost from the MOV
7182 itself. This is true independently of whether we are
7183 optimizing for space or time. */
7184 if (op_cost)
7185 *cost = op_cost;
7187 return true;
7189 else if (MEM_P (op0))
7191 /* All loads can zero extend to any size for free. */
7192 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7193 return true;
7196 op0 = aarch64_extend_bitfield_pattern_p (x);
7197 if (op0)
7199 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7200 if (speed)
7201 *cost += extra_cost->alu.bfx;
7202 return true;
7205 if (speed)
7207 if (VECTOR_MODE_P (mode))
7209 /* UMOV. */
7210 *cost += extra_cost->vect.alu;
7212 else
7214 /* We generate an AND instead of UXTB/UXTH. */
7215 *cost += extra_cost->alu.logical;
7218 return false;
7220 case SIGN_EXTEND:
7221 if (MEM_P (XEXP (x, 0)))
7223 /* LDRSH. */
7224 if (speed)
7226 rtx address = XEXP (XEXP (x, 0), 0);
7227 *cost += extra_cost->ldst.load_sign_extend;
7229 *cost +=
7230 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7231 0, speed));
7233 return true;
7236 op0 = aarch64_extend_bitfield_pattern_p (x);
7237 if (op0)
7239 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7240 if (speed)
7241 *cost += extra_cost->alu.bfx;
7242 return true;
7245 if (speed)
7247 if (VECTOR_MODE_P (mode))
7248 *cost += extra_cost->vect.alu;
7249 else
7250 *cost += extra_cost->alu.extend;
7252 return false;
7254 case ASHIFT:
7255 op0 = XEXP (x, 0);
7256 op1 = XEXP (x, 1);
7258 if (CONST_INT_P (op1))
7260 if (speed)
7262 if (VECTOR_MODE_P (mode))
7264 /* Vector shift (immediate). */
7265 *cost += extra_cost->vect.alu;
7267 else
7269 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7270 aliases. */
7271 *cost += extra_cost->alu.shift;
7275 /* We can incorporate zero/sign extend for free. */
7276 if (GET_CODE (op0) == ZERO_EXTEND
7277 || GET_CODE (op0) == SIGN_EXTEND)
7278 op0 = XEXP (op0, 0);
7280 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7281 return true;
7283 else
7285 if (speed)
7287 if (VECTOR_MODE_P (mode))
7289 /* Vector shift (register). */
7290 *cost += extra_cost->vect.alu;
7292 else
7294 /* LSLV. */
7295 *cost += extra_cost->alu.shift_reg;
7298 return false; /* All arguments need to be in registers. */
7301 case ROTATE:
7302 case ROTATERT:
7303 case LSHIFTRT:
7304 case ASHIFTRT:
7305 op0 = XEXP (x, 0);
7306 op1 = XEXP (x, 1);
7308 if (CONST_INT_P (op1))
7310 /* ASR (immediate) and friends. */
7311 if (speed)
7313 if (VECTOR_MODE_P (mode))
7314 *cost += extra_cost->vect.alu;
7315 else
7316 *cost += extra_cost->alu.shift;
7319 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7320 return true;
7322 else
7325 /* ASR (register) and friends. */
7326 if (speed)
7328 if (VECTOR_MODE_P (mode))
7329 *cost += extra_cost->vect.alu;
7330 else
7331 *cost += extra_cost->alu.shift_reg;
7333 return false; /* All arguments need to be in registers. */
7336 case SYMBOL_REF:
7338 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7339 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7341 /* LDR. */
7342 if (speed)
7343 *cost += extra_cost->ldst.load;
7345 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7346 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7348 /* ADRP, followed by ADD. */
7349 *cost += COSTS_N_INSNS (1);
7350 if (speed)
7351 *cost += 2 * extra_cost->alu.arith;
7353 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7354 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7356 /* ADR. */
7357 if (speed)
7358 *cost += extra_cost->alu.arith;
7361 if (flag_pic)
7363 /* One extra load instruction, after accessing the GOT. */
7364 *cost += COSTS_N_INSNS (1);
7365 if (speed)
7366 *cost += extra_cost->ldst.load;
7368 return true;
7370 case HIGH:
7371 case LO_SUM:
7372 /* ADRP/ADD (immediate). */
7373 if (speed)
7374 *cost += extra_cost->alu.arith;
7375 return true;
7377 case ZERO_EXTRACT:
7378 case SIGN_EXTRACT:
7379 /* UBFX/SBFX. */
7380 if (speed)
7382 if (VECTOR_MODE_P (mode))
7383 *cost += extra_cost->vect.alu;
7384 else
7385 *cost += extra_cost->alu.bfx;
7388 /* We can trust that the immediates used will be correct (there
7389 are no by-register forms), so we need only cost op0. */
7390 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7391 return true;
7393 case MULT:
7394 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7395 /* aarch64_rtx_mult_cost always handles recursion to its
7396 operands. */
7397 return true;
7399 case MOD:
7400 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7401 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7402 an unconditional negate. This case should only ever be reached through
7403 the set_smod_pow2_cheap check in expmed.c. */
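/* Roughly (an illustrative sketch of the expansion, not a quote of the
   aarch64.md pattern), x % 4 becomes:
     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi
   hence the four-instruction baseline below.  */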
7404 if (CONST_INT_P (XEXP (x, 1))
7405 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7406 && (mode == SImode || mode == DImode))
7408 /* We expand to 4 instructions. Reset the baseline. */
7409 *cost = COSTS_N_INSNS (4);
7411 if (speed)
7412 *cost += 2 * extra_cost->alu.logical
7413 + 2 * extra_cost->alu.arith;
7415 return true;
7418 /* Fall-through. */
7419 case UMOD:
7420 if (speed)
7422 if (VECTOR_MODE_P (mode))
7423 *cost += extra_cost->vect.alu;
7424 else if (GET_MODE_CLASS (mode) == MODE_INT)
7425 *cost += (extra_cost->mult[mode == DImode].add
7426 + extra_cost->mult[mode == DImode].idiv);
7427 else if (mode == DFmode)
7428 *cost += (extra_cost->fp[1].mult
7429 + extra_cost->fp[1].div);
7430 else if (mode == SFmode)
7431 *cost += (extra_cost->fp[0].mult
7432 + extra_cost->fp[0].div);
7434 return false; /* All arguments need to be in registers. */
7436 case DIV:
7437 case UDIV:
7438 case SQRT:
7439 if (speed)
7441 if (VECTOR_MODE_P (mode))
7442 *cost += extra_cost->vect.alu;
7443 else if (GET_MODE_CLASS (mode) == MODE_INT)
7444 /* There is no integer SQRT, so only DIV and UDIV can get
7445 here. */
7446 *cost += extra_cost->mult[mode == DImode].idiv;
7447 else
7448 *cost += extra_cost->fp[mode == DFmode].div;
7450 return false; /* All arguments need to be in registers. */
7452 case IF_THEN_ELSE:
7453 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7454 XEXP (x, 2), cost, speed);
7456 case EQ:
7457 case NE:
7458 case GT:
7459 case GTU:
7460 case LT:
7461 case LTU:
7462 case GE:
7463 case GEU:
7464 case LE:
7465 case LEU:
7467 return false; /* All arguments must be in registers. */
7469 case FMA:
7470 op0 = XEXP (x, 0);
7471 op1 = XEXP (x, 1);
7472 op2 = XEXP (x, 2);
7474 if (speed)
7476 if (VECTOR_MODE_P (mode))
7477 *cost += extra_cost->vect.alu;
7478 else
7479 *cost += extra_cost->fp[mode == DFmode].fma;
7482 /* FMSUB, FNMADD, and FNMSUB are free. */
7483 if (GET_CODE (op0) == NEG)
7484 op0 = XEXP (op0, 0);
7486 if (GET_CODE (op2) == NEG)
7487 op2 = XEXP (op2, 0);
7489 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7490 and the by-element operand as operand 0. */
7491 if (GET_CODE (op1) == NEG)
7492 op1 = XEXP (op1, 0);
7494 /* Catch vector-by-element operations. The by-element operand can
7495 either be (vec_duplicate (vec_select (x))) or just
7496 (vec_select (x)), depending on whether we are multiplying by
7497 a vector or a scalar.
7499 Canonicalization is not very good in these cases; FMA4 will put the
7500 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7501 if (GET_CODE (op0) == VEC_DUPLICATE)
7502 op0 = XEXP (op0, 0);
7503 else if (GET_CODE (op1) == VEC_DUPLICATE)
7504 op1 = XEXP (op1, 0);
7506 if (GET_CODE (op0) == VEC_SELECT)
7507 op0 = XEXP (op0, 0);
7508 else if (GET_CODE (op1) == VEC_SELECT)
7509 op1 = XEXP (op1, 0);
7511 /* If the remaining parameters are not registers,
7512 get the cost to put them into registers. */
7513 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7514 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7515 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7516 return true;
7518 case FLOAT:
7519 case UNSIGNED_FLOAT:
7520 if (speed)
7521 *cost += extra_cost->fp[mode == DFmode].fromint;
7522 return false;
7524 case FLOAT_EXTEND:
7525 if (speed)
7527 if (VECTOR_MODE_P (mode))
7529 /* Vector widen. */
7530 *cost += extra_cost->vect.alu;
7532 else
7533 *cost += extra_cost->fp[mode == DFmode].widen;
7535 return false;
7537 case FLOAT_TRUNCATE:
7538 if (speed)
7540 if (VECTOR_MODE_P (mode))
7542 /* Vector conversion. */
7543 *cost += extra_cost->vect.alu;
7545 else
7546 *cost += extra_cost->fp[mode == DFmode].narrow;
7548 return false;
7550 case FIX:
7551 case UNSIGNED_FIX:
7552 x = XEXP (x, 0);
7553 /* Strip the rounding part. They will all be implemented
7554 by the fcvt* family of instructions anyway. */
7555 if (GET_CODE (x) == UNSPEC)
7557 unsigned int uns_code = XINT (x, 1);
7559 if (uns_code == UNSPEC_FRINTA
7560 || uns_code == UNSPEC_FRINTM
7561 || uns_code == UNSPEC_FRINTN
7562 || uns_code == UNSPEC_FRINTP
7563 || uns_code == UNSPEC_FRINTZ)
7564 x = XVECEXP (x, 0, 0);
7567 if (speed)
7569 if (VECTOR_MODE_P (mode))
7570 *cost += extra_cost->vect.alu;
7571 else
7572 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7575 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7576 fixed-point fcvt. */
7577 if (GET_CODE (x) == MULT
7578 && ((VECTOR_MODE_P (mode)
7579 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7580 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7582 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7583 0, speed);
7584 return true;
7587 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7588 return true;
7590 case ABS:
7591 if (VECTOR_MODE_P (mode))
7593 /* ABS (vector). */
7594 if (speed)
7595 *cost += extra_cost->vect.alu;
7597 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7599 op0 = XEXP (x, 0);
7601 /* FABD, which is analogous to FADD. */
7602 if (GET_CODE (op0) == MINUS)
7604 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7605 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7606 if (speed)
7607 *cost += extra_cost->fp[mode == DFmode].addsub;
7609 return true;
7611 /* Simple FABS is analogous to FNEG. */
7612 if (speed)
7613 *cost += extra_cost->fp[mode == DFmode].neg;
7615 else
7617 /* Integer ABS will either be split into
7618 two arithmetic instructions, or will be an ABS
7619 (scalar), which we don't model. */
7620 *cost = COSTS_N_INSNS (2);
7621 if (speed)
7622 *cost += 2 * extra_cost->alu.arith;
7624 return false;
7626 case SMAX:
7627 case SMIN:
7628 if (speed)
7630 if (VECTOR_MODE_P (mode))
7631 *cost += extra_cost->vect.alu;
7632 else
7634 /* FMAXNM/FMINNM/FMAX/FMIN.
7635 TODO: This may not be accurate for all implementations, but
7636 we do not model this in the cost tables. */
7637 *cost += extra_cost->fp[mode == DFmode].addsub;
7640 return false;
7642 case UNSPEC:
7643 /* The floating point round to integer frint* instructions. */
7644 if (aarch64_frint_unspec_p (XINT (x, 1)))
7646 if (speed)
7647 *cost += extra_cost->fp[mode == DFmode].roundint;
7649 return false;
7652 if (XINT (x, 1) == UNSPEC_RBIT)
7654 if (speed)
7655 *cost += extra_cost->alu.rev;
7657 return false;
7659 break;
7661 case TRUNCATE:
7663 /* Decompose <su>muldi3_highpart. */
7664 if (/* (truncate:DI */
7665 mode == DImode
7666 /* (lshiftrt:TI */
7667 && GET_MODE (XEXP (x, 0)) == TImode
7668 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7669 /* (mult:TI */
7670 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7671 /* (ANY_EXTEND:TI (reg:DI))
7672 (ANY_EXTEND:TI (reg:DI))) */
7673 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7674 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7675 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7676 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7677 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7678 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7679 /* (const_int 64) */
7680 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7681 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7683 /* UMULH/SMULH. */
7684 if (speed)
7685 *cost += extra_cost->mult[mode == DImode].extend;
7686 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7687 mode, MULT, 0, speed);
7688 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7689 mode, MULT, 1, speed);
7690 return true;
7693 /* Fall through. */
7694 default:
7695 break;
7698 if (dump_file
7699 && flag_aarch64_verbose_cost)
7700 fprintf (dump_file,
7701 "\nFailed to cost RTX. Assuming default cost.\n");
7703 return true;
7706 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
7707 calculated for X. This cost is stored in *COST. Returns true
7708 if the total cost of X was calculated. */
7709 static bool
7710 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7711 int param, int *cost, bool speed)
7713 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7715 if (dump_file
7716 && flag_aarch64_verbose_cost)
7718 print_rtl_single (dump_file, x);
7719 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7720 speed ? "Hot" : "Cold",
7721 *cost, result ? "final" : "partial");
7724 return result;
7727 static int
7728 aarch64_register_move_cost (machine_mode mode,
7729 reg_class_t from_i, reg_class_t to_i)
7731 enum reg_class from = (enum reg_class) from_i;
7732 enum reg_class to = (enum reg_class) to_i;
7733 const struct cpu_regmove_cost *regmove_cost
7734 = aarch64_tune_params.regmove_cost;
7736 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7737 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7738 to = GENERAL_REGS;
7740 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7741 from = GENERAL_REGS;
7743 /* Moving between GPR and stack cost is the same as GP2GP. */
7744 if ((from == GENERAL_REGS && to == STACK_REG)
7745 || (to == GENERAL_REGS && from == STACK_REG))
7746 return regmove_cost->GP2GP;
7748 /* To/From the stack register, we move via the gprs. */
7749 if (to == STACK_REG || from == STACK_REG)
7750 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7751 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7753 if (GET_MODE_SIZE (mode) == 16)
7755 /* 128-bit operations on general registers require 2 instructions. */
7756 if (from == GENERAL_REGS && to == GENERAL_REGS)
7757 return regmove_cost->GP2GP * 2;
7758 else if (from == GENERAL_REGS)
7759 return regmove_cost->GP2FP * 2;
7760 else if (to == GENERAL_REGS)
7761 return regmove_cost->FP2GP * 2;
7763 /* When AdvSIMD instructions are disabled it is not possible to move
7764 a 128-bit value directly between Q registers. This is handled in
7765 secondary reload. A general register is used as a scratch to move
7766 the upper DI value and the lower DI value is moved directly,
7767 hence the cost is the sum of three moves. */
7768 if (! TARGET_SIMD)
7769 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7771 return regmove_cost->FP2FP;
7774 if (from == GENERAL_REGS && to == GENERAL_REGS)
7775 return regmove_cost->GP2GP;
7776 else if (from == GENERAL_REGS)
7777 return regmove_cost->GP2FP;
7778 else if (to == GENERAL_REGS)
7779 return regmove_cost->FP2GP;
7781 return regmove_cost->FP2FP;
7784 static int
7785 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7786 reg_class_t rclass ATTRIBUTE_UNUSED,
7787 bool in ATTRIBUTE_UNUSED)
7789 return aarch64_tune_params.memmov_cost;
7792 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7793 to optimize 1.0/sqrt. */
7795 static bool
7796 use_rsqrt_p (machine_mode mode)
7798 return (!flag_trapping_math
7799 && flag_unsafe_math_optimizations
7800 && ((aarch64_tune_params.approx_modes->recip_sqrt
7801 & AARCH64_APPROX_MODE (mode))
7802 || flag_mrecip_low_precision_sqrt));
7805 /* Function to decide when to use the approximate reciprocal square root
7806 builtin. */
7808 static tree
7809 aarch64_builtin_reciprocal (tree fndecl)
7811 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7813 if (!use_rsqrt_p (mode))
7814 return NULL_TREE;
7815 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7818 typedef rtx (*rsqrte_type) (rtx, rtx);
7820 /* Select reciprocal square root initial estimate insn depending on machine
7821 mode. */
7823 static rsqrte_type
7824 get_rsqrte_type (machine_mode mode)
7826 switch (mode)
7828 case DFmode: return gen_aarch64_rsqrtedf;
7829 case SFmode: return gen_aarch64_rsqrtesf;
7830 case V2DFmode: return gen_aarch64_rsqrtev2df;
7831 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7832 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7833 default: gcc_unreachable ();
7837 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7839 /* Select reciprocal square root series step insn depending on machine mode. */
7841 static rsqrts_type
7842 get_rsqrts_type (machine_mode mode)
7844 switch (mode)
7846 case DFmode: return gen_aarch64_rsqrtsdf;
7847 case SFmode: return gen_aarch64_rsqrtssf;
7848 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7849 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7850 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7851 default: gcc_unreachable ();
7855 /* Emit instruction sequence to compute either the approximate square root
7856 or its approximate reciprocal, depending on the flag RECP, and return
7857 whether the sequence was emitted or not. */
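/* The loop below performs the Newton-Raphson step
     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2
   for 1/sqrt(d), where FRSQRTS supplies the (3 - a * b) / 2 part.  This is
   a summary of the code that follows, assuming the usual semantics of the
   FRSQRTE/FRSQRTS instructions.  */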
7859 bool
7860 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7862 machine_mode mode = GET_MODE (dst);
7864 if (GET_MODE_INNER (mode) == HFmode)
7865 return false;
7867 machine_mode mmsk = mode_for_vector
7868 (int_mode_for_mode (GET_MODE_INNER (mode)),
7869 GET_MODE_NUNITS (mode));
7870 bool use_approx_sqrt_p = (!recp
7871 && (flag_mlow_precision_sqrt
7872 || (aarch64_tune_params.approx_modes->sqrt
7873 & AARCH64_APPROX_MODE (mode))));
7874 bool use_approx_rsqrt_p = (recp
7875 && (flag_mrecip_low_precision_sqrt
7876 || (aarch64_tune_params.approx_modes->recip_sqrt
7877 & AARCH64_APPROX_MODE (mode))));
7879 if (!flag_finite_math_only
7880 || flag_trapping_math
7881 || !flag_unsafe_math_optimizations
7882 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7883 || optimize_function_for_size_p (cfun))
7884 return false;
7886 rtx xmsk = gen_reg_rtx (mmsk);
7887 if (!recp)
7888 /* When calculating the approximate square root, compare the argument with
7889 0.0 and create a mask. */
7890 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7891 CONST0_RTX (mode)))));
7893 /* Estimate the approximate reciprocal square root. */
7894 rtx xdst = gen_reg_rtx (mode);
7895 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7897 /* Iterate over the series twice for SF and thrice for DF. */
7898 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7900 /* Optionally iterate over the series one time fewer for faster performance,
7901 at the expense of accuracy. */
7902 if ((recp && flag_mrecip_low_precision_sqrt)
7903 || (!recp && flag_mlow_precision_sqrt))
7904 iterations--;
7906 /* Iterate over the series to calculate the approximate reciprocal square
7907 root. */
7908 rtx x1 = gen_reg_rtx (mode);
7909 while (iterations--)
7911 rtx x2 = gen_reg_rtx (mode);
7912 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7914 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7916 if (iterations > 0)
7917 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7920 if (!recp)
7922 /* Qualify the approximate reciprocal square root when the argument is
7923 0.0 by squashing the intermediate result to 0.0. */
7924 rtx xtmp = gen_reg_rtx (mmsk);
7925 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7926 gen_rtx_SUBREG (mmsk, xdst, 0)));
7927 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7929 /* Calculate the approximate square root. */
7930 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7933 /* Finalize the approximation. */
7934 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7936 return true;
7939 typedef rtx (*recpe_type) (rtx, rtx);
7941 /* Select reciprocal initial estimate insn depending on machine mode. */
7943 static recpe_type
7944 get_recpe_type (machine_mode mode)
7946 switch (mode)
7948 case SFmode: return (gen_aarch64_frecpesf);
7949 case V2SFmode: return (gen_aarch64_frecpev2sf);
7950 case V4SFmode: return (gen_aarch64_frecpev4sf);
7951 case DFmode: return (gen_aarch64_frecpedf);
7952 case V2DFmode: return (gen_aarch64_frecpev2df);
7953 default: gcc_unreachable ();
7957 typedef rtx (*recps_type) (rtx, rtx, rtx);
7959 /* Select reciprocal series step insn depending on machine mode. */
7961 static recps_type
7962 get_recps_type (machine_mode mode)
7964 switch (mode)
7966 case SFmode: return (gen_aarch64_frecpssf);
7967 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7968 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7969 case DFmode: return (gen_aarch64_frecpsdf);
7970 case V2DFmode: return (gen_aarch64_frecpsv2df);
7971 default: gcc_unreachable ();
7975 /* Emit the instruction sequence to compute the approximation for the division
7976 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
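/* The reciprocal is refined with the Newton-Raphson step
     x_{n+1} = x_n * (2 - d * x_n)
   where FRECPS supplies the (2 - a * b) part; QUO is then NUM times the
   refined reciprocal.  A summary of the code below, assuming the usual
   FRECPE/FRECPS semantics.  */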
7978 bool
7979 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7981 machine_mode mode = GET_MODE (quo);
7983 if (GET_MODE_INNER (mode) == HFmode)
7984 return false;
7986 bool use_approx_division_p = (flag_mlow_precision_div
7987 || (aarch64_tune_params.approx_modes->division
7988 & AARCH64_APPROX_MODE (mode)));
7990 if (!flag_finite_math_only
7991 || flag_trapping_math
7992 || !flag_unsafe_math_optimizations
7993 || optimize_function_for_size_p (cfun)
7994 || !use_approx_division_p)
7995 return false;
7997 /* Estimate the approximate reciprocal. */
7998 rtx xrcp = gen_reg_rtx (mode);
7999 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8001 /* Iterate over the series twice for SF and thrice for DF. */
8002 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8004 /* Optionally iterate over the series one time fewer for faster performance,
8005 at the expense of accuracy. */
8006 if (flag_mlow_precision_div)
8007 iterations--;
8009 /* Iterate over the series to calculate the approximate reciprocal. */
8010 rtx xtmp = gen_reg_rtx (mode);
8011 while (iterations--)
8013 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8015 if (iterations > 0)
8016 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8019 if (num != CONST1_RTX (mode))
8021 /* As the approximate reciprocal of DEN is already calculated, only
8022 calculate the approximate division when NUM is not 1.0. */
8023 rtx xnum = force_reg (mode, num);
8024 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8027 /* Finalize the approximation. */
8028 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8029 return true;
8032 /* Return the number of instructions that can be issued per cycle. */
8033 static int
8034 aarch64_sched_issue_rate (void)
8036 return aarch64_tune_params.issue_rate;
8039 static int
8040 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8042 int issue_rate = aarch64_sched_issue_rate ();
8044 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8048 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8049 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8050 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8052 static int
8053 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8054 int ready_index)
8056 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8060 /* Vectorizer cost model target hooks. */
8062 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8063 static int
8064 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8065 tree vectype,
8066 int misalign ATTRIBUTE_UNUSED)
8068 unsigned elements;
8070 switch (type_of_cost)
8072 case scalar_stmt:
8073 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
8075 case scalar_load:
8076 return aarch64_tune_params.vec_costs->scalar_load_cost;
8078 case scalar_store:
8079 return aarch64_tune_params.vec_costs->scalar_store_cost;
8081 case vector_stmt:
8082 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8084 case vector_load:
8085 return aarch64_tune_params.vec_costs->vec_align_load_cost;
8087 case vector_store:
8088 return aarch64_tune_params.vec_costs->vec_store_cost;
8090 case vec_to_scalar:
8091 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
8093 case scalar_to_vec:
8094 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
8096 case unaligned_load:
8097 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
8099 case unaligned_store:
8100 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
8102 case cond_branch_taken:
8103 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
8105 case cond_branch_not_taken:
8106 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
8108 case vec_perm:
8109 return aarch64_tune_params.vec_costs->vec_permute_cost;
8111 case vec_promote_demote:
8112 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8114 case vec_construct:
8115 elements = TYPE_VECTOR_SUBPARTS (vectype);
8116 return elements / 2 + 1;
8118 default:
8119 gcc_unreachable ();
8123 /* Implement targetm.vectorize.add_stmt_cost. */
8124 static unsigned
8125 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8126 struct _stmt_vec_info *stmt_info, int misalign,
8127 enum vect_cost_model_location where)
8129 unsigned *cost = (unsigned *) data;
8130 unsigned retval = 0;
8132 if (flag_vect_cost_model)
8134 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8135 int stmt_cost =
8136 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8138 /* Statements in an inner loop relative to the loop being
8139 vectorized are weighted more heavily. The value here is
8140 arbitrary and could potentially be improved with analysis. */
8141 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8142 count *= 50; /* FIXME */
8144 retval = (unsigned) (count * stmt_cost);
8145 cost[where] += retval;
8148 return retval;
8151 static void initialize_aarch64_code_model (struct gcc_options *);
8153 /* Parse the TO_PARSE string and put the architecture struct that it
8154 selects into RES and the architectural features into ISA_FLAGS.
8155 Return an aarch64_parse_opt_result describing the parse result.
8156 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8158 static enum aarch64_parse_opt_result
8159 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8160 unsigned long *isa_flags)
8162 char *ext;
8163 const struct processor *arch;
8164 char *str = (char *) alloca (strlen (to_parse) + 1);
8165 size_t len;
8167 strcpy (str, to_parse);
8169 ext = strchr (str, '+');
8171 if (ext != NULL)
8172 len = ext - str;
8173 else
8174 len = strlen (str);
8176 if (len == 0)
8177 return AARCH64_PARSE_MISSING_ARG;
8180 /* Loop through the list of supported ARCHes to find a match. */
8181 for (arch = all_architectures; arch->name != NULL; arch++)
8183 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8185 unsigned long isa_temp = arch->flags;
8187 if (ext != NULL)
8189 /* TO_PARSE string contains at least one extension. */
8190 enum aarch64_parse_opt_result ext_res
8191 = aarch64_parse_extension (ext, &isa_temp);
8193 if (ext_res != AARCH64_PARSE_OK)
8194 return ext_res;
8196 /* Extension parsing was successful. Confirm the result
8197 arch and ISA flags. */
8198 *res = arch;
8199 *isa_flags = isa_temp;
8200 return AARCH64_PARSE_OK;
8204 /* ARCH name not found in list. */
8205 return AARCH64_PARSE_INVALID_ARG;
8208 /* Parse the TO_PARSE string and put the CPU that it selects into RES and
8209    its ISA flags into ISA_FLAGS.  Return an aarch64_parse_opt_result
8210    describing the parse result.  If there is an error parsing, RES and
8211    ISA_FLAGS are left unchanged.  */
8213 static enum aarch64_parse_opt_result
8214 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8215 unsigned long *isa_flags)
8217 char *ext;
8218 const struct processor *cpu;
8219 char *str = (char *) alloca (strlen (to_parse) + 1);
8220 size_t len;
8222 strcpy (str, to_parse);
8224 ext = strchr (str, '+');
8226 if (ext != NULL)
8227 len = ext - str;
8228 else
8229 len = strlen (str);
8231 if (len == 0)
8232 return AARCH64_PARSE_MISSING_ARG;
8235 /* Loop through the list of supported CPUs to find a match. */
8236 for (cpu = all_cores; cpu->name != NULL; cpu++)
8238 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8240 unsigned long isa_temp = cpu->flags;
8243 if (ext != NULL)
8245 /* TO_PARSE string contains at least one extension. */
8246 enum aarch64_parse_opt_result ext_res
8247 = aarch64_parse_extension (ext, &isa_temp);
8249 if (ext_res != AARCH64_PARSE_OK)
8250 return ext_res;
8252 /* Extension parsing was successful.  Confirm the result
8253 cpu and ISA flags. */
8254 *res = cpu;
8255 *isa_flags = isa_temp;
8256 return AARCH64_PARSE_OK;
8260 /* CPU name not found in list. */
8261 return AARCH64_PARSE_INVALID_ARG;
8264 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8265 Return an aarch64_parse_opt_result describing the parse result.
8266    If the parsing fails, RES does not change.  */
8268 static enum aarch64_parse_opt_result
8269 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8271 const struct processor *cpu;
8272 char *str = (char *) alloca (strlen (to_parse) + 1);
8274 strcpy (str, to_parse);
8276 /* Loop through the list of supported CPUs to find a match. */
8277 for (cpu = all_cores; cpu->name != NULL; cpu++)
8279 if (strcmp (cpu->name, str) == 0)
8281 *res = cpu;
8282 return AARCH64_PARSE_OK;
8286 /* CPU name not found in list. */
8287 return AARCH64_PARSE_INVALID_ARG;
8290 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8291 described in FLAG. If it is, return the index bit for that fusion type.
8292 If not, error (printing OPTION_NAME) and return zero. */
8294 static unsigned int
8295 aarch64_parse_one_option_token (const char *token,
8296 size_t length,
8297 const struct aarch64_flag_desc *flag,
8298 const char *option_name)
8300 for (; flag->name != NULL; flag++)
8302 if (length == strlen (flag->name)
8303 && !strncmp (flag->name, token, length))
8304 return flag->flag;
8307 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8308 return 0;
8311 /* Parse OPTION which is a comma-separated list of flags to enable.
8312 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8313 default state we inherit from the CPU tuning structures. OPTION_NAME
8314 gives the top-level option we are parsing in the -moverride string,
8315 for use in error messages. */
8317 static unsigned int
8318 aarch64_parse_boolean_options (const char *option,
8319 const struct aarch64_flag_desc *flags,
8320 unsigned int initial_state,
8321 const char *option_name)
8323 const char separator = '.';
8324 const char* specs = option;
8325 const char* ntoken = option;
8326 unsigned int found_flags = initial_state;
8328 while ((ntoken = strchr (specs, separator)))
8330 size_t token_length = ntoken - specs;
8331 unsigned token_ops = aarch64_parse_one_option_token (specs,
8332 token_length,
8333 flags,
8334 option_name);
8335 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8336 in the token stream, reset the supported operations. So:
8338 adrp+add.cmp+branch.none.adrp+add
8340 would have the result of turning on only adrp+add fusion. */
8341 if (!token_ops)
8342 found_flags = 0;
8344 found_flags |= token_ops;
8345 specs = ++ntoken;
8348   /* The string ended with a separator (or was empty), so it is ill-formed.  */
8349 if (!(*specs))
8351 error ("%s string ill-formed\n", option_name);
8352 return 0;
8355 /* We still have one more token to parse. */
8356 size_t token_length = strlen (specs);
8357 unsigned token_ops = aarch64_parse_one_option_token (specs,
8358 token_length,
8359 flags,
8360 option_name);
8361 if (!token_ops)
8362 found_flags = 0;
8364 found_flags |= token_ops;
8365 return found_flags;
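/* Illustrative sketch (editorial note, not part of GCC): the accumulate-and-
   reset behaviour of the '.'-separated flag lists parsed above, where a token
   whose value is 0 (such as "none") clears everything seen so far.  The flag
   names come from the example in the comment above; the bit values are
   hypothetical.  Compile separately.  */

#include <stdio.h>
#include <string.h>

struct flag_desc { const char *name; unsigned int flag; };

static const struct flag_desc fusible[] = {
  { "adrp+add",   1u << 0 },
  { "cmp+branch", 1u << 1 },
  { "none",       0 },
  { NULL,         0 }
};

static unsigned int
lookup_flag (const char *tok, size_t len)
{
  for (const struct flag_desc *f = fusible; f->name; f++)
    if (strlen (f->name) == len && strncmp (f->name, tok, len) == 0)
      return f->flag;
  return 0;	/* Unknown tokens also reset the set, as in the parser above.  */
}

static unsigned int
parse_boolean_options (const char *option, unsigned int initial)
{
  unsigned int found = initial;
  const char *p = option;
  for (;;)
    {
      const char *dot = strchr (p, '.');
      size_t len = dot ? (size_t) (dot - p) : strlen (p);
      unsigned int ops = lookup_flag (p, len);
      if (!ops)
	found = 0;
      found |= ops;
      if (!dot)
	return found;
      p = dot + 1;
    }
}

int
main (void)
{
  /* Prints 0x1: "none" wiped the earlier tokens, leaving only adrp+add.  */
  printf ("%#x\n", parse_boolean_options ("adrp+add.cmp+branch.none.adrp+add", 0));
  return 0;
}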
8368 /* Support for overriding instruction fusion. */
8370 static void
8371 aarch64_parse_fuse_string (const char *fuse_string,
8372 struct tune_params *tune)
8374 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8375 aarch64_fusible_pairs,
8376 tune->fusible_ops,
8377 "fuse=");
8380 /* Support for overriding other tuning flags. */
8382 static void
8383 aarch64_parse_tune_string (const char *tune_string,
8384 struct tune_params *tune)
8386 tune->extra_tuning_flags
8387 = aarch64_parse_boolean_options (tune_string,
8388 aarch64_tuning_flags,
8389 tune->extra_tuning_flags,
8390 "tune=");
8393 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8394    we understand.  If it is, extract the option string and hand it off to
8395    the appropriate function.  */
8397 void
8398 aarch64_parse_one_override_token (const char* token,
8399 size_t length,
8400 struct tune_params *tune)
8402 const struct aarch64_tuning_override_function *fn
8403 = aarch64_tuning_override_functions;
8405 const char *option_part = strchr (token, '=');
8406 if (!option_part)
8408 error ("tuning string missing in option (%s)", token);
8409 return;
8412 /* Get the length of the option name. */
8413 length = option_part - token;
8414 /* Skip the '=' to get to the option string. */
8415 option_part++;
8417 for (; fn->name != NULL; fn++)
8419 if (!strncmp (fn->name, token, length))
8421 fn->parse_override (option_part, tune);
8422 return;
8426 error ("unknown tuning option (%s)",token);
8427 return;
8430 /* Validate the TLS size, applying the default and clamping it to what the
     active code model allows.  */
8432 static void
8433 initialize_aarch64_tls_size (struct gcc_options *opts)
8435 if (aarch64_tls_size == 0)
8436 aarch64_tls_size = 24;
8438 switch (opts->x_aarch64_cmodel_var)
8440 case AARCH64_CMODEL_TINY:
8441     /* Both the default and the maximum TLS size allowed under tiny are 1M,
8442	which needs two instructions to address, so we clamp the size to 24 bits.  */
8443 if (aarch64_tls_size > 24)
8444 aarch64_tls_size = 24;
8445 break;
8446 case AARCH64_CMODEL_SMALL:
8447 /* The maximum TLS size allowed under small is 4G. */
8448 if (aarch64_tls_size > 32)
8449 aarch64_tls_size = 32;
8450 break;
8451 case AARCH64_CMODEL_LARGE:
8452     /* The maximum TLS size allowed under large is 16E (i.e. a full 64-bit
8453	offset).  FIXME: we currently only support 48-bit offsets.  */
8454 if (aarch64_tls_size > 48)
8455 aarch64_tls_size = 48;
8456 break;
8457 default:
8458 gcc_unreachable ();
8461 return;
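/* Illustrative sketch (editorial note, not part of GCC): the per-code-model
   clamping applied above -- at most 24 bits of TLS offset for tiny, 32 for
   small and 48 for large.  Compile separately.  */

#include <stdio.h>

enum cmodel { CMODEL_TINY, CMODEL_SMALL, CMODEL_LARGE };

static int
clamp_tls_size (enum cmodel model, int requested_bits)
{
  int max_bits = (model == CMODEL_TINY ? 24
		  : model == CMODEL_SMALL ? 32 : 48);
  return requested_bits > max_bits ? max_bits : requested_bits;
}

int
main (void)
{
  printf ("%d %d %d\n",
	  clamp_tls_size (CMODEL_TINY, 32),    /* -> 24 */
	  clamp_tls_size (CMODEL_SMALL, 48),   /* -> 32 */
	  clamp_tls_size (CMODEL_LARGE, 64));  /* -> 48 */
  return 0;
}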
8464 /* Parse STRING looking for options in the format:
8465 string :: option:string
8466 option :: name=substring
8467 name :: {a-z}
8468 substring :: defined by option. */
8470 static void
8471 aarch64_parse_override_string (const char* input_string,
8472 struct tune_params* tune)
8474 const char separator = ':';
8475 size_t string_length = strlen (input_string) + 1;
8476 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8477 char *string = string_root;
8478 strncpy (string, input_string, string_length);
8479 string[string_length - 1] = '\0';
8481 char* ntoken = string;
8483 while ((ntoken = strchr (string, separator)))
8485 size_t token_length = ntoken - string;
8486 /* Make this substring look like a string. */
8487 *ntoken = '\0';
8488 aarch64_parse_one_override_token (string, token_length, tune);
8489 string = ++ntoken;
8492 /* One last option to parse. */
8493 aarch64_parse_one_override_token (string, strlen (string), tune);
8494 free (string_root);
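/* Illustrative sketch (editorial note, not part of GCC): the
   "string :: option:string, option :: name=substring" grammar described
   above, as used by -moverride.  Here each pair is simply printed; the
   example values are placeholders.  Compile separately.  */

#include <stdio.h>
#include <string.h>

static void
handle_one_override (char *token)
{
  char *value = strchr (token, '=');
  if (!value)
    {
      fprintf (stderr, "tuning string missing in option (%s)\n", token);
      return;
    }
  *value++ = '\0';			/* Split "name=value" in place.  */
  printf ("name=%s value=%s\n", token, value);
}

int
main (void)
{
  char buf[] = "fuse=adrp+add.cmp+branch:tune=some_flag";
  char *p = buf, *colon;

  while ((colon = strchr (p, ':')))
    {
      *colon = '\0';			/* Make this substring a string.  */
      handle_one_override (p);
      p = colon + 1;
    }
  handle_one_override (p);		/* One last option to parse.  */
  return 0;
}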
8498 static void
8499 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8501 /* The logic here is that if we are disabling all frame pointer generation
8502 then we do not need to disable leaf frame pointer generation as a
8503 separate operation. But if we are *only* disabling leaf frame pointer
8504 generation then we set flag_omit_frame_pointer to true, but in
8505 aarch64_frame_pointer_required we return false only for leaf functions.
8507 PR 70044: We have to be careful about being called multiple times for the
8508 same function. Once we have decided to set flag_omit_frame_pointer just
8509 so that we can omit leaf frame pointers, we must then not interpret a
8510 second call as meaning that all frame pointer generation should be
8511 omitted. We do this by setting flag_omit_frame_pointer to a special,
8512 non-zero value. */
8513 if (opts->x_flag_omit_frame_pointer == 2)
8514 opts->x_flag_omit_frame_pointer = 0;
8516 if (opts->x_flag_omit_frame_pointer)
8517 opts->x_flag_omit_leaf_frame_pointer = false;
8518 else if (opts->x_flag_omit_leaf_frame_pointer)
8519 opts->x_flag_omit_frame_pointer = 2;
8521 /* If not optimizing for size, set the default
8522 alignment to what the target wants. */
8523 if (!opts->x_optimize_size)
8525 if (opts->x_align_loops <= 0)
8526 opts->x_align_loops = aarch64_tune_params.loop_align;
8527 if (opts->x_align_jumps <= 0)
8528 opts->x_align_jumps = aarch64_tune_params.jump_align;
8529 if (opts->x_align_functions <= 0)
8530 opts->x_align_functions = aarch64_tune_params.function_align;
8533 /* We default to no pc-relative literal loads. */
8535 aarch64_pcrelative_literal_loads = false;
8537 /* If -mpc-relative-literal-loads is set on the command line, this
8538 implies that the user asked for PC relative literal loads. */
8539 if (opts->x_pcrelative_literal_loads == 1)
8540 aarch64_pcrelative_literal_loads = true;
8542 /* This is PR70113. When building the Linux kernel with
8543 CONFIG_ARM64_ERRATUM_843419, support for relocations
8544 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8545 removed from the kernel to avoid loading objects with possibly
8546 offending sequences. Without -mpc-relative-literal-loads we would
8547 generate such relocations, preventing the kernel build from
8548 succeeding. */
8549 if (opts->x_pcrelative_literal_loads == 2
8550 && TARGET_FIX_ERR_A53_843419)
8551 aarch64_pcrelative_literal_loads = true;
8553 /* In the tiny memory model it makes no sense to disallow PC relative
8554 literal pool loads. */
8555 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8556 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8557 aarch64_pcrelative_literal_loads = true;
8559 /* When enabling the lower precision Newton series for the square root, also
8560 enable it for the reciprocal square root, since the latter is an
8561    intermediate step for the former.  */
8562 if (flag_mlow_precision_sqrt)
8563 flag_mrecip_low_precision_sqrt = true;
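/* Illustrative sketch (editorial note, not part of GCC): the tri-state
   encoding used above for flag_omit_frame_pointer (0 = keep frame pointers,
   1 = omit all frame pointers, 2 = "set only so that leaf frame pointers can
   be omitted"), showing that re-running the adjustment does not escalate 2
   into 1 -- the PR 70044 concern.  Compile separately.  */

#include <stdio.h>

struct opts { int omit_frame_pointer; int omit_leaf_frame_pointer; };

static void
adjust (struct opts *o)
{
  if (o->omit_frame_pointer == 2)
    o->omit_frame_pointer = 0;		/* Undo our own special value.  */
  if (o->omit_frame_pointer)
    o->omit_leaf_frame_pointer = 0;	/* No separate leaf handling needed.  */
  else if (o->omit_leaf_frame_pointer)
    o->omit_frame_pointer = 2;		/* Leaf-only: remember why it is set.  */
}

int
main (void)
{
  struct opts o = { 0, 1 };		/* Leaf-only omission requested.  */
  adjust (&o);
  adjust (&o);				/* Second call must not change it.  */
  printf ("%d %d\n", o.omit_frame_pointer, o.omit_leaf_frame_pointer);
  return 0;
}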
8566 /* 'Unpack' the internal tuning structs and update the options
8567 in OPTS. The caller must have set up selected_tune and selected_arch
8568 as all the other target-specific codegen decisions are
8569 derived from them. */
8571 void
8572 aarch64_override_options_internal (struct gcc_options *opts)
8574 aarch64_tune_flags = selected_tune->flags;
8575 aarch64_tune = selected_tune->sched_core;
8576 /* Make a copy of the tuning parameters attached to the core, which
8577 we may later overwrite. */
8578 aarch64_tune_params = *(selected_tune->tune);
8579 aarch64_architecture_version = selected_arch->architecture_version;
8581 if (opts->x_aarch64_override_tune_string)
8582 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8583 &aarch64_tune_params);
8585 /* This target defaults to strict volatile bitfields. */
8586 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8587 opts->x_flag_strict_volatile_bitfields = 1;
8589 initialize_aarch64_code_model (opts);
8590 initialize_aarch64_tls_size (opts);
8592 int queue_depth = 0;
8593 switch (aarch64_tune_params.autoprefetcher_model)
8595 case tune_params::AUTOPREFETCHER_OFF:
8596 queue_depth = -1;
8597 break;
8598 case tune_params::AUTOPREFETCHER_WEAK:
8599 queue_depth = 0;
8600 break;
8601 case tune_params::AUTOPREFETCHER_STRONG:
8602 queue_depth = max_insn_queue_index + 1;
8603 break;
8604 default:
8605 gcc_unreachable ();
8608 /* We don't mind passing in global_options_set here as we don't use
8609 the *options_set structs anyway. */
8610 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8611 queue_depth,
8612 opts->x_param_values,
8613 global_options_set.x_param_values);
8615 /* Set the L1 cache line size. */
8616 if (selected_cpu->tune->cache_line_size != 0)
8617 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8618 selected_cpu->tune->cache_line_size,
8619 opts->x_param_values,
8620 global_options_set.x_param_values);
8622 aarch64_override_options_after_change_1 (opts);
8625 /* Print a hint with a suggestion for a core or architecture name that
8626 most closely resembles what the user passed in STR. ARCH is true if
8627 the user is asking for an architecture name. ARCH is false if the user
8628 is asking for a core name. */
8630 static void
8631 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8633 auto_vec<const char *> candidates;
8634 const struct processor *entry = arch ? all_architectures : all_cores;
8635 for (; entry->name != NULL; entry++)
8636 candidates.safe_push (entry->name);
8637 char *s;
8638 const char *hint = candidates_list_and_hint (str, s, candidates);
8639 if (hint)
8640 inform (input_location, "valid arguments are: %s;"
8641 " did you mean %qs?", s, hint);
8642 XDELETEVEC (s);
8645 /* Print a hint with a suggestion for a core name that most closely resembles
8646 what the user passed in STR. */
8648 inline static void
8649 aarch64_print_hint_for_core (const char *str)
8651 aarch64_print_hint_for_core_or_arch (str, false);
8654 /* Print a hint with a suggestion for an architecture name that most closely
8655 resembles what the user passed in STR. */
8657 inline static void
8658 aarch64_print_hint_for_arch (const char *str)
8660 aarch64_print_hint_for_core_or_arch (str, true);
8663 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8664    specified in STR and throw errors if appropriate.  Put the results, if
8665 they are valid in RES and ISA_FLAGS. Return whether the option is
8666 valid. */
8668 static bool
8669 aarch64_validate_mcpu (const char *str, const struct processor **res,
8670 unsigned long *isa_flags)
8672 enum aarch64_parse_opt_result parse_res
8673 = aarch64_parse_cpu (str, res, isa_flags);
8675 if (parse_res == AARCH64_PARSE_OK)
8676 return true;
8678 switch (parse_res)
8680 case AARCH64_PARSE_MISSING_ARG:
8681 error ("missing cpu name in -mcpu=%qs", str);
8682 break;
8683 case AARCH64_PARSE_INVALID_ARG:
8684 error ("unknown value %qs for -mcpu", str);
8685 aarch64_print_hint_for_core (str);
8686 break;
8687 case AARCH64_PARSE_INVALID_FEATURE:
8688 error ("invalid feature modifier in -mcpu=%qs", str);
8689 break;
8690 default:
8691 gcc_unreachable ();
8694 return false;
8697 /* Validate a command-line -march option. Parse the arch and extensions
8698 (if any) specified in STR and throw errors if appropriate. Put the
8699 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8700 option is valid. */
8702 static bool
8703 aarch64_validate_march (const char *str, const struct processor **res,
8704 unsigned long *isa_flags)
8706 enum aarch64_parse_opt_result parse_res
8707 = aarch64_parse_arch (str, res, isa_flags);
8709 if (parse_res == AARCH64_PARSE_OK)
8710 return true;
8712 switch (parse_res)
8714 case AARCH64_PARSE_MISSING_ARG:
8715 error ("missing arch name in -march=%qs", str);
8716 break;
8717 case AARCH64_PARSE_INVALID_ARG:
8718 error ("unknown value %qs for -march", str);
8719 aarch64_print_hint_for_arch (str);
8720 break;
8721 case AARCH64_PARSE_INVALID_FEATURE:
8722 error ("invalid feature modifier in -march=%qs", str);
8723 break;
8724 default:
8725 gcc_unreachable ();
8728 return false;
8731 /* Validate a command-line -mtune option. Parse the cpu
8732 specified in STR and throw errors if appropriate. Put the
8733 result, if it is valid, in RES. Return whether the option is
8734 valid. */
8736 static bool
8737 aarch64_validate_mtune (const char *str, const struct processor **res)
8739 enum aarch64_parse_opt_result parse_res
8740 = aarch64_parse_tune (str, res);
8742 if (parse_res == AARCH64_PARSE_OK)
8743 return true;
8745 switch (parse_res)
8747 case AARCH64_PARSE_MISSING_ARG:
8748 error ("missing cpu name in -mtune=%qs", str);
8749 break;
8750 case AARCH64_PARSE_INVALID_ARG:
8751 error ("unknown value %qs for -mtune", str);
8752 aarch64_print_hint_for_core (str);
8753 break;
8754 default:
8755 gcc_unreachable ();
8757 return false;
8760 /* Return the CPU corresponding to the enum CPU.
8761 If it doesn't specify a cpu, return the default. */
8763 static const struct processor *
8764 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8766 if (cpu != aarch64_none)
8767 return &all_cores[cpu];
8769 /* The & 0x3f is to extract the bottom 6 bits that encode the
8770 default cpu as selected by the --with-cpu GCC configure option
8771 in config.gcc.
8772 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8773 flags mechanism should be reworked to make it more sane. */
8774 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
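/* Illustrative sketch (editorial note, not part of GCC): the packing that the
   code above and aarch64_override_options rely on, with the configure-time
   CPU index in the bottom 6 bits of TARGET_CPU_DEFAULT and the default ISA
   flags in the bits above them.  The values below are hypothetical.
   Compile separately.  */

#include <stdio.h>

int
main (void)
{
  unsigned long cpu_index = 7;		/* Hypothetical core index.  */
  unsigned long isa_flags = 0x15;	/* Hypothetical feature bits.  */
  unsigned long packed = (isa_flags << 6) | (cpu_index & 0x3f);

  printf ("cpu=%lu flags=%#lx\n", packed & 0x3f, packed >> 6);
  return 0;
}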
8777 /* Return the architecture corresponding to the enum ARCH.
8778 If it doesn't specify a valid architecture, return the default. */
8780 static const struct processor *
8781 aarch64_get_arch (enum aarch64_arch arch)
8783 if (arch != aarch64_no_arch)
8784 return &all_architectures[arch];
8786 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8788 return &all_architectures[cpu->arch];
8791 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8792 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8793 tuning structs. In particular it must set selected_tune and
8794 aarch64_isa_flags that define the available ISA features and tuning
8795 decisions. It must also set selected_arch as this will be used to
8796 output the .arch asm tags for each function. */
8798 static void
8799 aarch64_override_options (void)
8801 unsigned long cpu_isa = 0;
8802 unsigned long arch_isa = 0;
8803 aarch64_isa_flags = 0;
8805 bool valid_cpu = true;
8806 bool valid_tune = true;
8807 bool valid_arch = true;
8809 selected_cpu = NULL;
8810 selected_arch = NULL;
8811 selected_tune = NULL;
8813 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8814 If either of -march or -mtune is given, they override their
8815 respective component of -mcpu. */
8816 if (aarch64_cpu_string)
8817 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8818 &cpu_isa);
8820 if (aarch64_arch_string)
8821 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8822 &arch_isa);
8824 if (aarch64_tune_string)
8825 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8827 /* If the user did not specify a processor, choose the default
8828 one for them. This will be the CPU set during configuration using
8829 --with-cpu, otherwise it is "generic". */
8830 if (!selected_cpu)
8832 if (selected_arch)
8834 selected_cpu = &all_cores[selected_arch->ident];
8835 aarch64_isa_flags = arch_isa;
8836 explicit_arch = selected_arch->arch;
8838 else
8840 /* Get default configure-time CPU. */
8841 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8842 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8845 if (selected_tune)
8846 explicit_tune_core = selected_tune->ident;
8848 /* If both -mcpu and -march are specified check that they are architecturally
8849 compatible, warn if they're not and prefer the -march ISA flags. */
8850 else if (selected_arch)
8852 if (selected_arch->arch != selected_cpu->arch)
8854 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8855 all_architectures[selected_cpu->arch].name,
8856 selected_arch->name);
8858 aarch64_isa_flags = arch_isa;
8859 explicit_arch = selected_arch->arch;
8860 explicit_tune_core = selected_tune ? selected_tune->ident
8861 : selected_cpu->ident;
8863 else
8865 /* -mcpu but no -march. */
8866 aarch64_isa_flags = cpu_isa;
8867 explicit_tune_core = selected_tune ? selected_tune->ident
8868 : selected_cpu->ident;
8869 gcc_assert (selected_cpu);
8870 selected_arch = &all_architectures[selected_cpu->arch];
8871 explicit_arch = selected_arch->arch;
8874   /* Set the arch as well, as we will need it when outputting
8875 the .arch directive in assembly. */
8876 if (!selected_arch)
8878 gcc_assert (selected_cpu);
8879 selected_arch = &all_architectures[selected_cpu->arch];
8882 if (!selected_tune)
8883 selected_tune = selected_cpu;
8885 #ifndef HAVE_AS_MABI_OPTION
8886 /* The compiler may have been configured with 2.23.* binutils, which does
8887 not have support for ILP32. */
8888 if (TARGET_ILP32)
8889 error ("Assembler does not support -mabi=ilp32");
8890 #endif
8892 /* Make sure we properly set up the explicit options. */
8893 if ((aarch64_cpu_string && valid_cpu)
8894 || (aarch64_tune_string && valid_tune))
8895 gcc_assert (explicit_tune_core != aarch64_none);
8897 if ((aarch64_cpu_string && valid_cpu)
8898 || (aarch64_arch_string && valid_arch))
8899 gcc_assert (explicit_arch != aarch64_no_arch);
8901 aarch64_override_options_internal (&global_options);
8903 /* Save these options as the default ones in case we push and pop them later
8904 while processing functions with potential target attributes. */
8905 target_option_default_node = target_option_current_node
8906 = build_target_option_node (&global_options);
8909 /* Implement targetm.override_options_after_change. */
8911 static void
8912 aarch64_override_options_after_change (void)
8914 aarch64_override_options_after_change_1 (&global_options);
8917 static struct machine_function *
8918 aarch64_init_machine_status (void)
8920 struct machine_function *machine;
8921 machine = ggc_cleared_alloc<machine_function> ();
8922 return machine;
8925 void
8926 aarch64_init_expanders (void)
8928 init_machine_status = aarch64_init_machine_status;
8931 /* Select the code model to use, adjusting it for PIC when -fpic/-fPIC
     is in effect.  */
8932 static void
8933 initialize_aarch64_code_model (struct gcc_options *opts)
8935 if (opts->x_flag_pic)
8937 switch (opts->x_aarch64_cmodel_var)
8939 case AARCH64_CMODEL_TINY:
8940 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8941 break;
8942 case AARCH64_CMODEL_SMALL:
8943 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8944 aarch64_cmodel = (flag_pic == 2
8945 ? AARCH64_CMODEL_SMALL_PIC
8946 : AARCH64_CMODEL_SMALL_SPIC);
8947 #else
8948 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8949 #endif
8950 break;
8951 case AARCH64_CMODEL_LARGE:
8952 sorry ("code model %qs with -f%s", "large",
8953 opts->x_flag_pic > 1 ? "PIC" : "pic");
8954 break;
8955 default:
8956 gcc_unreachable ();
8959 else
8960 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8963 /* Implement TARGET_OPTION_SAVE. */
8965 static void
8966 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8968 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8971 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8972 using the information saved in PTR. */
8974 static void
8975 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8977 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8978 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8979 opts->x_explicit_arch = ptr->x_explicit_arch;
8980 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8981 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8983 aarch64_override_options_internal (opts);
8986 /* Implement TARGET_OPTION_PRINT. */
8988 static void
8989 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8991 const struct processor *cpu
8992 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8993 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8994 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8995 std::string extension
8996 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8998 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8999 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9000 arch->name, extension.c_str ());
9003 static GTY(()) tree aarch64_previous_fndecl;
9005 void
9006 aarch64_reset_previous_fndecl (void)
9008 aarch64_previous_fndecl = NULL;
9011 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9012 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9013 make sure optab availability predicates are recomputed when necessary. */
9015 void
9016 aarch64_save_restore_target_globals (tree new_tree)
9018 if (TREE_TARGET_GLOBALS (new_tree))
9019 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9020 else if (new_tree == target_option_default_node)
9021 restore_target_globals (&default_target_globals);
9022 else
9023 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9026 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9027 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9028 of the function, if such exists. This function may be called multiple
9029 times on a single function so use aarch64_previous_fndecl to avoid
9030 setting up identical state. */
9032 static void
9033 aarch64_set_current_function (tree fndecl)
9035 if (!fndecl || fndecl == aarch64_previous_fndecl)
9036 return;
9038 tree old_tree = (aarch64_previous_fndecl
9039 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9040 : NULL_TREE);
9042 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9044 /* If current function has no attributes but the previous one did,
9045 use the default node. */
9046 if (!new_tree && old_tree)
9047 new_tree = target_option_default_node;
9049 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9050 the default have been handled by aarch64_save_restore_target_globals from
9051 aarch64_pragma_target_parse. */
9052 if (old_tree == new_tree)
9053 return;
9055 aarch64_previous_fndecl = fndecl;
9057 /* First set the target options. */
9058 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9060 aarch64_save_restore_target_globals (new_tree);
9063 /* Enum describing the various ways we can handle attributes.
9064 In many cases we can reuse the generic option handling machinery. */
9066 enum aarch64_attr_opt_type
9068 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9069 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9070 aarch64_attr_enum, /* Attribute sets an enum variable. */
9071 aarch64_attr_custom /* Attribute requires a custom handling function. */
9074 /* All the information needed to handle a target attribute.
9075 NAME is the name of the attribute.
9076 ATTR_TYPE specifies the type of behavior of the attribute as described
9077 in the definition of enum aarch64_attr_opt_type.
9078 ALLOW_NEG is true if the attribute supports a "no-" form.
9079 HANDLER is the function that takes the attribute string and whether
9080 it is a pragma or attribute and handles the option. It is needed only
9081 when the ATTR_TYPE is aarch64_attr_custom.
9082 OPT_NUM is the enum specifying the option that the attribute modifies.
9083 This is needed for attributes that mirror the behavior of a command-line
9084 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9085 aarch64_attr_enum. */
9087 struct aarch64_attribute_info
9089 const char *name;
9090 enum aarch64_attr_opt_type attr_type;
9091 bool allow_neg;
9092 bool (*handler) (const char *, const char *);
9093 enum opt_code opt_num;
9096 /* Handle the ARCH_STR argument to the arch= target attribute.
9097 PRAGMA_OR_ATTR is used in potential error messages. */
9099 static bool
9100 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9102 const struct processor *tmp_arch = NULL;
9103 enum aarch64_parse_opt_result parse_res
9104 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9106 if (parse_res == AARCH64_PARSE_OK)
9108 gcc_assert (tmp_arch);
9109 selected_arch = tmp_arch;
9110 explicit_arch = selected_arch->arch;
9111 return true;
9114 switch (parse_res)
9116 case AARCH64_PARSE_MISSING_ARG:
9117 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9118 break;
9119 case AARCH64_PARSE_INVALID_ARG:
9120 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9121 aarch64_print_hint_for_arch (str);
9122 break;
9123 case AARCH64_PARSE_INVALID_FEATURE:
9124 error ("invalid feature modifier %qs for 'arch' target %s",
9125 str, pragma_or_attr);
9126 break;
9127 default:
9128 gcc_unreachable ();
9131 return false;
9134 /* Handle the argument CPU_STR to the cpu= target attribute.
9135 PRAGMA_OR_ATTR is used in potential error messages. */
9137 static bool
9138 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9140 const struct processor *tmp_cpu = NULL;
9141 enum aarch64_parse_opt_result parse_res
9142 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9144 if (parse_res == AARCH64_PARSE_OK)
9146 gcc_assert (tmp_cpu);
9147 selected_tune = tmp_cpu;
9148 explicit_tune_core = selected_tune->ident;
9150 selected_arch = &all_architectures[tmp_cpu->arch];
9151 explicit_arch = selected_arch->arch;
9152 return true;
9155 switch (parse_res)
9157 case AARCH64_PARSE_MISSING_ARG:
9158 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9159 break;
9160 case AARCH64_PARSE_INVALID_ARG:
9161 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9162 aarch64_print_hint_for_core (str);
9163 break;
9164 case AARCH64_PARSE_INVALID_FEATURE:
9165 error ("invalid feature modifier %qs for 'cpu' target %s",
9166 str, pragma_or_attr);
9167 break;
9168 default:
9169 gcc_unreachable ();
9172 return false;
9175 /* Handle the argument STR to the tune= target attribute.
9176 PRAGMA_OR_ATTR is used in potential error messages. */
9178 static bool
9179 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9181 const struct processor *tmp_tune = NULL;
9182 enum aarch64_parse_opt_result parse_res
9183 = aarch64_parse_tune (str, &tmp_tune);
9185 if (parse_res == AARCH64_PARSE_OK)
9187 gcc_assert (tmp_tune);
9188 selected_tune = tmp_tune;
9189 explicit_tune_core = selected_tune->ident;
9190 return true;
9193 switch (parse_res)
9195 case AARCH64_PARSE_INVALID_ARG:
9196 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9197 aarch64_print_hint_for_core (str);
9198 break;
9199 default:
9200 gcc_unreachable ();
9203 return false;
9206 /* Parse an architecture extensions target attribute string specified in STR.
9207 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9208 if successful. Update aarch64_isa_flags to reflect the ISA features
9209 modified.
9210 PRAGMA_OR_ATTR is used in potential error messages. */
9212 static bool
9213 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9215 enum aarch64_parse_opt_result parse_res;
9216 unsigned long isa_flags = aarch64_isa_flags;
9218 /* We allow "+nothing" in the beginning to clear out all architectural
9219 features if the user wants to handpick specific features. */
9220 if (strncmp ("+nothing", str, 8) == 0)
9222 isa_flags = 0;
9223 str += 8;
9226 parse_res = aarch64_parse_extension (str, &isa_flags);
9228 if (parse_res == AARCH64_PARSE_OK)
9230 aarch64_isa_flags = isa_flags;
9231 return true;
9234 switch (parse_res)
9236 case AARCH64_PARSE_MISSING_ARG:
9237 error ("missing feature modifier in target %s %qs",
9238 pragma_or_attr, str);
9239 break;
9241 case AARCH64_PARSE_INVALID_FEATURE:
9242 error ("invalid feature modifier in target %s %qs",
9243 pragma_or_attr, str);
9244 break;
9246 default:
9247 gcc_unreachable ();
9250 return false;
9253 /* The target attributes that we support. On top of these we also support just
9254 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9255 handled explicitly in aarch64_process_one_target_attr. */
9257 static const struct aarch64_attribute_info aarch64_attributes[] =
9259 { "general-regs-only", aarch64_attr_mask, false, NULL,
9260 OPT_mgeneral_regs_only },
9261 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9262 OPT_mfix_cortex_a53_835769 },
9263 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9264 OPT_mfix_cortex_a53_843419 },
9265 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9266 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9267 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9268 OPT_momit_leaf_frame_pointer },
9269 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9270 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9271 OPT_march_ },
9272 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9273 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9274 OPT_mtune_ },
9275 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9278 /* Parse ARG_STR which contains the definition of one target attribute.
9279 Show appropriate errors if any or return true if the attribute is valid.
9280 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9281 we're processing a target attribute or pragma. */
9283 static bool
9284 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9286 bool invert = false;
9288 size_t len = strlen (arg_str);
9290 if (len == 0)
9292 error ("malformed target %s", pragma_or_attr);
9293 return false;
9296 char *str_to_check = (char *) alloca (len + 1);
9297 strcpy (str_to_check, arg_str);
9299 /* Skip leading whitespace. */
9300 while (*str_to_check == ' ' || *str_to_check == '\t')
9301 str_to_check++;
9303 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9304 It is easier to detect and handle it explicitly here rather than going
9305 through the machinery for the rest of the target attributes in this
9306 function. */
9307 if (*str_to_check == '+')
9308 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9310 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9312 invert = true;
9313 str_to_check += 3;
9315 char *arg = strchr (str_to_check, '=');
9317 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9318 and point ARG to "foo". */
9319 if (arg)
9321 *arg = '\0';
9322 arg++;
9324 const struct aarch64_attribute_info *p_attr;
9325 bool found = false;
9326 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9328 /* If the names don't match up, or the user has given an argument
9329 to an attribute that doesn't accept one, or didn't give an argument
9330 to an attribute that expects one, fail to match. */
9331 if (strcmp (str_to_check, p_attr->name) != 0)
9332 continue;
9334 found = true;
9335 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9336 || p_attr->attr_type == aarch64_attr_enum;
9338 if (attr_need_arg_p ^ (arg != NULL))
9340 error ("target %s %qs does not accept an argument",
9341 pragma_or_attr, str_to_check);
9342 return false;
9345 /* If the name matches but the attribute does not allow "no-" versions
9346 then we can't match. */
9347 if (invert && !p_attr->allow_neg)
9349 error ("target %s %qs does not allow a negated form",
9350 pragma_or_attr, str_to_check);
9351 return false;
9354 switch (p_attr->attr_type)
9356 /* Has a custom handler registered.
9357 For example, cpu=, arch=, tune=. */
9358 case aarch64_attr_custom:
9359 gcc_assert (p_attr->handler);
9360 if (!p_attr->handler (arg, pragma_or_attr))
9361 return false;
9362 break;
9364 /* Either set or unset a boolean option. */
9365 case aarch64_attr_bool:
9367 struct cl_decoded_option decoded;
9369 generate_option (p_attr->opt_num, NULL, !invert,
9370 CL_TARGET, &decoded);
9371 aarch64_handle_option (&global_options, &global_options_set,
9372 &decoded, input_location);
9373 break;
9375 /* Set or unset a bit in the target_flags. aarch64_handle_option
9376 should know what mask to apply given the option number. */
9377 case aarch64_attr_mask:
9379 struct cl_decoded_option decoded;
9380 /* We only need to specify the option number.
9381 aarch64_handle_option will know which mask to apply. */
9382 decoded.opt_index = p_attr->opt_num;
9383 decoded.value = !invert;
9384 aarch64_handle_option (&global_options, &global_options_set,
9385 &decoded, input_location);
9386 break;
9388 /* Use the option setting machinery to set an option to an enum. */
9389 case aarch64_attr_enum:
9391 gcc_assert (arg);
9392 bool valid;
9393 int value;
9394 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9395 &value, CL_TARGET);
9396 if (valid)
9398 set_option (&global_options, NULL, p_attr->opt_num, value,
9399 NULL, DK_UNSPECIFIED, input_location,
9400 global_dc);
9402 else
9404 error ("target %s %s=%s is not valid",
9405 pragma_or_attr, str_to_check, arg);
9407 break;
9409 default:
9410 gcc_unreachable ();
9414 /* If we reached here we either have found an attribute and validated
9415 it or didn't match any. If we matched an attribute but its arguments
9416 were malformed we will have returned false already. */
9417 return found;
9420 /* Count how many times the character C appears in
9421 NULL-terminated string STR. */
9423 static unsigned int
9424 num_occurences_in_str (char c, char *str)
9426 unsigned int res = 0;
9427 while (*str != '\0')
9429 if (*str == c)
9430 res++;
9432 str++;
9435 return res;
9438 /* Parse the tree in ARGS that contains the target attribute information
9439 and update the global target options space. PRAGMA_OR_ATTR is a string
9440 to be used in error messages, specifying whether this is processing
9441 a target attribute or a target pragma. */
9443 bool
9444 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9446 if (TREE_CODE (args) == TREE_LIST)
9450 tree head = TREE_VALUE (args);
9451 if (head)
9453 if (!aarch64_process_target_attr (head, pragma_or_attr))
9454 return false;
9456 args = TREE_CHAIN (args);
9457 } while (args);
9459 return true;
9461 /* We expect to find a string to parse. */
9462 gcc_assert (TREE_CODE (args) == STRING_CST);
9464 size_t len = strlen (TREE_STRING_POINTER (args));
9465 char *str_to_check = (char *) alloca (len + 1);
9466 strcpy (str_to_check, TREE_STRING_POINTER (args));
9468 if (len == 0)
9470 error ("malformed target %s value", pragma_or_attr);
9471 return false;
9474   /* Used to catch empty entries between commas, e.g.
9475 attribute ((target ("attr1,,attr2"))). */
9476 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9478 /* Handle multiple target attributes separated by ','. */
9479 char *token = strtok (str_to_check, ",");
9481 unsigned int num_attrs = 0;
9482 while (token)
9484 num_attrs++;
9485 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9487 error ("target %s %qs is invalid", pragma_or_attr, token);
9488 return false;
9491 token = strtok (NULL, ",");
9494 if (num_attrs != num_commas + 1)
9496 error ("malformed target %s list %qs",
9497 pragma_or_attr, TREE_STRING_POINTER (args));
9498 return false;
9501 return true;
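/* Illustrative sketch (editorial note, not part of GCC): how counting commas
   catches empty entries such as "attr1,,attr2" -- strtok collapses adjacent
   separators, so the number of tokens ends up smaller than comma-count + 1.
   Compile separately.  */

#include <stdio.h>
#include <string.h>

static int
well_formed_list (const char *input)
{
  char buf[128];
  unsigned int commas = 0, tokens = 0;

  strncpy (buf, input, sizeof buf - 1);
  buf[sizeof buf - 1] = '\0';

  for (const char *p = buf; *p; p++)	/* Count separators first...  */
    if (*p == ',')
      commas++;

  for (char *tok = strtok (buf, ","); tok; tok = strtok (NULL, ","))
    tokens++;				/* ...then count tokens.  */

  return tokens == commas + 1;
}

int
main (void)
{
  printf ("%d %d\n",
	  well_formed_list ("arch=armv8-a,tune=generic"),	/* 1 */
	  well_formed_list ("attr1,,attr2"));			/* 0 */
  return 0;
}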
9504 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9505 process attribute ((target ("..."))). */
9507 static bool
9508 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9510 struct cl_target_option cur_target;
9511 bool ret;
9512 tree old_optimize;
9513 tree new_target, new_optimize;
9514 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9516 /* If what we're processing is the current pragma string then the
9517 target option node is already stored in target_option_current_node
9518 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9519 having to re-parse the string. This is especially useful to keep
9520 arm_neon.h compile times down since that header contains a lot
9521 of intrinsics enclosed in pragmas. */
9522 if (!existing_target && args == current_target_pragma)
9524 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9525 return true;
9527 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9529 old_optimize = build_optimization_node (&global_options);
9530 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9532 /* If the function changed the optimization levels as well as setting
9533 target options, start with the optimizations specified. */
9534 if (func_optimize && func_optimize != old_optimize)
9535 cl_optimization_restore (&global_options,
9536 TREE_OPTIMIZATION (func_optimize));
9538 /* Save the current target options to restore at the end. */
9539 cl_target_option_save (&cur_target, &global_options);
9541 /* If fndecl already has some target attributes applied to it, unpack
9542 them so that we add this attribute on top of them, rather than
9543 overwriting them. */
9544 if (existing_target)
9546 struct cl_target_option *existing_options
9547 = TREE_TARGET_OPTION (existing_target);
9549 if (existing_options)
9550 cl_target_option_restore (&global_options, existing_options);
9552 else
9553 cl_target_option_restore (&global_options,
9554 TREE_TARGET_OPTION (target_option_current_node));
9557 ret = aarch64_process_target_attr (args, "attribute");
9559 /* Set up any additional state. */
9560 if (ret)
9562 aarch64_override_options_internal (&global_options);
9563 /* Initialize SIMD builtins if we haven't already.
9564 Set current_target_pragma to NULL for the duration so that
9565 the builtin initialization code doesn't try to tag the functions
9566 being built with the attributes specified by any current pragma, thus
9567 going into an infinite recursion. */
9568 if (TARGET_SIMD)
9570 tree saved_current_target_pragma = current_target_pragma;
9571 current_target_pragma = NULL;
9572 aarch64_init_simd_builtins ();
9573 current_target_pragma = saved_current_target_pragma;
9575 new_target = build_target_option_node (&global_options);
9577 else
9578 new_target = NULL;
9580 new_optimize = build_optimization_node (&global_options);
9582 if (fndecl && ret)
9584 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9586 if (old_optimize != new_optimize)
9587 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9590 cl_target_option_restore (&global_options, &cur_target);
9592 if (old_optimize != new_optimize)
9593 cl_optimization_restore (&global_options,
9594 TREE_OPTIMIZATION (old_optimize));
9595 return ret;
9598 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9599 tri-bool options (yes, no, don't care) and the default value is
9600    DEF, return true if inlining is allowed.  */
9602 static bool
9603 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9604 int dont_care, int def)
9606 /* If the callee doesn't care, always allow inlining. */
9607 if (callee == dont_care)
9608 return true;
9610 /* If the caller doesn't care, always allow inlining. */
9611 if (caller == dont_care)
9612 return true;
9614 /* Otherwise, allow inlining if either the callee and caller values
9615 agree, or if the callee is using the default value. */
9616 return (callee == caller || callee == def);
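/* Illustrative sketch (editorial note, not part of GCC): the tri-bool rule
   implemented above.  DONT_CARE (the value 2, as passed by the callers of
   aarch64_tribools_ok_for_inlining_p) means "unset"; inlining is rejected
   only when both sides made an explicit choice and the callee's differs from
   both the caller's value and the default.  Compile separately.  */

#include <stdio.h>

#define DONT_CARE 2

static int
tribools_ok (int caller, int callee, int def)
{
  if (callee == DONT_CARE || caller == DONT_CARE)
    return 1;
  return callee == caller || callee == def;
}

int
main (void)
{
  /* Caller says "no" (0), callee says "yes" (1), default is "no":
     incompatible, so inlining would be rejected.  */
  printf ("%d\n", tribools_ok (0, 1, 0));
  /* Callee left it unset: always OK.  */
  printf ("%d\n", tribools_ok (0, DONT_CARE, 0));
  return 0;
}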
9619 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9620 to inline CALLEE into CALLER based on target-specific info.
9621 Make sure that the caller and callee have compatible architectural
9622 features. Then go through the other possible target attributes
9623 and see if they can block inlining. Try not to reject always_inline
9624 callees unless they are incompatible architecturally. */
9626 static bool
9627 aarch64_can_inline_p (tree caller, tree callee)
9629 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9630 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9632 /* If callee has no option attributes, then it is ok to inline. */
9633 if (!callee_tree)
9634 return true;
9636 struct cl_target_option *caller_opts
9637 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9638 : target_option_default_node);
9640 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9643 /* Callee's ISA flags should be a subset of the caller's. */
9644 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9645 != callee_opts->x_aarch64_isa_flags)
9646 return false;
9648 /* Allow non-strict aligned functions inlining into strict
9649 aligned ones. */
9650 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9651 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9652 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9653 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9654 return false;
9656 bool always_inline = lookup_attribute ("always_inline",
9657 DECL_ATTRIBUTES (callee));
9659 /* If the architectural features match up and the callee is always_inline
9660 then the other attributes don't matter. */
9661 if (always_inline)
9662 return true;
9664 if (caller_opts->x_aarch64_cmodel_var
9665 != callee_opts->x_aarch64_cmodel_var)
9666 return false;
9668 if (caller_opts->x_aarch64_tls_dialect
9669 != callee_opts->x_aarch64_tls_dialect)
9670 return false;
9672 /* Honour explicit requests to workaround errata. */
9673 if (!aarch64_tribools_ok_for_inlining_p (
9674 caller_opts->x_aarch64_fix_a53_err835769,
9675 callee_opts->x_aarch64_fix_a53_err835769,
9676 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9677 return false;
9679 if (!aarch64_tribools_ok_for_inlining_p (
9680 caller_opts->x_aarch64_fix_a53_err843419,
9681 callee_opts->x_aarch64_fix_a53_err843419,
9682 2, TARGET_FIX_ERR_A53_843419))
9683 return false;
9685 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9686      caller and callee and they don't match up, reject inlining.  */
9687 if (!aarch64_tribools_ok_for_inlining_p (
9688 caller_opts->x_flag_omit_leaf_frame_pointer,
9689 callee_opts->x_flag_omit_leaf_frame_pointer,
9690 2, 1))
9691 return false;
9693 /* If the callee has specific tuning overrides, respect them. */
9694 if (callee_opts->x_aarch64_override_tune_string != NULL
9695 && caller_opts->x_aarch64_override_tune_string == NULL)
9696 return false;
9698 /* If the user specified tuning override strings for the
9699 caller and callee and they don't match up, reject inlining.
9700 We just do a string compare here, we don't analyze the meaning
9701 of the string, as it would be too costly for little gain. */
9702 if (callee_opts->x_aarch64_override_tune_string
9703 && caller_opts->x_aarch64_override_tune_string
9704 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9705 caller_opts->x_aarch64_override_tune_string) != 0))
9706 return false;
9708 return true;
9711 /* Return true if SYMBOL_REF X binds locally. */
9713 static bool
9714 aarch64_symbol_binds_local_p (const_rtx x)
9716 return (SYMBOL_REF_DECL (x)
9717 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9718 : SYMBOL_REF_LOCAL_P (x));
9721 /* Return true if SYMBOL_REF X is thread local.  */
9722 static bool
9723 aarch64_tls_symbol_p (rtx x)
9725 if (! TARGET_HAVE_TLS)
9726 return false;
9728 if (GET_CODE (x) != SYMBOL_REF)
9729 return false;
9731 return SYMBOL_REF_TLS_MODEL (x) != 0;
9734 /* Classify a TLS symbol into one of the TLS kinds. */
9735 enum aarch64_symbol_type
9736 aarch64_classify_tls_symbol (rtx x)
9738 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9740 switch (tls_kind)
9742 case TLS_MODEL_GLOBAL_DYNAMIC:
9743 case TLS_MODEL_LOCAL_DYNAMIC:
9744 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9746 case TLS_MODEL_INITIAL_EXEC:
9747 switch (aarch64_cmodel)
9749 case AARCH64_CMODEL_TINY:
9750 case AARCH64_CMODEL_TINY_PIC:
9751 return SYMBOL_TINY_TLSIE;
9752 default:
9753 return SYMBOL_SMALL_TLSIE;
9756 case TLS_MODEL_LOCAL_EXEC:
9757 if (aarch64_tls_size == 12)
9758 return SYMBOL_TLSLE12;
9759 else if (aarch64_tls_size == 24)
9760 return SYMBOL_TLSLE24;
9761 else if (aarch64_tls_size == 32)
9762 return SYMBOL_TLSLE32;
9763 else if (aarch64_tls_size == 48)
9764 return SYMBOL_TLSLE48;
9765 else
9766 gcc_unreachable ();
9768 case TLS_MODEL_EMULATED:
9769 case TLS_MODEL_NONE:
9770 return SYMBOL_FORCE_TO_MEM;
9772 default:
9773 gcc_unreachable ();
9777 /* Return the method that should be used to access SYMBOL_REF or
9778 LABEL_REF X. */
9780 enum aarch64_symbol_type
9781 aarch64_classify_symbol (rtx x, rtx offset)
9783 if (GET_CODE (x) == LABEL_REF)
9785 switch (aarch64_cmodel)
9787 case AARCH64_CMODEL_LARGE:
9788 return SYMBOL_FORCE_TO_MEM;
9790 case AARCH64_CMODEL_TINY_PIC:
9791 case AARCH64_CMODEL_TINY:
9792 return SYMBOL_TINY_ABSOLUTE;
9794 case AARCH64_CMODEL_SMALL_SPIC:
9795 case AARCH64_CMODEL_SMALL_PIC:
9796 case AARCH64_CMODEL_SMALL:
9797 return SYMBOL_SMALL_ABSOLUTE;
9799 default:
9800 gcc_unreachable ();
9804 if (GET_CODE (x) == SYMBOL_REF)
9806 if (aarch64_tls_symbol_p (x))
9807 return aarch64_classify_tls_symbol (x);
9809 switch (aarch64_cmodel)
9811 case AARCH64_CMODEL_TINY:
9812 /* When we retrieve symbol + offset address, we have to make sure
9813 the offset does not cause overflow of the final address. But
9814 we have no way of knowing the address of symbol at compile time
9815 so we can't accurately say if the distance between the PC and
9816	 symbol + offset is outside the addressable range of +/-1M in the
9817	 TINY code model.  So we rely on images not being greater than
9818	 1M, cap the offset at 1M, and require anything beyond that to
9819	 be loaded using an alternative mechanism.  Furthermore, if the
9820 symbol is a weak reference to something that isn't known to
9821 resolve to a symbol in this module, then force to memory. */
9822 if ((SYMBOL_REF_WEAK (x)
9823 && !aarch64_symbol_binds_local_p (x))
9824 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9825 return SYMBOL_FORCE_TO_MEM;
9826 return SYMBOL_TINY_ABSOLUTE;
9828 case AARCH64_CMODEL_SMALL:
9829 /* Same reasoning as the tiny code model, but the offset cap here is
9830 4G. */
9831 if ((SYMBOL_REF_WEAK (x)
9832 && !aarch64_symbol_binds_local_p (x))
9833 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9834 HOST_WIDE_INT_C (4294967264)))
9835 return SYMBOL_FORCE_TO_MEM;
9836 return SYMBOL_SMALL_ABSOLUTE;
9838 case AARCH64_CMODEL_TINY_PIC:
9839 if (!aarch64_symbol_binds_local_p (x))
9840 return SYMBOL_TINY_GOT;
9841 return SYMBOL_TINY_ABSOLUTE;
9843 case AARCH64_CMODEL_SMALL_SPIC:
9844 case AARCH64_CMODEL_SMALL_PIC:
9845 if (!aarch64_symbol_binds_local_p (x))
9846 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9847 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9848 return SYMBOL_SMALL_ABSOLUTE;
9850 case AARCH64_CMODEL_LARGE:
9851 /* This is alright even in PIC code as the constant
9852 pool reference is always PC relative and within
9853 the same translation unit. */
9854 if (CONSTANT_POOL_ADDRESS_P (x))
9855 return SYMBOL_SMALL_ABSOLUTE;
9856 else
9857 return SYMBOL_FORCE_TO_MEM;
9859 default:
9860 gcc_unreachable ();
9864 /* By default push everything into the constant pool. */
9865 return SYMBOL_FORCE_TO_MEM;
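/* Illustrative sketch (editorial note, not part of GCC): the offset capping
   described above for the tiny code model, where a symbol + offset more than
   roughly 1MiB away is forced to the constant pool instead of being
   materialised directly.  Compile separately.  */

#include <stdio.h>

static int
tiny_offset_in_range (long long offset)
{
  return offset >= -1048575 && offset <= 1048575;
}

int
main (void)
{
  printf ("%d %d\n",
	  tiny_offset_in_range (1048575),	/* 1: still addressable.  */
	  tiny_offset_in_range (1 << 21));	/* 0: forced to memory.  */
  return 0;
}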
9868 bool
9869 aarch64_constant_address_p (rtx x)
9871 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9874 bool
9875 aarch64_legitimate_pic_operand_p (rtx x)
9877 if (GET_CODE (x) == SYMBOL_REF
9878 || (GET_CODE (x) == CONST
9879 && GET_CODE (XEXP (x, 0)) == PLUS
9880 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9881 return false;
9883 return true;
9886 /* Return true if X holds a floating-point constant that is either +0.0
9887    or representable as a quarter-precision (FMOV) immediate.  */
9888 static bool
9889 aarch64_valid_floating_const (machine_mode mode, rtx x)
9891 if (!CONST_DOUBLE_P (x))
9892 return false;
9894 if (aarch64_float_const_zero_rtx_p (x))
9895 return true;
9897 /* We only handle moving 0.0 to a TFmode register. */
9898 if (!(mode == SFmode || mode == DFmode))
9899 return false;
9901 return aarch64_float_const_representable_p (x);
9904 static bool
9905 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9907 /* Do not allow vector struct mode constants. We could support
9908 0 and -1 easily, but they need support in aarch64-simd.md. */
9909 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9910 return false;
9912 /* This could probably go away because
9913 we now decompose CONST_INTs according to expand_mov_immediate. */
9914 if ((GET_CODE (x) == CONST_VECTOR
9915 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9916 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9917 return !targetm.cannot_force_const_mem (mode, x);
9919 if (GET_CODE (x) == HIGH
9920 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9921 return true;
9923 return aarch64_constant_address_p (x);
9927 aarch64_load_tp (rtx target)
9929 if (!target
9930 || GET_MODE (target) != Pmode
9931 || !register_operand (target, Pmode))
9932 target = gen_reg_rtx (Pmode);
9934 /* Can return in any reg. */
9935 emit_insn (gen_aarch64_load_tp_hard (target));
9936 return target;
9939 /* On AAPCS systems, this is the "struct __va_list". */
9940 static GTY(()) tree va_list_type;
9942 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9943 Return the type to use as __builtin_va_list.
9945 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9947 struct __va_list
9949 void *__stack;
9950 void *__gr_top;
9951 void *__vr_top;
9952 int __gr_offs;
9953 int __vr_offs;
9954 }; */
9956 static tree
9957 aarch64_build_builtin_va_list (void)
9959 tree va_list_name;
9960 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9962 /* Create the type. */
9963 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9964 /* Give it the required name. */
9965 va_list_name = build_decl (BUILTINS_LOCATION,
9966 TYPE_DECL,
9967 get_identifier ("__va_list"),
9968 va_list_type);
9969 DECL_ARTIFICIAL (va_list_name) = 1;
9970 TYPE_NAME (va_list_type) = va_list_name;
9971 TYPE_STUB_DECL (va_list_type) = va_list_name;
9973 /* Create the fields. */
9974 f_stack = build_decl (BUILTINS_LOCATION,
9975 FIELD_DECL, get_identifier ("__stack"),
9976 ptr_type_node);
9977 f_grtop = build_decl (BUILTINS_LOCATION,
9978 FIELD_DECL, get_identifier ("__gr_top"),
9979 ptr_type_node);
9980 f_vrtop = build_decl (BUILTINS_LOCATION,
9981 FIELD_DECL, get_identifier ("__vr_top"),
9982 ptr_type_node);
9983 f_groff = build_decl (BUILTINS_LOCATION,
9984 FIELD_DECL, get_identifier ("__gr_offs"),
9985 integer_type_node);
9986 f_vroff = build_decl (BUILTINS_LOCATION,
9987 FIELD_DECL, get_identifier ("__vr_offs"),
9988 integer_type_node);
9990 /* Tell tree-stdarg pass about our internal offset fields.
9991 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9992 purposes, to identify whether the code is updating the va_list internal
9993 offset fields in an irregular way. */
9994 va_list_gpr_counter_field = f_groff;
9995 va_list_fpr_counter_field = f_vroff;
9997 DECL_ARTIFICIAL (f_stack) = 1;
9998 DECL_ARTIFICIAL (f_grtop) = 1;
9999 DECL_ARTIFICIAL (f_vrtop) = 1;
10000 DECL_ARTIFICIAL (f_groff) = 1;
10001 DECL_ARTIFICIAL (f_vroff) = 1;
10003 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10004 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10005 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10006 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10007 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10009 TYPE_FIELDS (va_list_type) = f_stack;
10010 DECL_CHAIN (f_stack) = f_grtop;
10011 DECL_CHAIN (f_grtop) = f_vrtop;
10012 DECL_CHAIN (f_vrtop) = f_groff;
10013 DECL_CHAIN (f_groff) = f_vroff;
10015 /* Compute its layout. */
10016 layout_type (va_list_type);
10018 return va_list_type;
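/* Illustrative sketch (not part of the original source): under the default
   LP64 ABI the record built above corresponds to the following C type, a
   32-byte structure with 8-byte alignment:

     typedef struct __va_list
     {
       void *__stack;	// next stacked (memory) argument
       void *__gr_top;	// end of the general register save area
       void *__vr_top;	// end of the FP/SIMD register save area
       int __gr_offs;	// negative offset from __gr_top to the next GP arg
       int __vr_offs;	// negative offset from __vr_top to the next VR arg
     } va_list;  */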
10021 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10022 static void
10023 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10025 const CUMULATIVE_ARGS *cum;
10026 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10027 tree stack, grtop, vrtop, groff, vroff;
10028 tree t;
10029 int gr_save_area_size = cfun->va_list_gpr_size;
10030 int vr_save_area_size = cfun->va_list_fpr_size;
10031 int vr_offset;
10033 cum = &crtl->args.info;
10034 if (cfun->va_list_gpr_size)
10035 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10036 cfun->va_list_gpr_size);
10037 if (cfun->va_list_fpr_size)
10038 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10039 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10041 if (!TARGET_FLOAT)
10043 gcc_assert (cum->aapcs_nvrn == 0);
10044 vr_save_area_size = 0;
10047 f_stack = TYPE_FIELDS (va_list_type_node);
10048 f_grtop = DECL_CHAIN (f_stack);
10049 f_vrtop = DECL_CHAIN (f_grtop);
10050 f_groff = DECL_CHAIN (f_vrtop);
10051 f_vroff = DECL_CHAIN (f_groff);
10053 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10054 NULL_TREE);
10055 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10056 NULL_TREE);
10057 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10058 NULL_TREE);
10059 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10060 NULL_TREE);
10061 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10062 NULL_TREE);
10064 /* Emit code to initialize STACK, which points to the next varargs stack
10065 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10066 by named arguments. STACK is 8-byte aligned. */
10067 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10068 if (cum->aapcs_stack_size > 0)
10069 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10070 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10071 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10073 /* Emit code to initialize GRTOP, the top of the GR save area.
10074 virtual_incoming_args_rtx should have been 16 byte aligned. */
10075 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10076 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10077 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10079 /* Emit code to initialize VRTOP, the top of the VR save area.
10080 This address is gr_save_area_bytes below GRTOP, rounded
10081 down to the next 16-byte boundary. */
10082 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10083 vr_offset = ROUND_UP (gr_save_area_size,
10084 STACK_BOUNDARY / BITS_PER_UNIT);
10086 if (vr_offset)
10087 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10088 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10089 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10091 /* Emit code to initialize GROFF, the offset from GRTOP of the
10092 next GPR argument. */
10093 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10094 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10095 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10097 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10098 of the next VR argument. */
10099 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10100 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10101 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
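/* Worked example (illustrative, not from the source): for
   `int f (int a, int b, ...)' the named arguments consume x0 and x1, so
   cum->aapcs_ncrn == 2 and, assuming the tree-stdarg pass has not shrunk
   the save areas and TARGET_FLOAT holds, gr_save_area_size == 6 * 8 == 48
   and vr_save_area_size == 8 * 16 == 128.  The expansion above then sets

     __stack   = virtual_incoming_args          (no named stack arguments)
     __gr_top  = virtual_incoming_args
     __vr_top  = virtual_incoming_args - 48     (GR area, 16-byte aligned)
     __gr_offs = -48
     __vr_offs = -128  */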
10104 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10106 static tree
10107 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10108 gimple_seq *post_p ATTRIBUTE_UNUSED)
10110 tree addr;
10111 bool indirect_p;
10112 bool is_ha; /* is HFA or HVA. */
10113 bool dw_align; /* double-word align. */
10114 machine_mode ag_mode = VOIDmode;
10115 int nregs;
10116 machine_mode mode;
10118 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10119 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10120 HOST_WIDE_INT size, rsize, adjust, align;
10121 tree t, u, cond1, cond2;
10123 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10124 if (indirect_p)
10125 type = build_pointer_type (type);
10127 mode = TYPE_MODE (type);
10129 f_stack = TYPE_FIELDS (va_list_type_node);
10130 f_grtop = DECL_CHAIN (f_stack);
10131 f_vrtop = DECL_CHAIN (f_grtop);
10132 f_groff = DECL_CHAIN (f_vrtop);
10133 f_vroff = DECL_CHAIN (f_groff);
10135 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10136 f_stack, NULL_TREE);
10137 size = int_size_in_bytes (type);
10138 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10140 dw_align = false;
10141 adjust = 0;
10142 if (aarch64_vfp_is_call_or_return_candidate (mode,
10143 type,
10144 &ag_mode,
10145 &nregs,
10146 &is_ha))
10148 /* TYPE passed in fp/simd registers. */
10149 if (!TARGET_FLOAT)
10150 aarch64_err_no_fpadvsimd (mode, "varargs");
10152 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10153 unshare_expr (valist), f_vrtop, NULL_TREE);
10154 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10155 unshare_expr (valist), f_vroff, NULL_TREE);
10157 rsize = nregs * UNITS_PER_VREG;
10159 if (is_ha)
10161 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10162 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10164 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10165 && size < UNITS_PER_VREG)
10167 adjust = UNITS_PER_VREG - size;
10170 else
10172 /* TYPE passed in general registers. */
10173 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10174 unshare_expr (valist), f_grtop, NULL_TREE);
10175 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10176 unshare_expr (valist), f_groff, NULL_TREE);
10177 rsize = ROUND_UP (size, UNITS_PER_WORD);
10178 nregs = rsize / UNITS_PER_WORD;
10180 if (align > 8)
10181 dw_align = true;
10183 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10184 && size < UNITS_PER_WORD)
10186 adjust = UNITS_PER_WORD - size;
10190 /* Get a local temporary for the field value. */
10191 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10193 /* Emit code to branch if off >= 0. */
10194 t = build2 (GE_EXPR, boolean_type_node, off,
10195 build_int_cst (TREE_TYPE (off), 0));
10196 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10198 if (dw_align)
10200 /* Emit: offs = (offs + 15) & -16. */
10201 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10202 build_int_cst (TREE_TYPE (off), 15));
10203 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10204 build_int_cst (TREE_TYPE (off), -16));
10205 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10207 else
10208 roundup = NULL;
10210 /* Update ap.__[g|v]r_offs */
10211 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10212 build_int_cst (TREE_TYPE (off), rsize));
10213 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10215 /* String up. */
10216 if (roundup)
10217 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10219 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10220 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10221 build_int_cst (TREE_TYPE (f_off), 0));
10222 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10224 /* String up: make sure the assignment happens before the use. */
10225 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10226 COND_EXPR_ELSE (cond1) = t;
10228 /* Prepare the trees handling the argument that is passed on the stack;
10229 the top-level node will be stored in ON_STACK. */
10230 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10231 if (align > 8)
10233 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10234 t = fold_convert (intDI_type_node, arg);
10235 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10236 build_int_cst (TREE_TYPE (t), 15));
10237 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10238 build_int_cst (TREE_TYPE (t), -16));
10239 t = fold_convert (TREE_TYPE (arg), t);
10240 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10242 else
10243 roundup = NULL;
10244 /* Advance ap.__stack */
10245 t = fold_convert (intDI_type_node, arg);
10246 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10247 build_int_cst (TREE_TYPE (t), size + 7));
10248 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10249 build_int_cst (TREE_TYPE (t), -8));
10250 t = fold_convert (TREE_TYPE (arg), t);
10251 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10252 /* String up roundup and advance. */
10253 if (roundup)
10254 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10255 /* String up with arg */
10256 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10257 /* Big-endianness related address adjustment. */
10258 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10259 && size < UNITS_PER_WORD)
10261 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10262 size_int (UNITS_PER_WORD - size));
10263 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10266 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10267 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10269 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10270 t = off;
10271 if (adjust)
10272 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10273 build_int_cst (TREE_TYPE (off), adjust));
10275 t = fold_convert (sizetype, t);
10276 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10278 if (is_ha)
10280 /* type ha; // treat as "struct {ftype field[n];}"
10281 ... [computing offs]
10282 for (i = 0; i < nregs; ++i, offs += 16)
10283 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10284 return ha; */
10285 int i;
10286 tree tmp_ha, field_t, field_ptr_t;
10288 /* Declare a local variable. */
10289 tmp_ha = create_tmp_var_raw (type, "ha");
10290 gimple_add_tmp_var (tmp_ha);
10292 /* Establish the base type. */
10293 switch (ag_mode)
10295 case SFmode:
10296 field_t = float_type_node;
10297 field_ptr_t = float_ptr_type_node;
10298 break;
10299 case DFmode:
10300 field_t = double_type_node;
10301 field_ptr_t = double_ptr_type_node;
10302 break;
10303 case TFmode:
10304 field_t = long_double_type_node;
10305 field_ptr_t = long_double_ptr_type_node;
10306 break;
10307 case HFmode:
10308 field_t = aarch64_fp16_type_node;
10309 field_ptr_t = aarch64_fp16_ptr_type_node;
10310 break;
10311 case V2SImode:
10312 case V4SImode:
10314 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10315 field_t = build_vector_type_for_mode (innertype, ag_mode);
10316 field_ptr_t = build_pointer_type (field_t);
10318 break;
10319 default:
10320 gcc_assert (0);
10323 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10324 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10325 addr = t;
10326 t = fold_convert (field_ptr_t, addr);
10327 t = build2 (MODIFY_EXPR, field_t,
10328 build1 (INDIRECT_REF, field_t, tmp_ha),
10329 build1 (INDIRECT_REF, field_t, t));
10331 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10332 for (i = 1; i < nregs; ++i)
10334 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10335 u = fold_convert (field_ptr_t, addr);
10336 u = build2 (MODIFY_EXPR, field_t,
10337 build2 (MEM_REF, field_t, tmp_ha,
10338 build_int_cst (field_ptr_t,
10339 (i *
10340 int_size_in_bytes (field_t)))),
10341 build1 (INDIRECT_REF, field_t, u));
10342 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10345 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10346 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10349 COND_EXPR_ELSE (cond2) = t;
10350 addr = fold_convert (build_pointer_type (type), cond1);
10351 addr = build_va_arg_indirect_ref (addr);
10353 if (indirect_p)
10354 addr = build_va_arg_indirect_ref (addr);
10356 return addr;
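/* Illustrative C-level sketch (not from the source) of the trees built above
   for an argument that may live in general registers and needs no extra
   alignment; the FP/SIMD path is analogous with __vr_top/__vr_offs:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                    // register save area already used up
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                    // this argument did not fit either
     addr = ap.__gr_top + off;           // plus any big-endian padding
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (char *) (((uintptr_t) addr + size + 7) & -8);
   done:
     result = *(type *) addr;  */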
10359 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10361 static void
10362 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10363 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10364 int no_rtl)
10366 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10367 CUMULATIVE_ARGS local_cum;
10368 int gr_saved = cfun->va_list_gpr_size;
10369 int vr_saved = cfun->va_list_fpr_size;
10371 /* The caller has advanced CUM up to, but not beyond, the last named
10372 argument. Advance a local copy of CUM past the last "real" named
10373 argument, to find out how many registers are left over. */
10374 local_cum = *cum;
10375 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10377 /* Find out how many registers we need to save.
10378 Honor tree-stdarg analysis results. */
10379 if (cfun->va_list_gpr_size)
10380 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10381 cfun->va_list_gpr_size / UNITS_PER_WORD);
10382 if (cfun->va_list_fpr_size)
10383 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10384 cfun->va_list_fpr_size / UNITS_PER_VREG);
10386 if (!TARGET_FLOAT)
10388 gcc_assert (local_cum.aapcs_nvrn == 0);
10389 vr_saved = 0;
10392 if (!no_rtl)
10394 if (gr_saved > 0)
10396 rtx ptr, mem;
10398 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10399 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10400 - gr_saved * UNITS_PER_WORD);
10401 mem = gen_frame_mem (BLKmode, ptr);
10402 set_mem_alias_set (mem, get_varargs_alias_set ());
10404 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10405 mem, gr_saved);
10407 if (vr_saved > 0)
10409 /* We can't use move_block_from_reg, because it will use
10410 the wrong mode, storing D regs only. */
10411 machine_mode mode = TImode;
10412 int off, i, vr_start;
10414 /* Set OFF to the offset from virtual_incoming_args_rtx of
10415 the first vector register. The VR save area lies below
10416 the GR one, and is aligned to 16 bytes. */
10417 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10418 STACK_BOUNDARY / BITS_PER_UNIT);
10419 off -= vr_saved * UNITS_PER_VREG;
10421 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10422 for (i = 0; i < vr_saved; ++i)
10424 rtx ptr, mem;
10426 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10427 mem = gen_frame_mem (mode, ptr);
10428 set_mem_alias_set (mem, get_varargs_alias_set ());
10429 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10430 off += UNITS_PER_VREG;
10435 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10436 any complication of having crtl->args.pretend_args_size changed. */
10437 cfun->machine->frame.saved_varargs_size
10438 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10439 STACK_BOUNDARY / BITS_PER_UNIT)
10440 + vr_saved * UNITS_PER_VREG);
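/* Illustrative layout (not part of the source) of the save area established
   above, addresses decreasing downwards from virtual_incoming_args_rtx:

     virtual_incoming_args_rtx --> +------------------------------+
                                   | gr_saved GP regs (x_ncrn ..) |  gr_saved * 8 bytes
     (16-byte aligned)         --> +------------------------------+
                                   | vr_saved V regs (q_nvrn ..)  |  vr_saved * 16 bytes
                                   +------------------------------+

   saved_varargs_size records the rounded total so the frame layout code can
   account for the area when sizing the stack frame.  */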
10443 static void
10444 aarch64_conditional_register_usage (void)
10446 int i;
10447 if (!TARGET_FLOAT)
10449 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10451 fixed_regs[i] = 1;
10452 call_used_regs[i] = 1;
10457 /* Walk down the type tree of TYPE counting consecutive base elements.
10458 If *MODEP is VOIDmode, then set it to the first valid floating point
10459 type. If a non-floating point type is found, or if a floating point
10460 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10461 otherwise return the count in the sub-tree. */
10462 static int
10463 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10465 machine_mode mode;
10466 HOST_WIDE_INT size;
10468 switch (TREE_CODE (type))
10470 case REAL_TYPE:
10471 mode = TYPE_MODE (type);
10472 if (mode != DFmode && mode != SFmode
10473 && mode != TFmode && mode != HFmode)
10474 return -1;
10476 if (*modep == VOIDmode)
10477 *modep = mode;
10479 if (*modep == mode)
10480 return 1;
10482 break;
10484 case COMPLEX_TYPE:
10485 mode = TYPE_MODE (TREE_TYPE (type));
10486 if (mode != DFmode && mode != SFmode
10487 && mode != TFmode && mode != HFmode)
10488 return -1;
10490 if (*modep == VOIDmode)
10491 *modep = mode;
10493 if (*modep == mode)
10494 return 2;
10496 break;
10498 case VECTOR_TYPE:
10499 /* Use V2SImode and V4SImode as representatives of all 64-bit
10500 and 128-bit vector types. */
10501 size = int_size_in_bytes (type);
10502 switch (size)
10504 case 8:
10505 mode = V2SImode;
10506 break;
10507 case 16:
10508 mode = V4SImode;
10509 break;
10510 default:
10511 return -1;
10514 if (*modep == VOIDmode)
10515 *modep = mode;
10517 /* Vector modes are considered to be opaque: two vectors are
10518 equivalent for the purposes of being homogeneous aggregates
10519 if they are the same size. */
10520 if (*modep == mode)
10521 return 1;
10523 break;
10525 case ARRAY_TYPE:
10527 int count;
10528 tree index = TYPE_DOMAIN (type);
10530 /* Can't handle incomplete types nor sizes that are not
10531 fixed. */
10532 if (!COMPLETE_TYPE_P (type)
10533 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10534 return -1;
10536 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10537 if (count == -1
10538 || !index
10539 || !TYPE_MAX_VALUE (index)
10540 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10541 || !TYPE_MIN_VALUE (index)
10542 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10543 || count < 0)
10544 return -1;
10546 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10547 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10549 /* There must be no padding. */
10550 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10551 return -1;
10553 return count;
10556 case RECORD_TYPE:
10558 int count = 0;
10559 int sub_count;
10560 tree field;
10562 /* Can't handle incomplete types nor sizes that are not
10563 fixed. */
10564 if (!COMPLETE_TYPE_P (type)
10565 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10566 return -1;
10568 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10570 if (TREE_CODE (field) != FIELD_DECL)
10571 continue;
10573 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10574 if (sub_count < 0)
10575 return -1;
10576 count += sub_count;
10579 /* There must be no padding. */
10580 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10581 return -1;
10583 return count;
10586 case UNION_TYPE:
10587 case QUAL_UNION_TYPE:
10589 /* These aren't very interesting except in a degenerate case. */
10590 int count = 0;
10591 int sub_count;
10592 tree field;
10594 /* Can't handle incomplete types nor sizes that are not
10595 fixed. */
10596 if (!COMPLETE_TYPE_P (type)
10597 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10598 return -1;
10600 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10602 if (TREE_CODE (field) != FIELD_DECL)
10603 continue;
10605 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10606 if (sub_count < 0)
10607 return -1;
10608 count = count > sub_count ? count : sub_count;
10611 /* There must be no padding. */
10612 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10613 return -1;
10615 return count;
10618 default:
10619 break;
10622 return -1;
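/* Illustrative examples (not from the source) of what the walk above yields
   when *MODEP starts out as VOIDmode:

     struct { float x, y, z; }                  ->  3, *MODEP == SFmode  (HFA)
     struct { double d[2]; _Complex double c; } ->  4, *MODEP == DFmode  (HFA)
     struct { float32x4_t a, b; }               ->  2, *MODEP == V4SImode (HVA)
     struct { float f; double d; }              -> -1  (mixed base types)
     struct { float f; int i; }                 -> -1  (non-FP member)

   A positive count no greater than HA_MAX_NUM_FLDS (4) makes the type a
   homogeneous aggregate; see aarch64_vfp_is_call_or_return_candidate below.  */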
10625 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10626 type as described in AAPCS64 \S 4.1.2.
10628 See the comment above aarch64_composite_type_p for the notes on MODE. */
10630 static bool
10631 aarch64_short_vector_p (const_tree type,
10632 machine_mode mode)
10634 HOST_WIDE_INT size = -1;
10636 if (type && TREE_CODE (type) == VECTOR_TYPE)
10637 size = int_size_in_bytes (type);
10638 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10639 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10640 size = GET_MODE_SIZE (mode);
10642 return (size == 8 || size == 16);
10645 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10646 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10647 array types. The C99 floating-point complex types are also considered
10648 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10649 types, which are GCC extensions and out of the scope of AAPCS64, are
10650 treated as composite types here as well.
10652 Note that MODE itself is not sufficient in determining whether a type
10653 is such a composite type or not. This is because
10654 stor-layout.c:compute_record_mode may have already changed the MODE
10655 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10656 structure with only one field may have its MODE set to the mode of the
10657 field. Also an integer mode whose size matches the size of the
10658 RECORD_TYPE type may be used to substitute the original mode
10659 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10660 solely relied on. */
10662 static bool
10663 aarch64_composite_type_p (const_tree type,
10664 machine_mode mode)
10666 if (aarch64_short_vector_p (type, mode))
10667 return false;
10669 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10670 return true;
10672 if (mode == BLKmode
10673 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10674 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10675 return true;
10677 return false;
10680 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10681 shall be passed or returned in simd/fp register(s) (providing these
10682 parameter passing registers are available).
10684 Upon successful return, *COUNT returns the number of needed registers,
10685 *BASE_MODE returns the mode of the individual register and, when IS_HA
10686 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10687 floating-point aggregate or a homogeneous short-vector aggregate. */
10689 static bool
10690 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10691 const_tree type,
10692 machine_mode *base_mode,
10693 int *count,
10694 bool *is_ha)
10696 machine_mode new_mode = VOIDmode;
10697 bool composite_p = aarch64_composite_type_p (type, mode);
10699 if (is_ha != NULL) *is_ha = false;
10701 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10702 || aarch64_short_vector_p (type, mode))
10704 *count = 1;
10705 new_mode = mode;
10707 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10709 if (is_ha != NULL) *is_ha = true;
10710 *count = 2;
10711 new_mode = GET_MODE_INNER (mode);
10713 else if (type && composite_p)
10715 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10717 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10719 if (is_ha != NULL) *is_ha = true;
10720 *count = ag_count;
10722 else
10723 return false;
10725 else
10726 return false;
10728 *base_mode = new_mode;
10729 return true;
10732 /* Implement TARGET_STRUCT_VALUE_RTX. */
10734 static rtx
10735 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10736 int incoming ATTRIBUTE_UNUSED)
10738 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10741 /* Implements target hook vector_mode_supported_p. */
10742 static bool
10743 aarch64_vector_mode_supported_p (machine_mode mode)
10745 if (TARGET_SIMD
10746 && (mode == V4SImode || mode == V8HImode
10747 || mode == V16QImode || mode == V2DImode
10748 || mode == V2SImode || mode == V4HImode
10749 || mode == V8QImode || mode == V2SFmode
10750 || mode == V4SFmode || mode == V2DFmode
10751 || mode == V4HFmode || mode == V8HFmode
10752 || mode == V1DFmode))
10753 return true;
10755 return false;
10758 /* Return appropriate SIMD container
10759 for MODE within a vector of WIDTH bits. */
10760 static machine_mode
10761 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10763 gcc_assert (width == 64 || width == 128);
10764 if (TARGET_SIMD)
10766 if (width == 128)
10767 switch (mode)
10769 case DFmode:
10770 return V2DFmode;
10771 case SFmode:
10772 return V4SFmode;
10773 case SImode:
10774 return V4SImode;
10775 case HImode:
10776 return V8HImode;
10777 case QImode:
10778 return V16QImode;
10779 case DImode:
10780 return V2DImode;
10781 default:
10782 break;
10784 else
10785 switch (mode)
10787 case SFmode:
10788 return V2SFmode;
10789 case SImode:
10790 return V2SImode;
10791 case HImode:
10792 return V4HImode;
10793 case QImode:
10794 return V8QImode;
10795 default:
10796 break;
10799 return word_mode;
10802 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10803 static machine_mode
10804 aarch64_preferred_simd_mode (machine_mode mode)
10806 return aarch64_simd_container_mode (mode, 128);
10809 /* Return the bitmask of possible vector sizes for the vectorizer
10810 to iterate over. */
10811 static unsigned int
10812 aarch64_autovectorize_vector_sizes (void)
10814 return (16 | 8);
10817 /* Implement TARGET_MANGLE_TYPE. */
10819 static const char *
10820 aarch64_mangle_type (const_tree type)
10822 /* The AArch64 ABI documents say that "__va_list" has to be
10823 mangled as if it is in the "std" namespace. */
10824 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10825 return "St9__va_list";
10827 /* Half-precision float. */
10828 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10829 return "Dh";
10831 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10832 builtin types. */
10833 if (TYPE_NAME (type) != NULL)
10834 return aarch64_mangle_builtin_type (type);
10836 /* Use the default mangling. */
10837 return NULL;
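/* Illustrative examples (not from the source) of the rules above: with a
   C++ front end, `void f (__builtin_va_list)' mangles as if the parameter
   were `std::__va_list', giving `_Z1fSt9__va_list', and `void g (__fp16)'
   mangles as `_Z1gDh'.  */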
10841 /* Return true if the rtx_insn contains a MEM RTX somewhere
10842 in it. */
10844 static bool
10845 has_memory_op (rtx_insn *mem_insn)
10847 subrtx_iterator::array_type array;
10848 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10849 if (MEM_P (*iter))
10850 return true;
10852 return false;
10855 /* Find the first rtx_insn before insn that will generate an assembly
10856 instruction. */
10858 static rtx_insn *
10859 aarch64_prev_real_insn (rtx_insn *insn)
10861 if (!insn)
10862 return NULL;
10866 insn = prev_real_insn (insn);
10868 while (insn && recog_memoized (insn) < 0);
10870 return insn;
10873 static bool
10874 is_madd_op (enum attr_type t1)
10876 unsigned int i;
10877 /* A number of these may be AArch32 only. */
10878 enum attr_type mlatypes[] = {
10879 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10880 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10881 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10884 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10886 if (t1 == mlatypes[i])
10887 return true;
10890 return false;
10893 /* Check if there is a register dependency between a load and the insn
10894 for which we hold recog_data. */
10896 static bool
10897 dep_between_memop_and_curr (rtx memop)
10899 rtx load_reg;
10900 int opno;
10902 gcc_assert (GET_CODE (memop) == SET);
10904 if (!REG_P (SET_DEST (memop)))
10905 return false;
10907 load_reg = SET_DEST (memop);
10908 for (opno = 1; opno < recog_data.n_operands; opno++)
10910 rtx operand = recog_data.operand[opno];
10911 if (REG_P (operand)
10912 && reg_overlap_mentioned_p (load_reg, operand))
10913 return true;
10916 return false;
10920 /* When working around the Cortex-A53 erratum 835769,
10921 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10922 instruction and has a preceding memory instruction such that a NOP
10923 should be inserted between them. */
10925 bool
10926 aarch64_madd_needs_nop (rtx_insn* insn)
10928 enum attr_type attr_type;
10929 rtx_insn *prev;
10930 rtx body;
10932 if (!TARGET_FIX_ERR_A53_835769)
10933 return false;
10935 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10936 return false;
10938 attr_type = get_attr_type (insn);
10939 if (!is_madd_op (attr_type))
10940 return false;
10942 prev = aarch64_prev_real_insn (insn);
10943 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10944 Restore recog state to INSN to avoid state corruption. */
10945 extract_constrain_insn_cached (insn);
10947 if (!prev || !has_memory_op (prev))
10948 return false;
10950 body = single_set (prev);
10952 /* If the previous insn is a memory op and there is no dependency between
10953 it and the DImode madd, emit a NOP between them. If body is NULL then we
10954 have a complex memory operation, probably a load/store pair.
10955 Be conservative for now and emit a NOP. */
10956 if (GET_MODE (recog_data.operand[0]) == DImode
10957 && (!body || !dep_between_memop_and_curr (body)))
10958 return true;
10960 return false;
10965 /* Implement FINAL_PRESCAN_INSN. */
10967 void
10968 aarch64_final_prescan_insn (rtx_insn *insn)
10970 if (aarch64_madd_needs_nop (insn))
10971 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
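/* Illustrative effect of the erratum 835769 workaround above (the assembly is
   an example, not taken from the source): with -mfix-cortex-a53-835769, a
   sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   is emitted as

     ldr  x1, [x2]
     nop		// between mem op and mult-accumulate
     madd x0, x3, x4, x5

   because the 64-bit multiply-accumulate has no register dependency on the
   preceding memory operation.  */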
10975 /* Return the equivalent letter for size. */
10976 static char
10977 sizetochar (int size)
10979 switch (size)
10981 case 64: return 'd';
10982 case 32: return 's';
10983 case 16: return 'h';
10984 case 8 : return 'b';
10985 default: gcc_unreachable ();
10989 /* Return true iff x is a uniform vector of floating-point
10990 constants, and the constant can be represented in
10991 quarter-precision form. Note, as aarch64_float_const_representable
10992 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10993 static bool
10994 aarch64_vect_float_const_representable_p (rtx x)
10996 rtx elt;
10997 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10998 && const_vec_duplicate_p (x, &elt)
10999 && aarch64_float_const_representable_p (elt));
11002 /* Return true for valid and false for invalid. */
11003 bool
11004 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11005 struct simd_immediate_info *info)
11007 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11008 matches = 1; \
11009 for (i = 0; i < idx; i += (STRIDE)) \
11010 if (!(TEST)) \
11011 matches = 0; \
11012 if (matches) \
11014 immtype = (CLASS); \
11015 elsize = (ELSIZE); \
11016 eshift = (SHIFT); \
11017 emvn = (NEG); \
11018 break; \
11021 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11022 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11023 unsigned char bytes[16];
11024 int immtype = -1, matches;
11025 unsigned int invmask = inverse ? 0xff : 0;
11026 int eshift, emvn;
11028 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11030 if (! (aarch64_simd_imm_zero_p (op, mode)
11031 || aarch64_vect_float_const_representable_p (op)))
11032 return false;
11034 if (info)
11036 info->value = CONST_VECTOR_ELT (op, 0);
11037 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11038 info->mvn = false;
11039 info->shift = 0;
11042 return true;
11045 /* Splat vector constant out into a byte vector. */
11046 for (i = 0; i < n_elts; i++)
11048 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11049 it must be laid out in the vector register in reverse order. */
11050 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11051 unsigned HOST_WIDE_INT elpart;
11053 gcc_assert (CONST_INT_P (el));
11054 elpart = INTVAL (el);
11056 for (unsigned int byte = 0; byte < innersize; byte++)
11058 bytes[idx++] = (elpart & 0xff) ^ invmask;
11059 elpart >>= BITS_PER_UNIT;
11064 /* Sanity check. */
11065 gcc_assert (idx == GET_MODE_SIZE (mode));
11069 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11070 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11072 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11073 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11075 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11076 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11078 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11079 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11081 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11083 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11085 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11086 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11088 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11089 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11091 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11092 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11094 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11095 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11097 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11099 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11101 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11102 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11104 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11105 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11107 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11108 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11110 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11111 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11113 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11115 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11116 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11118 while (0);
11120 if (immtype == -1)
11121 return false;
11123 if (info)
11125 info->element_width = elsize;
11126 info->mvn = emvn != 0;
11127 info->shift = eshift;
11129 unsigned HOST_WIDE_INT imm = 0;
11131 if (immtype >= 12 && immtype <= 15)
11132 info->msl = true;
11134 /* Un-invert bytes of recognized vector, if necessary. */
11135 if (invmask != 0)
11136 for (i = 0; i < idx; i++)
11137 bytes[i] ^= invmask;
11139 if (immtype == 17)
11141 /* FIXME: Broken on 32-bit H_W_I hosts. */
11142 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11144 for (i = 0; i < 8; i++)
11145 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11146 << (i * BITS_PER_UNIT);
11149 info->value = GEN_INT (imm);
11151 else
11153 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11154 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11156 /* Construct 'abcdefgh' because the assembler cannot handle
11157 generic constants. */
11158 if (info->mvn)
11159 imm = ~imm;
11160 imm = (imm >> info->shift) & 0xff;
11161 info->value = GEN_INT (imm);
11165 return true;
11166 #undef CHECK
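/* Worked example (illustrative, not from the source): on a little-endian
   target a V4SImode constant whose elements are all 0x0000ff00 splats to the
   per-element byte pattern {0, 0xff, 0, 0}, which matches the second CHECK
   above (ELSIZE 32, SHIFT 8, no inversion).  *INFO then reports
   element_width == 32, shift == 8, mvn == false and value == 0xff, which the
   output code can emit as, e.g.,

     movi  v0.4s, 0xff, lsl 8  */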
11169 /* Check that immediate shift constants are within range. */
11170 bool
11171 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11173 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11174 if (left)
11175 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11176 else
11177 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11180 /* Return true if X is a uniform vector where all elements
11181 are either the floating-point constant 0.0 or the
11182 integer constant 0. */
11183 bool
11184 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11186 return x == CONST0_RTX (mode);
11190 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11191 operation of width WIDTH at bit position POS. */
11194 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11196 gcc_assert (CONST_INT_P (width));
11197 gcc_assert (CONST_INT_P (pos));
11199 unsigned HOST_WIDE_INT mask
11200 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11201 return GEN_INT (mask << UINTVAL (pos));
11204 bool
11205 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11207 HOST_WIDE_INT imm = INTVAL (x);
11208 int i;
11210 for (i = 0; i < 8; i++)
11212 unsigned int byte = imm & 0xff;
11213 if (byte != 0xff && byte != 0)
11214 return false;
11215 imm >>= 8;
11218 return true;
11221 bool
11222 aarch64_mov_operand_p (rtx x, machine_mode mode)
11224 if (GET_CODE (x) == HIGH
11225 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11226 return true;
11228 if (CONST_INT_P (x))
11229 return true;
11231 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11232 return true;
11234 return aarch64_classify_symbolic_expression (x)
11235 == SYMBOL_TINY_ABSOLUTE;
11238 /* Return a const_int vector of VAL. */
11240 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11242 int nunits = GET_MODE_NUNITS (mode);
11243 rtvec v = rtvec_alloc (nunits);
11244 int i;
11246 rtx cache = GEN_INT (val);
11248 for (i=0; i < nunits; i++)
11249 RTVEC_ELT (v, i) = cache;
11251 return gen_rtx_CONST_VECTOR (mode, v);
11254 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11256 bool
11257 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11259 machine_mode vmode;
11261 gcc_assert (!VECTOR_MODE_P (mode));
11262 vmode = aarch64_preferred_simd_mode (mode);
11263 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11264 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11267 /* Construct and return a PARALLEL RTX vector with elements numbering the
11268 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11269 the vector - from the perspective of the architecture. This does not
11270 line up with GCC's perspective on lane numbers, so we end up with
11271 different masks depending on our target endian-ness. The diagram
11272 below may help. We must draw the distinction when building masks
11273 which select one half of the vector. An instruction selecting
11274 architectural low-lanes for a big-endian target, must be described using
11275 a mask selecting GCC high-lanes.
11277 Big-Endian Little-Endian
11279 GCC 0 1 2 3 3 2 1 0
11280 | x | x | x | x | | x | x | x | x |
11281 Architecture 3 2 1 0 3 2 1 0
11283 Low Mask: { 2, 3 } { 0, 1 }
11284 High Mask: { 0, 1 } { 2, 3 }
11288 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11290 int nunits = GET_MODE_NUNITS (mode);
11291 rtvec v = rtvec_alloc (nunits / 2);
11292 int high_base = nunits / 2;
11293 int low_base = 0;
11294 int base;
11295 rtx t1;
11296 int i;
11298 if (BYTES_BIG_ENDIAN)
11299 base = high ? low_base : high_base;
11300 else
11301 base = high ? high_base : low_base;
11303 for (i = 0; i < nunits / 2; i++)
11304 RTVEC_ELT (v, i) = GEN_INT (base + i);
11306 t1 = gen_rtx_PARALLEL (mode, v);
11307 return t1;
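/* Example (illustrative): for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian targets and
   (parallel [(const_int 0) (const_int 1)]) on big-endian targets, matching
   the "High Mask" row of the diagram above.  */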
11310 /* Check OP for validity as a PARALLEL RTX vector with elements
11311 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11312 from the perspective of the architecture. See the diagram above
11313 aarch64_simd_vect_par_cnst_half for more details. */
11315 bool
11316 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11317 bool high)
11319 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11320 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11321 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11322 int i = 0;
11324 if (!VECTOR_MODE_P (mode))
11325 return false;
11327 if (count_op != count_ideal)
11328 return false;
11330 for (i = 0; i < count_ideal; i++)
11332 rtx elt_op = XVECEXP (op, 0, i);
11333 rtx elt_ideal = XVECEXP (ideal, 0, i);
11335 if (!CONST_INT_P (elt_op)
11336 || INTVAL (elt_ideal) != INTVAL (elt_op))
11337 return false;
11339 return true;
11342 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11343 HIGH (exclusive). */
11344 void
11345 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11346 const_tree exp)
11348 HOST_WIDE_INT lane;
11349 gcc_assert (CONST_INT_P (operand));
11350 lane = INTVAL (operand);
11352 if (lane < low || lane >= high)
11354 if (exp)
11355 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11356 else
11357 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11361 /* Return TRUE if OP is a valid vector addressing mode. */
11362 bool
11363 aarch64_simd_mem_operand_p (rtx op)
11365 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11366 || REG_P (XEXP (op, 0)));
11369 /* Emit a register copy from operand to operand, taking care not to
11370 early-clobber source registers in the process.
11372 COUNT is the number of components into which the copy needs to be
11373 decomposed. */
11374 void
11375 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11376 unsigned int count)
11378 unsigned int i;
11379 int rdest = REGNO (operands[0]);
11380 int rsrc = REGNO (operands[1]);
11382 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11383 || rdest < rsrc)
11384 for (i = 0; i < count; i++)
11385 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11386 gen_rtx_REG (mode, rsrc + i));
11387 else
11388 for (i = 0; i < count; i++)
11389 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11390 gen_rtx_REG (mode, rsrc + count - i - 1));
11393 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11394 one of VSTRUCT modes: OI, CI, or XI. */
11396 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11398 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11401 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11402 alignment of a vector to 128 bits. */
11403 static HOST_WIDE_INT
11404 aarch64_simd_vector_alignment (const_tree type)
11406 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11407 return MIN (align, 128);
11410 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11411 static bool
11412 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11414 if (is_packed)
11415 return false;
11417 /* We guarantee alignment for vectors up to 128-bits. */
11418 if (tree_int_cst_compare (TYPE_SIZE (type),
11419 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11420 return false;
11422 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11423 return true;
11426 /* Return true if the vector misalignment factor is supported by the
11427 target. */
11428 static bool
11429 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11430 const_tree type, int misalignment,
11431 bool is_packed)
11433 if (TARGET_SIMD && STRICT_ALIGNMENT)
11435 /* Return false if the movmisalign pattern is not supported for this mode. */
11436 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11437 return false;
11439 if (misalignment == -1)
11441 /* Misalignment factor is unknown at compile time but we know
11442 it's word aligned. */
11443 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11445 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11447 if (element_size != 64)
11448 return true;
11450 return false;
11453 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11454 is_packed);
11457 /* If VALS is a vector constant that can be loaded into a register
11458 using DUP, generate instructions to do so and return an RTX to
11459 assign to the register. Otherwise return NULL_RTX. */
11460 static rtx
11461 aarch64_simd_dup_constant (rtx vals)
11463 machine_mode mode = GET_MODE (vals);
11464 machine_mode inner_mode = GET_MODE_INNER (mode);
11465 rtx x;
11467 if (!const_vec_duplicate_p (vals, &x))
11468 return NULL_RTX;
11470 /* We can load this constant by using DUP and a constant in a
11471 single ARM register. This will be cheaper than a vector
11472 load. */
11473 x = copy_to_mode_reg (inner_mode, x);
11474 return gen_rtx_VEC_DUPLICATE (mode, x);
11478 /* Generate code to load VALS, which is a PARALLEL containing only
11479 constants (for vec_init) or CONST_VECTOR, efficiently into a
11480 register. Returns an RTX to copy into the register, or NULL_RTX
11481 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11482 static rtx
11483 aarch64_simd_make_constant (rtx vals)
11485 machine_mode mode = GET_MODE (vals);
11486 rtx const_dup;
11487 rtx const_vec = NULL_RTX;
11488 int n_elts = GET_MODE_NUNITS (mode);
11489 int n_const = 0;
11490 int i;
11492 if (GET_CODE (vals) == CONST_VECTOR)
11493 const_vec = vals;
11494 else if (GET_CODE (vals) == PARALLEL)
11496 /* A CONST_VECTOR must contain only CONST_INTs and
11497 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11498 Only store valid constants in a CONST_VECTOR. */
11499 for (i = 0; i < n_elts; ++i)
11501 rtx x = XVECEXP (vals, 0, i);
11502 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11503 n_const++;
11505 if (n_const == n_elts)
11506 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11508 else
11509 gcc_unreachable ();
11511 if (const_vec != NULL_RTX
11512 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11513 /* Load using MOVI/MVNI. */
11514 return const_vec;
11515 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11516 /* Loaded using DUP. */
11517 return const_dup;
11518 else if (const_vec != NULL_RTX)
11519 /* Load from constant pool. We can not take advantage of single-cycle
11520 LD1 because we need a PC-relative addressing mode. */
11521 return const_vec;
11522 else
11523 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11524 We can not construct an initializer. */
11525 return NULL_RTX;
11528 /* Expand a vector initialisation sequence, such that TARGET is
11529 initialised to contain VALS. */
11531 void
11532 aarch64_expand_vector_init (rtx target, rtx vals)
11534 machine_mode mode = GET_MODE (target);
11535 machine_mode inner_mode = GET_MODE_INNER (mode);
11536 /* The number of vector elements. */
11537 int n_elts = GET_MODE_NUNITS (mode);
11538 /* The number of vector elements which are not constant. */
11539 int n_var = 0;
11540 rtx any_const = NULL_RTX;
11541 /* The first element of vals. */
11542 rtx v0 = XVECEXP (vals, 0, 0);
11543 bool all_same = true;
11545 /* Count the number of variable elements to initialise. */
11546 for (int i = 0; i < n_elts; ++i)
11548 rtx x = XVECEXP (vals, 0, i);
11549 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11550 ++n_var;
11551 else
11552 any_const = x;
11554 all_same &= rtx_equal_p (x, v0);
11557 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11558 how best to handle this. */
11559 if (n_var == 0)
11561 rtx constant = aarch64_simd_make_constant (vals);
11562 if (constant != NULL_RTX)
11564 emit_move_insn (target, constant);
11565 return;
11569 /* Splat a single non-constant element if we can. */
11570 if (all_same)
11572 rtx x = copy_to_mode_reg (inner_mode, v0);
11573 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11574 return;
11577 /* Initialise a vector which is part-variable. We want to first try
11578 to build those lanes which are constant in the most efficient way we
11579 can. */
11580 if (n_var != n_elts)
11582 rtx copy = copy_rtx (vals);
11584 /* Load constant part of vector. We really don't care what goes into the
11585 parts we will overwrite, but we're more likely to be able to load the
11586 constant efficiently if it has fewer, larger, repeating parts
11587 (see aarch64_simd_valid_immediate). */
11588 for (int i = 0; i < n_elts; i++)
11590 rtx x = XVECEXP (vals, 0, i);
11591 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11592 continue;
11593 rtx subst = any_const;
11594 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11596 /* Look in the copied vector, as more elements are const. */
11597 rtx test = XVECEXP (copy, 0, i ^ bit);
11598 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11600 subst = test;
11601 break;
11604 XVECEXP (copy, 0, i) = subst;
11606 aarch64_expand_vector_init (target, copy);
11609 /* Insert the variable lanes directly. */
11611 enum insn_code icode = optab_handler (vec_set_optab, mode);
11612 gcc_assert (icode != CODE_FOR_nothing);
11614 for (int i = 0; i < n_elts; i++)
11616 rtx x = XVECEXP (vals, 0, i);
11617 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11618 continue;
11619 x = copy_to_mode_reg (inner_mode, x);
11620 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11624 static unsigned HOST_WIDE_INT
11625 aarch64_shift_truncation_mask (machine_mode mode)
11627 return
11628 (!SHIFT_COUNT_TRUNCATED
11629 || aarch64_vector_mode_supported_p (mode)
11630 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11633 /* Select a format to encode pointers in exception handling data. */
11635 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11637 int type;
11638 switch (aarch64_cmodel)
11640 case AARCH64_CMODEL_TINY:
11641 case AARCH64_CMODEL_TINY_PIC:
11642 case AARCH64_CMODEL_SMALL:
11643 case AARCH64_CMODEL_SMALL_PIC:
11644 case AARCH64_CMODEL_SMALL_SPIC:
11645 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11646 for everything. */
11647 type = DW_EH_PE_sdata4;
11648 break;
11649 default:
11650 /* No assumptions here. 8-byte relocs required. */
11651 type = DW_EH_PE_sdata8;
11652 break;
11654 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11657 /* The last .arch and .tune assembly strings that we printed. */
11658 static std::string aarch64_last_printed_arch_string;
11659 static std::string aarch64_last_printed_tune_string;
11661 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11662 by the function fndecl. */
11664 void
11665 aarch64_declare_function_name (FILE *stream, const char* name,
11666 tree fndecl)
11668 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11670 struct cl_target_option *targ_options;
11671 if (target_parts)
11672 targ_options = TREE_TARGET_OPTION (target_parts);
11673 else
11674 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11675 gcc_assert (targ_options);
11677 const struct processor *this_arch
11678 = aarch64_get_arch (targ_options->x_explicit_arch);
11680 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11681 std::string extension
11682 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11683 this_arch->flags);
11684 /* Only update the assembler .arch string if it is distinct from the last
11685 such string we printed. */
11686 std::string to_print = this_arch->name + extension;
11687 if (to_print != aarch64_last_printed_arch_string)
11689 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11690 aarch64_last_printed_arch_string = to_print;
11693 /* Print the cpu name we're tuning for in the comments; it might be
11694 useful to readers of the generated asm. Do it only when it changes
11695 from function to function and verbose assembly is requested. */
11696 const struct processor *this_tune
11697 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11699 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11701 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11702 this_tune->name);
11703 aarch64_last_printed_tune_string = this_tune->name;
11706 /* Don't forget the type directive for ELF. */
11707 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11708 ASM_OUTPUT_LABEL (stream, name);
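/* Illustrative output (example only; the exact strings depend on the options
   in effect): for the first function of a unit compiled with
   -march=armv8-a+crc -mtune=cortex-a53 -fverbose-asm, this hook together with
   aarch64_start_file below emits something like

     	.arch armv8-a+crc
     	// .tune cortex-a53
     	.type	foo, %function
     foo:  */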
11711 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11713 static void
11714 aarch64_start_file (void)
11716 struct cl_target_option *default_options
11717 = TREE_TARGET_OPTION (target_option_default_node);
11719 const struct processor *default_arch
11720 = aarch64_get_arch (default_options->x_explicit_arch);
11721 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11722 std::string extension
11723 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11724 default_arch->flags);
11726 aarch64_last_printed_arch_string = default_arch->name + extension;
11727 aarch64_last_printed_tune_string = "";
11728 asm_fprintf (asm_out_file, "\t.arch %s\n",
11729 aarch64_last_printed_arch_string.c_str ());
11731 default_file_start ();
11734 /* Emit load exclusive. */
11736 static void
11737 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11738 rtx mem, rtx model_rtx)
11740 rtx (*gen) (rtx, rtx, rtx);
11742 switch (mode)
11744 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11745 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11746 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11747 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11748 default:
11749 gcc_unreachable ();
11752 emit_insn (gen (rval, mem, model_rtx));
11755 /* Emit store exclusive. */
11757 static void
11758 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11759 rtx rval, rtx mem, rtx model_rtx)
11761 rtx (*gen) (rtx, rtx, rtx, rtx);
11763 switch (mode)
11765 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11766 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11767 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11768 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11769 default:
11770 gcc_unreachable ();
11773 emit_insn (gen (bval, rval, mem, model_rtx));
11776 /* Emit the jump instruction INSN and mark it as very unlikely to be taken. */
11778 static void
11779 aarch64_emit_unlikely_jump (rtx insn)
11781 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11783 rtx_insn *jump = emit_jump_insn (insn);
11784 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11787 /* Expand a compare and swap pattern. */
11789 void
11790 aarch64_expand_compare_and_swap (rtx operands[])
11792 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11793 machine_mode mode, cmp_mode;
11794 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11795 int idx;
11796 gen_cas_fn gen;
11797 const gen_cas_fn split_cas[] =
11799 gen_aarch64_compare_and_swapqi,
11800 gen_aarch64_compare_and_swaphi,
11801 gen_aarch64_compare_and_swapsi,
11802 gen_aarch64_compare_and_swapdi
11804 const gen_cas_fn atomic_cas[] =
11806 gen_aarch64_compare_and_swapqi_lse,
11807 gen_aarch64_compare_and_swaphi_lse,
11808 gen_aarch64_compare_and_swapsi_lse,
11809 gen_aarch64_compare_and_swapdi_lse
11812 bval = operands[0];
11813 rval = operands[1];
11814 mem = operands[2];
11815 oldval = operands[3];
11816 newval = operands[4];
11817 is_weak = operands[5];
11818 mod_s = operands[6];
11819 mod_f = operands[7];
11820 mode = GET_MODE (mem);
11821 cmp_mode = mode;
11823 /* Normally the succ memory model must be stronger than fail, but in the
11824 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11825 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11827 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11828 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11829 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11831 switch (mode)
11833 case QImode:
11834 case HImode:
11835 /* For short modes, we're going to perform the comparison in SImode,
11836 so do the zero-extension now. */
11837 cmp_mode = SImode;
11838 rval = gen_reg_rtx (SImode);
11839 oldval = convert_modes (SImode, mode, oldval, true);
11840 /* Fall through. */
11842 case SImode:
11843 case DImode:
11844 /* Force the value into a register if needed. */
11845 if (!aarch64_plus_operand (oldval, mode))
11846 oldval = force_reg (cmp_mode, oldval);
11847 break;
11849 default:
11850 gcc_unreachable ();
11853 switch (mode)
11855 case QImode: idx = 0; break;
11856 case HImode: idx = 1; break;
11857 case SImode: idx = 2; break;
11858 case DImode: idx = 3; break;
11859 default:
11860 gcc_unreachable ();
11862 if (TARGET_LSE)
11863 gen = atomic_cas[idx];
11864 else
11865 gen = split_cas[idx];
11867 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11869 if (mode == QImode || mode == HImode)
11870 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11872 x = gen_rtx_REG (CCmode, CC_REGNUM);
11873 x = gen_rtx_EQ (SImode, x, const0_rtx);
11874 emit_insn (gen_rtx_SET (bval, x));
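/* Illustrative expansion (the assembly is an example, not from the source):
   a strong __atomic_compare_exchange on an `int' with SEQ_CST ordering goes
   through the split path below and typically becomes an exclusive-access
   loop such as

     .L1:	ldaxr	w0, [x1]
     	cmp	w0, w2
     	bne	.L2
     	stlxr	w3, w4, [x1]
     	cbnz	w3, .L1
     .L2:

   whereas with TARGET_LSE (-march=armv8.1-a) the _lse patterns above select
   a single CASAL instruction instead.  */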
11877 /* Test whether the target supports using an atomic load-operate
11878 instruction for operation CODE. Returns FALSE if the operation
11879 isn't supported by the architecture. */
11883 bool
11884 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11886 if (!TARGET_LSE)
11887 return false;
11889 switch (code)
11891 case SET:
11892 case AND:
11893 case IOR:
11894 case XOR:
11895 case MINUS:
11896 case PLUS:
11897 return true;
11898 default:
11899 return false;
11903 /* Emit a barrier appropriate for memory model MODEL at the end of a
11904 sequence implementing an atomic operation. */
11906 static void
11907 aarch64_emit_post_barrier (enum memmodel model)
11909 const enum memmodel base_model = memmodel_base (model);
11911 if (is_mm_sync (model)
11912 && (base_model == MEMMODEL_ACQUIRE
11913 || base_model == MEMMODEL_ACQ_REL
11914 || base_model == MEMMODEL_SEQ_CST))
11916 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
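/* For example, a __sync_val_compare_and_swap (which uses a SYNC variant of
   SEQ_CST) is followed here by a full barrier, typically a DMB ISH, whereas
   plain __atomic operations get no extra trailing barrier.  */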
11920 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11921 for the data in memory. EXPECTED is the value expected to be in memory.
11922 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11923 is the memory ordering to use. */
11925 void
11926 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11927 rtx expected, rtx desired,
11928 rtx model)
11930 rtx (*gen) (rtx, rtx, rtx, rtx);
11931 machine_mode mode;
11933 mode = GET_MODE (mem);
11935 switch (mode)
11937 case QImode: gen = gen_aarch64_atomic_casqi; break;
11938 case HImode: gen = gen_aarch64_atomic_cashi; break;
11939 case SImode: gen = gen_aarch64_atomic_cassi; break;
11940 case DImode: gen = gen_aarch64_atomic_casdi; break;
11941 default:
11942 gcc_unreachable ();
11945 /* Move the expected value into the CAS destination register. */
11946 emit_insn (gen_rtx_SET (rval, expected));
11948 /* Emit the CAS. */
11949 emit_insn (gen (rval, mem, desired, model));
11951 /* Compare the expected value with the value loaded by the CAS, to establish
11952 whether the swap was made. */
11953 aarch64_gen_compare_reg (EQ, rval, expected);
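/* A sketch of the resulting sequence for a word-sized CAS with a SEQ_CST
   model (register names are illustrative and the exact CAS variant depends
   on MODEL):
	mov	w0, w_expected
	casal	w0, w_desired, [x_addr]
	cmp	w0, w_expected
   The Z flag then tells the caller whether the swap was made.  */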
11956 /* Split a compare and swap pattern. */
11958 void
11959 aarch64_split_compare_and_swap (rtx operands[])
11961 rtx rval, mem, oldval, newval, scratch;
11962 machine_mode mode;
11963 bool is_weak;
11964 rtx_code_label *label1, *label2;
11965 rtx x, cond;
11966 enum memmodel model;
11967 rtx model_rtx;
11969 rval = operands[0];
11970 mem = operands[1];
11971 oldval = operands[2];
11972 newval = operands[3];
11973 is_weak = (operands[4] != const0_rtx);
11974 model_rtx = operands[5];
11975 scratch = operands[7];
11976 mode = GET_MODE (mem);
11977 model = memmodel_from_int (INTVAL (model_rtx));
11979 label1 = NULL;
11980 if (!is_weak)
11982 label1 = gen_label_rtx ();
11983 emit_label (label1);
11985 label2 = gen_label_rtx ();
11987 /* The initial load can be relaxed for a __sync operation since a final
11988 barrier will be emitted to stop code hoisting. */
11989 if (is_mm_sync (model))
11990 aarch64_emit_load_exclusive (mode, rval, mem,
11991 GEN_INT (MEMMODEL_RELAXED));
11992 else
11993 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11995 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11996 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11997 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11998 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11999 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12001 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12003 if (!is_weak)
12005 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12006 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12007 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12008 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12010 else
12012 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12013 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12014 emit_insn (gen_rtx_SET (cond, x));
12017 emit_label (label2);
12019 /* Emit any final barrier needed for a __sync operation. */
12020 if (is_mm_sync (model))
12021 aarch64_emit_post_barrier (model);
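/* Without LSE, a strong word-sized compare-and-swap therefore expands to a
   loop roughly of this shape (the exact exclusive-access variants depend on
   MODEL; register names are illustrative):
	.L1:	ldaxr	w1, [x_addr]
		cmp	w1, w_expected
		b.ne	.L2
		stlxr	w2, w_desired, [x_addr]
		cbnz	w2, .L1
	.L2:
   A weak CAS omits the backward branch and instead tests the store-exclusive
   result in the flags.  */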
12024 /* Emit a BIC instruction. */
12026 static void
12027 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12029 rtx shift_rtx = GEN_INT (shift);
12030 rtx (*gen) (rtx, rtx, rtx, rtx);
12032 switch (mode)
12034 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12035 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12036 default:
12037 gcc_unreachable ();
12040 emit_insn (gen (dst, s2, shift_rtx, s1));
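/* This computes DST = S1 & ~(S2 >> SHIFT), i.e. a BIC with an optionally
   shifted second operand, e.g. "bic w0, w1, w2, lsr 2" (register names
   illustrative).  */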
12043 /* Emit an atomic swap. */
12045 static void
12046 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12047 rtx mem, rtx model)
12049 rtx (*gen) (rtx, rtx, rtx, rtx);
12051 switch (mode)
12053 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12054 case HImode: gen = gen_aarch64_atomic_swphi; break;
12055 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12056 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12057 default:
12058 gcc_unreachable ();
12061 emit_insn (gen (dst, mem, value, model));
12064 /* Operations supported by aarch64_emit_atomic_load_op. */
12066 enum aarch64_atomic_load_op_code
12068 AARCH64_LDOP_PLUS, /* A + B */
12069 AARCH64_LDOP_XOR, /* A ^ B */
12070 AARCH64_LDOP_OR, /* A | B */
12071 AARCH64_LDOP_BIC /* A & ~B */
12074 /* Emit an atomic load-operate. */
12076 static void
12077 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12078 machine_mode mode, rtx dst, rtx src,
12079 rtx mem, rtx model)
12081 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12082 const aarch64_atomic_load_op_fn plus[] =
12084 gen_aarch64_atomic_loadaddqi,
12085 gen_aarch64_atomic_loadaddhi,
12086 gen_aarch64_atomic_loadaddsi,
12087 gen_aarch64_atomic_loadadddi
12089 const aarch64_atomic_load_op_fn eor[] =
12091 gen_aarch64_atomic_loadeorqi,
12092 gen_aarch64_atomic_loadeorhi,
12093 gen_aarch64_atomic_loadeorsi,
12094 gen_aarch64_atomic_loadeordi
12096 const aarch64_atomic_load_op_fn ior[] =
12098 gen_aarch64_atomic_loadsetqi,
12099 gen_aarch64_atomic_loadsethi,
12100 gen_aarch64_atomic_loadsetsi,
12101 gen_aarch64_atomic_loadsetdi
12103 const aarch64_atomic_load_op_fn bic[] =
12105 gen_aarch64_atomic_loadclrqi,
12106 gen_aarch64_atomic_loadclrhi,
12107 gen_aarch64_atomic_loadclrsi,
12108 gen_aarch64_atomic_loadclrdi
12110 aarch64_atomic_load_op_fn gen;
12111 int idx = 0;
12113 switch (mode)
12115 case QImode: idx = 0; break;
12116 case HImode: idx = 1; break;
12117 case SImode: idx = 2; break;
12118 case DImode: idx = 3; break;
12119 default:
12120 gcc_unreachable ();
12123 switch (code)
12125 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12126 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12127 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12128 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12129 default:
12130 gcc_unreachable ();
12133 emit_insn (gen (dst, mem, src, model));
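/* The four tables above correspond to the LSE LDADD, LDEOR, LDSET and LDCLR
   instruction families; e.g. AARCH64_LDOP_PLUS in SImode goes through
   gen_aarch64_atomic_loadaddsi and ends up as an LDADD variant chosen
   according to MODEL (LDADD, LDADDA, LDADDL or LDADDAL).  */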
12136 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12137 location to store the data read from memory. OUT_RESULT is the location to
12138 store the result of the operation. MEM is the memory location to read and
12139 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12140 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12141 be NULL. */
12143 void
12144 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12145 rtx mem, rtx value, rtx model_rtx)
12147 machine_mode mode = GET_MODE (mem);
12148 machine_mode wmode = (mode == DImode ? DImode : SImode);
12149 const bool short_mode = (mode < SImode);
12150 aarch64_atomic_load_op_code ldop_code;
12151 rtx src;
12152 rtx x;
12154 if (out_data)
12155 out_data = gen_lowpart (mode, out_data);
12157 if (out_result)
12158 out_result = gen_lowpart (mode, out_result);
12160 /* Make sure the value is in a register, putting it into a destination
12161 register if it needs to be manipulated. */
12162 if (!register_operand (value, mode)
12163 || code == AND || code == MINUS)
12165 src = out_result ? out_result : out_data;
12166 emit_move_insn (src, gen_lowpart (mode, value));
12168 else
12169 src = value;
12170 gcc_assert (register_operand (src, mode));
12172 /* Preprocess the data for the operation as necessary. If the operation is
12173 a SET then emit a swap instruction and finish. */
12174 switch (code)
12176 case SET:
12177 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12178 return;
12180 case MINUS:
12181 /* Negate the value and treat it as a PLUS. */
12183 rtx neg_src;
12185 /* Resize the value if necessary. */
12186 if (short_mode)
12187 src = gen_lowpart (wmode, src);
12189 neg_src = gen_rtx_NEG (wmode, src);
12190 emit_insn (gen_rtx_SET (src, neg_src));
12192 if (short_mode)
12193 src = gen_lowpart (mode, src);
12195 /* Fall-through. */
12196 case PLUS:
12197 ldop_code = AARCH64_LDOP_PLUS;
12198 break;
12200 case IOR:
12201 ldop_code = AARCH64_LDOP_OR;
12202 break;
12204 case XOR:
12205 ldop_code = AARCH64_LDOP_XOR;
12206 break;
12208 case AND:
12210 rtx not_src;
12212 /* Resize the value if necessary. */
12213 if (short_mode)
12214 src = gen_lowpart (wmode, src);
12216 not_src = gen_rtx_NOT (wmode, src);
12217 emit_insn (gen_rtx_SET (src, not_src));
12219 if (short_mode)
12220 src = gen_lowpart (mode, src);
12222 ldop_code = AARCH64_LDOP_BIC;
12223 break;
12225 default:
12226 /* The operation can't be done with atomic instructions. */
12227 gcc_unreachable ();
12230 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12232 /* If necessary, calculate the data in memory after the update by redoing the
12233 operation from values in registers. */
12234 if (!out_result)
12235 return;
12237 if (short_mode)
12239 src = gen_lowpart (wmode, src);
12240 out_data = gen_lowpart (wmode, out_data);
12241 out_result = gen_lowpart (wmode, out_result);
12244 x = NULL_RTX;
12246 switch (code)
12248 case MINUS:
12249 case PLUS:
12250 x = gen_rtx_PLUS (wmode, out_data, src);
12251 break;
12252 case IOR:
12253 x = gen_rtx_IOR (wmode, out_data, src);
12254 break;
12255 case XOR:
12256 x = gen_rtx_XOR (wmode, out_data, src);
12257 break;
12258 case AND:
12259 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12260 return;
12261 default:
12262 gcc_unreachable ();
12265 emit_set_insn (out_result, x);
12267 return;
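/* As an example, with TARGET_LSE an __atomic_fetch_and on a 32-bit object is
   implemented by inverting the operand and issuing an atomic bit-clear,
   roughly:
	mvn	w1, w_mask
	ldclral	w1, w_old, [x_addr]
   since LDCLR computes *addr & ~operand.  If the value after the operation
   is also required, it is recomputed afterwards with a BIC of the loaded
   data and the inverted operand (register names and the exact LDCLR variant
   are illustrative).  */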
12270 /* Split an atomic operation. */
12272 void
12273 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12274 rtx value, rtx model_rtx, rtx cond)
12276 machine_mode mode = GET_MODE (mem);
12277 machine_mode wmode = (mode == DImode ? DImode : SImode);
12278 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12279 const bool is_sync = is_mm_sync (model);
12280 rtx_code_label *label;
12281 rtx x;
12283 /* Split the atomic operation into a sequence. */
12284 label = gen_label_rtx ();
12285 emit_label (label);
12287 if (new_out)
12288 new_out = gen_lowpart (wmode, new_out);
12289 if (old_out)
12290 old_out = gen_lowpart (wmode, old_out);
12291 else
12292 old_out = new_out;
12293 value = simplify_gen_subreg (wmode, value, mode, 0);
12295 /* The initial load can be relaxed for a __sync operation since a final
12296 barrier will be emitted to stop code hoisting. */
12297 if (is_sync)
12298 aarch64_emit_load_exclusive (mode, old_out, mem,
12299 GEN_INT (MEMMODEL_RELAXED));
12300 else
12301 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12303 switch (code)
12305 case SET:
12306 new_out = value;
12307 break;
12309 case NOT:
12310 x = gen_rtx_AND (wmode, old_out, value);
12311 emit_insn (gen_rtx_SET (new_out, x));
12312 x = gen_rtx_NOT (wmode, new_out);
12313 emit_insn (gen_rtx_SET (new_out, x));
12314 break;
12316 case MINUS:
12317 if (CONST_INT_P (value))
12319 value = GEN_INT (-INTVAL (value));
12320 code = PLUS;
12322 /* Fall through. */
12324 default:
12325 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12326 emit_insn (gen_rtx_SET (new_out, x));
12327 break;
12330 aarch64_emit_store_exclusive (mode, cond, mem,
12331 gen_lowpart (mode, new_out), model_rtx);
12333 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12334 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12335 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12336 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12338 /* Emit any final barrier needed for a __sync operation. */
12339 if (is_sync)
12340 aarch64_emit_post_barrier (model);
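/* Without LSE, an operation such as __atomic_fetch_add on a 32-bit object is
   therefore split into a load/store-exclusive loop along these lines (the
   exclusive variants depend on MODEL; register names are illustrative):
	.L1:	ldxr	w_old, [x_addr]
		add	w_new, w_old, w_val
		stxr	w_tmp, w_new, [x_addr]
		cbnz	w_tmp, .L1
   */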
12343 static void
12344 aarch64_init_libfuncs (void)
12346 /* Half-precision float operations. The compiler handles all operations
12347 with NULL libfuncs by converting to SFmode. */
12349 /* Conversions. */
12350 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12351 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12353 /* Arithmetic. */
12354 set_optab_libfunc (add_optab, HFmode, NULL);
12355 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12356 set_optab_libfunc (smul_optab, HFmode, NULL);
12357 set_optab_libfunc (neg_optab, HFmode, NULL);
12358 set_optab_libfunc (sub_optab, HFmode, NULL);
12360 /* Comparisons. */
12361 set_optab_libfunc (eq_optab, HFmode, NULL);
12362 set_optab_libfunc (ne_optab, HFmode, NULL);
12363 set_optab_libfunc (lt_optab, HFmode, NULL);
12364 set_optab_libfunc (le_optab, HFmode, NULL);
12365 set_optab_libfunc (ge_optab, HFmode, NULL);
12366 set_optab_libfunc (gt_optab, HFmode, NULL);
12367 set_optab_libfunc (unord_optab, HFmode, NULL);
12370 /* Target hook for c_mode_for_suffix. */
12371 static machine_mode
12372 aarch64_c_mode_for_suffix (char suffix)
12374 if (suffix == 'q')
12375 return TFmode;
12377 return VOIDmode;
12380 /* We can only represent floating point constants which will fit in
12381 "quarter-precision" values. These values are characterised by
12382 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given the format:
12385 (-1)^s * (n/16) * 2^r
12387 Where:
12388 's' is the sign bit.
12389 'n' is an integer in the range 16 <= n <= 31.
12390 'r' is an integer in the range -3 <= r <= 4. */
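/* For example, 0.5 = (16/16) * 2^-1 and 2.5 = (20/16) * 2^1 are representable
   (as are 1.0, 0.125 and 31.0), whereas 0.0, 32.0 and 1.0/64.0 are not.  */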
12392 /* Return true iff X can be represented by a quarter-precision
12393 floating point immediate operand. Note that we cannot represent 0.0. */
12394 bool
12395 aarch64_float_const_representable_p (rtx x)
12397 /* This represents our current view of how many bits
12398 make up the mantissa. */
12399 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12400 int exponent;
12401 unsigned HOST_WIDE_INT mantissa, mask;
12402 REAL_VALUE_TYPE r, m;
12403 bool fail;
12405 if (!CONST_DOUBLE_P (x))
12406 return false;
12408 /* We don't support HFmode constants yet. */
12409 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12410 return false;
12412 r = *CONST_DOUBLE_REAL_VALUE (x);
12414 /* We cannot represent infinities, NaNs or +/-zero. We won't
12415 know if we have +zero until we analyse the mantissa, but we
12416 can reject the other invalid values. */
12417 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12418 || REAL_VALUE_MINUS_ZERO (r))
12419 return false;
12421 /* Extract exponent. */
12422 r = real_value_abs (&r);
12423 exponent = REAL_EXP (&r);
12425 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12426 highest (sign) bit, with a fixed binary point at bit point_pos.
12427 m1 holds the low part of the mantissa, m2 the high part.
12428 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12429 bits for the mantissa, this can fail (low bits will be lost). */
12430 real_ldexp (&m, &r, point_pos - exponent);
12431 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12433 /* If the low part of the mantissa has bits set we cannot represent
12434 the value. */
12435 if (w.elt (0) != 0)
12436 return false;
12437 /* We have rejected the lower HOST_WIDE_INT, so update our
12438 understanding of how many bits lie in the mantissa and
12439 look only at the high HOST_WIDE_INT. */
12440 mantissa = w.elt (1);
12441 point_pos -= HOST_BITS_PER_WIDE_INT;
12443 /* We can only represent values with a mantissa of the form 1.xxxx. */
12444 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12445 if ((mantissa & mask) != 0)
12446 return false;
12448 /* Having filtered unrepresentable values, we may now remove all
12449 but the highest 5 bits. */
12450 mantissa >>= point_pos - 5;
12452 /* We cannot represent the value 0.0, so reject it. This is handled
12453 elsewhere. */
12454 if (mantissa == 0)
12455 return false;
12457 /* Then, as bit 4 is always set, we can mask it off, leaving
12458 the mantissa in the range [0, 15]. */
12459 mantissa &= ~(1 << 4);
12460 gcc_assert (mantissa <= 15);
12462 /* GCC internally does not use IEEE754-like encoding (where normalized
12463 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12464 Our mantissa values are shifted 4 places to the left relative to
12465 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12466 by 5 places to correct for GCC's representation. */
12467 exponent = 5 - exponent;
12469 return (exponent >= 0 && exponent <= 7);
12472 char*
12473 aarch64_output_simd_mov_immediate (rtx const_vector,
12474 machine_mode mode,
12475 unsigned width)
12477 bool is_valid;
12478 static char templ[40];
12479 const char *mnemonic;
12480 const char *shift_op;
12481 unsigned int lane_count = 0;
12482 char element_char;
12484 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12486 /* This will return true to show const_vector is legal for use as either
12487 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12488 also update INFO to show how the immediate should be generated. */
12489 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12490 gcc_assert (is_valid);
12492 element_char = sizetochar (info.element_width);
12493 lane_count = width / info.element_width;
12495 mode = GET_MODE_INNER (mode);
12496 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12498 gcc_assert (info.shift == 0 && ! info.mvn);
12499 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12500 move immediate path. */
12501 if (aarch64_float_const_zero_rtx_p (info.value))
12502 info.value = GEN_INT (0);
12503 else
12505 const unsigned int buf_size = 20;
12506 char float_buf[buf_size] = {'\0'};
12507 real_to_decimal_for_mode (float_buf,
12508 CONST_DOUBLE_REAL_VALUE (info.value),
12509 buf_size, buf_size, 1, mode);
12511 if (lane_count == 1)
12512 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12513 else
12514 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12515 lane_count, element_char, float_buf);
12516 return templ;
12520 mnemonic = info.mvn ? "mvni" : "movi";
12521 shift_op = info.msl ? "msl" : "lsl";
12523 gcc_assert (CONST_INT_P (info.value));
12524 if (lane_count == 1)
12525 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12526 mnemonic, UINTVAL (info.value));
12527 else if (info.shift)
12528 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12529 ", %s %d", mnemonic, lane_count, element_char,
12530 UINTVAL (info.value), shift_op, info.shift);
12531 else
12532 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12533 mnemonic, lane_count, element_char, UINTVAL (info.value));
12534 return templ;
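/* Typical assembly produced from these templates looks like
	movi	v0.4s, 0xab, lsl 8
	mvni	v0.8h, 0x7
	fmov	v0.2d, 1.0e+0
   though the exact operand formatting depends on the immediate that was
   matched; the above is only an illustration.  */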
12537 char*
12538 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12539 machine_mode mode)
12541 machine_mode vmode;
12543 gcc_assert (!VECTOR_MODE_P (mode));
12544 vmode = aarch64_simd_container_mode (mode, 64);
12545 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12546 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12549 /* Split operands into moves from op[1] + op[2] into op[0]. */
12551 void
12552 aarch64_split_combinev16qi (rtx operands[3])
12554 unsigned int dest = REGNO (operands[0]);
12555 unsigned int src1 = REGNO (operands[1]);
12556 unsigned int src2 = REGNO (operands[2]);
12557 machine_mode halfmode = GET_MODE (operands[1]);
12558 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12559 rtx destlo, desthi;
12561 gcc_assert (halfmode == V16QImode);
12563 if (src1 == dest && src2 == dest + halfregs)
12565 /* No-op move. Can't split to nothing; emit something. */
12566 emit_note (NOTE_INSN_DELETED);
12567 return;
12570 /* Preserve register attributes for variable tracking. */
12571 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12572 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12573 GET_MODE_SIZE (halfmode));
12575 /* Special case of reversed high/low parts. */
12576 if (reg_overlap_mentioned_p (operands[2], destlo)
12577 && reg_overlap_mentioned_p (operands[1], desthi))
12579 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12580 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12581 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12583 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12585 /* Try to avoid unnecessary moves if part of the result
12586 is in the right place already. */
12587 if (src1 != dest)
12588 emit_move_insn (destlo, operands[1]);
12589 if (src2 != dest + halfregs)
12590 emit_move_insn (desthi, operands[2]);
12592 else
12594 if (src2 != dest + halfregs)
12595 emit_move_insn (desthi, operands[2]);
12596 if (src1 != dest)
12597 emit_move_insn (destlo, operands[1]);
12601 /* vec_perm support. */
12603 #define MAX_VECT_LEN 16
12605 struct expand_vec_perm_d
12607 rtx target, op0, op1;
12608 unsigned char perm[MAX_VECT_LEN];
12609 machine_mode vmode;
12610 unsigned char nelt;
12611 bool one_vector_p;
12612 bool testing_p;
12615 /* Generate a variable permutation. */
12617 static void
12618 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12620 machine_mode vmode = GET_MODE (target);
12621 bool one_vector_p = rtx_equal_p (op0, op1);
12623 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12624 gcc_checking_assert (GET_MODE (op0) == vmode);
12625 gcc_checking_assert (GET_MODE (op1) == vmode);
12626 gcc_checking_assert (GET_MODE (sel) == vmode);
12627 gcc_checking_assert (TARGET_SIMD);
12629 if (one_vector_p)
12631 if (vmode == V8QImode)
12633 /* Expand the argument to a V16QI mode by duplicating it. */
12634 rtx pair = gen_reg_rtx (V16QImode);
12635 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12636 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12638 else
12640 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12643 else
12645 rtx pair;
12647 if (vmode == V8QImode)
12649 pair = gen_reg_rtx (V16QImode);
12650 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12651 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12653 else
12655 pair = gen_reg_rtx (OImode);
12656 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12657 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12662 void
12663 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12665 machine_mode vmode = GET_MODE (target);
12666 unsigned int nelt = GET_MODE_NUNITS (vmode);
12667 bool one_vector_p = rtx_equal_p (op0, op1);
12668 rtx mask;
12670 /* The TBL instruction does not use a modulo index, so we must take care
12671 of that ourselves. */
12672 mask = aarch64_simd_gen_const_vector_dup (vmode,
12673 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12674 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12676 /* For big-endian, we also need to reverse the index within the vector
12677 (but not which vector). */
12678 if (BYTES_BIG_ENDIAN)
12680 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12681 if (!one_vector_p)
12682 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12683 sel = expand_simple_binop (vmode, XOR, sel, mask,
12684 NULL, 0, OPTAB_LIB_WIDEN);
12686 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12689 /* Recognize patterns suitable for the TRN instructions. */
12690 static bool
12691 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12693 unsigned int i, odd, mask, nelt = d->nelt;
12694 rtx out, in0, in1, x;
12695 rtx (*gen) (rtx, rtx, rtx);
12696 machine_mode vmode = d->vmode;
12698 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12699 return false;
12701 /* Note that these are little-endian tests.
12702 We correct for big-endian later. */
12703 if (d->perm[0] == 0)
12704 odd = 0;
12705 else if (d->perm[0] == 1)
12706 odd = 1;
12707 else
12708 return false;
12709 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12711 for (i = 0; i < nelt; i += 2)
12713 if (d->perm[i] != i + odd)
12714 return false;
12715 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12716 return false;
12719 /* Success! */
12720 if (d->testing_p)
12721 return true;
12723 in0 = d->op0;
12724 in1 = d->op1;
12725 if (BYTES_BIG_ENDIAN)
12727 x = in0, in0 = in1, in1 = x;
12728 odd = !odd;
12730 out = d->target;
12732 if (odd)
12734 switch (vmode)
12736 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12737 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12738 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12739 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12740 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12741 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12742 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12743 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12744 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12745 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12746 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12747 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12748 default:
12749 return false;
12752 else
12754 switch (vmode)
12756 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12757 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12758 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12759 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12760 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12761 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12762 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12763 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12764 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12765 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12766 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12767 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12768 default:
12769 return false;
12773 emit_insn (gen (out, in0, in1));
12774 return true;
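/* As a concrete example, for V4SImode with two input vectors the permutation
   {0, 4, 2, 6} selects TRN1 and {1, 5, 3, 7} selects TRN2 (before any
   big-endian adjustment).  */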
12777 /* Recognize patterns suitable for the UZP instructions. */
12778 static bool
12779 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12781 unsigned int i, odd, mask, nelt = d->nelt;
12782 rtx out, in0, in1, x;
12783 rtx (*gen) (rtx, rtx, rtx);
12784 machine_mode vmode = d->vmode;
12786 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12787 return false;
12789 /* Note that these are little-endian tests.
12790 We correct for big-endian later. */
12791 if (d->perm[0] == 0)
12792 odd = 0;
12793 else if (d->perm[0] == 1)
12794 odd = 1;
12795 else
12796 return false;
12797 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12799 for (i = 0; i < nelt; i++)
12801 unsigned elt = (i * 2 + odd) & mask;
12802 if (d->perm[i] != elt)
12803 return false;
12806 /* Success! */
12807 if (d->testing_p)
12808 return true;
12810 in0 = d->op0;
12811 in1 = d->op1;
12812 if (BYTES_BIG_ENDIAN)
12814 x = in0, in0 = in1, in1 = x;
12815 odd = !odd;
12817 out = d->target;
12819 if (odd)
12821 switch (vmode)
12823 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12824 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12825 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12826 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12827 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12828 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12829 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12830 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12831 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12832 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12833 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12834 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12835 default:
12836 return false;
12839 else
12841 switch (vmode)
12843 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12844 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12845 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12846 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12847 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12848 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12849 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12850 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12851 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12852 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12853 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12854 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12855 default:
12856 return false;
12860 emit_insn (gen (out, in0, in1));
12861 return true;
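/* For V4SImode with two input vectors, the permutation {0, 2, 4, 6} selects
   UZP1 and {1, 3, 5, 7} selects UZP2 (before any big-endian adjustment).  */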
12864 /* Recognize patterns suitable for the ZIP instructions. */
12865 static bool
12866 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12868 unsigned int i, high, mask, nelt = d->nelt;
12869 rtx out, in0, in1, x;
12870 rtx (*gen) (rtx, rtx, rtx);
12871 machine_mode vmode = d->vmode;
12873 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12874 return false;
12876 /* Note that these are little-endian tests.
12877 We correct for big-endian later. */
12878 high = nelt / 2;
12879 if (d->perm[0] == high)
12880 /* Do Nothing. */
12882 else if (d->perm[0] == 0)
12883 high = 0;
12884 else
12885 return false;
12886 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12888 for (i = 0; i < nelt / 2; i++)
12890 unsigned elt = (i + high) & mask;
12891 if (d->perm[i * 2] != elt)
12892 return false;
12893 elt = (elt + nelt) & mask;
12894 if (d->perm[i * 2 + 1] != elt)
12895 return false;
12898 /* Success! */
12899 if (d->testing_p)
12900 return true;
12902 in0 = d->op0;
12903 in1 = d->op1;
12904 if (BYTES_BIG_ENDIAN)
12906 x = in0, in0 = in1, in1 = x;
12907 high = !high;
12909 out = d->target;
12911 if (high)
12913 switch (vmode)
12915 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12916 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12917 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12918 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12919 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12920 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12921 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12922 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12923 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12924 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12925 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12926 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12927 default:
12928 return false;
12931 else
12933 switch (vmode)
12935 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12936 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12937 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12938 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12939 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12940 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12941 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12942 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12943 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12944 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12945 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12946 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12947 default:
12948 return false;
12952 emit_insn (gen (out, in0, in1));
12953 return true;
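/* For V4SImode with two input vectors, the permutation {0, 4, 1, 5} selects
   ZIP1 and {2, 6, 3, 7} selects ZIP2 (before any big-endian adjustment).  */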
12956 /* Recognize patterns for the EXT insn. */
12958 static bool
12959 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12961 unsigned int i, nelt = d->nelt;
12962 rtx (*gen) (rtx, rtx, rtx, rtx);
12963 rtx offset;
12965 unsigned int location = d->perm[0]; /* Always < nelt. */
12967 /* Check if the extracted indices are increasing by one. */
12968 for (i = 1; i < nelt; i++)
12970 unsigned int required = location + i;
12971 if (d->one_vector_p)
12973 /* We'll pass the same vector in twice, so allow indices to wrap. */
12974 required &= (nelt - 1);
12976 if (d->perm[i] != required)
12977 return false;
12980 switch (d->vmode)
12982 case V16QImode: gen = gen_aarch64_extv16qi; break;
12983 case V8QImode: gen = gen_aarch64_extv8qi; break;
12984 case V4HImode: gen = gen_aarch64_extv4hi; break;
12985 case V8HImode: gen = gen_aarch64_extv8hi; break;
12986 case V2SImode: gen = gen_aarch64_extv2si; break;
12987 case V4SImode: gen = gen_aarch64_extv4si; break;
12988 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12989 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12990 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12991 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12992 case V2DImode: gen = gen_aarch64_extv2di; break;
12993 case V2DFmode: gen = gen_aarch64_extv2df; break;
12994 default:
12995 return false;
12998 /* Success! */
12999 if (d->testing_p)
13000 return true;
13002 /* The case where (location == 0) is a no-op for both big- and little-endian,
13003 and is removed by the mid-end at optimization levels -O1 and higher. */
13005 if (BYTES_BIG_ENDIAN && (location != 0))
13007 /* After setup, we want the high elements of the first vector (stored
13008 at the LSB end of the register), and the low elements of the second
13009 vector (stored at the MSB end of the register). So swap. */
13010 std::swap (d->op0, d->op1);
13011 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13012 location = nelt - location;
13015 offset = GEN_INT (location);
13016 emit_insn (gen (d->target, d->op0, d->op1, offset));
13017 return true;
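/* For example, on little-endian the V4SImode permutation {1, 2, 3, 4}
   (indices running consecutively from the first vector into the second) is
   matched here and emitted as an EXT of the two inputs with element
   offset 1.  */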
13020 /* Recognize patterns for the REV insns. */
13022 static bool
13023 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13025 unsigned int i, j, diff, nelt = d->nelt;
13026 rtx (*gen) (rtx, rtx);
13028 if (!d->one_vector_p)
13029 return false;
13031 diff = d->perm[0];
13032 switch (diff)
13034 case 7:
13035 switch (d->vmode)
13037 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13038 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13039 default:
13040 return false;
13042 break;
13043 case 3:
13044 switch (d->vmode)
13046 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13047 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13048 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13049 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13050 default:
13051 return false;
13053 break;
13054 case 1:
13055 switch (d->vmode)
13057 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13058 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13059 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13060 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13061 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13062 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13063 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13064 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13065 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13066 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13067 default:
13068 return false;
13070 break;
13071 default:
13072 return false;
13075 for (i = 0; i < nelt ; i += diff + 1)
13076 for (j = 0; j <= diff; j += 1)
13078 /* This is guaranteed to be true as the value of diff
13079 is 7, 3, 1 and we should have enough elements in the
13080 queue to generate this. Getting a vector mask with a
13081 value of diff other than these values implies that
13082 something is wrong by the time we get here. */
13083 gcc_assert (i + j < nelt);
13084 if (d->perm[i + j] != i + diff - j)
13085 return false;
13088 /* Success! */
13089 if (d->testing_p)
13090 return true;
13092 emit_insn (gen (d->target, d->op0));
13093 return true;
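/* The value of diff selects the REV variant; e.g. for V8HImode the
   permutation {3, 2, 1, 0, 7, 6, 5, 4} has diff == 3 and maps to REV64,
   while {1, 0, 3, 2, 5, 4, 7, 6} has diff == 1 and maps to REV32.  */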
13096 static bool
13097 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13099 rtx (*gen) (rtx, rtx, rtx);
13100 rtx out = d->target;
13101 rtx in0;
13102 machine_mode vmode = d->vmode;
13103 unsigned int i, elt, nelt = d->nelt;
13104 rtx lane;
13106 elt = d->perm[0];
13107 for (i = 1; i < nelt; i++)
13109 if (elt != d->perm[i])
13110 return false;
13113 /* The generic preparation in aarch64_expand_vec_perm_const_1
13114 swaps the operand order and the permute indices if it finds
13115 d->perm[0] to be in the second operand. Thus, we can always
13116 use d->op0 and need not do any extra arithmetic to get the
13117 correct lane number. */
13118 in0 = d->op0;
13119 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13121 switch (vmode)
13123 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13124 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13125 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13126 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13127 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13128 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13129 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13130 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13131 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13132 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13133 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13134 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13135 default:
13136 return false;
13139 emit_insn (gen (out, in0, lane));
13140 return true;
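/* For instance, a V4SImode permutation in which every index is 2, i.e.
   {2, 2, 2, 2}, is emitted as something like "dup v0.4s, v1.s[2]" (register
   names illustrative; the pattern itself corrects the lane for
   big-endian).  */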
13143 static bool
13144 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13146 rtx rperm[MAX_VECT_LEN], sel;
13147 machine_mode vmode = d->vmode;
13148 unsigned int i, nelt = d->nelt;
13150 if (d->testing_p)
13151 return true;
13153 /* Generic code will try constant permutation twice. Once with the
13154 original mode and again with the elements lowered to QImode.
13155 So wait and don't do the selector expansion ourselves. */
13156 if (vmode != V8QImode && vmode != V16QImode)
13157 return false;
13159 for (i = 0; i < nelt; ++i)
13161 int nunits = GET_MODE_NUNITS (vmode);
13163 /* If big-endian and two vectors we end up with a weird mixed-endian
13164 mode on NEON. Reverse the index within each word but not the word
13165 itself. */
13166 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13167 : d->perm[i]);
13169 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13170 sel = force_reg (vmode, sel);
13172 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13173 return true;
13176 static bool
13177 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13179 /* The pattern matching functions above are written to look for a small
13180 number to begin the sequence (0, 1, N/2). If we begin with an index
13181 from the second operand, we can swap the operands. */
13182 if (d->perm[0] >= d->nelt)
13184 unsigned i, nelt = d->nelt;
13186 gcc_assert (nelt == (nelt & -nelt));
13187 for (i = 0; i < nelt; ++i)
13188 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13190 std::swap (d->op0, d->op1);
13193 if (TARGET_SIMD)
13195 if (aarch64_evpc_rev (d))
13196 return true;
13197 else if (aarch64_evpc_ext (d))
13198 return true;
13199 else if (aarch64_evpc_dup (d))
13200 return true;
13201 else if (aarch64_evpc_zip (d))
13202 return true;
13203 else if (aarch64_evpc_uzp (d))
13204 return true;
13205 else if (aarch64_evpc_trn (d))
13206 return true;
13207 return aarch64_evpc_tbl (d);
13209 return false;
13212 /* Expand a vec_perm_const pattern. */
13214 bool
13215 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13217 struct expand_vec_perm_d d;
13218 int i, nelt, which;
13220 d.target = target;
13221 d.op0 = op0;
13222 d.op1 = op1;
13224 d.vmode = GET_MODE (target);
13225 gcc_assert (VECTOR_MODE_P (d.vmode));
13226 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13227 d.testing_p = false;
13229 for (i = which = 0; i < nelt; ++i)
13231 rtx e = XVECEXP (sel, 0, i);
13232 int ei = INTVAL (e) & (2 * nelt - 1);
13233 which |= (ei < nelt ? 1 : 2);
13234 d.perm[i] = ei;
13237 switch (which)
13239 default:
13240 gcc_unreachable ();
13242 case 3:
13243 d.one_vector_p = false;
13244 if (!rtx_equal_p (op0, op1))
13245 break;
13247 /* The elements of PERM do not suggest that only the first operand
13248 is used, but both operands are identical. Allow easier matching
13249 of the permutation by folding the permutation into the single
13250 input vector. */
13251 /* Fall Through. */
13252 case 2:
13253 for (i = 0; i < nelt; ++i)
13254 d.perm[i] &= nelt - 1;
13255 d.op0 = op1;
13256 d.one_vector_p = true;
13257 break;
13259 case 1:
13260 d.op1 = op0;
13261 d.one_vector_p = true;
13262 break;
13265 return aarch64_expand_vec_perm_const_1 (&d);
13268 static bool
13269 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13270 const unsigned char *sel)
13272 struct expand_vec_perm_d d;
13273 unsigned int i, nelt, which;
13274 bool ret;
13276 d.vmode = vmode;
13277 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13278 d.testing_p = true;
13279 memcpy (d.perm, sel, nelt);
13281 /* Calculate whether all elements are in one vector. */
13282 for (i = which = 0; i < nelt; ++i)
13284 unsigned char e = d.perm[i];
13285 gcc_assert (e < 2 * nelt);
13286 which |= (e < nelt ? 1 : 2);
13289 /* If all elements are from the second vector, reindex as if from the
13290 first vector. */
13291 if (which == 2)
13292 for (i = 0; i < nelt; ++i)
13293 d.perm[i] -= nelt;
13295 /* Check whether the mask can be applied to a single vector. */
13296 d.one_vector_p = (which != 3);
13298 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13299 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13300 if (!d.one_vector_p)
13301 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13303 start_sequence ();
13304 ret = aarch64_expand_vec_perm_const_1 (&d);
13305 end_sequence ();
13307 return ret;
13311 rtx aarch64_reverse_mask (enum machine_mode mode)
13313 /* We have to reverse each vector because we don't have
13314 a permuted load that can reverse-load according to ABI rules. */
13315 rtx mask;
13316 rtvec v = rtvec_alloc (16);
13317 int i, j;
13318 int nunits = GET_MODE_NUNITS (mode);
13319 int usize = GET_MODE_UNIT_SIZE (mode);
13321 gcc_assert (BYTES_BIG_ENDIAN);
13322 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13324 for (i = 0; i < nunits; i++)
13325 for (j = 0; j < usize; j++)
13326 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13327 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13328 return force_reg (V16QImode, mask);
13331 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13332 However due to issues with register allocation it is preferable to avoid
13333 tying integer scalar and FP scalar modes. Executing integer operations
13334 in general registers is better than treating them as scalar vector
13335 operations. This reduces latency and avoids redundant int<->FP moves.
13336 So tie modes if they are either the same class, or vector modes with
13337 other vector modes, vector structs or any scalar mode.
13340 bool
13341 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13343 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13344 return true;
13346 /* We specifically want to allow elements of "structure" modes to
13347 be tieable to the structure. This more general condition allows
13348 other rarer situations too. */
13349 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13350 return true;
13352 /* Also allow any scalar modes with vectors. */
13353 if (aarch64_vector_mode_supported_p (mode1)
13354 || aarch64_vector_mode_supported_p (mode2))
13355 return true;
13357 return false;
13360 /* Return a new RTX holding the result of moving POINTER forward by
13361 AMOUNT bytes. */
13363 static rtx
13364 aarch64_move_pointer (rtx pointer, int amount)
13366 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13368 return adjust_automodify_address (pointer, GET_MODE (pointer),
13369 next, amount);
13372 /* Return a new RTX holding the result of moving POINTER forward by the
13373 size of the mode it points to. */
13375 static rtx
13376 aarch64_progress_pointer (rtx pointer)
13378 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13380 return aarch64_move_pointer (pointer, amount);
13383 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13384 MODE bytes. */
13386 static void
13387 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13388 machine_mode mode)
13390 rtx reg = gen_reg_rtx (mode);
13392 /* "Cast" the pointers to the correct mode. */
13393 *src = adjust_address (*src, mode, 0);
13394 *dst = adjust_address (*dst, mode, 0);
13395 /* Emit the memcpy. */
13396 emit_move_insn (reg, *src);
13397 emit_move_insn (*dst, reg);
13398 /* Move the pointers forward. */
13399 *src = aarch64_progress_pointer (*src);
13400 *dst = aarch64_progress_pointer (*dst);
13403 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13404 we succeed, otherwise return false. */
13406 bool
13407 aarch64_expand_movmem (rtx *operands)
13409 unsigned int n;
13410 rtx dst = operands[0];
13411 rtx src = operands[1];
13412 rtx base;
13413 bool speed_p = !optimize_function_for_size_p (cfun);
13415 /* When optimizing for size, give a better estimate of the length of a
13416 memcpy call, but use the default otherwise. */
13417 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13419 /* We can't do anything smart if the amount to copy is not constant. */
13420 if (!CONST_INT_P (operands[2]))
13421 return false;
13423 n = UINTVAL (operands[2]);
13425 /* Try to keep the number of instructions low. For cases below 16 bytes we
13426 need to make at most two moves. For cases above 16 bytes it will be one
13427 move for each 16 byte chunk, then at most two additional moves. */
13428 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13429 return false;
13431 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13432 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13434 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13435 src = adjust_automodify_address (src, VOIDmode, base, 0);
13437 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13438 1-byte chunk. */
13439 if (n < 4)
13441 if (n >= 2)
13443 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13444 n -= 2;
13447 if (n == 1)
13448 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13450 return true;
13453 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13454 4-byte chunk, partially overlapping with the previously copied chunk. */
13455 if (n < 8)
13457 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13458 n -= 4;
13459 if (n > 0)
13461 int move = n - 4;
13463 src = aarch64_move_pointer (src, move);
13464 dst = aarch64_move_pointer (dst, move);
13465 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13467 return true;
13470 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13471 them, then (if applicable) an 8-byte chunk. */
13472 while (n >= 8)
13474 if (n / 16)
13476 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13477 n -= 16;
13479 else
13481 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13482 n -= 8;
13486 /* Finish the final bytes of the copy. We can always do this in one
13487 instruction. We either copy the exact amount we need, or partially
13488 overlap with the previous chunk we copied and copy 8 bytes. */
13489 if (n == 0)
13490 return true;
13491 else if (n == 1)
13492 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13493 else if (n == 2)
13494 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13495 else if (n == 4)
13496 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13497 else
13499 if (n == 3)
13501 src = aarch64_move_pointer (src, -1);
13502 dst = aarch64_move_pointer (dst, -1);
13503 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13505 else
13507 int move = n - 8;
13509 src = aarch64_move_pointer (src, move);
13510 dst = aarch64_move_pointer (dst, move);
13511 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13515 return true;
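/* As a worked example, a constant 15-byte copy takes the DImode branch of
   the loop once (8 bytes, leaving n == 7) and then falls into the final else
   branch, which moves both pointers back by one byte (move == -1) and copies
   a second, overlapping 8-byte chunk, so the whole copy is two 8-byte loads
   and two 8-byte stores.  */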
13518 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13519 SImode stores. Handle the case when the constant has identical
13520 bottom and top halves. This is beneficial when the two stores can be
13521 merged into an STP and we avoid synthesising potentially expensive
13522 immediates twice. Return true if such a split is possible. */
13524 bool
13525 aarch64_split_dimode_const_store (rtx dst, rtx src)
13527 rtx lo = gen_lowpart (SImode, src);
13528 rtx hi = gen_highpart_mode (SImode, DImode, src);
13530 bool size_p = optimize_function_for_size_p (cfun);
13532 if (!rtx_equal_p (lo, hi))
13533 return false;
13535 unsigned int orig_cost
13536 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13537 unsigned int lo_cost
13538 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13540 /* We want to transform:
13541 MOV x1, 49370
13542 MOVK x1, 0x140, lsl 16
13543 MOVK x1, 0xc0da, lsl 32
13544 MOVK x1, 0x140, lsl 48
13545 STR x1, [x0]
13546 into:
13547 MOV w1, 49370
13548 MOVK w1, 0x140, lsl 16
13549 STP w1, w1, [x0]
13550 So we want to perform this only when we save two instructions
13551 or more. When optimizing for size, however, accept any code size
13552 savings we can. */
13553 if (size_p && orig_cost <= lo_cost)
13554 return false;
13556 if (!size_p
13557 && (orig_cost <= lo_cost + 1))
13558 return false;
13560 rtx mem_lo = adjust_address (dst, SImode, 0);
13561 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13562 return false;
13564 rtx tmp_reg = gen_reg_rtx (SImode);
13565 aarch64_expand_mov_immediate (tmp_reg, lo);
13566 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13567 /* Don't emit an explicit store pair as this may not always be profitable.
13568 Let the sched-fusion logic decide whether to merge them. */
13569 emit_move_insn (mem_lo, tmp_reg);
13570 emit_move_insn (mem_hi, tmp_reg);
13572 return true;
13575 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13577 static unsigned HOST_WIDE_INT
13578 aarch64_asan_shadow_offset (void)
13580 return (HOST_WIDE_INT_1 << 36);
13583 static bool
13584 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13585 unsigned int align,
13586 enum by_pieces_operation op,
13587 bool speed_p)
13589 /* STORE_BY_PIECES can be used when copying a constant string, but
13590 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13591 For now we always fail this and let the move_by_pieces code copy
13592 the string from read-only memory. */
13593 if (op == STORE_BY_PIECES)
13594 return false;
13596 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13599 static rtx
13600 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13601 int code, tree treeop0, tree treeop1)
13603 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13604 rtx op0, op1;
13605 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13606 insn_code icode;
13607 struct expand_operand ops[4];
13609 start_sequence ();
13610 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13612 op_mode = GET_MODE (op0);
13613 if (op_mode == VOIDmode)
13614 op_mode = GET_MODE (op1);
13616 switch (op_mode)
13618 case QImode:
13619 case HImode:
13620 case SImode:
13621 cmp_mode = SImode;
13622 icode = CODE_FOR_cmpsi;
13623 break;
13625 case DImode:
13626 cmp_mode = DImode;
13627 icode = CODE_FOR_cmpdi;
13628 break;
13630 case SFmode:
13631 cmp_mode = SFmode;
13632 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13633 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13634 break;
13636 case DFmode:
13637 cmp_mode = DFmode;
13638 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13639 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13640 break;
13642 default:
13643 end_sequence ();
13644 return NULL_RTX;
13647 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13648 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13649 if (!op0 || !op1)
13651 end_sequence ();
13652 return NULL_RTX;
13654 *prep_seq = get_insns ();
13655 end_sequence ();
13657 create_fixed_operand (&ops[0], op0);
13658 create_fixed_operand (&ops[1], op1);
13660 start_sequence ();
13661 if (!maybe_expand_insn (icode, 2, ops))
13663 end_sequence ();
13664 return NULL_RTX;
13666 *gen_seq = get_insns ();
13667 end_sequence ();
13669 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13670 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13673 static rtx
13674 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13675 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13677 rtx op0, op1, target;
13678 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13679 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13680 insn_code icode;
13681 struct expand_operand ops[6];
13682 int aarch64_cond;
13684 push_to_sequence (*prep_seq);
13685 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13687 op_mode = GET_MODE (op0);
13688 if (op_mode == VOIDmode)
13689 op_mode = GET_MODE (op1);
13691 switch (op_mode)
13693 case QImode:
13694 case HImode:
13695 case SImode:
13696 cmp_mode = SImode;
13697 icode = CODE_FOR_ccmpsi;
13698 break;
13700 case DImode:
13701 cmp_mode = DImode;
13702 icode = CODE_FOR_ccmpdi;
13703 break;
13705 case SFmode:
13706 cmp_mode = SFmode;
13707 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13708 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13709 break;
13711 case DFmode:
13712 cmp_mode = DFmode;
13713 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13714 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13715 break;
13717 default:
13718 end_sequence ();
13719 return NULL_RTX;
13722 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13723 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13724 if (!op0 || !op1)
13726 end_sequence ();
13727 return NULL_RTX;
13729 *prep_seq = get_insns ();
13730 end_sequence ();
13732 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13733 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13735 if (bit_code != AND)
13737 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13738 GET_MODE (XEXP (prev, 0))),
13739 VOIDmode, XEXP (prev, 0), const0_rtx);
13740 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13743 create_fixed_operand (&ops[0], XEXP (prev, 0));
13744 create_fixed_operand (&ops[1], target);
13745 create_fixed_operand (&ops[2], op0);
13746 create_fixed_operand (&ops[3], op1);
13747 create_fixed_operand (&ops[4], prev);
13748 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13750 push_to_sequence (*gen_seq);
13751 if (!maybe_expand_insn (icode, 6, ops))
13753 end_sequence ();
13754 return NULL_RTX;
13757 *gen_seq = get_insns ();
13758 end_sequence ();
13760 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
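/* Together these two hooks let a condition such as "a == 0 && b > 5" be
   expanded branchlessly, roughly as (register names and the exact #nzcv
   immediate are illustrative):
	cmp	w_a, 0
	ccmp	w_b, 5, 4, eq
	cset	w_res, gt
   where the #nzcv value encodes the flag setting to use when the first test
   has already failed.  */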
13763 #undef TARGET_GEN_CCMP_FIRST
13764 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13766 #undef TARGET_GEN_CCMP_NEXT
13767 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13769 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13770 instruction fusion of some sort. */
13772 static bool
13773 aarch64_macro_fusion_p (void)
13775 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13779 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13780 should be kept together during scheduling. */
13782 static bool
13783 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13785 rtx set_dest;
13786 rtx prev_set = single_set (prev);
13787 rtx curr_set = single_set (curr);
13788 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13789 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13791 if (!aarch64_macro_fusion_p ())
13792 return false;
13794 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13796 /* We are trying to match:
13797 prev (mov) == (set (reg r0) (const_int imm16))
13798 curr (movk) == (set (zero_extract (reg r0)
13799 (const_int 16)
13800 (const_int 16))
13801 (const_int imm16_1)) */
13803 set_dest = SET_DEST (curr_set);
13805 if (GET_CODE (set_dest) == ZERO_EXTRACT
13806 && CONST_INT_P (SET_SRC (curr_set))
13807 && CONST_INT_P (SET_SRC (prev_set))
13808 && CONST_INT_P (XEXP (set_dest, 2))
13809 && INTVAL (XEXP (set_dest, 2)) == 16
13810 && REG_P (XEXP (set_dest, 0))
13811 && REG_P (SET_DEST (prev_set))
13812 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13814 return true;
13818 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13821 /* We're trying to match:
13822 prev (adrp) == (set (reg r1)
13823 (high (symbol_ref ("SYM"))))
13824 curr (add) == (set (reg r0)
13825 (lo_sum (reg r1)
13826 (symbol_ref ("SYM"))))
13827 Note that r0 need not necessarily be the same as r1, especially
13828 during pre-regalloc scheduling. */
13830 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13831 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13833 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13834 && REG_P (XEXP (SET_SRC (curr_set), 0))
13835 && REGNO (XEXP (SET_SRC (curr_set), 0))
13836 == REGNO (SET_DEST (prev_set))
13837 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13838 XEXP (SET_SRC (curr_set), 1)))
13839 return true;
13843 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13846 /* We're trying to match:
13847 prev (movk) == (set (zero_extract (reg r0)
13848 (const_int 16)
13849 (const_int 32))
13850 (const_int imm16_1))
13851 curr (movk) == (set (zero_extract (reg r0)
13852 (const_int 16)
13853 (const_int 48))
13854 (const_int imm16_2)) */
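/* For example (illustrative; the register and immediates are assumed):
     movk x0, 0xdead, lsl 32
     movk x0, 0xbeef, lsl 48  */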
13856 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13857 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13858 && REG_P (XEXP (SET_DEST (prev_set), 0))
13859 && REG_P (XEXP (SET_DEST (curr_set), 0))
13860 && REGNO (XEXP (SET_DEST (prev_set), 0))
13861 == REGNO (XEXP (SET_DEST (curr_set), 0))
13862 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13863 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13864 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13865 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13866 && CONST_INT_P (SET_SRC (prev_set))
13867 && CONST_INT_P (SET_SRC (curr_set)))
13868 return true;
13871 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13873 /* We're trying to match:
13874 prev (adrp) == (set (reg r0)
13875 (high (symbol_ref ("SYM"))))
13876 curr (ldr) == (set (reg r1)
13877 (mem (lo_sum (reg r0)
13878 (symbol_ref ("SYM")))))
13880 curr (ldr) == (set (reg r1)
13881 (zero_extend (mem
13882 (lo_sum (reg r0)
13883 (symbol_ref ("SYM")))))) */
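/* For example (illustrative; register numbers are assumed):
     adrp x0, SYM
     ldr  x1, [x0, :lo12:SYM]  */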
13884 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13885 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13887 rtx curr_src = SET_SRC (curr_set);
13889 if (GET_CODE (curr_src) == ZERO_EXTEND)
13890 curr_src = XEXP (curr_src, 0);
13892 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13893 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13894 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13895 == REGNO (SET_DEST (prev_set))
13896 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13897 XEXP (SET_SRC (prev_set), 0)))
13898 return true;
13902 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13903 && aarch_crypto_can_dual_issue (prev, curr))
13904 return true;
13906 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13907 && any_condjump_p (curr))
13909 enum attr_type prev_type = get_attr_type (prev);
13911 /* FIXME: this misses some instructions that ThunderX considers simple
13912 arithmetic; simple shifts are also missed here. */
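/* For example (illustrative; registers and label are assumed), a fusible
   pair here is a flag-setting ALU instruction followed by a conditional
   branch:
     cmp  w0, w1
     b.ne .Lfoo  */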
13913 if (prev_type == TYPE_ALUS_SREG
13914 || prev_type == TYPE_ALUS_IMM
13915 || prev_type == TYPE_LOGICS_REG
13916 || prev_type == TYPE_LOGICS_IMM)
13917 return true;
13920 return false;
13923 /* Return true iff the instruction fusion described by OP is enabled. */
13925 bool
13926 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13928 return (aarch64_tune_params.fusible_ops & op) != 0;
13931 /* If MEM is in the form [base+offset], extract the two parts of the
13932 address into BASE and OFFSET and return true; otherwise clear BASE
13933 and OFFSET and return false. */
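/* For example (illustrative): an address of the form (plus (reg x1)
   (const_int 16)) gives BASE == x1 and OFFSET == 16, while a bare
   (reg x1) gives BASE == x1 and OFFSET == 0.  */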
13935 bool
13936 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13938 rtx addr;
13940 gcc_assert (MEM_P (mem));
13942 addr = XEXP (mem, 0);
13944 if (REG_P (addr))
13946 *base = addr;
13947 *offset = const0_rtx;
13948 return true;
13951 if (GET_CODE (addr) == PLUS
13952 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13954 *base = XEXP (addr, 0);
13955 *offset = XEXP (addr, 1);
13956 return true;
13959 *base = NULL_RTX;
13960 *offset = NULL_RTX;
13962 return false;
13965 /* Types for scheduling fusion. */
13966 enum sched_fusion_type
13968 SCHED_FUSION_NONE = 0,
13969 SCHED_FUSION_LD_SIGN_EXTEND,
13970 SCHED_FUSION_LD_ZERO_EXTEND,
13971 SCHED_FUSION_LD,
13972 SCHED_FUSION_ST,
13973 SCHED_FUSION_NUM
13976 /* If INSN is a load or store whose address is in the form [base+offset],
13977 extract the two parts into BASE and OFFSET. Return the scheduling
13978 fusion type of INSN. */
13980 static enum sched_fusion_type
13981 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13983 rtx x, dest, src;
13984 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13986 gcc_assert (INSN_P (insn));
13987 x = PATTERN (insn);
13988 if (GET_CODE (x) != SET)
13989 return SCHED_FUSION_NONE;
13991 src = SET_SRC (x);
13992 dest = SET_DEST (x);
13994 machine_mode dest_mode = GET_MODE (dest);
13996 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13997 return SCHED_FUSION_NONE;
13999 if (GET_CODE (src) == SIGN_EXTEND)
14001 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14002 src = XEXP (src, 0);
14003 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14004 return SCHED_FUSION_NONE;
14006 else if (GET_CODE (src) == ZERO_EXTEND)
14008 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14009 src = XEXP (src, 0);
14010 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14011 return SCHED_FUSION_NONE;
14014 if (GET_CODE (src) == MEM && REG_P (dest))
14015 extract_base_offset_in_addr (src, base, offset);
14016 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14018 fusion = SCHED_FUSION_ST;
14019 extract_base_offset_in_addr (dest, base, offset);
14021 else
14022 return SCHED_FUSION_NONE;
14024 if (*base == NULL_RTX || *offset == NULL_RTX)
14025 fusion = SCHED_FUSION_NONE;
14027 return fusion;
14030 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14032 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14033 and PRI are only calculated for these instructions. For other instructions,
14034 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion
14035 of other instruction types can be added by returning different priorities.
14037 It's important that irrelevant instructions get the largest FUSION_PRI. */
14039 static void
14040 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14041 int *fusion_pri, int *pri)
14043 int tmp, off_val;
14044 rtx base, offset;
14045 enum sched_fusion_type fusion;
14047 gcc_assert (INSN_P (insn));
14049 tmp = max_pri - 1;
14050 fusion = fusion_load_store (insn, &base, &offset);
14051 if (fusion == SCHED_FUSION_NONE)
14053 *pri = tmp;
14054 *fusion_pri = tmp;
14055 return;
14058 /* Set FUSION_PRI according to fusion type and base register. */
14059 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
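/* Note (illustrative): FUSION_PRI depends only on the fusion type and the
   base register, so e.g. two SCHED_FUSION_LD loads from [x1, 8] and
   [x1, 16] receive the same FUSION_PRI and the scheduler tries to keep
   them together; PRI below then orders them by offset.  */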
14061 /* Calculate PRI. */
14062 tmp /= 2;
14065 /* The INSN with the smaller offset goes first. */
14065 off_val = (int)(INTVAL (offset));
14066 if (off_val >= 0)
14067 tmp -= (off_val & 0xfffff);
14068 else
14069 tmp += ((- off_val) & 0xfffff);
14071 *pri = tmp;
14072 return;
14075 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14076 Adjust priority of sha1h instructions so they are scheduled before
14077 other SHA1 instructions. */
14079 static int
14080 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14082 rtx x = PATTERN (insn);
14084 if (GET_CODE (x) == SET)
14086 x = SET_SRC (x);
14088 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14089 return priority + 10;
14092 return priority;
14095 /* Given OPERANDS of consecutive load/store instructions, check if we
14096 can merge them into an ldp/stp. LOAD is true if they are load
14097 instructions. MODE is the mode of the memory operands. */
14099 bool
14100 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14101 enum machine_mode mode)
14103 HOST_WIDE_INT offval_1, offval_2, msize;
14104 enum reg_class rclass_1, rclass_2;
14105 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14107 if (load)
14109 mem_1 = operands[1];
14110 mem_2 = operands[3];
14111 reg_1 = operands[0];
14112 reg_2 = operands[2];
14113 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14114 if (REGNO (reg_1) == REGNO (reg_2))
14115 return false;
14117 else
14119 mem_1 = operands[0];
14120 mem_2 = operands[2];
14121 reg_1 = operands[1];
14122 reg_2 = operands[3];
14125 /* The mems cannot be volatile. */
14126 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14127 return false;
14129 /* If we have SImode and a slow unaligned ldp,
14130 check that the alignment is at least 8 bytes. */
14131 if (mode == SImode
14132 && (aarch64_tune_params.extra_tuning_flags
14133 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14134 && !optimize_size
14135 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14136 return false;
14138 /* Check if the addresses are in the form of [base+offset]. */
14139 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14140 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14141 return false;
14142 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14143 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14144 return false;
14146 /* Check if the bases are the same. */
14147 if (!rtx_equal_p (base_1, base_2))
14148 return false;
14150 offval_1 = INTVAL (offset_1);
14151 offval_2 = INTVAL (offset_2);
14152 msize = GET_MODE_SIZE (mode);
14153 /* Check if the offsets are consecutive. */
14154 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14155 return false;
14157 /* Check if the addresses are clobbered by the load. */
14158 if (load)
14160 if (reg_mentioned_p (reg_1, mem_1))
14161 return false;
14163 /* In increasing order, the last load can clobber the address. */
14164 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14165 return false;
14168 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14169 rclass_1 = FP_REGS;
14170 else
14171 rclass_1 = GENERAL_REGS;
14173 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14174 rclass_2 = FP_REGS;
14175 else
14176 rclass_2 = GENERAL_REGS;
14178 /* Check if the registers are of the same class. */
14179 if (rclass_1 != rclass_2)
14180 return false;
14182 return true;
14185 /* Given OPERANDS of consecutive load/store instructions, check if we
14186 can merge them into an ldp/stp by adjusting the offset. LOAD is true
14187 if they are load instructions. MODE is the mode of the memory operands.
14189 For example, given the following consecutive stores:
14191 str w1, [xb, 0x100]
14192 str w1, [xb, 0x104]
14193 str w1, [xb, 0x108]
14194 str w1, [xb, 0x10c]
14196 Though the offsets are out of the range supported by stp, we can
14197 still pair them after adjusting the offset, like:
14199 add scratch, xb, 0x100
14200 stp w1, w1, [scratch]
14201 stp w1, w1, [scratch, 0x8]
14203 The peephole patterns detecting this opportunity should guarantee that
14204 the scratch register is available. */
14206 bool
14207 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14208 enum machine_mode mode)
14210 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14211 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14212 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14213 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14215 if (load)
14217 reg_1 = operands[0];
14218 mem_1 = operands[1];
14219 reg_2 = operands[2];
14220 mem_2 = operands[3];
14221 reg_3 = operands[4];
14222 mem_3 = operands[5];
14223 reg_4 = operands[6];
14224 mem_4 = operands[7];
14225 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14226 && REG_P (reg_3) && REG_P (reg_4));
14227 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14228 return false;
14230 else
14232 mem_1 = operands[0];
14233 reg_1 = operands[1];
14234 mem_2 = operands[2];
14235 reg_2 = operands[3];
14236 mem_3 = operands[4];
14237 reg_3 = operands[5];
14238 mem_4 = operands[6];
14239 reg_4 = operands[7];
14241 /* Skip if the memory operand is by itself valid for ldp/stp. */
14242 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14243 return false;
14245 /* The mems cannot be volatile. */
14246 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14247 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14248 return false;
14250 /* Check if the addresses are in the form of [base+offset]. */
14251 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14252 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14253 return false;
14254 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14255 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14256 return false;
14257 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14258 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14259 return false;
14260 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14261 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14262 return false;
14264 /* Check if the bases are the same. */
14265 if (!rtx_equal_p (base_1, base_2)
14266 || !rtx_equal_p (base_2, base_3)
14267 || !rtx_equal_p (base_3, base_4))
14268 return false;
14270 offval_1 = INTVAL (offset_1);
14271 offval_2 = INTVAL (offset_2);
14272 offval_3 = INTVAL (offset_3);
14273 offval_4 = INTVAL (offset_4);
14274 msize = GET_MODE_SIZE (mode);
14275 /* Check if the offsets are consecutive. */
14276 if ((offval_1 != (offval_2 + msize)
14277 || offval_1 != (offval_3 + msize * 2)
14278 || offval_1 != (offval_4 + msize * 3))
14279 && (offval_4 != (offval_3 + msize)
14280 || offval_4 != (offval_2 + msize * 2)
14281 || offval_4 != (offval_1 + msize * 3)))
14282 return false;
14284 /* Check if the addresses are clobbered by the load. */
14285 if (load)
14287 if (reg_mentioned_p (reg_1, mem_1)
14288 || reg_mentioned_p (reg_2, mem_2)
14289 || reg_mentioned_p (reg_3, mem_3))
14290 return false;
14292 /* In increasing order, the last load can clobber the address. */
14293 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14294 return false;
14297 /* If we have SImode and a slow unaligned ldp,
14298 check that the alignment is at least 8 bytes. */
14299 if (mode == SImode
14300 && (aarch64_tune_params.extra_tuning_flags
14301 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14302 && !optimize_size
14303 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14304 return false;
14306 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14307 rclass_1 = FP_REGS;
14308 else
14309 rclass_1 = GENERAL_REGS;
14311 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14312 rclass_2 = FP_REGS;
14313 else
14314 rclass_2 = GENERAL_REGS;
14316 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14317 rclass_3 = FP_REGS;
14318 else
14319 rclass_3 = GENERAL_REGS;
14321 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14322 rclass_4 = FP_REGS;
14323 else
14324 rclass_4 = GENERAL_REGS;
14326 /* Check if the registers are of the same class. */
14327 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14328 return false;
14330 return true;
14333 /* Given OPERANDS of consecutive load/store instructions, this function
14334 pairs them into ldp/stp after adjusting the offset. It relies on the
14335 fact that the addresses of the load/store instructions are in increasing
14336 order. MODE is the mode of the memory operands. CODE is the rtl
14337 operator that should be applied to all memory operands; it is
14338 SIGN_EXTEND, ZERO_EXTEND or UNKNOWN. */
14340 bool
14341 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14342 enum machine_mode mode, RTX_CODE code)
14344 rtx base, offset, t1, t2;
14345 rtx mem_1, mem_2, mem_3, mem_4;
14346 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14348 if (load)
14350 mem_1 = operands[1];
14351 mem_2 = operands[3];
14352 mem_3 = operands[5];
14353 mem_4 = operands[7];
14355 else
14357 mem_1 = operands[0];
14358 mem_2 = operands[2];
14359 mem_3 = operands[4];
14360 mem_4 = operands[6];
14361 gcc_assert (code == UNKNOWN);
14364 extract_base_offset_in_addr (mem_1, &base, &offset);
14365 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14367 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14368 msize = GET_MODE_SIZE (mode);
14369 stp_off_limit = msize * 0x40;
14370 off_val = INTVAL (offset);
14371 abs_off = (off_val < 0) ? -off_val : off_val;
14372 new_off = abs_off % stp_off_limit;
14373 adj_off = abs_off - new_off;
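/* Worked example (illustrative): for SImode, msize is 4 and stp_off_limit
   is 0x100, so an original offset of 0x104 splits into adj_off == 0x100
   and new_off == 0x4, matching the str/stp example in the function
   comment above.  */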
14375 /* Further adjust to make sure all offsets are OK. */
14376 if ((new_off + msize * 2) >= stp_off_limit)
14378 adj_off += stp_off_limit;
14379 new_off -= stp_off_limit;
14382 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14383 if (adj_off >= 0x1000)
14384 return false;
14386 if (off_val < 0)
14388 adj_off = -adj_off;
14389 new_off = -new_off;
14392 /* Create new memory references. */
14393 mem_1 = change_address (mem_1, VOIDmode,
14394 plus_constant (DImode, operands[8], new_off));
14396 /* Check if the adjusted address is OK for ldp/stp. */
14397 if (!aarch64_mem_pair_operand (mem_1, mode))
14398 return false;
14400 msize = GET_MODE_SIZE (mode);
14401 mem_2 = change_address (mem_2, VOIDmode,
14402 plus_constant (DImode,
14403 operands[8],
14404 new_off + msize));
14405 mem_3 = change_address (mem_3, VOIDmode,
14406 plus_constant (DImode,
14407 operands[8],
14408 new_off + msize * 2));
14409 mem_4 = change_address (mem_4, VOIDmode,
14410 plus_constant (DImode,
14411 operands[8],
14412 new_off + msize * 3));
14414 if (code == ZERO_EXTEND)
14416 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14417 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14418 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14419 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14421 else if (code == SIGN_EXTEND)
14423 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14424 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14425 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14426 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14429 if (load)
14431 operands[1] = mem_1;
14432 operands[3] = mem_2;
14433 operands[5] = mem_3;
14434 operands[7] = mem_4;
14436 else
14438 operands[0] = mem_1;
14439 operands[2] = mem_2;
14440 operands[4] = mem_3;
14441 operands[6] = mem_4;
14444 /* Emit adjusting instruction. */
14445 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14446 /* Emit ldp/stp instructions. */
14447 t1 = gen_rtx_SET (operands[0], operands[1]);
14448 t2 = gen_rtx_SET (operands[2], operands[3]);
14449 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14450 t1 = gen_rtx_SET (operands[4], operands[5]);
14451 t2 = gen_rtx_SET (operands[6], operands[7]);
14452 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14453 return true;
14456 /* Return true if a pseudo register should be created and used to hold
14457 the GOT address for PIC code. */
14459 bool
14460 aarch64_use_pseudo_pic_reg (void)
14462 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14465 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14467 static int
14468 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14470 switch (XINT (x, 1))
14472 case UNSPEC_GOTSMALLPIC:
14473 case UNSPEC_GOTSMALLPIC28K:
14474 case UNSPEC_GOTTINYPIC:
14475 return 0;
14476 default:
14477 break;
14480 return default_unspec_may_trap_p (x, flags);
14484 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14485 return the log2 of that value. Otherwise return -1. */
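/* For example (illustrative): 1.0 gives 0 and 4.0 gives 2, while -2.0,
   0.75 and 3.0 all give -1.  */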
14488 int aarch64_fpconst_pow_of_2 (rtx x)
14490 const REAL_VALUE_TYPE *r;
14492 if (!CONST_DOUBLE_P (x))
14493 return -1;
14495 r = CONST_DOUBLE_REAL_VALUE (x);
14497 if (REAL_VALUE_NEGATIVE (*r)
14498 || REAL_VALUE_ISNAN (*r)
14499 || REAL_VALUE_ISINF (*r)
14500 || !real_isinteger (r, DFmode))
14501 return -1;
14503 return exact_log2 (real_to_integer (r));
14506 /* If X is a vector of equal CONST_DOUBLE values and that value is
14507 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14510 int aarch64_vec_fpconst_pow_of_2 (rtx x)
14512 if (GET_CODE (x) != CONST_VECTOR)
14513 return -1;
14515 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14516 return -1;
14518 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14519 if (firstval <= 0)
14520 return -1;
14522 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14523 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14524 return -1;
14526 return firstval;
14529 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14530 to float.
14532 __fp16 always promotes through this hook.
14533 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14534 through the generic excess precision logic rather than here. */
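/* For example (illustrative): given __fp16 a, b; the sum a + b is
   evaluated on float operands, because this hook promotes __fp16 to
   float.  */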
14536 static tree
14537 aarch64_promoted_type (const_tree t)
14539 if (SCALAR_FLOAT_TYPE_P (t)
14540 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14541 return float_type_node;
14543 return NULL_TREE;
14546 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14548 static bool
14549 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14550 optimization_type opt_type)
14552 switch (op)
14554 case rsqrt_optab:
14555 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14557 default:
14558 return true;
14562 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14563 if MODE is HFmode, and punt to the generic implementation otherwise. */
14565 static bool
14566 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14568 return (mode == HFmode
14569 ? true
14570 : default_libgcc_floating_mode_supported_p (mode));
14573 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14574 if MODE is HFmode, and punt to the generic implementation otherwise. */
14576 static bool
14577 aarch64_scalar_mode_supported_p (machine_mode mode)
14579 return (mode == HFmode
14580 ? true
14581 : default_scalar_mode_supported_p (mode));
14584 /* Set the value of FLT_EVAL_METHOD.
14585 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14587 0: evaluate all operations and constants, whose semantic type has at
14588 most the range and precision of type float, to the range and
14589 precision of float; evaluate all other operations and constants to
14590 the range and precision of the semantic type;
14592 N, where _FloatN is a supported interchange floating type
14593 evaluate all operations and constants, whose semantic type has at
14594 most the range and precision of _FloatN type, to the range and
14595 precision of the _FloatN type; evaluate all other operations and
14596 constants to the range and precision of the semantic type;
14598 If we have the ARMv8.2-A extensions then we support _Float16 in native
14599 precision, so we should set this to 16. Otherwise, we support the type,
14600 but want to evaluate expressions in float precision, so set this to
14601 0. */
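/* Illustrative example (assuming -march=armv8.2-a+fp16, which enables
   TARGET_FP_F16INST):
     _Float16 a, b, c;
     c = a + b;
   is evaluated directly in _Float16 when FLT_EVAL_METHOD is 16; without
   the FP16 instructions the addition is instead evaluated in float
   (FLT_EVAL_METHOD 0) and the result converted back to _Float16.  */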
14603 static enum flt_eval_method
14604 aarch64_excess_precision (enum excess_precision_type type)
14606 switch (type)
14608 case EXCESS_PRECISION_TYPE_FAST:
14609 case EXCESS_PRECISION_TYPE_STANDARD:
14610 /* We can calculate either in 16-bit range and precision or
14611 32-bit range and precision. Make that decision based on whether
14612 we have native support for the ARMv8.2-A 16-bit floating-point
14613 instructions or not. */
14614 return (TARGET_FP_F16INST
14615 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14616 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14617 case EXCESS_PRECISION_TYPE_IMPLICIT:
14618 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14619 default:
14620 gcc_unreachable ();
14622 return FLT_EVAL_METHOD_UNPREDICTABLE;
14625 /* Target-specific selftests. */
14627 #if CHECKING_P
14629 namespace selftest {
14631 /* Selftest for the RTL loader.
14632 Verify that the RTL loader copes with a dump from
14633 print_rtx_function. This is essentially just a test that class
14634 function_reader can handle a real dump, but it also verifies
14635 that lookup_reg_by_dump_name correctly handles hard regs.
14636 The presence of hard reg names in the dump means that the test is
14637 target-specific, hence it is in this file. */
14639 static void
14640 aarch64_test_loading_full_dump ()
14642 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14644 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14646 rtx_insn *insn_1 = get_insn_by_uid (1);
14647 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14649 rtx_insn *insn_15 = get_insn_by_uid (15);
14650 ASSERT_EQ (INSN, GET_CODE (insn_15));
14651 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14653 /* Verify crtl->return_rtx. */
14654 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14655 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14656 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14659 /* Run all target-specific selftests. */
14661 static void
14662 aarch64_run_selftests (void)
14664 aarch64_test_loading_full_dump ();
14667 } // namespace selftest
14669 #endif /* #if CHECKING_P */
14671 #undef TARGET_ADDRESS_COST
14672 #define TARGET_ADDRESS_COST aarch64_address_cost
14674 /* This hook determines whether unnamed bitfields affect the alignment
14675 of the containing structure. The hook returns true if the structure
14676 should inherit the alignment requirements of an unnamed bitfield's
14677 type. */
14678 #undef TARGET_ALIGN_ANON_BITFIELD
14679 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14681 #undef TARGET_ASM_ALIGNED_DI_OP
14682 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14684 #undef TARGET_ASM_ALIGNED_HI_OP
14685 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14687 #undef TARGET_ASM_ALIGNED_SI_OP
14688 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14690 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14691 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14692 hook_bool_const_tree_hwi_hwi_const_tree_true
14694 #undef TARGET_ASM_FILE_START
14695 #define TARGET_ASM_FILE_START aarch64_start_file
14697 #undef TARGET_ASM_OUTPUT_MI_THUNK
14698 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14700 #undef TARGET_ASM_SELECT_RTX_SECTION
14701 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14703 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14704 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14706 #undef TARGET_BUILD_BUILTIN_VA_LIST
14707 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14709 #undef TARGET_CALLEE_COPIES
14710 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14712 #undef TARGET_CAN_ELIMINATE
14713 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14715 #undef TARGET_CAN_INLINE_P
14716 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14718 #undef TARGET_CANNOT_FORCE_CONST_MEM
14719 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14721 #undef TARGET_CASE_VALUES_THRESHOLD
14722 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14724 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14725 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14727 /* Only the least significant bit is used for initialization guard
14728 variables. */
14729 #undef TARGET_CXX_GUARD_MASK_BIT
14730 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14732 #undef TARGET_C_MODE_FOR_SUFFIX
14733 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14735 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14736 #undef TARGET_DEFAULT_TARGET_FLAGS
14737 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14738 #endif
14740 #undef TARGET_CLASS_MAX_NREGS
14741 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14743 #undef TARGET_BUILTIN_DECL
14744 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14746 #undef TARGET_BUILTIN_RECIPROCAL
14747 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14749 #undef TARGET_C_EXCESS_PRECISION
14750 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14752 #undef TARGET_EXPAND_BUILTIN
14753 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14755 #undef TARGET_EXPAND_BUILTIN_VA_START
14756 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14758 #undef TARGET_FOLD_BUILTIN
14759 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14761 #undef TARGET_FUNCTION_ARG
14762 #define TARGET_FUNCTION_ARG aarch64_function_arg
14764 #undef TARGET_FUNCTION_ARG_ADVANCE
14765 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14767 #undef TARGET_FUNCTION_ARG_BOUNDARY
14768 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14770 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14771 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14773 #undef TARGET_FUNCTION_VALUE
14774 #define TARGET_FUNCTION_VALUE aarch64_function_value
14776 #undef TARGET_FUNCTION_VALUE_REGNO_P
14777 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14779 #undef TARGET_FRAME_POINTER_REQUIRED
14780 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14782 #undef TARGET_GIMPLE_FOLD_BUILTIN
14783 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14785 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14786 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14788 #undef TARGET_INIT_BUILTINS
14789 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14791 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14792 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14793 aarch64_ira_change_pseudo_allocno_class
14795 #undef TARGET_LEGITIMATE_ADDRESS_P
14796 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14798 #undef TARGET_LEGITIMATE_CONSTANT_P
14799 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14801 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14802 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14803 aarch64_legitimize_address_displacement
14805 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14806 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14808 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14809 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14810 aarch64_libgcc_floating_mode_supported_p
14812 #undef TARGET_MANGLE_TYPE
14813 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14815 #undef TARGET_MEMORY_MOVE_COST
14816 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14818 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14819 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14821 #undef TARGET_MUST_PASS_IN_STACK
14822 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14824 /* This target hook should return true if accesses to volatile bitfields
14825 should use the narrowest mode possible. It should return false if these
14826 accesses should use the bitfield container type. */
14827 #undef TARGET_NARROW_VOLATILE_BITFIELD
14828 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14830 #undef TARGET_OPTION_OVERRIDE
14831 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14833 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14834 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14835 aarch64_override_options_after_change
14837 #undef TARGET_OPTION_SAVE
14838 #define TARGET_OPTION_SAVE aarch64_option_save
14840 #undef TARGET_OPTION_RESTORE
14841 #define TARGET_OPTION_RESTORE aarch64_option_restore
14843 #undef TARGET_OPTION_PRINT
14844 #define TARGET_OPTION_PRINT aarch64_option_print
14846 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14847 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14849 #undef TARGET_SET_CURRENT_FUNCTION
14850 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14852 #undef TARGET_PASS_BY_REFERENCE
14853 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14855 #undef TARGET_PREFERRED_RELOAD_CLASS
14856 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14858 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14859 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14861 #undef TARGET_PROMOTED_TYPE
14862 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14864 #undef TARGET_SECONDARY_RELOAD
14865 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14867 #undef TARGET_SHIFT_TRUNCATION_MASK
14868 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14870 #undef TARGET_SETUP_INCOMING_VARARGS
14871 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14873 #undef TARGET_STRUCT_VALUE_RTX
14874 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14876 #undef TARGET_REGISTER_MOVE_COST
14877 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14879 #undef TARGET_RETURN_IN_MEMORY
14880 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14882 #undef TARGET_RETURN_IN_MSB
14883 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14885 #undef TARGET_RTX_COSTS
14886 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14888 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14889 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14891 #undef TARGET_SCHED_ISSUE_RATE
14892 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14894 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14895 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14896 aarch64_sched_first_cycle_multipass_dfa_lookahead
14898 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14899 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14900 aarch64_first_cycle_multipass_dfa_lookahead_guard
14902 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14903 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14904 aarch64_get_separate_components
14906 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14907 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14908 aarch64_components_for_bb
14910 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14911 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14912 aarch64_disqualify_components
14914 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14915 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14916 aarch64_emit_prologue_components
14918 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14919 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14920 aarch64_emit_epilogue_components
14922 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14923 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14924 aarch64_set_handled_components
14926 #undef TARGET_TRAMPOLINE_INIT
14927 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14929 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14930 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14932 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14933 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14935 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
14936 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
14937 aarch64_builtin_support_vector_misalignment
14939 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14940 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14942 #undef TARGET_VECTORIZE_ADD_STMT_COST
14943 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14945 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14946 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14947 aarch64_builtin_vectorization_cost
14949 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14950 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14952 #undef TARGET_VECTORIZE_BUILTINS
14953 #define TARGET_VECTORIZE_BUILTINS
14955 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14956 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14957 aarch64_builtin_vectorized_function
14959 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14960 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14961 aarch64_autovectorize_vector_sizes
14963 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14964 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14965 aarch64_atomic_assign_expand_fenv
14967 /* Section anchor support. */
14969 #undef TARGET_MIN_ANCHOR_OFFSET
14970 #define TARGET_MIN_ANCHOR_OFFSET -256
14972 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14973 byte offset; we can do much more for larger data types, but have no way
14974 to determine the size of the access. We assume accesses are aligned. */
14975 #undef TARGET_MAX_ANCHOR_OFFSET
14976 #define TARGET_MAX_ANCHOR_OFFSET 4095
14978 #undef TARGET_VECTOR_ALIGNMENT
14979 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14981 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14982 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14983 aarch64_simd_vector_alignment_reachable
14985 /* vec_perm support. */
14987 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14988 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14989 aarch64_vectorize_vec_perm_const_ok
14991 #undef TARGET_INIT_LIBFUNCS
14992 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14994 #undef TARGET_FIXED_CONDITION_CODE_REGS
14995 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14997 #undef TARGET_FLAGS_REGNUM
14998 #define TARGET_FLAGS_REGNUM CC_REGNUM
15000 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15001 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15003 #undef TARGET_ASAN_SHADOW_OFFSET
15004 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15006 #undef TARGET_LEGITIMIZE_ADDRESS
15007 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15009 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15010 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15011 aarch64_use_by_pieces_infrastructure_p
15013 #undef TARGET_CAN_USE_DOLOOP_P
15014 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15016 #undef TARGET_SCHED_ADJUST_PRIORITY
15017 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15019 #undef TARGET_SCHED_MACRO_FUSION_P
15020 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15022 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15023 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15025 #undef TARGET_SCHED_FUSION_PRIORITY
15026 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15028 #undef TARGET_UNSPEC_MAY_TRAP_P
15029 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15031 #undef TARGET_USE_PSEUDO_PIC_REG
15032 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15034 #undef TARGET_PRINT_OPERAND
15035 #define TARGET_PRINT_OPERAND aarch64_print_operand
15037 #undef TARGET_PRINT_OPERAND_ADDRESS
15038 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15040 #undef TARGET_OPTAB_SUPPORTED_P
15041 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15043 #undef TARGET_OMIT_STRUCT_RETURN_REG
15044 #define TARGET_OMIT_STRUCT_RETURN_REG true
15046 /* The architecture reserves bits 0 and 1, so use bit 2 (i.e. the value 4) for descriptors. */
15047 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15048 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15050 #if CHECKING_P
15051 #undef TARGET_RUN_TARGET_SELFTESTS
15052 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15053 #endif /* #if CHECKING_P */
15055 struct gcc_target targetm = TARGET_INITIALIZER;
15057 #include "gt-aarch64.h"