[AArch64] Handle CSEL of zero_extended operands in rtx costs
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 73ef7e5a554cbfdf9f87c1554de7ab5d3724c482
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "target.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "insn-attr.h"
40 #include "alias.h"
41 #include "fold-const.h"
42 #include "stor-layout.h"
43 #include "calls.h"
44 #include "varasm.h"
45 #include "output.h"
46 #include "flags.h"
47 #include "explow.h"
48 #include "expr.h"
49 #include "reload.h"
50 #include "langhooks.h"
51 #include "opts.h"
52 #include "params.h"
53 #include "gimplify.h"
54 #include "dwarf2.h"
55 #include "gimple-iterator.h"
56 #include "tree-vectorizer.h"
57 #include "aarch64-cost-tables.h"
58 #include "dumpfile.h"
59 #include "builtins.h"
60 #include "rtl-iter.h"
61 #include "tm-constrs.h"
62 #include "sched-int.h"
63 #include "cortex-a57-fma-steering.h"
64 #include "target-globals.h"
65 #include "common/common-target.h"
67 /* This file should be included last. */
68 #include "target-def.h"
70 /* Defined for convenience. */
71 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
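/* For example, this evaluates to 8 bytes under the default LP64 ABI and
   to 4 bytes under -mabi=ilp32.  */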
73 /* Classifies an address.
75 ADDRESS_REG_IMM
76 A simple base register plus immediate offset.
78 ADDRESS_REG_WB
79 A base register indexed by immediate offset with writeback.
81 ADDRESS_REG_REG
82 A base register indexed by (optionally scaled) register.
84 ADDRESS_REG_UXTW
85 A base register indexed by (optionally scaled) zero-extended register.
87 ADDRESS_REG_SXTW
88 A base register indexed by (optionally scaled) sign-extended register.
90 ADDRESS_LO_SUM
91 A LO_SUM rtx with a base register and "LO12" symbol relocation.
93 ADDRESS_SYMBOLIC:
94 A constant symbolic address, in pc-relative literal pool. */
96 enum aarch64_address_type {
97 ADDRESS_REG_IMM,
98 ADDRESS_REG_WB,
99 ADDRESS_REG_REG,
100 ADDRESS_REG_UXTW,
101 ADDRESS_REG_SXTW,
102 ADDRESS_LO_SUM,
103 ADDRESS_SYMBOLIC
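/* As a rough illustration, typical assembly operand forms for each class,
   using x0/w1 as arbitrary example registers and "sym" as an example symbol:

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:sym] (paired with a preceding adrp)
     ADDRESS_SYMBOLIC   a pc-relative load from the literal pool.  */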
106 struct aarch64_address_info {
107 enum aarch64_address_type type;
108 rtx base;
109 rtx offset;
110 int shift;
111 enum aarch64_symbol_type symbol_type;
114 struct simd_immediate_info
116 rtx value;
117 int shift;
118 int element_width;
119 bool mvn;
120 bool msl;
123 /* The current code model. */
124 enum aarch64_code_model aarch64_cmodel;
126 #ifdef HAVE_AS_TLS
127 #undef TARGET_HAVE_TLS
128 #define TARGET_HAVE_TLS 1
129 #endif
131 static bool aarch64_composite_type_p (const_tree, machine_mode);
132 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
133 const_tree,
134 machine_mode *, int *,
135 bool *);
136 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_override_options_after_change (void);
139 static bool aarch64_vector_mode_supported_p (machine_mode);
140 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
141 const unsigned char *sel);
142 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
144 /* Major revision number of the ARM Architecture implemented by the target. */
145 unsigned aarch64_architecture_version;
147 /* The processor for which instructions should be scheduled. */
148 enum aarch64_processor aarch64_tune = cortexa53;
150 /* Mask to specify which instruction scheduling options should be used. */
151 unsigned long aarch64_tune_flags = 0;
153 /* Global flag for PC relative loads. */
154 bool aarch64_nopcrelative_literal_loads;
156 /* Support for command line parsing of boolean flags in the tuning
157 structures. */
158 struct aarch64_flag_desc
160 const char* name;
161 unsigned int flag;
164 #define AARCH64_FUSION_PAIR(name, internal_name) \
165 { name, AARCH64_FUSE_##internal_name },
166 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
168 { "none", AARCH64_FUSE_NOTHING },
169 #include "aarch64-fusion-pairs.def"
170 { "all", AARCH64_FUSE_ALL },
171 { NULL, AARCH64_FUSE_NOTHING }
173 #undef AARCH64_FUSION_PAIR
175 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
176 { name, AARCH64_EXTRA_TUNE_##internal_name },
177 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 { "none", AARCH64_EXTRA_TUNE_NONE },
180 #include "aarch64-tuning-flags.def"
181 { "all", AARCH64_EXTRA_TUNE_ALL },
182 { NULL, AARCH64_EXTRA_TUNE_NONE }
184 #undef AARCH64_EXTRA_TUNING_OPTION
186 /* Tuning parameters. */
188 static const struct cpu_addrcost_table generic_addrcost_table =
191 0, /* hi */
192 0, /* si */
193 0, /* di */
194 0, /* ti */
196 0, /* pre_modify */
197 0, /* post_modify */
198 0, /* register_offset */
199 0, /* register_sextend */
200 0, /* register_zextend */
201 0 /* imm_offset */
204 static const struct cpu_addrcost_table cortexa57_addrcost_table =
207 1, /* hi */
208 0, /* si */
209 0, /* di */
210 1, /* ti */
212 0, /* pre_modify */
213 0, /* post_modify */
214 0, /* register_offset */
215 0, /* register_sextend */
216 0, /* register_zextend */
217 0, /* imm_offset */
220 static const struct cpu_addrcost_table exynosm1_addrcost_table =
223 0, /* hi */
224 0, /* si */
225 0, /* di */
226 2, /* ti */
228 0, /* pre_modify */
229 0, /* post_modify */
230 1, /* register_offset */
231 1, /* register_sextend */
232 2, /* register_zextend */
233 0, /* imm_offset */
236 static const struct cpu_addrcost_table xgene1_addrcost_table =
239 1, /* hi */
240 0, /* si */
241 0, /* di */
242 1, /* ti */
244 1, /* pre_modify */
245 0, /* post_modify */
246 0, /* register_offset */
247 1, /* register_sextend */
248 1, /* register_zextend */
249 0, /* imm_offset */
252 static const struct cpu_regmove_cost generic_regmove_cost =
254 1, /* GP2GP */
255 /* Avoid the use of slow int<->fp moves for spilling by setting
256 their cost higher than memmov_cost. */
257 5, /* GP2FP */
258 5, /* FP2GP */
259 2 /* FP2FP */
262 static const struct cpu_regmove_cost cortexa57_regmove_cost =
264 1, /* GP2GP */
265 /* Avoid the use of slow int<->fp moves for spilling by setting
266 their cost higher than memmov_cost. */
267 5, /* GP2FP */
268 5, /* FP2GP */
269 2 /* FP2FP */
272 static const struct cpu_regmove_cost cortexa53_regmove_cost =
274 1, /* GP2GP */
275 /* Avoid the use of slow int<->fp moves for spilling by setting
276 their cost higher than memmov_cost. */
277 5, /* GP2FP */
278 5, /* FP2GP */
279 2 /* FP2FP */
282 static const struct cpu_regmove_cost exynosm1_regmove_cost =
284 1, /* GP2GP */
285 /* Avoid the use of slow int<->fp moves for spilling by setting
286 their cost higher than memmov_cost (actual costs are 4 and 9). */
287 9, /* GP2FP */
288 9, /* FP2GP */
289 1 /* FP2FP */
292 static const struct cpu_regmove_cost thunderx_regmove_cost =
294 2, /* GP2GP */
295 2, /* GP2FP */
296 6, /* FP2GP */
297 4 /* FP2FP */
300 static const struct cpu_regmove_cost xgene1_regmove_cost =
302 1, /* GP2GP */
303 /* Avoid the use of slow int<->fp moves for spilling by setting
304 their cost higher than memmov_cost. */
305 8, /* GP2FP */
306 8, /* FP2GP */
307 2 /* FP2FP */
310 /* Generic costs for vector insn classes. */
311 static const struct cpu_vector_cost generic_vector_cost =
313 1, /* scalar_stmt_cost */
314 1, /* scalar_load_cost */
315 1, /* scalar_store_cost */
316 1, /* vec_stmt_cost */
317 1, /* vec_to_scalar_cost */
318 1, /* scalar_to_vec_cost */
319 1, /* vec_align_load_cost */
320 1, /* vec_unalign_load_cost */
321 1, /* vec_unalign_store_cost */
322 1, /* vec_store_cost */
323 3, /* cond_taken_branch_cost */
324 1 /* cond_not_taken_branch_cost */
327 /* Costs for vector insn classes for Cortex-A57. */
328 static const struct cpu_vector_cost cortexa57_vector_cost =
330 1, /* scalar_stmt_cost */
331 4, /* scalar_load_cost */
332 1, /* scalar_store_cost */
333 3, /* vec_stmt_cost */
334 8, /* vec_to_scalar_cost */
335 8, /* scalar_to_vec_cost */
336 5, /* vec_align_load_cost */
337 5, /* vec_unalign_load_cost */
338 1, /* vec_unalign_store_cost */
339 1, /* vec_store_cost */
340 1, /* cond_taken_branch_cost */
341 1 /* cond_not_taken_branch_cost */
344 static const struct cpu_vector_cost exynosm1_vector_cost =
346 1, /* scalar_stmt_cost */
347 5, /* scalar_load_cost */
348 1, /* scalar_store_cost */
349 3, /* vec_stmt_cost */
350 3, /* vec_to_scalar_cost */
351 3, /* scalar_to_vec_cost */
352 5, /* vec_align_load_cost */
353 5, /* vec_unalign_load_cost */
354 1, /* vec_unalign_store_cost */
355 1, /* vec_store_cost */
356 1, /* cond_taken_branch_cost */
357 1 /* cond_not_taken_branch_cost */
360 /* Costs for vector insn classes for X-Gene 1. */
361 static const struct cpu_vector_cost xgene1_vector_cost =
363 1, /* scalar_stmt_cost */
364 5, /* scalar_load_cost */
365 1, /* scalar_store_cost */
366 2, /* vec_stmt_cost */
367 4, /* vec_to_scalar_cost */
368 4, /* scalar_to_vec_cost */
369 10, /* vec_align_load_cost */
370 10, /* vec_unalign_load_cost */
371 2, /* vec_unalign_store_cost */
372 2, /* vec_store_cost */
373 2, /* cond_taken_branch_cost */
374 1 /* cond_not_taken_branch_cost */
377 /* Generic costs for branch instructions. */
378 static const struct cpu_branch_cost generic_branch_cost =
380 2, /* Predictable. */
381 2 /* Unpredictable. */
384 /* Branch costs for Cortex-A57. */
385 static const struct cpu_branch_cost cortexa57_branch_cost =
387 1, /* Predictable. */
388 3 /* Unpredictable. */
391 static const struct tune_params generic_tunings =
393 &cortexa57_extra_costs,
394 &generic_addrcost_table,
395 &generic_regmove_cost,
396 &generic_vector_cost,
397 &generic_branch_cost,
398 4, /* memmov_cost */
399 2, /* issue_rate */
400 AARCH64_FUSE_NOTHING, /* fusible_ops */
401 8, /* function_align. */
402 8, /* jump_align. */
403 4, /* loop_align. */
404 2, /* int_reassoc_width. */
405 4, /* fp_reassoc_width. */
406 1, /* vec_reassoc_width. */
407 2, /* min_div_recip_mul_sf. */
408 2, /* min_div_recip_mul_df. */
409 0, /* max_case_values. */
410 0, /* cache_line_size. */
411 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
412 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
415 static const struct tune_params cortexa35_tunings =
417 &cortexa53_extra_costs,
418 &generic_addrcost_table,
419 &cortexa53_regmove_cost,
420 &generic_vector_cost,
421 &generic_branch_cost,
422 4, /* memmov_cost */
423 1, /* issue_rate */
424 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
425 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
426 8, /* function_align. */
427 8, /* jump_align. */
428 4, /* loop_align. */
429 2, /* int_reassoc_width. */
430 4, /* fp_reassoc_width. */
431 1, /* vec_reassoc_width. */
432 2, /* min_div_recip_mul_sf. */
433 2, /* min_div_recip_mul_df. */
434 0, /* max_case_values. */
435 0, /* cache_line_size. */
436 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
437 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
440 static const struct tune_params cortexa53_tunings =
442 &cortexa53_extra_costs,
443 &generic_addrcost_table,
444 &cortexa53_regmove_cost,
445 &generic_vector_cost,
446 &generic_branch_cost,
447 4, /* memmov_cost */
448 2, /* issue_rate */
449 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
450 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
451 8, /* function_align. */
452 8, /* jump_align. */
453 4, /* loop_align. */
454 2, /* int_reassoc_width. */
455 4, /* fp_reassoc_width. */
456 1, /* vec_reassoc_width. */
457 2, /* min_div_recip_mul_sf. */
458 2, /* min_div_recip_mul_df. */
459 0, /* max_case_values. */
460 0, /* cache_line_size. */
461 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
462 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
465 static const struct tune_params cortexa57_tunings =
467 &cortexa57_extra_costs,
468 &cortexa57_addrcost_table,
469 &cortexa57_regmove_cost,
470 &cortexa57_vector_cost,
471 &cortexa57_branch_cost,
472 4, /* memmov_cost */
473 3, /* issue_rate */
474 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
475 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
476 16, /* function_align. */
477 8, /* jump_align. */
478 4, /* loop_align. */
479 2, /* int_reassoc_width. */
480 4, /* fp_reassoc_width. */
481 1, /* vec_reassoc_width. */
482 2, /* min_div_recip_mul_sf. */
483 2, /* min_div_recip_mul_df. */
484 0, /* max_case_values. */
485 0, /* cache_line_size. */
486 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
487 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
488 | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
491 static const struct tune_params cortexa72_tunings =
493 &cortexa57_extra_costs,
494 &cortexa57_addrcost_table,
495 &cortexa57_regmove_cost,
496 &cortexa57_vector_cost,
497 &generic_branch_cost,
498 4, /* memmov_cost */
499 3, /* issue_rate */
500 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
501 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
502 16, /* function_align. */
503 8, /* jump_align. */
504 4, /* loop_align. */
505 2, /* int_reassoc_width. */
506 4, /* fp_reassoc_width. */
507 1, /* vec_reassoc_width. */
508 2, /* min_div_recip_mul_sf. */
509 2, /* min_div_recip_mul_df. */
510 0, /* max_case_values. */
511 0, /* cache_line_size. */
512 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
513 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
516 static const struct tune_params exynosm1_tunings =
518 &exynosm1_extra_costs,
519 &exynosm1_addrcost_table,
520 &exynosm1_regmove_cost,
521 &exynosm1_vector_cost,
522 &generic_branch_cost,
523 4, /* memmov_cost */
524 3, /* issue_rate */
525 (AARCH64_FUSE_NOTHING), /* fusible_ops */
526 4, /* function_align. */
527 4, /* jump_align. */
528 4, /* loop_align. */
529 2, /* int_reassoc_width. */
530 4, /* fp_reassoc_width. */
531 1, /* vec_reassoc_width. */
532 2, /* min_div_recip_mul_sf. */
533 2, /* min_div_recip_mul_df. */
534 48, /* max_case_values. */
535 64, /* cache_line_size. */
536 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
537 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
540 static const struct tune_params thunderx_tunings =
542 &thunderx_extra_costs,
543 &generic_addrcost_table,
544 &thunderx_regmove_cost,
545 &generic_vector_cost,
546 &generic_branch_cost,
547 6, /* memmov_cost */
548 2, /* issue_rate */
549 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
550 8, /* function_align. */
551 8, /* jump_align. */
552 8, /* loop_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params xgene1_tunings =
566 &xgene1_extra_costs,
567 &xgene1_addrcost_table,
568 &xgene1_regmove_cost,
569 &xgene1_vector_cost,
570 &generic_branch_cost,
571 6, /* memmov_cost */
572 4, /* issue_rate */
573 AARCH64_FUSE_NOTHING, /* fusible_ops */
574 16, /* function_align. */
575 8, /* jump_align. */
576 16, /* loop_align. */
577 2, /* int_reassoc_width. */
578 4, /* fp_reassoc_width. */
579 1, /* vec_reassoc_width. */
580 2, /* min_div_recip_mul_sf. */
581 2, /* min_div_recip_mul_df. */
582 0, /* max_case_values. */
583 0, /* cache_line_size. */
584 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
585 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
588 /* Support for fine-grained override of the tuning structures. */
589 struct aarch64_tuning_override_function
591 const char* name;
592 void (*parse_override)(const char*, struct tune_params*);
595 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
596 static void aarch64_parse_tune_string (const char*, struct tune_params*);
598 static const struct aarch64_tuning_override_function
599 aarch64_tuning_override_functions[] =
601 { "fuse", aarch64_parse_fuse_string },
602 { "tune", aarch64_parse_tune_string },
603 { NULL, NULL }
606 /* A processor implementing AArch64. */
607 struct processor
609 const char *const name;
610 enum aarch64_processor ident;
611 enum aarch64_processor sched_core;
612 enum aarch64_arch arch;
613 unsigned architecture_version;
614 const unsigned long flags;
615 const struct tune_params *const tune;
618 /* Architectures implementing AArch64. */
619 static const struct processor all_architectures[] =
621 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
622 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
623 #include "aarch64-arches.def"
624 #undef AARCH64_ARCH
625 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
628 /* Processor cores implementing AArch64. */
629 static const struct processor all_cores[] =
631 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
632 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
633 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
634 FLAGS, &COSTS##_tunings},
635 #include "aarch64-cores.def"
636 #undef AARCH64_CORE
637 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
638 AARCH64_FL_FOR_ARCH8, &generic_tunings},
639 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
643 /* Target specification. These are populated by the -march, -mtune, -mcpu
644 handling code or by target attributes. */
645 static const struct processor *selected_arch;
646 static const struct processor *selected_cpu;
647 static const struct processor *selected_tune;
649 /* The current tuning set. */
650 struct tune_params aarch64_tune_params = generic_tunings;
652 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
654 /* An ISA extension in the co-processor and main instruction set space. */
655 struct aarch64_option_extension
657 const char *const name;
658 const unsigned long flags_on;
659 const unsigned long flags_off;
662 /* ISA extensions in AArch64. */
663 static const struct aarch64_option_extension all_extensions[] =
665 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
666 {NAME, FLAGS_ON, FLAGS_OFF},
667 #include "aarch64-option-extensions.def"
668 #undef AARCH64_OPT_EXTENSION
669 {NULL, 0, 0}
672 typedef enum aarch64_cond_code
674 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
675 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
676 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
678 aarch64_cc;
680 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
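/* For example, since the enumerators above are laid out in complementary
   pairs, flipping the low bit selects the inverse condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) == AARCH64_LE  */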
682 /* The condition codes of the processor, and the inverse function. */
683 static const char * const aarch64_condition_codes[] =
685 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
686 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
689 /* Generate code to enable conditional branches in functions over 1 MiB. */
690 const char *
691 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
692 const char * branch_format)
694 rtx_code_label * tmp_label = gen_label_rtx ();
695 char label_buf[256];
696 char buffer[128];
697 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
698 CODE_LABEL_NUMBER (tmp_label));
699 const char *label_ptr = targetm.strip_name_encoding (label_buf);
700 rtx dest_label = operands[pos_label];
701 operands[pos_label] = tmp_label;
703 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
704 output_asm_insn (buffer, operands);
706 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
707 operands[pos_label] = dest_label;
708 output_asm_insn (buffer, operands);
709 return "";
712 void
713 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
715 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
716 if (TARGET_GENERAL_REGS_ONLY)
717 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
718 else
719 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
722 static unsigned int
723 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
725 if (GET_MODE_UNIT_SIZE (mode) == 4)
726 return aarch64_tune_params.min_div_recip_mul_sf;
727 return aarch64_tune_params.min_div_recip_mul_df;
730 static int
731 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
732 enum machine_mode mode)
734 if (VECTOR_MODE_P (mode))
735 return aarch64_tune_params.vec_reassoc_width;
736 if (INTEGRAL_MODE_P (mode))
737 return aarch64_tune_params.int_reassoc_width;
738 if (FLOAT_MODE_P (mode))
739 return aarch64_tune_params.fp_reassoc_width;
740 return 1;
743 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
744 unsigned
745 aarch64_dbx_register_number (unsigned regno)
747 if (GP_REGNUM_P (regno))
748 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
749 else if (regno == SP_REGNUM)
750 return AARCH64_DWARF_SP;
751 else if (FP_REGNUM_P (regno))
752 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
754 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
755 equivalent DWARF register. */
756 return DWARF_FRAME_REGISTERS;
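/* For example, assuming the usual AArch64 DWARF numbering behind the
   constants above (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31,
   AARCH64_DWARF_V0 == 64): x5 maps to 5, sp to 31 and v3 to 67, while
   CC_REGNUM has no DWARF equivalent and yields DWARF_FRAME_REGISTERS.  */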
759 /* Return TRUE if MODE is any of the large INT modes. */
760 static bool
761 aarch64_vect_struct_mode_p (machine_mode mode)
763 return mode == OImode || mode == CImode || mode == XImode;
766 /* Return TRUE if MODE is any of the vector modes. */
767 static bool
768 aarch64_vector_mode_p (machine_mode mode)
770 return aarch64_vector_mode_supported_p (mode)
771 || aarch64_vect_struct_mode_p (mode);
774 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
775 static bool
776 aarch64_array_mode_supported_p (machine_mode mode,
777 unsigned HOST_WIDE_INT nelems)
779 if (TARGET_SIMD
780 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
781 || AARCH64_VALID_SIMD_DREG_MODE (mode))
782 && (nelems >= 2 && nelems <= 4))
783 return true;
785 return false;
788 /* Implement HARD_REGNO_NREGS. */
791 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
793 switch (aarch64_regno_regclass (regno))
795 case FP_REGS:
796 case FP_LO_REGS:
797 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
798 default:
799 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
801 gcc_unreachable ();
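/* For example, assuming UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8:
   a V4SImode value needs one FP/SIMD register, while a TImode value
   needs two general registers.  */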
804 /* Implement HARD_REGNO_MODE_OK. */
807 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
809 if (GET_MODE_CLASS (mode) == MODE_CC)
810 return regno == CC_REGNUM;
812 if (regno == SP_REGNUM)
813 /* The purpose of comparing with ptr_mode is to support the
814 global register variable associated with the stack pointer
815 register via the syntax of asm ("wsp") in ILP32. */
816 return mode == Pmode || mode == ptr_mode;
818 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
819 return mode == Pmode;
821 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
822 return 1;
824 if (FP_REGNUM_P (regno))
826 if (aarch64_vect_struct_mode_p (mode))
827 return
828 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
829 else
830 return 1;
833 return 0;
836 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
837 machine_mode
838 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
839 machine_mode mode)
841 /* Handle modes that fit within single registers. */
842 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
844 if (GET_MODE_SIZE (mode) >= 4)
845 return mode;
846 else
847 return SImode;
849 /* Fall back to generic for multi-reg and very large modes. */
850 else
851 return choose_hard_reg_mode (regno, nregs, false);
854 /* Return true if calls to DECL should be treated as
855 long-calls (i.e. called via a register). */
856 static bool
857 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
859 return false;
862 /* Return true if calls to symbol-ref SYM should be treated as
863 long-calls (i.e. called via a register). */
864 bool
865 aarch64_is_long_call_p (rtx sym)
867 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
870 /* Return true if calls to symbol-ref SYM should not go through
871 plt stubs. */
873 bool
874 aarch64_is_noplt_call_p (rtx sym)
876 const_tree decl = SYMBOL_REF_DECL (sym);
878 if (flag_pic
879 && decl
880 && (!flag_plt
881 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
882 && !targetm.binds_local_p (decl))
883 return true;
885 return false;
888 /* Return true if the offsets to a zero/sign-extract operation
889 represent an expression that matches an extend operation. The
890 operands represent the parameters from
892 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
893 bool
894 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
895 rtx extract_imm)
897 HOST_WIDE_INT mult_val, extract_val;
899 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
900 return false;
902 mult_val = INTVAL (mult_imm);
903 extract_val = INTVAL (extract_imm);
905 if (extract_val > 8
906 && extract_val < GET_MODE_BITSIZE (mode)
907 && exact_log2 (extract_val & ~7) > 0
908 && (extract_val & 7) <= 4
909 && mult_val == (1 << (extract_val & 7)))
910 return true;
912 return false;
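/* For example, with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34
   the checks above succeed: 34 & ~7 == 32 is a power of two, 34 & 7 == 2
   and 4 == 1 << 2, i.e. extracting 34 bits of (reg * 4) corresponds to a
   32-bit extend of REG shifted left by 2.  */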
915 /* Emit an insn that's a simple single-set. Both the operands must be
916 known to be valid. */
917 inline static rtx
918 emit_set_insn (rtx x, rtx y)
920 return emit_insn (gen_rtx_SET (x, y));
923 /* X and Y are two things to compare using CODE. Emit the compare insn and
924 return the rtx for register 0 in the proper mode. */
926 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
928 machine_mode mode = SELECT_CC_MODE (code, x, y);
929 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
931 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
932 return cc_reg;
935 /* Build the SYMBOL_REF for __tls_get_addr. */
937 static GTY(()) rtx tls_get_addr_libfunc;
940 aarch64_tls_get_addr (void)
942 if (!tls_get_addr_libfunc)
943 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
944 return tls_get_addr_libfunc;
947 /* Return the TLS model to use for ADDR. */
949 static enum tls_model
950 tls_symbolic_operand_type (rtx addr)
952 enum tls_model tls_kind = TLS_MODEL_NONE;
953 rtx sym, addend;
955 if (GET_CODE (addr) == CONST)
957 split_const (addr, &sym, &addend);
958 if (GET_CODE (sym) == SYMBOL_REF)
959 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
961 else if (GET_CODE (addr) == SYMBOL_REF)
962 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
964 return tls_kind;
967 /* We allow lo_sum expressions in addresses as legitimate addresses
968 so that combine can take care of combining addresses where
969 necessary, but for generation purposes, we generate the address
970 as follows:
971 RTL Absolute
972 tmp = hi (symbol_ref); adrp x1, foo
973 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
976 PIC TLS
977 adrp x1, :got:foo adrp tmp, :tlsgd:foo
978 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
979 bl __tls_get_addr
982 Load TLS symbol, depending on TLS mechanism and TLS access model.
984 Global Dynamic - Traditional TLS:
985 adrp tmp, :tlsgd:imm
986 add dest, tmp, #:tlsgd_lo12:imm
987 bl __tls_get_addr
989 Global Dynamic - TLS Descriptors:
990 adrp dest, :tlsdesc:imm
991 ldr tmp, [dest, #:tlsdesc_lo12:imm]
992 add dest, dest, #:tlsdesc_lo12:imm
993 blr tmp
994 mrs tp, tpidr_el0
995 add dest, dest, tp
997 Initial Exec:
998 mrs tp, tpidr_el0
999 adrp tmp, :gottprel:imm
1000 ldr dest, [tmp, #:gottprel_lo12:imm]
1001 add dest, dest, tp
1003 Local Exec:
1004 mrs tp, tpidr_el0
1005 add t0, tp, #:tprel_hi12:imm, lsl #12
1006 add t0, t0, #:tprel_lo12_nc:imm
1009 static void
1010 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1011 enum aarch64_symbol_type type)
1013 switch (type)
1015 case SYMBOL_SMALL_ABSOLUTE:
1017 /* In ILP32, the mode of dest can be either SImode or DImode. */
1018 rtx tmp_reg = dest;
1019 machine_mode mode = GET_MODE (dest);
1021 gcc_assert (mode == Pmode || mode == ptr_mode);
1023 if (can_create_pseudo_p ())
1024 tmp_reg = gen_reg_rtx (mode);
1026 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1027 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1028 return;
1031 case SYMBOL_TINY_ABSOLUTE:
1032 emit_insn (gen_rtx_SET (dest, imm));
1033 return;
1035 case SYMBOL_SMALL_GOT_28K:
1037 machine_mode mode = GET_MODE (dest);
1038 rtx gp_rtx = pic_offset_table_rtx;
1039 rtx insn;
1040 rtx mem;
1042 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1043 here before RTL expansion. Tree IVOPTS will generate an rtl pattern
1044 to decide rtx costs, in which case pic_offset_table_rtx is not
1045 initialized. In that case there is no need to generate the first adrp
1046 instruction, as the final cost for global variable access is
1047 one instruction. */
1048 if (gp_rtx != NULL)
1050 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1051 use the page base as the GOT base, the first page may be wasted;
1052 in the worst case only 28K of space is left for the GOT).
1054 The generated instruction sequence for accessing a global variable is:
1057 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1059 Only one instruction is needed. But we must initialize
1060 pic_offset_table_rtx properly. We generate an initialization insn for
1061 every global access, and let CSE remove all the redundant copies.
1063 The final instruction sequence will look like the following
1064 when multiple global variables are accessed:
1066 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1068 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1069 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1070 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1071 ... */
1073 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1074 crtl->uses_pic_offset_table = 1;
1075 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1077 if (mode != GET_MODE (gp_rtx))
1078 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1081 if (mode == ptr_mode)
1083 if (mode == DImode)
1084 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1085 else
1086 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1088 mem = XVECEXP (SET_SRC (insn), 0, 0);
1090 else
1092 gcc_assert (mode == Pmode);
1094 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1095 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1098 /* The operand is expected to be a MEM. Whenever the related insn
1099 pattern changes, the code above that calculates MEM should be
1100 updated. */
1101 gcc_assert (GET_CODE (mem) == MEM);
1102 MEM_READONLY_P (mem) = 1;
1103 MEM_NOTRAP_P (mem) = 1;
1104 emit_insn (insn);
1105 return;
1108 case SYMBOL_SMALL_GOT_4G:
1110 /* In ILP32, the mode of dest can be either SImode or DImode,
1111 while the got entry is always of SImode size. The mode of
1112 dest depends on how dest is used: if dest is assigned to a
1113 pointer (e.g. in memory), it has SImode; it may have
1114 DImode if dest is dereferenced to access the memory.
1115 This is why we have to handle three different ldr_got_small
1116 patterns here (two patterns for ILP32). */
1118 rtx insn;
1119 rtx mem;
1120 rtx tmp_reg = dest;
1121 machine_mode mode = GET_MODE (dest);
1123 if (can_create_pseudo_p ())
1124 tmp_reg = gen_reg_rtx (mode);
1126 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1127 if (mode == ptr_mode)
1129 if (mode == DImode)
1130 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1131 else
1132 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1134 mem = XVECEXP (SET_SRC (insn), 0, 0);
1136 else
1138 gcc_assert (mode == Pmode);
1140 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1141 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1144 gcc_assert (GET_CODE (mem) == MEM);
1145 MEM_READONLY_P (mem) = 1;
1146 MEM_NOTRAP_P (mem) = 1;
1147 emit_insn (insn);
1148 return;
1151 case SYMBOL_SMALL_TLSGD:
1153 rtx_insn *insns;
1154 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1156 start_sequence ();
1157 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1158 insns = get_insns ();
1159 end_sequence ();
1161 RTL_CONST_CALL_P (insns) = 1;
1162 emit_libcall_block (insns, dest, result, imm);
1163 return;
1166 case SYMBOL_SMALL_TLSDESC:
1168 machine_mode mode = GET_MODE (dest);
1169 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1170 rtx tp;
1172 gcc_assert (mode == Pmode || mode == ptr_mode);
1174 /* In ILP32, the got entry is always of SImode size. Unlike
1175 small GOT, the dest is fixed at reg 0. */
1176 if (TARGET_ILP32)
1177 emit_insn (gen_tlsdesc_small_si (imm));
1178 else
1179 emit_insn (gen_tlsdesc_small_di (imm));
1180 tp = aarch64_load_tp (NULL);
1182 if (mode != Pmode)
1183 tp = gen_lowpart (mode, tp);
1185 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1186 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1187 return;
1190 case SYMBOL_SMALL_TLSIE:
1192 /* In ILP32, the mode of dest can be either SImode or DImode,
1193 while the got entry is always of SImode size. The mode of
1194 dest depends on how dest is used: if dest is assigned to a
1195 pointer (e.g. in memory), it has SImode; it may have
1196 DImode if dest is dereferenced to access the memory.
1197 This is why we have to handle three different tlsie_small
1198 patterns here (two patterns for ILP32). */
1199 machine_mode mode = GET_MODE (dest);
1200 rtx tmp_reg = gen_reg_rtx (mode);
1201 rtx tp = aarch64_load_tp (NULL);
1203 if (mode == ptr_mode)
1205 if (mode == DImode)
1206 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1207 else
1209 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1210 tp = gen_lowpart (mode, tp);
1213 else
1215 gcc_assert (mode == Pmode);
1216 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1219 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1220 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1221 return;
1224 case SYMBOL_TLSLE12:
1225 case SYMBOL_TLSLE24:
1226 case SYMBOL_TLSLE32:
1227 case SYMBOL_TLSLE48:
1229 machine_mode mode = GET_MODE (dest);
1230 rtx tp = aarch64_load_tp (NULL);
1232 if (mode != Pmode)
1233 tp = gen_lowpart (mode, tp);
1235 switch (type)
1237 case SYMBOL_TLSLE12:
1238 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1239 (dest, tp, imm));
1240 break;
1241 case SYMBOL_TLSLE24:
1242 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1243 (dest, tp, imm));
1244 break;
1245 case SYMBOL_TLSLE32:
1246 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1247 (dest, imm));
1248 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1249 (dest, dest, tp));
1250 break;
1251 case SYMBOL_TLSLE48:
1252 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1253 (dest, imm));
1254 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1255 (dest, dest, tp));
1256 break;
1257 default:
1258 gcc_unreachable ();
1261 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1262 return;
1265 case SYMBOL_TINY_GOT:
1266 emit_insn (gen_ldr_got_tiny (dest, imm));
1267 return;
1269 case SYMBOL_TINY_TLSIE:
1271 machine_mode mode = GET_MODE (dest);
1272 rtx tp = aarch64_load_tp (NULL);
1274 if (mode == ptr_mode)
1276 if (mode == DImode)
1277 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1278 else
1280 tp = gen_lowpart (mode, tp);
1281 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1284 else
1286 gcc_assert (mode == Pmode);
1287 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1290 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1291 return;
1294 default:
1295 gcc_unreachable ();
1299 /* Emit a move from SRC to DEST. Assume that the move expanders can
1300 handle all moves if !can_create_pseudo_p (). The distinction is
1301 important because, unlike emit_move_insn, the move expanders know
1302 how to force Pmode objects into the constant pool even when the
1303 constant pool address is not itself legitimate. */
1304 static rtx
1305 aarch64_emit_move (rtx dest, rtx src)
1307 return (can_create_pseudo_p ()
1308 ? emit_move_insn (dest, src)
1309 : emit_move_insn_1 (dest, src));
1312 /* Split a 128-bit move operation into two 64-bit move operations,
1313 taking care to handle partial overlap of register to register
1314 copies. Special cases are needed when moving between GP regs and
1315 FP regs. SRC can be a register, constant or memory; DST a register
1316 or memory. If either operand is memory it must not have any side
1317 effects. */
1318 void
1319 aarch64_split_128bit_move (rtx dst, rtx src)
1321 rtx dst_lo, dst_hi;
1322 rtx src_lo, src_hi;
1324 machine_mode mode = GET_MODE (dst);
1326 gcc_assert (mode == TImode || mode == TFmode);
1327 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1328 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1330 if (REG_P (dst) && REG_P (src))
1332 int src_regno = REGNO (src);
1333 int dst_regno = REGNO (dst);
1335 /* Handle FP <-> GP regs. */
1336 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1338 src_lo = gen_lowpart (word_mode, src);
1339 src_hi = gen_highpart (word_mode, src);
1341 if (mode == TImode)
1343 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1344 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1346 else
1348 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1349 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1351 return;
1353 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1355 dst_lo = gen_lowpart (word_mode, dst);
1356 dst_hi = gen_highpart (word_mode, dst);
1358 if (mode == TImode)
1360 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1361 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1363 else
1365 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1366 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1368 return;
1372 dst_lo = gen_lowpart (word_mode, dst);
1373 dst_hi = gen_highpart (word_mode, dst);
1374 src_lo = gen_lowpart (word_mode, src);
1375 src_hi = gen_highpart_mode (word_mode, mode, src);
1377 /* At most one pairing may overlap. */
1378 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1380 aarch64_emit_move (dst_hi, src_hi);
1381 aarch64_emit_move (dst_lo, src_lo);
1383 else
1385 aarch64_emit_move (dst_lo, src_lo);
1386 aarch64_emit_move (dst_hi, src_hi);
1390 bool
1391 aarch64_split_128bit_move_p (rtx dst, rtx src)
1393 return (! REG_P (src)
1394 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1397 /* Split a complex SIMD combine. */
1399 void
1400 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1402 machine_mode src_mode = GET_MODE (src1);
1403 machine_mode dst_mode = GET_MODE (dst);
1405 gcc_assert (VECTOR_MODE_P (dst_mode));
1407 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1409 rtx (*gen) (rtx, rtx, rtx);
1411 switch (src_mode)
1413 case V8QImode:
1414 gen = gen_aarch64_simd_combinev8qi;
1415 break;
1416 case V4HImode:
1417 gen = gen_aarch64_simd_combinev4hi;
1418 break;
1419 case V2SImode:
1420 gen = gen_aarch64_simd_combinev2si;
1421 break;
1422 case V4HFmode:
1423 gen = gen_aarch64_simd_combinev4hf;
1424 break;
1425 case V2SFmode:
1426 gen = gen_aarch64_simd_combinev2sf;
1427 break;
1428 case DImode:
1429 gen = gen_aarch64_simd_combinedi;
1430 break;
1431 case DFmode:
1432 gen = gen_aarch64_simd_combinedf;
1433 break;
1434 default:
1435 gcc_unreachable ();
1438 emit_insn (gen (dst, src1, src2));
1439 return;
1443 /* Split a complex SIMD move. */
1445 void
1446 aarch64_split_simd_move (rtx dst, rtx src)
1448 machine_mode src_mode = GET_MODE (src);
1449 machine_mode dst_mode = GET_MODE (dst);
1451 gcc_assert (VECTOR_MODE_P (dst_mode));
1453 if (REG_P (dst) && REG_P (src))
1455 rtx (*gen) (rtx, rtx);
1457 gcc_assert (VECTOR_MODE_P (src_mode));
1459 switch (src_mode)
1461 case V16QImode:
1462 gen = gen_aarch64_split_simd_movv16qi;
1463 break;
1464 case V8HImode:
1465 gen = gen_aarch64_split_simd_movv8hi;
1466 break;
1467 case V4SImode:
1468 gen = gen_aarch64_split_simd_movv4si;
1469 break;
1470 case V2DImode:
1471 gen = gen_aarch64_split_simd_movv2di;
1472 break;
1473 case V8HFmode:
1474 gen = gen_aarch64_split_simd_movv8hf;
1475 break;
1476 case V4SFmode:
1477 gen = gen_aarch64_split_simd_movv4sf;
1478 break;
1479 case V2DFmode:
1480 gen = gen_aarch64_split_simd_movv2df;
1481 break;
1482 default:
1483 gcc_unreachable ();
1486 emit_insn (gen (dst, src));
1487 return;
1491 static rtx
1492 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1494 if (can_create_pseudo_p ())
1495 return force_reg (mode, value);
1496 else
1498 x = aarch64_emit_move (x, value);
1499 return x;
1504 static rtx
1505 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1507 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1509 rtx high;
1510 /* Load the full offset into a register. This
1511 might be improvable in the future. */
1512 high = GEN_INT (offset);
1513 offset = 0;
1514 high = aarch64_force_temporary (mode, temp, high);
1515 reg = aarch64_force_temporary (mode, temp,
1516 gen_rtx_PLUS (mode, high, reg));
1518 return plus_constant (mode, reg, offset);
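/* For example, an offset of 16 or 4096 is a valid add/sub immediate and
   is folded directly into the result, whereas an offset such as 0x123456
   is first loaded into the temporary and added with a separate
   instruction.  */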
1521 static int
1522 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1523 machine_mode mode)
1525 int i;
1526 unsigned HOST_WIDE_INT val, val2, mask;
1527 int one_match, zero_match;
1528 int num_insns;
1530 val = INTVAL (imm);
1532 if (aarch64_move_imm (val, mode))
1534 if (generate)
1535 emit_insn (gen_rtx_SET (dest, imm));
1536 return 1;
1539 if ((val >> 32) == 0 || mode == SImode)
1541 if (generate)
1543 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1544 if (mode == SImode)
1545 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1546 GEN_INT ((val >> 16) & 0xffff)));
1547 else
1548 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1549 GEN_INT ((val >> 16) & 0xffff)));
1551 return 2;
1554 /* Remaining cases are all for DImode. */
1556 mask = 0xffff;
1557 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1558 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1559 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1560 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1562 if (zero_match != 2 && one_match != 2)
1564 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1565 For a 64-bit bitmask try whether changing 16 bits to all ones or
1566 zeroes creates a valid bitmask. To check any repeated bitmask,
1567 try using 16 bits from the other 32-bit half of val. */
1569 for (i = 0; i < 64; i += 16, mask <<= 16)
1571 val2 = val & ~mask;
1572 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1573 break;
1574 val2 = val | mask;
1575 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1576 break;
1577 val2 = val2 & ~mask;
1578 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1579 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1580 break;
1582 if (i != 64)
1584 if (generate)
1586 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1587 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1588 GEN_INT ((val >> i) & 0xffff)));
1593 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1594 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1595 otherwise skip zero bits. */
1597 num_insns = 1;
1598 mask = 0xffff;
1599 val2 = one_match > zero_match ? ~val : val;
1600 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1602 if (generate)
1603 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1604 ? (val | ~(mask << i))
1605 : (val & (mask << i)))));
1606 for (i += 16; i < 64; i += 16)
1608 if ((val2 & (mask << i)) == 0)
1609 continue;
1610 if (generate)
1611 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1612 GEN_INT ((val >> i) & 0xffff)));
1613 num_insns ++;
1616 return num_insns;
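/* A worked example: for the DImode constant 0x0000cafe00001234 two of the
   four 16-bit chunks are zero, so the bitmask attempt above is skipped and
   the function emits

     mov  dest, #0x1234
     movk dest, #0xcafe, lsl #32

   returning a count of 2 instructions.  */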
1620 void
1621 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1623 machine_mode mode = GET_MODE (dest);
1625 gcc_assert (mode == SImode || mode == DImode);
1627 /* Check on what type of symbol it is. */
1628 if (GET_CODE (imm) == SYMBOL_REF
1629 || GET_CODE (imm) == LABEL_REF
1630 || GET_CODE (imm) == CONST)
1632 rtx mem, base, offset;
1633 enum aarch64_symbol_type sty;
1635 /* If we have (const (plus symbol offset)), separate out the offset
1636 before we start classifying the symbol. */
1637 split_const (imm, &base, &offset);
1639 sty = aarch64_classify_symbol (base, offset);
1640 switch (sty)
1642 case SYMBOL_FORCE_TO_MEM:
1643 if (offset != const0_rtx
1644 && targetm.cannot_force_const_mem (mode, imm))
1646 gcc_assert (can_create_pseudo_p ());
1647 base = aarch64_force_temporary (mode, dest, base);
1648 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1649 aarch64_emit_move (dest, base);
1650 return;
1653 mem = force_const_mem (ptr_mode, imm);
1654 gcc_assert (mem);
1656 /* If we aren't generating PC relative literals, then
1657 we need to expand the literal pool access carefully.
1658 This is something that needs to be done in a number
1659 of places, so could well live as a separate function. */
1660 if (aarch64_nopcrelative_literal_loads)
1662 gcc_assert (can_create_pseudo_p ());
1663 base = gen_reg_rtx (ptr_mode);
1664 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1665 mem = gen_rtx_MEM (ptr_mode, base);
1668 if (mode != ptr_mode)
1669 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1671 emit_insn (gen_rtx_SET (dest, mem));
1673 return;
1675 case SYMBOL_SMALL_TLSGD:
1676 case SYMBOL_SMALL_TLSDESC:
1677 case SYMBOL_SMALL_TLSIE:
1678 case SYMBOL_SMALL_GOT_28K:
1679 case SYMBOL_SMALL_GOT_4G:
1680 case SYMBOL_TINY_GOT:
1681 case SYMBOL_TINY_TLSIE:
1682 if (offset != const0_rtx)
1684 gcc_assert (can_create_pseudo_p ());
1685 base = aarch64_force_temporary (mode, dest, base);
1686 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1687 aarch64_emit_move (dest, base);
1688 return;
1690 /* FALLTHRU */
1692 case SYMBOL_SMALL_ABSOLUTE:
1693 case SYMBOL_TINY_ABSOLUTE:
1694 case SYMBOL_TLSLE12:
1695 case SYMBOL_TLSLE24:
1696 case SYMBOL_TLSLE32:
1697 case SYMBOL_TLSLE48:
1698 aarch64_load_symref_appropriately (dest, imm, sty);
1699 return;
1701 default:
1702 gcc_unreachable ();
1706 if (!CONST_INT_P (imm))
1708 if (GET_CODE (imm) == HIGH)
1709 emit_insn (gen_rtx_SET (dest, imm));
1710 else
1712 rtx mem = force_const_mem (mode, imm);
1713 gcc_assert (mem);
1714 emit_insn (gen_rtx_SET (dest, mem));
1717 return;
1720 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1723 static bool
1724 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1725 tree exp ATTRIBUTE_UNUSED)
1727 /* Currently, always true. */
1728 return true;
1731 /* Implement TARGET_PASS_BY_REFERENCE. */
1733 static bool
1734 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1735 machine_mode mode,
1736 const_tree type,
1737 bool named ATTRIBUTE_UNUSED)
1739 HOST_WIDE_INT size;
1740 machine_mode dummymode;
1741 int nregs;
1743 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1744 size = (mode == BLKmode && type)
1745 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1747 /* Aggregates are passed by reference based on their size. */
1748 if (type && AGGREGATE_TYPE_P (type))
1750 size = int_size_in_bytes (type);
1753 /* Variable sized arguments are always returned by reference. */
1754 if (size < 0)
1755 return true;
1757 /* Can this be a candidate to be passed in fp/simd register(s)? */
1758 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1759 &dummymode, &nregs,
1760 NULL))
1761 return false;
1763 /* Arguments which are variable sized or larger than 2 registers are
1764 passed by reference unless they are a homogenous floating point
1765 aggregate. */
1766 return size > 2 * UNITS_PER_WORD;
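/* For example, a 24-byte plain struct exceeds 2 * UNITS_PER_WORD and is
   therefore passed by reference, whereas a struct of four floats is a
   homogeneous floating-point aggregate, qualifies as a SIMD/FP candidate
   above, and is passed by value in vector registers.  */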
1769 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1770 static bool
1771 aarch64_return_in_msb (const_tree valtype)
1773 machine_mode dummy_mode;
1774 int dummy_int;
1776 /* Never happens in little-endian mode. */
1777 if (!BYTES_BIG_ENDIAN)
1778 return false;
1780 /* Only composite types smaller than or equal to 16 bytes can
1781 be potentially returned in registers. */
1782 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1783 || int_size_in_bytes (valtype) <= 0
1784 || int_size_in_bytes (valtype) > 16)
1785 return false;
1787 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1788 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1789 is always passed/returned in the least significant bits of fp/simd
1790 register(s). */
1791 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1792 &dummy_mode, &dummy_int, NULL))
1793 return false;
1795 return true;
1798 /* Implement TARGET_FUNCTION_VALUE.
1799 Define how to find the value returned by a function. */
1801 static rtx
1802 aarch64_function_value (const_tree type, const_tree func,
1803 bool outgoing ATTRIBUTE_UNUSED)
1805 machine_mode mode;
1806 int unsignedp;
1807 int count;
1808 machine_mode ag_mode;
1810 mode = TYPE_MODE (type);
1811 if (INTEGRAL_TYPE_P (type))
1812 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1814 if (aarch64_return_in_msb (type))
1816 HOST_WIDE_INT size = int_size_in_bytes (type);
1818 if (size % UNITS_PER_WORD != 0)
1820 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1821 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1825 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1826 &ag_mode, &count, NULL))
1828 if (!aarch64_composite_type_p (type, mode))
1830 gcc_assert (count == 1 && mode == ag_mode);
1831 return gen_rtx_REG (mode, V0_REGNUM);
1833 else
1835 int i;
1836 rtx par;
1838 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1839 for (i = 0; i < count; i++)
1841 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1842 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1843 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1844 XVECEXP (par, 0, i) = tmp;
1846 return par;
1849 else
1850 return gen_rtx_REG (mode, R0_REGNUM);
1853 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1854 Return true if REGNO is the number of a hard register in which the values
1855 of called function may come back. */
1857 static bool
1858 aarch64_function_value_regno_p (const unsigned int regno)
1860 /* Maximum of 16 bytes can be returned in the general registers. Examples
1861 of 16-byte return values are: 128-bit integers and 16-byte small
1862 structures (excluding homogeneous floating-point aggregates). */
1863 if (regno == R0_REGNUM || regno == R1_REGNUM)
1864 return true;
1866 /* Up to four fp/simd registers can return a function value, e.g. a
1867 homogeneous floating-point aggregate having four members. */
1868 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1869 return TARGET_FLOAT;
1871 return false;
1874 /* Implement TARGET_RETURN_IN_MEMORY.
1876 If the type T of the result of a function is such that
1877 void func (T arg)
1878 would require that arg be passed as a value in a register (or set of
1879 registers) according to the parameter passing rules, then the result
1880 is returned in the same registers as would be used for such an
1881 argument. */
1883 static bool
1884 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1886 HOST_WIDE_INT size;
1887 machine_mode ag_mode;
1888 int count;
1890 if (!AGGREGATE_TYPE_P (type)
1891 && TREE_CODE (type) != COMPLEX_TYPE
1892 && TREE_CODE (type) != VECTOR_TYPE)
1893 /* Simple scalar types are always returned in registers. */
1894 return false;
1896 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1897 type,
1898 &ag_mode,
1899 &count,
1900 NULL))
1901 return false;
1903 /* Types larger than 2 registers are returned in memory. */
1904 size = int_size_in_bytes (type);
1905 return (size < 0 || size > 2 * UNITS_PER_WORD);
1908 static bool
1909 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1910 const_tree type, int *nregs)
1912 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1913 return aarch64_vfp_is_call_or_return_candidate (mode,
1914 type,
1915 &pcum->aapcs_vfp_rmode,
1916 nregs,
1917 NULL);
1920 /* Given MODE and TYPE of a function argument, return the alignment in
1921 bits. The idea is to suppress any stronger alignment requested by
1922 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1923 This is a helper function for local use only. */
1925 static unsigned int
1926 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1928 unsigned int alignment;
1930 if (type)
1932 if (!integer_zerop (TYPE_SIZE (type)))
1934 if (TYPE_MODE (type) == mode)
1935 alignment = TYPE_ALIGN (type);
1936 else
1937 alignment = GET_MODE_ALIGNMENT (mode);
1939 else
1940 alignment = 0;
1942 else
1943 alignment = GET_MODE_ALIGNMENT (mode);
1945 return alignment;
1948 /* Layout a function argument according to the AAPCS64 rules. The rule
1949 numbers refer to the rule numbers in the AAPCS64. */
1951 static void
1952 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1953 const_tree type,
1954 bool named ATTRIBUTE_UNUSED)
1956 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1957 int ncrn, nvrn, nregs;
1958 bool allocate_ncrn, allocate_nvrn;
1959 HOST_WIDE_INT size;
1961 /* We need to do this once per argument. */
1962 if (pcum->aapcs_arg_processed)
1963 return;
1965 pcum->aapcs_arg_processed = true;
1967 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1968 size
1969 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1970 UNITS_PER_WORD);
1972 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1973 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1974 mode,
1975 type,
1976 &nregs);
1978 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1979 The following code thus handles passing by SIMD/FP registers first. */
1981 nvrn = pcum->aapcs_nvrn;
1983 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1984 and homogenous short-vector aggregates (HVA). */
1985 if (allocate_nvrn)
1987 if (!TARGET_FLOAT)
1988 aarch64_err_no_fpadvsimd (mode, "argument");
1990 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1992 pcum->aapcs_nextnvrn = nvrn + nregs;
1993 if (!aarch64_composite_type_p (type, mode))
1995 gcc_assert (nregs == 1);
1996 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1998 else
2000 rtx par;
2001 int i;
2002 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2003 for (i = 0; i < nregs; i++)
2005 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2006 V0_REGNUM + nvrn + i);
2007 tmp = gen_rtx_EXPR_LIST
2008 (VOIDmode, tmp,
2009 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2010 XVECEXP (par, 0, i) = tmp;
2012 pcum->aapcs_reg = par;
2014 return;
2016 else
2018 /* C.3 NSRN is set to 8. */
2019 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2020 goto on_stack;
2024 ncrn = pcum->aapcs_ncrn;
2025 nregs = size / UNITS_PER_WORD;
2027 /* C6 - C9, though the sign and zero extension semantics are
2028 handled elsewhere. This is the case where the argument fits
2029 entirely in general registers. */
2030 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2032 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2034 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2036 /* C.8 if the argument has an alignment of 16 then the NGRN is
2037 rounded up to the next even number. */
2038 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2040 ++ncrn;
2041 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2043 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2044 A reg is still generated for it, but the caller should be smart
2045 enough not to use it. */
2046 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2048 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2050 else
2052 rtx par;
2053 int i;
2055 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2056 for (i = 0; i < nregs; i++)
2058 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2059 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2060 GEN_INT (i * UNITS_PER_WORD));
2061 XVECEXP (par, 0, i) = tmp;
2063 pcum->aapcs_reg = par;
2066 pcum->aapcs_nextncrn = ncrn + nregs;
2067 return;
2070 /* C.11 */
2071 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2073 /* The argument is passed on the stack; record the needed number of words for
2074 this argument and align the total size if necessary. */
2075 on_stack:
2076 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2077 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2078 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2079 16 / UNITS_PER_WORD);
2080 return;
2083 /* Implement TARGET_FUNCTION_ARG. */
2085 static rtx
2086 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2087 const_tree type, bool named)
2089 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2090 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2092 if (mode == VOIDmode)
2093 return NULL_RTX;
2095 aarch64_layout_arg (pcum_v, mode, type, named);
2096 return pcum->aapcs_reg;
2099 void
2100 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2101 const_tree fntype ATTRIBUTE_UNUSED,
2102 rtx libname ATTRIBUTE_UNUSED,
2103 const_tree fndecl ATTRIBUTE_UNUSED,
2104 unsigned n_named ATTRIBUTE_UNUSED)
2106 pcum->aapcs_ncrn = 0;
2107 pcum->aapcs_nvrn = 0;
2108 pcum->aapcs_nextncrn = 0;
2109 pcum->aapcs_nextnvrn = 0;
2110 pcum->pcs_variant = ARM_PCS_AAPCS64;
2111 pcum->aapcs_reg = NULL_RTX;
2112 pcum->aapcs_arg_processed = false;
2113 pcum->aapcs_stack_words = 0;
2114 pcum->aapcs_stack_size = 0;
2116 if (!TARGET_FLOAT
2117 && fndecl && TREE_PUBLIC (fndecl)
2118 && fntype && fntype != error_mark_node)
2120 const_tree type = TREE_TYPE (fntype);
2121 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2122 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2123 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2124 &mode, &nregs, NULL))
2125 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2127 return;
2130 static void
2131 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2132 machine_mode mode,
2133 const_tree type,
2134 bool named)
2136 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2137 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2139 aarch64_layout_arg (pcum_v, mode, type, named);
2140 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2141 != (pcum->aapcs_stack_words != 0));
2142 pcum->aapcs_arg_processed = false;
2143 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2144 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2145 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2146 pcum->aapcs_stack_words = 0;
2147 pcum->aapcs_reg = NULL_RTX;
2151 bool
2152 aarch64_function_arg_regno_p (unsigned regno)
2154 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2155 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2158 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2159 PARM_BOUNDARY bits of alignment, but will be given anything up
2160 to STACK_BOUNDARY bits if the type requires it. This makes sure
2161 that both before and after the layout of each argument, the Next
2162 Stacked Argument Address (NSAA) will have a minimum alignment of
2163 8 bytes. */
2165 static unsigned int
2166 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2168 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2170 if (alignment < PARM_BOUNDARY)
2171 alignment = PARM_BOUNDARY;
2172 if (alignment > STACK_BOUNDARY)
2173 alignment = STACK_BOUNDARY;
2174 return alignment;
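  /* Illustrative sketch of the effect, assuming the usual AArch64 values
     PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128: a plain 'char'
     argument is still given 64-bit alignment, while an over-aligned
     aggregate is capped at 128 bits.  */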
2177 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2179 Return true if an argument passed on the stack should be padded upwards,
2180 i.e. if the least-significant byte of the stack slot has useful data.
2182 Small aggregate types are placed at the lowest memory address.
2184 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2186 bool
2187 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2189 /* On little-endian targets, the least significant byte of every stack
2190 argument is passed at the lowest byte address of the stack slot. */
2191 if (!BYTES_BIG_ENDIAN)
2192 return true;
2194 /* Otherwise, integral, floating-point and pointer types are padded downward:
2195 the least significant byte of a stack argument is passed at the highest
2196 byte address of the stack slot. */
2197 if (type
2198 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2199 || POINTER_TYPE_P (type))
2200 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2201 return false;
2203 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2204 return true;
2207 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2209 It specifies padding for the last (possibly the only)
2210 element of a block move between registers and memory. Assuming
2211 the block is in memory, padding upward means that the last
2212 element is padded after its most significant byte, while with
2213 downward padding the last element is padded at its least
2214 significant byte side.
2216 Small aggregates and small complex types are always padded
2217 upwards.
2219 We don't need to worry about homogeneous floating-point or
2220 short-vector aggregates; their move is not affected by the
2221 padding direction determined here. Regardless of endianness,
2222 each element of such an aggregate is put in the least
2223 significant bits of a fp/simd register.
2225 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2226 register has useful data, and return the opposite if the most
2227 significant byte does. */
2229 bool
2230 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2231 bool first ATTRIBUTE_UNUSED)
2234 /* Small composite types are always padded upward. */
2235 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2237 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2238 : GET_MODE_SIZE (mode));
2239 if (size < 2 * UNITS_PER_WORD)
2240 return true;
2243 /* Otherwise, use the default padding. */
2244 return !BYTES_BIG_ENDIAN;
2247 static machine_mode
2248 aarch64_libgcc_cmp_return_mode (void)
2250 return SImode;
2253 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2255 /* We use the 12-bit shifted immediate arithmetic instructions so values
2256 must be a multiple of (1 << 12), i.e. 4096. */
2257 #define ARITH_FACTOR 4096
2259 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2260 #error Cannot use simple address calculation for stack probing
2261 #endif
2263 /* The pair of scratch registers used for stack probing. */
2264 #define PROBE_STACK_FIRST_REG 9
2265 #define PROBE_STACK_SECOND_REG 10
2267 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2268 inclusive. These are offsets from the current stack pointer. */
2270 static void
2271 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2273 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2275 /* See the same assertion on PROBE_INTERVAL above. */
2276 gcc_assert ((first % ARITH_FACTOR) == 0);
2278 /* See if we have a constant small number of probes to generate. If so,
2279 that's the easy case. */
2280 if (size <= PROBE_INTERVAL)
2282 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2284 emit_set_insn (reg1,
2285 plus_constant (ptr_mode,
2286 stack_pointer_rtx, -(first + base)));
2287 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
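 /* A worked example (illustrative): with FIRST == 1024 and SIZE == 2000,
    BASE rounds up to 4096, REG1 is set to SP - 5120 and the single
    probe lands at REG1 + 2096 == SP - 3024, i.e. exactly FIRST + SIZE
    below the incoming stack pointer.  */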
2290 /* The run-time loop is made up of 8 insns in the generic case while the
2291 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2292 else if (size <= 4 * PROBE_INTERVAL)
2294 HOST_WIDE_INT i, rem;
2296 emit_set_insn (reg1,
2297 plus_constant (ptr_mode,
2298 stack_pointer_rtx,
2299 -(first + PROBE_INTERVAL)));
2300 emit_stack_probe (reg1);
2302 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2303 it exceeds SIZE. If only two probes are needed, this will not
2304 generate any code. Then probe at FIRST + SIZE. */
2305 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2307 emit_set_insn (reg1,
2308 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2309 emit_stack_probe (reg1);
2312 rem = size - (i - PROBE_INTERVAL);
2313 if (rem > 256)
2315 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2317 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2318 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2320 else
2321 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2324 /* Otherwise, do the same as above, but in a loop. Note that we must be
2325 extra careful with variables wrapping around because we might be at
2326 the very top (or the very bottom) of the address space and we have
2327 to be able to handle this case properly; in particular, we use an
2328 equality test for the loop condition. */
2329 else
2331 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2333 /* Step 1: round SIZE to the previous multiple of the interval. */
2335 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2338 /* Step 2: compute initial and final value of the loop counter. */
2340 /* TEST_ADDR = SP + FIRST. */
2341 emit_set_insn (reg1,
2342 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2344 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2345 emit_set_insn (reg2,
2346 plus_constant (ptr_mode, stack_pointer_rtx,
2347 -(first + rounded_size)));
2350 /* Step 3: the loop
2354 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2355 probe at TEST_ADDR
2357 while (TEST_ADDR != LAST_ADDR)
2359 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2360 until it is equal to ROUNDED_SIZE. */
2362 if (ptr_mode == DImode)
2363 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2364 else
2365 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2368 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2369 that SIZE is equal to ROUNDED_SIZE. */
2371 if (size != rounded_size)
2373 HOST_WIDE_INT rem = size - rounded_size;
2375 if (rem > 256)
2377 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2379 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2380 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2382 else
2383 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2387 /* Make sure nothing is scheduled before we are done. */
2388 emit_insn (gen_blockage ());
2391 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2392 absolute addresses. */
2394 const char *
2395 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2397 static int labelno = 0;
2398 char loop_lab[32];
2399 rtx xops[2];
2401 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2403 /* Loop. */
2404 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2406 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2407 xops[0] = reg1;
2408 xops[1] = GEN_INT (PROBE_INTERVAL);
2409 output_asm_insn ("sub\t%0, %0, %1", xops);
2411 /* Probe at TEST_ADDR. */
2412 output_asm_insn ("str\txzr, [%0]", xops);
2414 /* Test if TEST_ADDR == LAST_ADDR. */
2415 xops[1] = reg2;
2416 output_asm_insn ("cmp\t%0, %1", xops);
2418 /* Branch. */
2419 fputs ("\tb.ne\t", asm_out_file);
2420 assemble_name_raw (asm_out_file, loop_lab);
2421 fputc ('\n', asm_out_file);
2423 return "";
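/* With the default 4 KiB probe interval, the loop emitted above looks
   roughly like the following (a sketch; the register numbers follow
   PROBE_STACK_FIRST_REG and PROBE_STACK_SECOND_REG):

	.LPSRL0:
		sub	x9, x9, #4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/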
2426 static bool
2427 aarch64_frame_pointer_required (void)
2429 /* In aarch64_override_options_after_change
2430 flag_omit_leaf_frame_pointer turns off the frame pointer by
2431 default. Turn it back on now if we've not got a leaf
2432 function. */
2433 if (flag_omit_leaf_frame_pointer
2434 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2435 return true;
2437 return false;
2440 /* Mark the registers that need to be saved by the callee and calculate
2441 the size of the callee-saved registers area and frame record (both FP
2442 and LR may be omitted). */
2443 static void
2444 aarch64_layout_frame (void)
2446 HOST_WIDE_INT offset = 0;
2447 int regno;
2449 if (reload_completed && cfun->machine->frame.laid_out)
2450 return;
2452 #define SLOT_NOT_REQUIRED (-2)
2453 #define SLOT_REQUIRED (-1)
2455 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2456 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2458 /* First mark all the registers that really need to be saved... */
2459 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2460 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2462 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2463 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2465 /* ... that includes the eh data registers (if needed)... */
2466 if (crtl->calls_eh_return)
2467 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2468 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2469 = SLOT_REQUIRED;
2471 /* ... and any callee saved register that dataflow says is live. */
2472 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2473 if (df_regs_ever_live_p (regno)
2474 && (regno == R30_REGNUM
2475 || !call_used_regs[regno]))
2476 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2478 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2479 if (df_regs_ever_live_p (regno)
2480 && !call_used_regs[regno])
2481 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2483 if (frame_pointer_needed)
2485 /* FP and LR are placed in the linkage record. */
2486 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2487 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2488 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2489 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2490 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2491 offset += 2 * UNITS_PER_WORD;
2494 /* Now assign stack slots for them. */
2495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2496 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2498 cfun->machine->frame.reg_offset[regno] = offset;
2499 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2500 cfun->machine->frame.wb_candidate1 = regno;
2501 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2502 cfun->machine->frame.wb_candidate2 = regno;
2503 offset += UNITS_PER_WORD;
2506 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2507 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2509 cfun->machine->frame.reg_offset[regno] = offset;
2510 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2511 cfun->machine->frame.wb_candidate1 = regno;
2512 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2513 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2514 cfun->machine->frame.wb_candidate2 = regno;
2515 offset += UNITS_PER_WORD;
2518 cfun->machine->frame.padding0 =
2519 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2520 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2522 cfun->machine->frame.saved_regs_size = offset;
2524 cfun->machine->frame.hard_fp_offset
2525 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2526 + get_frame_size ()
2527 + cfun->machine->frame.saved_regs_size,
2528 STACK_BOUNDARY / BITS_PER_UNIT);
2530 cfun->machine->frame.frame_size
2531 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2532 + crtl->outgoing_args_size,
2533 STACK_BOUNDARY / BITS_PER_UNIT);
2535 cfun->machine->frame.laid_out = true;
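  /* Illustrative example (not exhaustive): a function that needs a frame
     pointer, has 16 bytes of locals and no other callee saves gets
     reg_offset[R29] == 0, reg_offset[R30] == 8, saved_regs_size == 16,
     hard_fp_offset == 32 and, with no outgoing arguments,
     frame_size == 32.  */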
2538 static bool
2539 aarch64_register_saved_on_entry (int regno)
2541 return cfun->machine->frame.reg_offset[regno] >= 0;
2544 static unsigned
2545 aarch64_next_callee_save (unsigned regno, unsigned limit)
2547 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2548 regno ++;
2549 return regno;
2552 static void
2553 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2554 HOST_WIDE_INT adjustment)
2556 rtx base_rtx = stack_pointer_rtx;
2557 rtx insn, reg, mem;
2559 reg = gen_rtx_REG (mode, regno);
2560 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2561 plus_constant (Pmode, base_rtx, -adjustment));
2562 mem = gen_rtx_MEM (mode, mem);
2564 insn = emit_move_insn (mem, reg);
2565 RTX_FRAME_RELATED_P (insn) = 1;
2568 static rtx
2569 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2570 HOST_WIDE_INT adjustment)
2572 switch (mode)
2574 case DImode:
2575 return gen_storewb_pairdi_di (base, base, reg, reg2,
2576 GEN_INT (-adjustment),
2577 GEN_INT (UNITS_PER_WORD - adjustment));
2578 case DFmode:
2579 return gen_storewb_pairdf_di (base, base, reg, reg2,
2580 GEN_INT (-adjustment),
2581 GEN_INT (UNITS_PER_WORD - adjustment));
2582 default:
2583 gcc_unreachable ();
2587 static void
2588 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2589 unsigned regno2, HOST_WIDE_INT adjustment)
2591 rtx_insn *insn;
2592 rtx reg1 = gen_rtx_REG (mode, regno1);
2593 rtx reg2 = gen_rtx_REG (mode, regno2);
2595 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2596 reg2, adjustment));
2597 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2598 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2599 RTX_FRAME_RELATED_P (insn) = 1;
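  /* For the frame-record case (x29/x30) this emits, roughly,
	stp	x29, x30, [sp, #-<adjustment>]!
     i.e. a store pair with pre-decrement writeback on SP (a sketch; the
     exact operands depend on MODE and the register numbers passed in).  */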
2602 static rtx
2603 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2604 HOST_WIDE_INT adjustment)
2606 switch (mode)
2608 case DImode:
2609 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2610 GEN_INT (UNITS_PER_WORD));
2611 case DFmode:
2612 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2613 GEN_INT (UNITS_PER_WORD));
2614 default:
2615 gcc_unreachable ();
2619 static rtx
2620 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2621 rtx reg2)
2623 switch (mode)
2625 case DImode:
2626 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2628 case DFmode:
2629 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2631 default:
2632 gcc_unreachable ();
2636 static rtx
2637 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2638 rtx mem2)
2640 switch (mode)
2642 case DImode:
2643 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2645 case DFmode:
2646 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2648 default:
2649 gcc_unreachable ();
2654 static void
2655 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2656 unsigned start, unsigned limit, bool skip_wb)
2658 rtx_insn *insn;
2659 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2660 ? gen_frame_mem : gen_rtx_MEM);
2661 unsigned regno;
2662 unsigned regno2;
2664 for (regno = aarch64_next_callee_save (start, limit);
2665 regno <= limit;
2666 regno = aarch64_next_callee_save (regno + 1, limit))
2668 rtx reg, mem;
2669 HOST_WIDE_INT offset;
2671 if (skip_wb
2672 && (regno == cfun->machine->frame.wb_candidate1
2673 || regno == cfun->machine->frame.wb_candidate2))
2674 continue;
2676 reg = gen_rtx_REG (mode, regno);
2677 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2678 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2679 offset));
2681 regno2 = aarch64_next_callee_save (regno + 1, limit);
2683 if (regno2 <= limit
2684 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2685 == cfun->machine->frame.reg_offset[regno2]))
2688 rtx reg2 = gen_rtx_REG (mode, regno2);
2689 rtx mem2;
2691 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2692 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2693 offset));
2694 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2695 reg2));
2697 /* The first part of a frame-related parallel insn is
2698 always assumed to be relevant to the frame
2699 calculations; subsequent parts are only
2700 frame-related if explicitly marked. */
2701 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2702 regno = regno2;
2704 else
2705 insn = emit_move_insn (mem, reg);
2707 RTX_FRAME_RELATED_P (insn) = 1;
2711 static void
2712 aarch64_restore_callee_saves (machine_mode mode,
2713 HOST_WIDE_INT start_offset, unsigned start,
2714 unsigned limit, bool skip_wb, rtx *cfi_ops)
2716 rtx base_rtx = stack_pointer_rtx;
2717 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2718 ? gen_frame_mem : gen_rtx_MEM);
2719 unsigned regno;
2720 unsigned regno2;
2721 HOST_WIDE_INT offset;
2723 for (regno = aarch64_next_callee_save (start, limit);
2724 regno <= limit;
2725 regno = aarch64_next_callee_save (regno + 1, limit))
2727 rtx reg, mem;
2729 if (skip_wb
2730 && (regno == cfun->machine->frame.wb_candidate1
2731 || regno == cfun->machine->frame.wb_candidate2))
2732 continue;
2734 reg = gen_rtx_REG (mode, regno);
2735 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2736 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2738 regno2 = aarch64_next_callee_save (regno + 1, limit);
2740 if (regno2 <= limit
2741 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2742 == cfun->machine->frame.reg_offset[regno2]))
2744 rtx reg2 = gen_rtx_REG (mode, regno2);
2745 rtx mem2;
2747 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2748 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2749 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2751 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2752 regno = regno2;
2754 else
2755 emit_move_insn (reg, mem);
2756 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2760 /* AArch64 stack frames generated by this compiler look like:
2762 +-------------------------------+
2764 | incoming stack arguments |
2766 +-------------------------------+
2767 | | <-- incoming stack pointer (aligned)
2768 | callee-allocated save area |
2769 | for register varargs |
2771 +-------------------------------+
2772 | local variables | <-- frame_pointer_rtx
2774 +-------------------------------+
2775 | padding0 | \
2776 +-------------------------------+ |
2777 | callee-saved registers | | frame.saved_regs_size
2778 +-------------------------------+ |
2779 | LR' | |
2780 +-------------------------------+ |
2781 | FP' | / <- hard_frame_pointer_rtx (aligned)
2782 +-------------------------------+
2783 | dynamic allocation |
2784 +-------------------------------+
2785 | padding |
2786 +-------------------------------+
2787 | outgoing stack arguments | <-- arg_pointer
2789 +-------------------------------+
2790 | | <-- stack_pointer_rtx (aligned)
2792 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2793 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2794 unchanged. */
2796 /* Generate the prologue instructions for entry into a function.
2797 Establish the stack frame by decreasing the stack pointer with a
2798 properly calculated size and, if necessary, create a frame record
2799 filled with the values of LR and previous frame pointer. The
2800 current FP is also set up if it is in use. */
2802 void
2803 aarch64_expand_prologue (void)
2805 /* sub sp, sp, #<frame_size>
2806 stp {fp, lr}, [sp, #<frame_size> - 16]
2807 add fp, sp, #<frame_size> - hardfp_offset
2808 stp {cs_reg}, [fp, #-16] etc.
2810 sub sp, sp, <final_adjustment_if_any> */
2812 HOST_WIDE_INT frame_size, offset;
2813 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2814 HOST_WIDE_INT hard_fp_offset;
2815 rtx_insn *insn;
2817 aarch64_layout_frame ();
2819 offset = frame_size = cfun->machine->frame.frame_size;
2820 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2821 fp_offset = frame_size - hard_fp_offset;
2823 if (flag_stack_usage_info)
2824 current_function_static_stack_size = frame_size;
2826 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
2828 if (crtl->is_leaf && !cfun->calls_alloca)
2830 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
2831 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
2832 frame_size - STACK_CHECK_PROTECT);
2834 else if (frame_size > 0)
2835 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
2838 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2839 if (offset >= 512)
2841 /* When the frame has a large size, the stack pointer is first
2842 decreased to step over the callee-allocated save area for
2843 register varargs, the local variable area and/or the callee-saved
2844 register area. This allows the pre-index write-back
2845 store pair instructions to be used to set up the stack frame
2846 efficiently. */
2847 offset = hard_fp_offset;
2848 if (offset >= 512)
2849 offset = cfun->machine->frame.saved_regs_size;
2851 frame_size -= (offset + crtl->outgoing_args_size);
2852 fp_offset = 0;
2854 if (frame_size >= 0x1000000)
2856 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2857 emit_move_insn (op0, GEN_INT (-frame_size));
2858 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2860 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2861 gen_rtx_SET (stack_pointer_rtx,
2862 plus_constant (Pmode, stack_pointer_rtx,
2863 -frame_size)));
2864 RTX_FRAME_RELATED_P (insn) = 1;
2866 else if (frame_size > 0)
2868 int hi_ofs = frame_size & 0xfff000;
2869 int lo_ofs = frame_size & 0x000fff;
2871 if (hi_ofs)
2873 insn = emit_insn (gen_add2_insn
2874 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2875 RTX_FRAME_RELATED_P (insn) = 1;
2877 if (lo_ofs)
2879 insn = emit_insn (gen_add2_insn
2880 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2881 RTX_FRAME_RELATED_P (insn) = 1;
2885 else
2886 frame_size = -1;
2888 if (offset > 0)
2890 bool skip_wb = false;
2892 if (frame_pointer_needed)
2894 skip_wb = true;
2896 if (fp_offset)
2898 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2899 GEN_INT (-offset)));
2900 RTX_FRAME_RELATED_P (insn) = 1;
2902 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2903 R30_REGNUM, false);
2905 else
2906 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2908 /* Set up frame pointer to point to the location of the
2909 previous frame pointer on the stack. */
2910 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2911 stack_pointer_rtx,
2912 GEN_INT (fp_offset)));
2913 RTX_FRAME_RELATED_P (insn) = 1;
2914 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2916 else
2918 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2919 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2921 if (fp_offset
2922 || reg1 == FIRST_PSEUDO_REGISTER
2923 || (reg2 == FIRST_PSEUDO_REGISTER
2924 && offset >= 256))
2926 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2927 GEN_INT (-offset)));
2928 RTX_FRAME_RELATED_P (insn) = 1;
2930 else
2932 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2934 skip_wb = true;
2936 if (reg2 == FIRST_PSEUDO_REGISTER)
2937 aarch64_pushwb_single_reg (mode1, reg1, offset);
2938 else
2939 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2943 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2944 skip_wb);
2945 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2946 skip_wb);
2949 /* when offset >= 512,
2950 sub sp, sp, #<outgoing_args_size> */
2951 if (frame_size > -1)
2953 if (crtl->outgoing_args_size > 0)
2955 insn = emit_insn (gen_add2_insn
2956 (stack_pointer_rtx,
2957 GEN_INT (- crtl->outgoing_args_size)));
2958 RTX_FRAME_RELATED_P (insn) = 1;
2963 /* Return TRUE if we can use a simple_return insn.
2965 This function checks whether the callee-saved stack is empty, which
2966 means no restore actions are needed. The pro_and_epilogue pass will use
2967 this to check whether the shrink-wrapping optimization is feasible. */
2969 bool
2970 aarch64_use_return_insn_p (void)
2972 if (!reload_completed)
2973 return false;
2975 if (crtl->profile)
2976 return false;
2978 aarch64_layout_frame ();
2980 return cfun->machine->frame.frame_size == 0;
2983 /* Generate the epilogue instructions for returning from a function. */
2984 void
2985 aarch64_expand_epilogue (bool for_sibcall)
2987 HOST_WIDE_INT frame_size, offset;
2988 HOST_WIDE_INT fp_offset;
2989 HOST_WIDE_INT hard_fp_offset;
2990 rtx_insn *insn;
2991 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2992 bool need_barrier_p = (get_frame_size () != 0
2993 || cfun->machine->frame.saved_varargs_size);
2995 aarch64_layout_frame ();
2997 offset = frame_size = cfun->machine->frame.frame_size;
2998 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2999 fp_offset = frame_size - hard_fp_offset;
3001 /* Store pairs and load pairs have an offset range of only -512 to 504. */
3002 if (offset >= 512)
3004 offset = hard_fp_offset;
3005 if (offset >= 512)
3006 offset = cfun->machine->frame.saved_regs_size;
3008 frame_size -= (offset + crtl->outgoing_args_size);
3009 fp_offset = 0;
3010 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
3012 insn = emit_insn (gen_add2_insn
3013 (stack_pointer_rtx,
3014 GEN_INT (crtl->outgoing_args_size)));
3015 RTX_FRAME_RELATED_P (insn) = 1;
3018 else
3019 frame_size = -1;
3021 /* If there were outgoing arguments or we've done dynamic stack
3022 allocation, then restore the stack pointer from the frame
3023 pointer. This is at most one insn and more efficient than using
3024 GCC's internal mechanism. */
3025 if (frame_pointer_needed
3026 && (crtl->outgoing_args_size || cfun->calls_alloca))
3028 if (cfun->calls_alloca)
3029 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3031 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3032 hard_frame_pointer_rtx,
3033 GEN_INT (0)));
3034 offset = offset - fp_offset;
3037 if (offset > 0)
3039 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3040 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3041 bool skip_wb = true;
3042 rtx cfi_ops = NULL;
3044 if (frame_pointer_needed)
3045 fp_offset = 0;
3046 else if (fp_offset
3047 || reg1 == FIRST_PSEUDO_REGISTER
3048 || (reg2 == FIRST_PSEUDO_REGISTER
3049 && offset >= 256))
3050 skip_wb = false;
3052 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
3053 skip_wb, &cfi_ops);
3054 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
3055 skip_wb, &cfi_ops);
3057 if (need_barrier_p)
3058 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3060 if (skip_wb)
3062 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
3063 rtx rreg1 = gen_rtx_REG (mode1, reg1);
3065 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
3066 if (reg2 == FIRST_PSEUDO_REGISTER)
3068 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
3069 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3070 mem = gen_rtx_MEM (mode1, mem);
3071 insn = emit_move_insn (rreg1, mem);
3073 else
3075 rtx rreg2 = gen_rtx_REG (mode1, reg2);
3077 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
3078 insn = emit_insn (aarch64_gen_loadwb_pair
3079 (mode1, stack_pointer_rtx, rreg1,
3080 rreg2, offset));
3083 else
3085 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
3086 GEN_INT (offset)));
3089 /* Reset the CFA to be SP + FRAME_SIZE. */
3090 rtx new_cfa = stack_pointer_rtx;
3091 if (frame_size > 0)
3092 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
3093 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3094 REG_NOTES (insn) = cfi_ops;
3095 RTX_FRAME_RELATED_P (insn) = 1;
3098 if (frame_size > 0)
3100 if (need_barrier_p)
3101 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3103 if (frame_size >= 0x1000000)
3105 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3106 emit_move_insn (op0, GEN_INT (frame_size));
3107 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
3109 else
3111 int hi_ofs = frame_size & 0xfff000;
3112 int lo_ofs = frame_size & 0x000fff;
3114 if (hi_ofs && lo_ofs)
3116 insn = emit_insn (gen_add2_insn
3117 (stack_pointer_rtx, GEN_INT (hi_ofs)));
3118 RTX_FRAME_RELATED_P (insn) = 1;
3119 frame_size = lo_ofs;
3121 insn = emit_insn (gen_add2_insn
3122 (stack_pointer_rtx, GEN_INT (frame_size)));
3125 /* Reset the CFA to be SP + 0. */
3126 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
3127 RTX_FRAME_RELATED_P (insn) = 1;
3130 /* Stack adjustment for exception handler. */
3131 if (crtl->calls_eh_return)
3133 /* We need to unwind the stack by the offset computed by
3134 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3135 to be SP; letting the CFA move during this adjustment
3136 is just as correct as retaining the CFA from the body
3137 of the function. Therefore, do nothing special. */
3138 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3141 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3142 if (!for_sibcall)
3143 emit_jump_insn (ret_rtx);
3146 /* Return the place to copy the exception unwinding return address to.
3147 This will probably be a stack slot, but could (in theory) be the
3148 return register. */
rtx
3150 aarch64_final_eh_return_addr (void)
3152 HOST_WIDE_INT fp_offset;
3154 aarch64_layout_frame ();
3156 fp_offset = cfun->machine->frame.frame_size
3157 - cfun->machine->frame.hard_fp_offset;
3159 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3160 return gen_rtx_REG (DImode, LR_REGNUM);
3162 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3163 result in a store to save LR introduced by builtin_eh_return () being
3164 incorrectly deleted because the alias is not detected.
3165 So in the calculation of the address to copy the exception unwinding
3166 return address to, we note two cases.
3167 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3168 we return a SP-relative location since all the addresses are SP-relative
3169 in this case. This prevents the store from being optimized away.
3170 If the fp_offset is not 0, then the addresses will be FP-relative and
3171 therefore we return a FP-relative location. */
3173 if (frame_pointer_needed)
3175 if (fp_offset)
3176 return gen_frame_mem (DImode,
3177 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3178 else
3179 return gen_frame_mem (DImode,
3180 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3183 /* If FP is not needed, we calculate the location of LR, which would be
3184 at the top of the saved registers block. */
3186 return gen_frame_mem (DImode,
3187 plus_constant (Pmode,
3188 stack_pointer_rtx,
3189 fp_offset
3190 + cfun->machine->frame.saved_regs_size
3191 - 2 * UNITS_PER_WORD));
3194 /* Possibly output code to build up a constant in a register. For
3195 the benefit of the costs infrastructure, returns the number of
3196 instructions which would be emitted. GENERATE inhibits or
3197 enables code generation. */
3199 static int
3200 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
3202 int insns = 0;
3204 if (aarch64_bitmask_imm (val, DImode))
3206 if (generate)
3207 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
3208 insns = 1;
3210 else
3212 int i;
3213 int ncount = 0;
3214 int zcount = 0;
3215 HOST_WIDE_INT valp = val >> 16;
3216 HOST_WIDE_INT valm;
3217 HOST_WIDE_INT tval;
3219 for (i = 16; i < 64; i += 16)
3221 valm = (valp & 0xffff);
3223 if (valm != 0)
3224 ++ zcount;
3226 if (valm != 0xffff)
3227 ++ ncount;
3229 valp >>= 16;
3232 /* zcount contains the number of additional MOVK instructions
3233 required if the constant is built up with an initial MOVZ instruction,
3234 while ncount is the number of MOVK instructions required if starting
3235 with a MOVN instruction. Choose the sequence that needs the
3236 fewer instructions, preferring MOVZ instructions when both counts
3237 are the same. */
3238 if (ncount < zcount)
3240 if (generate)
3241 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3242 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3243 tval = 0xffff;
3244 insns++;
3246 else
3248 if (generate)
3249 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3250 GEN_INT (val & 0xffff));
3251 tval = 0;
3252 insns++;
3255 val >>= 16;
3257 for (i = 16; i < 64; i += 16)
3259 if ((val & 0xffff) != tval)
3261 if (generate)
3262 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3263 GEN_INT (i),
3264 GEN_INT (val & 0xffff)));
3265 insns++;
3267 val >>= 16;
3270 return insns;
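/* Worked examples (illustrative): 0x1234567800000000 starts with a MOVZ
   (here of the zero low chunk) and needs two MOVKs for the two non-zero
   upper chunks, 3 insns in total, whereas 0xffff1234ffffffff is cheaper
   via the MOVN path: one MOVN giving all-ones plus a single MOVK
   inserting 0x1234 at bit 32, i.e. 2 insns.  */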
3273 static void
3274 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3276 HOST_WIDE_INT mdelta = delta;
3277 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3278 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3280 if (mdelta < 0)
3281 mdelta = -mdelta;
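  /* For moderate deltas the addition below is split into a shifted
     12-bit part and a low 12-bit part.  E.g. (illustrative) for
     DELTA == 10000 this emits, roughly,
	mov	<scratch>, #2
	add	<this>, <this>, <scratch>, lsl #12
	add	<this>, <this>, #1808
     since 10000 == 2 * 4096 + 1808.  */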
3283 if (mdelta >= 4096 * 4096)
3285 (void) aarch64_build_constant (scratchreg, delta, true);
3286 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3288 else if (mdelta > 0)
3290 if (mdelta >= 4096)
3292 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3293 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3294 if (delta < 0)
3295 emit_insn (gen_rtx_SET (this_rtx,
3296 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3297 else
3298 emit_insn (gen_rtx_SET (this_rtx,
3299 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3301 if (mdelta % 4096 != 0)
3303 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3304 emit_insn (gen_rtx_SET (this_rtx,
3305 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3310 /* Output code to add DELTA to the first argument, and then jump
3311 to FUNCTION. Used for C++ multiple inheritance. */
3312 static void
3313 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3314 HOST_WIDE_INT delta,
3315 HOST_WIDE_INT vcall_offset,
3316 tree function)
3318 /* The this pointer is always in x0. Note that this differs from
3319 Arm where the this pointer may be bumped to r1 if r0 is required
3320 to return a pointer to an aggregate. On AArch64 a result value
3321 pointer will be in x8. */
3322 int this_regno = R0_REGNUM;
3323 rtx this_rtx, temp0, temp1, addr, funexp;
3324 rtx_insn *insn;
3326 reload_completed = 1;
3327 emit_note (NOTE_INSN_PROLOGUE_END);
3329 if (vcall_offset == 0)
3330 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3331 else
3333 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3335 this_rtx = gen_rtx_REG (Pmode, this_regno);
3336 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3337 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3339 addr = this_rtx;
3340 if (delta != 0)
3342 if (delta >= -256 && delta < 256)
3343 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3344 plus_constant (Pmode, this_rtx, delta));
3345 else
3346 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3349 if (Pmode == ptr_mode)
3350 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3351 else
3352 aarch64_emit_move (temp0,
3353 gen_rtx_ZERO_EXTEND (Pmode,
3354 gen_rtx_MEM (ptr_mode, addr)));
3356 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3357 addr = plus_constant (Pmode, temp0, vcall_offset);
3358 else
3360 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3361 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3364 if (Pmode == ptr_mode)
3365 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3366 else
3367 aarch64_emit_move (temp1,
3368 gen_rtx_SIGN_EXTEND (Pmode,
3369 gen_rtx_MEM (ptr_mode, addr)));
3371 emit_insn (gen_add2_insn (this_rtx, temp1));
3374 /* Generate a tail call to the target function. */
3375 if (!TREE_USED (function))
3377 assemble_external (function);
3378 TREE_USED (function) = 1;
3380 funexp = XEXP (DECL_RTL (function), 0);
3381 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3382 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3383 SIBLING_CALL_P (insn) = 1;
3385 insn = get_insns ();
3386 shorten_branches (insn);
3387 final_start_function (insn, file, 1);
3388 final (insn, file, 1);
3389 final_end_function ();
3391 /* Stop pretending to be a post-reload pass. */
3392 reload_completed = 0;
3395 static bool
3396 aarch64_tls_referenced_p (rtx x)
3398 if (!TARGET_HAVE_TLS)
3399 return false;
3400 subrtx_iterator::array_type array;
3401 FOR_EACH_SUBRTX (iter, array, x, ALL)
3403 const_rtx x = *iter;
3404 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3405 return true;
3406 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3407 TLS offsets, not real symbol references. */
3408 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3409 iter.skip_subrtxes ();
3411 return false;
3415 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3416 a left shift of 0 or 12 bits. */
3417 bool
3418 aarch64_uimm12_shift (HOST_WIDE_INT val)
3420 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3421 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
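/* E.g. 0xabc and 0xabc000 can be encoded (shift 0 and shift 12
   respectively), while 0xabc00 cannot.  */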
3426 /* Return true if val is an immediate that can be loaded into a
3427 register by a MOVZ instruction. */
3428 static bool
3429 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3431 if (GET_MODE_SIZE (mode) > 4)
3433 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3434 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3435 return 1;
3437 else
3439 /* Ignore sign extension. */
3440 val &= (HOST_WIDE_INT) 0xffffffff;
3442 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3443 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
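/* E.g. 0x12340000 (SImode) and 0xffff000000000000 (DImode) are single
   MOVZ immediates; values with two non-zero 16-bit chunks are not.  */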
3446 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3448 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3450 0x0000000100000001ull,
3451 0x0001000100010001ull,
3452 0x0101010101010101ull,
3453 0x1111111111111111ull,
3454 0x5555555555555555ull,
};
3458 /* Return true if val is a valid bitmask immediate. */
3460 bool
3461 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3463 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3464 int bits;
3466 /* Check for a single sequence of one bits and return quickly if so.
3467 The special cases of all ones and all zeroes return false.
3468 val = (unsigned HOST_WIDE_INT) val_in;
3469 tmp = val + (val & -val);
3471 if (tmp == (tmp & -tmp))
3472 return (val + 1) > 1;
3474 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3475 if (mode == SImode)
3476 val = (val << 32) | (val & 0xffffffff);
3478 /* Invert if the immediate doesn't start with a zero bit - this means we
3479 only need to search for sequences of one bits. */
3480 if (val & 1)
3481 val = ~val;
3483 /* Find the first set bit and set tmp to val with the first sequence of one
3484 bits removed. Return success if there is a single sequence of ones. */
3485 first_one = val & -val;
3486 tmp = val & (val + first_one);
3488 if (tmp == 0)
3489 return true;
3491 /* Find the next set bit and compute the difference in bit position. */
3492 next_one = tmp & -tmp;
3493 bits = clz_hwi (first_one) - clz_hwi (next_one);
3494 mask = val ^ tmp;
3496 /* Check that the bit position difference is a power of 2, and that the first
3497 sequence of one bits fits within 'bits' bits. */
3498 if ((mask >> bits) != 0 || bits != (bits & -bits))
3499 return false;
3501 /* Check the sequence of one bits is repeated 64/bits times. */
3502 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
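/* E.g. 0x00ff00ff00ff00ff (eight ones repeated every 16 bits) is a
   valid bitmask immediate, while 0x00ff00ff00ff00fe is not, since its
   low 16-bit element differs from the others and the value is therefore
   not a replicated pattern.  */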
3506 /* Return true if val is an immediate that can be loaded into a
3507 register in a single instruction. */
3508 bool
3509 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3511 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3512 return 1;
3513 return aarch64_bitmask_imm (val, mode);
3516 static bool
3517 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3519 rtx base, offset;
3521 if (GET_CODE (x) == HIGH)
3522 return true;
3524 split_const (x, &base, &offset);
3525 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3527 if (aarch64_classify_symbol (base, offset)
3528 != SYMBOL_FORCE_TO_MEM)
3529 return true;
3530 else
3531 /* Avoid generating a 64-bit relocation in ILP32; leave it
3532 to aarch64_expand_mov_immediate to handle it properly. */
3533 return mode != ptr_mode;
3536 return aarch64_tls_referenced_p (x);
3539 /* Implement TARGET_CASE_VALUES_THRESHOLD. */
3541 static unsigned int
3542 aarch64_case_values_threshold (void)
3544 /* Use the specified limit for the number of cases before using jump
3545 tables at higher optimization levels. */
3546 if (optimize > 2
3547 && selected_cpu->tune->max_case_values != 0)
3548 return selected_cpu->tune->max_case_values;
3549 else
3550 return default_case_values_threshold ();
3553 /* Return true if register REGNO is a valid index register.
3554 STRICT_P is true if REG_OK_STRICT is in effect. */
3556 bool
3557 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3559 if (!HARD_REGISTER_NUM_P (regno))
3561 if (!strict_p)
3562 return true;
3564 if (!reg_renumber)
3565 return false;
3567 regno = reg_renumber[regno];
3569 return GP_REGNUM_P (regno);
3572 /* Return true if register REGNO is a valid base register.
3573 STRICT_P is true if REG_OK_STRICT is in effect. */
3575 bool
3576 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3578 if (!HARD_REGISTER_NUM_P (regno))
3580 if (!strict_p)
3581 return true;
3583 if (!reg_renumber)
3584 return false;
3586 regno = reg_renumber[regno];
3589 /* The fake registers will be eliminated to either the stack or
3590 hard frame pointer, both of which are usually valid base registers.
3591 Reload deals with the cases where the eliminated form isn't valid. */
3592 return (GP_REGNUM_P (regno)
3593 || regno == SP_REGNUM
3594 || regno == FRAME_POINTER_REGNUM
3595 || regno == ARG_POINTER_REGNUM);
3598 /* Return true if X is a valid base register.
3599 STRICT_P is true if REG_OK_STRICT is in effect. */
3601 static bool
3602 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3604 if (!strict_p && GET_CODE (x) == SUBREG)
3605 x = SUBREG_REG (x);
3607 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3610 /* Return true if address offset is a valid index. If it is, fill in INFO
3611 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3613 static bool
3614 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3615 machine_mode mode, bool strict_p)
3617 enum aarch64_address_type type;
3618 rtx index;
3619 int shift;
3621 /* (reg:P) */
3622 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3623 && GET_MODE (x) == Pmode)
3625 type = ADDRESS_REG_REG;
3626 index = x;
3627 shift = 0;
3629 /* (sign_extend:DI (reg:SI)) */
3630 else if ((GET_CODE (x) == SIGN_EXTEND
3631 || GET_CODE (x) == ZERO_EXTEND)
3632 && GET_MODE (x) == DImode
3633 && GET_MODE (XEXP (x, 0)) == SImode)
3635 type = (GET_CODE (x) == SIGN_EXTEND)
3636 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3637 index = XEXP (x, 0);
3638 shift = 0;
3640 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3641 else if (GET_CODE (x) == MULT
3642 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3643 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3644 && GET_MODE (XEXP (x, 0)) == DImode
3645 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3646 && CONST_INT_P (XEXP (x, 1)))
3648 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3649 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3650 index = XEXP (XEXP (x, 0), 0);
3651 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3653 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3654 else if (GET_CODE (x) == ASHIFT
3655 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3656 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3657 && GET_MODE (XEXP (x, 0)) == DImode
3658 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3659 && CONST_INT_P (XEXP (x, 1)))
3661 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3662 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3663 index = XEXP (XEXP (x, 0), 0);
3664 shift = INTVAL (XEXP (x, 1));
3666 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3667 else if ((GET_CODE (x) == SIGN_EXTRACT
3668 || GET_CODE (x) == ZERO_EXTRACT)
3669 && GET_MODE (x) == DImode
3670 && GET_CODE (XEXP (x, 0)) == MULT
3671 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3672 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3674 type = (GET_CODE (x) == SIGN_EXTRACT)
3675 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3676 index = XEXP (XEXP (x, 0), 0);
3677 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3678 if (INTVAL (XEXP (x, 1)) != 32 + shift
3679 || INTVAL (XEXP (x, 2)) != 0)
3680 shift = -1;
3682 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3683 (const_int 0xffffffff<<shift)) */
3684 else if (GET_CODE (x) == AND
3685 && GET_MODE (x) == DImode
3686 && GET_CODE (XEXP (x, 0)) == MULT
3687 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3688 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3689 && CONST_INT_P (XEXP (x, 1)))
3691 type = ADDRESS_REG_UXTW;
3692 index = XEXP (XEXP (x, 0), 0);
3693 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3694 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3695 shift = -1;
3697 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3698 else if ((GET_CODE (x) == SIGN_EXTRACT
3699 || GET_CODE (x) == ZERO_EXTRACT)
3700 && GET_MODE (x) == DImode
3701 && GET_CODE (XEXP (x, 0)) == ASHIFT
3702 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3703 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3705 type = (GET_CODE (x) == SIGN_EXTRACT)
3706 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3707 index = XEXP (XEXP (x, 0), 0);
3708 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3709 if (INTVAL (XEXP (x, 1)) != 32 + shift
3710 || INTVAL (XEXP (x, 2)) != 0)
3711 shift = -1;
3713 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3714 (const_int 0xffffffff<<shift)) */
3715 else if (GET_CODE (x) == AND
3716 && GET_MODE (x) == DImode
3717 && GET_CODE (XEXP (x, 0)) == ASHIFT
3718 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3719 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3720 && CONST_INT_P (XEXP (x, 1)))
3722 type = ADDRESS_REG_UXTW;
3723 index = XEXP (XEXP (x, 0), 0);
3724 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3725 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3726 shift = -1;
3728 /* (mult:P (reg:P) (const_int scale)) */
3729 else if (GET_CODE (x) == MULT
3730 && GET_MODE (x) == Pmode
3731 && GET_MODE (XEXP (x, 0)) == Pmode
3732 && CONST_INT_P (XEXP (x, 1)))
3734 type = ADDRESS_REG_REG;
3735 index = XEXP (x, 0);
3736 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3738 /* (ashift:P (reg:P) (const_int shift)) */
3739 else if (GET_CODE (x) == ASHIFT
3740 && GET_MODE (x) == Pmode
3741 && GET_MODE (XEXP (x, 0)) == Pmode
3742 && CONST_INT_P (XEXP (x, 1)))
3744 type = ADDRESS_REG_REG;
3745 index = XEXP (x, 0);
3746 shift = INTVAL (XEXP (x, 1));
3748 else
3749 return false;
3751 if (GET_CODE (index) == SUBREG)
3752 index = SUBREG_REG (index);
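  /* The check below accepts, e.g. (illustrative), the index part of
     [x0, w1, sxtw #2] for a 4-byte access: the shift must either be
     zero or match log2 of the access size.  */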
3754 if ((shift == 0 ||
3755 (shift > 0 && shift <= 3
3756 && (1 << shift) == GET_MODE_SIZE (mode)))
3757 && REG_P (index)
3758 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3760 info->type = type;
3761 info->offset = index;
3762 info->shift = shift;
3763 return true;
3766 return false;
3769 bool
3770 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3772 return (offset >= -64 * GET_MODE_SIZE (mode)
3773 && offset < 64 * GET_MODE_SIZE (mode)
3774 && offset % GET_MODE_SIZE (mode) == 0);
3777 static inline bool
3778 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3779 HOST_WIDE_INT offset)
3781 return offset >= -256 && offset < 256;
3784 static inline bool
3785 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3787 return (offset >= 0
3788 && offset < 4096 * GET_MODE_SIZE (mode)
3789 && offset % GET_MODE_SIZE (mode) == 0);
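/* For an 8-byte (DImode) access these three helpers accept, respectively,
   offsets in [-512, 504] that are multiples of 8, offsets in [-256, 255],
   and offsets in [0, 32760] that are multiples of 8.  */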
3792 /* Return true if MODE is one of the modes for which we
3793 support LDP/STP operations. */
3795 static bool
3796 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3798 return mode == SImode || mode == DImode
3799 || mode == SFmode || mode == DFmode
3800 || (aarch64_vector_mode_supported_p (mode)
3801 && GET_MODE_SIZE (mode) == 8);
3804 /* Return true if X is a valid address for machine mode MODE. If it is,
3805 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3806 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3808 static bool
3809 aarch64_classify_address (struct aarch64_address_info *info,
3810 rtx x, machine_mode mode,
3811 RTX_CODE outer_code, bool strict_p)
3813 enum rtx_code code = GET_CODE (x);
3814 rtx op0, op1;
3816 /* On BE, we use load/store pair for all large int mode load/stores. */
3817 bool load_store_pair_p = (outer_code == PARALLEL
3818 || (BYTES_BIG_ENDIAN
3819 && aarch64_vect_struct_mode_p (mode)));
3821 bool allow_reg_index_p =
3822 !load_store_pair_p
3823 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3824 && !aarch64_vect_struct_mode_p (mode);
3826 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3827 REG addressing. */
3828 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3829 && (code != POST_INC && code != REG))
3830 return false;
3832 switch (code)
3834 case REG:
3835 case SUBREG:
3836 info->type = ADDRESS_REG_IMM;
3837 info->base = x;
3838 info->offset = const0_rtx;
3839 return aarch64_base_register_rtx_p (x, strict_p);
3841 case PLUS:
3842 op0 = XEXP (x, 0);
3843 op1 = XEXP (x, 1);
3845 if (! strict_p
3846 && REG_P (op0)
3847 && (op0 == virtual_stack_vars_rtx
3848 || op0 == frame_pointer_rtx
3849 || op0 == arg_pointer_rtx)
3850 && CONST_INT_P (op1))
3852 info->type = ADDRESS_REG_IMM;
3853 info->base = op0;
3854 info->offset = op1;
3856 return true;
3859 if (GET_MODE_SIZE (mode) != 0
3860 && CONST_INT_P (op1)
3861 && aarch64_base_register_rtx_p (op0, strict_p))
3863 HOST_WIDE_INT offset = INTVAL (op1);
3865 info->type = ADDRESS_REG_IMM;
3866 info->base = op0;
3867 info->offset = op1;
3869 /* TImode and TFmode values are allowed in both pairs of X
3870 registers and individual Q registers. The available
3871 address modes are:
3872 X,X: 7-bit signed scaled offset
3873 Q: 9-bit signed offset
3874 We conservatively require an offset representable in either mode. */
3876 if (mode == TImode || mode == TFmode)
3877 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3878 && offset_9bit_signed_unscaled_p (mode, offset));
3880 /* A 7-bit offset check because OImode will emit a ldp/stp
3881 instruction (only big endian will get here).
3882 For ldp/stp instructions, the offset is scaled by the size of a
3883 single element of the pair. */
3884 if (mode == OImode)
3885 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3887 /* Three 9/12-bit offset checks because CImode will emit three
3888 ldr/str instructions (only big endian will get here). */
3889 if (mode == CImode)
3890 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3891 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3892 || offset_12bit_unsigned_scaled_p (V16QImode,
3893 offset + 32)));
3895 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3896 instructions (only big endian will get here). */
3897 if (mode == XImode)
3898 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3899 && aarch64_offset_7bit_signed_scaled_p (TImode,
3900 offset + 32));
3902 if (load_store_pair_p)
3903 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3904 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3905 else
3906 return (offset_9bit_signed_unscaled_p (mode, offset)
3907 || offset_12bit_unsigned_scaled_p (mode, offset));
3910 if (allow_reg_index_p)
3912 /* Look for base + (scaled/extended) index register. */
3913 if (aarch64_base_register_rtx_p (op0, strict_p)
3914 && aarch64_classify_index (info, op1, mode, strict_p))
3916 info->base = op0;
3917 return true;
3919 if (aarch64_base_register_rtx_p (op1, strict_p)
3920 && aarch64_classify_index (info, op0, mode, strict_p))
3922 info->base = op1;
3923 return true;
3927 return false;
3929 case POST_INC:
3930 case POST_DEC:
3931 case PRE_INC:
3932 case PRE_DEC:
3933 info->type = ADDRESS_REG_WB;
3934 info->base = XEXP (x, 0);
3935 info->offset = NULL_RTX;
3936 return aarch64_base_register_rtx_p (info->base, strict_p);
3938 case POST_MODIFY:
3939 case PRE_MODIFY:
3940 info->type = ADDRESS_REG_WB;
3941 info->base = XEXP (x, 0);
3942 if (GET_CODE (XEXP (x, 1)) == PLUS
3943 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3944 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3945 && aarch64_base_register_rtx_p (info->base, strict_p))
3947 HOST_WIDE_INT offset;
3948 info->offset = XEXP (XEXP (x, 1), 1);
3949 offset = INTVAL (info->offset);
3951 /* TImode and TFmode values are allowed in both pairs of X
3952 registers and individual Q registers. The available
3953 address modes are:
3954 X,X: 7-bit signed scaled offset
3955 Q: 9-bit signed offset
3956 We conservatively require an offset representable in either mode. */
3958 if (mode == TImode || mode == TFmode)
3959 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3960 && offset_9bit_signed_unscaled_p (mode, offset));
3962 if (load_store_pair_p)
3963 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3964 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3965 else
3966 return offset_9bit_signed_unscaled_p (mode, offset);
3968 return false;
3970 case CONST:
3971 case SYMBOL_REF:
3972 case LABEL_REF:
3973 /* load literal: pc-relative constant pool entry. Only supported
3974 for SI mode or larger. */
3975 info->type = ADDRESS_SYMBOLIC;
3977 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3979 rtx sym, addend;
3981 split_const (x, &sym, &addend);
3982 return ((GET_CODE (sym) == LABEL_REF
3983 || (GET_CODE (sym) == SYMBOL_REF
3984 && CONSTANT_POOL_ADDRESS_P (sym)
3985 && !aarch64_nopcrelative_literal_loads)));
3987 return false;
3989 case LO_SUM:
3990 info->type = ADDRESS_LO_SUM;
3991 info->base = XEXP (x, 0);
3992 info->offset = XEXP (x, 1);
3993 if (allow_reg_index_p
3994 && aarch64_base_register_rtx_p (info->base, strict_p))
3996 rtx sym, offs;
3997 split_const (info->offset, &sym, &offs);
3998 if (GET_CODE (sym) == SYMBOL_REF
3999 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4001 /* The symbol and offset must be aligned to the access size. */
4002 unsigned int align;
4003 unsigned int ref_size;
4005 if (CONSTANT_POOL_ADDRESS_P (sym))
4006 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4007 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4009 tree exp = SYMBOL_REF_DECL (sym);
4010 align = TYPE_ALIGN (TREE_TYPE (exp));
4011 align = CONSTANT_ALIGNMENT (exp, align);
4013 else if (SYMBOL_REF_DECL (sym))
4014 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4015 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4016 && SYMBOL_REF_BLOCK (sym) != NULL)
4017 align = SYMBOL_REF_BLOCK (sym)->alignment;
4018 else
4019 align = BITS_PER_UNIT;
4021 ref_size = GET_MODE_SIZE (mode);
4022 if (ref_size == 0)
4023 ref_size = GET_MODE_SIZE (DImode);
4025 return ((INTVAL (offs) & (ref_size - 1)) == 0
4026 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4029 return false;
4031 default:
4032 return false;
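/* Illustrative examples of the classification above (values chosen for
   exposition only): a TImode POST_MODIFY address such as
   (post_modify (reg X) (plus (reg X) (const_int 16))) is accepted as
   ADDRESS_REG_WB, since 16 fits both the 7-bit scaled and 9-bit unscaled
   ranges checked above, while (lo_sum (reg X) (symbol_ref S)) is accepted
   as ADDRESS_LO_SUM only when S classifies as SYMBOL_SMALL_ABSOLUTE and
   the symbol's alignment and offset suit the access size.  */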
4036 bool
4037 aarch64_symbolic_address_p (rtx x)
4039 rtx offset;
4041 split_const (x, &x, &offset);
4042 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4045 /* Classify the base of symbolic expression X. */
4047 enum aarch64_symbol_type
4048 aarch64_classify_symbolic_expression (rtx x)
4050 rtx offset;
4052 split_const (x, &x, &offset);
4053 return aarch64_classify_symbol (x, offset);
4057 /* Return TRUE if X is a legitimate address for accessing memory in
4058 mode MODE. */
4059 static bool
4060 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4062 struct aarch64_address_info addr;
4064 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4067 /* Return TRUE if X is a legitimate address for accessing memory in
4068 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4069 pair operation. */
4070 bool
4071 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4072 RTX_CODE outer_code, bool strict_p)
4074 struct aarch64_address_info addr;
4076 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4079 /* Return TRUE if rtx X is immediate constant 0.0 */
4080 bool
4081 aarch64_float_const_zero_rtx_p (rtx x)
4083 if (GET_MODE (x) == VOIDmode)
4084 return false;
4086 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4087 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4088 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4091 /* Return the fixed registers used for condition codes. */
4093 static bool
4094 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4096 *p1 = CC_REGNUM;
4097 *p2 = INVALID_REGNUM;
4098 return true;
4101 /* Emit call insn with PAT and do aarch64-specific handling. */
4103 void
4104 aarch64_emit_call_insn (rtx pat)
4106 rtx insn = emit_call_insn (pat);
4108 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4109 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4110 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4113 machine_mode
4114 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4116 /* Floating-point compares return CCFPmode, except for the ordered
4117 inequalities (LT, LE, GT, GE), which return CCFPEmode. */
4118 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4120 switch (code)
4122 case EQ:
4123 case NE:
4124 case UNORDERED:
4125 case ORDERED:
4126 case UNLT:
4127 case UNLE:
4128 case UNGT:
4129 case UNGE:
4130 case UNEQ:
4131 case LTGT:
4132 return CCFPmode;
4134 case LT:
4135 case LE:
4136 case GT:
4137 case GE:
4138 return CCFPEmode;
4140 default:
4141 gcc_unreachable ();
4145 /* Equality comparisons of short modes against zero can be performed
4146 using the TST instruction with the appropriate bitmask. */
4147 if (y == const0_rtx && REG_P (x)
4148 && (code == EQ || code == NE)
4149 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4150 return CC_NZmode;
4152 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4153 && y == const0_rtx
4154 && (code == EQ || code == NE || code == LT || code == GE)
4155 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4156 || GET_CODE (x) == NEG
4157 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4158 && CONST_INT_P (XEXP (x, 2)))))
4159 return CC_NZmode;
4161 /* A compare with a shifted operand. Because of canonicalization,
4162 the comparison will have to be swapped when we emit the assembly
4163 code. */
4164 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4165 && (REG_P (y) || GET_CODE (y) == SUBREG)
4166 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4167 || GET_CODE (x) == LSHIFTRT
4168 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4169 return CC_SWPmode;
4171 /* Similarly for a negated operand, but we can only do this for
4172 equalities. */
4173 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4174 && (REG_P (y) || GET_CODE (y) == SUBREG)
4175 && (code == EQ || code == NE)
4176 && GET_CODE (x) == NEG)
4177 return CC_Zmode;
4179 /* A compare of a mode narrower than SI mode against zero can be done
4180 by extending the value in the comparison. */
4181 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
4182 && y == const0_rtx)
4183 /* Only use sign-extension if we really need it. */
4184 return ((code == GT || code == GE || code == LE || code == LT)
4185 ? CC_SESWPmode : CC_ZESWPmode);
4187 /* For everything else, return CCmode. */
4188 return CCmode;
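/* Two illustrative selections (for exposition only): comparing a DImode
   shift such as (ashift x 3) against a register yields CC_SWPmode, so the
   comparison is emitted with the operands swapped; an EQ/NE test of a
   QImode or HImode register against zero yields CC_NZmode, so it can be
   implemented as a TST with the appropriate bitmask.  */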
4191 static int
4192 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4195 aarch64_get_condition_code (rtx x)
4197 machine_mode mode = GET_MODE (XEXP (x, 0));
4198 enum rtx_code comp_code = GET_CODE (x);
4200 if (GET_MODE_CLASS (mode) != MODE_CC)
4201 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4202 return aarch64_get_condition_code_1 (mode, comp_code);
4205 static int
4206 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4208 int ne = -1, eq = -1;
4209 switch (mode)
4211 case CCFPmode:
4212 case CCFPEmode:
4213 switch (comp_code)
4215 case GE: return AARCH64_GE;
4216 case GT: return AARCH64_GT;
4217 case LE: return AARCH64_LS;
4218 case LT: return AARCH64_MI;
4219 case NE: return AARCH64_NE;
4220 case EQ: return AARCH64_EQ;
4221 case ORDERED: return AARCH64_VC;
4222 case UNORDERED: return AARCH64_VS;
4223 case UNLT: return AARCH64_LT;
4224 case UNLE: return AARCH64_LE;
4225 case UNGT: return AARCH64_HI;
4226 case UNGE: return AARCH64_PL;
4227 default: return -1;
4229 break;
4231 case CC_DNEmode:
4232 ne = AARCH64_NE;
4233 eq = AARCH64_EQ;
4234 break;
4236 case CC_DEQmode:
4237 ne = AARCH64_EQ;
4238 eq = AARCH64_NE;
4239 break;
4241 case CC_DGEmode:
4242 ne = AARCH64_GE;
4243 eq = AARCH64_LT;
4244 break;
4246 case CC_DLTmode:
4247 ne = AARCH64_LT;
4248 eq = AARCH64_GE;
4249 break;
4251 case CC_DGTmode:
4252 ne = AARCH64_GT;
4253 eq = AARCH64_LE;
4254 break;
4256 case CC_DLEmode:
4257 ne = AARCH64_LE;
4258 eq = AARCH64_GT;
4259 break;
4261 case CC_DGEUmode:
4262 ne = AARCH64_CS;
4263 eq = AARCH64_CC;
4264 break;
4266 case CC_DLTUmode:
4267 ne = AARCH64_CC;
4268 eq = AARCH64_CS;
4269 break;
4271 case CC_DGTUmode:
4272 ne = AARCH64_HI;
4273 eq = AARCH64_LS;
4274 break;
4276 case CC_DLEUmode:
4277 ne = AARCH64_LS;
4278 eq = AARCH64_HI;
4279 break;
4281 case CCmode:
4282 switch (comp_code)
4284 case NE: return AARCH64_NE;
4285 case EQ: return AARCH64_EQ;
4286 case GE: return AARCH64_GE;
4287 case GT: return AARCH64_GT;
4288 case LE: return AARCH64_LE;
4289 case LT: return AARCH64_LT;
4290 case GEU: return AARCH64_CS;
4291 case GTU: return AARCH64_HI;
4292 case LEU: return AARCH64_LS;
4293 case LTU: return AARCH64_CC;
4294 default: return -1;
4296 break;
4298 case CC_SWPmode:
4299 case CC_ZESWPmode:
4300 case CC_SESWPmode:
4301 switch (comp_code)
4303 case NE: return AARCH64_NE;
4304 case EQ: return AARCH64_EQ;
4305 case GE: return AARCH64_LE;
4306 case GT: return AARCH64_LT;
4307 case LE: return AARCH64_GE;
4308 case LT: return AARCH64_GT;
4309 case GEU: return AARCH64_LS;
4310 case GTU: return AARCH64_CC;
4311 case LEU: return AARCH64_CS;
4312 case LTU: return AARCH64_HI;
4313 default: return -1;
4315 break;
4317 case CC_NZmode:
4318 switch (comp_code)
4320 case NE: return AARCH64_NE;
4321 case EQ: return AARCH64_EQ;
4322 case GE: return AARCH64_PL;
4323 case LT: return AARCH64_MI;
4324 default: return -1;
4326 break;
4328 case CC_Zmode:
4329 switch (comp_code)
4331 case NE: return AARCH64_NE;
4332 case EQ: return AARCH64_EQ;
4333 default: return -1;
4335 break;
4337 default:
4338 return -1;
4339 break;
4342 if (comp_code == NE)
4343 return ne;
4345 if (comp_code == EQ)
4346 return eq;
4348 return -1;
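/* As an illustration of the swapped modes above: in CC_SWPmode a GT rtx
   code maps to the AArch64 LT condition and GE maps to LE, undoing the
   operand swap performed when the comparison was generated; CC_NZmode
   only supports NE, EQ, GE (PL) and LT (MI).  */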
4351 bool
4352 aarch64_const_vec_all_same_in_range_p (rtx x,
4353 HOST_WIDE_INT minval,
4354 HOST_WIDE_INT maxval)
4356 HOST_WIDE_INT firstval;
4357 int count, i;
4359 if (GET_CODE (x) != CONST_VECTOR
4360 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4361 return false;
4363 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4364 if (firstval < minval || firstval > maxval)
4365 return false;
4367 count = CONST_VECTOR_NUNITS (x);
4368 for (i = 1; i < count; i++)
4369 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4370 return false;
4372 return true;
4375 bool
4376 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4378 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4382 /* N Z C V. */
4383 #define AARCH64_CC_V 1
4384 #define AARCH64_CC_C (1 << 1)
4385 #define AARCH64_CC_Z (1 << 2)
4386 #define AARCH64_CC_N (1 << 3)
4388 /* N Z C V flags for ccmp. The first entry is for the AND op and the
4389 other is for the IOR op. Indexed by AARCH64_COND_CODE. */
4390 static const int aarch64_nzcv_codes[][2] =
4392 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4393 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4394 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4395 {0, AARCH64_CC_C}, /* CC, C == 0. */
4396 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4397 {0, AARCH64_CC_N}, /* PL, N == 0. */
4398 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4399 {0, AARCH64_CC_V}, /* VC, V == 0. */
4400 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4401 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4402 {0, AARCH64_CC_V}, /* GE, N == V. */
4403 {AARCH64_CC_V, 0}, /* LT, N != V. */
4404 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4405 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4406 {0, 0}, /* AL, Any. */
4407 {0, 0}, /* NV, Any. */
4411 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4413 switch (mode)
4415 case CC_DNEmode:
4416 return NE;
4418 case CC_DEQmode:
4419 return EQ;
4421 case CC_DLEmode:
4422 return LE;
4424 case CC_DGTmode:
4425 return GT;
4427 case CC_DLTmode:
4428 return LT;
4430 case CC_DGEmode:
4431 return GE;
4433 case CC_DLEUmode:
4434 return LEU;
4436 case CC_DGTUmode:
4437 return GTU;
4439 case CC_DLTUmode:
4440 return LTU;
4442 case CC_DGEUmode:
4443 return GEU;
4445 default:
4446 gcc_unreachable ();
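/* Note on aarch64_nzcv_codes above: it is indexed by the AArch64 condition
   code, and its two columns are selected by the 'K' and 'k' operand
   modifiers in aarch64_print_operand below ('K' prints column 0, 'k'
   prints column 1).  For example, for a GE comparison '%K' prints 0 and
   '%k' prints 1 (AARCH64_CC_V).  */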
4451 static void
4452 aarch64_print_operand (FILE *f, rtx x, int code)
4454 switch (code)
4456 /* An integer or symbol address without a preceding # sign. */
4457 case 'c':
4458 switch (GET_CODE (x))
4460 case CONST_INT:
4461 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4462 break;
4464 case SYMBOL_REF:
4465 output_addr_const (f, x);
4466 break;
4468 case CONST:
4469 if (GET_CODE (XEXP (x, 0)) == PLUS
4470 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4472 output_addr_const (f, x);
4473 break;
4475 /* Fall through. */
4477 default:
4478 output_operand_lossage ("Unsupported operand for code '%c'", code);
4480 break;
4482 case 'e':
4483 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4485 int n;
4487 if (!CONST_INT_P (x)
4488 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4490 output_operand_lossage ("invalid operand for '%%%c'", code);
4491 return;
4494 switch (n)
4496 case 3:
4497 fputc ('b', f);
4498 break;
4499 case 4:
4500 fputc ('h', f);
4501 break;
4502 case 5:
4503 fputc ('w', f);
4504 break;
4505 default:
4506 output_operand_lossage ("invalid operand for '%%%c'", code);
4507 return;
4510 break;
4512 case 'p':
4514 int n;
4516 /* Print N such that 2^N == X. */
4517 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4519 output_operand_lossage ("invalid operand for '%%%c'", code);
4520 return;
4523 asm_fprintf (f, "%d", n);
4525 break;
4527 case 'P':
4528 /* Print the number of non-zero bits in X (a const_int). */
4529 if (!CONST_INT_P (x))
4531 output_operand_lossage ("invalid operand for '%%%c'", code);
4532 return;
4535 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4536 break;
4538 case 'H':
4539 /* Print the higher numbered register of a pair (TImode) of regs. */
4540 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4542 output_operand_lossage ("invalid operand for '%%%c'", code);
4543 return;
4546 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4547 break;
4549 case 'm':
4551 int cond_code;
4552 /* Print a condition (eq, ne, etc). */
4554 /* CONST_TRUE_RTX means always -- that's the default. */
4555 if (x == const_true_rtx)
4556 return;
4558 if (!COMPARISON_P (x))
4560 output_operand_lossage ("invalid operand for '%%%c'", code);
4561 return;
4564 cond_code = aarch64_get_condition_code (x);
4565 gcc_assert (cond_code >= 0);
4566 fputs (aarch64_condition_codes[cond_code], f);
4568 break;
4570 case 'M':
4572 int cond_code;
4573 /* Print the inverse of a condition (eq <-> ne, etc). */
4575 /* CONST_TRUE_RTX means never -- that's the default. */
4576 if (x == const_true_rtx)
4578 fputs ("nv", f);
4579 return;
4582 if (!COMPARISON_P (x))
4584 output_operand_lossage ("invalid operand for '%%%c'", code);
4585 return;
4587 cond_code = aarch64_get_condition_code (x);
4588 gcc_assert (cond_code >= 0);
4589 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4590 (cond_code)], f);
4592 break;
4594 case 'b':
4595 case 'h':
4596 case 's':
4597 case 'd':
4598 case 'q':
4599 /* Print a scalar FP/SIMD register name. */
4600 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4602 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4603 return;
4605 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4606 break;
4608 case 'S':
4609 case 'T':
4610 case 'U':
4611 case 'V':
4612 /* Print the first FP/SIMD register name in a list. */
4613 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4615 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4616 return;
4618 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4619 break;
4621 case 'R':
4622 /* Print a scalar FP/SIMD register name + 1. */
4623 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4625 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4626 return;
4628 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4629 break;
4631 case 'X':
4632 /* Print bottom 16 bits of integer constant in hex. */
4633 if (!CONST_INT_P (x))
4635 output_operand_lossage ("invalid operand for '%%%c'", code);
4636 return;
4638 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4639 break;
4641 case 'w':
4642 case 'x':
4643 /* Print a general register name or the zero register (32-bit or
4644 64-bit). */
4645 if (x == const0_rtx
4646 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4648 asm_fprintf (f, "%czr", code);
4649 break;
4652 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4654 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4655 break;
4658 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4660 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4661 break;
4664 /* Fall through */
4666 case 0:
4667 /* Print a normal operand, if it's a general register, then we
4668 assume DImode. */
4669 if (x == NULL)
4671 output_operand_lossage ("missing operand");
4672 return;
4675 switch (GET_CODE (x))
4677 case REG:
4678 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4679 break;
4681 case MEM:
4682 output_address (GET_MODE (x), XEXP (x, 0));
4683 break;
4685 case CONST:
4686 case LABEL_REF:
4687 case SYMBOL_REF:
4688 output_addr_const (asm_out_file, x);
4689 break;
4691 case CONST_INT:
4692 asm_fprintf (f, "%wd", INTVAL (x));
4693 break;
4695 case CONST_VECTOR:
4696 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4698 gcc_assert (
4699 aarch64_const_vec_all_same_in_range_p (x,
4700 HOST_WIDE_INT_MIN,
4701 HOST_WIDE_INT_MAX));
4702 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4704 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4706 fputc ('0', f);
4708 else
4709 gcc_unreachable ();
4710 break;
4712 case CONST_DOUBLE:
4713 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4714 be getting CONST_DOUBLEs holding integers. */
4715 gcc_assert (GET_MODE (x) != VOIDmode);
4716 if (aarch64_float_const_zero_rtx_p (x))
4718 fputc ('0', f);
4719 break;
4721 else if (aarch64_float_const_representable_p (x))
4723 #define buf_size 20
4724 char float_buf[buf_size] = {'\0'};
4725 real_to_decimal_for_mode (float_buf,
4726 CONST_DOUBLE_REAL_VALUE (x),
4727 buf_size, buf_size,
4728 1, GET_MODE (x));
4729 asm_fprintf (asm_out_file, "%s", float_buf);
4730 break;
4731 #undef buf_size
4733 output_operand_lossage ("invalid constant");
4734 return;
4735 default:
4736 output_operand_lossage ("invalid operand");
4737 return;
4739 break;
4741 case 'A':
4742 if (GET_CODE (x) == HIGH)
4743 x = XEXP (x, 0);
4745 switch (aarch64_classify_symbolic_expression (x))
4747 case SYMBOL_SMALL_GOT_4G:
4748 asm_fprintf (asm_out_file, ":got:");
4749 break;
4751 case SYMBOL_SMALL_TLSGD:
4752 asm_fprintf (asm_out_file, ":tlsgd:");
4753 break;
4755 case SYMBOL_SMALL_TLSDESC:
4756 asm_fprintf (asm_out_file, ":tlsdesc:");
4757 break;
4759 case SYMBOL_SMALL_TLSIE:
4760 asm_fprintf (asm_out_file, ":gottprel:");
4761 break;
4763 case SYMBOL_TLSLE24:
4764 asm_fprintf (asm_out_file, ":tprel:");
4765 break;
4767 case SYMBOL_TINY_GOT:
4768 gcc_unreachable ();
4769 break;
4771 default:
4772 break;
4774 output_addr_const (asm_out_file, x);
4775 break;
4777 case 'L':
4778 switch (aarch64_classify_symbolic_expression (x))
4780 case SYMBOL_SMALL_GOT_4G:
4781 asm_fprintf (asm_out_file, ":lo12:");
4782 break;
4784 case SYMBOL_SMALL_TLSGD:
4785 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4786 break;
4788 case SYMBOL_SMALL_TLSDESC:
4789 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4790 break;
4792 case SYMBOL_SMALL_TLSIE:
4793 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4794 break;
4796 case SYMBOL_TLSLE12:
4797 asm_fprintf (asm_out_file, ":tprel_lo12:");
4798 break;
4800 case SYMBOL_TLSLE24:
4801 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4802 break;
4804 case SYMBOL_TINY_GOT:
4805 asm_fprintf (asm_out_file, ":got:");
4806 break;
4808 case SYMBOL_TINY_TLSIE:
4809 asm_fprintf (asm_out_file, ":gottprel:");
4810 break;
4812 default:
4813 break;
4815 output_addr_const (asm_out_file, x);
4816 break;
4818 case 'G':
4820 switch (aarch64_classify_symbolic_expression (x))
4822 case SYMBOL_TLSLE24:
4823 asm_fprintf (asm_out_file, ":tprel_hi12:");
4824 break;
4825 default:
4826 break;
4828 output_addr_const (asm_out_file, x);
4829 break;
4831 case 'K':
4833 int cond_code;
4834 /* Print nzcv. */
4836 if (!COMPARISON_P (x))
4838 output_operand_lossage ("invalid operand for '%%%c'", code);
4839 return;
4842 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4843 gcc_assert (cond_code >= 0);
4844 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4846 break;
4848 case 'k':
4850 int cond_code;
4851 /* Print nzcv. */
4853 if (!COMPARISON_P (x))
4855 output_operand_lossage ("invalid operand for '%%%c'", code);
4856 return;
4859 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4860 gcc_assert (cond_code >= 0);
4861 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4863 break;
4865 default:
4866 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4867 return;
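/* A few illustrative expansions of the modifiers above (operand values
   chosen for exposition only): '%e' prints 'b', 'h' or 'w' for the
   constants 8, 16 and 32 respectively; '%w' prints "w3" for the fourth
   general register and "wzr" for const0_rtx; '%X' prints "0x2345" for the
   constant 0x12345.  */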
4871 static void
4872 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4874 struct aarch64_address_info addr;
4876 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4877 switch (addr.type)
4879 case ADDRESS_REG_IMM:
4880 if (addr.offset == const0_rtx)
4881 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4882 else
4883 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4884 INTVAL (addr.offset));
4885 return;
4887 case ADDRESS_REG_REG:
4888 if (addr.shift == 0)
4889 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4890 reg_names [REGNO (addr.offset)]);
4891 else
4892 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4893 reg_names [REGNO (addr.offset)], addr.shift);
4894 return;
4896 case ADDRESS_REG_UXTW:
4897 if (addr.shift == 0)
4898 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4899 REGNO (addr.offset) - R0_REGNUM);
4900 else
4901 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4902 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4903 return;
4905 case ADDRESS_REG_SXTW:
4906 if (addr.shift == 0)
4907 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4908 REGNO (addr.offset) - R0_REGNUM);
4909 else
4910 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4911 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4912 return;
4914 case ADDRESS_REG_WB:
4915 switch (GET_CODE (x))
4917 case PRE_INC:
4918 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4919 GET_MODE_SIZE (mode));
4920 return;
4921 case POST_INC:
4922 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4923 GET_MODE_SIZE (mode));
4924 return;
4925 case PRE_DEC:
4926 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4927 GET_MODE_SIZE (mode));
4928 return;
4929 case POST_DEC:
4930 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4931 GET_MODE_SIZE (mode));
4932 return;
4933 case PRE_MODIFY:
4934 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4935 INTVAL (addr.offset));
4936 return;
4937 case POST_MODIFY:
4938 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4939 INTVAL (addr.offset));
4940 return;
4941 default:
4942 break;
4944 break;
4946 case ADDRESS_LO_SUM:
4947 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4948 output_addr_const (f, addr.offset);
4949 asm_fprintf (f, "]");
4950 return;
4952 case ADDRESS_SYMBOLIC:
4953 break;
4956 output_addr_const (f, x);
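/* Illustrative output syntax for the cases above: ADDRESS_REG_IMM prints
   e.g. "[x0, 16]", ADDRESS_REG_REG with a shift of 3 prints
   "[x0, x1, lsl 3]", ADDRESS_REG_SXTW with a shift of 2 prints
   "[x0, w1, sxtw 2]", and ADDRESS_LO_SUM prints "[x0, #:lo12:sym]".  */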
4959 bool
4960 aarch64_label_mentioned_p (rtx x)
4962 const char *fmt;
4963 int i;
4965 if (GET_CODE (x) == LABEL_REF)
4966 return true;
4968 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4969 referencing instruction, but they are constant offsets, not
4970 symbols. */
4971 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4972 return false;
4974 fmt = GET_RTX_FORMAT (GET_CODE (x));
4975 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4977 if (fmt[i] == 'E')
4979 int j;
4981 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4982 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4983 return 1;
4985 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4986 return 1;
4989 return 0;
4992 /* Implement REGNO_REG_CLASS. */
4994 enum reg_class
4995 aarch64_regno_regclass (unsigned regno)
4997 if (GP_REGNUM_P (regno))
4998 return GENERAL_REGS;
5000 if (regno == SP_REGNUM)
5001 return STACK_REG;
5003 if (regno == FRAME_POINTER_REGNUM
5004 || regno == ARG_POINTER_REGNUM)
5005 return POINTER_REGS;
5007 if (FP_REGNUM_P (regno))
5008 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5010 return NO_REGS;
5013 static rtx
5014 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5016 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5017 where mask is selected by alignment and size of the offset.
5018 We try to pick as large a range for the offset as possible to
5019 maximize the chance of a CSE. However, for aligned addresses
5020 we limit the range to 4k so that structures with different sized
5021 elements are likely to use the same base. We need to be careful
5022 not to split a CONST for some forms of address expression, otherwise
5023 it will generate sub-optimal code. */
5025 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5027 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
5028 HOST_WIDE_INT base_offset;
5030 if (GET_CODE (XEXP (x, 0)) == PLUS)
5032 rtx op0 = XEXP (XEXP (x, 0), 0);
5033 rtx op1 = XEXP (XEXP (x, 0), 1);
5035 /* Address expressions of the form Ra + Rb + CONST.
5037 If CONST is within the range supported by the addressing
5038 mode "reg+offset", do not split CONST and use the
5039 sequence
5040 Rt = Ra + Rb;
5041 addr = Rt + CONST. */
5042 if (REG_P (op0) && REG_P (op1))
5044 machine_mode addr_mode = GET_MODE (x);
5045 rtx base = gen_reg_rtx (addr_mode);
5046 rtx addr = plus_constant (addr_mode, base, offset);
5048 if (aarch64_legitimate_address_hook_p (mode, addr, false))
5050 emit_insn (gen_adddi3 (base, op0, op1));
5051 return addr;
5054 /* Address expressions of the form Ra + Rb<<SCALE + CONST.
5056 If Reg + Rb<<SCALE is a valid address expression, do not
5057 split CONST and use the sequence
5058 Rc = CONST;
5059 Rt = Ra + Rc;
5060 addr = Rt + Rb<<SCALE.
5062 Here we split CONST out of the memory reference because:
5063 a) We depend on GIMPLE optimizers to pick up common
5064 subexpressions involving the scaling operation.
5065 b) The index Rb is likely a loop iv; it's better to split
5066 the CONST so that computation of the new base Rt is a loop
5067 invariant and can be moved out of the loop. This is more
5068 important when the original base Ra is sfp-related. */
5069 else if (REG_P (op0) || REG_P (op1))
5071 machine_mode addr_mode = GET_MODE (x);
5072 rtx base = gen_reg_rtx (addr_mode);
5074 /* Swap the operands if necessary so that the register is in op0. */
5075 if (REG_P (op1))
5076 std::swap (op0, op1);
5078 rtx addr = gen_rtx_PLUS (addr_mode, op1, base);
5080 if (aarch64_legitimate_address_hook_p (mode, addr, false))
5082 base = force_operand (plus_constant (addr_mode,
5083 op0, offset),
5084 NULL_RTX);
5085 return gen_rtx_PLUS (addr_mode, op1, base);
5090 /* Does it look like we'll need a load/store-pair operation? */
5091 if (GET_MODE_SIZE (mode) > 16
5092 || mode == TImode)
5093 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5094 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5095 /* For offsets that aren't a multiple of the access size, the limit is
5096 -256...255. */
5097 else if (offset & (GET_MODE_SIZE (mode) - 1))
5098 base_offset = (offset + 0x100) & ~0x1ff;
5099 else
5100 base_offset = offset & ~0xfff;
5102 if (base_offset == 0)
5103 return x;
5105 offset -= base_offset;
5106 rtx base_reg = gen_reg_rtx (Pmode);
5107 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
5108 NULL_RTX);
5109 emit_move_insn (base_reg, val);
5110 x = plus_constant (Pmode, base_reg, offset);
5113 return x;
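/* Worked examples of the split above (offsets chosen for exposition
   only): for an SImode access at Ra + 0x205 the offset is misaligned, so
   base_offset = (0x205 + 0x100) & ~0x1ff = 0x200 and the residual 5 stays
   in the address; for an aligned DImode access at Ra + 0x13008,
   base_offset = 0x13000 and the residual is 8, keeping the immediate
   within the 12-bit scaled range.  */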
5116 /* Try a machine-dependent way of reloading an illegitimate address
5117 operand. If we find one, push the reload and return the new rtx. */
5120 aarch64_legitimize_reload_address (rtx *x_p,
5121 machine_mode mode,
5122 int opnum, int type,
5123 int ind_levels ATTRIBUTE_UNUSED)
5125 rtx x = *x_p;
5127 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
5128 if (aarch64_vect_struct_mode_p (mode)
5129 && GET_CODE (x) == PLUS
5130 && REG_P (XEXP (x, 0))
5131 && CONST_INT_P (XEXP (x, 1)))
5133 rtx orig_rtx = x;
5134 x = copy_rtx (x);
5135 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
5136 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5137 opnum, (enum reload_type) type);
5138 return x;
5141 /* We must recognize output that we have already generated ourselves. */
5142 if (GET_CODE (x) == PLUS
5143 && GET_CODE (XEXP (x, 0)) == PLUS
5144 && REG_P (XEXP (XEXP (x, 0), 0))
5145 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5146 && CONST_INT_P (XEXP (x, 1)))
5148 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5149 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5150 opnum, (enum reload_type) type);
5151 return x;
5154 /* We wish to handle large displacements off a base register by splitting
5155 the addend across an add and the mem insn. This can cut the number of
5156 extra insns needed from 3 to 1. It is only useful for a load/store of a
5157 single register with a 12-bit offset field. */
5158 if (GET_CODE (x) == PLUS
5159 && REG_P (XEXP (x, 0))
5160 && CONST_INT_P (XEXP (x, 1))
5161 && HARD_REGISTER_P (XEXP (x, 0))
5162 && mode != TImode
5163 && mode != TFmode
5164 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
5166 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
5167 HOST_WIDE_INT low = val & 0xfff;
5168 HOST_WIDE_INT high = val - low;
5169 HOST_WIDE_INT offs;
5170 rtx cst;
5171 machine_mode xmode = GET_MODE (x);
5173 /* In ILP32, xmode can be either DImode or SImode. */
5174 gcc_assert (xmode == DImode || xmode == SImode);
5176 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
5177 BLKmode alignment. */
5178 if (GET_MODE_SIZE (mode) == 0)
5179 return NULL_RTX;
5181 offs = low % GET_MODE_SIZE (mode);
5183 /* Align misaligned offset by adjusting high part to compensate. */
5184 if (offs != 0)
5186 if (aarch64_uimm12_shift (high + offs))
5188 /* Align down. */
5189 low = low - offs;
5190 high = high + offs;
5192 else
5194 /* Align up. */
5195 offs = GET_MODE_SIZE (mode) - offs;
5196 low = low + offs;
5197 high = high + (low & 0x1000) - offs;
5198 low &= 0xfff;
5202 /* Check for overflow. */
5203 if (high + low != val)
5204 return NULL_RTX;
5206 cst = GEN_INT (high);
5207 if (!aarch64_uimm12_shift (high))
5208 cst = force_const_mem (xmode, cst);
5210 /* Reload high part into base reg, leaving the low part
5211 in the mem instruction.
5212 Note that replacing this gen_rtx_PLUS with plus_constant is
5213 wrong in this case because we rely on the
5214 (plus (plus reg c1) c2) structure being preserved so that
5215 XEXP (*p, 0) in push_reload below uses the correct term. */
5216 x = gen_rtx_PLUS (xmode,
5217 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
5218 GEN_INT (low));
5220 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5221 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
5222 opnum, (enum reload_type) type);
5223 return x;
5226 return NULL_RTX;
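/* A minimal worked example of the high/low split above (values chosen
   for exposition only): for a DFmode access at Ra + 0x3008, low = 0x8 and
   high = 0x3000; high is a valid shifted 12-bit immediate, so the result
   becomes (plus (plus Ra 0x3000) 0x8), with the high part reloaded into a
   base register and the low part left in the mem.  */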
5230 /* Return the reload icode required for a constant pool in mode. */
5231 static enum insn_code
5232 aarch64_constant_pool_reload_icode (machine_mode mode)
5234 switch (mode)
5236 case SFmode:
5237 return CODE_FOR_aarch64_reload_movcpsfdi;
5239 case DFmode:
5240 return CODE_FOR_aarch64_reload_movcpdfdi;
5242 case TFmode:
5243 return CODE_FOR_aarch64_reload_movcptfdi;
5245 case V8QImode:
5246 return CODE_FOR_aarch64_reload_movcpv8qidi;
5248 case V16QImode:
5249 return CODE_FOR_aarch64_reload_movcpv16qidi;
5251 case V4HImode:
5252 return CODE_FOR_aarch64_reload_movcpv4hidi;
5254 case V8HImode:
5255 return CODE_FOR_aarch64_reload_movcpv8hidi;
5257 case V2SImode:
5258 return CODE_FOR_aarch64_reload_movcpv2sidi;
5260 case V4SImode:
5261 return CODE_FOR_aarch64_reload_movcpv4sidi;
5263 case V2DImode:
5264 return CODE_FOR_aarch64_reload_movcpv2didi;
5266 case V2DFmode:
5267 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5269 default:
5270 gcc_unreachable ();
5273 gcc_unreachable ();
5275 static reg_class_t
5276 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5277 reg_class_t rclass,
5278 machine_mode mode,
5279 secondary_reload_info *sri)
5282 /* If we have to disable direct literal pool loads and stores because the
5283 function is too big, then we need a scratch register. */
5284 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5285 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5286 || targetm.vector_mode_supported_p (GET_MODE (x)))
5287 && aarch64_nopcrelative_literal_loads)
5289 sri->icode = aarch64_constant_pool_reload_icode (mode);
5290 return NO_REGS;
5293 /* Without the TARGET_SIMD instructions we cannot move a Q register
5294 to a Q register directly. We need a scratch. */
5295 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5296 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5297 && reg_class_subset_p (rclass, FP_REGS))
5299 if (mode == TFmode)
5300 sri->icode = CODE_FOR_aarch64_reload_movtf;
5301 else if (mode == TImode)
5302 sri->icode = CODE_FOR_aarch64_reload_movti;
5303 return NO_REGS;
5306 /* A TFmode or TImode memory access should be handled via the FP_REGS
5307 class, because AArch64 has richer addressing modes for LDR/STR
5308 instructions than for LDP/STP instructions. */
5309 if (TARGET_FLOAT && rclass == GENERAL_REGS
5310 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5311 return FP_REGS;
5313 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5314 return GENERAL_REGS;
5316 return NO_REGS;
5319 static bool
5320 aarch64_can_eliminate (const int from, const int to)
5322 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5323 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5325 if (frame_pointer_needed)
5327 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5328 return true;
5329 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5330 return false;
5331 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5332 && !cfun->calls_alloca)
5333 return true;
5334 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5335 return true;
5337 return false;
5339 else
5341 /* If we decided that we didn't need a leaf frame pointer but then used
5342 LR in the function, then we'll want a frame pointer after all, so
5343 prevent this elimination to ensure a frame pointer is used. */
5344 if (to == STACK_POINTER_REGNUM
5345 && flag_omit_leaf_frame_pointer
5346 && df_regs_ever_live_p (LR_REGNUM))
5347 return false;
5350 return true;
5353 HOST_WIDE_INT
5354 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5356 aarch64_layout_frame ();
5358 if (to == HARD_FRAME_POINTER_REGNUM)
5360 if (from == ARG_POINTER_REGNUM)
5361 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5363 if (from == FRAME_POINTER_REGNUM)
5364 return (cfun->machine->frame.hard_fp_offset
5365 - cfun->machine->frame.saved_varargs_size);
5368 if (to == STACK_POINTER_REGNUM)
5370 if (from == FRAME_POINTER_REGNUM)
5371 return (cfun->machine->frame.frame_size
5372 - cfun->machine->frame.saved_varargs_size);
5375 return cfun->machine->frame.frame_size;
5378 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5379 previous frame. */
5382 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5384 if (count != 0)
5385 return const0_rtx;
5386 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5390 static void
5391 aarch64_asm_trampoline_template (FILE *f)
5393 if (TARGET_ILP32)
5395 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5396 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5398 else
5400 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5401 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5403 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5404 assemble_aligned_integer (4, const0_rtx);
5405 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5406 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5409 static void
5410 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5412 rtx fnaddr, mem, a_tramp;
5413 const int tramp_code_sz = 16;
5415 /* Don't need to copy the trailing D-words, we fill those in below. */
5416 emit_block_move (m_tramp, assemble_trampoline_template (),
5417 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5418 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5419 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5420 if (GET_MODE (fnaddr) != ptr_mode)
5421 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5422 emit_move_insn (mem, fnaddr);
5424 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5425 emit_move_insn (mem, chain_value);
5427 /* XXX We should really define a "clear_cache" pattern and use
5428 gen_clear_cache(). */
5429 a_tramp = XEXP (m_tramp, 0);
5430 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5431 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5432 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5433 ptr_mode);
5436 static unsigned char
5437 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5439 switch (regclass)
5441 case CALLER_SAVE_REGS:
5442 case POINTER_REGS:
5443 case GENERAL_REGS:
5444 case ALL_REGS:
5445 case FP_REGS:
5446 case FP_LO_REGS:
5447 return
5448 aarch64_vector_mode_p (mode)
5449 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5450 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5451 case STACK_REG:
5452 return 1;
5454 case NO_REGS:
5455 return 0;
5457 default:
5458 break;
5460 gcc_unreachable ();
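/* For instance, assuming UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16 on
   this target, TImode needs 2 general registers while V4SImode needs a
   single vector register.  */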
5463 static reg_class_t
5464 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5466 if (regclass == POINTER_REGS)
5467 return GENERAL_REGS;
5469 if (regclass == STACK_REG)
5471 if (REG_P(x)
5472 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5473 return regclass;
5475 return NO_REGS;
5478 /* If it's an integer immediate that MOVI can't handle, then
5479 FP_REGS is not an option, so we return NO_REGS instead. */
5480 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5481 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5482 return NO_REGS;
5484 /* Register elimination can result in a request for
5485 SP+constant->FP_REGS. We cannot support such operations, which
5486 use SP as source and an FP_REG as destination, so reject them
5487 outright here. */
5488 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5490 rtx lhs = XEXP (x, 0);
5492 /* Look through a possible SUBREG introduced by ILP32. */
5493 if (GET_CODE (lhs) == SUBREG)
5494 lhs = SUBREG_REG (lhs);
5496 gcc_assert (REG_P (lhs));
5497 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5498 POINTER_REGS));
5499 return NO_REGS;
5502 return regclass;
5505 void
5506 aarch64_asm_output_labelref (FILE* f, const char *name)
5508 asm_fprintf (f, "%U%s", name);
5511 static void
5512 aarch64_elf_asm_constructor (rtx symbol, int priority)
5514 if (priority == DEFAULT_INIT_PRIORITY)
5515 default_ctor_section_asm_out_constructor (symbol, priority);
5516 else
5518 section *s;
5519 char buf[18];
5520 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5521 s = get_section (buf, SECTION_WRITE, NULL);
5522 switch_to_section (s);
5523 assemble_align (POINTER_SIZE);
5524 assemble_aligned_integer (POINTER_BYTES, symbol);
5528 static void
5529 aarch64_elf_asm_destructor (rtx symbol, int priority)
5531 if (priority == DEFAULT_INIT_PRIORITY)
5532 default_dtor_section_asm_out_destructor (symbol, priority);
5533 else
5535 section *s;
5536 char buf[18];
5537 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5538 s = get_section (buf, SECTION_WRITE, NULL);
5539 switch_to_section (s);
5540 assemble_align (POINTER_SIZE);
5541 assemble_aligned_integer (POINTER_BYTES, symbol);
5545 const char*
5546 aarch64_output_casesi (rtx *operands)
5548 char buf[100];
5549 char label[100];
5550 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5551 int index;
5552 static const char *const patterns[4][2] =
5555 "ldrb\t%w3, [%0,%w1,uxtw]",
5556 "add\t%3, %4, %w3, sxtb #2"
5559 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5560 "add\t%3, %4, %w3, sxth #2"
5563 "ldr\t%w3, [%0,%w1,uxtw #2]",
5564 "add\t%3, %4, %w3, sxtw #2"
5566 /* We assume that DImode is only generated when not optimizing and
5567 that we don't really need 64-bit address offsets. That would
5568 imply an object file with 8GB of code in a single function! */
5570 "ldr\t%w3, [%0,%w1,uxtw #2]",
5571 "add\t%3, %4, %w3, sxtw #2"
5575 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5577 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5579 gcc_assert (index >= 0 && index <= 3);
5581 /* Need to implement table size reduction, by changing the code below. */
5582 output_asm_insn (patterns[index][0], operands);
5583 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5584 snprintf (buf, sizeof (buf),
5585 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5586 output_asm_insn (buf, operands);
5587 output_asm_insn (patterns[index][1], operands);
5588 output_asm_insn ("br\t%3", operands);
5589 assemble_label (asm_out_file, label);
5590 return "";
5594 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5595 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5596 operator. */
5599 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5601 if (shift >= 0 && shift <= 3)
5603 int size;
5604 for (size = 8; size <= 32; size *= 2)
5606 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5607 if (mask == bits << shift)
5608 return size;
5611 return 0;
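/* For example, aarch64_uxt_size (1, 0x1fe) returns 8 (0xff << 1 is a
   UXTB-style mask shifted by one), aarch64_uxt_size (0, 0xffffffff)
   returns 32, and any shift greater than 3 returns 0.  */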
5614 /* Constant pools are per-function only when PC-relative
5615 literal loads are enabled or we are in the large memory
5616 model. */
5618 static inline bool
5619 aarch64_can_use_per_function_literal_pools_p (void)
5621 return (!aarch64_nopcrelative_literal_loads
5622 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5625 static bool
5626 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5628 /* FIXME: In an ideal world this would work similarly
5629 to the logic in aarch64_select_rtx_section, but this
5630 breaks bootstrap in GCC Go. For now we work around
5631 this by returning false here. */
5632 return false;
5635 /* Select appropriate section for constants depending
5636 on where we place literal pools. */
5638 static section *
5639 aarch64_select_rtx_section (machine_mode mode,
5640 rtx x,
5641 unsigned HOST_WIDE_INT align)
5643 if (aarch64_can_use_per_function_literal_pools_p ())
5644 return function_section (current_function_decl);
5646 return default_elf_select_rtx_section (mode, x, align);
5649 /* Costs. */
5651 /* Helper function for rtx cost calculation. Strip a shift expression
5652 from X. Returns the inner operand if successful, or the original
5653 expression on failure. */
5654 static rtx
5655 aarch64_strip_shift (rtx x)
5657 rtx op = x;
5659 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5660 we can convert both to ROR during final output. */
5661 if ((GET_CODE (op) == ASHIFT
5662 || GET_CODE (op) == ASHIFTRT
5663 || GET_CODE (op) == LSHIFTRT
5664 || GET_CODE (op) == ROTATERT
5665 || GET_CODE (op) == ROTATE)
5666 && CONST_INT_P (XEXP (op, 1)))
5667 return XEXP (op, 0);
5669 if (GET_CODE (op) == MULT
5670 && CONST_INT_P (XEXP (op, 1))
5671 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5672 return XEXP (op, 0);
5674 return x;
5677 /* Helper function for rtx cost calculation. Strip an extend
5678 expression from X. Returns the inner operand if successful, or the
5679 original expression on failure. We deal with a number of possible
5680 canonicalization variations here. */
5681 static rtx
5682 aarch64_strip_extend (rtx x)
5684 rtx op = x;
5686 /* Zero and sign extraction of a widened value. */
5687 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5688 && XEXP (op, 2) == const0_rtx
5689 && GET_CODE (XEXP (op, 0)) == MULT
5690 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5691 XEXP (op, 1)))
5692 return XEXP (XEXP (op, 0), 0);
5694 /* It can also be represented (for zero-extend) as an AND with an
5695 immediate. */
5696 if (GET_CODE (op) == AND
5697 && GET_CODE (XEXP (op, 0)) == MULT
5698 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5699 && CONST_INT_P (XEXP (op, 1))
5700 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5701 INTVAL (XEXP (op, 1))) != 0)
5702 return XEXP (XEXP (op, 0), 0);
5704 /* Now handle extended register, as this may also have an optional
5705 left shift by 1..4. */
5706 if (GET_CODE (op) == ASHIFT
5707 && CONST_INT_P (XEXP (op, 1))
5708 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5709 op = XEXP (op, 0);
5711 if (GET_CODE (op) == ZERO_EXTEND
5712 || GET_CODE (op) == SIGN_EXTEND)
5713 op = XEXP (op, 0);
5715 if (op != x)
5716 return op;
5718 return x;
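/* Illustrative strips performed above: (zero_extend:DI (reg:SI r))
   becomes (reg:SI r); (ashift:DI (sign_extend:DI (reg:SI r)) (const_int 2))
   also becomes (reg:SI r), since the shift amount is at most 4; and the
   AND form (and:DI (mult:DI (reg) (const_int 4)) (const_int 0x3fc)) strips
   to the multiplied register because aarch64_uxt_size (2, 0x3fc) is 8.  */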
5721 /* Return true iff CODE is a shift supported in combination
5722 with arithmetic instructions. */
5724 static bool
5725 aarch64_shift_p (enum rtx_code code)
5727 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5730 /* Helper function for rtx cost calculation. Calculate the cost of
5731 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5732 Return the calculated cost of the expression, recursing manually in to
5733 operands where needed. */
5735 static int
5736 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5738 rtx op0, op1;
5739 const struct cpu_cost_table *extra_cost
5740 = aarch64_tune_params.insn_extra_cost;
5741 int cost = 0;
5742 bool compound_p = (outer == PLUS || outer == MINUS);
5743 machine_mode mode = GET_MODE (x);
5745 gcc_checking_assert (code == MULT);
5747 op0 = XEXP (x, 0);
5748 op1 = XEXP (x, 1);
5750 if (VECTOR_MODE_P (mode))
5751 mode = GET_MODE_INNER (mode);
5753 /* Integer multiply/fma. */
5754 if (GET_MODE_CLASS (mode) == MODE_INT)
5756 /* The multiply will be canonicalized as a shift, cost it as such. */
5757 if (aarch64_shift_p (GET_CODE (x))
5758 || (CONST_INT_P (op1)
5759 && exact_log2 (INTVAL (op1)) > 0))
5761 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5762 || GET_CODE (op0) == SIGN_EXTEND;
5763 if (speed)
5765 if (compound_p)
5767 if (REG_P (op1))
5768 /* ARITH + shift-by-register. */
5769 cost += extra_cost->alu.arith_shift_reg;
5770 else if (is_extend)
5771 /* ARITH + extended register. We don't have a cost field
5772 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5773 cost += extra_cost->alu.extend_arith;
5774 else
5775 /* ARITH + shift-by-immediate. */
5776 cost += extra_cost->alu.arith_shift;
5778 else
5779 /* LSL (immediate). */
5780 cost += extra_cost->alu.shift;
5783 /* Strip extends as we will have costed them in the case above. */
5784 if (is_extend)
5785 op0 = aarch64_strip_extend (op0);
5787 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5789 return cost;
5792 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5793 compound and let the below cases handle it. After all, MNEG is a
5794 special-case alias of MSUB. */
5795 if (GET_CODE (op0) == NEG)
5797 op0 = XEXP (op0, 0);
5798 compound_p = true;
5801 /* Integer multiplies or FMAs have zero/sign extending variants. */
5802 if ((GET_CODE (op0) == ZERO_EXTEND
5803 && GET_CODE (op1) == ZERO_EXTEND)
5804 || (GET_CODE (op0) == SIGN_EXTEND
5805 && GET_CODE (op1) == SIGN_EXTEND))
5807 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5808 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5810 if (speed)
5812 if (compound_p)
5813 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5814 cost += extra_cost->mult[0].extend_add;
5815 else
5816 /* MUL/SMULL/UMULL. */
5817 cost += extra_cost->mult[0].extend;
5820 return cost;
5823 /* This is either an integer multiply or a MADD. In both cases
5824 we want to recurse and cost the operands. */
5825 cost += rtx_cost (op0, mode, MULT, 0, speed);
5826 cost += rtx_cost (op1, mode, MULT, 1, speed);
5828 if (speed)
5830 if (compound_p)
5831 /* MADD/MSUB. */
5832 cost += extra_cost->mult[mode == DImode].add;
5833 else
5834 /* MUL. */
5835 cost += extra_cost->mult[mode == DImode].simple;
5838 return cost;
5840 else
5842 if (speed)
5844 /* Floating-point FMA/FMUL can also support negations of the
5845 operands, unless the rounding mode is upward or downward, in
5846 which case FNMUL is different from FMUL with operand negation. */
5847 bool neg0 = GET_CODE (op0) == NEG;
5848 bool neg1 = GET_CODE (op1) == NEG;
5849 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5851 if (neg0)
5852 op0 = XEXP (op0, 0);
5853 if (neg1)
5854 op1 = XEXP (op1, 0);
5857 if (compound_p)
5858 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5859 cost += extra_cost->fp[mode == DFmode].fma;
5860 else
5861 /* FMUL/FNMUL. */
5862 cost += extra_cost->fp[mode == DFmode].mult;
5865 cost += rtx_cost (op0, mode, MULT, 0, speed);
5866 cost += rtx_cost (op1, mode, MULT, 1, speed);
5867 return cost;
5871 static int
5872 aarch64_address_cost (rtx x,
5873 machine_mode mode,
5874 addr_space_t as ATTRIBUTE_UNUSED,
5875 bool speed)
5877 enum rtx_code c = GET_CODE (x);
5878 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5879 struct aarch64_address_info info;
5880 int cost = 0;
5881 info.shift = 0;
5883 if (!aarch64_classify_address (&info, x, mode, c, false))
5885 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5887 /* This is a CONST or SYMBOL ref which will be split
5888 in a different way depending on the code model in use.
5889 Cost it through the generic infrastructure. */
5890 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5891 /* Divide through by the cost of one instruction to
5892 bring it to the same units as the address costs. */
5893 cost_symbol_ref /= COSTS_N_INSNS (1);
5894 /* The cost is then the cost of preparing the address,
5895 followed by an immediate (possibly 0) offset. */
5896 return cost_symbol_ref + addr_cost->imm_offset;
5898 else
5900 /* This is most likely a jump table from a case
5901 statement. */
5902 return addr_cost->register_offset;
5906 switch (info.type)
5908 case ADDRESS_LO_SUM:
5909 case ADDRESS_SYMBOLIC:
5910 case ADDRESS_REG_IMM:
5911 cost += addr_cost->imm_offset;
5912 break;
5914 case ADDRESS_REG_WB:
5915 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5916 cost += addr_cost->pre_modify;
5917 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5918 cost += addr_cost->post_modify;
5919 else
5920 gcc_unreachable ();
5922 break;
5924 case ADDRESS_REG_REG:
5925 cost += addr_cost->register_offset;
5926 break;
5928 case ADDRESS_REG_SXTW:
5929 cost += addr_cost->register_sextend;
5930 break;
5932 case ADDRESS_REG_UXTW:
5933 cost += addr_cost->register_zextend;
5934 break;
5936 default:
5937 gcc_unreachable ();
5941 if (info.shift > 0)
5943 /* For the sake of calculating the cost of the shifted register
5944 component, we can treat same sized modes in the same way. */
5945 switch (GET_MODE_BITSIZE (mode))
5947 case 16:
5948 cost += addr_cost->addr_scale_costs.hi;
5949 break;
5951 case 32:
5952 cost += addr_cost->addr_scale_costs.si;
5953 break;
5955 case 64:
5956 cost += addr_cost->addr_scale_costs.di;
5957 break;
5959 /* We can't tell, or this is a 128-bit vector. */
5960 default:
5961 cost += addr_cost->addr_scale_costs.ti;
5962 break;
5966 return cost;
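/* As a rough example of the costing above: a DImode access through
   (plus (reg) (mult (reg) (const_int 8))) classifies as ADDRESS_REG_REG
   with a shift of 3, so it is charged register_offset plus
   addr_scale_costs.di from the tuning's address-cost table.  */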
5969 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5970 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5971 to be taken. */
5974 aarch64_branch_cost (bool speed_p, bool predictable_p)
5976 /* When optimizing for speed, use the cost of unpredictable branches. */
5977 const struct cpu_branch_cost *branch_costs =
5978 aarch64_tune_params.branch_costs;
5980 if (!speed_p || predictable_p)
5981 return branch_costs->predictable;
5982 else
5983 return branch_costs->unpredictable;
5986 /* Return true if the RTX X in mode MODE is a zero or sign extract
5987 usable in an ADD or SUB (extended register) instruction. */
5988 static bool
5989 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5991 /* Catch add with a sign extract.
5992 This is add_<optab><mode>_multp2. */
5993 if (GET_CODE (x) == SIGN_EXTRACT
5994 || GET_CODE (x) == ZERO_EXTRACT)
5996 rtx op0 = XEXP (x, 0);
5997 rtx op1 = XEXP (x, 1);
5998 rtx op2 = XEXP (x, 2);
6000 if (GET_CODE (op0) == MULT
6001 && CONST_INT_P (op1)
6002 && op2 == const0_rtx
6003 && CONST_INT_P (XEXP (op0, 1))
6004 && aarch64_is_extend_from_extract (mode,
6005 XEXP (op0, 1),
6006 op1))
6008 return true;
6011 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6012 No shift. */
6013 else if (GET_CODE (x) == SIGN_EXTEND
6014 || GET_CODE (x) == ZERO_EXTEND)
6015 return REG_P (XEXP (x, 0));
6017 return false;
6020 static bool
6021 aarch64_frint_unspec_p (unsigned int u)
6023 switch (u)
6025 case UNSPEC_FRINTZ:
6026 case UNSPEC_FRINTP:
6027 case UNSPEC_FRINTM:
6028 case UNSPEC_FRINTA:
6029 case UNSPEC_FRINTN:
6030 case UNSPEC_FRINTX:
6031 case UNSPEC_FRINTI:
6032 return true;
6034 default:
6035 return false;
6039 /* Return true iff X is an rtx that will match an extr instruction
6040 i.e. as described in the *extr<mode>5_insn family of patterns.
6041 OP0 and OP1 will be set to the operands of the shifts involved
6042 on success and will be NULL_RTX otherwise. */
6044 static bool
6045 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6047 rtx op0, op1;
6048 machine_mode mode = GET_MODE (x);
6050 *res_op0 = NULL_RTX;
6051 *res_op1 = NULL_RTX;
6053 if (GET_CODE (x) != IOR)
6054 return false;
6056 op0 = XEXP (x, 0);
6057 op1 = XEXP (x, 1);
6059 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6060 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6062 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6063 if (GET_CODE (op1) == ASHIFT)
6064 std::swap (op0, op1);
6066 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6067 return false;
6069 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6070 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6072 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6073 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6075 *res_op0 = XEXP (op0, 0);
6076 *res_op1 = XEXP (op1, 0);
6077 return true;
6081 return false;
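/* For instance, in SImode (ior (ashift X (const_int 10))
   (lshiftrt Y (const_int 22))) matches, because 10 + 22 equals the mode
   bitsize; *res_op0 is set to X and *res_op1 to Y.  */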
6084 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6085 storing it in *COST. Result is true if the total cost of the operation
6086 has now been calculated. */
6087 static bool
6088 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6090 rtx inner;
6091 rtx comparator;
6092 enum rtx_code cmpcode;
6094 if (COMPARISON_P (op0))
6096 inner = XEXP (op0, 0);
6097 comparator = XEXP (op0, 1);
6098 cmpcode = GET_CODE (op0);
6100 else
6102 inner = op0;
6103 comparator = const0_rtx;
6104 cmpcode = NE;
6107 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6109 /* Conditional branch. */
6110 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6111 return true;
6112 else
6114 if (cmpcode == NE || cmpcode == EQ)
6116 if (comparator == const0_rtx)
6118 /* TBZ/TBNZ/CBZ/CBNZ. */
6119 if (GET_CODE (inner) == ZERO_EXTRACT)
6120 /* TBZ/TBNZ. */
6121 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6122 ZERO_EXTRACT, 0, speed);
6123 else
6124 /* CBZ/CBNZ. */
6125 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6127 return true;
6130 else if (cmpcode == LT || cmpcode == GE)
6132 /* TBZ/TBNZ. */
6133 if (comparator == const0_rtx)
6134 return true;
6138 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6140 /* It's a conditional operation based on the status flags,
6141 so it must be some flavor of CSEL. */
6143 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6144 if (GET_CODE (op1) == NEG
6145 || GET_CODE (op1) == NOT
6146 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6147 op1 = XEXP (op1, 0);
6148 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6150 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6151 op1 = XEXP (op1, 0);
6152 op2 = XEXP (op2, 0);
6155 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6156 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6157 return true;
6160 /* We don't know what this is, cost all operands. */
6161 return false;
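/* For example, (if_then_else (ne cc 0) (zero_extend:DI (reg:SI a))
   (zero_extend:DI (reg:SI b))) is costed as a single CSEL: both
   zero_extends are stripped above so only the inner SImode registers are
   costed, matching the *cmovdi_insn_uxtw pattern.  */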
6164 /* Check whether X is a bitfield operation of the form shift + extend that
6165 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6166 operand to which the bitfield operation is applied. Otherwise return
6167 NULL_RTX. */
6169 static rtx
6170 aarch64_extend_bitfield_pattern_p (rtx x)
6172 rtx_code outer_code = GET_CODE (x);
6173 machine_mode outer_mode = GET_MODE (x);
6175 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6176 && outer_mode != SImode && outer_mode != DImode)
6177 return NULL_RTX;
6179 rtx inner = XEXP (x, 0);
6180 rtx_code inner_code = GET_CODE (inner);
6181 machine_mode inner_mode = GET_MODE (inner);
6182 rtx op = NULL_RTX;
6184 switch (inner_code)
6186 case ASHIFT:
6187 if (CONST_INT_P (XEXP (inner, 1))
6188 && (inner_mode == QImode || inner_mode == HImode))
6189 op = XEXP (inner, 0);
6190 break;
6191 case LSHIFTRT:
6192 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6193 && (inner_mode == QImode || inner_mode == HImode))
6194 op = XEXP (inner, 0);
6195 break;
6196 case ASHIFTRT:
6197 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6198 && (inner_mode == QImode || inner_mode == HImode))
6199 op = XEXP (inner, 0);
6200 break;
6201 default:
6202 break;
6205 return op;
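/* For instance, (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3)))
   returns the inner register (a UBFX-style pattern), and
   (sign_extend:DI (ashift:QI (reg:QI r) (const_int 2))) returns it for an
   SBFIZ-style pattern.  */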
6208 /* Calculate the cost of calculating X, storing it in *COST. Result
6209 is true if the total cost of the operation has now been calculated. */
6210 static bool
6211 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6212 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6214 rtx op0, op1, op2;
6215 const struct cpu_cost_table *extra_cost
6216 = aarch64_tune_params.insn_extra_cost;
6217 int code = GET_CODE (x);
6219 /* By default, assume that everything has equivalent cost to the
6220 cheapest instruction. Any additional costs are applied as a delta
6221 above this default. */
6222 *cost = COSTS_N_INSNS (1);
6224 switch (code)
6226 case SET:
6227 /* The cost depends entirely on the operands to SET. */
6228 *cost = 0;
6229 op0 = SET_DEST (x);
6230 op1 = SET_SRC (x);
6232 switch (GET_CODE (op0))
6234 case MEM:
6235 if (speed)
6237 rtx address = XEXP (op0, 0);
6238 if (VECTOR_MODE_P (mode))
6239 *cost += extra_cost->ldst.storev;
6240 else if (GET_MODE_CLASS (mode) == MODE_INT)
6241 *cost += extra_cost->ldst.store;
6242 else if (mode == SFmode)
6243 *cost += extra_cost->ldst.storef;
6244 else if (mode == DFmode)
6245 *cost += extra_cost->ldst.stored;
6247 *cost +=
6248 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6249 0, speed));
6252 *cost += rtx_cost (op1, mode, SET, 1, speed);
6253 return true;
6255 case SUBREG:
6256 if (! REG_P (SUBREG_REG (op0)))
6257 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6259 /* Fall through. */
6260 case REG:
6261 /* The cost is one per vector-register copied. */
6262 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6264 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6265 / GET_MODE_SIZE (V4SImode);
6266 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6268 /* const0_rtx is in general free, but we will use an
6269 instruction to set a register to 0. */
6270 else if (REG_P (op1) || op1 == const0_rtx)
6272 /* The cost is 1 per register copied. */
6273 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6274 / UNITS_PER_WORD;
6275 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6277 else
6278 /* Cost is just the cost of the RHS of the set. */
6279 *cost += rtx_cost (op1, mode, SET, 1, speed);
6280 return true;
6282 case ZERO_EXTRACT:
6283 case SIGN_EXTRACT:
6284 /* Bit-field insertion. Strip any redundant widening of
6285 the RHS to meet the width of the target. */
6286 if (GET_CODE (op1) == SUBREG)
6287 op1 = SUBREG_REG (op1);
6288 if ((GET_CODE (op1) == ZERO_EXTEND
6289 || GET_CODE (op1) == SIGN_EXTEND)
6290 && CONST_INT_P (XEXP (op0, 1))
6291 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6292 >= INTVAL (XEXP (op0, 1))))
6293 op1 = XEXP (op1, 0);
6295 if (CONST_INT_P (op1))
6297 /* MOV immediate is assumed to always be cheap. */
6298 *cost = COSTS_N_INSNS (1);
6300 else
6302 /* BFM. */
6303 if (speed)
6304 *cost += extra_cost->alu.bfi;
6305 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6308 return true;
6310 default:
6311 /* We can't make sense of this, assume default cost. */
6312 *cost = COSTS_N_INSNS (1);
6313 return false;
6315 return false;
6317 case CONST_INT:
6318 /* If an instruction can incorporate a constant within the
6319 instruction, the instruction's expression avoids calling
6320 rtx_cost() on the constant. If rtx_cost() is called on a
6321 constant, then it is usually because the constant must be
6322 moved into a register by one or more instructions.
6324 The exception is constant 0, which can be expressed
6325 as XZR/WZR and is therefore free. The exception to this is
6326 if we have (set (reg) (const0_rtx)) in which case we must cost
6327 the move. However, we can catch that when we cost the SET, so
6328 we don't need to consider that here. */
6329 if (x == const0_rtx)
6330 *cost = 0;
6331 else
6333 /* To an approximation, building any other constant is
6334 proportionally expensive to the number of instructions
6335 required to build that constant. This is true whether we
6336 are compiling for SPEED or otherwise. */
6337 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6338 (NULL_RTX, x, false, mode));
6340 return true;
6342 case CONST_DOUBLE:
6343 if (speed)
6345 /* mov[df,sf]_aarch64. */
6346 if (aarch64_float_const_representable_p (x))
6347 /* FMOV (scalar immediate). */
6348 *cost += extra_cost->fp[mode == DFmode].fpconst;
6349 else if (!aarch64_float_const_zero_rtx_p (x))
6351 /* This will be a load from memory. */
6352 if (mode == DFmode)
6353 *cost += extra_cost->ldst.loadd;
6354 else
6355 *cost += extra_cost->ldst.loadf;
6357 else
6358 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6359 	   or MOV v0.s[0], wzr - neither of which is modeled by the
6360 cost tables. Just use the default cost. */
6365 return true;
6367 case MEM:
6368 if (speed)
6370 /* For loads we want the base cost of a load, plus an
6371 approximation for the additional cost of the addressing
6372 mode. */
6373 rtx address = XEXP (x, 0);
6374 if (VECTOR_MODE_P (mode))
6375 *cost += extra_cost->ldst.loadv;
6376 else if (GET_MODE_CLASS (mode) == MODE_INT)
6377 *cost += extra_cost->ldst.load;
6378 else if (mode == SFmode)
6379 *cost += extra_cost->ldst.loadf;
6380 else if (mode == DFmode)
6381 *cost += extra_cost->ldst.loadd;
6383 *cost +=
6384 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6385 0, speed));
6388 return true;
6390 case NEG:
6391 op0 = XEXP (x, 0);
6393 if (VECTOR_MODE_P (mode))
6395 if (speed)
6397 /* FNEG. */
6398 *cost += extra_cost->vect.alu;
6400 return false;
6403 if (GET_MODE_CLASS (mode) == MODE_INT)
6405 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6406 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6408 /* CSETM. */
6409 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6410 return true;
6413 /* Cost this as SUB wzr, X. */
6414 op0 = CONST0_RTX (mode);
6415 op1 = XEXP (x, 0);
6416 goto cost_minus;
6419 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6421 /* Support (neg(fma...)) as a single instruction only if
6422 sign of zeros is unimportant. This matches the decision
6423 making in aarch64.md. */
6424 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6426 /* FNMADD. */
6427 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6428 return true;
6430 if (GET_CODE (op0) == MULT)
6432 /* FNMUL. */
6433 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6434 return true;
6436 if (speed)
6437 /* FNEG. */
6438 *cost += extra_cost->fp[mode == DFmode].neg;
6439 return false;
6442 return false;
6444 case CLRSB:
6445 case CLZ:
6446 if (speed)
6448 if (VECTOR_MODE_P (mode))
6449 *cost += extra_cost->vect.alu;
6450 else
6451 *cost += extra_cost->alu.clz;
6454 return false;
6456 case COMPARE:
6457 op0 = XEXP (x, 0);
6458 op1 = XEXP (x, 1);
6460 if (op1 == const0_rtx
6461 && GET_CODE (op0) == AND)
6463 x = op0;
6464 mode = GET_MODE (op0);
6465 goto cost_logic;
6468 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6470 /* TODO: A write to the CC flags possibly costs extra, this
6471 needs encoding in the cost tables. */
6473 /* CC_ZESWPmode supports zero extend for free. */
6474 if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6475 op0 = XEXP (op0, 0);
6477 mode = GET_MODE (op0);
6478 /* ANDS. */
6479 if (GET_CODE (op0) == AND)
6481 x = op0;
6482 goto cost_logic;
6485 if (GET_CODE (op0) == PLUS)
6487 /* ADDS (and CMN alias). */
6488 x = op0;
6489 goto cost_plus;
6492 if (GET_CODE (op0) == MINUS)
6494 /* SUBS. */
6495 x = op0;
6496 goto cost_minus;
6499 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6500 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6501 && CONST_INT_P (XEXP (op0, 2)))
6503 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6504 Handle it here directly rather than going to cost_logic
6505 since we know the immediate generated for the TST is valid
6506 so we can avoid creating an intermediate rtx for it only
6507 for costing purposes. */
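	  /* Roughly, something like
	       (compare (zero_extract:DI (reg) (const_int 8) (const_int 4))
			(const_int 0))
	     corresponds to a TST with a contiguous bitmask immediate
	     (an illustrative example only).  */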
6508 if (speed)
6509 *cost += extra_cost->alu.logical;
6511 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6512 ZERO_EXTRACT, 0, speed);
6513 return true;
6516 if (GET_CODE (op1) == NEG)
6518 /* CMN. */
6519 if (speed)
6520 *cost += extra_cost->alu.arith;
6522 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6523 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6524 return true;
6527 /* CMP.
6529 Compare can freely swap the order of operands, and
6530 canonicalization puts the more complex operation first.
6531 But the integer MINUS logic expects the shift/extend
6532 operation in op1. */
6533 if (! (REG_P (op0)
6534 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6536 op0 = XEXP (x, 1);
6537 op1 = XEXP (x, 0);
6539 goto cost_minus;
6542 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6544 /* FCMP. */
6545 if (speed)
6546 *cost += extra_cost->fp[mode == DFmode].compare;
6548 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6550 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6551 /* FCMP supports constant 0.0 for no extra cost. */
6552 return true;
6554 return false;
6557 if (VECTOR_MODE_P (mode))
6559 /* Vector compare. */
6560 if (speed)
6561 *cost += extra_cost->vect.alu;
6563 if (aarch64_float_const_zero_rtx_p (op1))
6565 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6566 cost. */
6567 return true;
6569 return false;
6571 return false;
6573 case MINUS:
6575 op0 = XEXP (x, 0);
6576 op1 = XEXP (x, 1);
6578 cost_minus:
6579 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6581 /* Detect valid immediates. */
6582 if ((GET_MODE_CLASS (mode) == MODE_INT
6583 || (GET_MODE_CLASS (mode) == MODE_CC
6584 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6585 && CONST_INT_P (op1)
6586 && aarch64_uimm12_shift (INTVAL (op1)))
6588 if (speed)
6589 /* SUB(S) (immediate). */
6590 *cost += extra_cost->alu.arith;
6591 return true;
6594 /* Look for SUB (extended register). */
6595 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6597 if (speed)
6598 *cost += extra_cost->alu.extend_arith;
6600 op1 = aarch64_strip_extend (op1);
6601 *cost += rtx_cost (op1, VOIDmode,
6602 (enum rtx_code) GET_CODE (op1), 0, speed);
6603 return true;
6606 rtx new_op1 = aarch64_strip_extend (op1);
6608 /* Cost this as an FMA-alike operation. */
6609 if ((GET_CODE (new_op1) == MULT
6610 || aarch64_shift_p (GET_CODE (new_op1)))
6611 && code != COMPARE)
6613 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6614 (enum rtx_code) code,
6615 speed);
6616 return true;
6619 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6621 if (speed)
6623 if (VECTOR_MODE_P (mode))
6625 /* Vector SUB. */
6626 *cost += extra_cost->vect.alu;
6628 else if (GET_MODE_CLASS (mode) == MODE_INT)
6630 /* SUB(S). */
6631 *cost += extra_cost->alu.arith;
6633 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6635 /* FSUB. */
6636 *cost += extra_cost->fp[mode == DFmode].addsub;
6639 return true;
6642 case PLUS:
6644 rtx new_op0;
6646 op0 = XEXP (x, 0);
6647 op1 = XEXP (x, 1);
6649 cost_plus:
6650 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6651 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6653 /* CSINC. */
6654 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6655 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6656 return true;
6659 if (GET_MODE_CLASS (mode) == MODE_INT
6660 && CONST_INT_P (op1)
6661 && aarch64_uimm12_shift (INTVAL (op1)))
6663 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6665 if (speed)
6666 /* ADD (immediate). */
6667 *cost += extra_cost->alu.arith;
6668 return true;
6671 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6673 /* Look for ADD (extended register). */
6674 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6676 if (speed)
6677 *cost += extra_cost->alu.extend_arith;
6679 op0 = aarch64_strip_extend (op0);
6680 *cost += rtx_cost (op0, VOIDmode,
6681 (enum rtx_code) GET_CODE (op0), 0, speed);
6682 return true;
6685 /* Strip any extend, leave shifts behind as we will
6686 cost them through mult_cost. */
6687 new_op0 = aarch64_strip_extend (op0);
6689 if (GET_CODE (new_op0) == MULT
6690 || aarch64_shift_p (GET_CODE (new_op0)))
6692 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6693 speed);
6694 return true;
6697 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6699 if (speed)
6701 if (VECTOR_MODE_P (mode))
6703 /* Vector ADD. */
6704 *cost += extra_cost->vect.alu;
6706 else if (GET_MODE_CLASS (mode) == MODE_INT)
6708 /* ADD. */
6709 *cost += extra_cost->alu.arith;
6711 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6713 /* FADD. */
6714 *cost += extra_cost->fp[mode == DFmode].addsub;
6717 return true;
6720 case BSWAP:
6721 *cost = COSTS_N_INSNS (1);
6723 if (speed)
6725 if (VECTOR_MODE_P (mode))
6726 *cost += extra_cost->vect.alu;
6727 else
6728 *cost += extra_cost->alu.rev;
6730 return false;
6732 case IOR:
6733 if (aarch_rev16_p (x))
6735 *cost = COSTS_N_INSNS (1);
6737 if (speed)
6739 if (VECTOR_MODE_P (mode))
6740 *cost += extra_cost->vect.alu;
6741 else
6742 *cost += extra_cost->alu.rev;
6744 return true;
6747 if (aarch64_extr_rtx_p (x, &op0, &op1))
6749 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6750 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6751 if (speed)
6752 *cost += extra_cost->alu.shift;
6754 return true;
6756 /* Fall through. */
6757 case XOR:
6758 case AND:
6759 cost_logic:
6760 op0 = XEXP (x, 0);
6761 op1 = XEXP (x, 1);
6763 if (VECTOR_MODE_P (mode))
6765 if (speed)
6766 *cost += extra_cost->vect.alu;
6767 return true;
6770 if (code == AND
6771 && GET_CODE (op0) == MULT
6772 && CONST_INT_P (XEXP (op0, 1))
6773 && CONST_INT_P (op1)
6774 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6775 INTVAL (op1)) != 0)
6777 /* This is a UBFM/SBFM. */
6778 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6779 if (speed)
6780 *cost += extra_cost->alu.bfx;
6781 return true;
6784 if (GET_MODE_CLASS (mode) == MODE_INT)
6786 	  /* We possibly get the immediate for free; this is not
6787 modelled. */
6788 if (CONST_INT_P (op1)
6789 && aarch64_bitmask_imm (INTVAL (op1), mode))
6791 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6793 if (speed)
6794 *cost += extra_cost->alu.logical;
6796 return true;
6798 else
6800 rtx new_op0 = op0;
6802 /* Handle ORN, EON, or BIC. */
6803 if (GET_CODE (op0) == NOT)
6804 op0 = XEXP (op0, 0);
6806 new_op0 = aarch64_strip_shift (op0);
6808 /* If we had a shift on op0 then this is a logical-shift-
6809 by-register/immediate operation. Otherwise, this is just
6810 a logical operation. */
6811 if (speed)
6813 if (new_op0 != op0)
6815 /* Shift by immediate. */
6816 if (CONST_INT_P (XEXP (op0, 1)))
6817 *cost += extra_cost->alu.log_shift;
6818 else
6819 *cost += extra_cost->alu.log_shift_reg;
6821 else
6822 *cost += extra_cost->alu.logical;
6825 /* In both cases we want to cost both operands. */
6826 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6827 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6829 return true;
6832 return false;
6834 case NOT:
6835 x = XEXP (x, 0);
6836 op0 = aarch64_strip_shift (x);
6838 if (VECTOR_MODE_P (mode))
6840 /* Vector NOT. */
6841 *cost += extra_cost->vect.alu;
6842 return false;
6845 /* MVN-shifted-reg. */
6846 if (op0 != x)
6848 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6850 if (speed)
6851 *cost += extra_cost->alu.log_shift;
6853 return true;
6855 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6856 Handle the second form here taking care that 'a' in the above can
6857 be a shift. */
6858 else if (GET_CODE (op0) == XOR)
6860 rtx newop0 = XEXP (op0, 0);
6861 rtx newop1 = XEXP (op0, 1);
6862 rtx op0_stripped = aarch64_strip_shift (newop0);
6864 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6865 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6867 if (speed)
6869 if (op0_stripped != newop0)
6870 *cost += extra_cost->alu.log_shift;
6871 else
6872 *cost += extra_cost->alu.logical;
6875 return true;
6877 /* MVN. */
6878 if (speed)
6879 *cost += extra_cost->alu.logical;
6881 return false;
6883 case ZERO_EXTEND:
6885 op0 = XEXP (x, 0);
6886 /* If a value is written in SI mode, then zero extended to DI
6887 mode, the operation will in general be free as a write to
6888 a 'w' register implicitly zeroes the upper bits of an 'x'
6889 register. However, if this is
6891 (set (reg) (zero_extend (reg)))
6893 we must cost the explicit register move. */
6894 if (mode == DImode
6895 && GET_MODE (op0) == SImode
6896 && outer == SET)
6898 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6900 if (!op_cost && speed)
6901 /* MOV. */
6902 *cost += extra_cost->alu.extend;
6903 else
6904 /* Free, the cost is that of the SI mode operation. */
6905 *cost = op_cost;
6907 return true;
6909 else if (MEM_P (op0))
6911 /* All loads can zero extend to any size for free. */
6912 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6913 return true;
6916 op0 = aarch64_extend_bitfield_pattern_p (x);
6917 if (op0)
6919 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6920 if (speed)
6921 *cost += extra_cost->alu.bfx;
6922 return true;
6925 if (speed)
6927 if (VECTOR_MODE_P (mode))
6929 /* UMOV. */
6930 *cost += extra_cost->vect.alu;
6932 else
6934 /* UXTB/UXTH. */
6935 *cost += extra_cost->alu.extend;
6938 return false;
6940 case SIGN_EXTEND:
6941 if (MEM_P (XEXP (x, 0)))
6943 /* LDRSH. */
6944 if (speed)
6946 rtx address = XEXP (XEXP (x, 0), 0);
6947 *cost += extra_cost->ldst.load_sign_extend;
6949 *cost +=
6950 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6951 0, speed));
6953 return true;
6956 op0 = aarch64_extend_bitfield_pattern_p (x);
6957 if (op0)
6959 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6960 if (speed)
6961 *cost += extra_cost->alu.bfx;
6962 return true;
6965 if (speed)
6967 if (VECTOR_MODE_P (mode))
6968 *cost += extra_cost->vect.alu;
6969 else
6970 *cost += extra_cost->alu.extend;
6972 return false;
6974 case ASHIFT:
6975 op0 = XEXP (x, 0);
6976 op1 = XEXP (x, 1);
6978 if (CONST_INT_P (op1))
6980 if (speed)
6982 if (VECTOR_MODE_P (mode))
6984 /* Vector shift (immediate). */
6985 *cost += extra_cost->vect.alu;
6987 else
6989 	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
6990 aliases. */
6991 *cost += extra_cost->alu.shift;
6995 /* We can incorporate zero/sign extend for free. */
6996 if (GET_CODE (op0) == ZERO_EXTEND
6997 || GET_CODE (op0) == SIGN_EXTEND)
6998 op0 = XEXP (op0, 0);
7000 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7001 return true;
7003 else
7005 if (speed)
7007 if (VECTOR_MODE_P (mode))
7009 /* Vector shift (register). */
7010 *cost += extra_cost->vect.alu;
7012 else
7014 /* LSLV. */
7015 *cost += extra_cost->alu.shift_reg;
7018 return false; /* All arguments need to be in registers. */
7021 case ROTATE:
7022 case ROTATERT:
7023 case LSHIFTRT:
7024 case ASHIFTRT:
7025 op0 = XEXP (x, 0);
7026 op1 = XEXP (x, 1);
7028 if (CONST_INT_P (op1))
7030 /* ASR (immediate) and friends. */
7031 if (speed)
7033 if (VECTOR_MODE_P (mode))
7034 *cost += extra_cost->vect.alu;
7035 else
7036 *cost += extra_cost->alu.shift;
7039 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7040 return true;
7042 else
7045 /* ASR (register) and friends. */
7046 if (speed)
7048 if (VECTOR_MODE_P (mode))
7049 *cost += extra_cost->vect.alu;
7050 else
7051 *cost += extra_cost->alu.shift_reg;
7053 return false; /* All arguments need to be in registers. */
7056 case SYMBOL_REF:
7058 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7059 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7061 /* LDR. */
7062 if (speed)
7063 *cost += extra_cost->ldst.load;
7065 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7066 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7068 /* ADRP, followed by ADD. */
7069 *cost += COSTS_N_INSNS (1);
7070 if (speed)
7071 *cost += 2 * extra_cost->alu.arith;
7073 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7074 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7076 /* ADR. */
7077 if (speed)
7078 *cost += extra_cost->alu.arith;
7081 if (flag_pic)
7083 /* One extra load instruction, after accessing the GOT. */
7084 *cost += COSTS_N_INSNS (1);
7085 if (speed)
7086 *cost += extra_cost->ldst.load;
7088 return true;
7090 case HIGH:
7091 case LO_SUM:
7092 /* ADRP/ADD (immediate). */
7093 if (speed)
7094 *cost += extra_cost->alu.arith;
7095 return true;
7097 case ZERO_EXTRACT:
7098 case SIGN_EXTRACT:
7099 /* UBFX/SBFX. */
7100 if (speed)
7102 if (VECTOR_MODE_P (mode))
7103 *cost += extra_cost->vect.alu;
7104 else
7105 *cost += extra_cost->alu.bfx;
7108 /* We can trust that the immediates used will be correct (there
7109 are no by-register forms), so we need only cost op0. */
7110 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7111 return true;
7113 case MULT:
7114 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7115 /* aarch64_rtx_mult_cost always handles recursion to its
7116 operands. */
7117 return true;
7119 case MOD:
7120 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7121 	 ANDs and a CSNEG.  Assume here that a CSNEG costs the same as
7122 an unconditional negate. This case should only ever be reached through
7123 the set_smod_pow2_cheap check in expmed.c. */
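      /* Roughly, for x % 2^n the expansion looks like (an illustrative
	 sketch of the four instructions, not the exact expander output):
	   negs  t, x
	   and   x, x, #(2^n - 1)
	   and   t, t, #(2^n - 1)
	   csneg x, x, t, mi  */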
7124 if (CONST_INT_P (XEXP (x, 1))
7125 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7126 && (mode == SImode || mode == DImode))
7128 /* We expand to 4 instructions. Reset the baseline. */
7129 *cost = COSTS_N_INSNS (4);
7131 if (speed)
7132 *cost += 2 * extra_cost->alu.logical
7133 + 2 * extra_cost->alu.arith;
7135 return true;
7138 /* Fall-through. */
7139 case UMOD:
7140 if (speed)
7142 if (VECTOR_MODE_P (mode))
7143 *cost += extra_cost->vect.alu;
7144 else if (GET_MODE_CLASS (mode) == MODE_INT)
7145 *cost += (extra_cost->mult[mode == DImode].add
7146 + extra_cost->mult[mode == DImode].idiv);
7147 else if (mode == DFmode)
7148 *cost += (extra_cost->fp[1].mult
7149 + extra_cost->fp[1].div);
7150 else if (mode == SFmode)
7151 *cost += (extra_cost->fp[0].mult
7152 + extra_cost->fp[0].div);
7154 return false; /* All arguments need to be in registers. */
7156 case DIV:
7157 case UDIV:
7158 case SQRT:
7159 if (speed)
7161 if (VECTOR_MODE_P (mode))
7162 *cost += extra_cost->vect.alu;
7163 else if (GET_MODE_CLASS (mode) == MODE_INT)
7164 /* There is no integer SQRT, so only DIV and UDIV can get
7165 here. */
7166 *cost += extra_cost->mult[mode == DImode].idiv;
7167 else
7168 *cost += extra_cost->fp[mode == DFmode].div;
7170 return false; /* All arguments need to be in registers. */
7172 case IF_THEN_ELSE:
7173 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7174 XEXP (x, 2), cost, speed);
7176 case EQ:
7177 case NE:
7178 case GT:
7179 case GTU:
7180 case LT:
7181 case LTU:
7182 case GE:
7183 case GEU:
7184 case LE:
7185 case LEU:
7187 return false; /* All arguments must be in registers. */
7189 case FMA:
7190 op0 = XEXP (x, 0);
7191 op1 = XEXP (x, 1);
7192 op2 = XEXP (x, 2);
7194 if (speed)
7196 if (VECTOR_MODE_P (mode))
7197 *cost += extra_cost->vect.alu;
7198 else
7199 *cost += extra_cost->fp[mode == DFmode].fma;
7202 /* FMSUB, FNMADD, and FNMSUB are free. */
7203 if (GET_CODE (op0) == NEG)
7204 op0 = XEXP (op0, 0);
7206 if (GET_CODE (op2) == NEG)
7207 op2 = XEXP (op2, 0);
7209 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7210 and the by-element operand as operand 0. */
7211 if (GET_CODE (op1) == NEG)
7212 op1 = XEXP (op1, 0);
7214 /* Catch vector-by-element operations. The by-element operand can
7215 either be (vec_duplicate (vec_select (x))) or just
7216 (vec_select (x)), depending on whether we are multiplying by
7217 a vector or a scalar.
7219 	 Canonicalization is not very good in these cases: FMA4 will put the
7220 	 by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
7221 if (GET_CODE (op0) == VEC_DUPLICATE)
7222 op0 = XEXP (op0, 0);
7223 else if (GET_CODE (op1) == VEC_DUPLICATE)
7224 op1 = XEXP (op1, 0);
7226 if (GET_CODE (op0) == VEC_SELECT)
7227 op0 = XEXP (op0, 0);
7228 else if (GET_CODE (op1) == VEC_SELECT)
7229 op1 = XEXP (op1, 0);
7231 /* If the remaining parameters are not registers,
7232 get the cost to put them into registers. */
7233 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7234 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7235 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7236 return true;
7238 case FLOAT:
7239 case UNSIGNED_FLOAT:
7240 if (speed)
7241 *cost += extra_cost->fp[mode == DFmode].fromint;
7242 return false;
7244 case FLOAT_EXTEND:
7245 if (speed)
7247 if (VECTOR_MODE_P (mode))
7249 	  /* Vector widening conversion.  */
7250 *cost += extra_cost->vect.alu;
7252 else
7253 *cost += extra_cost->fp[mode == DFmode].widen;
7255 return false;
7257 case FLOAT_TRUNCATE:
7258 if (speed)
7260 if (VECTOR_MODE_P (mode))
7262 	  /* Vector conversion.  */
7263 *cost += extra_cost->vect.alu;
7265 else
7266 *cost += extra_cost->fp[mode == DFmode].narrow;
7268 return false;
7270 case FIX:
7271 case UNSIGNED_FIX:
7272 x = XEXP (x, 0);
7273 /* Strip the rounding part. They will all be implemented
7274 by the fcvt* family of instructions anyway. */
7275 if (GET_CODE (x) == UNSPEC)
7277 unsigned int uns_code = XINT (x, 1);
7279 if (uns_code == UNSPEC_FRINTA
7280 || uns_code == UNSPEC_FRINTM
7281 || uns_code == UNSPEC_FRINTN
7282 || uns_code == UNSPEC_FRINTP
7283 || uns_code == UNSPEC_FRINTZ)
7284 x = XVECEXP (x, 0, 0);
7287 if (speed)
7289 if (VECTOR_MODE_P (mode))
7290 *cost += extra_cost->vect.alu;
7291 else
7292 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7295 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7296 fixed-point fcvt. */
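      /* For example (illustrative only): a float multiply by 65536.0
	 followed by a float-to-int conversion can become a single
	 "fcvtzs w0, s0, #16".  */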
7297 if (GET_CODE (x) == MULT
7298 && ((VECTOR_MODE_P (mode)
7299 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7300 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7302 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7303 0, speed);
7304 return true;
7307 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7308 return true;
7310 case ABS:
7311 if (VECTOR_MODE_P (mode))
7313 /* ABS (vector). */
7314 if (speed)
7315 *cost += extra_cost->vect.alu;
7317 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7319 op0 = XEXP (x, 0);
7321 /* FABD, which is analogous to FADD. */
7322 if (GET_CODE (op0) == MINUS)
7324 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7325 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7326 if (speed)
7327 *cost += extra_cost->fp[mode == DFmode].addsub;
7329 return true;
7331 /* Simple FABS is analogous to FNEG. */
7332 if (speed)
7333 *cost += extra_cost->fp[mode == DFmode].neg;
7335 else
7337 /* Integer ABS will either be split to
7338 two arithmetic instructions, or will be an ABS
7339 (scalar), which we don't model. */
7340 *cost = COSTS_N_INSNS (2);
7341 if (speed)
7342 *cost += 2 * extra_cost->alu.arith;
7344 return false;
7346 case SMAX:
7347 case SMIN:
7348 if (speed)
7350 if (VECTOR_MODE_P (mode))
7351 *cost += extra_cost->vect.alu;
7352 else
7354 /* FMAXNM/FMINNM/FMAX/FMIN.
7355 TODO: This may not be accurate for all implementations, but
7356 we do not model this in the cost tables. */
7357 *cost += extra_cost->fp[mode == DFmode].addsub;
7360 return false;
7362 case UNSPEC:
7363 /* The floating point round to integer frint* instructions. */
7364 if (aarch64_frint_unspec_p (XINT (x, 1)))
7366 if (speed)
7367 *cost += extra_cost->fp[mode == DFmode].roundint;
7369 return false;
7372 if (XINT (x, 1) == UNSPEC_RBIT)
7374 if (speed)
7375 *cost += extra_cost->alu.rev;
7377 return false;
7379 break;
7381 case TRUNCATE:
7383 /* Decompose <su>muldi3_highpart. */
7384 if (/* (truncate:DI */
7385 mode == DImode
7386 /* (lshiftrt:TI */
7387 && GET_MODE (XEXP (x, 0)) == TImode
7388 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7389 /* (mult:TI */
7390 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7391 /* (ANY_EXTEND:TI (reg:DI))
7392 (ANY_EXTEND:TI (reg:DI))) */
7393 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7394 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7395 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7396 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7397 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7398 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7399 /* (const_int 64) */
7400 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7401 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7403 /* UMULH/SMULH. */
7404 if (speed)
7405 *cost += extra_cost->mult[mode == DImode].extend;
7406 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7407 mode, MULT, 0, speed);
7408 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7409 mode, MULT, 1, speed);
7410 return true;
7413 /* Fall through. */
7414 default:
7415 break;
7418 if (dump_file && (dump_flags & TDF_DETAILS))
7419 fprintf (dump_file,
7420 "\nFailed to cost RTX. Assuming default cost.\n");
7422 return true;
7425 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7426 calculated for X. This cost is stored in *COST. Returns true
7427 if the total cost of X was calculated. */
7428 static bool
7429 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7430 int param, int *cost, bool speed)
7432 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7434 if (dump_file && (dump_flags & TDF_DETAILS))
7436 print_rtl_single (dump_file, x);
7437 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7438 speed ? "Hot" : "Cold",
7439 *cost, result ? "final" : "partial");
7442 return result;
7445 static int
7446 aarch64_register_move_cost (machine_mode mode,
7447 reg_class_t from_i, reg_class_t to_i)
7449 enum reg_class from = (enum reg_class) from_i;
7450 enum reg_class to = (enum reg_class) to_i;
7451 const struct cpu_regmove_cost *regmove_cost
7452 = aarch64_tune_params.regmove_cost;
7454 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7455 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7456 to = GENERAL_REGS;
7458 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7459 from = GENERAL_REGS;
7461 /* Moving between GPR and stack cost is the same as GP2GP. */
7462 if ((from == GENERAL_REGS && to == STACK_REG)
7463 || (to == GENERAL_REGS && from == STACK_REG))
7464 return regmove_cost->GP2GP;
7466 /* To/From the stack register, we move via the gprs. */
7467 if (to == STACK_REG || from == STACK_REG)
7468 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7469 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7471 if (GET_MODE_SIZE (mode) == 16)
7473 /* 128-bit operations on general registers require 2 instructions. */
7474 if (from == GENERAL_REGS && to == GENERAL_REGS)
7475 return regmove_cost->GP2GP * 2;
7476 else if (from == GENERAL_REGS)
7477 return regmove_cost->GP2FP * 2;
7478 else if (to == GENERAL_REGS)
7479 return regmove_cost->FP2GP * 2;
7481 /* When AdvSIMD instructions are disabled it is not possible to move
7482 a 128-bit value directly between Q registers. This is handled in
7483 secondary reload. A general register is used as a scratch to move
7484 the upper DI value and the lower DI value is moved directly,
7485 hence the cost is the sum of three moves. */
7486 if (! TARGET_SIMD)
7487 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7489 return regmove_cost->FP2FP;
7492 if (from == GENERAL_REGS && to == GENERAL_REGS)
7493 return regmove_cost->GP2GP;
7494 else if (from == GENERAL_REGS)
7495 return regmove_cost->GP2FP;
7496 else if (to == GENERAL_REGS)
7497 return regmove_cost->FP2GP;
7499 return regmove_cost->FP2FP;
7502 static int
7503 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7504 reg_class_t rclass ATTRIBUTE_UNUSED,
7505 bool in ATTRIBUTE_UNUSED)
7507 return aarch64_tune_params.memmov_cost;
7510 /* Return true if it is safe and beneficial to use the rsqrt optabs to
7511 optimize 1.0/sqrt. */
7513 static bool
7514 use_rsqrt_p (void)
7516 return (!flag_trapping_math
7517 && flag_unsafe_math_optimizations
7518 && (aarch64_tune_params.extra_tuning_flags
7519 & AARCH64_EXTRA_TUNE_RECIP_SQRT));
7522 /* Function to decide when to use
7523 reciprocal square root builtins. */
7525 static tree
7526 aarch64_builtin_reciprocal (tree fndecl)
7528 if (!use_rsqrt_p ())
7529 return NULL_TREE;
7530 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7533 typedef rtx (*rsqrte_type) (rtx, rtx);
7535 /* Select reciprocal square root initial estimate
7536 insn depending on machine mode. */
7538 rsqrte_type
7539 get_rsqrte_type (machine_mode mode)
7541 switch (mode)
7543 case DFmode: return gen_aarch64_rsqrte_df2;
7544 case SFmode: return gen_aarch64_rsqrte_sf2;
7545 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7546 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7547 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7548 default: gcc_unreachable ();
7552 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7554 /* Select reciprocal square root Newton-Raphson step
7555 insn depending on machine mode. */
7557 rsqrts_type
7558 get_rsqrts_type (machine_mode mode)
7560 switch (mode)
7562 case DFmode: return gen_aarch64_rsqrts_df3;
7563 case SFmode: return gen_aarch64_rsqrts_sf3;
7564 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7565 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7566 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7567 default: gcc_unreachable ();
7571 /* Emit instruction sequence to compute
7572 reciprocal square root. Use two Newton-Raphson steps
7573 for single precision and three for double precision. */
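/* Each step refines the current estimate x_n of 1/sqrt(d) roughly as

     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

   where FRSQRTE provides the initial estimate x_0 and FRSQRTS computes the
   (3 - a * b) / 2 part.  (A sketch of the standard Newton-Raphson
   recurrence; the loop below maps onto it directly.)  */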
7575 void
7576 aarch64_emit_swrsqrt (rtx dst, rtx src)
7578 machine_mode mode = GET_MODE (src);
7579 gcc_assert (
7580 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7581 || mode == DFmode || mode == V2DFmode);
7583 rtx xsrc = gen_reg_rtx (mode);
7584 emit_move_insn (xsrc, src);
7585 rtx x0 = gen_reg_rtx (mode);
7587 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7589 bool double_mode = (mode == DFmode || mode == V2DFmode);
7591 int iterations = double_mode ? 3 : 2;
7593 if (flag_mrecip_low_precision_sqrt)
7594 iterations--;
7596 for (int i = 0; i < iterations; ++i)
7598 rtx x1 = gen_reg_rtx (mode);
7599 rtx x2 = gen_reg_rtx (mode);
7600 rtx x3 = gen_reg_rtx (mode);
7601 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7603 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7605 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7606 x0 = x1;
7609 emit_move_insn (dst, x0);
7612 /* Return the number of instructions that can be issued per cycle. */
7613 static int
7614 aarch64_sched_issue_rate (void)
7616 return aarch64_tune_params.issue_rate;
7619 static int
7620 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7622 int issue_rate = aarch64_sched_issue_rate ();
7624 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7628 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7629 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7630 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7632 static int
7633 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7634 int ready_index)
7636 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7640 /* Vectorizer cost model target hooks. */
7642 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7643 static int
7644 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7645 tree vectype,
7646 int misalign ATTRIBUTE_UNUSED)
7648 unsigned elements;
7650 switch (type_of_cost)
7652 case scalar_stmt:
7653 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7655 case scalar_load:
7656 return aarch64_tune_params.vec_costs->scalar_load_cost;
7658 case scalar_store:
7659 return aarch64_tune_params.vec_costs->scalar_store_cost;
7661 case vector_stmt:
7662 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7664 case vector_load:
7665 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7667 case vector_store:
7668 return aarch64_tune_params.vec_costs->vec_store_cost;
7670 case vec_to_scalar:
7671 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7673 case scalar_to_vec:
7674 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7676 case unaligned_load:
7677 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7679 case unaligned_store:
7680 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7682 case cond_branch_taken:
7683 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7685 case cond_branch_not_taken:
7686 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7688 case vec_perm:
7689 case vec_promote_demote:
7690 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7692 case vec_construct:
7693 elements = TYPE_VECTOR_SUBPARTS (vectype);
7694 return elements / 2 + 1;
7696 default:
7697 gcc_unreachable ();
7701 /* Implement targetm.vectorize.add_stmt_cost. */
7702 static unsigned
7703 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7704 struct _stmt_vec_info *stmt_info, int misalign,
7705 enum vect_cost_model_location where)
7707 unsigned *cost = (unsigned *) data;
7708 unsigned retval = 0;
7710 if (flag_vect_cost_model)
7712 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7713 int stmt_cost =
7714 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7716 /* Statements in an inner loop relative to the loop being
7717 vectorized are weighted more heavily. The value here is
7718 arbitrary and could potentially be improved with analysis. */
7719 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7720 count *= 50; /* FIXME */
7722 retval = (unsigned) (count * stmt_cost);
7723 cost[where] += retval;
7726 return retval;
7729 static void initialize_aarch64_code_model (struct gcc_options *);
7731 /* Enum describing the various ways that the
7732 aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7733 This way their callers can choose what kind of error to give. */
7735 enum aarch64_parse_opt_result
7737 AARCH64_PARSE_OK, /* Parsing was successful. */
7738 AARCH64_PARSE_MISSING_ARG, /* Missing argument. */
7739 AARCH64_PARSE_INVALID_FEATURE, /* Invalid feature modifier. */
7740 AARCH64_PARSE_INVALID_ARG /* Invalid arch, tune, cpu arg. */
7743 /* Parse the architecture extension string STR and update ISA_FLAGS
7744 with the architecture features turned on or off. Return a
7745 aarch64_parse_opt_result describing the result. */
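/* For example, given the tail of "-march=armv8-a+crc+nofp", STR would be
   "+crc+nofp": the "crc" features are turned on and the "fp" features
   (together with anything that depends on them via flags_off) are turned
   off.  (Illustrative; the exact feature names live in all_extensions.)  */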
7747 static enum aarch64_parse_opt_result
7748 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7750 /* The extension string is parsed left to right. */
7751 const struct aarch64_option_extension *opt = NULL;
7753 /* Flag to say whether we are adding or removing an extension. */
7754 int adding_ext = -1;
7756 while (str != NULL && *str != 0)
7758 char *ext;
7759 size_t len;
7761 str++;
7762 ext = strchr (str, '+');
7764 if (ext != NULL)
7765 len = ext - str;
7766 else
7767 len = strlen (str);
7769 if (len >= 2 && strncmp (str, "no", 2) == 0)
7771 adding_ext = 0;
7772 len -= 2;
7773 str += 2;
7775 else if (len > 0)
7776 adding_ext = 1;
7778 if (len == 0)
7779 return AARCH64_PARSE_MISSING_ARG;
7782 /* Scan over the extensions table trying to find an exact match. */
7783 for (opt = all_extensions; opt->name != NULL; opt++)
7785 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7787 /* Add or remove the extension. */
7788 if (adding_ext)
7789 *isa_flags |= opt->flags_on;
7790 else
7791 *isa_flags &= ~(opt->flags_off);
7792 break;
7796 if (opt->name == NULL)
7798 /* Extension not found in list. */
7799 return AARCH64_PARSE_INVALID_FEATURE;
7802 str = ext;
7805 return AARCH64_PARSE_OK;
7808 /* Parse the TO_PARSE string and put the architecture struct that it
7809 selects into RES and the architectural features into ISA_FLAGS.
7810 Return an aarch64_parse_opt_result describing the parse result.
7811 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7813 static enum aarch64_parse_opt_result
7814 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7815 unsigned long *isa_flags)
7817 char *ext;
7818 const struct processor *arch;
7819 char *str = (char *) alloca (strlen (to_parse) + 1);
7820 size_t len;
7822 strcpy (str, to_parse);
7824 ext = strchr (str, '+');
7826 if (ext != NULL)
7827 len = ext - str;
7828 else
7829 len = strlen (str);
7831 if (len == 0)
7832 return AARCH64_PARSE_MISSING_ARG;
7835 /* Loop through the list of supported ARCHes to find a match. */
7836 for (arch = all_architectures; arch->name != NULL; arch++)
7838 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7840 unsigned long isa_temp = arch->flags;
7842 if (ext != NULL)
7844 /* TO_PARSE string contains at least one extension. */
7845 enum aarch64_parse_opt_result ext_res
7846 = aarch64_parse_extension (ext, &isa_temp);
7848 if (ext_res != AARCH64_PARSE_OK)
7849 return ext_res;
7851 /* Extension parsing was successful. Confirm the result
7852 arch and ISA flags. */
7853 *res = arch;
7854 *isa_flags = isa_temp;
7855 return AARCH64_PARSE_OK;
7859 /* ARCH name not found in list. */
7860 return AARCH64_PARSE_INVALID_ARG;
7863 /* Parse the TO_PARSE string and put the result tuning in RES and the
7864 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7865 describing the parse result. If there is an error parsing, RES and
7866 ISA_FLAGS are left unchanged. */
7868 static enum aarch64_parse_opt_result
7869 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7870 unsigned long *isa_flags)
7872 char *ext;
7873 const struct processor *cpu;
7874 char *str = (char *) alloca (strlen (to_parse) + 1);
7875 size_t len;
7877 strcpy (str, to_parse);
7879 ext = strchr (str, '+');
7881 if (ext != NULL)
7882 len = ext - str;
7883 else
7884 len = strlen (str);
7886 if (len == 0)
7887 return AARCH64_PARSE_MISSING_ARG;
7890 /* Loop through the list of supported CPUs to find a match. */
7891 for (cpu = all_cores; cpu->name != NULL; cpu++)
7893 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7895 unsigned long isa_temp = cpu->flags;
7898 if (ext != NULL)
7900 /* TO_PARSE string contains at least one extension. */
7901 enum aarch64_parse_opt_result ext_res
7902 = aarch64_parse_extension (ext, &isa_temp);
7904 if (ext_res != AARCH64_PARSE_OK)
7905 return ext_res;
7907 	      /* Extension parsing was successful.  Confirm the result
7908 cpu and ISA flags. */
7909 *res = cpu;
7910 *isa_flags = isa_temp;
7911 return AARCH64_PARSE_OK;
7915 /* CPU name not found in list. */
7916 return AARCH64_PARSE_INVALID_ARG;
7919 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7920 Return an aarch64_parse_opt_result describing the parse result.
7921    If the parsing fails, RES does not change.  */
7923 static enum aarch64_parse_opt_result
7924 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7926 const struct processor *cpu;
7927 char *str = (char *) alloca (strlen (to_parse) + 1);
7929 strcpy (str, to_parse);
7931 /* Loop through the list of supported CPUs to find a match. */
7932 for (cpu = all_cores; cpu->name != NULL; cpu++)
7934 if (strcmp (cpu->name, str) == 0)
7936 *res = cpu;
7937 return AARCH64_PARSE_OK;
7941 /* CPU name not found in list. */
7942 return AARCH64_PARSE_INVALID_ARG;
7945 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7946 described in FLAG. If it is, return the index bit for that fusion type.
7947 If not, error (printing OPTION_NAME) and return zero. */
7949 static unsigned int
7950 aarch64_parse_one_option_token (const char *token,
7951 size_t length,
7952 const struct aarch64_flag_desc *flag,
7953 const char *option_name)
7955 for (; flag->name != NULL; flag++)
7957 if (length == strlen (flag->name)
7958 && !strncmp (flag->name, token, length))
7959 return flag->flag;
7962 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7963 return 0;
7966 /* Parse OPTION which is a comma-separated list of flags to enable.
7967 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7968 default state we inherit from the CPU tuning structures. OPTION_NAME
7969 gives the top-level option we are parsing in the -moverride string,
7970 for use in error messages. */
7972 static unsigned int
7973 aarch64_parse_boolean_options (const char *option,
7974 const struct aarch64_flag_desc *flags,
7975 unsigned int initial_state,
7976 const char *option_name)
7978 const char separator = '.';
7979 const char* specs = option;
7980 const char* ntoken = option;
7981 unsigned int found_flags = initial_state;
7983 while ((ntoken = strchr (specs, separator)))
7985 size_t token_length = ntoken - specs;
7986 unsigned token_ops = aarch64_parse_one_option_token (specs,
7987 token_length,
7988 flags,
7989 option_name);
7990 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7991 in the token stream, reset the supported operations. So:
7993 adrp+add.cmp+branch.none.adrp+add
7995 would have the result of turning on only adrp+add fusion. */
7996 if (!token_ops)
7997 found_flags = 0;
7999 found_flags |= token_ops;
8000 specs = ++ntoken;
8004   /* We ended with a trailing separator; report the string as ill-formed.  */
8004 if (!(*specs))
8006 error ("%s string ill-formed\n", option_name);
8007 return 0;
8010 /* We still have one more token to parse. */
8011 size_t token_length = strlen (specs);
8012 unsigned token_ops = aarch64_parse_one_option_token (specs,
8013 token_length,
8014 flags,
8015 option_name);
8016 if (!token_ops)
8017 found_flags = 0;
8019 found_flags |= token_ops;
8020 return found_flags;
8023 /* Support for overriding instruction fusion. */
8025 static void
8026 aarch64_parse_fuse_string (const char *fuse_string,
8027 struct tune_params *tune)
8029 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8030 aarch64_fusible_pairs,
8031 tune->fusible_ops,
8032 "fuse=");
8035 /* Support for overriding other tuning flags. */
8037 static void
8038 aarch64_parse_tune_string (const char *tune_string,
8039 struct tune_params *tune)
8041 tune->extra_tuning_flags
8042 = aarch64_parse_boolean_options (tune_string,
8043 aarch64_tuning_flags,
8044 tune->extra_tuning_flags,
8045 "tune=");
8048 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8049    we understand.  If it is, extract the option string and hand off to
8050 the appropriate function. */
8052 void
8053 aarch64_parse_one_override_token (const char* token,
8054 size_t length,
8055 struct tune_params *tune)
8057 const struct aarch64_tuning_override_function *fn
8058 = aarch64_tuning_override_functions;
8060 const char *option_part = strchr (token, '=');
8061 if (!option_part)
8063 error ("tuning string missing in option (%s)", token);
8064 return;
8067 /* Get the length of the option name. */
8068 length = option_part - token;
8069 /* Skip the '=' to get to the option string. */
8070 option_part++;
8072 for (; fn->name != NULL; fn++)
8074 if (!strncmp (fn->name, token, length))
8076 fn->parse_override (option_part, tune);
8077 return;
8081 error ("unknown tuning option (%s)",token);
8082 return;
8085 /* A checking mechanism for the implementation of the TLS size.  */
8087 static void
8088 initialize_aarch64_tls_size (struct gcc_options *opts)
8090 if (aarch64_tls_size == 0)
8091 aarch64_tls_size = 24;
8093 switch (opts->x_aarch64_cmodel_var)
8095 case AARCH64_CMODEL_TINY:
8096       /* Both the default and the maximum TLS size allowed under tiny are 1M,
8097 	 which needs two instructions to address, so we clamp the size to 24.  */
8098 if (aarch64_tls_size > 24)
8099 aarch64_tls_size = 24;
8100 break;
8101 case AARCH64_CMODEL_SMALL:
8102 /* The maximum TLS size allowed under small is 4G. */
8103 if (aarch64_tls_size > 32)
8104 aarch64_tls_size = 32;
8105 break;
8106 case AARCH64_CMODEL_LARGE:
8107 /* The maximum TLS size allowed under large is 16E.
8108 	 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now.  */
8109 if (aarch64_tls_size > 48)
8110 aarch64_tls_size = 48;
8111 break;
8112 default:
8113 gcc_unreachable ();
8116 return;
8119 /* Parse STRING looking for options in the format:
8120 string :: option:string
8121 option :: name=substring
8122 name :: {a-z}
8123 substring :: defined by option. */
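/* For example (illustrative only, using fusion pair names mentioned
   elsewhere in this file):
     -moverride=fuse=adrp+add.cmp+branch:tune=...
   splits into a "fuse=..." token and a "tune=..." token, each handed to
   aarch64_parse_one_override_token in turn.  */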
8125 static void
8126 aarch64_parse_override_string (const char* input_string,
8127 struct tune_params* tune)
8129 const char separator = ':';
8130 size_t string_length = strlen (input_string) + 1;
8131 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8132 char *string = string_root;
8133 strncpy (string, input_string, string_length);
8134 string[string_length - 1] = '\0';
8136 char* ntoken = string;
8138 while ((ntoken = strchr (string, separator)))
8140 size_t token_length = ntoken - string;
8141 /* Make this substring look like a string. */
8142 *ntoken = '\0';
8143 aarch64_parse_one_override_token (string, token_length, tune);
8144 string = ++ntoken;
8147 /* One last option to parse. */
8148 aarch64_parse_one_override_token (string, strlen (string), tune);
8149 free (string_root);
8153 static void
8154 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8156 if (opts->x_flag_omit_frame_pointer)
8157 opts->x_flag_omit_leaf_frame_pointer = false;
8158 else if (opts->x_flag_omit_leaf_frame_pointer)
8159 opts->x_flag_omit_frame_pointer = true;
8161 /* If not optimizing for size, set the default
8162 alignment to what the target wants. */
8163 if (!opts->x_optimize_size)
8165 if (opts->x_align_loops <= 0)
8166 opts->x_align_loops = aarch64_tune_params.loop_align;
8167 if (opts->x_align_jumps <= 0)
8168 opts->x_align_jumps = aarch64_tune_params.jump_align;
8169 if (opts->x_align_functions <= 0)
8170 opts->x_align_functions = aarch64_tune_params.function_align;
8173 /* If nopcrelative_literal_loads is set on the command line, this
8174 implies that the user asked for PC relative literal loads. */
8175 if (opts->x_nopcrelative_literal_loads == 1)
8176 aarch64_nopcrelative_literal_loads = false;
8178 /* If it is not set on the command line, we default to no
8179 pc relative literal loads. */
8180 if (opts->x_nopcrelative_literal_loads == 2)
8181 aarch64_nopcrelative_literal_loads = true;
8183 /* In the tiny memory model it makes no sense
8184 to disallow non PC relative literal pool loads
8185 as many other things will break anyway. */
8186 if (opts->x_nopcrelative_literal_loads
8187 && (aarch64_cmodel == AARCH64_CMODEL_TINY
8188 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
8189 aarch64_nopcrelative_literal_loads = false;
8192 /* 'Unpack' the internal tuning structs and update the options
8193 in OPTS. The caller must have set up selected_tune and selected_arch
8194 as all the other target-specific codegen decisions are
8195 derived from them. */
8197 void
8198 aarch64_override_options_internal (struct gcc_options *opts)
8200 aarch64_tune_flags = selected_tune->flags;
8201 aarch64_tune = selected_tune->sched_core;
8202 /* Make a copy of the tuning parameters attached to the core, which
8203 we may later overwrite. */
8204 aarch64_tune_params = *(selected_tune->tune);
8205 aarch64_architecture_version = selected_arch->architecture_version;
8207 if (opts->x_aarch64_override_tune_string)
8208 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8209 &aarch64_tune_params);
8211 /* This target defaults to strict volatile bitfields. */
8212 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8213 opts->x_flag_strict_volatile_bitfields = 1;
8215 initialize_aarch64_code_model (opts);
8216 initialize_aarch64_tls_size (opts);
8218 int queue_depth = 0;
8219 switch (aarch64_tune_params.autoprefetcher_model)
8221 case tune_params::AUTOPREFETCHER_OFF:
8222 queue_depth = -1;
8223 break;
8224 case tune_params::AUTOPREFETCHER_WEAK:
8225 queue_depth = 0;
8226 break;
8227 case tune_params::AUTOPREFETCHER_STRONG:
8228 queue_depth = max_insn_queue_index + 1;
8229 break;
8230 default:
8231 gcc_unreachable ();
8234 /* We don't mind passing in global_options_set here as we don't use
8235 the *options_set structs anyway. */
8236 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8237 queue_depth,
8238 opts->x_param_values,
8239 global_options_set.x_param_values);
8241 /* Set the L1 cache line size. */
8242 if (selected_cpu->tune->cache_line_size != 0)
8243 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8244 selected_cpu->tune->cache_line_size,
8245 opts->x_param_values,
8246 global_options_set.x_param_values);
8248 aarch64_override_options_after_change_1 (opts);
8251 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8252 specified in STR and throw errors if appropriate. Put the results if
8253 they are valid in RES and ISA_FLAGS. Return whether the option is
8254 valid. */
8256 static bool
8257 aarch64_validate_mcpu (const char *str, const struct processor **res,
8258 unsigned long *isa_flags)
8260 enum aarch64_parse_opt_result parse_res
8261 = aarch64_parse_cpu (str, res, isa_flags);
8263 if (parse_res == AARCH64_PARSE_OK)
8264 return true;
8266 switch (parse_res)
8268 case AARCH64_PARSE_MISSING_ARG:
8269 error ("missing cpu name in -mcpu=%qs", str);
8270 break;
8271 case AARCH64_PARSE_INVALID_ARG:
8272 error ("unknown value %qs for -mcpu", str);
8273 break;
8274 case AARCH64_PARSE_INVALID_FEATURE:
8275 error ("invalid feature modifier in -mcpu=%qs", str);
8276 break;
8277 default:
8278 gcc_unreachable ();
8281 return false;
8284 /* Validate a command-line -march option. Parse the arch and extensions
8285 (if any) specified in STR and throw errors if appropriate. Put the
8286 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8287 option is valid. */
8289 static bool
8290 aarch64_validate_march (const char *str, const struct processor **res,
8291 unsigned long *isa_flags)
8293 enum aarch64_parse_opt_result parse_res
8294 = aarch64_parse_arch (str, res, isa_flags);
8296 if (parse_res == AARCH64_PARSE_OK)
8297 return true;
8299 switch (parse_res)
8301 case AARCH64_PARSE_MISSING_ARG:
8302 error ("missing arch name in -march=%qs", str);
8303 break;
8304 case AARCH64_PARSE_INVALID_ARG:
8305 error ("unknown value %qs for -march", str);
8306 break;
8307 case AARCH64_PARSE_INVALID_FEATURE:
8308 error ("invalid feature modifier in -march=%qs", str);
8309 break;
8310 default:
8311 gcc_unreachable ();
8314 return false;
8317 /* Validate a command-line -mtune option. Parse the cpu
8318 specified in STR and throw errors if appropriate. Put the
8319 result, if it is valid, in RES. Return whether the option is
8320 valid. */
8322 static bool
8323 aarch64_validate_mtune (const char *str, const struct processor **res)
8325 enum aarch64_parse_opt_result parse_res
8326 = aarch64_parse_tune (str, res);
8328 if (parse_res == AARCH64_PARSE_OK)
8329 return true;
8331 switch (parse_res)
8333 case AARCH64_PARSE_MISSING_ARG:
8334 error ("missing cpu name in -mtune=%qs", str);
8335 break;
8336 case AARCH64_PARSE_INVALID_ARG:
8337 error ("unknown value %qs for -mtune", str);
8338 break;
8339 default:
8340 gcc_unreachable ();
8342 return false;
8345 /* Return the CPU corresponding to the enum CPU.
8346 If it doesn't specify a cpu, return the default. */
8348 static const struct processor *
8349 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8351 if (cpu != aarch64_none)
8352 return &all_cores[cpu];
8354 /* The & 0x3f is to extract the bottom 6 bits that encode the
8355 default cpu as selected by the --with-cpu GCC configure option
8356 in config.gcc.
8357 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8358 flags mechanism should be reworked to make it more sane. */
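  /* E.g. if config.gcc encodes TARGET_CPU_DEFAULT as
     (cpu_index | (default_flags << 6)), the low 6 bits give the core and
     the remaining bits give the default ISA flags, which is why
     aarch64_override_options extracts the flags with ">> 6".
     (Illustrative; the encoding is set up in config.gcc.)  */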
8359 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8362 /* Return the architecture corresponding to the enum ARCH.
8363 If it doesn't specify a valid architecture, return the default. */
8365 static const struct processor *
8366 aarch64_get_arch (enum aarch64_arch arch)
8368 if (arch != aarch64_no_arch)
8369 return &all_architectures[arch];
8371 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8373 return &all_architectures[cpu->arch];
8376 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8377 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8378 tuning structs. In particular it must set selected_tune and
8379 aarch64_isa_flags that define the available ISA features and tuning
8380 decisions. It must also set selected_arch as this will be used to
8381 output the .arch asm tags for each function. */
8383 static void
8384 aarch64_override_options (void)
8386 unsigned long cpu_isa = 0;
8387 unsigned long arch_isa = 0;
8388 aarch64_isa_flags = 0;
8390 bool valid_cpu = true;
8391 bool valid_tune = true;
8392 bool valid_arch = true;
8394 selected_cpu = NULL;
8395 selected_arch = NULL;
8396 selected_tune = NULL;
8398 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8399 If either of -march or -mtune is given, they override their
8400 respective component of -mcpu. */
8401 if (aarch64_cpu_string)
8402 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8403 &cpu_isa);
8405 if (aarch64_arch_string)
8406 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8407 &arch_isa);
8409 if (aarch64_tune_string)
8410 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8412 /* If the user did not specify a processor, choose the default
8413 one for them. This will be the CPU set during configuration using
8414 --with-cpu, otherwise it is "generic". */
8415 if (!selected_cpu)
8417 if (selected_arch)
8419 selected_cpu = &all_cores[selected_arch->ident];
8420 aarch64_isa_flags = arch_isa;
8421 explicit_arch = selected_arch->arch;
8423 else
8425 /* Get default configure-time CPU. */
8426 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8427 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8430 if (selected_tune)
8431 explicit_tune_core = selected_tune->ident;
8433 /* If both -mcpu and -march are specified check that they are architecturally
8434 compatible, warn if they're not and prefer the -march ISA flags. */
8435 else if (selected_arch)
8437 if (selected_arch->arch != selected_cpu->arch)
8439 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8440 all_architectures[selected_cpu->arch].name,
8441 selected_arch->name);
8443 aarch64_isa_flags = arch_isa;
8444 explicit_arch = selected_arch->arch;
8445 explicit_tune_core = selected_tune ? selected_tune->ident
8446 : selected_cpu->ident;
8448 else
8450 /* -mcpu but no -march. */
8451 aarch64_isa_flags = cpu_isa;
8452 explicit_tune_core = selected_tune ? selected_tune->ident
8453 : selected_cpu->ident;
8454 gcc_assert (selected_cpu);
8455 selected_arch = &all_architectures[selected_cpu->arch];
8456 explicit_arch = selected_arch->arch;
8459 /* Set the arch as well, as we will need it when outputting
8460 the .arch directive in assembly. */
8461 if (!selected_arch)
8463 gcc_assert (selected_cpu);
8464 selected_arch = &all_architectures[selected_cpu->arch];
8467 if (!selected_tune)
8468 selected_tune = selected_cpu;
8470 #ifndef HAVE_AS_MABI_OPTION
8471 /* The compiler may have been configured with 2.23.* binutils, which does
8472 not have support for ILP32. */
8473 if (TARGET_ILP32)
8474 error ("Assembler does not support -mabi=ilp32");
8475 #endif
8477 /* Make sure we properly set up the explicit options. */
8478 if ((aarch64_cpu_string && valid_cpu)
8479 || (aarch64_tune_string && valid_tune))
8480 gcc_assert (explicit_tune_core != aarch64_none);
8482 if ((aarch64_cpu_string && valid_cpu)
8483 || (aarch64_arch_string && valid_arch))
8484 gcc_assert (explicit_arch != aarch64_no_arch);
8486 aarch64_override_options_internal (&global_options);
8488 /* Save these options as the default ones in case we push and pop them later
8489 while processing functions with potential target attributes. */
8490 target_option_default_node = target_option_current_node
8491 = build_target_option_node (&global_options);
8493 aarch64_register_fma_steering ();
8497 /* Implement targetm.override_options_after_change. */
8499 static void
8500 aarch64_override_options_after_change (void)
8502 aarch64_override_options_after_change_1 (&global_options);
8505 static struct machine_function *
8506 aarch64_init_machine_status (void)
8508 struct machine_function *machine;
8509 machine = ggc_cleared_alloc<machine_function> ();
8510 return machine;
8513 void
8514 aarch64_init_expanders (void)
8516 init_machine_status = aarch64_init_machine_status;
8519 /* A checking mechanism for the implementation of the various code models. */
8520 static void
8521 initialize_aarch64_code_model (struct gcc_options *opts)
8523 if (opts->x_flag_pic)
8525 switch (opts->x_aarch64_cmodel_var)
8527 case AARCH64_CMODEL_TINY:
8528 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8529 break;
8530 case AARCH64_CMODEL_SMALL:
8531 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8532 aarch64_cmodel = (flag_pic == 2
8533 ? AARCH64_CMODEL_SMALL_PIC
8534 : AARCH64_CMODEL_SMALL_SPIC);
8535 #else
8536 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8537 #endif
8538 break;
8539 case AARCH64_CMODEL_LARGE:
8540 sorry ("code model %qs with -f%s", "large",
8541 opts->x_flag_pic > 1 ? "PIC" : "pic");
8542 break;
8543 default:
8544 gcc_unreachable ();
8547 else
8548 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8551 /* Implement TARGET_OPTION_SAVE. */
8553 static void
8554 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8556 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8559 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8560 using the information saved in PTR. */
8562 static void
8563 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8565 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8566 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8567 opts->x_explicit_arch = ptr->x_explicit_arch;
8568 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8569 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8571 aarch64_override_options_internal (opts);
8574 /* Implement TARGET_OPTION_PRINT. */
8576 static void
8577 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8579 const struct processor *cpu
8580 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8581 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8582 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8583 std::string extension
8584 = aarch64_get_extension_string_for_isa_flags (isa_flags);
8586 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8587 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8588 arch->name, extension.c_str ());
8591 static GTY(()) tree aarch64_previous_fndecl;
8593 void
8594 aarch64_reset_previous_fndecl (void)
8596 aarch64_previous_fndecl = NULL;
8599 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8600 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8601 of the function, if such exists. This function may be called multiple
8602 times on a single function so use aarch64_previous_fndecl to avoid
8603 setting up identical state. */
8605 static void
8606 aarch64_set_current_function (tree fndecl)
8608 tree old_tree = (aarch64_previous_fndecl
8609 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8610 : NULL_TREE);
8612 tree new_tree = (fndecl
8613 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
8614 : NULL_TREE);
8617 if (fndecl && fndecl != aarch64_previous_fndecl)
8619 aarch64_previous_fndecl = fndecl;
8620 if (old_tree == new_tree)
8623 else if (new_tree && new_tree != target_option_default_node)
8625 cl_target_option_restore (&global_options,
8626 TREE_TARGET_OPTION (new_tree));
8627 if (TREE_TARGET_GLOBALS (new_tree))
8628 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8629 else
8630 TREE_TARGET_GLOBALS (new_tree)
8631 = save_target_globals_default_opts ();
8634 else if (old_tree && old_tree != target_option_default_node)
8636 new_tree = target_option_current_node;
8637 cl_target_option_restore (&global_options,
8638 TREE_TARGET_OPTION (new_tree));
8639 if (TREE_TARGET_GLOBALS (new_tree))
8640 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8641 else if (new_tree == target_option_default_node)
8642 restore_target_globals (&default_target_globals);
8643 else
8644 TREE_TARGET_GLOBALS (new_tree)
8645 = save_target_globals_default_opts ();
8649 if (!fndecl)
8650 return;
8652 /* If we turned on SIMD make sure that any vector parameters are re-laid out
8653 so that they use proper vector modes. */
8654 if (TARGET_SIMD)
8656 tree parms = DECL_ARGUMENTS (fndecl);
8657 for (; parms && parms != void_list_node; parms = TREE_CHAIN (parms))
8659 if (TREE_CODE (parms) == PARM_DECL
8660 && VECTOR_TYPE_P (TREE_TYPE (parms))
8661 && DECL_MODE (parms) != TYPE_MODE (TREE_TYPE (parms)))
8662 relayout_decl (parms);
8667 /* Enum describing the various ways we can handle attributes.
8668 In many cases we can reuse the generic option handling machinery. */
8670 enum aarch64_attr_opt_type
8672 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8673 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8674 aarch64_attr_enum, /* Attribute sets an enum variable. */
8675 aarch64_attr_custom /* Attribute requires a custom handling function. */
8678 /* All the information needed to handle a target attribute.
8679 NAME is the name of the attribute.
8680 ATTR_TYPE specifies the type of behaviour of the attribute as described
8681 in the definition of enum aarch64_attr_opt_type.
8682 ALLOW_NEG is true if the attribute supports a "no-" form.
8683 HANDLER is the function that takes the attribute string and whether
8684 it is a pragma or attribute and handles the option. It is needed only
8685 when the ATTR_TYPE is aarch64_attr_custom.
8686 OPT_NUM is the enum specifying the option that the attribute modifies.
8687 This is needed for attributes that mirror the behaviour of a command-line
8688 option, that is, those whose ATTR_TYPE is aarch64_attr_mask,
8689 aarch64_attr_bool or aarch64_attr_enum. */
8691 struct aarch64_attribute_info
8693 const char *name;
8694 enum aarch64_attr_opt_type attr_type;
8695 bool allow_neg;
8696 bool (*handler) (const char *, const char *);
8697 enum opt_code opt_num;
8700 /* Handle the ARCH_STR argument to the arch= target attribute.
8701 PRAGMA_OR_ATTR is used in potential error messages. */
8703 static bool
8704 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8706 const struct processor *tmp_arch = NULL;
8707 enum aarch64_parse_opt_result parse_res
8708 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8710 if (parse_res == AARCH64_PARSE_OK)
8712 gcc_assert (tmp_arch);
8713 selected_arch = tmp_arch;
8714 explicit_arch = selected_arch->arch;
8715 return true;
8718 switch (parse_res)
8720 case AARCH64_PARSE_MISSING_ARG:
8721 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8722 break;
8723 case AARCH64_PARSE_INVALID_ARG:
8724 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8725 break;
8726 case AARCH64_PARSE_INVALID_FEATURE:
8727 error ("invalid feature modifier %qs for 'arch' target %s",
8728 str, pragma_or_attr);
8729 break;
8730 default:
8731 gcc_unreachable ();
8734 return false;
8737 /* Handle the argument CPU_STR to the cpu= target attribute.
8738 PRAGMA_OR_ATTR is used in potential error messages. */
8740 static bool
8741 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8743 const struct processor *tmp_cpu = NULL;
8744 enum aarch64_parse_opt_result parse_res
8745 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8747 if (parse_res == AARCH64_PARSE_OK)
8749 gcc_assert (tmp_cpu);
8750 selected_tune = tmp_cpu;
8751 explicit_tune_core = selected_tune->ident;
8753 selected_arch = &all_architectures[tmp_cpu->arch];
8754 explicit_arch = selected_arch->arch;
8755 return true;
8758 switch (parse_res)
8760 case AARCH64_PARSE_MISSING_ARG:
8761 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8762 break;
8763 case AARCH64_PARSE_INVALID_ARG:
8764 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8765 break;
8766 case AARCH64_PARSE_INVALID_FEATURE:
8767 error ("invalid feature modifier %qs for 'cpu' target %s",
8768 str, pragma_or_attr);
8769 break;
8770 default:
8771 gcc_unreachable ();
8774 return false;
8777 /* Handle the argument STR to the tune= target attribute.
8778 PRAGMA_OR_ATTR is used in potential error messages. */
8780 static bool
8781 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8783 const struct processor *tmp_tune = NULL;
8784 enum aarch64_parse_opt_result parse_res
8785 = aarch64_parse_tune (str, &tmp_tune);
8787 if (parse_res == AARCH64_PARSE_OK)
8789 gcc_assert (tmp_tune);
8790 selected_tune = tmp_tune;
8791 explicit_tune_core = selected_tune->ident;
8792 return true;
8795 switch (parse_res)
8797 case AARCH64_PARSE_INVALID_ARG:
8798 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8799 break;
8800 default:
8801 gcc_unreachable ();
8804 return false;
8807 /* Parse an architecture extensions target attribute string specified in STR.
8808 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8809 if successful. Update aarch64_isa_flags to reflect the ISA features
8810 modified.
8811 PRAGMA_OR_ATTR is used in potential error messages. */
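/* A minimal sketch of the strings handled here, assuming the usual feature
   names from aarch64-option-extensions.def:

     "+crc"          enable the CRC extension on top of the current set;
     "+nothing+fp"   start from an empty feature set, then enable FP;
     "+fp+nosimd"    enable FP, disable SIMD.  */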
8813 static bool
8814 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8816 enum aarch64_parse_opt_result parse_res;
8817 unsigned long isa_flags = aarch64_isa_flags;
8819 /* We allow "+nothing" in the beginning to clear out all architectural
8820 features if the user wants to handpick specific features. */
8821 if (strncmp ("+nothing", str, 8) == 0)
8823 isa_flags = 0;
8824 str += 8;
8827 parse_res = aarch64_parse_extension (str, &isa_flags);
8829 if (parse_res == AARCH64_PARSE_OK)
8831 aarch64_isa_flags = isa_flags;
8832 return true;
8835 switch (parse_res)
8837 case AARCH64_PARSE_MISSING_ARG:
8838 error ("missing feature modifier in target %s %qs",
8839 pragma_or_attr, str);
8840 break;
8842 case AARCH64_PARSE_INVALID_FEATURE:
8843 error ("invalid feature modifier in target %s %qs",
8844 pragma_or_attr, str);
8845 break;
8847 default:
8848 gcc_unreachable ();
8851 return false;
8854 /* The target attributes that we support. On top of these we also support just
8855 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8856 handled explicitly in aarch64_process_one_target_attr. */
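/* For example, the table below accepts attribute strings such as (a sketch,
   not an exhaustive list):

     target ("strict-align")                 sets a target_flags bit;
     target ("no-omit-leaf-frame-pointer")   negated boolean option;
     target ("cmodel=small")                 enum-valued option;
     target ("arch=armv8-a+crc")             custom handler, see
                                             aarch64_handle_attr_arch.  */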
8858 static const struct aarch64_attribute_info aarch64_attributes[] =
8860 { "general-regs-only", aarch64_attr_mask, false, NULL,
8861 OPT_mgeneral_regs_only },
8862 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8863 OPT_mfix_cortex_a53_835769 },
8864 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8865 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8866 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8867 OPT_momit_leaf_frame_pointer },
8868 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8869 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8870 OPT_march_ },
8871 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8872 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8873 OPT_mtune_ },
8874 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8877 /* Parse ARG_STR which contains the definition of one target attribute.
8878 Show appropriate errors if any or return true if the attribute is valid.
8879 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8880 we're processing a target attribute or pragma. */
8882 static bool
8883 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8885 bool invert = false;
8887 size_t len = strlen (arg_str);
8889 if (len == 0)
8891 error ("malformed target %s", pragma_or_attr);
8892 return false;
8895 char *str_to_check = (char *) alloca (len + 1);
8896 strcpy (str_to_check, arg_str);
8898 /* Skip leading whitespace. */
8899 while (*str_to_check == ' ' || *str_to_check == '\t')
8900 str_to_check++;
8902 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8903 It is easier to detect and handle it explicitly here rather than going
8904 through the machinery for the rest of the target attributes in this
8905 function. */
8906 if (*str_to_check == '+')
8907 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8909 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8911 invert = true;
8912 str_to_check += 3;
8914 char *arg = strchr (str_to_check, '=');
8916 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8917 and point ARG to "foo". */
8918 if (arg)
8920 *arg = '\0';
8921 arg++;
8923 const struct aarch64_attribute_info *p_attr;
8924 bool found = false;
8925 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8927 /* If the names don't match up, or the user has given an argument
8928 to an attribute that doesn't accept one, or didn't give an argument
8929 to an attribute that expects one, fail to match. */
8930 if (strcmp (str_to_check, p_attr->name) != 0)
8931 continue;
8933 found = true;
8934 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8935 || p_attr->attr_type == aarch64_attr_enum;
8937 if (attr_need_arg_p ^ (arg != NULL))
8939 error ("target %s %qs does not accept an argument",
8940 pragma_or_attr, str_to_check);
8941 return false;
8944 /* If the name matches but the attribute does not allow "no-" versions
8945 then we can't match. */
8946 if (invert && !p_attr->allow_neg)
8948 error ("target %s %qs does not allow a negated form",
8949 pragma_or_attr, str_to_check);
8950 return false;
8953 switch (p_attr->attr_type)
8955 /* Has a custom handler registered.
8956 For example, cpu=, arch=, tune=. */
8957 case aarch64_attr_custom:
8958 gcc_assert (p_attr->handler);
8959 if (!p_attr->handler (arg, pragma_or_attr))
8960 return false;
8961 break;
8963 /* Either set or unset a boolean option. */
8964 case aarch64_attr_bool:
8966 struct cl_decoded_option decoded;
8968 generate_option (p_attr->opt_num, NULL, !invert,
8969 CL_TARGET, &decoded);
8970 aarch64_handle_option (&global_options, &global_options_set,
8971 &decoded, input_location);
8972 break;
8974 /* Set or unset a bit in the target_flags. aarch64_handle_option
8975 should know what mask to apply given the option number. */
8976 case aarch64_attr_mask:
8978 struct cl_decoded_option decoded;
8979 /* We only need to specify the option number.
8980 aarch64_handle_option will know which mask to apply. */
8981 decoded.opt_index = p_attr->opt_num;
8982 decoded.value = !invert;
8983 aarch64_handle_option (&global_options, &global_options_set,
8984 &decoded, input_location);
8985 break;
8987 /* Use the option setting machinery to set an option to an enum. */
8988 case aarch64_attr_enum:
8990 gcc_assert (arg);
8991 bool valid;
8992 int value;
8993 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8994 &value, CL_TARGET);
8995 if (valid)
8997 set_option (&global_options, NULL, p_attr->opt_num, value,
8998 NULL, DK_UNSPECIFIED, input_location,
8999 global_dc);
9001 else
9003 error ("target %s %s=%s is not valid",
9004 pragma_or_attr, str_to_check, arg);
9006 break;
9008 default:
9009 gcc_unreachable ();
9013 /* If we reached here we either have found an attribute and validated
9014 it or didn't match any. If we matched an attribute but its arguments
9015 were malformed we will have returned false already. */
9016 return found;
9019 /* Count how many times the character C appears in
9020 NULL-terminated string STR. */
9022 static unsigned int
9023 num_occurences_in_str (char c, char *str)
9025 unsigned int res = 0;
9026 while (*str != '\0')
9028 if (*str == c)
9029 res++;
9031 str++;
9034 return res;
9037 /* Parse the tree in ARGS that contains the target attribute information
9038 and update the global target options space. PRAGMA_OR_ATTR is a string
9039 to be used in error messages, specifying whether this is processing
9040 a target attribute or a target pragma. */
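/* For instance, attribute ((target ("arch=armv8-a+crc,no-omit-leaf-frame-pointer")))
   arrives here as a single STRING_CST; the loop below splits it on ',' and
   feeds each piece to aarch64_process_one_target_attr, while the comma count
   check rejects strings such as "attr1,,attr2".  */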
9042 bool
9043 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9045 if (TREE_CODE (args) == TREE_LIST)
9049 tree head = TREE_VALUE (args);
9050 if (head)
9052 if (!aarch64_process_target_attr (head, pragma_or_attr))
9053 return false;
9055 args = TREE_CHAIN (args);
9056 } while (args);
9058 return true;
9060 /* We expect to find a string to parse. */
9061 gcc_assert (TREE_CODE (args) == STRING_CST);
9063 size_t len = strlen (TREE_STRING_POINTER (args));
9064 char *str_to_check = (char *) alloca (len + 1);
9065 strcpy (str_to_check, TREE_STRING_POINTER (args));
9067 if (len == 0)
9069 error ("malformed target %s value", pragma_or_attr);
9070 return false;
9073 /* Used to catch empty strings between commas, i.e.
9074 attribute ((target ("attr1,,attr2"))). */
9075 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9077 /* Handle multiple target attributes separated by ','. */
9078 char *token = strtok (str_to_check, ",");
9080 unsigned int num_attrs = 0;
9081 while (token)
9083 num_attrs++;
9084 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9086 error ("target %s %qs is invalid", pragma_or_attr, token);
9087 return false;
9090 token = strtok (NULL, ",");
9093 if (num_attrs != num_commas + 1)
9095 error ("malformed target %s list %qs",
9096 pragma_or_attr, TREE_STRING_POINTER (args));
9097 return false;
9100 return true;
9103 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9104 process attribute ((target ("..."))). */
9106 static bool
9107 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9109 struct cl_target_option cur_target;
9110 bool ret;
9111 tree old_optimize;
9112 tree new_target, new_optimize;
9113 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9115 /* If what we're processing is the current pragma string then the
9116 target option node is already stored in target_option_current_node
9117 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9118 having to re-parse the string. This is especially useful to keep
9119 arm_neon.h compile times down since that header contains a lot
9120 of intrinsics enclosed in pragmas. */
9121 if (!existing_target && args == current_target_pragma)
9123 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9124 return true;
9126 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9128 old_optimize = build_optimization_node (&global_options);
9129 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9131 /* If the function changed the optimization levels as well as setting
9132 target options, start with the optimizations specified. */
9133 if (func_optimize && func_optimize != old_optimize)
9134 cl_optimization_restore (&global_options,
9135 TREE_OPTIMIZATION (func_optimize));
9137 /* Save the current target options to restore at the end. */
9138 cl_target_option_save (&cur_target, &global_options);
9140 /* If fndecl already has some target attributes applied to it, unpack
9141 them so that we add this attribute on top of them, rather than
9142 overwriting them. */
9143 if (existing_target)
9145 struct cl_target_option *existing_options
9146 = TREE_TARGET_OPTION (existing_target);
9148 if (existing_options)
9149 cl_target_option_restore (&global_options, existing_options);
9151 else
9152 cl_target_option_restore (&global_options,
9153 TREE_TARGET_OPTION (target_option_current_node));
9156 ret = aarch64_process_target_attr (args, "attribute");
9158 /* Set up any additional state. */
9159 if (ret)
9161 aarch64_override_options_internal (&global_options);
9162 /* Initialize SIMD builtins if we haven't already.
9163 Set current_target_pragma to NULL for the duration so that
9164 the builtin initialization code doesn't try to tag the functions
9165 being built with the attributes specified by any current pragma, thus
9166 going into an infinite recursion. */
9167 if (TARGET_SIMD)
9169 tree saved_current_target_pragma = current_target_pragma;
9170 current_target_pragma = NULL;
9171 aarch64_init_simd_builtins ();
9172 current_target_pragma = saved_current_target_pragma;
9174 new_target = build_target_option_node (&global_options);
9176 else
9177 new_target = NULL;
9179 new_optimize = build_optimization_node (&global_options);
9181 if (fndecl && ret)
9183 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9185 if (old_optimize != new_optimize)
9186 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9189 cl_target_option_restore (&global_options, &cur_target);
9191 if (old_optimize != new_optimize)
9192 cl_optimization_restore (&global_options,
9193 TREE_OPTIMIZATION (old_optimize));
9194 return ret;
9197 /* Helper for aarch64_can_inline_p. CALLER and CALLEE are tri-bool options
9198 (yes, no, DONT_CARE) with default value DEF. Return true if the
9199 combination should not block inlining. */
9201 static bool
9202 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9203 int dont_care, int def)
9205 /* If the callee doesn't care, always allow inlining. */
9206 if (callee == dont_care)
9207 return true;
9209 /* If the caller doesn't care, always allow inlining. */
9210 if (caller == dont_care)
9211 return true;
9213 /* Otherwise, allow inlining if either the callee and caller values
9214 agree, or if the callee is using the default value. */
9215 return (callee == caller || callee == def);
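/* A rough sketch of how the checks below use this helper, with
   DONT_CARE == 2: for -momit-leaf-frame-pointer (DEF == 1), a caller that
   enables it and a callee that leaves it unspecified allow inlining, as do
   a caller that disables it and a callee that enables it (the callee matches
   the default); a caller that enables it and a callee that disables it are
   rejected.  */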
9218 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9219 to inline CALLEE into CALLER based on target-specific info.
9220 Make sure that the caller and callee have compatible architectural
9221 features. Then go through the other possible target attributes
9222 and see if they can block inlining. Try not to reject always_inline
9223 callees unless they are incompatible architecturally. */
9225 static bool
9226 aarch64_can_inline_p (tree caller, tree callee)
9228 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9229 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9231 /* If callee has no option attributes, then it is ok to inline. */
9232 if (!callee_tree)
9233 return true;
9235 struct cl_target_option *caller_opts
9236 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9237 : target_option_default_node);
9239 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9242 /* Callee's ISA flags should be a subset of the caller's. */
9243 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9244 != callee_opts->x_aarch64_isa_flags)
9245 return false;
9247 /* Allow a non-strict-aligned callee to be inlined into a strict-aligned
9248 caller, but not the other way around. */
9249 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9250 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9251 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9252 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9253 return false;
9255 bool always_inline = lookup_attribute ("always_inline",
9256 DECL_ATTRIBUTES (callee));
9258 /* If the architectural features match up and the callee is always_inline
9259 then the other attributes don't matter. */
9260 if (always_inline)
9261 return true;
9263 if (caller_opts->x_aarch64_cmodel_var
9264 != callee_opts->x_aarch64_cmodel_var)
9265 return false;
9267 if (caller_opts->x_aarch64_tls_dialect
9268 != callee_opts->x_aarch64_tls_dialect)
9269 return false;
9271 /* Honour explicit requests to workaround errata. */
9272 if (!aarch64_tribools_ok_for_inlining_p (
9273 caller_opts->x_aarch64_fix_a53_err835769,
9274 callee_opts->x_aarch64_fix_a53_err835769,
9275 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9276 return false;
9278 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9279 caller and callee and they don't match up, reject inlining. */
9280 if (!aarch64_tribools_ok_for_inlining_p (
9281 caller_opts->x_flag_omit_leaf_frame_pointer,
9282 callee_opts->x_flag_omit_leaf_frame_pointer,
9283 2, 1))
9284 return false;
9286 /* If the callee has specific tuning overrides, respect them. */
9287 if (callee_opts->x_aarch64_override_tune_string != NULL
9288 && caller_opts->x_aarch64_override_tune_string == NULL)
9289 return false;
9291 /* If the user specified tuning override strings for the
9292 caller and callee and they don't match up, reject inlining.
9293 We just do a string compare here, we don't analyze the meaning
9294 of the string, as it would be too costly for little gain. */
9295 if (callee_opts->x_aarch64_override_tune_string
9296 && caller_opts->x_aarch64_override_tune_string
9297 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9298 caller_opts->x_aarch64_override_tune_string) != 0))
9299 return false;
9301 return true;
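/* For example, a callee marked with attribute ((target ("+crc"))) is not
   inlined into a caller compiled without +crc, because the subset check on
   x_aarch64_isa_flags above fails; even an always_inline callee is rejected
   in that case, since the architectural check comes first.  */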
9304 /* Return true if SYMBOL_REF X binds locally. */
9306 static bool
9307 aarch64_symbol_binds_local_p (const_rtx x)
9309 return (SYMBOL_REF_DECL (x)
9310 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9311 : SYMBOL_REF_LOCAL_P (x));
9314 /* Return true if SYMBOL_REF X is thread local. */
9315 static bool
9316 aarch64_tls_symbol_p (rtx x)
9318 if (! TARGET_HAVE_TLS)
9319 return false;
9321 if (GET_CODE (x) != SYMBOL_REF)
9322 return false;
9324 return SYMBOL_REF_TLS_MODEL (x) != 0;
9327 /* Classify a TLS symbol into one of the TLS kinds. */
9328 enum aarch64_symbol_type
9329 aarch64_classify_tls_symbol (rtx x)
9331 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9333 switch (tls_kind)
9335 case TLS_MODEL_GLOBAL_DYNAMIC:
9336 case TLS_MODEL_LOCAL_DYNAMIC:
9337 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9339 case TLS_MODEL_INITIAL_EXEC:
9340 switch (aarch64_cmodel)
9342 case AARCH64_CMODEL_TINY:
9343 case AARCH64_CMODEL_TINY_PIC:
9344 return SYMBOL_TINY_TLSIE;
9345 default:
9346 return SYMBOL_SMALL_TLSIE;
9349 case TLS_MODEL_LOCAL_EXEC:
9350 if (aarch64_tls_size == 12)
9351 return SYMBOL_TLSLE12;
9352 else if (aarch64_tls_size == 24)
9353 return SYMBOL_TLSLE24;
9354 else if (aarch64_tls_size == 32)
9355 return SYMBOL_TLSLE32;
9356 else if (aarch64_tls_size == 48)
9357 return SYMBOL_TLSLE48;
9358 else
9359 gcc_unreachable ();
9361 case TLS_MODEL_EMULATED:
9362 case TLS_MODEL_NONE:
9363 return SYMBOL_FORCE_TO_MEM;
9365 default:
9366 gcc_unreachable ();
9370 /* Return the method that should be used to access SYMBOL_REF or
9371 LABEL_REF X. */
9373 enum aarch64_symbol_type
9374 aarch64_classify_symbol (rtx x, rtx offset)
9376 if (GET_CODE (x) == LABEL_REF)
9378 switch (aarch64_cmodel)
9380 case AARCH64_CMODEL_LARGE:
9381 return SYMBOL_FORCE_TO_MEM;
9383 case AARCH64_CMODEL_TINY_PIC:
9384 case AARCH64_CMODEL_TINY:
9385 return SYMBOL_TINY_ABSOLUTE;
9387 case AARCH64_CMODEL_SMALL_SPIC:
9388 case AARCH64_CMODEL_SMALL_PIC:
9389 case AARCH64_CMODEL_SMALL:
9390 return SYMBOL_SMALL_ABSOLUTE;
9392 default:
9393 gcc_unreachable ();
9397 if (GET_CODE (x) == SYMBOL_REF)
9399 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
9401 /* This is alright even in PIC code as the constant
9402 pool reference is always PC relative and within
9403 the same translation unit. */
9404 if (nopcrelative_literal_loads
9405 && CONSTANT_POOL_ADDRESS_P (x))
9406 return SYMBOL_SMALL_ABSOLUTE;
9407 else
9408 return SYMBOL_FORCE_TO_MEM;
9411 if (aarch64_tls_symbol_p (x))
9412 return aarch64_classify_tls_symbol (x);
9414 switch (aarch64_cmodel)
9416 case AARCH64_CMODEL_TINY:
9417 /* When we retrieve a symbol + offset address, we have to make sure
9418 the offset does not cause overflow of the final address. But
9419 we have no way of knowing the address of the symbol at compile time
9420 so we can't accurately say if the distance between the PC and
9421 symbol + offset is outside the addressable range of +/-1M in the
9422 TINY code model. So we rely on images not being greater than
9423 1M, cap the offset at 1M, and anything beyond 1M will have to
9424 be loaded using an alternative mechanism. */
9425 if (SYMBOL_REF_WEAK (x)
9426 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9427 return SYMBOL_FORCE_TO_MEM;
9428 return SYMBOL_TINY_ABSOLUTE;
9430 case AARCH64_CMODEL_SMALL:
9431 /* Same reasoning as the tiny code model, but the offset cap here is
9432 4G. */
9433 if (SYMBOL_REF_WEAK (x)
9434 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9435 HOST_WIDE_INT_C (4294967264)))
9436 return SYMBOL_FORCE_TO_MEM;
9437 return SYMBOL_SMALL_ABSOLUTE;
9439 case AARCH64_CMODEL_TINY_PIC:
9440 if (!aarch64_symbol_binds_local_p (x))
9441 return SYMBOL_TINY_GOT;
9442 return SYMBOL_TINY_ABSOLUTE;
9444 case AARCH64_CMODEL_SMALL_SPIC:
9445 case AARCH64_CMODEL_SMALL_PIC:
9446 if (!aarch64_symbol_binds_local_p (x))
9447 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9448 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9449 return SYMBOL_SMALL_ABSOLUTE;
9451 default:
9452 gcc_unreachable ();
9456 /* By default push everything into the constant pool. */
9457 return SYMBOL_FORCE_TO_MEM;
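/* As a rough example: with -mcmodel=tiny a reference to sym + 0x200000
   exceeds the +/-1M offset cap above and is forced to memory, while with
   the default small model the same reference to a non-weak symbol stays
   SYMBOL_SMALL_ABSOLUTE because it is well within the 4G cap.  */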
9460 bool
9461 aarch64_constant_address_p (rtx x)
9463 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9466 bool
9467 aarch64_legitimate_pic_operand_p (rtx x)
9469 if (GET_CODE (x) == SYMBOL_REF
9470 || (GET_CODE (x) == CONST
9471 && GET_CODE (XEXP (x, 0)) == PLUS
9472 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9473 return false;
9475 return true;
9478 /* Return true if X holds either a valid quarter-precision immediate
9479 (as accepted by FMOV) or the floating-point constant +0.0. */
9480 static bool
9481 aarch64_valid_floating_const (machine_mode mode, rtx x)
9483 if (!CONST_DOUBLE_P (x))
9484 return false;
9486 if (aarch64_float_const_zero_rtx_p (x))
9487 return true;
9489 /* For TFmode we only handle 0.0, accepted above; otherwise require SF/DFmode. */
9490 if (!(mode == SFmode || mode == DFmode))
9491 return false;
9493 return aarch64_float_const_representable_p (x);
9496 static bool
9497 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9499 /* Do not allow vector struct mode constants. We could support
9500 0 and -1 easily, but they need support in aarch64-simd.md. */
9501 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9502 return false;
9504 /* This could probably go away because
9505 we now decompose CONST_INTs according to expand_mov_immediate. */
9506 if ((GET_CODE (x) == CONST_VECTOR
9507 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9508 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9509 return !targetm.cannot_force_const_mem (mode, x);
9511 if (GET_CODE (x) == HIGH
9512 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9513 return true;
9515 return aarch64_constant_address_p (x);
9519 aarch64_load_tp (rtx target)
9521 if (!target
9522 || GET_MODE (target) != Pmode
9523 || !register_operand (target, Pmode))
9524 target = gen_reg_rtx (Pmode);
9526 /* Can return in any reg. */
9527 emit_insn (gen_aarch64_load_tp_hard (target));
9528 return target;
9531 /* On AAPCS systems, this is the "struct __va_list". */
9532 static GTY(()) tree va_list_type;
9534 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9535 Return the type to use as __builtin_va_list.
9537 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9539 struct __va_list
9541 void *__stack;
9542 void *__gr_top;
9543 void *__vr_top;
9544 int __gr_offs;
9545 int __vr_offs;
9546 }; */
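/* A sketch of how these fields are filled in by
   aarch64_expand_builtin_va_start below, assuming the usual AArch64 values
   NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8, UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16: if the named arguments consume 2 general registers
   and 1 vector register, then

     __gr_offs = -(8 - 2) * 8  = -48
     __vr_offs = -(8 - 1) * 16 = -112

   and both offsets grow towards zero as va_arg consumes registers.  */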
9548 static tree
9549 aarch64_build_builtin_va_list (void)
9551 tree va_list_name;
9552 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9554 /* Create the type. */
9555 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9556 /* Give it the required name. */
9557 va_list_name = build_decl (BUILTINS_LOCATION,
9558 TYPE_DECL,
9559 get_identifier ("__va_list"),
9560 va_list_type);
9561 DECL_ARTIFICIAL (va_list_name) = 1;
9562 TYPE_NAME (va_list_type) = va_list_name;
9563 TYPE_STUB_DECL (va_list_type) = va_list_name;
9565 /* Create the fields. */
9566 f_stack = build_decl (BUILTINS_LOCATION,
9567 FIELD_DECL, get_identifier ("__stack"),
9568 ptr_type_node);
9569 f_grtop = build_decl (BUILTINS_LOCATION,
9570 FIELD_DECL, get_identifier ("__gr_top"),
9571 ptr_type_node);
9572 f_vrtop = build_decl (BUILTINS_LOCATION,
9573 FIELD_DECL, get_identifier ("__vr_top"),
9574 ptr_type_node);
9575 f_groff = build_decl (BUILTINS_LOCATION,
9576 FIELD_DECL, get_identifier ("__gr_offs"),
9577 integer_type_node);
9578 f_vroff = build_decl (BUILTINS_LOCATION,
9579 FIELD_DECL, get_identifier ("__vr_offs"),
9580 integer_type_node);
9582 DECL_ARTIFICIAL (f_stack) = 1;
9583 DECL_ARTIFICIAL (f_grtop) = 1;
9584 DECL_ARTIFICIAL (f_vrtop) = 1;
9585 DECL_ARTIFICIAL (f_groff) = 1;
9586 DECL_ARTIFICIAL (f_vroff) = 1;
9588 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9589 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9590 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9591 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9592 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9594 TYPE_FIELDS (va_list_type) = f_stack;
9595 DECL_CHAIN (f_stack) = f_grtop;
9596 DECL_CHAIN (f_grtop) = f_vrtop;
9597 DECL_CHAIN (f_vrtop) = f_groff;
9598 DECL_CHAIN (f_groff) = f_vroff;
9600 /* Compute its layout. */
9601 layout_type (va_list_type);
9603 return va_list_type;
9606 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9607 static void
9608 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9610 const CUMULATIVE_ARGS *cum;
9611 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9612 tree stack, grtop, vrtop, groff, vroff;
9613 tree t;
9614 int gr_save_area_size;
9615 int vr_save_area_size;
9616 int vr_offset;
9618 cum = &crtl->args.info;
9619 gr_save_area_size
9620 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
9621 vr_save_area_size
9622 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
9624 if (!TARGET_FLOAT)
9626 gcc_assert (cum->aapcs_nvrn == 0);
9627 vr_save_area_size = 0;
9630 f_stack = TYPE_FIELDS (va_list_type_node);
9631 f_grtop = DECL_CHAIN (f_stack);
9632 f_vrtop = DECL_CHAIN (f_grtop);
9633 f_groff = DECL_CHAIN (f_vrtop);
9634 f_vroff = DECL_CHAIN (f_groff);
9636 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9637 NULL_TREE);
9638 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9639 NULL_TREE);
9640 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9641 NULL_TREE);
9642 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9643 NULL_TREE);
9644 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9645 NULL_TREE);
9647 /* Emit code to initialize STACK, which points to the next varargs stack
9648 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9649 by named arguments. STACK is 8-byte aligned. */
9650 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9651 if (cum->aapcs_stack_size > 0)
9652 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9653 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9654 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9656 /* Emit code to initialize GRTOP, the top of the GR save area.
9657 virtual_incoming_args_rtx should have been 16 byte aligned. */
9658 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9659 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9662 /* Emit code to initialize VRTOP, the top of the VR save area.
9663 This address is gr_save_area_bytes below GRTOP, rounded
9664 down to the next 16-byte boundary. */
9665 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9666 vr_offset = ROUND_UP (gr_save_area_size,
9667 STACK_BOUNDARY / BITS_PER_UNIT);
9669 if (vr_offset)
9670 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9671 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9672 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9674 /* Emit code to initialize GROFF, the offset from GRTOP of the
9675 next GPR argument. */
9676 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9677 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9678 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9680 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9681 of the next VR argument. */
9682 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9683 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9684 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9687 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9689 static tree
9690 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9691 gimple_seq *post_p ATTRIBUTE_UNUSED)
9693 tree addr;
9694 bool indirect_p;
9695 bool is_ha; /* is HFA or HVA. */
9696 bool dw_align; /* double-word align. */
9697 machine_mode ag_mode = VOIDmode;
9698 int nregs;
9699 machine_mode mode;
9701 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9702 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9703 HOST_WIDE_INT size, rsize, adjust, align;
9704 tree t, u, cond1, cond2;
9706 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9707 if (indirect_p)
9708 type = build_pointer_type (type);
9710 mode = TYPE_MODE (type);
9712 f_stack = TYPE_FIELDS (va_list_type_node);
9713 f_grtop = DECL_CHAIN (f_stack);
9714 f_vrtop = DECL_CHAIN (f_grtop);
9715 f_groff = DECL_CHAIN (f_vrtop);
9716 f_vroff = DECL_CHAIN (f_groff);
9718 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9719 f_stack, NULL_TREE);
9720 size = int_size_in_bytes (type);
9721 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9723 dw_align = false;
9724 adjust = 0;
9725 if (aarch64_vfp_is_call_or_return_candidate (mode,
9726 type,
9727 &ag_mode,
9728 &nregs,
9729 &is_ha))
9731 /* TYPE passed in fp/simd registers. */
9732 if (!TARGET_FLOAT)
9733 aarch64_err_no_fpadvsimd (mode, "varargs");
9735 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9736 unshare_expr (valist), f_vrtop, NULL_TREE);
9737 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9738 unshare_expr (valist), f_vroff, NULL_TREE);
9740 rsize = nregs * UNITS_PER_VREG;
9742 if (is_ha)
9744 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9745 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9747 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9748 && size < UNITS_PER_VREG)
9750 adjust = UNITS_PER_VREG - size;
9753 else
9755 /* TYPE passed in general registers. */
9756 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9757 unshare_expr (valist), f_grtop, NULL_TREE);
9758 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9759 unshare_expr (valist), f_groff, NULL_TREE);
9760 rsize = ROUND_UP (size, UNITS_PER_WORD);
9761 nregs = rsize / UNITS_PER_WORD;
9763 if (align > 8)
9764 dw_align = true;
9766 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9767 && size < UNITS_PER_WORD)
9769 adjust = UNITS_PER_WORD - size;
9773 /* Get a local temporary for the field value. */
9774 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9776 /* Emit code to branch if off >= 0. */
9777 t = build2 (GE_EXPR, boolean_type_node, off,
9778 build_int_cst (TREE_TYPE (off), 0));
9779 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9781 if (dw_align)
9783 /* Emit: offs = (offs + 15) & -16. */
9784 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9785 build_int_cst (TREE_TYPE (off), 15));
9786 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9787 build_int_cst (TREE_TYPE (off), -16));
9788 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9790 else
9791 roundup = NULL;
9793 /* Update ap.__[g|v]r_offs */
9794 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9795 build_int_cst (TREE_TYPE (off), rsize));
9796 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9798 /* String up. */
9799 if (roundup)
9800 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9802 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9803 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9804 build_int_cst (TREE_TYPE (f_off), 0));
9805 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9807 /* String up: make sure the assignment happens before the use. */
9808 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9809 COND_EXPR_ELSE (cond1) = t;
9811 /* Prepare the trees handling the argument that is passed on the stack;
9812 the top level node will be stored in ON_STACK. */
9813 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9814 if (align > 8)
9816 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9817 t = fold_convert (intDI_type_node, arg);
9818 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9819 build_int_cst (TREE_TYPE (t), 15));
9820 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9821 build_int_cst (TREE_TYPE (t), -16));
9822 t = fold_convert (TREE_TYPE (arg), t);
9823 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9825 else
9826 roundup = NULL;
9827 /* Advance ap.__stack */
9828 t = fold_convert (intDI_type_node, arg);
9829 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9830 build_int_cst (TREE_TYPE (t), size + 7));
9831 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9832 build_int_cst (TREE_TYPE (t), -8));
9833 t = fold_convert (TREE_TYPE (arg), t);
9834 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9835 /* String up roundup and advance. */
9836 if (roundup)
9837 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9838 /* String up with arg */
9839 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9840 /* Big-endianness related address adjustment. */
9841 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9842 && size < UNITS_PER_WORD)
9844 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9845 size_int (UNITS_PER_WORD - size));
9846 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9849 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9850 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9852 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9853 t = off;
9854 if (adjust)
9855 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9856 build_int_cst (TREE_TYPE (off), adjust));
9858 t = fold_convert (sizetype, t);
9859 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9861 if (is_ha)
9863 /* type ha; // treat as "struct {ftype field[n];}"
9864 ... [computing offs]
9865 for (i = 0; i < nregs; ++i, offs += 16)
9866 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9867 return ha; */
9868 int i;
9869 tree tmp_ha, field_t, field_ptr_t;
9871 /* Declare a local variable. */
9872 tmp_ha = create_tmp_var_raw (type, "ha");
9873 gimple_add_tmp_var (tmp_ha);
9875 /* Establish the base type. */
9876 switch (ag_mode)
9878 case SFmode:
9879 field_t = float_type_node;
9880 field_ptr_t = float_ptr_type_node;
9881 break;
9882 case DFmode:
9883 field_t = double_type_node;
9884 field_ptr_t = double_ptr_type_node;
9885 break;
9886 case TFmode:
9887 field_t = long_double_type_node;
9888 field_ptr_t = long_double_ptr_type_node;
9889 break;
9890 /* Half-precision and quad-precision floats are not fully supported yet.
9891 Enable the following code once that support is complete; the correct
9892 type node for __fp16 * still needs to be found. */
9893 #if 0
9894 case HFmode:
9895 field_t = float_type_node;
9896 field_ptr_t = float_ptr_type_node;
9897 break;
9898 #endif
9899 case V2SImode:
9900 case V4SImode:
9902 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9903 field_t = build_vector_type_for_mode (innertype, ag_mode);
9904 field_ptr_t = build_pointer_type (field_t);
9906 break;
9907 default:
9908 gcc_assert (0);
9911 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9912 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9913 addr = t;
9914 t = fold_convert (field_ptr_t, addr);
9915 t = build2 (MODIFY_EXPR, field_t,
9916 build1 (INDIRECT_REF, field_t, tmp_ha),
9917 build1 (INDIRECT_REF, field_t, t));
9919 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9920 for (i = 1; i < nregs; ++i)
9922 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9923 u = fold_convert (field_ptr_t, addr);
9924 u = build2 (MODIFY_EXPR, field_t,
9925 build2 (MEM_REF, field_t, tmp_ha,
9926 build_int_cst (field_ptr_t,
9927 (i *
9928 int_size_in_bytes (field_t)))),
9929 build1 (INDIRECT_REF, field_t, u));
9930 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9933 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9934 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9937 COND_EXPR_ELSE (cond2) = t;
9938 addr = fold_convert (build_pointer_type (type), cond1);
9939 addr = build_va_arg_indirect_ref (addr);
9941 if (indirect_p)
9942 addr = build_va_arg_indirect_ref (addr);
9944 return addr;
9947 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9949 static void
9950 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9951 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9952 int no_rtl)
9954 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9955 CUMULATIVE_ARGS local_cum;
9956 int gr_saved, vr_saved;
9958 /* The caller has advanced CUM up to, but not beyond, the last named
9959 argument. Advance a local copy of CUM past the last "real" named
9960 argument, to find out how many registers are left over. */
9961 local_cum = *cum;
9962 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
9964 /* Find out how many registers we need to save. */
9965 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9966 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9968 if (!TARGET_FLOAT)
9970 gcc_assert (local_cum.aapcs_nvrn == 0);
9971 vr_saved = 0;
9974 if (!no_rtl)
9976 if (gr_saved > 0)
9978 rtx ptr, mem;
9980 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9981 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9982 - gr_saved * UNITS_PER_WORD);
9983 mem = gen_frame_mem (BLKmode, ptr);
9984 set_mem_alias_set (mem, get_varargs_alias_set ());
9986 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9987 mem, gr_saved);
9989 if (vr_saved > 0)
9991 /* We can't use move_block_from_reg, because it will use
9992 the wrong mode, storing D regs only. */
9993 machine_mode mode = TImode;
9994 int off, i;
9996 /* Set OFF to the offset from virtual_incoming_args_rtx of
9997 the first vector register. The VR save area lies below
9998 the GR one, and is aligned to 16 bytes. */
9999 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10000 STACK_BOUNDARY / BITS_PER_UNIT);
10001 off -= vr_saved * UNITS_PER_VREG;
10003 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
10005 rtx ptr, mem;
10007 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10008 mem = gen_frame_mem (mode, ptr);
10009 set_mem_alias_set (mem, get_varargs_alias_set ());
10010 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
10011 off += UNITS_PER_VREG;
10016 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10017 any complication of having crtl->args.pretend_args_size changed. */
10018 cfun->machine->frame.saved_varargs_size
10019 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10020 STACK_BOUNDARY / BITS_PER_UNIT)
10021 + vr_saved * UNITS_PER_VREG);
10024 static void
10025 aarch64_conditional_register_usage (void)
10027 int i;
10028 if (!TARGET_FLOAT)
10030 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10032 fixed_regs[i] = 1;
10033 call_used_regs[i] = 1;
10038 /* Walk down the type tree of TYPE counting consecutive base elements.
10039 If *MODEP is VOIDmode, then set it to the first valid floating point
10040 type. If a non-floating point type is found, or if a floating point
10041 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10042 otherwise return the count in the sub-tree. */
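/* Illustrative outcomes, assuming float32x4_t is the usual 16-byte NEON
   vector type:

     struct { double x, y, z; }      -> 3 elements of DFmode
     struct { float32x4_t a, b; }    -> 2 elements, represented as V4SImode
     struct { float f; double d; }   -> -1 (mixed base types)  */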
10043 static int
10044 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10046 machine_mode mode;
10047 HOST_WIDE_INT size;
10049 switch (TREE_CODE (type))
10051 case REAL_TYPE:
10052 mode = TYPE_MODE (type);
10053 if (mode != DFmode && mode != SFmode && mode != TFmode)
10054 return -1;
10056 if (*modep == VOIDmode)
10057 *modep = mode;
10059 if (*modep == mode)
10060 return 1;
10062 break;
10064 case COMPLEX_TYPE:
10065 mode = TYPE_MODE (TREE_TYPE (type));
10066 if (mode != DFmode && mode != SFmode && mode != TFmode)
10067 return -1;
10069 if (*modep == VOIDmode)
10070 *modep = mode;
10072 if (*modep == mode)
10073 return 2;
10075 break;
10077 case VECTOR_TYPE:
10078 /* Use V2SImode and V4SImode as representatives of all 64-bit
10079 and 128-bit vector types. */
10080 size = int_size_in_bytes (type);
10081 switch (size)
10083 case 8:
10084 mode = V2SImode;
10085 break;
10086 case 16:
10087 mode = V4SImode;
10088 break;
10089 default:
10090 return -1;
10093 if (*modep == VOIDmode)
10094 *modep = mode;
10096 /* Vector modes are considered to be opaque: two vectors are
10097 equivalent for the purposes of being homogeneous aggregates
10098 if they are the same size. */
10099 if (*modep == mode)
10100 return 1;
10102 break;
10104 case ARRAY_TYPE:
10106 int count;
10107 tree index = TYPE_DOMAIN (type);
10109 /* Can't handle incomplete types nor sizes that are not
10110 fixed. */
10111 if (!COMPLETE_TYPE_P (type)
10112 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10113 return -1;
10115 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10116 if (count == -1
10117 || !index
10118 || !TYPE_MAX_VALUE (index)
10119 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10120 || !TYPE_MIN_VALUE (index)
10121 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10122 || count < 0)
10123 return -1;
10125 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10126 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10128 /* There must be no padding. */
10129 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10130 return -1;
10132 return count;
10135 case RECORD_TYPE:
10137 int count = 0;
10138 int sub_count;
10139 tree field;
10141 /* Can't handle incomplete types nor sizes that are not
10142 fixed. */
10143 if (!COMPLETE_TYPE_P (type)
10144 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10145 return -1;
10147 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10149 if (TREE_CODE (field) != FIELD_DECL)
10150 continue;
10152 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10153 if (sub_count < 0)
10154 return -1;
10155 count += sub_count;
10158 /* There must be no padding. */
10159 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10160 return -1;
10162 return count;
10165 case UNION_TYPE:
10166 case QUAL_UNION_TYPE:
10168 /* These aren't very interesting except in a degenerate case. */
10169 int count = 0;
10170 int sub_count;
10171 tree field;
10173 /* Can't handle incomplete types nor sizes that are not
10174 fixed. */
10175 if (!COMPLETE_TYPE_P (type)
10176 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10177 return -1;
10179 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10181 if (TREE_CODE (field) != FIELD_DECL)
10182 continue;
10184 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10185 if (sub_count < 0)
10186 return -1;
10187 count = count > sub_count ? count : sub_count;
10190 /* There must be no padding. */
10191 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10192 return -1;
10194 return count;
10197 default:
10198 break;
10201 return -1;
10204 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10205 type as described in AAPCS64 \S 4.1.2.
10207 See the comment above aarch64_composite_type_p for the notes on MODE. */
10209 static bool
10210 aarch64_short_vector_p (const_tree type,
10211 machine_mode mode)
10213 HOST_WIDE_INT size = -1;
10215 if (type && TREE_CODE (type) == VECTOR_TYPE)
10216 size = int_size_in_bytes (type);
10217 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10218 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10219 size = GET_MODE_SIZE (mode);
10221 return (size == 8 || size == 16);
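/* For example, the 8-byte int32x2_t and the 16-byte float32x4_t NEON types
   both count as short vectors here, so aarch64_composite_type_p below treats
   them as plain vector values rather than as composite types.  */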
10224 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10225 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10226 array types. The C99 floating-point complex types are also considered
10227 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10228 types, which are GCC extensions and out of the scope of AAPCS64, are
10229 treated as composite types here as well.
10231 Note that MODE itself is not sufficient in determining whether a type
10232 is such a composite type or not. This is because
10233 stor-layout.c:compute_record_mode may have already changed the MODE
10234 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10235 structure with only one field may have its MODE set to the mode of the
10236 field. Also an integer mode whose size matches the size of the
10237 RECORD_TYPE type may be used to substitute the original mode
10238 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10239 solely relied on. */
10241 static bool
10242 aarch64_composite_type_p (const_tree type,
10243 machine_mode mode)
10245 if (aarch64_short_vector_p (type, mode))
10246 return false;
10248 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10249 return true;
10251 if (mode == BLKmode
10252 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10253 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10254 return true;
10256 return false;
10259 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10260 shall be passed or returned in simd/fp register(s) (provided these
10261 parameter passing registers are available).
10263 Upon successful return, *COUNT returns the number of needed registers,
10264 *BASE_MODE returns the mode of the individual register and, when IS_HA
10265 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10266 floating-point aggregate or a homogeneous short-vector aggregate. */
10268 static bool
10269 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10270 const_tree type,
10271 machine_mode *base_mode,
10272 int *count,
10273 bool *is_ha)
10275 machine_mode new_mode = VOIDmode;
10276 bool composite_p = aarch64_composite_type_p (type, mode);
10278 if (is_ha != NULL) *is_ha = false;
10280 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10281 || aarch64_short_vector_p (type, mode))
10283 *count = 1;
10284 new_mode = mode;
10286 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10288 if (is_ha != NULL) *is_ha = true;
10289 *count = 2;
10290 new_mode = GET_MODE_INNER (mode);
10292 else if (type && composite_p)
10294 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10296 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10298 if (is_ha != NULL) *is_ha = true;
10299 *count = ag_count;
10301 else
10302 return false;
10304 else
10305 return false;
10307 *base_mode = new_mode;
10308 return true;
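/* Illustrative example: for a homogeneous floating-point aggregate such as

     struct rgb { float r, g, b; };

   the code above would set *COUNT to 3, *BASE_MODE to SFmode and *IS_HA to
   true, so the argument can be passed in three consecutive S registers when
   SIMD/FP argument registers are available.  */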
10311 /* Implement TARGET_STRUCT_VALUE_RTX. */
10313 static rtx
10314 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10315 int incoming ATTRIBUTE_UNUSED)
10317 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10320 /* Implements target hook vector_mode_supported_p. */
10321 static bool
10322 aarch64_vector_mode_supported_p (machine_mode mode)
10324 if (TARGET_SIMD
10325 && (mode == V4SImode || mode == V8HImode
10326 || mode == V16QImode || mode == V2DImode
10327 || mode == V2SImode || mode == V4HImode
10328 || mode == V8QImode || mode == V2SFmode
10329 || mode == V4SFmode || mode == V2DFmode
10330 || mode == V4HFmode || mode == V8HFmode
10331 || mode == V1DFmode))
10332 return true;
10334 return false;
10337 /* Return the appropriate SIMD container mode
10338 for MODE within a vector of WIDTH bits. */
10339 static machine_mode
10340 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10342 gcc_assert (width == 64 || width == 128);
10343 if (TARGET_SIMD)
10345 if (width == 128)
10346 switch (mode)
10348 case DFmode:
10349 return V2DFmode;
10350 case SFmode:
10351 return V4SFmode;
10352 case SImode:
10353 return V4SImode;
10354 case HImode:
10355 return V8HImode;
10356 case QImode:
10357 return V16QImode;
10358 case DImode:
10359 return V2DImode;
10360 default:
10361 break;
10363 else
10364 switch (mode)
10366 case SFmode:
10367 return V2SFmode;
10368 case SImode:
10369 return V2SImode;
10370 case HImode:
10371 return V4HImode;
10372 case QImode:
10373 return V8QImode;
10374 default:
10375 break;
10378 return word_mode;
10381 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10382 static machine_mode
10383 aarch64_preferred_simd_mode (machine_mode mode)
10385 return aarch64_simd_container_mode (mode, 128);
10388 /* Return the bitmask of possible vector sizes for the vectorizer
10389 to iterate over. */
10390 static unsigned int
10391 aarch64_autovectorize_vector_sizes (void)
10393 return (16 | 8);
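/* Illustrative note: the value returned above is a bitmask of vector sizes
   in bytes, so (16 | 8) asks the vectorizer to try 128-bit Q-register
   vectors first and then fall back to 64-bit D-register vectors.  */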
10396 /* Implement TARGET_MANGLE_TYPE. */
10398 static const char *
10399 aarch64_mangle_type (const_tree type)
10401 /* The AArch64 ABI documents say that "__va_list" has to be
10402    mangled as if it is in the "std" namespace.  */
10403 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10404 return "St9__va_list";
10406 /* Half-precision float. */
10407 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10408 return "Dh";
10410 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10411 builtin types. */
10412 if (TYPE_NAME (type) != NULL)
10413 return aarch64_mangle_builtin_type (type);
10415 /* Use the default mangling. */
10416 return NULL;
10420 /* Return true if the rtx_insn contains a MEM RTX somewhere
10421 in it. */
10423 static bool
10424 has_memory_op (rtx_insn *mem_insn)
10426 subrtx_iterator::array_type array;
10427 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10428 if (MEM_P (*iter))
10429 return true;
10431 return false;
10434 /* Find the first rtx_insn before insn that will generate an assembly
10435 instruction. */
10437 static rtx_insn *
10438 aarch64_prev_real_insn (rtx_insn *insn)
10440 if (!insn)
10441 return NULL;
10445 insn = prev_real_insn (insn);
10447 while (insn && recog_memoized (insn) < 0);
10449 return insn;
10452 static bool
10453 is_madd_op (enum attr_type t1)
10455 unsigned int i;
10456 /* A number of these may be AArch32 only. */
10457 enum attr_type mlatypes[] = {
10458 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10459 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10460   TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10463 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10465 if (t1 == mlatypes[i])
10466 return true;
10469 return false;
10472 /* Check if there is a register dependency between a load and the insn
10473 for which we hold recog_data. */
10475 static bool
10476 dep_between_memop_and_curr (rtx memop)
10478 rtx load_reg;
10479 int opno;
10481 gcc_assert (GET_CODE (memop) == SET);
10483 if (!REG_P (SET_DEST (memop)))
10484 return false;
10486 load_reg = SET_DEST (memop);
10487 for (opno = 1; opno < recog_data.n_operands; opno++)
10489 rtx operand = recog_data.operand[opno];
10490 if (REG_P (operand)
10491 && reg_overlap_mentioned_p (load_reg, operand))
10492 return true;
10495 return false;
10499 /* When working around the Cortex-A53 erratum 835769,
10500 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10501 instruction and has a preceding memory instruction such that a NOP
10502 should be inserted between them. */
10504 bool
10505 aarch64_madd_needs_nop (rtx_insn* insn)
10507 enum attr_type attr_type;
10508 rtx_insn *prev;
10509 rtx body;
10511 if (!TARGET_FIX_ERR_A53_835769)
10512 return false;
10514 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10515 return false;
10517 attr_type = get_attr_type (insn);
10518 if (!is_madd_op (attr_type))
10519 return false;
10521 prev = aarch64_prev_real_insn (insn);
10522 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10523 Restore recog state to INSN to avoid state corruption. */
10524 extract_constrain_insn_cached (insn);
10526 if (!prev || !has_memory_op (prev))
10527 return false;
10529 body = single_set (prev);
10531 /* If the previous insn is a memory op and there is no dependency between
10532 it and the DImode madd, emit a NOP between them. If body is NULL then we
10533 have a complex memory operation, probably a load/store pair.
10534 Be conservative for now and emit a NOP. */
10535 if (GET_MODE (recog_data.operand[0]) == DImode
10536 && (!body || !dep_between_memop_and_curr (body)))
10537 return true;
10539 return false;
10544 /* Implement FINAL_PRESCAN_INSN. */
10546 void
10547 aarch64_final_prescan_insn (rtx_insn *insn)
10549 if (aarch64_madd_needs_nop (insn))
10550 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
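/* Illustrative example: with -mfix-cortex-a53-835769 the two functions above
   may turn a sequence along the lines of

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   into

     ldr  x1, [x2]
     nop  // between mem op and mult-accumulate
     madd x0, x3, x4, x5

   so that the 64-bit multiply-accumulate no longer immediately follows the
   memory access.  */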
10554 /* Return the equivalent letter for size. */
10555 static char
10556 sizetochar (int size)
10558 switch (size)
10560 case 64: return 'd';
10561 case 32: return 's';
10562 case 16: return 'h';
10563 case 8 : return 'b';
10564 default: gcc_unreachable ();
10568 /* Return true iff x is a uniform vector of floating-point
10569 constants, and the constant can be represented in
10570    quarter-precision form.  Note that, since aarch64_float_const_representable_p
10571    rejects both +0.0 and -0.0, those values are rejected here as well.  */
10572 static bool
10573 aarch64_vect_float_const_representable_p (rtx x)
10575 rtx elt;
10576 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10577 && const_vec_duplicate_p (x, &elt)
10578 && aarch64_float_const_representable_p (elt));
10581 /* Return true if OP is a valid AdvSIMD immediate for MODE, false otherwise; if INFO is nonnull, describe how the immediate should be generated in *INFO.  */
10582 bool
10583 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10584 struct simd_immediate_info *info)
10586 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10587 matches = 1; \
10588 for (i = 0; i < idx; i += (STRIDE)) \
10589 if (!(TEST)) \
10590 matches = 0; \
10591 if (matches) \
10593 immtype = (CLASS); \
10594 elsize = (ELSIZE); \
10595 eshift = (SHIFT); \
10596 emvn = (NEG); \
10597 break; \
10600 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10601 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10602 unsigned char bytes[16];
10603 int immtype = -1, matches;
10604 unsigned int invmask = inverse ? 0xff : 0;
10605 int eshift, emvn;
10607 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10609 if (! (aarch64_simd_imm_zero_p (op, mode)
10610 || aarch64_vect_float_const_representable_p (op)))
10611 return false;
10613 if (info)
10615 info->value = CONST_VECTOR_ELT (op, 0);
10616 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10617 info->mvn = false;
10618 info->shift = 0;
10621 return true;
10624 /* Splat vector constant out into a byte vector. */
10625 for (i = 0; i < n_elts; i++)
10627 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10628 it must be laid out in the vector register in reverse order. */
10629 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10630 unsigned HOST_WIDE_INT elpart;
10632 gcc_assert (CONST_INT_P (el));
10633 elpart = INTVAL (el);
10635 for (unsigned int byte = 0; byte < innersize; byte++)
10637 bytes[idx++] = (elpart & 0xff) ^ invmask;
10638 elpart >>= BITS_PER_UNIT;
10643 /* Sanity check. */
10644 gcc_assert (idx == GET_MODE_SIZE (mode));
10648 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10649 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10651 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10652 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10654 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10655 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10657 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10658 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10660 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10662 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10664 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10665 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10667 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10668 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10670 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10671 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10673 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10674 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10676 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10678 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10680 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10681 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10683 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10684 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10686 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10687 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10689 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10690 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10692 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10694 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10695 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10697 while (0);
10699 if (immtype == -1)
10700 return false;
10702 if (info)
10704 info->element_width = elsize;
10705 info->mvn = emvn != 0;
10706 info->shift = eshift;
10708 unsigned HOST_WIDE_INT imm = 0;
10710 if (immtype >= 12 && immtype <= 15)
10711 info->msl = true;
10713 /* Un-invert bytes of recognized vector, if necessary. */
10714 if (invmask != 0)
10715 for (i = 0; i < idx; i++)
10716 bytes[i] ^= invmask;
10718 if (immtype == 17)
10720 /* FIXME: Broken on 32-bit H_W_I hosts. */
10721 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10723 for (i = 0; i < 8; i++)
10724 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10725 << (i * BITS_PER_UNIT);
10728 info->value = GEN_INT (imm);
10730 else
10732 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10733 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10735 /* Construct 'abcdefgh' because the assembler cannot handle
10736 generic constants. */
10737 if (info->mvn)
10738 imm = ~imm;
10739 imm = (imm >> info->shift) & 0xff;
10740 info->value = GEN_INT (imm);
10744 return true;
10745 #undef CHECK
10748 /* Check that immediate shift constants are within range.  */
10749 bool
10750 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10752 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10753 if (left)
10754 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10755 else
10756 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10759 /* Return true if X is a uniform vector where all elements
10760 are either the floating-point constant 0.0 or the
10761 integer constant 0. */
10762 bool
10763 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10765 return x == CONST0_RTX (mode);
10769 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10770 operation of width WIDTH at bit position POS. */
10773 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10775 gcc_assert (CONST_INT_P (width));
10776 gcc_assert (CONST_INT_P (pos));
10778 unsigned HOST_WIDE_INT mask
10779 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10780 return GEN_INT (mask << UINTVAL (pos));
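/* Worked example: for a zero_extract of WIDTH 8 at POS 16 the mask computed
   above is ((1 << 8) - 1) << 16 == 0xff0000, i.e. exactly the bits covered
   by the extraction.  */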
10783 bool
10784 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10786 HOST_WIDE_INT imm = INTVAL (x);
10787 int i;
10789 for (i = 0; i < 8; i++)
10791 unsigned int byte = imm & 0xff;
10792 if (byte != 0xff && byte != 0)
10793 return false;
10794 imm >>= 8;
10797 return true;
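/* Illustrative examples: the loop above accepts immediates whose bytes are
   all 0x00 or 0xff, so 0xff00ffff00ff0000 is accepted while
   0x0102030405060708 is rejected.  */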
10800 bool
10801 aarch64_mov_operand_p (rtx x, machine_mode mode)
10803 if (GET_CODE (x) == HIGH
10804 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10805 return true;
10807 if (CONST_INT_P (x))
10808 return true;
10810 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10811 return true;
10813 return aarch64_classify_symbolic_expression (x)
10814 == SYMBOL_TINY_ABSOLUTE;
10817 /* Return a const_int vector of VAL. */
10819 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10821 int nunits = GET_MODE_NUNITS (mode);
10822 rtvec v = rtvec_alloc (nunits);
10823 int i;
10825 for (i=0; i < nunits; i++)
10826 RTVEC_ELT (v, i) = GEN_INT (val);
10828 return gen_rtx_CONST_VECTOR (mode, v);
10831 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10833 bool
10834 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10836 machine_mode vmode;
10838 gcc_assert (!VECTOR_MODE_P (mode));
10839 vmode = aarch64_preferred_simd_mode (mode);
10840 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10841 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10844 /* Construct and return a PARALLEL RTX vector with elements numbering the
10845 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10846 the vector - from the perspective of the architecture. This does not
10847 line up with GCC's perspective on lane numbers, so we end up with
10848 different masks depending on our target endian-ness. The diagram
10849 below may help. We must draw the distinction when building masks
10850 which select one half of the vector. An instruction selecting
10851    architectural low-lanes for a big-endian target must be described using
10852 a mask selecting GCC high-lanes.
10854 Big-Endian Little-Endian
10856 GCC 0 1 2 3 3 2 1 0
10857 | x | x | x | x | | x | x | x | x |
10858 Architecture 3 2 1 0 3 2 1 0
10860 Low Mask: { 2, 3 } { 0, 1 }
10861 High Mask: { 0, 1 } { 2, 3 }
10865 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10867 int nunits = GET_MODE_NUNITS (mode);
10868 rtvec v = rtvec_alloc (nunits / 2);
10869 int high_base = nunits / 2;
10870 int low_base = 0;
10871 int base;
10872 rtx t1;
10873 int i;
10875 if (BYTES_BIG_ENDIAN)
10876 base = high ? low_base : high_base;
10877 else
10878 base = high ? high_base : low_base;
10880 for (i = 0; i < nunits / 2; i++)
10881 RTVEC_ELT (v, i) = GEN_INT (base + i);
10883 t1 = gen_rtx_PARALLEL (mode, v);
10884 return t1;
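/* Illustrative example: for V4SImode with HIGH == true this returns
   (parallel [2 3]) on little-endian and (parallel [0 1]) on big-endian,
   matching the diagram above.  */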
10887 /* Check OP for validity as a PARALLEL RTX vector with elements
10888 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10889 from the perspective of the architecture. See the diagram above
10890 aarch64_simd_vect_par_cnst_half for more details. */
10892 bool
10893 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10894 bool high)
10896 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10897 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10898 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10899 int i = 0;
10901 if (!VECTOR_MODE_P (mode))
10902 return false;
10904 if (count_op != count_ideal)
10905 return false;
10907 for (i = 0; i < count_ideal; i++)
10909 rtx elt_op = XVECEXP (op, 0, i);
10910 rtx elt_ideal = XVECEXP (ideal, 0, i);
10912 if (!CONST_INT_P (elt_op)
10913 || INTVAL (elt_ideal) != INTVAL (elt_op))
10914 return false;
10916 return true;
10919 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10920 HIGH (exclusive). */
10921 void
10922 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10923 const_tree exp)
10925 HOST_WIDE_INT lane;
10926 gcc_assert (CONST_INT_P (operand));
10927 lane = INTVAL (operand);
10929 if (lane < low || lane >= high)
10931 if (exp)
10932 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10933 else
10934 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10938 /* Return TRUE if OP is a valid vector addressing mode. */
10939 bool
10940 aarch64_simd_mem_operand_p (rtx op)
10942 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10943 || REG_P (XEXP (op, 0)));
10946 /* Emit a register copy from operand to operand, taking care not to
10947 early-clobber source registers in the process.
10949 COUNT is the number of components into which the copy needs to be
10950 decomposed. */
10951 void
10952 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10953 unsigned int count)
10955 unsigned int i;
10956 int rdest = REGNO (operands[0]);
10957 int rsrc = REGNO (operands[1]);
10959 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10960 || rdest < rsrc)
10961 for (i = 0; i < count; i++)
10962 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10963 gen_rtx_REG (mode, rsrc + i));
10964 else
10965 for (i = 0; i < count; i++)
10966 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10967 gen_rtx_REG (mode, rsrc + count - i - 1));
10970 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10971 one of VSTRUCT modes: OI, CI or XI. */
10973 aarch64_simd_attr_length_move (rtx_insn *insn)
10975 machine_mode mode;
10977 extract_insn_cached (insn);
10979 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10981 mode = GET_MODE (recog_data.operand[0]);
10982 switch (mode)
10984 case OImode:
10985 return 8;
10986 case CImode:
10987 return 12;
10988 case XImode:
10989 return 16;
10990 default:
10991 gcc_unreachable ();
10994 return 4;
10997 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10998 one of VSTRUCT modes: OI, CI, or XI. */
11000 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11002 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11005 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11006 alignment of a vector to 128 bits. */
11007 static HOST_WIDE_INT
11008 aarch64_simd_vector_alignment (const_tree type)
11010 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11011 return MIN (align, 128);
11014 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11015 static bool
11016 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11018 if (is_packed)
11019 return false;
11021 /* We guarantee alignment for vectors up to 128-bits. */
11022 if (tree_int_cst_compare (TYPE_SIZE (type),
11023 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11024 return false;
11026 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11027 return true;
11030 /* If VALS is a vector constant that can be loaded into a register
11031 using DUP, generate instructions to do so and return an RTX to
11032 assign to the register. Otherwise return NULL_RTX. */
11033 static rtx
11034 aarch64_simd_dup_constant (rtx vals)
11036 machine_mode mode = GET_MODE (vals);
11037 machine_mode inner_mode = GET_MODE_INNER (mode);
11038 rtx x;
11040 if (!const_vec_duplicate_p (vals, &x))
11041 return NULL_RTX;
11043 /* We can load this constant by using DUP and a constant in a
11044 single ARM register. This will be cheaper than a vector
11045 load. */
11046 x = copy_to_mode_reg (inner_mode, x);
11047 return gen_rtx_VEC_DUPLICATE (mode, x);
11051 /* Generate code to load VALS, which is a PARALLEL containing only
11052 constants (for vec_init) or CONST_VECTOR, efficiently into a
11053 register. Returns an RTX to copy into the register, or NULL_RTX
11054 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11055 static rtx
11056 aarch64_simd_make_constant (rtx vals)
11058 machine_mode mode = GET_MODE (vals);
11059 rtx const_dup;
11060 rtx const_vec = NULL_RTX;
11061 int n_elts = GET_MODE_NUNITS (mode);
11062 int n_const = 0;
11063 int i;
11065 if (GET_CODE (vals) == CONST_VECTOR)
11066 const_vec = vals;
11067 else if (GET_CODE (vals) == PARALLEL)
11069 /* A CONST_VECTOR must contain only CONST_INTs and
11070 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11071 Only store valid constants in a CONST_VECTOR. */
11072 for (i = 0; i < n_elts; ++i)
11074 rtx x = XVECEXP (vals, 0, i);
11075 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11076 n_const++;
11078 if (n_const == n_elts)
11079 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11081 else
11082 gcc_unreachable ();
11084 if (const_vec != NULL_RTX
11085 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11086 /* Load using MOVI/MVNI. */
11087 return const_vec;
11088 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11089 /* Loaded using DUP. */
11090 return const_dup;
11091 else if (const_vec != NULL_RTX)
11092 /* Load from constant pool. We can not take advantage of single-cycle
11093 LD1 because we need a PC-relative addressing mode. */
11094 return const_vec;
11095 else
11096 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11097 We can not construct an initializer. */
11098 return NULL_RTX;
11101 void
11102 aarch64_expand_vector_init (rtx target, rtx vals)
11104 machine_mode mode = GET_MODE (target);
11105 machine_mode inner_mode = GET_MODE_INNER (mode);
11106 int n_elts = GET_MODE_NUNITS (mode);
11107 int n_var = 0;
11108 rtx any_const = NULL_RTX;
11109 bool all_same = true;
11111 for (int i = 0; i < n_elts; ++i)
11113 rtx x = XVECEXP (vals, 0, i);
11114 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
11115 ++n_var;
11116 else
11117 any_const = x;
11119 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
11120 all_same = false;
11123 if (n_var == 0)
11125 rtx constant = aarch64_simd_make_constant (vals);
11126 if (constant != NULL_RTX)
11128 emit_move_insn (target, constant);
11129 return;
11133 /* Splat a single non-constant element if we can. */
11134 if (all_same)
11136 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
11137 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11138 return;
11141   /* Half the fields (or fewer) are non-constant.  Load the constant part first,
11142      then overwrite the varying fields.  Hope that this is more efficient than using the stack.  */
11143 if (n_var <= n_elts/2)
11145 rtx copy = copy_rtx (vals);
11147 /* Load constant part of vector. We really don't care what goes into the
11148 parts we will overwrite, but we're more likely to be able to load the
11149 constant efficiently if it has fewer, larger, repeating parts
11150 (see aarch64_simd_valid_immediate). */
11151 for (int i = 0; i < n_elts; i++)
11153 rtx x = XVECEXP (vals, 0, i);
11154 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11155 continue;
11156 rtx subst = any_const;
11157 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11159 /* Look in the copied vector, as more elements are const. */
11160 rtx test = XVECEXP (copy, 0, i ^ bit);
11161 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11163 subst = test;
11164 break;
11167 XVECEXP (copy, 0, i) = subst;
11169 aarch64_expand_vector_init (target, copy);
11171 /* Insert variables. */
11172 enum insn_code icode = optab_handler (vec_set_optab, mode);
11173 gcc_assert (icode != CODE_FOR_nothing);
11175 for (int i = 0; i < n_elts; i++)
11177 rtx x = XVECEXP (vals, 0, i);
11178 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11179 continue;
11180 x = copy_to_mode_reg (inner_mode, x);
11181 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11183 return;
11186 /* Construct the vector in memory one field at a time
11187 and load the whole vector. */
11188 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
11189 for (int i = 0; i < n_elts; i++)
11190 emit_move_insn (adjust_address_nv (mem, inner_mode,
11191 i * GET_MODE_SIZE (inner_mode)),
11192 XVECEXP (vals, 0, i));
11193 emit_move_insn (target, mem);
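/* Illustrative example: initializing a V4SImode vector from { x, 1, 2, 3 },
   where only X is non-constant, takes the "n_var <= n_elts/2" path above:
   the constant vector { 2, 1, 2, 3 } (lane 0 borrowed from the constant in
   lane 2 of the copy) is loaded first, and X is then inserted into lane 0
   through the vec_set pattern.  */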
11197 static unsigned HOST_WIDE_INT
11198 aarch64_shift_truncation_mask (machine_mode mode)
11200 return
11201 (aarch64_vector_mode_supported_p (mode)
11202 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
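/* Illustrative note: this yields 31 for SImode and 63 for DImode, since
   scalar shifts only use the low bits of the shift amount, and 0 for vector
   and vector-struct modes, where no such truncation is guaranteed.  */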
11205 /* Select a format to encode pointers in exception handling data. */
11207 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11209 int type;
11210 switch (aarch64_cmodel)
11212 case AARCH64_CMODEL_TINY:
11213 case AARCH64_CMODEL_TINY_PIC:
11214 case AARCH64_CMODEL_SMALL:
11215 case AARCH64_CMODEL_SMALL_PIC:
11216 case AARCH64_CMODEL_SMALL_SPIC:
11217 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11218 for everything. */
11219 type = DW_EH_PE_sdata4;
11220 break;
11221 default:
11222 /* No assumptions here. 8-byte relocs required. */
11223 type = DW_EH_PE_sdata8;
11224 break;
11226 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11229 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11230 by the function fndecl. */
11232 void
11233 aarch64_declare_function_name (FILE *stream, const char* name,
11234 tree fndecl)
11236 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11238 struct cl_target_option *targ_options;
11239 if (target_parts)
11240 targ_options = TREE_TARGET_OPTION (target_parts);
11241 else
11242 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11243 gcc_assert (targ_options);
11245 const struct processor *this_arch
11246 = aarch64_get_arch (targ_options->x_explicit_arch);
11248 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11249 std::string extension
11250 = aarch64_get_extension_string_for_isa_flags (isa_flags);
11251 asm_fprintf (asm_out_file, "\t.arch %s%s\n",
11252 this_arch->name, extension.c_str ());
11254   /* Print the cpu name we're tuning for in a comment; this might be
11255 useful to readers of the generated asm. */
11257 const struct processor *this_tune
11258 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11260 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11261 this_tune->name);
11263 /* Don't forget the type directive for ELF. */
11264 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11265 ASM_OUTPUT_LABEL (stream, name);
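/* Illustrative example (the exact strings depend on the configuration): for
   a function foo compiled with -march=armv8-a+crc -mtune=cortex-a57 the
   output produced above would look roughly like

     .arch armv8-a+crc
     // .tune cortex-a57
     .type foo, %function
     foo:  */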
11268 /* Emit load exclusive. */
11270 static void
11271 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11272 rtx mem, rtx model_rtx)
11274 rtx (*gen) (rtx, rtx, rtx);
11276 switch (mode)
11278 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11279 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11280 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11281 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11282 default:
11283 gcc_unreachable ();
11286 emit_insn (gen (rval, mem, model_rtx));
11289 /* Emit store exclusive. */
11291 static void
11292 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11293 rtx rval, rtx mem, rtx model_rtx)
11295 rtx (*gen) (rtx, rtx, rtx, rtx);
11297 switch (mode)
11299 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11300 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11301 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11302 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11303 default:
11304 gcc_unreachable ();
11307 emit_insn (gen (bval, rval, mem, model_rtx));
11310 /* Mark the previous jump instruction as unlikely. */
11312 static void
11313 aarch64_emit_unlikely_jump (rtx insn)
11315 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11317 insn = emit_jump_insn (insn);
11318 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11321 /* Expand a compare and swap pattern. */
11323 void
11324 aarch64_expand_compare_and_swap (rtx operands[])
11326 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11327 machine_mode mode, cmp_mode;
11328 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11329 int idx;
11330 gen_cas_fn gen;
11331 const gen_cas_fn split_cas[] =
11333 gen_aarch64_compare_and_swapqi,
11334 gen_aarch64_compare_and_swaphi,
11335 gen_aarch64_compare_and_swapsi,
11336 gen_aarch64_compare_and_swapdi
11338 const gen_cas_fn atomic_cas[] =
11340 gen_aarch64_compare_and_swapqi_lse,
11341 gen_aarch64_compare_and_swaphi_lse,
11342 gen_aarch64_compare_and_swapsi_lse,
11343 gen_aarch64_compare_and_swapdi_lse
11346 bval = operands[0];
11347 rval = operands[1];
11348 mem = operands[2];
11349 oldval = operands[3];
11350 newval = operands[4];
11351 is_weak = operands[5];
11352 mod_s = operands[6];
11353 mod_f = operands[7];
11354 mode = GET_MODE (mem);
11355 cmp_mode = mode;
11357 /* Normally the succ memory model must be stronger than fail, but in the
11358 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11359 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11361 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11362 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11363 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11365 switch (mode)
11367 case QImode:
11368 case HImode:
11369 /* For short modes, we're going to perform the comparison in SImode,
11370 so do the zero-extension now. */
11371 cmp_mode = SImode;
11372 rval = gen_reg_rtx (SImode);
11373 oldval = convert_modes (SImode, mode, oldval, true);
11374 /* Fall through. */
11376 case SImode:
11377 case DImode:
11378 /* Force the value into a register if needed. */
11379 if (!aarch64_plus_operand (oldval, mode))
11380 oldval = force_reg (cmp_mode, oldval);
11381 break;
11383 default:
11384 gcc_unreachable ();
11387 switch (mode)
11389 case QImode: idx = 0; break;
11390 case HImode: idx = 1; break;
11391 case SImode: idx = 2; break;
11392 case DImode: idx = 3; break;
11393 default:
11394 gcc_unreachable ();
11396 if (TARGET_LSE)
11397 gen = atomic_cas[idx];
11398 else
11399 gen = split_cas[idx];
11401 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11403 if (mode == QImode || mode == HImode)
11404 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11406 x = gen_rtx_REG (CCmode, CC_REGNUM);
11407 x = gen_rtx_EQ (SImode, x, const0_rtx);
11408 emit_insn (gen_rtx_SET (bval, x));
11411 /* Test whether the target supports using an atomic load-operate instruction
11412    for operation CODE.  Returns FALSE if the operation isn't supported by the
11415    architecture.  */
11417 bool
11418 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11420 if (!TARGET_LSE)
11421 return false;
11423 switch (code)
11425 case SET:
11426 case AND:
11427 case IOR:
11428 case XOR:
11429 case MINUS:
11430 case PLUS:
11431 return true;
11432 default:
11433 return false;
11437 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
11438 sequence implementing an atomic operation. */
11440 static void
11441 aarch64_emit_post_barrier (enum memmodel model)
11443 const enum memmodel base_model = memmodel_base (model);
11445 if (is_mm_sync (model)
11446 && (base_model == MEMMODEL_ACQUIRE
11447 || base_model == MEMMODEL_ACQ_REL
11448 || base_model == MEMMODEL_SEQ_CST))
11450 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11454 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11455 for the data in memory. EXPECTED is the value expected to be in memory.
11456 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11457 is the memory ordering to use. */
11459 void
11460 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11461 rtx expected, rtx desired,
11462 rtx model)
11464 rtx (*gen) (rtx, rtx, rtx, rtx);
11465 machine_mode mode;
11467 mode = GET_MODE (mem);
11469 switch (mode)
11471 case QImode: gen = gen_aarch64_atomic_casqi; break;
11472 case HImode: gen = gen_aarch64_atomic_cashi; break;
11473 case SImode: gen = gen_aarch64_atomic_cassi; break;
11474 case DImode: gen = gen_aarch64_atomic_casdi; break;
11475 default:
11476 gcc_unreachable ();
11479 /* Move the expected value into the CAS destination register. */
11480 emit_insn (gen_rtx_SET (rval, expected));
11482 /* Emit the CAS. */
11483 emit_insn (gen (rval, mem, desired, model));
11485 /* Compare the expected value with the value loaded by the CAS, to establish
11486 whether the swap was made. */
11487 aarch64_gen_compare_reg (EQ, rval, expected);
11490 /* Split a compare and swap pattern. */
11492 void
11493 aarch64_split_compare_and_swap (rtx operands[])
11495 rtx rval, mem, oldval, newval, scratch;
11496 machine_mode mode;
11497 bool is_weak;
11498 rtx_code_label *label1, *label2;
11499 rtx x, cond;
11500 enum memmodel model;
11501 rtx model_rtx;
11503 rval = operands[0];
11504 mem = operands[1];
11505 oldval = operands[2];
11506 newval = operands[3];
11507 is_weak = (operands[4] != const0_rtx);
11508 model_rtx = operands[5];
11509 scratch = operands[7];
11510 mode = GET_MODE (mem);
11511 model = memmodel_from_int (INTVAL (model_rtx));
11513 label1 = NULL;
11514 if (!is_weak)
11516 label1 = gen_label_rtx ();
11517 emit_label (label1);
11519 label2 = gen_label_rtx ();
11521 /* The initial load can be relaxed for a __sync operation since a final
11522 barrier will be emitted to stop code hoisting. */
11523 if (is_mm_sync (model))
11524 aarch64_emit_load_exclusive (mode, rval, mem,
11525 GEN_INT (MEMMODEL_RELAXED));
11526 else
11527 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11529 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11530 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11531 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11532 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11533 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11535 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11537 if (!is_weak)
11539 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11540 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11541 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11542 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11544 else
11546 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11547 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11548 emit_insn (gen_rtx_SET (cond, x));
11551 emit_label (label2);
11553 /* Emit any final barrier needed for a __sync operation. */
11554 if (is_mm_sync (model))
11555 aarch64_emit_post_barrier (model);
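/* Illustrative sketch (register numbers are arbitrary): for a strong SImode
   compare-and-swap with acquire-release ordering and without LSE, the split
   above corresponds roughly to

   1:  ldaxr  w0, [x1]
       cmp    w0, w2
       b.ne   2f
       stlxr  w3, w4, [x1]
       cbnz   w3, 1b
   2:

   with the weak variant dropping the outer retry loop and instead comparing
   the store-exclusive result directly.  */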
11558 /* Emit a BIC instruction. */
11560 static void
11561 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11563 rtx shift_rtx = GEN_INT (shift);
11564 rtx (*gen) (rtx, rtx, rtx, rtx);
11566 switch (mode)
11568 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11569 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11570 default:
11571 gcc_unreachable ();
11574 emit_insn (gen (dst, s2, shift_rtx, s1));
11577 /* Emit an atomic swap. */
11579 static void
11580 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11581 rtx mem, rtx model)
11583 rtx (*gen) (rtx, rtx, rtx, rtx);
11585 switch (mode)
11587 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11588 case HImode: gen = gen_aarch64_atomic_swphi; break;
11589 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11590 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11591 default:
11592 gcc_unreachable ();
11595 emit_insn (gen (dst, mem, value, model));
11598 /* Operations supported by aarch64_emit_atomic_load_op. */
11600 enum aarch64_atomic_load_op_code
11602 AARCH64_LDOP_PLUS, /* A + B */
11603 AARCH64_LDOP_XOR, /* A ^ B */
11604 AARCH64_LDOP_OR, /* A | B */
11605 AARCH64_LDOP_BIC /* A & ~B */
11608 /* Emit an atomic load-operate. */
11610 static void
11611 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11612 machine_mode mode, rtx dst, rtx src,
11613 rtx mem, rtx model)
11615 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11616 const aarch64_atomic_load_op_fn plus[] =
11618 gen_aarch64_atomic_loadaddqi,
11619 gen_aarch64_atomic_loadaddhi,
11620 gen_aarch64_atomic_loadaddsi,
11621 gen_aarch64_atomic_loadadddi
11623 const aarch64_atomic_load_op_fn eor[] =
11625 gen_aarch64_atomic_loadeorqi,
11626 gen_aarch64_atomic_loadeorhi,
11627 gen_aarch64_atomic_loadeorsi,
11628 gen_aarch64_atomic_loadeordi
11630 const aarch64_atomic_load_op_fn ior[] =
11632 gen_aarch64_atomic_loadsetqi,
11633 gen_aarch64_atomic_loadsethi,
11634 gen_aarch64_atomic_loadsetsi,
11635 gen_aarch64_atomic_loadsetdi
11637 const aarch64_atomic_load_op_fn bic[] =
11639 gen_aarch64_atomic_loadclrqi,
11640 gen_aarch64_atomic_loadclrhi,
11641 gen_aarch64_atomic_loadclrsi,
11642 gen_aarch64_atomic_loadclrdi
11644 aarch64_atomic_load_op_fn gen;
11645 int idx = 0;
11647 switch (mode)
11649 case QImode: idx = 0; break;
11650 case HImode: idx = 1; break;
11651 case SImode: idx = 2; break;
11652 case DImode: idx = 3; break;
11653 default:
11654 gcc_unreachable ();
11657 switch (code)
11659 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11660 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11661 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11662 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11663 default:
11664 gcc_unreachable ();
11667 emit_insn (gen (dst, mem, src, model));
11670 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11671 location to store the data read from memory. OUT_RESULT is the location to
11672 store the result of the operation. MEM is the memory location to read and
11673 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11674 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11675 be NULL. */
11677 void
11678 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11679 rtx mem, rtx value, rtx model_rtx)
11681 machine_mode mode = GET_MODE (mem);
11682 machine_mode wmode = (mode == DImode ? DImode : SImode);
11683 const bool short_mode = (mode < SImode);
11684 aarch64_atomic_load_op_code ldop_code;
11685 rtx src;
11686 rtx x;
11688 if (out_data)
11689 out_data = gen_lowpart (mode, out_data);
11691 if (out_result)
11692 out_result = gen_lowpart (mode, out_result);
11694 /* Make sure the value is in a register, putting it into a destination
11695 register if it needs to be manipulated. */
11696 if (!register_operand (value, mode)
11697 || code == AND || code == MINUS)
11699 src = out_result ? out_result : out_data;
11700 emit_move_insn (src, gen_lowpart (mode, value));
11702 else
11703 src = value;
11704 gcc_assert (register_operand (src, mode));
11706 /* Preprocess the data for the operation as necessary. If the operation is
11707 a SET then emit a swap instruction and finish. */
11708 switch (code)
11710 case SET:
11711 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11712 return;
11714 case MINUS:
11715 /* Negate the value and treat it as a PLUS. */
11717 rtx neg_src;
11719 /* Resize the value if necessary. */
11720 if (short_mode)
11721 src = gen_lowpart (wmode, src);
11723 neg_src = gen_rtx_NEG (wmode, src);
11724 emit_insn (gen_rtx_SET (src, neg_src));
11726 if (short_mode)
11727 src = gen_lowpart (mode, src);
11729 /* Fall-through. */
11730 case PLUS:
11731 ldop_code = AARCH64_LDOP_PLUS;
11732 break;
11734 case IOR:
11735 ldop_code = AARCH64_LDOP_OR;
11736 break;
11738 case XOR:
11739 ldop_code = AARCH64_LDOP_XOR;
11740 break;
11742 case AND:
11744 rtx not_src;
11746 /* Resize the value if necessary. */
11747 if (short_mode)
11748 src = gen_lowpart (wmode, src);
11750 not_src = gen_rtx_NOT (wmode, src);
11751 emit_insn (gen_rtx_SET (src, not_src));
11753 if (short_mode)
11754 src = gen_lowpart (mode, src);
11756 ldop_code = AARCH64_LDOP_BIC;
11757 break;
11759 default:
11760 /* The operation can't be done with atomic instructions. */
11761 gcc_unreachable ();
11764 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11766 /* If necessary, calculate the data in memory after the update by redoing the
11767 operation from values in registers. */
11768 if (!out_result)
11769 return;
11771 if (short_mode)
11773 src = gen_lowpart (wmode, src);
11774 out_data = gen_lowpart (wmode, out_data);
11775 out_result = gen_lowpart (wmode, out_result);
11778 x = NULL_RTX;
11780 switch (code)
11782 case MINUS:
11783 case PLUS:
11784 x = gen_rtx_PLUS (wmode, out_data, src);
11785 break;
11786 case IOR:
11787 x = gen_rtx_IOR (wmode, out_data, src);
11788 break;
11789 case XOR:
11790 x = gen_rtx_XOR (wmode, out_data, src);
11791 break;
11792 case AND:
11793 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11794 return;
11795 default:
11796 gcc_unreachable ();
11799 emit_set_insn (out_result, x);
11801 return;
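/* Illustrative example: an atomic fetch-and-sub reaches the MINUS case
   above, so with LSE enabled something like
   __atomic_fetch_sub (p, val, __ATOMIC_RELAXED) becomes a negation of VAL
   followed by a single LDADD on *P instead of a load/store-exclusive
   loop.  */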
11804 /* Split an atomic operation. */
11806 void
11807 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11808 rtx value, rtx model_rtx, rtx cond)
11810 machine_mode mode = GET_MODE (mem);
11811 machine_mode wmode = (mode == DImode ? DImode : SImode);
11812 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11813 const bool is_sync = is_mm_sync (model);
11814 rtx_code_label *label;
11815 rtx x;
11817 /* Split the atomic operation into a sequence. */
11818 label = gen_label_rtx ();
11819 emit_label (label);
11821 if (new_out)
11822 new_out = gen_lowpart (wmode, new_out);
11823 if (old_out)
11824 old_out = gen_lowpart (wmode, old_out);
11825 else
11826 old_out = new_out;
11827 value = simplify_gen_subreg (wmode, value, mode, 0);
11829 /* The initial load can be relaxed for a __sync operation since a final
11830 barrier will be emitted to stop code hoisting. */
11831 if (is_sync)
11832 aarch64_emit_load_exclusive (mode, old_out, mem,
11833 GEN_INT (MEMMODEL_RELAXED));
11834 else
11835 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11837 switch (code)
11839 case SET:
11840 new_out = value;
11841 break;
11843 case NOT:
11844 x = gen_rtx_AND (wmode, old_out, value);
11845 emit_insn (gen_rtx_SET (new_out, x));
11846 x = gen_rtx_NOT (wmode, new_out);
11847 emit_insn (gen_rtx_SET (new_out, x));
11848 break;
11850 case MINUS:
11851 if (CONST_INT_P (value))
11853 value = GEN_INT (-INTVAL (value));
11854 code = PLUS;
11856 /* Fall through. */
11858 default:
11859 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11860 emit_insn (gen_rtx_SET (new_out, x));
11861 break;
11864 aarch64_emit_store_exclusive (mode, cond, mem,
11865 gen_lowpart (mode, new_out), model_rtx);
11867 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11868 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11869 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11870 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11872 /* Emit any final barrier needed for a __sync operation. */
11873 if (is_sync)
11874 aarch64_emit_post_barrier (model);
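/* Illustrative sketch (register numbers are arbitrary): without LSE an
   atomic add on an SImode location splits into roughly

   1:  ldxr   w0, [x2]
       add    w1, w0, w3
       stxr   w4, w1, [x2]
       cbnz   w4, 1b

   with acquire/release forms of the exclusives chosen according to
   MODEL_RTX.  */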
11877 static void
11878 aarch64_init_libfuncs (void)
11880 /* Half-precision float operations. The compiler handles all operations
11881 with NULL libfuncs by converting to SFmode. */
11883 /* Conversions. */
11884 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11885 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11887 /* Arithmetic. */
11888 set_optab_libfunc (add_optab, HFmode, NULL);
11889 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11890 set_optab_libfunc (smul_optab, HFmode, NULL);
11891 set_optab_libfunc (neg_optab, HFmode, NULL);
11892 set_optab_libfunc (sub_optab, HFmode, NULL);
11894 /* Comparisons. */
11895 set_optab_libfunc (eq_optab, HFmode, NULL);
11896 set_optab_libfunc (ne_optab, HFmode, NULL);
11897 set_optab_libfunc (lt_optab, HFmode, NULL);
11898 set_optab_libfunc (le_optab, HFmode, NULL);
11899 set_optab_libfunc (ge_optab, HFmode, NULL);
11900 set_optab_libfunc (gt_optab, HFmode, NULL);
11901 set_optab_libfunc (unord_optab, HFmode, NULL);
11904 /* Target hook for c_mode_for_suffix. */
11905 static machine_mode
11906 aarch64_c_mode_for_suffix (char suffix)
11908 if (suffix == 'q')
11909 return TFmode;
11911 return VOIDmode;
11914 /* We can only represent floating point constants which will fit in
11915 "quarter-precision" values. These values are characterised by
11916    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by the formula:
11919 (-1)^s * (n/16) * 2^r
11921 Where:
11922 's' is the sign bit.
11923 'n' is an integer in the range 16 <= n <= 31.
11924 'r' is an integer in the range -3 <= r <= 4. */
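/* Worked example: 0.5 = (16/16) * 2^-1 and 31.0 = (31/16) * 2^4 are both
   representable, whereas 0.1 is not, since no n in [16, 31] and r in
   [-3, 4] give exactly (n/16) * 2^r == 0.1.  */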
11926 /* Return true iff X can be represented by a quarter-precision
11927    floating point immediate operand.  Note, we cannot represent 0.0.  */
11928 bool
11929 aarch64_float_const_representable_p (rtx x)
11931 /* This represents our current view of how many bits
11932 make up the mantissa. */
11933 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11934 int exponent;
11935 unsigned HOST_WIDE_INT mantissa, mask;
11936 REAL_VALUE_TYPE r, m;
11937 bool fail;
11939 if (!CONST_DOUBLE_P (x))
11940 return false;
11942 /* We don't support HFmode constants yet. */
11943 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11944 return false;
11946 r = *CONST_DOUBLE_REAL_VALUE (x);
11948 /* We cannot represent infinities, NaNs or +/-zero. We won't
11949 know if we have +zero until we analyse the mantissa, but we
11950 can reject the other invalid values. */
11951 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11952 || REAL_VALUE_MINUS_ZERO (r))
11953 return false;
11955 /* Extract exponent. */
11956 r = real_value_abs (&r);
11957 exponent = REAL_EXP (&r);
11959 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11960 highest (sign) bit, with a fixed binary point at bit point_pos.
11961 m1 holds the low part of the mantissa, m2 the high part.
11962 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11963 bits for the mantissa, this can fail (low bits will be lost). */
11964 real_ldexp (&m, &r, point_pos - exponent);
11965 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11967 /* If the low part of the mantissa has bits set we cannot represent
11968 the value. */
11969 if (w.elt (0) != 0)
11970 return false;
11971 /* We have rejected the lower HOST_WIDE_INT, so update our
11972 understanding of how many bits lie in the mantissa and
11973 look only at the high HOST_WIDE_INT. */
11974 mantissa = w.elt (1);
11975 point_pos -= HOST_BITS_PER_WIDE_INT;
11977 /* We can only represent values with a mantissa of the form 1.xxxx. */
11978 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11979 if ((mantissa & mask) != 0)
11980 return false;
11982 /* Having filtered unrepresentable values, we may now remove all
11983 but the highest 5 bits. */
11984 mantissa >>= point_pos - 5;
11986 /* We cannot represent the value 0.0, so reject it. This is handled
11987 elsewhere. */
11988 if (mantissa == 0)
11989 return false;
11991 /* Then, as bit 4 is always set, we can mask it off, leaving
11992 the mantissa in the range [0, 15]. */
11993 mantissa &= ~(1 << 4);
11994 gcc_assert (mantissa <= 15);
11996 /* GCC internally does not use IEEE754-like encoding (where normalized
11997    significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
11998 Our mantissa values are shifted 4 places to the left relative to
11999 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12000 by 5 places to correct for GCC's representation. */
12001 exponent = 5 - exponent;
12003 return (exponent >= 0 && exponent <= 7);
12006 char*
12007 aarch64_output_simd_mov_immediate (rtx const_vector,
12008 machine_mode mode,
12009 unsigned width)
12011 bool is_valid;
12012 static char templ[40];
12013 const char *mnemonic;
12014 const char *shift_op;
12015 unsigned int lane_count = 0;
12016 char element_char;
12018 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12020   /* This will return true to show const_vector is legal for use as an
12021      AdvSIMD MOVI (or, implicitly, MVNI) immediate.  It will
12022 also update INFO to show how the immediate should be generated. */
12023 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12024 gcc_assert (is_valid);
12026 element_char = sizetochar (info.element_width);
12027 lane_count = width / info.element_width;
12029 mode = GET_MODE_INNER (mode);
12030 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12032 gcc_assert (info.shift == 0 && ! info.mvn);
12033 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12034 move immediate path. */
12035 if (aarch64_float_const_zero_rtx_p (info.value))
12036 info.value = GEN_INT (0);
12037 else
12039 #define buf_size 20
12040 char float_buf[buf_size] = {'\0'};
12041 real_to_decimal_for_mode (float_buf,
12042 CONST_DOUBLE_REAL_VALUE (info.value),
12043 buf_size, buf_size, 1, mode);
12044 #undef buf_size
12046 if (lane_count == 1)
12047 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12048 else
12049 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12050 lane_count, element_char, float_buf);
12051 return templ;
12055 mnemonic = info.mvn ? "mvni" : "movi";
12056 shift_op = info.msl ? "msl" : "lsl";
12058 gcc_assert (CONST_INT_P (info.value));
12059 if (lane_count == 1)
12060 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12061 mnemonic, UINTVAL (info.value));
12062 else if (info.shift)
12063 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12064 ", %s %d", mnemonic, lane_count, element_char,
12065 UINTVAL (info.value), shift_op, info.shift);
12066 else
12067 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12068 mnemonic, lane_count, element_char, UINTVAL (info.value));
12069 return templ;
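/* Illustrative outputs (operand numbers shown as v0 for readability):
   depending on INFO the code above produces templates such as

     movi  v0.4s, 0x1, lsl 8
     mvni  v0.8h, 0x2

   while the floating-point path emits an fmov with a decimal immediate.  */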
12072 char*
12073 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12074 machine_mode mode)
12076 machine_mode vmode;
12078 gcc_assert (!VECTOR_MODE_P (mode));
12079 vmode = aarch64_simd_container_mode (mode, 64);
12080 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12081 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12084 /* Split operands into moves from op[1] + op[2] into op[0]. */
12086 void
12087 aarch64_split_combinev16qi (rtx operands[3])
12089 unsigned int dest = REGNO (operands[0]);
12090 unsigned int src1 = REGNO (operands[1]);
12091 unsigned int src2 = REGNO (operands[2]);
12092 machine_mode halfmode = GET_MODE (operands[1]);
12093 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12094 rtx destlo, desthi;
12096 gcc_assert (halfmode == V16QImode);
12098 if (src1 == dest && src2 == dest + halfregs)
12100 /* No-op move. Can't split to nothing; emit something. */
12101 emit_note (NOTE_INSN_DELETED);
12102 return;
12105 /* Preserve register attributes for variable tracking. */
12106 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12107 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12108 GET_MODE_SIZE (halfmode));
12110 /* Special case of reversed high/low parts. */
12111 if (reg_overlap_mentioned_p (operands[2], destlo)
12112 && reg_overlap_mentioned_p (operands[1], desthi))
12114 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12115 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12116 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12118 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12120 /* Try to avoid unnecessary moves if part of the result
12121 is in the right place already. */
12122 if (src1 != dest)
12123 emit_move_insn (destlo, operands[1]);
12124 if (src2 != dest + halfregs)
12125 emit_move_insn (desthi, operands[2]);
12127 else
12129 if (src2 != dest + halfregs)
12130 emit_move_insn (desthi, operands[2]);
12131 if (src1 != dest)
12132 emit_move_insn (destlo, operands[1]);
12136 /* vec_perm support. */
12138 #define MAX_VECT_LEN 16
12140 struct expand_vec_perm_d
12142 rtx target, op0, op1;
12143 unsigned char perm[MAX_VECT_LEN];
12144 machine_mode vmode;
12145 unsigned char nelt;
12146 bool one_vector_p;
12147 bool testing_p;
12150 /* Generate a variable permutation. */
12152 static void
12153 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12155 machine_mode vmode = GET_MODE (target);
12156 bool one_vector_p = rtx_equal_p (op0, op1);
12158 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12159 gcc_checking_assert (GET_MODE (op0) == vmode);
12160 gcc_checking_assert (GET_MODE (op1) == vmode);
12161 gcc_checking_assert (GET_MODE (sel) == vmode);
12162 gcc_checking_assert (TARGET_SIMD);
12164 if (one_vector_p)
12166 if (vmode == V8QImode)
12168 /* Expand the argument to a V16QI mode by duplicating it. */
12169 rtx pair = gen_reg_rtx (V16QImode);
12170 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12171 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12173 else
12175 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12178 else
12180 rtx pair;
12182 if (vmode == V8QImode)
12184 pair = gen_reg_rtx (V16QImode);
12185 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12186 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12188 else
12190 pair = gen_reg_rtx (OImode);
12191 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12192 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12197 void
12198 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12200 machine_mode vmode = GET_MODE (target);
12201 unsigned int nelt = GET_MODE_NUNITS (vmode);
12202 bool one_vector_p = rtx_equal_p (op0, op1);
12203 rtx mask;
12205 /* The TBL instruction does not use a modulo index, so we must take care
12206 of that ourselves. */
12207 mask = aarch64_simd_gen_const_vector_dup (vmode,
12208 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12209 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12211 /* For big-endian, we also need to reverse the index within the vector
12212 (but not which vector). */
12213 if (BYTES_BIG_ENDIAN)
12215 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12216 if (!one_vector_p)
12217 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12218 sel = expand_simple_binop (vmode, XOR, sel, mask,
12219 NULL, 0, OPTAB_LIB_WIDEN);
12221 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12224 /* Recognize patterns suitable for the TRN instructions. */
12225 static bool
12226 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12228 unsigned int i, odd, mask, nelt = d->nelt;
12229 rtx out, in0, in1, x;
12230 rtx (*gen) (rtx, rtx, rtx);
12231 machine_mode vmode = d->vmode;
12233 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12234 return false;
12236 /* Note that these are little-endian tests.
12237 We correct for big-endian later. */
12238 if (d->perm[0] == 0)
12239 odd = 0;
12240 else if (d->perm[0] == 1)
12241 odd = 1;
12242 else
12243 return false;
12244 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12246 for (i = 0; i < nelt; i += 2)
12248 if (d->perm[i] != i + odd)
12249 return false;
12250 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12251 return false;
12254 /* Success! */
12255 if (d->testing_p)
12256 return true;
12258 in0 = d->op0;
12259 in1 = d->op1;
12260 if (BYTES_BIG_ENDIAN)
12262 x = in0, in0 = in1, in1 = x;
12263 odd = !odd;
12265 out = d->target;
12267 if (odd)
12269 switch (vmode)
12271 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12272 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12273 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12274 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12275 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12276 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12277 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12278 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12279 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12280 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12281 default:
12282 return false;
12285 else
12287 switch (vmode)
12289 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12290 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12291 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12292 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12293 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12294 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12295 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12296 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12297 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12298 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12299 default:
12300 return false;
12304 emit_insn (gen (out, in0, in1));
12305 return true;
12308 /* Recognize patterns suitable for the UZP instructions. */
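/* For instance, with V4SImode and odd == 0 the accepted selector is
   { 0, 2, 4, 6 }, matching UZP1; odd == 1 gives { 1, 3, 5, 7 } for UZP2.  */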
12309 static bool
12310 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12312 unsigned int i, odd, mask, nelt = d->nelt;
12313 rtx out, in0, in1, x;
12314 rtx (*gen) (rtx, rtx, rtx);
12315 machine_mode vmode = d->vmode;
12317 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12318 return false;
12320 /* Note that these are little-endian tests.
12321 We correct for big-endian later. */
12322 if (d->perm[0] == 0)
12323 odd = 0;
12324 else if (d->perm[0] == 1)
12325 odd = 1;
12326 else
12327 return false;
12328 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12330 for (i = 0; i < nelt; i++)
12332 unsigned elt = (i * 2 + odd) & mask;
12333 if (d->perm[i] != elt)
12334 return false;
12337 /* Success! */
12338 if (d->testing_p)
12339 return true;
12341 in0 = d->op0;
12342 in1 = d->op1;
12343 if (BYTES_BIG_ENDIAN)
12345 x = in0, in0 = in1, in1 = x;
12346 odd = !odd;
12348 out = d->target;
12350 if (odd)
12352 switch (vmode)
12354 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12355 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12356 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12357 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12358 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12359 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12360 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12361 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12362 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12363 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12364 default:
12365 return false;
12368 else
12370 switch (vmode)
12372 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12373 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12374 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12375 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12376 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12377 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12378 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12379 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12380 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12381 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12382 default:
12383 return false;
12387 emit_insn (gen (out, in0, in1));
12388 return true;
12391 /* Recognize patterns suitable for the ZIP instructions. */
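/* For instance, with V4SImode the accepted selectors are { 0, 4, 1, 5 }
   for ZIP1 (high == 0) and { 2, 6, 3, 7 } for ZIP2 (high == nelt / 2).  */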
12392 static bool
12393 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12395 unsigned int i, high, mask, nelt = d->nelt;
12396 rtx out, in0, in1, x;
12397 rtx (*gen) (rtx, rtx, rtx);
12398 machine_mode vmode = d->vmode;
12400 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12401 return false;
12403 /* Note that these are little-endian tests.
12404 We correct for big-endian later. */
12405 high = nelt / 2;
12406 if (d->perm[0] == high)
12407 /* Do Nothing. */
12409 else if (d->perm[0] == 0)
12410 high = 0;
12411 else
12412 return false;
12413 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12415 for (i = 0; i < nelt / 2; i++)
12417 unsigned elt = (i + high) & mask;
12418 if (d->perm[i * 2] != elt)
12419 return false;
12420 elt = (elt + nelt) & mask;
12421 if (d->perm[i * 2 + 1] != elt)
12422 return false;
12425 /* Success! */
12426 if (d->testing_p)
12427 return true;
12429 in0 = d->op0;
12430 in1 = d->op1;
12431 if (BYTES_BIG_ENDIAN)
12433 x = in0, in0 = in1, in1 = x;
12434 high = !high;
12436 out = d->target;
12438 if (high)
12440 switch (vmode)
12442 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12443 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12444 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12445 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12446 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12447 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12448 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12449 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12450 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12451 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12452 default:
12453 return false;
12456 else
12458 switch (vmode)
12460 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12461 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12462 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12463 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12464 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12465 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12466 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12467 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12468 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12469 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12470 default:
12471 return false;
12475 emit_insn (gen (out, in0, in1));
12476 return true;
12479 /* Recognize patterns for the EXT insn. */
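/* For instance, a V16QImode selector of { 3, 4, ..., 18 } has location == 3
   and is matched here, giving a single EXT with an immediate of 3 that
   extracts bytes 3..18 from the concatenation of the two inputs.  */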
12481 static bool
12482 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12484 unsigned int i, nelt = d->nelt;
12485 rtx (*gen) (rtx, rtx, rtx, rtx);
12486 rtx offset;
12488 unsigned int location = d->perm[0]; /* Always < nelt. */
12490 /* Check if the extracted indices are increasing by one. */
12491 for (i = 1; i < nelt; i++)
12493 unsigned int required = location + i;
12494 if (d->one_vector_p)
12496 /* We'll pass the same vector in twice, so allow indices to wrap. */
12497 required &= (nelt - 1);
12499 if (d->perm[i] != required)
12500 return false;
12503 switch (d->vmode)
12505 case V16QImode: gen = gen_aarch64_extv16qi; break;
12506 case V8QImode: gen = gen_aarch64_extv8qi; break;
12507 case V4HImode: gen = gen_aarch64_extv4hi; break;
12508 case V8HImode: gen = gen_aarch64_extv8hi; break;
12509 case V2SImode: gen = gen_aarch64_extv2si; break;
12510 case V4SImode: gen = gen_aarch64_extv4si; break;
12511 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12512 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12513 case V2DImode: gen = gen_aarch64_extv2di; break;
12514 case V2DFmode: gen = gen_aarch64_extv2df; break;
12515 default:
12516 return false;
12519 /* Success! */
12520 if (d->testing_p)
12521 return true;
12523 /* The case where (location == 0) is a no-op for both big- and little-endian,
12524 and is removed by the mid-end at optimization levels -O1 and higher. */
12526 if (BYTES_BIG_ENDIAN && (location != 0))
12528 /* After setup, we want the high elements of the first vector (stored
12529 at the LSB end of the register), and the low elements of the second
12530 vector (stored at the MSB end of the register). So swap. */
12531 std::swap (d->op0, d->op1);
12532 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12533 location = nelt - location;
12536 offset = GEN_INT (location);
12537 emit_insn (gen (d->target, d->op0, d->op1, offset));
12538 return true;
12541 /* Recognize patterns for the REV insns. */
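/* For instance, diff == 3 with V8HImode corresponds to the selector
   { 3, 2, 1, 0, 7, 6, 5, 4 }, i.e. a REV64 over 16-bit elements.  */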
12543 static bool
12544 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12546 unsigned int i, j, diff, nelt = d->nelt;
12547 rtx (*gen) (rtx, rtx);
12549 if (!d->one_vector_p)
12550 return false;
12552 diff = d->perm[0];
12553 switch (diff)
12555 case 7:
12556 switch (d->vmode)
12558 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12559 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12560 default:
12561 return false;
12563 break;
12564 case 3:
12565 switch (d->vmode)
12567 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12568 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12569 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12570 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12571 default:
12572 return false;
12574 break;
12575 case 1:
12576 switch (d->vmode)
12578 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12579 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12580 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12581 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12582 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12583 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12584 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12585 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12586 default:
12587 return false;
12589 break;
12590 default:
12591 return false;
12594 for (i = 0; i < nelt ; i += diff + 1)
12595 for (j = 0; j <= diff; j += 1)
12597 /* This is guaranteed to hold because diff is known to be 7, 3
12598 or 1 at this point, so there are always enough elements left
12599 to check.  A selector whose first element gives any other value
12600 of diff means that something has gone wrong by the time we
12601 get here.  */
12602 gcc_assert (i + j < nelt);
12603 if (d->perm[i + j] != i + diff - j)
12604 return false;
12607 /* Success! */
12608 if (d->testing_p)
12609 return true;
12611 emit_insn (gen (d->target, d->op0));
12612 return true;
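/* Recognize broadcasts of a single element, e.g. a constant V4SImode
   selector of { 2, 2, 2, 2 } is matched below and becomes a single DUP
   from lane 2.  */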
12615 static bool
12616 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12618 rtx (*gen) (rtx, rtx, rtx);
12619 rtx out = d->target;
12620 rtx in0;
12621 machine_mode vmode = d->vmode;
12622 unsigned int i, elt, nelt = d->nelt;
12623 rtx lane;
12625 elt = d->perm[0];
12626 for (i = 1; i < nelt; i++)
12628 if (elt != d->perm[i])
12629 return false;
12632 /* The generic preparation in aarch64_expand_vec_perm_const_1
12633 swaps the operand order and the permute indices if it finds
12634 d->perm[0] to be in the second operand. Thus, we can always
12635 use d->op0 and need not do any extra arithmetic to get the
12636 correct lane number. */
12637 in0 = d->op0;
12638 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12640 switch (vmode)
12642 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12643 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12644 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12645 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12646 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12647 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12648 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12649 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12650 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12651 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12652 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12653 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12654 default:
12655 return false;
12658 emit_insn (gen (out, in0, lane));
12659 return true;
12662 static bool
12663 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12665 rtx rperm[MAX_VECT_LEN], sel;
12666 machine_mode vmode = d->vmode;
12667 unsigned int i, nelt = d->nelt;
12669 if (d->testing_p)
12670 return true;
12672 /* Generic code will try constant permutation twice. Once with the
12673 original mode and again with the elements lowered to QImode.
12674 So wait and don't do the selector expansion ourselves. */
12675 if (vmode != V8QImode && vmode != V16QImode)
12676 return false;
12678 for (i = 0; i < nelt; ++i)
12680 int nunits = GET_MODE_NUNITS (vmode);
12682 /* If big-endian and two vectors, we end up with a weird mixed-endian
12683 mode on NEON. Reverse the index within each word but not the word
12684 itself. */
12685 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12686 : d->perm[i]);
12688 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12689 sel = force_reg (vmode, sel);
12691 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12692 return true;
12695 static bool
12696 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12698 /* The pattern matching functions above are written to look for a small
12699 number to begin the sequence (0, 1, N/2). If we begin with an index
12700 from the second operand, we can swap the operands. */
12701 if (d->perm[0] >= d->nelt)
12703 unsigned i, nelt = d->nelt;
12705 gcc_assert (nelt == (nelt & -nelt));
12706 for (i = 0; i < nelt; ++i)
12707 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12709 std::swap (d->op0, d->op1);
12712 if (TARGET_SIMD)
12714 if (aarch64_evpc_rev (d))
12715 return true;
12716 else if (aarch64_evpc_ext (d))
12717 return true;
12718 else if (aarch64_evpc_dup (d))
12719 return true;
12720 else if (aarch64_evpc_zip (d))
12721 return true;
12722 else if (aarch64_evpc_uzp (d))
12723 return true;
12724 else if (aarch64_evpc_trn (d))
12725 return true;
12726 return aarch64_evpc_tbl (d);
12728 return false;
12731 /* Expand a vec_perm_const pattern. */
12733 bool
12734 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12736 struct expand_vec_perm_d d;
12737 int i, nelt, which;
12739 d.target = target;
12740 d.op0 = op0;
12741 d.op1 = op1;
12743 d.vmode = GET_MODE (target);
12744 gcc_assert (VECTOR_MODE_P (d.vmode));
12745 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12746 d.testing_p = false;
12748 for (i = which = 0; i < nelt; ++i)
12750 rtx e = XVECEXP (sel, 0, i);
12751 int ei = INTVAL (e) & (2 * nelt - 1);
12752 which |= (ei < nelt ? 1 : 2);
12753 d.perm[i] = ei;
12756 switch (which)
12758 default:
12759 gcc_unreachable ();
12761 case 3:
12762 d.one_vector_p = false;
12763 if (!rtx_equal_p (op0, op1))
12764 break;
12766 /* The elements of PERM do not suggest that only the first operand
12767 is used, but both operands are identical. Allow easier matching
12768 of the permutation by folding the permutation into the single
12769 input vector. */
12770 /* Fall Through. */
12771 case 2:
12772 for (i = 0; i < nelt; ++i)
12773 d.perm[i] &= nelt - 1;
12774 d.op0 = op1;
12775 d.one_vector_p = true;
12776 break;
12778 case 1:
12779 d.op1 = op0;
12780 d.one_vector_p = true;
12781 break;
12784 return aarch64_expand_vec_perm_const_1 (&d);
12787 static bool
12788 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12789 const unsigned char *sel)
12791 struct expand_vec_perm_d d;
12792 unsigned int i, nelt, which;
12793 bool ret;
12795 d.vmode = vmode;
12796 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12797 d.testing_p = true;
12798 memcpy (d.perm, sel, nelt);
12800 /* Calculate whether all elements are in one vector. */
12801 for (i = which = 0; i < nelt; ++i)
12803 unsigned char e = d.perm[i];
12804 gcc_assert (e < 2 * nelt);
12805 which |= (e < nelt ? 1 : 2);
12808 /* If all elements are from the second vector, reindex as if from the
12809 first vector. */
12810 if (which == 2)
12811 for (i = 0; i < nelt; ++i)
12812 d.perm[i] -= nelt;
12814 /* Check whether the mask can be applied to a single vector. */
12815 d.one_vector_p = (which != 3);
12817 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12818 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12819 if (!d.one_vector_p)
12820 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12822 start_sequence ();
12823 ret = aarch64_expand_vec_perm_const_1 (&d);
12824 end_sequence ();
12826 return ret;
12829 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
12830 bool
12831 aarch64_cannot_change_mode_class (machine_mode from,
12832 machine_mode to,
12833 enum reg_class rclass)
12835 /* We cannot allow word_mode subregs of full vector modes.
12836 Otherwise the middle-end will assume it's ok to store to
12837 (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
12838 of the 128-bit register. However, after reload the subreg will
12839 be dropped leaving a plain DImode store. See PR67609 for a more
12840 detailed discussion.  In all other cases, we want to be permissive
12841 and return false. */
12842 return (reg_classes_intersect_p (FP_REGS, rclass)
12843 && GET_MODE_SIZE (to) == UNITS_PER_WORD
12844 && GET_MODE_SIZE (from) > UNITS_PER_WORD);
rtx
12848 aarch64_reverse_mask (enum machine_mode mode)
12850 /* We have to reverse each vector because we don't have
12851 a permuted load that can reverse-load according to ABI rules. */
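/* For V4SImode, for example, this builds the V16QImode selector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, reversing
   the bytes within each 32-bit element.  */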
12852 rtx mask;
12853 rtvec v = rtvec_alloc (16);
12854 int i, j;
12855 int nunits = GET_MODE_NUNITS (mode);
12856 int usize = GET_MODE_UNIT_SIZE (mode);
12858 gcc_assert (BYTES_BIG_ENDIAN);
12859 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12861 for (i = 0; i < nunits; i++)
12862 for (j = 0; j < usize; j++)
12863 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12864 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12865 return force_reg (V16QImode, mask);
12868 /* Implement MODES_TIEABLE_P. */
12870 bool
12871 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12873 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12874 return true;
12876 /* We specifically want to allow elements of "structure" modes to
12877 be tieable to the structure. This more general condition allows
12878 other rarer situations too. */
12879 if (TARGET_SIMD
12880 && aarch64_vector_mode_p (mode1)
12881 && aarch64_vector_mode_p (mode2))
12882 return true;
12884 return false;
12887 /* Return a new RTX holding the result of moving POINTER forward by
12888 AMOUNT bytes. */
12890 static rtx
12891 aarch64_move_pointer (rtx pointer, int amount)
12893 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12895 return adjust_automodify_address (pointer, GET_MODE (pointer),
12896 next, amount);
12899 /* Return a new RTX holding the result of moving POINTER forward by the
12900 size of the mode it points to. */
12902 static rtx
12903 aarch64_progress_pointer (rtx pointer)
12905 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12907 return aarch64_move_pointer (pointer, amount);
12910 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST
12911 by the size of MODE.  */
12913 static void
12914 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12915 machine_mode mode)
12917 rtx reg = gen_reg_rtx (mode);
12919 /* "Cast" the pointers to the correct mode. */
12920 *src = adjust_address (*src, mode, 0);
12921 *dst = adjust_address (*dst, mode, 0);
12922 /* Emit the memcpy. */
12923 emit_move_insn (reg, *src);
12924 emit_move_insn (*dst, reg);
12925 /* Move the pointers forward. */
12926 *src = aarch64_progress_pointer (*src);
12927 *dst = aarch64_progress_pointer (*dst);
12930 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12931 we succeed, otherwise return false. */
12933 bool
12934 aarch64_expand_movmem (rtx *operands)
12936 unsigned int n;
12937 rtx dst = operands[0];
12938 rtx src = operands[1];
12939 rtx base;
12940 bool speed_p = !optimize_function_for_size_p (cfun);
12942 /* When optimizing for size, give a better estimate of the length of a
12943 memcpy call, but use the default otherwise. */
12944 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12946 /* We can't do anything smart if the amount to copy is not constant. */
12947 if (!CONST_INT_P (operands[2]))
12948 return false;
12950 n = UINTVAL (operands[2]);
12952 /* Try to keep the number of instructions low. For cases below 16 bytes we
12953 need to make at most two moves. For cases above 16 bytes it will be one
12954 move for each 16 byte chunk, then at most two additional moves.  */
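/* For example, a 35-byte copy becomes two 16-byte (TImode) copies followed
   by one 4-byte copy that overlaps the second chunk by a single byte.  */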
12955 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12956 return false;
12958 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12959 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12961 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12962 src = adjust_automodify_address (src, VOIDmode, base, 0);
12964 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12965 1-byte chunk.  */
12966 if (n < 4)
12968 if (n >= 2)
12970 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12971 n -= 2;
12974 if (n == 1)
12975 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12977 return true;
12980 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
12981 4-byte chunk, partially overlapping with the previously copied chunk. */
12982 if (n < 8)
12984 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12985 n -= 4;
12986 if (n > 0)
12988 int move = n - 4;
12990 src = aarch64_move_pointer (src, move);
12991 dst = aarch64_move_pointer (dst, move);
12992 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12994 return true;
12997 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
12998 them, then (if applicable) an 8-byte chunk.  */
12999 while (n >= 8)
13001 if (n / 16)
13003 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13004 n -= 16;
13006 else
13008 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13009 n -= 8;
13013 /* Finish the final bytes of the copy. We can always do this in one
13014 instruction. We either copy the exact amount we need, or partially
13015 overlap with the previous chunk we copied and copy 8 bytes.  */
13016 if (n == 0)
13017 return true;
13018 else if (n == 1)
13019 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13020 else if (n == 2)
13021 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13022 else if (n == 4)
13023 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13024 else
13026 if (n == 3)
13028 src = aarch64_move_pointer (src, -1);
13029 dst = aarch64_move_pointer (dst, -1);
13030 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13032 else
13034 int move = n - 8;
13036 src = aarch64_move_pointer (src, move);
13037 dst = aarch64_move_pointer (dst, move);
13038 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13042 return true;
13045 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
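/* With AddressSanitizer's default shadow scale of 3, an address A is thus
   mapped to shadow memory at (A >> 3) + (1 << 36).  */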
13047 static unsigned HOST_WIDE_INT
13048 aarch64_asan_shadow_offset (void)
13050 return (HOST_WIDE_INT_1 << 36);
13053 static bool
13054 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13055 unsigned int align,
13056 enum by_pieces_operation op,
13057 bool speed_p)
13059 /* STORE_BY_PIECES can be used when copying a constant string, but
13060 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13061 For now we always fail this and let the move_by_pieces code copy
13062 the string from read-only memory.  */
13063 if (op == STORE_BY_PIECES)
13064 return false;
13066 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13069 static enum machine_mode
13070 aarch64_code_to_ccmode (enum rtx_code code)
13072 switch (code)
13074 case NE:
13075 return CC_DNEmode;
13077 case EQ:
13078 return CC_DEQmode;
13080 case LE:
13081 return CC_DLEmode;
13083 case LT:
13084 return CC_DLTmode;
13086 case GE:
13087 return CC_DGEmode;
13089 case GT:
13090 return CC_DGTmode;
13092 case LEU:
13093 return CC_DLEUmode;
13095 case LTU:
13096 return CC_DLTUmode;
13098 case GEU:
13099 return CC_DGEUmode;
13101 case GTU:
13102 return CC_DGTUmode;
13104 default:
13105 return CCmode;
13109 static rtx
13110 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13111 int code, tree treeop0, tree treeop1)
13113 enum machine_mode op_mode, cmp_mode, cc_mode;
13114 rtx op0, op1, cmp, target;
13115 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13116 enum insn_code icode;
13117 struct expand_operand ops[4];
13119 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
13120 if (cc_mode == CCmode)
13121 return NULL_RTX;
13123 start_sequence ();
13124 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13126 op_mode = GET_MODE (op0);
13127 if (op_mode == VOIDmode)
13128 op_mode = GET_MODE (op1);
13130 switch (op_mode)
13132 case QImode:
13133 case HImode:
13134 case SImode:
13135 cmp_mode = SImode;
13136 icode = CODE_FOR_cmpsi;
13137 break;
13139 case DImode:
13140 cmp_mode = DImode;
13141 icode = CODE_FOR_cmpdi;
13142 break;
13144 default:
13145 end_sequence ();
13146 return NULL_RTX;
13149 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13150 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13151 if (!op0 || !op1)
13153 end_sequence ();
13154 return NULL_RTX;
13156 *prep_seq = get_insns ();
13157 end_sequence ();
13159 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
13160 target = gen_rtx_REG (CCmode, CC_REGNUM);
13162 create_output_operand (&ops[0], target, CCmode);
13163 create_fixed_operand (&ops[1], cmp);
13164 create_fixed_operand (&ops[2], op0);
13165 create_fixed_operand (&ops[3], op1);
13167 start_sequence ();
13168 if (!maybe_expand_insn (icode, 4, ops))
13170 end_sequence ();
13171 return NULL_RTX;
13173 *gen_seq = get_insns ();
13174 end_sequence ();
13176 return gen_rtx_REG (cc_mode, CC_REGNUM);
13179 static rtx
13180 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13181 tree treeop0, tree treeop1, int bit_code)
13183 rtx op0, op1, cmp0, cmp1, target;
13184 enum machine_mode op_mode, cmp_mode, cc_mode;
13185 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13186 enum insn_code icode = CODE_FOR_ccmp_andsi;
13187 struct expand_operand ops[6];
13189 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
13190 if (cc_mode == CCmode)
13191 return NULL_RTX;
13193 push_to_sequence ((rtx_insn*) *prep_seq);
13194 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13196 op_mode = GET_MODE (op0);
13197 if (op_mode == VOIDmode)
13198 op_mode = GET_MODE (op1);
13200 switch (op_mode)
13202 case QImode:
13203 case HImode:
13204 case SImode:
13205 cmp_mode = SImode;
13206 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
13207 : CODE_FOR_ccmp_iorsi;
13208 break;
13210 case DImode:
13211 cmp_mode = DImode;
13212 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
13213 : CODE_FOR_ccmp_iordi;
13214 break;
13216 default:
13217 end_sequence ();
13218 return NULL_RTX;
13221 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13222 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13223 if (!op0 || !op1)
13225 end_sequence ();
13226 return NULL_RTX;
13228 *prep_seq = get_insns ();
13229 end_sequence ();
13231 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13232 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
13233 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
13235 create_fixed_operand (&ops[0], prev);
13236 create_fixed_operand (&ops[1], target);
13237 create_fixed_operand (&ops[2], op0);
13238 create_fixed_operand (&ops[3], op1);
13239 create_fixed_operand (&ops[4], cmp0);
13240 create_fixed_operand (&ops[5], cmp1);
13242 push_to_sequence ((rtx_insn*) *gen_seq);
13243 if (!maybe_expand_insn (icode, 6, ops))
13245 end_sequence ();
13246 return NULL_RTX;
13249 *gen_seq = get_insns ();
13250 end_sequence ();
13252 return target;
13255 #undef TARGET_GEN_CCMP_FIRST
13256 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13258 #undef TARGET_GEN_CCMP_NEXT
13259 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13261 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13262 instruction fusion of some sort. */
13264 static bool
13265 aarch64_macro_fusion_p (void)
13267 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13271 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13272 should be kept together during scheduling. */
13274 static bool
13275 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13277 rtx set_dest;
13278 rtx prev_set = single_set (prev);
13279 rtx curr_set = single_set (curr);
13280 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13281 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13283 if (!aarch64_macro_fusion_p ())
13284 return false;
13286 if (simple_sets_p
13287 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13289 /* We are trying to match:
13290 prev (mov) == (set (reg r0) (const_int imm16))
13291 curr (movk) == (set (zero_extract (reg r0)
13292 (const_int 16)
13293 (const_int 16))
13294 (const_int imm16_1)) */
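/* In assembly terms this is, e.g., "mov w0, #imm16" immediately followed
   by "movk w0, #imm16_1, lsl #16" (the register number is illustrative).  */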
13296 set_dest = SET_DEST (curr_set);
13298 if (GET_CODE (set_dest) == ZERO_EXTRACT
13299 && CONST_INT_P (SET_SRC (curr_set))
13300 && CONST_INT_P (SET_SRC (prev_set))
13301 && CONST_INT_P (XEXP (set_dest, 2))
13302 && INTVAL (XEXP (set_dest, 2)) == 16
13303 && REG_P (XEXP (set_dest, 0))
13304 && REG_P (SET_DEST (prev_set))
13305 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13307 return true;
13311 if (simple_sets_p
13312 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13315 /* We're trying to match:
13316 prev (adrp) == (set (reg r1)
13317 (high (symbol_ref ("SYM"))))
13318 curr (add) == (set (reg r0)
13319 (lo_sum (reg r1)
13320 (symbol_ref ("SYM"))))
13321 Note that r0 need not necessarily be the same as r1, especially
13322 during pre-regalloc scheduling. */
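/* In assembly terms this is, e.g., "adrp x1, SYM" immediately followed
   by "add x0, x1, :lo12:SYM" (register numbers are illustrative).  */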
13324 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13325 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13327 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13328 && REG_P (XEXP (SET_SRC (curr_set), 0))
13329 && REGNO (XEXP (SET_SRC (curr_set), 0))
13330 == REGNO (SET_DEST (prev_set))
13331 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13332 XEXP (SET_SRC (curr_set), 1)))
13333 return true;
13337 if (simple_sets_p
13338 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13341 /* We're trying to match:
13342 prev (movk) == (set (zero_extract (reg r0)
13343 (const_int 16)
13344 (const_int 32))
13345 (const_int imm16_1))
13346 curr (movk) == (set (zero_extract (reg r0)
13347 (const_int 16)
13348 (const_int 48))
13349 (const_int imm16_2)) */
13351 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13352 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13353 && REG_P (XEXP (SET_DEST (prev_set), 0))
13354 && REG_P (XEXP (SET_DEST (curr_set), 0))
13355 && REGNO (XEXP (SET_DEST (prev_set), 0))
13356 == REGNO (XEXP (SET_DEST (curr_set), 0))
13357 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13358 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13359 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13360 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13361 && CONST_INT_P (SET_SRC (prev_set))
13362 && CONST_INT_P (SET_SRC (curr_set)))
13363 return true;
13366 if (simple_sets_p
13367 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13369 /* We're trying to match:
13370 prev (adrp) == (set (reg r0)
13371 (high (symbol_ref ("SYM"))))
13372 curr (ldr) == (set (reg r1)
13373 (mem (lo_sum (reg r0)
13374 (symbol_ref ("SYM")))))
13376 curr (ldr) == (set (reg r1)
13377 (zero_extend (mem
13378 (lo_sum (reg r0)
13379 (symbol_ref ("SYM")))))) */
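/* In assembly terms this is, e.g., "adrp x0, SYM" immediately followed by
   "ldr x1, [x0, :lo12:SYM]" (register numbers are illustrative).  */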
13380 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13381 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13383 rtx curr_src = SET_SRC (curr_set);
13385 if (GET_CODE (curr_src) == ZERO_EXTEND)
13386 curr_src = XEXP (curr_src, 0);
13388 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13389 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13390 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13391 == REGNO (SET_DEST (prev_set))
13392 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13393 XEXP (SET_SRC (prev_set), 0)))
13394 return true;
13398 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13399 && any_condjump_p (curr))
13401 enum attr_type prev_type = get_attr_type (prev);
13403 /* FIXME: this misses some cases that ThunderX considers to be simple
13404 arithmetic instructions; simple shifts are also missed here.  */
13405 if (prev_type == TYPE_ALUS_SREG
13406 || prev_type == TYPE_ALUS_IMM
13407 || prev_type == TYPE_LOGICS_REG
13408 || prev_type == TYPE_LOGICS_IMM)
13409 return true;
13412 return false;
13415 /* If MEM's address is in the form [base+offset], extract the two parts
13416 into BASE and OFFSET and return true; otherwise clear BASE and OFFSET
13417 and return false.  */
13419 bool
13420 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13422 rtx addr;
13424 gcc_assert (MEM_P (mem));
13426 addr = XEXP (mem, 0);
13428 if (REG_P (addr))
13430 *base = addr;
13431 *offset = const0_rtx;
13432 return true;
13435 if (GET_CODE (addr) == PLUS
13436 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13438 *base = XEXP (addr, 0);
13439 *offset = XEXP (addr, 1);
13440 return true;
13443 *base = NULL_RTX;
13444 *offset = NULL_RTX;
13446 return false;
13449 /* Types for scheduling fusion. */
13450 enum sched_fusion_type
13452 SCHED_FUSION_NONE = 0,
13453 SCHED_FUSION_LD_SIGN_EXTEND,
13454 SCHED_FUSION_LD_ZERO_EXTEND,
13455 SCHED_FUSION_LD,
13456 SCHED_FUSION_ST,
13457 SCHED_FUSION_NUM
13460 /* If INSN is a load or store whose address is in the form [base+offset],
13461 extract the two parts into BASE and OFFSET.  Return the scheduling
13462 fusion type of this INSN.  */
13464 static enum sched_fusion_type
13465 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13467 rtx x, dest, src;
13468 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13470 gcc_assert (INSN_P (insn));
13471 x = PATTERN (insn);
13472 if (GET_CODE (x) != SET)
13473 return SCHED_FUSION_NONE;
13475 src = SET_SRC (x);
13476 dest = SET_DEST (x);
13478 machine_mode dest_mode = GET_MODE (dest);
13480 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13481 return SCHED_FUSION_NONE;
13483 if (GET_CODE (src) == SIGN_EXTEND)
13485 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13486 src = XEXP (src, 0);
13487 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13488 return SCHED_FUSION_NONE;
13490 else if (GET_CODE (src) == ZERO_EXTEND)
13492 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13493 src = XEXP (src, 0);
13494 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13495 return SCHED_FUSION_NONE;
13498 if (GET_CODE (src) == MEM && REG_P (dest))
13499 extract_base_offset_in_addr (src, base, offset);
13500 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13502 fusion = SCHED_FUSION_ST;
13503 extract_base_offset_in_addr (dest, base, offset);
13505 else
13506 return SCHED_FUSION_NONE;
13508 if (*base == NULL_RTX || *offset == NULL_RTX)
13509 fusion = SCHED_FUSION_NONE;
13511 return fusion;
13514 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13516 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13517 and PRI are only calculated for those instructions.  For other instructions,
13518 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13519 types of instruction fusion can be added by returning different priorities.
13521 It's important that irrelevant instructions get the largest FUSION_PRI. */
13523 static void
13524 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13525 int *fusion_pri, int *pri)
13527 int tmp, off_val;
13528 rtx base, offset;
13529 enum sched_fusion_type fusion;
13531 gcc_assert (INSN_P (insn));
13533 tmp = max_pri - 1;
13534 fusion = fusion_load_store (insn, &base, &offset);
13535 if (fusion == SCHED_FUSION_NONE)
13537 *pri = tmp;
13538 *fusion_pri = tmp;
13539 return;
13542 /* Set FUSION_PRI according to fusion type and base register. */
13543 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13545 /* Calculate PRI. */
13546 tmp /= 2;
13548 /* INSN with smaller offset goes first. */
13549 off_val = (int)(INTVAL (offset));
13550 if (off_val >= 0)
13551 tmp -= (off_val & 0xfffff);
13552 else
13553 tmp += ((- off_val) & 0xfffff);
13555 *pri = tmp;
13556 return;
13559 /* Given OPERANDS of consecutive load/store, check if we can merge
13560 them into ldp/stp. LOAD is true if they are load instructions.
13561 MODE is the mode of memory operands. */
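/* For example, "ldr w0, [x2]" followed by "ldr w1, [x2, 4]" passes the
   checks below and can be merged into "ldp w0, w1, [x2]" (register numbers
   are illustrative).  */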
13563 bool
13564 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13565 enum machine_mode mode)
13567 HOST_WIDE_INT offval_1, offval_2, msize;
13568 enum reg_class rclass_1, rclass_2;
13569 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13571 if (load)
13573 mem_1 = operands[1];
13574 mem_2 = operands[3];
13575 reg_1 = operands[0];
13576 reg_2 = operands[2];
13577 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13578 if (REGNO (reg_1) == REGNO (reg_2))
13579 return false;
13581 else
13583 mem_1 = operands[0];
13584 mem_2 = operands[2];
13585 reg_1 = operands[1];
13586 reg_2 = operands[3];
13589 /* The mems cannot be volatile. */
13590 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13591 return false;
13593 /* Check if the addresses are in the form of [base+offset]. */
13594 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13595 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13596 return false;
13597 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13598 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13599 return false;
13601 /* Check if the bases are the same.  */
13602 if (!rtx_equal_p (base_1, base_2))
13603 return false;
13605 offval_1 = INTVAL (offset_1);
13606 offval_2 = INTVAL (offset_2);
13607 msize = GET_MODE_SIZE (mode);
13608 /* Check if the offsets are consecutive. */
13609 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13610 return false;
13612 /* Check if the addresses are clobbered by the load.  */
13613 if (load)
13615 if (reg_mentioned_p (reg_1, mem_1))
13616 return false;
13618 /* In increasing order, the last load can clobber the address. */
13619 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13620 return false;
13623 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13624 rclass_1 = FP_REGS;
13625 else
13626 rclass_1 = GENERAL_REGS;
13628 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13629 rclass_2 = FP_REGS;
13630 else
13631 rclass_2 = GENERAL_REGS;
13633 /* Check if the registers are of the same class.  */
13634 if (rclass_1 != rclass_2)
13635 return false;
13637 return true;
13640 /* Given OPERANDS of consecutive load/store, check if we can merge
13641 them into ldp/stp by adjusting the offset. LOAD is true if they
13642 are load instructions. MODE is the mode of memory operands.
13644 Given the consecutive stores below:
13646 str w1, [xb, 0x100]
13647 str w1, [xb, 0x104]
13648 str w1, [xb, 0x108]
13649 str w1, [xb, 0x10c]
13651 Though the offsets are out of the range supported by stp, we can
13652 still pair them after adjusting the offset, like:
13654 add scratch, xb, 0x100
13655 stp w1, w1, [scratch]
13656 stp w1, w1, [scratch, 0x8]
13658 The peephole patterns detecting this opportunity should guarantee
13659 the scratch register is available.  */
13661 bool
13662 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13663 enum machine_mode mode)
13665 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13666 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13667 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13668 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13670 if (load)
13672 reg_1 = operands[0];
13673 mem_1 = operands[1];
13674 reg_2 = operands[2];
13675 mem_2 = operands[3];
13676 reg_3 = operands[4];
13677 mem_3 = operands[5];
13678 reg_4 = operands[6];
13679 mem_4 = operands[7];
13680 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13681 && REG_P (reg_3) && REG_P (reg_4));
13682 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13683 return false;
13685 else
13687 mem_1 = operands[0];
13688 reg_1 = operands[1];
13689 mem_2 = operands[2];
13690 reg_2 = operands[3];
13691 mem_3 = operands[4];
13692 reg_3 = operands[5];
13693 mem_4 = operands[6];
13694 reg_4 = operands[7];
13696 /* Skip if the memory operand is by itself already valid for ldp/stp.  */
13697 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13698 return false;
13700 /* The mems cannot be volatile. */
13701 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13702 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13703 return false;
13705 /* Check if the addresses are in the form of [base+offset]. */
13706 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13707 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13708 return false;
13709 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13710 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13711 return false;
13712 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13713 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13714 return false;
13715 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13716 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13717 return false;
13719 /* Check if the bases are the same.  */
13720 if (!rtx_equal_p (base_1, base_2)
13721 || !rtx_equal_p (base_2, base_3)
13722 || !rtx_equal_p (base_3, base_4))
13723 return false;
13725 offval_1 = INTVAL (offset_1);
13726 offval_2 = INTVAL (offset_2);
13727 offval_3 = INTVAL (offset_3);
13728 offval_4 = INTVAL (offset_4);
13729 msize = GET_MODE_SIZE (mode);
13730 /* Check if the offsets are consecutive. */
13731 if ((offval_1 != (offval_2 + msize)
13732 || offval_1 != (offval_3 + msize * 2)
13733 || offval_1 != (offval_4 + msize * 3))
13734 && (offval_4 != (offval_3 + msize)
13735 || offval_4 != (offval_2 + msize * 2)
13736 || offval_4 != (offval_1 + msize * 3)))
13737 return false;
13739 /* Check if the addresses are clobbered by the loads.  */
13740 if (load)
13742 if (reg_mentioned_p (reg_1, mem_1)
13743 || reg_mentioned_p (reg_2, mem_2)
13744 || reg_mentioned_p (reg_3, mem_3))
13745 return false;
13747 /* In increasing order, the last load can clobber the address. */
13748 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13749 return false;
13752 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13753 rclass_1 = FP_REGS;
13754 else
13755 rclass_1 = GENERAL_REGS;
13757 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13758 rclass_2 = FP_REGS;
13759 else
13760 rclass_2 = GENERAL_REGS;
13762 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13763 rclass_3 = FP_REGS;
13764 else
13765 rclass_3 = GENERAL_REGS;
13767 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13768 rclass_4 = FP_REGS;
13769 else
13770 rclass_4 = GENERAL_REGS;
13772 /* Check if the registers are of the same class.  */
13773 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13774 return false;
13776 return true;
13779 /* Given OPERANDS of consecutive load/store, this function pairs them
13780 into ldp/stp after adjusting the offset. It depends on the fact
13781 that addresses of load/store instructions are in increasing order.
13782 MODE is the mode of memory operands. CODE is the rtl operator
13783 which should be applied to all memory operands; it is SIGN_EXTEND,
13784 ZERO_EXTEND or UNKNOWN. */
13786 bool
13787 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13788 enum machine_mode mode, RTX_CODE code)
13790 rtx base, offset, t1, t2;
13791 rtx mem_1, mem_2, mem_3, mem_4;
13792 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13794 if (load)
13796 mem_1 = operands[1];
13797 mem_2 = operands[3];
13798 mem_3 = operands[5];
13799 mem_4 = operands[7];
13801 else
13803 mem_1 = operands[0];
13804 mem_2 = operands[2];
13805 mem_3 = operands[4];
13806 mem_4 = operands[6];
13807 gcc_assert (code == UNKNOWN);
13810 extract_base_offset_in_addr (mem_1, &base, &offset);
13811 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13813 /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
13814 msize = GET_MODE_SIZE (mode);
13815 stp_off_limit = msize * 0x40;
13816 off_val = INTVAL (offset);
13817 abs_off = (off_val < 0) ? -off_val : off_val;
13818 new_off = abs_off % stp_off_limit;
13819 adj_off = abs_off - new_off;
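/* With the SImode example above (first offset 0x100), stp_off_limit is
   0x100, so new_off becomes 0 and adj_off becomes 0x100: the scratch
   register is set to base + 0x100 and the two stp instructions in that
   example use offsets 0 and 8.  */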
13821 /* Further adjust to make sure all offsets are OK. */
13822 if ((new_off + msize * 2) >= stp_off_limit)
13824 adj_off += stp_off_limit;
13825 new_off -= stp_off_limit;
13828 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13829 if (adj_off >= 0x1000)
13830 return false;
13832 if (off_val < 0)
13834 adj_off = -adj_off;
13835 new_off = -new_off;
13838 /* Create new memory references. */
13839 mem_1 = change_address (mem_1, VOIDmode,
13840 plus_constant (DImode, operands[8], new_off));
13842 /* Check if the adjusted address is OK for ldp/stp. */
13843 if (!aarch64_mem_pair_operand (mem_1, mode))
13844 return false;
13846 msize = GET_MODE_SIZE (mode);
13847 mem_2 = change_address (mem_2, VOIDmode,
13848 plus_constant (DImode,
13849 operands[8],
13850 new_off + msize));
13851 mem_3 = change_address (mem_3, VOIDmode,
13852 plus_constant (DImode,
13853 operands[8],
13854 new_off + msize * 2));
13855 mem_4 = change_address (mem_4, VOIDmode,
13856 plus_constant (DImode,
13857 operands[8],
13858 new_off + msize * 3));
13860 if (code == ZERO_EXTEND)
13862 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13863 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13864 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13865 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13867 else if (code == SIGN_EXTEND)
13869 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13870 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13871 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13872 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13875 if (load)
13877 operands[1] = mem_1;
13878 operands[3] = mem_2;
13879 operands[5] = mem_3;
13880 operands[7] = mem_4;
13882 else
13884 operands[0] = mem_1;
13885 operands[2] = mem_2;
13886 operands[4] = mem_3;
13887 operands[6] = mem_4;
13890 /* Emit adjusting instruction. */
13891 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13892 /* Emit ldp/stp instructions. */
13893 t1 = gen_rtx_SET (operands[0], operands[1]);
13894 t2 = gen_rtx_SET (operands[2], operands[3]);
13895 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13896 t1 = gen_rtx_SET (operands[4], operands[5]);
13897 t2 = gen_rtx_SET (operands[6], operands[7]);
13898 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13899 return true;
13902 /* Return true if a pseudo register should be created and used to hold
13903 the GOT address for PIC code.  */
13905 bool
13906 aarch64_use_pseudo_pic_reg (void)
13908 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13911 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13913 static int
13914 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13916 switch (XINT (x, 1))
13918 case UNSPEC_GOTSMALLPIC:
13919 case UNSPEC_GOTSMALLPIC28K:
13920 case UNSPEC_GOTTINYPIC:
13921 return 0;
13922 default:
13923 break;
13926 return default_unspec_may_trap_p (x, flags);
13930 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
13931 return the log2 of that value. Otherwise return -1. */
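/* For example, 4.0 yields 2, while 0.5, 3.0 and any negative value
   yield -1.  */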
int
13934 aarch64_fpconst_pow_of_2 (rtx x)
13936 const REAL_VALUE_TYPE *r;
13938 if (!CONST_DOUBLE_P (x))
13939 return -1;
13941 r = CONST_DOUBLE_REAL_VALUE (x);
13943 if (REAL_VALUE_NEGATIVE (*r)
13944 || REAL_VALUE_ISNAN (*r)
13945 || REAL_VALUE_ISINF (*r)
13946 || !real_isinteger (r, DFmode))
13947 return -1;
13949 return exact_log2 (real_to_integer (r));
13952 /* If X is a vector of equal CONST_DOUBLE values and that value is
13953 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
int
13956 aarch64_vec_fpconst_pow_of_2 (rtx x)
13958 if (GET_CODE (x) != CONST_VECTOR)
13959 return -1;
13961 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13962 return -1;
13964 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13965 if (firstval <= 0)
13966 return -1;
13968 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13969 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13970 return -1;
13972 return firstval;
13975 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13976 static tree
13977 aarch64_promoted_type (const_tree t)
13979 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13980 return float_type_node;
13981 return NULL_TREE;
13984 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13986 static bool
13987 aarch64_optab_supported_p (int op, machine_mode, machine_mode,
13988 optimization_type opt_type)
13990 switch (op)
13992 case rsqrt_optab:
13993 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
13995 default:
13996 return true;
14000 #undef TARGET_ADDRESS_COST
14001 #define TARGET_ADDRESS_COST aarch64_address_cost
14003 /* This hook determines whether unnamed bitfields affect the alignment
14004 of the containing structure. The hook returns true if the structure
14005 should inherit the alignment requirements of an unnamed bitfield's
14006 type. */
14007 #undef TARGET_ALIGN_ANON_BITFIELD
14008 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14010 #undef TARGET_ASM_ALIGNED_DI_OP
14011 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14013 #undef TARGET_ASM_ALIGNED_HI_OP
14014 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14016 #undef TARGET_ASM_ALIGNED_SI_OP
14017 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14019 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14020 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14021 hook_bool_const_tree_hwi_hwi_const_tree_true
14023 #undef TARGET_ASM_OUTPUT_MI_THUNK
14024 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14026 #undef TARGET_ASM_SELECT_RTX_SECTION
14027 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14029 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14030 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14032 #undef TARGET_BUILD_BUILTIN_VA_LIST
14033 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14035 #undef TARGET_CALLEE_COPIES
14036 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14038 #undef TARGET_CAN_ELIMINATE
14039 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14041 #undef TARGET_CAN_INLINE_P
14042 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14044 #undef TARGET_CANNOT_FORCE_CONST_MEM
14045 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14047 #undef TARGET_CASE_VALUES_THRESHOLD
14048 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14050 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14051 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14053 /* Only the least significant bit is used for initialization guard
14054 variables. */
14055 #undef TARGET_CXX_GUARD_MASK_BIT
14056 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14058 #undef TARGET_C_MODE_FOR_SUFFIX
14059 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14061 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14062 #undef TARGET_DEFAULT_TARGET_FLAGS
14063 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14064 #endif
14066 #undef TARGET_CLASS_MAX_NREGS
14067 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14069 #undef TARGET_BUILTIN_DECL
14070 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14072 #undef TARGET_BUILTIN_RECIPROCAL
14073 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14075 #undef TARGET_EXPAND_BUILTIN
14076 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14078 #undef TARGET_EXPAND_BUILTIN_VA_START
14079 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14081 #undef TARGET_FOLD_BUILTIN
14082 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14084 #undef TARGET_FUNCTION_ARG
14085 #define TARGET_FUNCTION_ARG aarch64_function_arg
14087 #undef TARGET_FUNCTION_ARG_ADVANCE
14088 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14090 #undef TARGET_FUNCTION_ARG_BOUNDARY
14091 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14093 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14094 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14096 #undef TARGET_FUNCTION_VALUE
14097 #define TARGET_FUNCTION_VALUE aarch64_function_value
14099 #undef TARGET_FUNCTION_VALUE_REGNO_P
14100 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14102 #undef TARGET_FRAME_POINTER_REQUIRED
14103 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14105 #undef TARGET_GIMPLE_FOLD_BUILTIN
14106 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14108 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14109 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14111 #undef TARGET_INIT_BUILTINS
14112 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14114 #undef TARGET_LEGITIMATE_ADDRESS_P
14115 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14117 #undef TARGET_LEGITIMATE_CONSTANT_P
14118 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14120 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14121 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14123 #undef TARGET_LRA_P
14124 #define TARGET_LRA_P hook_bool_void_true
14126 #undef TARGET_MANGLE_TYPE
14127 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14129 #undef TARGET_MEMORY_MOVE_COST
14130 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14132 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14133 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14135 #undef TARGET_MUST_PASS_IN_STACK
14136 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14138 /* This target hook should return true if accesses to volatile bitfields
14139 should use the narrowest mode possible. It should return false if these
14140 accesses should use the bitfield container type. */
14141 #undef TARGET_NARROW_VOLATILE_BITFIELD
14142 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14144 #undef TARGET_OPTION_OVERRIDE
14145 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14147 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14148 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14149 aarch64_override_options_after_change
14151 #undef TARGET_OPTION_SAVE
14152 #define TARGET_OPTION_SAVE aarch64_option_save
14154 #undef TARGET_OPTION_RESTORE
14155 #define TARGET_OPTION_RESTORE aarch64_option_restore
14157 #undef TARGET_OPTION_PRINT
14158 #define TARGET_OPTION_PRINT aarch64_option_print
14160 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14161 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14163 #undef TARGET_SET_CURRENT_FUNCTION
14164 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14166 #undef TARGET_PASS_BY_REFERENCE
14167 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14169 #undef TARGET_PREFERRED_RELOAD_CLASS
14170 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14172 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14173 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14175 #undef TARGET_PROMOTED_TYPE
14176 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14178 #undef TARGET_SECONDARY_RELOAD
14179 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14181 #undef TARGET_SHIFT_TRUNCATION_MASK
14182 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14184 #undef TARGET_SETUP_INCOMING_VARARGS
14185 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14187 #undef TARGET_STRUCT_VALUE_RTX
14188 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14190 #undef TARGET_REGISTER_MOVE_COST
14191 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14193 #undef TARGET_RETURN_IN_MEMORY
14194 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14196 #undef TARGET_RETURN_IN_MSB
14197 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14199 #undef TARGET_RTX_COSTS
14200 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14202 #undef TARGET_SCHED_ISSUE_RATE
14203 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14205 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14206 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14207 aarch64_sched_first_cycle_multipass_dfa_lookahead
14209 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14210 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14211 aarch64_first_cycle_multipass_dfa_lookahead_guard
14213 #undef TARGET_TRAMPOLINE_INIT
14214 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14216 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14217 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14219 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14220 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14222 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14223 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14225 #undef TARGET_VECTORIZE_ADD_STMT_COST
14226 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14228 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14229 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14230 aarch64_builtin_vectorization_cost
14232 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14233 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14235 #undef TARGET_VECTORIZE_BUILTINS
14236 #define TARGET_VECTORIZE_BUILTINS
14238 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14239 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14240 aarch64_builtin_vectorized_function
14242 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14243 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14244 aarch64_autovectorize_vector_sizes
14246 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14247 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14248 aarch64_atomic_assign_expand_fenv
14250 /* Section anchor support. */
14252 #undef TARGET_MIN_ANCHOR_OFFSET
14253 #define TARGET_MIN_ANCHOR_OFFSET -256
14255 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14256 byte offset; we can do much more for larger data types, but have no way
14257 to determine the size of the access. We assume accesses are aligned. */
14258 #undef TARGET_MAX_ANCHOR_OFFSET
14259 #define TARGET_MAX_ANCHOR_OFFSET 4095
14261 #undef TARGET_VECTOR_ALIGNMENT
14262 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14264 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14265 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14266 aarch64_simd_vector_alignment_reachable
14268 /* vec_perm support. */
14270 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14271 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14272 aarch64_vectorize_vec_perm_const_ok
14274 #undef TARGET_INIT_LIBFUNCS
14275 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14277 #undef TARGET_FIXED_CONDITION_CODE_REGS
14278 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14280 #undef TARGET_FLAGS_REGNUM
14281 #define TARGET_FLAGS_REGNUM CC_REGNUM
14283 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14284 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14286 #undef TARGET_ASAN_SHADOW_OFFSET
14287 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14289 #undef TARGET_LEGITIMIZE_ADDRESS
14290 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14292 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14293 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14294 aarch64_use_by_pieces_infrastructure_p
14296 #undef TARGET_CAN_USE_DOLOOP_P
14297 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14299 #undef TARGET_SCHED_MACRO_FUSION_P
14300 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14302 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14303 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14305 #undef TARGET_SCHED_FUSION_PRIORITY
14306 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14308 #undef TARGET_UNSPEC_MAY_TRAP_P
14309 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14311 #undef TARGET_USE_PSEUDO_PIC_REG
14312 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14314 #undef TARGET_PRINT_OPERAND
14315 #define TARGET_PRINT_OPERAND aarch64_print_operand
14317 #undef TARGET_PRINT_OPERAND_ADDRESS
14318 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14320 #undef TARGET_OPTAB_SUPPORTED_P
14321 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14323 struct gcc_target targetm = TARGET_INITIALIZER;
14325 #include "gt-aarch64.h"