[AArch64] Remove aarch64_simd_attr_length_move
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
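/* As a rough illustration (a sketch added here, not from the original
   source), these classifications correspond to AArch64 addressing forms
   such as:

     ADDRESS_REG_IMM    ldr   x1, [x0, 16]
     ADDRESS_REG_WB     ldr   x1, [x0, 16]!    (or post-indexed [x0], 16)
     ADDRESS_REG_REG    ldr   x1, [x0, x2, lsl 3]
     ADDRESS_REG_UXTW   ldr   x1, [x0, w2, uxtw 3]
     ADDRESS_REG_SXTW   ldr   x1, [x0, w2, sxtw 3]
     ADDRESS_LO_SUM     ldr   x1, [x0, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr   x1, .LC0          (pc-relative literal)  */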
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_nopcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
161 const char* name;
162 unsigned int flag;
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
 174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 #undef AARCH64_EXTRA_TUNING_OPTION
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
253 static const struct cpu_regmove_cost generic_regmove_cost =
255 1, /* GP2GP */
256 /* Avoid the use of slow int<->fp moves for spilling by setting
257 their cost higher than memmov_cost. */
258 5, /* GP2FP */
259 5, /* FP2GP */
260 2 /* FP2FP */
263 static const struct cpu_regmove_cost cortexa57_regmove_cost =
265 1, /* GP2GP */
266 /* Avoid the use of slow int<->fp moves for spilling by setting
267 their cost higher than memmov_cost. */
268 5, /* GP2FP */
269 5, /* FP2GP */
270 2 /* FP2FP */
273 static const struct cpu_regmove_cost cortexa53_regmove_cost =
275 1, /* GP2GP */
276 /* Avoid the use of slow int<->fp moves for spilling by setting
277 their cost higher than memmov_cost. */
278 5, /* GP2FP */
279 5, /* FP2GP */
280 2 /* FP2FP */
283 static const struct cpu_regmove_cost exynosm1_regmove_cost =
285 1, /* GP2GP */
286 /* Avoid the use of slow int<->fp moves for spilling by setting
287 their cost higher than memmov_cost (actual, 4 and 9). */
288 9, /* GP2FP */
289 9, /* FP2GP */
290 1 /* FP2FP */
293 static const struct cpu_regmove_cost thunderx_regmove_cost =
295 2, /* GP2GP */
296 2, /* GP2FP */
297 6, /* FP2GP */
298 4 /* FP2FP */
301 static const struct cpu_regmove_cost xgene1_regmove_cost =
303 1, /* GP2GP */
304 /* Avoid the use of slow int<->fp moves for spilling by setting
305 their cost higher than memmov_cost. */
306 8, /* GP2FP */
307 8, /* FP2GP */
308 2 /* FP2FP */
311 /* Generic costs for vector insn classes. */
312 static const struct cpu_vector_cost generic_vector_cost =
314 1, /* scalar_stmt_cost */
315 1, /* scalar_load_cost */
316 1, /* scalar_store_cost */
317 1, /* vec_stmt_cost */
318 2, /* vec_permute_cost */
319 1, /* vec_to_scalar_cost */
320 1, /* scalar_to_vec_cost */
321 1, /* vec_align_load_cost */
322 1, /* vec_unalign_load_cost */
323 1, /* vec_unalign_store_cost */
324 1, /* vec_store_cost */
325 3, /* cond_taken_branch_cost */
326 1 /* cond_not_taken_branch_cost */
 329 /* Cortex-A57 costs for vector insn classes.  */
330 static const struct cpu_vector_cost cortexa57_vector_cost =
332 1, /* scalar_stmt_cost */
333 4, /* scalar_load_cost */
334 1, /* scalar_store_cost */
335 3, /* vec_stmt_cost */
336 3, /* vec_permute_cost */
337 8, /* vec_to_scalar_cost */
338 8, /* scalar_to_vec_cost */
339 5, /* vec_align_load_cost */
340 5, /* vec_unalign_load_cost */
341 1, /* vec_unalign_store_cost */
342 1, /* vec_store_cost */
343 1, /* cond_taken_branch_cost */
344 1 /* cond_not_taken_branch_cost */
347 static const struct cpu_vector_cost exynosm1_vector_cost =
349 1, /* scalar_stmt_cost */
350 5, /* scalar_load_cost */
351 1, /* scalar_store_cost */
352 3, /* vec_stmt_cost */
353 3, /* vec_permute_cost */
354 3, /* vec_to_scalar_cost */
355 3, /* scalar_to_vec_cost */
356 5, /* vec_align_load_cost */
357 5, /* vec_unalign_load_cost */
358 1, /* vec_unalign_store_cost */
359 1, /* vec_store_cost */
360 1, /* cond_taken_branch_cost */
361 1 /* cond_not_taken_branch_cost */
 364 /* X-Gene 1 costs for vector insn classes.  */
365 static const struct cpu_vector_cost xgene1_vector_cost =
367 1, /* scalar_stmt_cost */
368 5, /* scalar_load_cost */
369 1, /* scalar_store_cost */
370 2, /* vec_stmt_cost */
371 2, /* vec_permute_cost */
372 4, /* vec_to_scalar_cost */
373 4, /* scalar_to_vec_cost */
374 10, /* vec_align_load_cost */
375 10, /* vec_unalign_load_cost */
376 2, /* vec_unalign_store_cost */
377 2, /* vec_store_cost */
378 2, /* cond_taken_branch_cost */
379 1 /* cond_not_taken_branch_cost */
382 /* Generic costs for branch instructions. */
383 static const struct cpu_branch_cost generic_branch_cost =
385 2, /* Predictable. */
386 2 /* Unpredictable. */
389 /* Branch costs for Cortex-A57. */
390 static const struct cpu_branch_cost cortexa57_branch_cost =
392 1, /* Predictable. */
393 3 /* Unpredictable. */
396 static const struct tune_params generic_tunings =
398 &cortexa57_extra_costs,
399 &generic_addrcost_table,
400 &generic_regmove_cost,
401 &generic_vector_cost,
402 &generic_branch_cost,
403 4, /* memmov_cost */
404 2, /* issue_rate */
405 AARCH64_FUSE_NOTHING, /* fusible_ops */
406 8, /* function_align. */
407 8, /* jump_align. */
408 4, /* loop_align. */
409 2, /* int_reassoc_width. */
410 4, /* fp_reassoc_width. */
411 1, /* vec_reassoc_width. */
412 2, /* min_div_recip_mul_sf. */
413 2, /* min_div_recip_mul_df. */
414 0, /* max_case_values. */
415 0, /* cache_line_size. */
416 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
417 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
420 static const struct tune_params cortexa35_tunings =
422 &cortexa53_extra_costs,
423 &generic_addrcost_table,
424 &cortexa53_regmove_cost,
425 &generic_vector_cost,
426 &generic_branch_cost,
427 4, /* memmov_cost */
428 1, /* issue_rate */
429 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
430 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
431 8, /* function_align. */
432 8, /* jump_align. */
433 4, /* loop_align. */
434 2, /* int_reassoc_width. */
435 4, /* fp_reassoc_width. */
436 1, /* vec_reassoc_width. */
437 2, /* min_div_recip_mul_sf. */
438 2, /* min_div_recip_mul_df. */
439 0, /* max_case_values. */
440 0, /* cache_line_size. */
441 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
442 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
445 static const struct tune_params cortexa53_tunings =
447 &cortexa53_extra_costs,
448 &generic_addrcost_table,
449 &cortexa53_regmove_cost,
450 &generic_vector_cost,
451 &generic_branch_cost,
452 4, /* memmov_cost */
453 2, /* issue_rate */
454 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
455 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
456 8, /* function_align. */
457 8, /* jump_align. */
458 4, /* loop_align. */
459 2, /* int_reassoc_width. */
460 4, /* fp_reassoc_width. */
461 1, /* vec_reassoc_width. */
462 2, /* min_div_recip_mul_sf. */
463 2, /* min_div_recip_mul_df. */
464 0, /* max_case_values. */
465 0, /* cache_line_size. */
466 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
467 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
470 static const struct tune_params cortexa57_tunings =
472 &cortexa57_extra_costs,
473 &cortexa57_addrcost_table,
474 &cortexa57_regmove_cost,
475 &cortexa57_vector_cost,
476 &cortexa57_branch_cost,
477 4, /* memmov_cost */
478 3, /* issue_rate */
479 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
480 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
481 16, /* function_align. */
482 8, /* jump_align. */
483 4, /* loop_align. */
484 2, /* int_reassoc_width. */
485 4, /* fp_reassoc_width. */
486 1, /* vec_reassoc_width. */
487 2, /* min_div_recip_mul_sf. */
488 2, /* min_div_recip_mul_df. */
489 0, /* max_case_values. */
490 0, /* cache_line_size. */
491 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
492 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
495 static const struct tune_params cortexa72_tunings =
497 &cortexa57_extra_costs,
498 &cortexa57_addrcost_table,
499 &cortexa57_regmove_cost,
500 &cortexa57_vector_cost,
501 &generic_branch_cost,
502 4, /* memmov_cost */
503 3, /* issue_rate */
504 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
505 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
506 16, /* function_align. */
507 8, /* jump_align. */
508 4, /* loop_align. */
509 2, /* int_reassoc_width. */
510 4, /* fp_reassoc_width. */
511 1, /* vec_reassoc_width. */
512 2, /* min_div_recip_mul_sf. */
513 2, /* min_div_recip_mul_df. */
514 0, /* max_case_values. */
515 0, /* cache_line_size. */
516 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
517 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
520 static const struct tune_params exynosm1_tunings =
522 &exynosm1_extra_costs,
523 &exynosm1_addrcost_table,
524 &exynosm1_regmove_cost,
525 &exynosm1_vector_cost,
526 &generic_branch_cost,
527 4, /* memmov_cost */
528 3, /* issue_rate */
529 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
530 4, /* function_align. */
531 4, /* jump_align. */
532 4, /* loop_align. */
533 2, /* int_reassoc_width. */
534 4, /* fp_reassoc_width. */
535 1, /* vec_reassoc_width. */
536 2, /* min_div_recip_mul_sf. */
537 2, /* min_div_recip_mul_df. */
538 48, /* max_case_values. */
539 64, /* cache_line_size. */
540 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
541 (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
544 static const struct tune_params thunderx_tunings =
546 &thunderx_extra_costs,
547 &generic_addrcost_table,
548 &thunderx_regmove_cost,
549 &generic_vector_cost,
550 &generic_branch_cost,
551 6, /* memmov_cost */
552 2, /* issue_rate */
553 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
554 8, /* function_align. */
555 8, /* jump_align. */
556 8, /* loop_align. */
557 2, /* int_reassoc_width. */
558 4, /* fp_reassoc_width. */
559 1, /* vec_reassoc_width. */
560 2, /* min_div_recip_mul_sf. */
561 2, /* min_div_recip_mul_df. */
562 0, /* max_case_values. */
563 0, /* cache_line_size. */
564 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
565 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
568 static const struct tune_params xgene1_tunings =
570 &xgene1_extra_costs,
571 &xgene1_addrcost_table,
572 &xgene1_regmove_cost,
573 &xgene1_vector_cost,
574 &generic_branch_cost,
575 6, /* memmov_cost */
576 4, /* issue_rate */
577 AARCH64_FUSE_NOTHING, /* fusible_ops */
578 16, /* function_align. */
579 8, /* jump_align. */
580 16, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 0, /* cache_line_size. */
588 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
589 (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
592 /* Support for fine-grained override of the tuning structures. */
593 struct aarch64_tuning_override_function
595 const char* name;
596 void (*parse_override)(const char*, struct tune_params*);
599 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
600 static void aarch64_parse_tune_string (const char*, struct tune_params*);
602 static const struct aarch64_tuning_override_function
603 aarch64_tuning_override_functions[] =
605 { "fuse", aarch64_parse_fuse_string },
606 { "tune", aarch64_parse_tune_string },
607 { NULL, NULL }
610 /* A processor implementing AArch64. */
611 struct processor
613 const char *const name;
614 enum aarch64_processor ident;
615 enum aarch64_processor sched_core;
616 enum aarch64_arch arch;
617 unsigned architecture_version;
618 const unsigned long flags;
619 const struct tune_params *const tune;
622 /* Architectures implementing AArch64. */
623 static const struct processor all_architectures[] =
625 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
626 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
627 #include "aarch64-arches.def"
628 #undef AARCH64_ARCH
629 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
632 /* Processor cores implementing AArch64. */
633 static const struct processor all_cores[] =
635 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
636 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
637 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
638 FLAGS, &COSTS##_tunings},
639 #include "aarch64-cores.def"
640 #undef AARCH64_CORE
641 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
642 AARCH64_FL_FOR_ARCH8, &generic_tunings},
643 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
647 /* Target specification. These are populated by the -march, -mtune, -mcpu
648 handling code or by target attributes. */
649 static const struct processor *selected_arch;
650 static const struct processor *selected_cpu;
651 static const struct processor *selected_tune;
653 /* The current tuning set. */
654 struct tune_params aarch64_tune_params = generic_tunings;
656 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
658 /* An ISA extension in the co-processor and main instruction set space. */
659 struct aarch64_option_extension
661 const char *const name;
662 const unsigned long flags_on;
663 const unsigned long flags_off;
666 typedef enum aarch64_cond_code
668 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
669 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
670 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
672 aarch64_cc;
674 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
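/* Worked examples (illustrative, not from the original source): inverse
   conditions are paired at even/odd indices, so flipping bit 0 inverts
   the condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   (0 ^ 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT   (10 ^ 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS   (8 ^ 1)  */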
676 /* The condition codes of the processor, and the inverse function. */
677 static const char * const aarch64_condition_codes[] =
679 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
680 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
683 /* Generate code to enable conditional branches in functions over 1 MiB. */
684 const char *
685 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
686 const char * branch_format)
688 rtx_code_label * tmp_label = gen_label_rtx ();
689 char label_buf[256];
690 char buffer[128];
691 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
692 CODE_LABEL_NUMBER (tmp_label));
693 const char *label_ptr = targetm.strip_name_encoding (label_buf);
694 rtx dest_label = operands[pos_label];
695 operands[pos_label] = tmp_label;
697 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
698 output_asm_insn (buffer, operands);
700 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
701 operands[pos_label] = dest_label;
702 output_asm_insn (buffer, operands);
703 return "";
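/* A sketch of the output (illustrative, not from the original source),
   assuming the caller passes the inverted condition in BRANCH_FORMAT as
   the far-branch patterns in aarch64.md do: an out-of-range
   "cbz w0, .Lfar" is emitted as

     cbnz    w0, .Ltmp    // short-range branch on the inverted condition
     b       .Lfar        // unconditional branch reaches the real target
   .Ltmp:

   where .Ltmp is the internal label generated above.  */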
706 void
707 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
709 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
710 if (TARGET_GENERAL_REGS_ONLY)
711 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
712 else
713 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
716 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
717 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
718 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
719 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
720 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 721 irrespective of its cost results in bad allocations with many redundant
722 int<->FP moves which are expensive on various cores.
723 To avoid this we don't allow ALL_REGS as the allocno class, but force a
724 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
725 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
726 Otherwise set the allocno class depending on the mode.
727 The result of this is that it is no longer inefficient to have a higher
728 memory move cost than the register move cost.
731 static reg_class_t
732 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
733 reg_class_t best_class)
735 enum machine_mode mode;
737 if (allocno_class != ALL_REGS)
738 return allocno_class;
740 if (best_class != ALL_REGS)
741 return best_class;
743 mode = PSEUDO_REGNO_MODE (regno);
744 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
747 static unsigned int
748 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
750 if (GET_MODE_UNIT_SIZE (mode) == 4)
751 return aarch64_tune_params.min_div_recip_mul_sf;
752 return aarch64_tune_params.min_div_recip_mul_df;
755 static int
756 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
757 enum machine_mode mode)
759 if (VECTOR_MODE_P (mode))
760 return aarch64_tune_params.vec_reassoc_width;
761 if (INTEGRAL_MODE_P (mode))
762 return aarch64_tune_params.int_reassoc_width;
763 if (FLOAT_MODE_P (mode))
764 return aarch64_tune_params.fp_reassoc_width;
765 return 1;
768 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
769 unsigned
770 aarch64_dbx_register_number (unsigned regno)
772 if (GP_REGNUM_P (regno))
773 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
774 else if (regno == SP_REGNUM)
775 return AARCH64_DWARF_SP;
776 else if (FP_REGNUM_P (regno))
777 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
779 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
780 equivalent DWARF register. */
781 return DWARF_FRAME_REGISTERS;
784 /* Return TRUE if MODE is any of the large INT modes. */
785 static bool
786 aarch64_vect_struct_mode_p (machine_mode mode)
788 return mode == OImode || mode == CImode || mode == XImode;
791 /* Return TRUE if MODE is any of the vector modes. */
792 static bool
793 aarch64_vector_mode_p (machine_mode mode)
795 return aarch64_vector_mode_supported_p (mode)
796 || aarch64_vect_struct_mode_p (mode);
799 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
800 static bool
801 aarch64_array_mode_supported_p (machine_mode mode,
802 unsigned HOST_WIDE_INT nelems)
804 if (TARGET_SIMD
805 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
806 || AARCH64_VALID_SIMD_DREG_MODE (mode))
807 && (nelems >= 2 && nelems <= 4))
808 return true;
810 return false;
813 /* Implement HARD_REGNO_NREGS. */
816 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
818 switch (aarch64_regno_regclass (regno))
820 case FP_REGS:
821 case FP_LO_REGS:
822 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
823 default:
824 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
826 gcc_unreachable ();
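/* Worked examples (illustrative, not from the original source), with
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16:

     TImode   (16 bytes) in a general register -> 2 X-registers
     V4SImode (16 bytes) in an FP register     -> 1 Q-register
     OImode   (32 bytes) in FP registers       -> 2 Q-registers  */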
829 /* Implement HARD_REGNO_MODE_OK. */
832 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
834 if (GET_MODE_CLASS (mode) == MODE_CC)
835 return regno == CC_REGNUM;
837 if (regno == SP_REGNUM)
838 /* The purpose of comparing with ptr_mode is to support the
839 global register variable associated with the stack pointer
840 register via the syntax of asm ("wsp") in ILP32. */
841 return mode == Pmode || mode == ptr_mode;
843 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
844 return mode == Pmode;
846 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
847 return 1;
849 if (FP_REGNUM_P (regno))
851 if (aarch64_vect_struct_mode_p (mode))
852 return
853 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
854 else
855 return 1;
858 return 0;
861 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
862 machine_mode
863 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
864 machine_mode mode)
866 /* Handle modes that fit within single registers. */
867 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
869 if (GET_MODE_SIZE (mode) >= 4)
870 return mode;
871 else
872 return SImode;
874 /* Fall back to generic for multi-reg and very large modes. */
875 else
876 return choose_hard_reg_mode (regno, nregs, false);
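/* For instance (a sketch, not from the original source): a single-register
   DFmode value keeps DFmode, an HImode value is widened to SImode so the
   caller-save code uses a full 32-bit access, and multi-register values
   fall back to choose_hard_reg_mode.  */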
879 /* Return true if calls to DECL should be treated as
880 long-calls (ie called via a register). */
881 static bool
882 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
884 return false;
887 /* Return true if calls to symbol-ref SYM should be treated as
888 long-calls (ie called via a register). */
889 bool
890 aarch64_is_long_call_p (rtx sym)
892 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
895 /* Return true if calls to symbol-ref SYM should not go through
896 plt stubs. */
898 bool
899 aarch64_is_noplt_call_p (rtx sym)
901 const_tree decl = SYMBOL_REF_DECL (sym);
903 if (flag_pic
904 && decl
905 && (!flag_plt
906 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
907 && !targetm.binds_local_p (decl))
908 return true;
910 return false;
913 /* Return true if the offsets to a zero/sign-extract operation
914 represent an expression that matches an extend operation. The
 915 operands represent the parameters from
917 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
918 bool
919 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
920 rtx extract_imm)
922 HOST_WIDE_INT mult_val, extract_val;
924 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
925 return false;
927 mult_val = INTVAL (mult_imm);
928 extract_val = INTVAL (extract_imm);
930 if (extract_val > 8
931 && extract_val < GET_MODE_BITSIZE (mode)
932 && exact_log2 (extract_val & ~7) > 0
933 && (extract_val & 7) <= 4
934 && mult_val == (1 << (extract_val & 7)))
935 return true;
937 return false;
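/* A worked example (illustrative, not from the original source): for
   DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 pass the checks above
   (34 & ~7 == 32, a power of two; 34 & 7 == 2; and 4 == 1 << 2), i.e. a
   32-bit zero-extend combined with a left shift by 2, as in

     add     x0, x1, w2, uxtw 2  */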
940 /* Emit an insn that's a simple single-set. Both the operands must be
941 known to be valid. */
942 inline static rtx
943 emit_set_insn (rtx x, rtx y)
945 return emit_insn (gen_rtx_SET (x, y));
948 /* X and Y are two things to compare using CODE. Emit the compare insn and
949 return the rtx for register 0 in the proper mode. */
951 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
953 machine_mode mode = SELECT_CC_MODE (code, x, y);
954 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
956 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
957 return cc_reg;
960 /* Build the SYMBOL_REF for __tls_get_addr. */
962 static GTY(()) rtx tls_get_addr_libfunc;
965 aarch64_tls_get_addr (void)
967 if (!tls_get_addr_libfunc)
968 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
969 return tls_get_addr_libfunc;
972 /* Return the TLS model to use for ADDR. */
974 static enum tls_model
975 tls_symbolic_operand_type (rtx addr)
977 enum tls_model tls_kind = TLS_MODEL_NONE;
978 rtx sym, addend;
980 if (GET_CODE (addr) == CONST)
982 split_const (addr, &sym, &addend);
983 if (GET_CODE (sym) == SYMBOL_REF)
984 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
986 else if (GET_CODE (addr) == SYMBOL_REF)
987 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
989 return tls_kind;
 992 /* We allow LO_SUMs in our legitimate addresses so that combine
 993 can take care of combining addresses where necessary, but for
 994 code generation purposes we generate the address
 995 as:
996 RTL Absolute
997 tmp = hi (symbol_ref); adrp x1, foo
998 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1001 PIC TLS
1002 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1003 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1004 bl __tls_get_addr
1007 Load TLS symbol, depending on TLS mechanism and TLS access model.
1009 Global Dynamic - Traditional TLS:
1010 adrp tmp, :tlsgd:imm
1011 add dest, tmp, #:tlsgd_lo12:imm
1012 bl __tls_get_addr
1014 Global Dynamic - TLS Descriptors:
1015 adrp dest, :tlsdesc:imm
1016 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1017 add dest, dest, #:tlsdesc_lo12:imm
1018 blr tmp
1019 mrs tp, tpidr_el0
1020 add dest, dest, tp
1022 Initial Exec:
1023 mrs tp, tpidr_el0
1024 adrp tmp, :gottprel:imm
1025 ldr dest, [tmp, #:gottprel_lo12:imm]
1026 add dest, dest, tp
1028 Local Exec:
1029 mrs tp, tpidr_el0
1030 add t0, tp, #:tprel_hi12:imm, lsl #12
1031 add t0, t0, #:tprel_lo12_nc:imm
1034 static void
1035 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1036 enum aarch64_symbol_type type)
1038 switch (type)
1040 case SYMBOL_SMALL_ABSOLUTE:
1042 /* In ILP32, the mode of dest can be either SImode or DImode. */
1043 rtx tmp_reg = dest;
1044 machine_mode mode = GET_MODE (dest);
1046 gcc_assert (mode == Pmode || mode == ptr_mode);
1048 if (can_create_pseudo_p ())
1049 tmp_reg = gen_reg_rtx (mode);
1051 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1052 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1053 return;
1056 case SYMBOL_TINY_ABSOLUTE:
1057 emit_insn (gen_rtx_SET (dest, imm));
1058 return;
1060 case SYMBOL_SMALL_GOT_28K:
1062 machine_mode mode = GET_MODE (dest);
1063 rtx gp_rtx = pic_offset_table_rtx;
1064 rtx insn;
1065 rtx mem;
1067 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1068 here before rtl expansion.  Tree IVOPTS will generate rtl patterns to
1069 decide rtx costs, in which case pic_offset_table_rtx is not
1070 initialized.  In that case there is no need to generate the first adrp
1071 instruction, as the final cost for a global variable access is
1072 one instruction.  */
1073 if (gp_rtx != NULL)
1075 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1076 use the page base as the GOT base, the first page may be wasted; in
1077 the worst case only 28K of space is left for the GOT).
1079 The generated instruction sequence for accessing a global variable is:
1082 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1084 Only one instruction is needed, but we must initialize
1085 pic_offset_table_rtx properly.  We generate an initialization insn for
1086 every global access and let CSE remove all the redundant ones.
1088 The final instruction sequence will look like the following when
1089 multiple global variables are accessed.
1091 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1093 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1094 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1095 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1096 ... */
1098 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1099 crtl->uses_pic_offset_table = 1;
1100 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1102 if (mode != GET_MODE (gp_rtx))
1103 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1106 if (mode == ptr_mode)
1108 if (mode == DImode)
1109 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1110 else
1111 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1113 mem = XVECEXP (SET_SRC (insn), 0, 0);
1115 else
1117 gcc_assert (mode == Pmode);
1119 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1120 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1123 /* The operand is expected to be a MEM.  Whenever the related insn
1124 pattern changes, the above code which calculates MEM should be
1125 updated.  */
1126 gcc_assert (GET_CODE (mem) == MEM);
1127 MEM_READONLY_P (mem) = 1;
1128 MEM_NOTRAP_P (mem) = 1;
1129 emit_insn (insn);
1130 return;
1133 case SYMBOL_SMALL_GOT_4G:
1135 /* In ILP32, the mode of dest can be either SImode or DImode,
1136 while the got entry is always of SImode size. The mode of
1137 dest depends on how dest is used: if dest is assigned to a
1138 pointer (e.g. in the memory), it has SImode; it may have
1139 DImode if dest is dereferenced to access the memory.
1140 This is why we have to handle three different ldr_got_small
1141 patterns here (two patterns for ILP32). */
1143 rtx insn;
1144 rtx mem;
1145 rtx tmp_reg = dest;
1146 machine_mode mode = GET_MODE (dest);
1148 if (can_create_pseudo_p ())
1149 tmp_reg = gen_reg_rtx (mode);
1151 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1152 if (mode == ptr_mode)
1154 if (mode == DImode)
1155 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1156 else
1157 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1159 mem = XVECEXP (SET_SRC (insn), 0, 0);
1161 else
1163 gcc_assert (mode == Pmode);
1165 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1166 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1169 gcc_assert (GET_CODE (mem) == MEM);
1170 MEM_READONLY_P (mem) = 1;
1171 MEM_NOTRAP_P (mem) = 1;
1172 emit_insn (insn);
1173 return;
1176 case SYMBOL_SMALL_TLSGD:
1178 rtx_insn *insns;
1179 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1181 start_sequence ();
1182 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1183 insns = get_insns ();
1184 end_sequence ();
1186 RTL_CONST_CALL_P (insns) = 1;
1187 emit_libcall_block (insns, dest, result, imm);
1188 return;
1191 case SYMBOL_SMALL_TLSDESC:
1193 machine_mode mode = GET_MODE (dest);
1194 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1195 rtx tp;
1197 gcc_assert (mode == Pmode || mode == ptr_mode);
1199 /* In ILP32, the got entry is always of SImode size. Unlike
1200 small GOT, the dest is fixed at reg 0. */
1201 if (TARGET_ILP32)
1202 emit_insn (gen_tlsdesc_small_si (imm));
1203 else
1204 emit_insn (gen_tlsdesc_small_di (imm));
1205 tp = aarch64_load_tp (NULL);
1207 if (mode != Pmode)
1208 tp = gen_lowpart (mode, tp);
1210 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1211 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1212 return;
1215 case SYMBOL_SMALL_TLSIE:
1217 /* In ILP32, the mode of dest can be either SImode or DImode,
1218 while the got entry is always of SImode size. The mode of
1219 dest depends on how dest is used: if dest is assigned to a
1220 pointer (e.g. in the memory), it has SImode; it may have
1221 DImode if dest is dereferenced to access the memory.
1222 This is why we have to handle three different tlsie_small
1223 patterns here (two patterns for ILP32). */
1224 machine_mode mode = GET_MODE (dest);
1225 rtx tmp_reg = gen_reg_rtx (mode);
1226 rtx tp = aarch64_load_tp (NULL);
1228 if (mode == ptr_mode)
1230 if (mode == DImode)
1231 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1232 else
1234 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1235 tp = gen_lowpart (mode, tp);
1238 else
1240 gcc_assert (mode == Pmode);
1241 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1244 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1245 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1246 return;
1249 case SYMBOL_TLSLE12:
1250 case SYMBOL_TLSLE24:
1251 case SYMBOL_TLSLE32:
1252 case SYMBOL_TLSLE48:
1254 machine_mode mode = GET_MODE (dest);
1255 rtx tp = aarch64_load_tp (NULL);
1257 if (mode != Pmode)
1258 tp = gen_lowpart (mode, tp);
1260 switch (type)
1262 case SYMBOL_TLSLE12:
1263 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1264 (dest, tp, imm));
1265 break;
1266 case SYMBOL_TLSLE24:
1267 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1268 (dest, tp, imm));
1269 break;
1270 case SYMBOL_TLSLE32:
1271 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1272 (dest, imm));
1273 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1274 (dest, dest, tp));
1275 break;
1276 case SYMBOL_TLSLE48:
1277 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1278 (dest, imm));
1279 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1280 (dest, dest, tp));
1281 break;
1282 default:
1283 gcc_unreachable ();
1286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1287 return;
1290 case SYMBOL_TINY_GOT:
1291 emit_insn (gen_ldr_got_tiny (dest, imm));
1292 return;
1294 case SYMBOL_TINY_TLSIE:
1296 machine_mode mode = GET_MODE (dest);
1297 rtx tp = aarch64_load_tp (NULL);
1299 if (mode == ptr_mode)
1301 if (mode == DImode)
1302 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1303 else
1305 tp = gen_lowpart (mode, tp);
1306 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1309 else
1311 gcc_assert (mode == Pmode);
1312 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1315 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1316 return;
1319 default:
1320 gcc_unreachable ();
1324 /* Emit a move from SRC to DEST. Assume that the move expanders can
1325 handle all moves if !can_create_pseudo_p (). The distinction is
1326 important because, unlike emit_move_insn, the move expanders know
1327 how to force Pmode objects into the constant pool even when the
1328 constant pool address is not itself legitimate. */
1329 static rtx
1330 aarch64_emit_move (rtx dest, rtx src)
1332 return (can_create_pseudo_p ()
1333 ? emit_move_insn (dest, src)
1334 : emit_move_insn_1 (dest, src));
1337 /* Split a 128-bit move operation into two 64-bit move operations,
1338 taking care to handle partial overlap of register to register
1339 copies. Special cases are needed when moving between GP regs and
1340 FP regs. SRC can be a register, constant or memory; DST a register
1341 or memory. If either operand is memory it must not have any side
1342 effects. */
1343 void
1344 aarch64_split_128bit_move (rtx dst, rtx src)
1346 rtx dst_lo, dst_hi;
1347 rtx src_lo, src_hi;
1349 machine_mode mode = GET_MODE (dst);
1351 gcc_assert (mode == TImode || mode == TFmode);
1352 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1353 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1355 if (REG_P (dst) && REG_P (src))
1357 int src_regno = REGNO (src);
1358 int dst_regno = REGNO (dst);
1360 /* Handle FP <-> GP regs. */
1361 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1363 src_lo = gen_lowpart (word_mode, src);
1364 src_hi = gen_highpart (word_mode, src);
1366 if (mode == TImode)
1368 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1369 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1371 else
1373 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1374 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1376 return;
1378 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1380 dst_lo = gen_lowpart (word_mode, dst);
1381 dst_hi = gen_highpart (word_mode, dst);
1383 if (mode == TImode)
1385 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1386 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1388 else
1390 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1391 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1393 return;
1397 dst_lo = gen_lowpart (word_mode, dst);
1398 dst_hi = gen_highpart (word_mode, dst);
1399 src_lo = gen_lowpart (word_mode, src);
1400 src_hi = gen_highpart_mode (word_mode, mode, src);
1402 /* At most one pairing may overlap. */
1403 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1405 aarch64_emit_move (dst_hi, src_hi);
1406 aarch64_emit_move (dst_lo, src_lo);
1408 else
1410 aarch64_emit_move (dst_lo, src_lo);
1411 aarch64_emit_move (dst_hi, src_hi);
1415 bool
1416 aarch64_split_128bit_move_p (rtx dst, rtx src)
1418 return (! REG_P (src)
1419 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
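/* Illustrative examples (not from the original source): a TImode copy
   between general registers, say x0:x1 <- x2:x3, is split into two DImode
   moves (with the overlap check above picking a safe order), while a copy
   between two FP registers, q0 <- q1, is left as a single 128-bit move.  */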
1422 /* Split a complex SIMD combine. */
1424 void
1425 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1427 machine_mode src_mode = GET_MODE (src1);
1428 machine_mode dst_mode = GET_MODE (dst);
1430 gcc_assert (VECTOR_MODE_P (dst_mode));
1432 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1434 rtx (*gen) (rtx, rtx, rtx);
1436 switch (src_mode)
1438 case V8QImode:
1439 gen = gen_aarch64_simd_combinev8qi;
1440 break;
1441 case V4HImode:
1442 gen = gen_aarch64_simd_combinev4hi;
1443 break;
1444 case V2SImode:
1445 gen = gen_aarch64_simd_combinev2si;
1446 break;
1447 case V4HFmode:
1448 gen = gen_aarch64_simd_combinev4hf;
1449 break;
1450 case V2SFmode:
1451 gen = gen_aarch64_simd_combinev2sf;
1452 break;
1453 case DImode:
1454 gen = gen_aarch64_simd_combinedi;
1455 break;
1456 case DFmode:
1457 gen = gen_aarch64_simd_combinedf;
1458 break;
1459 default:
1460 gcc_unreachable ();
1463 emit_insn (gen (dst, src1, src2));
1464 return;
1468 /* Split a complex SIMD move. */
1470 void
1471 aarch64_split_simd_move (rtx dst, rtx src)
1473 machine_mode src_mode = GET_MODE (src);
1474 machine_mode dst_mode = GET_MODE (dst);
1476 gcc_assert (VECTOR_MODE_P (dst_mode));
1478 if (REG_P (dst) && REG_P (src))
1480 rtx (*gen) (rtx, rtx);
1482 gcc_assert (VECTOR_MODE_P (src_mode));
1484 switch (src_mode)
1486 case V16QImode:
1487 gen = gen_aarch64_split_simd_movv16qi;
1488 break;
1489 case V8HImode:
1490 gen = gen_aarch64_split_simd_movv8hi;
1491 break;
1492 case V4SImode:
1493 gen = gen_aarch64_split_simd_movv4si;
1494 break;
1495 case V2DImode:
1496 gen = gen_aarch64_split_simd_movv2di;
1497 break;
1498 case V8HFmode:
1499 gen = gen_aarch64_split_simd_movv8hf;
1500 break;
1501 case V4SFmode:
1502 gen = gen_aarch64_split_simd_movv4sf;
1503 break;
1504 case V2DFmode:
1505 gen = gen_aarch64_split_simd_movv2df;
1506 break;
1507 default:
1508 gcc_unreachable ();
1511 emit_insn (gen (dst, src));
1512 return;
1516 bool
1517 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1518 machine_mode ymode, rtx y)
1520 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1521 gcc_assert (r != NULL);
1522 return rtx_equal_p (x, r);
1526 static rtx
1527 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1529 if (can_create_pseudo_p ())
1530 return force_reg (mode, value);
1531 else
1533 x = aarch64_emit_move (x, value);
1534 return x;
1539 static rtx
1540 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1542 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1544 rtx high;
1545 /* Load the full offset into a register. This
1546 might be improvable in the future. */
1547 high = GEN_INT (offset);
1548 offset = 0;
1549 high = aarch64_force_temporary (mode, temp, high);
1550 reg = aarch64_force_temporary (mode, temp,
1551 gen_rtx_PLUS (mode, high, reg));
1553 return plus_constant (mode, reg, offset);
1556 static int
1557 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1558 machine_mode mode)
1560 int i;
1561 unsigned HOST_WIDE_INT val, val2, mask;
1562 int one_match, zero_match;
1563 int num_insns;
1565 val = INTVAL (imm);
1567 if (aarch64_move_imm (val, mode))
1569 if (generate)
1570 emit_insn (gen_rtx_SET (dest, imm));
1571 return 1;
1574 if ((val >> 32) == 0 || mode == SImode)
1576 if (generate)
1578 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1579 if (mode == SImode)
1580 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1581 GEN_INT ((val >> 16) & 0xffff)));
1582 else
1583 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1584 GEN_INT ((val >> 16) & 0xffff)));
1586 return 2;
1589 /* Remaining cases are all for DImode. */
1591 mask = 0xffff;
1592 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1593 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1594 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1595 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1597 if (zero_match != 2 && one_match != 2)
1599 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1600 For a 64-bit bitmask try whether changing 16 bits to all ones or
1601 zeroes creates a valid bitmask. To check any repeated bitmask,
1602 try using 16 bits from the other 32-bit half of val. */
1604 for (i = 0; i < 64; i += 16, mask <<= 16)
1606 val2 = val & ~mask;
1607 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1608 break;
1609 val2 = val | mask;
1610 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1611 break;
1612 val2 = val2 & ~mask;
1613 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1614 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1615 break;
1617 if (i != 64)
1619 if (generate)
1621 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1622 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1623 GEN_INT ((val >> i) & 0xffff)));
1625 return 2;
1629 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1630 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1631 otherwise skip zero bits. */
1633 num_insns = 1;
1634 mask = 0xffff;
1635 val2 = one_match > zero_match ? ~val : val;
1636 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1638 if (generate)
1639 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1640 ? (val | ~(mask << i))
1641 : (val & (mask << i)))));
1642 for (i += 16; i < 64; i += 16)
1644 if ((val2 & (mask << i)) == 0)
1645 continue;
1646 if (generate)
1647 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1648 GEN_INT ((val >> i) & 0xffff)));
1649 num_insns ++;
1652 return num_insns;
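/* A worked example (a sketch, not from the original source): for
   val == 0x0000cafe00001234, two of the four 16-bit chunks are zero
   (zero_match == 2), so the code above emits

     mov     x0, 0x1234
     movk    x0, 0xcafe, lsl 32

   and returns 2, while 0x1234567800009abc takes a MOV plus two MOVKs.  */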
1656 void
1657 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1659 machine_mode mode = GET_MODE (dest);
1661 gcc_assert (mode == SImode || mode == DImode);
1663 /* Check on what type of symbol it is. */
1664 if (GET_CODE (imm) == SYMBOL_REF
1665 || GET_CODE (imm) == LABEL_REF
1666 || GET_CODE (imm) == CONST)
1668 rtx mem, base, offset;
1669 enum aarch64_symbol_type sty;
1671 /* If we have (const (plus symbol offset)), separate out the offset
1672 before we start classifying the symbol. */
1673 split_const (imm, &base, &offset);
1675 sty = aarch64_classify_symbol (base, offset);
1676 switch (sty)
1678 case SYMBOL_FORCE_TO_MEM:
1679 if (offset != const0_rtx
1680 && targetm.cannot_force_const_mem (mode, imm))
1682 gcc_assert (can_create_pseudo_p ());
1683 base = aarch64_force_temporary (mode, dest, base);
1684 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1685 aarch64_emit_move (dest, base);
1686 return;
1689 mem = force_const_mem (ptr_mode, imm);
1690 gcc_assert (mem);
1692 /* If we aren't generating PC relative literals, then
1693 we need to expand the literal pool access carefully.
1694 This is something that needs to be done in a number
1695 of places, so could well live as a separate function. */
1696 if (aarch64_nopcrelative_literal_loads)
1698 gcc_assert (can_create_pseudo_p ());
1699 base = gen_reg_rtx (ptr_mode);
1700 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1701 mem = gen_rtx_MEM (ptr_mode, base);
1704 if (mode != ptr_mode)
1705 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1707 emit_insn (gen_rtx_SET (dest, mem));
1709 return;
1711 case SYMBOL_SMALL_TLSGD:
1712 case SYMBOL_SMALL_TLSDESC:
1713 case SYMBOL_SMALL_TLSIE:
1714 case SYMBOL_SMALL_GOT_28K:
1715 case SYMBOL_SMALL_GOT_4G:
1716 case SYMBOL_TINY_GOT:
1717 case SYMBOL_TINY_TLSIE:
1718 if (offset != const0_rtx)
1720 gcc_assert (can_create_pseudo_p ());
1721 base = aarch64_force_temporary (mode, dest, base);
1722 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1723 aarch64_emit_move (dest, base);
1724 return;
1726 /* FALLTHRU */
1728 case SYMBOL_SMALL_ABSOLUTE:
1729 case SYMBOL_TINY_ABSOLUTE:
1730 case SYMBOL_TLSLE12:
1731 case SYMBOL_TLSLE24:
1732 case SYMBOL_TLSLE32:
1733 case SYMBOL_TLSLE48:
1734 aarch64_load_symref_appropriately (dest, imm, sty);
1735 return;
1737 default:
1738 gcc_unreachable ();
1742 if (!CONST_INT_P (imm))
1744 if (GET_CODE (imm) == HIGH)
1745 emit_insn (gen_rtx_SET (dest, imm));
1746 else
1748 rtx mem = force_const_mem (mode, imm);
1749 gcc_assert (mem);
1750 emit_insn (gen_rtx_SET (dest, mem));
1753 return;
1756 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1759 static bool
1760 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1761 tree exp ATTRIBUTE_UNUSED)
1763 /* Currently, always true. */
1764 return true;
1767 /* Implement TARGET_PASS_BY_REFERENCE. */
1769 static bool
1770 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1771 machine_mode mode,
1772 const_tree type,
1773 bool named ATTRIBUTE_UNUSED)
1775 HOST_WIDE_INT size;
1776 machine_mode dummymode;
1777 int nregs;
1779 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1780 size = (mode == BLKmode && type)
1781 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1783 /* Aggregates are passed by reference based on their size. */
1784 if (type && AGGREGATE_TYPE_P (type))
1786 size = int_size_in_bytes (type);
1789 /* Variable sized arguments are always passed by reference.  */
1790 if (size < 0)
1791 return true;
1793 /* Can this be a candidate to be passed in fp/simd register(s)? */
1794 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1795 &dummymode, &nregs,
1796 NULL))
1797 return false;
1799 /* Arguments which are variable sized or larger than 2 registers are
1800 passed by reference unless they are a homogeneous floating-point
1801 aggregate. */
1802 return size > 2 * UNITS_PER_WORD;
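/* Illustrative outcomes (not from the original source), assuming LP64:
   a 24-byte struct is passed by reference; a 16-byte struct is passed by
   value in two registers; and struct { double d[4]; } is an HFA, so it is
   passed by value in four FP registers rather than by reference.  */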
1805 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1806 static bool
1807 aarch64_return_in_msb (const_tree valtype)
1809 machine_mode dummy_mode;
1810 int dummy_int;
1812 /* Never happens in little-endian mode. */
1813 if (!BYTES_BIG_ENDIAN)
1814 return false;
1816 /* Only composite types smaller than or equal to 16 bytes can
1817 be potentially returned in registers. */
1818 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1819 || int_size_in_bytes (valtype) <= 0
1820 || int_size_in_bytes (valtype) > 16)
1821 return false;
1823 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1824 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1825 is always passed/returned in the least significant bits of fp/simd
1826 register(s). */
1827 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1828 &dummy_mode, &dummy_int, NULL))
1829 return false;
1831 return true;
1834 /* Implement TARGET_FUNCTION_VALUE.
1835 Define how to find the value returned by a function. */
1837 static rtx
1838 aarch64_function_value (const_tree type, const_tree func,
1839 bool outgoing ATTRIBUTE_UNUSED)
1841 machine_mode mode;
1842 int unsignedp;
1843 int count;
1844 machine_mode ag_mode;
1846 mode = TYPE_MODE (type);
1847 if (INTEGRAL_TYPE_P (type))
1848 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1850 if (aarch64_return_in_msb (type))
1852 HOST_WIDE_INT size = int_size_in_bytes (type);
1854 if (size % UNITS_PER_WORD != 0)
1856 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1857 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1861 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1862 &ag_mode, &count, NULL))
1864 if (!aarch64_composite_type_p (type, mode))
1866 gcc_assert (count == 1 && mode == ag_mode);
1867 return gen_rtx_REG (mode, V0_REGNUM);
1869 else
1871 int i;
1872 rtx par;
1874 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1875 for (i = 0; i < count; i++)
1877 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1878 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1879 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1880 XVECEXP (par, 0, i) = tmp;
1882 return par;
1885 else
1886 return gen_rtx_REG (mode, R0_REGNUM);
1889 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1890 Return true if REGNO is the number of a hard register in which the values
1891 of called function may come back. */
1893 static bool
1894 aarch64_function_value_regno_p (const unsigned int regno)
1896 /* Maximum of 16 bytes can be returned in the general registers. Examples
1897 of 16-byte return values are: 128-bit integers and 16-byte small
1898 structures (excluding homogeneous floating-point aggregates). */
1899 if (regno == R0_REGNUM || regno == R1_REGNUM)
1900 return true;
1902 /* Up to four fp/simd registers can return a function value, e.g. a
1903 homogeneous floating-point aggregate having four members. */
1904 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1905 return TARGET_FLOAT;
1907 return false;
1910 /* Implement TARGET_RETURN_IN_MEMORY.
1912 If the type T of the result of a function is such that
1913 void func (T arg)
1914 would require that arg be passed as a value in a register (or set of
1915 registers) according to the parameter passing rules, then the result
1916 is returned in the same registers as would be used for such an
1917 argument. */
1919 static bool
1920 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1922 HOST_WIDE_INT size;
1923 machine_mode ag_mode;
1924 int count;
1926 if (!AGGREGATE_TYPE_P (type)
1927 && TREE_CODE (type) != COMPLEX_TYPE
1928 && TREE_CODE (type) != VECTOR_TYPE)
1929 /* Simple scalar types are always returned in registers.  */
1930 return false;
1932 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1933 type,
1934 &ag_mode,
1935 &count,
1936 NULL))
1937 return false;
1939 /* Types larger than 2 registers are returned in memory.  */
1940 size = int_size_in_bytes (type);
1941 return (size < 0 || size > 2 * UNITS_PER_WORD);
1944 static bool
1945 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1946 const_tree type, int *nregs)
1948 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1949 return aarch64_vfp_is_call_or_return_candidate (mode,
1950 type,
1951 &pcum->aapcs_vfp_rmode,
1952 nregs,
1953 NULL);
1956 /* Given MODE and TYPE of a function argument, return the alignment in
1957 bits. The idea is to suppress any stronger alignment requested by
1958 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1959 This is a helper function for local use only. */
1961 static unsigned int
1962 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1964 unsigned int alignment;
1966 if (type)
1968 if (!integer_zerop (TYPE_SIZE (type)))
1970 if (TYPE_MODE (type) == mode)
1971 alignment = TYPE_ALIGN (type);
1972 else
1973 alignment = GET_MODE_ALIGNMENT (mode);
1975 else
1976 alignment = 0;
1978 else
1979 alignment = GET_MODE_ALIGNMENT (mode);
1981 return alignment;
1984 /* Layout a function argument according to the AAPCS64 rules. The rule
1985 numbers refer to the rule numbers in the AAPCS64. */
1987 static void
1988 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1989 const_tree type,
1990 bool named ATTRIBUTE_UNUSED)
1992 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1993 int ncrn, nvrn, nregs;
1994 bool allocate_ncrn, allocate_nvrn;
1995 HOST_WIDE_INT size;
1997 /* We need to do this once per argument. */
1998 if (pcum->aapcs_arg_processed)
1999 return;
2001 pcum->aapcs_arg_processed = true;
2003 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2004 size
2005 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2006 UNITS_PER_WORD);
2008 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2009 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2010 mode,
2011 type,
2012 &nregs);
2014 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2015 The following code thus handles passing by SIMD/FP registers first. */
2017 nvrn = pcum->aapcs_nvrn;
2019 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2020 and homogeneous short-vector aggregates (HVA).  */
2021 if (allocate_nvrn)
2023 if (!TARGET_FLOAT)
2024 aarch64_err_no_fpadvsimd (mode, "argument");
2026 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2028 pcum->aapcs_nextnvrn = nvrn + nregs;
2029 if (!aarch64_composite_type_p (type, mode))
2031 gcc_assert (nregs == 1);
2032 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2034 else
2036 rtx par;
2037 int i;
2038 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2039 for (i = 0; i < nregs; i++)
2041 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2042 V0_REGNUM + nvrn + i);
2043 tmp = gen_rtx_EXPR_LIST
2044 (VOIDmode, tmp,
2045 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2046 XVECEXP (par, 0, i) = tmp;
2048 pcum->aapcs_reg = par;
2050 return;
2052 else
2054 /* C.3 NSRN is set to 8. */
2055 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2056 goto on_stack;
2060 ncrn = pcum->aapcs_ncrn;
2061 nregs = size / UNITS_PER_WORD;
2063 /* C.6 - C.9, though the sign and zero extension semantics are
2064 handled elsewhere.  This is the case where the argument fits
2065 entirely in general registers.  */
2066 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2068 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2070 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2072 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2073 rounded up to the next even number. */
2074 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2076 ++ncrn;
2077 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2079 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2080 A reg is still generated for it, but the caller should be smart
2081 enough not to use it. */
2082 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2084 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2086 else
2088 rtx par;
2089 int i;
2091 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2092 for (i = 0; i < nregs; i++)
2094 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2095 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2096 GEN_INT (i * UNITS_PER_WORD));
2097 XVECEXP (par, 0, i) = tmp;
2099 pcum->aapcs_reg = par;
2102 pcum->aapcs_nextncrn = ncrn + nregs;
2103 return;
2106 /* C.11 */
2107 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2109 /* The argument is passed on stack; record the needed number of words for
2110 this argument and align the total size if necessary. */
2111 on_stack:
2112 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2113 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2114 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2115 16 / UNITS_PER_WORD);
2116 return;
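/* As a rough illustration of the rules above (assuming the LP64 ABI,
   where UNITS_PER_WORD is 8 and NUM_ARG_REGS is 8), consider the
   hypothetical call

     void f (int a, __int128 b, struct twelve_bytes c);

   where struct twelve_bytes is a made-up 12-byte aggregate with no
   special alignment.  'a' is allocated to w0, leaving NGRN == 1.
   'b' is 16 bytes with 16-byte alignment, so nregs == 2 and rule C.8
   rounds the odd NGRN up to 2; 'b' occupies x2/x3.  'c' has
   int_size_in_bytes of 12, which ROUND_UP pads to 16 bytes
   (nregs == 2), so it is passed in x4/x5.  */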
2119 /* Implement TARGET_FUNCTION_ARG. */
2121 static rtx
2122 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2123 const_tree type, bool named)
2125 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2126 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2128 if (mode == VOIDmode)
2129 return NULL_RTX;
2131 aarch64_layout_arg (pcum_v, mode, type, named);
2132 return pcum->aapcs_reg;
2135 void
2136 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2137 const_tree fntype ATTRIBUTE_UNUSED,
2138 rtx libname ATTRIBUTE_UNUSED,
2139 const_tree fndecl ATTRIBUTE_UNUSED,
2140 unsigned n_named ATTRIBUTE_UNUSED)
2142 pcum->aapcs_ncrn = 0;
2143 pcum->aapcs_nvrn = 0;
2144 pcum->aapcs_nextncrn = 0;
2145 pcum->aapcs_nextnvrn = 0;
2146 pcum->pcs_variant = ARM_PCS_AAPCS64;
2147 pcum->aapcs_reg = NULL_RTX;
2148 pcum->aapcs_arg_processed = false;
2149 pcum->aapcs_stack_words = 0;
2150 pcum->aapcs_stack_size = 0;
2152 if (!TARGET_FLOAT
2153 && fndecl && TREE_PUBLIC (fndecl)
2154 && fntype && fntype != error_mark_node)
2156 const_tree type = TREE_TYPE (fntype);
2157 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2158 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2159 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2160 &mode, &nregs, NULL))
2161 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2163 return;
2166 static void
2167 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2168 machine_mode mode,
2169 const_tree type,
2170 bool named)
2172 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2173 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2175 aarch64_layout_arg (pcum_v, mode, type, named);
2176 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2177 != (pcum->aapcs_stack_words != 0));
2178 pcum->aapcs_arg_processed = false;
2179 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2180 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2181 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2182 pcum->aapcs_stack_words = 0;
2183 pcum->aapcs_reg = NULL_RTX;
2187 bool
2188 aarch64_function_arg_regno_p (unsigned regno)
2190 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2191 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2194 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2195 PARM_BOUNDARY bits of alignment, but will be given anything up
2196 to STACK_BOUNDARY bits if the type requires it. This makes sure
2197 that both before and after the layout of each argument, the Next
2198 Stacked Argument Address (NSAA) will have a minimum alignment of
2199 8 bytes. */
2201 static unsigned int
2202 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2204 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2206 if (alignment < PARM_BOUNDARY)
2207 alignment = PARM_BOUNDARY;
2208 if (alignment > STACK_BOUNDARY)
2209 alignment = STACK_BOUNDARY;
2210 return alignment;
2213 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2215 Return true if an argument passed on the stack should be padded upwards,
2216 i.e. if the least-significant byte of the stack slot has useful data.
2218 Small aggregate types are placed in the lowest memory address.
2220 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2222 bool
2223 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2225 /* On little-endian targets, the least significant byte of every stack
2226 argument is passed at the lowest byte address of the stack slot. */
2227 if (!BYTES_BIG_ENDIAN)
2228 return true;
2230 /* Otherwise, integral, floating-point and pointer types are padded downward:
2231 the least significant byte of a stack argument is passed at the highest
2232 byte address of the stack slot. */
2233 if (type
2234 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2235 || POINTER_TYPE_P (type))
2236 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2237 return false;
2239 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2240 return true;
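/* A rough illustration for a big-endian target, showing an 8-byte
   stack slot from its lowest to its highest byte address:

     struct { char c[3]; }  :  c0 c1 c2 -- -- -- -- --   (padded upward)
     short                  :  -- -- -- -- -- -- hi lo   (padded downward)

   On a little-endian target both would be padded upward.  */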
2243 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2245 It specifies padding for the last (and possibly the only)
2246 element of a block move between registers and memory. Assuming
2247 the block is in memory, padding upward means that the last
2248 element is padded after its most significant byte, while with
2249 downward padding the last element is padded on its least
2250 significant byte side.
2252 Small aggregates and small complex types are always padded
2253 upwards.
2255 We don't need to worry about homogeneous floating-point or
2256 short-vector aggregates; their move is not affected by the
2257 padding direction determined here. Regardless of endianness,
2258 each element of such an aggregate is put in the least
2259 significant bits of a fp/simd register.
2261 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2262 register has useful data, and return the opposite if the most
2263 significant byte does. */
2265 bool
2266 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2267 bool first ATTRIBUTE_UNUSED)
2270 /* Small composite types are always padded upward. */
2271 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2273 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2274 : GET_MODE_SIZE (mode));
2275 if (size < 2 * UNITS_PER_WORD)
2276 return true;
2279 /* Otherwise, use the default padding. */
2280 return !BYTES_BIG_ENDIAN;
2283 static machine_mode
2284 aarch64_libgcc_cmp_return_mode (void)
2286 return SImode;
2289 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2291 /* We use the 12-bit shifted immediate arithmetic instructions so values
2292 must be multiple of (1 << 12), i.e. 4096. */
2293 #define ARITH_FACTOR 4096
2295 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2296 #error Cannot use simple address calculation for stack probing
2297 #endif
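/* With the generic default of STACK_CHECK_PROBE_INTERVAL_EXP == 12,
   PROBE_INTERVAL == 4096 == ARITH_FACTOR, so the check above reduces
   to 4096 % 4096 == 0.  */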
2299 /* The pair of scratch registers used for stack probing. */
2300 #define PROBE_STACK_FIRST_REG 9
2301 #define PROBE_STACK_SECOND_REG 10
2303 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2304 inclusive. These are offsets from the current stack pointer. */
2306 static void
2307 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2309 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2311 /* See the same assertion on PROBE_INTERVAL above. */
2312 gcc_assert ((first % ARITH_FACTOR) == 0);
2314 /* See if we have a constant small number of probes to generate. If so,
2315 that's the easy case. */
2316 if (size <= PROBE_INTERVAL)
2318 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2320 emit_set_insn (reg1,
2321 plus_constant (ptr_mode,
2322 stack_pointer_rtx, -(first + base)));
2323 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2326 /* The run-time loop is made up of 8 insns in the generic case while the
2327 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2328 else if (size <= 4 * PROBE_INTERVAL)
2330 HOST_WIDE_INT i, rem;
2332 emit_set_insn (reg1,
2333 plus_constant (ptr_mode,
2334 stack_pointer_rtx,
2335 -(first + PROBE_INTERVAL)));
2336 emit_stack_probe (reg1);
2338 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2339 it exceeds SIZE. If only two probes are needed, this will not
2340 generate any code. Then probe at FIRST + SIZE. */
2341 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2343 emit_set_insn (reg1,
2344 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2345 emit_stack_probe (reg1);
2348 rem = size - (i - PROBE_INTERVAL);
2349 if (rem > 256)
2351 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2353 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2354 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2356 else
2357 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2360 /* Otherwise, do the same as above, but in a loop. Note that we must be
2361 extra careful with variables wrapping around because we might be at
2362 the very top (or the very bottom) of the address space and we have
2363 to be able to handle this case properly; in particular, we use an
2364 equality test for the loop condition. */
2365 else
2367 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2369 /* Step 1: round SIZE to the previous multiple of the interval. */
2371 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2374 /* Step 2: compute initial and final value of the loop counter. */
2376 /* TEST_ADDR = SP + FIRST. */
2377 emit_set_insn (reg1,
2378 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2380 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2381 emit_set_insn (reg2,
2382 plus_constant (ptr_mode, stack_pointer_rtx,
2383 -(first + rounded_size)));
2386 /* Step 3: the loop
2390 do { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2391 probe at TEST_ADDR
2393 } while (TEST_ADDR != LAST_ADDR)
2395 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2396 until it is equal to ROUNDED_SIZE. */
2398 if (ptr_mode == DImode)
2399 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2400 else
2401 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2404 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2405 that SIZE is equal to ROUNDED_SIZE. */
2407 if (size != rounded_size)
2409 HOST_WIDE_INT rem = size - rounded_size;
2411 if (rem > 256)
2413 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2415 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2416 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2418 else
2419 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2423 /* Make sure nothing is scheduled before we are done. */
2424 emit_insn (gen_blockage ());
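/* A worked example of the middle case above, assuming
   PROBE_INTERVAL == 4096: for FIRST == 4096 and SIZE == 12588 the
   probes land at SP - 8192, SP - 12288 and SP - 16384, and since the
   remainder 12588 - 12288 == 300 exceeds 256, a final probe is
   emitted at SP - 16684, i.e. exactly FIRST + SIZE below the
   incoming stack pointer.  */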
2427 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2428 absolute addresses. */
2430 const char *
2431 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2433 static int labelno = 0;
2434 char loop_lab[32];
2435 rtx xops[2];
2437 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2439 /* Loop. */
2440 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2442 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2443 xops[0] = reg1;
2444 xops[1] = GEN_INT (PROBE_INTERVAL);
2445 output_asm_insn ("sub\t%0, %0, %1", xops);
2447 /* Probe at TEST_ADDR. */
2448 output_asm_insn ("str\txzr, [%0]", xops);
2450 /* Test if TEST_ADDR == LAST_ADDR. */
2451 xops[1] = reg2;
2452 output_asm_insn ("cmp\t%0, %1", xops);
2454 /* Branch. */
2455 fputs ("\tb.ne\t", asm_out_file);
2456 assemble_name_raw (asm_out_file, loop_lab);
2457 fputc ('\n', asm_out_file);
2459 return "";
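/* With the 4096-byte probe interval and the x9/x10 scratch registers
   chosen above, the emitted loop looks roughly like:

	.LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0
*/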
2462 static bool
2463 aarch64_frame_pointer_required (void)
2465 /* In aarch64_override_options_after_change
2466 flag_omit_leaf_frame_pointer turns off the frame pointer by
2467 default. Turn it back on now if we've not got a leaf
2468 function. */
2469 if (flag_omit_leaf_frame_pointer
2470 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2471 return true;
2473 return false;
2476 /* Mark the registers that need to be saved by the callee and calculate
2477 the size of the callee-saved registers area and frame record (both FP
2478 and LR may be omitted). */
2479 static void
2480 aarch64_layout_frame (void)
2482 HOST_WIDE_INT offset = 0;
2483 int regno;
2485 if (reload_completed && cfun->machine->frame.laid_out)
2486 return;
2488 #define SLOT_NOT_REQUIRED (-2)
2489 #define SLOT_REQUIRED (-1)
2491 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2492 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2494 /* First mark all the registers that really need to be saved... */
2495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2496 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2498 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2501 /* ... that includes the eh data registers (if needed)... */
2502 if (crtl->calls_eh_return)
2503 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2504 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2505 = SLOT_REQUIRED;
2507 /* ... and any callee saved register that dataflow says is live. */
2508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2509 if (df_regs_ever_live_p (regno)
2510 && (regno == R30_REGNUM
2511 || !call_used_regs[regno]))
2512 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2514 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2515 if (df_regs_ever_live_p (regno)
2516 && !call_used_regs[regno])
2517 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2519 if (frame_pointer_needed)
2521 /* FP and LR are placed in the linkage record. */
2522 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2523 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2524 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2525 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2526 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2527 offset += 2 * UNITS_PER_WORD;
2530 /* Now assign stack slots for them. */
2531 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2532 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2534 cfun->machine->frame.reg_offset[regno] = offset;
2535 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2536 cfun->machine->frame.wb_candidate1 = regno;
2537 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2538 cfun->machine->frame.wb_candidate2 = regno;
2539 offset += UNITS_PER_WORD;
2542 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2543 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2545 cfun->machine->frame.reg_offset[regno] = offset;
2546 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2547 cfun->machine->frame.wb_candidate1 = regno;
2548 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2549 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2550 cfun->machine->frame.wb_candidate2 = regno;
2551 offset += UNITS_PER_WORD;
2554 cfun->machine->frame.padding0 =
2555 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2556 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2558 cfun->machine->frame.saved_regs_size = offset;
2560 cfun->machine->frame.hard_fp_offset
2561 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2562 + get_frame_size ()
2563 + cfun->machine->frame.saved_regs_size,
2564 STACK_BOUNDARY / BITS_PER_UNIT);
2566 cfun->machine->frame.frame_size
2567 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2568 + crtl->outgoing_args_size,
2569 STACK_BOUNDARY / BITS_PER_UNIT);
2571 cfun->machine->frame.laid_out = true;
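/* A worked example of the layout above: suppose the frame pointer is
   needed, x19, x20 and d8 must be saved, there are 32 bytes of local
   variables, and there is no varargs save area and no outgoing
   argument area.  Then:

     reg_offset[x29] = 0    reg_offset[x30] = 8
     reg_offset[x19] = 16   reg_offset[x20] = 24
     reg_offset[d8]  = 32   padding0        = 8
     saved_regs_size = 48
     hard_fp_offset  = ROUND_UP (0 + 32 + 48, 16) = 80
     frame_size      = ROUND_UP (80 + 0, 16)      = 80

   and the write-back candidates are x29/x30, which form the frame
   record.  */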
2574 static bool
2575 aarch64_register_saved_on_entry (int regno)
2577 return cfun->machine->frame.reg_offset[regno] >= 0;
2580 static unsigned
2581 aarch64_next_callee_save (unsigned regno, unsigned limit)
2583 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2584 regno ++;
2585 return regno;
2588 static void
2589 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2590 HOST_WIDE_INT adjustment)
2592 rtx base_rtx = stack_pointer_rtx;
2593 rtx insn, reg, mem;
2595 reg = gen_rtx_REG (mode, regno);
2596 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2597 plus_constant (Pmode, base_rtx, -adjustment));
2598 mem = gen_rtx_MEM (mode, mem);
2600 insn = emit_move_insn (mem, reg);
2601 RTX_FRAME_RELATED_P (insn) = 1;
2604 static rtx
2605 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2606 HOST_WIDE_INT adjustment)
2608 switch (mode)
2610 case DImode:
2611 return gen_storewb_pairdi_di (base, base, reg, reg2,
2612 GEN_INT (-adjustment),
2613 GEN_INT (UNITS_PER_WORD - adjustment));
2614 case DFmode:
2615 return gen_storewb_pairdf_di (base, base, reg, reg2,
2616 GEN_INT (-adjustment),
2617 GEN_INT (UNITS_PER_WORD - adjustment));
2618 default:
2619 gcc_unreachable ();
2623 static void
2624 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2625 unsigned regno2, HOST_WIDE_INT adjustment)
2627 rtx_insn *insn;
2628 rtx reg1 = gen_rtx_REG (mode, regno1);
2629 rtx reg2 = gen_rtx_REG (mode, regno2);
2631 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2632 reg2, adjustment));
2633 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2634 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2635 RTX_FRAME_RELATED_P (insn) = 1;
2638 static rtx
2639 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2640 HOST_WIDE_INT adjustment)
2642 switch (mode)
2644 case DImode:
2645 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2646 GEN_INT (UNITS_PER_WORD));
2647 case DFmode:
2648 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2649 GEN_INT (UNITS_PER_WORD));
2650 default:
2651 gcc_unreachable ();
2655 static rtx
2656 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2657 rtx reg2)
2659 switch (mode)
2661 case DImode:
2662 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2664 case DFmode:
2665 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2667 default:
2668 gcc_unreachable ();
2672 static rtx
2673 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2674 rtx mem2)
2676 switch (mode)
2678 case DImode:
2679 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2681 case DFmode:
2682 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2684 default:
2685 gcc_unreachable ();
2690 static void
2691 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2692 unsigned start, unsigned limit, bool skip_wb)
2694 rtx_insn *insn;
2695 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2696 ? gen_frame_mem : gen_rtx_MEM);
2697 unsigned regno;
2698 unsigned regno2;
2700 for (regno = aarch64_next_callee_save (start, limit);
2701 regno <= limit;
2702 regno = aarch64_next_callee_save (regno + 1, limit))
2704 rtx reg, mem;
2705 HOST_WIDE_INT offset;
2707 if (skip_wb
2708 && (regno == cfun->machine->frame.wb_candidate1
2709 || regno == cfun->machine->frame.wb_candidate2))
2710 continue;
2712 reg = gen_rtx_REG (mode, regno);
2713 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2714 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2715 offset));
2717 regno2 = aarch64_next_callee_save (regno + 1, limit);
2719 if (regno2 <= limit
2720 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2721 == cfun->machine->frame.reg_offset[regno2]))
2724 rtx reg2 = gen_rtx_REG (mode, regno2);
2725 rtx mem2;
2727 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2728 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2729 offset));
2730 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2731 reg2));
2733 /* The first part of a frame-related parallel insn is
2734 always assumed to be relevant to the frame
2735 calculations; subsequent parts, are only
2736 frame-related if explicitly marked. */
2737 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2738 regno = regno2;
2740 else
2741 insn = emit_move_insn (mem, reg);
2743 RTX_FRAME_RELATED_P (insn) = 1;
2747 static void
2748 aarch64_restore_callee_saves (machine_mode mode,
2749 HOST_WIDE_INT start_offset, unsigned start,
2750 unsigned limit, bool skip_wb, rtx *cfi_ops)
2752 rtx base_rtx = stack_pointer_rtx;
2753 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2754 ? gen_frame_mem : gen_rtx_MEM);
2755 unsigned regno;
2756 unsigned regno2;
2757 HOST_WIDE_INT offset;
2759 for (regno = aarch64_next_callee_save (start, limit);
2760 regno <= limit;
2761 regno = aarch64_next_callee_save (regno + 1, limit))
2763 rtx reg, mem;
2765 if (skip_wb
2766 && (regno == cfun->machine->frame.wb_candidate1
2767 || regno == cfun->machine->frame.wb_candidate2))
2768 continue;
2770 reg = gen_rtx_REG (mode, regno);
2771 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2772 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2774 regno2 = aarch64_next_callee_save (regno + 1, limit);
2776 if (regno2 <= limit
2777 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2778 == cfun->machine->frame.reg_offset[regno2]))
2780 rtx reg2 = gen_rtx_REG (mode, regno2);
2781 rtx mem2;
2783 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2784 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2785 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2787 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2788 regno = regno2;
2790 else
2791 emit_move_insn (reg, mem);
2792 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2796 /* AArch64 stack frames generated by this compiler look like:
2798 +-------------------------------+
2800 | incoming stack arguments |
2802 +-------------------------------+
2803 | | <-- incoming stack pointer (aligned)
2804 | callee-allocated save area |
2805 | for register varargs |
2807 +-------------------------------+
2808 | local variables | <-- frame_pointer_rtx
2810 +-------------------------------+
2811 | padding0 | \
2812 +-------------------------------+ |
2813 | callee-saved registers | | frame.saved_regs_size
2814 +-------------------------------+ |
2815 | LR' | |
2816 +-------------------------------+ |
2817 | FP' | / <- hard_frame_pointer_rtx (aligned)
2818 +-------------------------------+
2819 | dynamic allocation |
2820 +-------------------------------+
2821 | padding |
2822 +-------------------------------+
2823 | outgoing stack arguments | <-- arg_pointer
2825 +-------------------------------+
2826 | | <-- stack_pointer_rtx (aligned)
2828 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2829 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2830 unchanged. */
2832 /* Generate the prologue instructions for entry into a function.
2833 Establish the stack frame by decreasing the stack pointer with a
2834 properly calculated size and, if necessary, create a frame record
2835 filled with the values of LR and previous frame pointer. The
2836 current FP is also set up if it is in use. */
2838 void
2839 aarch64_expand_prologue (void)
2841 /* sub sp, sp, #<frame_size>
2842 stp {fp, lr}, [sp, #<frame_size> - 16]
2843 add fp, sp, #<frame_size> - hardfp_offset
2844 stp {cs_reg}, [fp, #-16] etc.
2846 sub sp, sp, <final_adjustment_if_any> */
2848 HOST_WIDE_INT frame_size, offset;
2849 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2850 HOST_WIDE_INT hard_fp_offset;
2851 rtx_insn *insn;
2853 aarch64_layout_frame ();
2855 offset = frame_size = cfun->machine->frame.frame_size;
2856 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2857 fp_offset = frame_size - hard_fp_offset;
2859 if (flag_stack_usage_info)
2860 current_function_static_stack_size = frame_size;
2862 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
2864 if (crtl->is_leaf && !cfun->calls_alloca)
2866 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
2867 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
2868 frame_size - STACK_CHECK_PROTECT);
2870 else if (frame_size > 0)
2871 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
2874 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2875 if (offset >= 512)
2877 /* When the frame has a large size, an initial decrease is done on
2878 the stack pointer to jump over the callee-allocated save area for
2879 register varargs, the local variable area and/or the callee-saved
2880 register area. This will allow the pre-index write-back
2881 store pair instructions to be used for setting up the stack frame
2882 efficiently. */
2883 offset = hard_fp_offset;
2884 if (offset >= 512)
2885 offset = cfun->machine->frame.saved_regs_size;
2887 frame_size -= (offset + crtl->outgoing_args_size);
2888 fp_offset = 0;
2890 if (frame_size >= 0x1000000)
2892 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2893 emit_move_insn (op0, GEN_INT (-frame_size));
2894 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2896 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2897 gen_rtx_SET (stack_pointer_rtx,
2898 plus_constant (Pmode, stack_pointer_rtx,
2899 -frame_size)));
2900 RTX_FRAME_RELATED_P (insn) = 1;
2902 else if (frame_size > 0)
2904 int hi_ofs = frame_size & 0xfff000;
2905 int lo_ofs = frame_size & 0x000fff;
2907 if (hi_ofs)
2909 insn = emit_insn (gen_add2_insn
2910 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2911 RTX_FRAME_RELATED_P (insn) = 1;
2913 if (lo_ofs)
2915 insn = emit_insn (gen_add2_insn
2916 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2917 RTX_FRAME_RELATED_P (insn) = 1;
2921 else
2922 frame_size = -1;
2924 if (offset > 0)
2926 bool skip_wb = false;
2928 if (frame_pointer_needed)
2930 skip_wb = true;
2932 if (fp_offset)
2934 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2935 GEN_INT (-offset)));
2936 RTX_FRAME_RELATED_P (insn) = 1;
2938 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2939 R30_REGNUM, false);
2941 else
2942 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2944 /* Set up frame pointer to point to the location of the
2945 previous frame pointer on the stack. */
2946 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2947 stack_pointer_rtx,
2948 GEN_INT (fp_offset)));
2949 RTX_FRAME_RELATED_P (insn) = 1;
2950 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2952 else
2954 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2955 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2957 if (fp_offset
2958 || reg1 == FIRST_PSEUDO_REGISTER
2959 || (reg2 == FIRST_PSEUDO_REGISTER
2960 && offset >= 256))
2962 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2963 GEN_INT (-offset)));
2964 RTX_FRAME_RELATED_P (insn) = 1;
2966 else
2968 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2970 skip_wb = true;
2972 if (reg2 == FIRST_PSEUDO_REGISTER)
2973 aarch64_pushwb_single_reg (mode1, reg1, offset);
2974 else
2975 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2979 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2980 skip_wb);
2981 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2982 skip_wb);
2985 /* when offset >= 512,
2986 sub sp, sp, #<outgoing_args_size> */
2987 if (frame_size > -1)
2989 if (crtl->outgoing_args_size > 0)
2991 insn = emit_insn (gen_add2_insn
2992 (stack_pointer_rtx,
2993 GEN_INT (- crtl->outgoing_args_size)));
2994 RTX_FRAME_RELATED_P (insn) = 1;
2999 /* Return TRUE if we can use a simple_return insn.
3001 This function checks whether the callee-saved stack is empty, which
3002 means no restore actions are needed. The pro_and_epilogue pass will
3003 use this to check whether the shrink-wrapping optimization is feasible. */
3005 bool
3006 aarch64_use_return_insn_p (void)
3008 if (!reload_completed)
3009 return false;
3011 if (crtl->profile)
3012 return false;
3014 aarch64_layout_frame ();
3016 return cfun->machine->frame.frame_size == 0;
3019 /* Generate the epilogue instructions for returning from a function. */
3020 void
3021 aarch64_expand_epilogue (bool for_sibcall)
3023 HOST_WIDE_INT frame_size, offset;
3024 HOST_WIDE_INT fp_offset;
3025 HOST_WIDE_INT hard_fp_offset;
3026 rtx_insn *insn;
3027 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3028 bool need_barrier_p = (get_frame_size () != 0
3029 || cfun->machine->frame.saved_varargs_size);
3031 aarch64_layout_frame ();
3033 offset = frame_size = cfun->machine->frame.frame_size;
3034 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
3035 fp_offset = frame_size - hard_fp_offset;
3037 /* Store pairs and load pairs have an offset range of only -512 to 504. */
3038 if (offset >= 512)
3040 offset = hard_fp_offset;
3041 if (offset >= 512)
3042 offset = cfun->machine->frame.saved_regs_size;
3044 frame_size -= (offset + crtl->outgoing_args_size);
3045 fp_offset = 0;
3046 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
3048 insn = emit_insn (gen_add2_insn
3049 (stack_pointer_rtx,
3050 GEN_INT (crtl->outgoing_args_size)));
3051 RTX_FRAME_RELATED_P (insn) = 1;
3054 else
3055 frame_size = -1;
3057 /* If there were outgoing arguments or we've done dynamic stack
3058 allocation, then restore the stack pointer from the frame
3059 pointer. This is at most one insn and more efficient than using
3060 GCC's internal mechanism. */
3061 if (frame_pointer_needed
3062 && (crtl->outgoing_args_size || cfun->calls_alloca))
3064 if (cfun->calls_alloca)
3065 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3067 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3068 hard_frame_pointer_rtx,
3069 GEN_INT (0)));
3070 offset = offset - fp_offset;
3073 if (offset > 0)
3075 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3076 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3077 bool skip_wb = true;
3078 rtx cfi_ops = NULL;
3080 if (frame_pointer_needed)
3081 fp_offset = 0;
3082 else if (fp_offset
3083 || reg1 == FIRST_PSEUDO_REGISTER
3084 || (reg2 == FIRST_PSEUDO_REGISTER
3085 && offset >= 256))
3086 skip_wb = false;
3088 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
3089 skip_wb, &cfi_ops);
3090 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
3091 skip_wb, &cfi_ops);
3093 if (need_barrier_p)
3094 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3096 if (skip_wb)
3098 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
3099 rtx rreg1 = gen_rtx_REG (mode1, reg1);
3101 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
3102 if (reg2 == FIRST_PSEUDO_REGISTER)
3104 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
3105 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3106 mem = gen_rtx_MEM (mode1, mem);
3107 insn = emit_move_insn (rreg1, mem);
3109 else
3111 rtx rreg2 = gen_rtx_REG (mode1, reg2);
3113 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
3114 insn = emit_insn (aarch64_gen_loadwb_pair
3115 (mode1, stack_pointer_rtx, rreg1,
3116 rreg2, offset));
3119 else
3121 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
3122 GEN_INT (offset)));
3125 /* Reset the CFA to be SP + FRAME_SIZE. */
3126 rtx new_cfa = stack_pointer_rtx;
3127 if (frame_size > 0)
3128 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
3129 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3130 REG_NOTES (insn) = cfi_ops;
3131 RTX_FRAME_RELATED_P (insn) = 1;
3134 if (frame_size > 0)
3136 if (need_barrier_p)
3137 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3139 if (frame_size >= 0x1000000)
3141 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3142 emit_move_insn (op0, GEN_INT (frame_size));
3143 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
3145 else
3147 int hi_ofs = frame_size & 0xfff000;
3148 int lo_ofs = frame_size & 0x000fff;
3150 if (hi_ofs && lo_ofs)
3152 insn = emit_insn (gen_add2_insn
3153 (stack_pointer_rtx, GEN_INT (hi_ofs)));
3154 RTX_FRAME_RELATED_P (insn) = 1;
3155 frame_size = lo_ofs;
3157 insn = emit_insn (gen_add2_insn
3158 (stack_pointer_rtx, GEN_INT (frame_size)));
3161 /* Reset the CFA to be SP + 0. */
3162 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
3163 RTX_FRAME_RELATED_P (insn) = 1;
3166 /* Stack adjustment for exception handler. */
3167 if (crtl->calls_eh_return)
3169 /* We need to unwind the stack by the offset computed by
3170 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3171 to be SP; letting the CFA move during this adjustment
3172 is just as correct as retaining the CFA from the body
3173 of the function. Therefore, do nothing special. */
3174 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3177 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3178 if (!for_sibcall)
3179 emit_jump_insn (ret_rtx);
3182 /* Return the place to copy the exception unwinding return address to.
3183 This will probably be a stack slot, but could (in theory) be the
3184 return register. */
3186 rtx aarch64_final_eh_return_addr (void)
3188 HOST_WIDE_INT fp_offset;
3190 aarch64_layout_frame ();
3192 fp_offset = cfun->machine->frame.frame_size
3193 - cfun->machine->frame.hard_fp_offset;
3195 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3196 return gen_rtx_REG (DImode, LR_REGNUM);
3198 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3199 result in a store to save LR introduced by builtin_eh_return () being
3200 incorrectly deleted because the alias is not detected.
3201 So in the calculation of the address to copy the exception unwinding
3202 return address to, we note 2 cases.
3203 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3204 we return a SP-relative location since all the addresses are SP-relative
3205 in this case. This prevents the store from being optimized away.
3206 If the fp_offset is not 0, then the addresses will be FP-relative and
3207 therefore we return a FP-relative location. */
3209 if (frame_pointer_needed)
3211 if (fp_offset)
3212 return gen_frame_mem (DImode,
3213 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3214 else
3215 return gen_frame_mem (DImode,
3216 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3219 /* If FP is not needed, we calculate the location of LR, which would be
3220 at the top of the saved registers block. */
3222 return gen_frame_mem (DImode,
3223 plus_constant (Pmode,
3224 stack_pointer_rtx,
3225 fp_offset
3226 + cfun->machine->frame.saved_regs_size
3227 - 2 * UNITS_PER_WORD));
3230 /* Possibly output code to build up a constant in a register. For
3231 the benefit of the costs infrastructure, returns the number of
3232 instructions which would be emitted. GENERATE inhibits or
3233 enables code generation. */
3235 static int
3236 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
3238 int insns = 0;
3240 if (aarch64_bitmask_imm (val, DImode))
3242 if (generate)
3243 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
3244 insns = 1;
3246 else
3248 int i;
3249 int ncount = 0;
3250 int zcount = 0;
3251 HOST_WIDE_INT valp = val >> 16;
3252 HOST_WIDE_INT valm;
3253 HOST_WIDE_INT tval;
3255 for (i = 16; i < 64; i += 16)
3257 valm = (valp & 0xffff);
3259 if (valm != 0)
3260 ++ zcount;
3262 if (valm != 0xffff)
3263 ++ ncount;
3265 valp >>= 16;
3268 /* zcount contains the number of additional MOVK instructions
3269 required if the constant is built up with an initial MOVZ instruction,
3270 while ncount is the number of MOVK instructions required if starting
3271 with a MOVN instruction. Choose the sequence that yields the fewest
3272 instructions, preferring MOVZ instructions when both counts are
3273 the same. */
3274 if (ncount < zcount)
3276 if (generate)
3277 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3278 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3279 tval = 0xffff;
3280 insns++;
3282 else
3284 if (generate)
3285 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3286 GEN_INT (val & 0xffff));
3287 tval = 0;
3288 insns++;
3291 val >>= 16;
3293 for (i = 16; i < 64; i += 16)
3295 if ((val & 0xffff) != tval)
3297 if (generate)
3298 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3299 GEN_INT (i),
3300 GEN_INT (val & 0xffff)));
3301 insns++;
3303 val >>= 16;
3306 return insns;
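/* Two illustrative inputs for the MOVZ/MOVN selection above:

     0x12345678: the upper 16-bit chunks are 0x0000, 0x0000 and 0x1234,
       so zcount == 1 and ncount == 3; the MOVZ sequence wins and two
       insns are counted, roughly (for regnum 0)
	 movz x0, #0x5678
	 movk x0, #0x1234, lsl #16

     0xffffffffffff1234: every upper chunk is 0xffff, so ncount == 0
       and a single MOVN-style move is enough.  */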
3309 static void
3310 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3312 HOST_WIDE_INT mdelta = delta;
3313 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3314 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3316 if (mdelta < 0)
3317 mdelta = -mdelta;
3319 if (mdelta >= 4096 * 4096)
3321 (void) aarch64_build_constant (scratchreg, delta, true);
3322 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3324 else if (mdelta > 0)
3326 if (mdelta >= 4096)
3328 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3329 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3330 if (delta < 0)
3331 emit_insn (gen_rtx_SET (this_rtx,
3332 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3333 else
3334 emit_insn (gen_rtx_SET (this_rtx,
3335 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3337 if (mdelta % 4096 != 0)
3339 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3340 emit_insn (gen_rtx_SET (this_rtx,
3341 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
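/* For example, a hypothetical aarch64_add_constant (R0_REGNUM,
   IP1_REGNUM, 10000) emits roughly

	mov	x17, #2
	add	x0, x0, x17, lsl #12	// x0 += 2 * 4096 == 8192
	add	x0, x0, #1808		// 8192 + 1808 == 10000

   while deltas of 4096 * 4096 or more are first materialized in the
   scratch register via aarch64_build_constant and then added in one
   go.  */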
3346 /* Output code to add DELTA to the first argument, and then jump
3347 to FUNCTION. Used for C++ multiple inheritance. */
3348 static void
3349 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3350 HOST_WIDE_INT delta,
3351 HOST_WIDE_INT vcall_offset,
3352 tree function)
3354 /* The this pointer is always in x0. Note that this differs from
3355 Arm where the this pointer may be bumped to r1 if r0 is required
3356 to return a pointer to an aggregate. On AArch64 a result value
3357 pointer will be in x8. */
3358 int this_regno = R0_REGNUM;
3359 rtx this_rtx, temp0, temp1, addr, funexp;
3360 rtx_insn *insn;
3362 reload_completed = 1;
3363 emit_note (NOTE_INSN_PROLOGUE_END);
3365 if (vcall_offset == 0)
3366 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3367 else
3369 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3371 this_rtx = gen_rtx_REG (Pmode, this_regno);
3372 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3373 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3375 addr = this_rtx;
3376 if (delta != 0)
3378 if (delta >= -256 && delta < 256)
3379 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3380 plus_constant (Pmode, this_rtx, delta));
3381 else
3382 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3385 if (Pmode == ptr_mode)
3386 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3387 else
3388 aarch64_emit_move (temp0,
3389 gen_rtx_ZERO_EXTEND (Pmode,
3390 gen_rtx_MEM (ptr_mode, addr)));
3392 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3393 addr = plus_constant (Pmode, temp0, vcall_offset);
3394 else
3396 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3397 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3400 if (Pmode == ptr_mode)
3401 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3402 else
3403 aarch64_emit_move (temp1,
3404 gen_rtx_SIGN_EXTEND (Pmode,
3405 gen_rtx_MEM (ptr_mode, addr)));
3407 emit_insn (gen_add2_insn (this_rtx, temp1));
3410 /* Generate a tail call to the target function. */
3411 if (!TREE_USED (function))
3413 assemble_external (function);
3414 TREE_USED (function) = 1;
3416 funexp = XEXP (DECL_RTL (function), 0);
3417 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3418 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3419 SIBLING_CALL_P (insn) = 1;
3421 insn = get_insns ();
3422 shorten_branches (insn);
3423 final_start_function (insn, file, 1);
3424 final (insn, file, 1);
3425 final_end_function ();
3427 /* Stop pretending to be a post-reload pass. */
3428 reload_completed = 0;
3431 static bool
3432 aarch64_tls_referenced_p (rtx x)
3434 if (!TARGET_HAVE_TLS)
3435 return false;
3436 subrtx_iterator::array_type array;
3437 FOR_EACH_SUBRTX (iter, array, x, ALL)
3439 const_rtx x = *iter;
3440 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3441 return true;
3442 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3443 TLS offsets, not real symbol references. */
3444 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3445 iter.skip_subrtxes ();
3447 return false;
3451 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3452 a left shift of 0 or 12 bits. */
3453 bool
3454 aarch64_uimm12_shift (HOST_WIDE_INT val)
3456 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3457 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
3462 /* Return true if val is an immediate that can be loaded into a
3463 register by a MOVZ instruction. */
3464 static bool
3465 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3467 if (GET_MODE_SIZE (mode) > 4)
3469 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3470 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3471 return 1;
3473 else
3475 /* Ignore sign extension. */
3476 val &= (HOST_WIDE_INT) 0xffffffff;
3478 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3479 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
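/* For example, 0x12340000 is accepted in SImode (one 16-bit chunk at a
   shift of 16, i.e. roughly "movz w0, #0x1234, lsl #16"), whereas
   0x12345678 spans two chunks and is rejected.  */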
3482 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3484 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3486 0x0000000100000001ull,
3487 0x0001000100010001ull,
3488 0x0101010101010101ull,
3489 0x1111111111111111ull,
3490 0x5555555555555555ull,
3494 /* Return true if val is a valid bitmask immediate. */
3496 bool
3497 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3499 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3500 int bits;
3502 /* Check for a single sequence of one bits and return quickly if so.
3503 The special cases of all ones and all zeroes return false. */
3504 val = (unsigned HOST_WIDE_INT) val_in;
3505 tmp = val + (val & -val);
3507 if (tmp == (tmp & -tmp))
3508 return (val + 1) > 1;
3510 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3511 if (mode == SImode)
3512 val = (val << 32) | (val & 0xffffffff);
3514 /* Invert if the immediate doesn't start with a zero bit - this means we
3515 only need to search for sequences of one bits. */
3516 if (val & 1)
3517 val = ~val;
3519 /* Find the first set bit and set tmp to val with the first sequence of one
3520 bits removed. Return success if there is a single sequence of ones. */
3521 first_one = val & -val;
3522 tmp = val & (val + first_one);
3524 if (tmp == 0)
3525 return true;
3527 /* Find the next set bit and compute the difference in bit position. */
3528 next_one = tmp & -tmp;
3529 bits = clz_hwi (first_one) - clz_hwi (next_one);
3530 mask = val ^ tmp;
3532 /* Check the bit position difference is a power of 2, and that the first
3533 sequence of one bits fits within 'bits' bits. */
3534 if ((mask >> bits) != 0 || bits != (bits & -bits))
3535 return false;
3537 /* Check the sequence of one bits is repeated 64/bits times. */
3538 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
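/* A few sanity checks of the algorithm above:

     0x00000000000000f0 is a single run of ones, so the quick test
       accepts it: val + (val & -val) == 0x100, a power of two.

     0x5555555555555555 starts with a one bit and is inverted to
       0xaaaaaaaaaaaaaaaa; the detected element width is 2 bits and
       the replication check 2 * 0x5555555555555555 == 0xaaaaaaaaaaaaaaaa
       succeeds, so the pattern repeats 32 times and it is accepted.

     0x0000000012345678 contains several runs of ones that do not
       repeat, so it is rejected.  */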
3542 /* Return true if val is an immediate that can be loaded into a
3543 register in a single instruction. */
3544 bool
3545 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3547 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3548 return 1;
3549 return aarch64_bitmask_imm (val, mode);
3552 static bool
3553 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3555 rtx base, offset;
3557 if (GET_CODE (x) == HIGH)
3558 return true;
3560 split_const (x, &base, &offset);
3561 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3563 if (aarch64_classify_symbol (base, offset)
3564 != SYMBOL_FORCE_TO_MEM)
3565 return true;
3566 else
3567 /* Avoid generating a 64-bit relocation in ILP32; leave
3568 to aarch64_expand_mov_immediate to handle it properly. */
3569 return mode != ptr_mode;
3572 return aarch64_tls_referenced_p (x);
3575 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3576 The expansion for a table switch is quite expensive due to the number
3577 of instructions, the table lookup and the hard-to-predict indirect jump.
3578 When optimizing for speed, and -O3 enabled, use the per-core tuning if
3579 set, otherwise use tables for > 16 cases as a tradeoff between size and
3580 performance. When optimizing for size, use the default setting. */
3582 static unsigned int
3583 aarch64_case_values_threshold (void)
3585 /* Use the specified limit for the number of cases before using jump
3586 tables at higher optimization levels. */
3587 if (optimize > 2
3588 && selected_cpu->tune->max_case_values != 0)
3589 return selected_cpu->tune->max_case_values;
3590 else
3591 return optimize_size ? default_case_values_threshold () : 17;
3594 /* Return true if register REGNO is a valid index register.
3595 STRICT_P is true if REG_OK_STRICT is in effect. */
3597 bool
3598 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3600 if (!HARD_REGISTER_NUM_P (regno))
3602 if (!strict_p)
3603 return true;
3605 if (!reg_renumber)
3606 return false;
3608 regno = reg_renumber[regno];
3610 return GP_REGNUM_P (regno);
3613 /* Return true if register REGNO is a valid base register for mode MODE.
3614 STRICT_P is true if REG_OK_STRICT is in effect. */
3616 bool
3617 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3619 if (!HARD_REGISTER_NUM_P (regno))
3621 if (!strict_p)
3622 return true;
3624 if (!reg_renumber)
3625 return false;
3627 regno = reg_renumber[regno];
3630 /* The fake registers will be eliminated to either the stack or
3631 hard frame pointer, both of which are usually valid base registers.
3632 Reload deals with the cases where the eliminated form isn't valid. */
3633 return (GP_REGNUM_P (regno)
3634 || regno == SP_REGNUM
3635 || regno == FRAME_POINTER_REGNUM
3636 || regno == ARG_POINTER_REGNUM);
3639 /* Return true if X is a valid base register for mode MODE.
3640 STRICT_P is true if REG_OK_STRICT is in effect. */
3642 static bool
3643 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3645 if (!strict_p && GET_CODE (x) == SUBREG)
3646 x = SUBREG_REG (x);
3648 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3651 /* Return true if address offset is a valid index. If it is, fill in INFO
3652 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3654 static bool
3655 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3656 machine_mode mode, bool strict_p)
3658 enum aarch64_address_type type;
3659 rtx index;
3660 int shift;
3662 /* (reg:P) */
3663 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3664 && GET_MODE (x) == Pmode)
3666 type = ADDRESS_REG_REG;
3667 index = x;
3668 shift = 0;
3670 /* (sign_extend:DI (reg:SI)) */
3671 else if ((GET_CODE (x) == SIGN_EXTEND
3672 || GET_CODE (x) == ZERO_EXTEND)
3673 && GET_MODE (x) == DImode
3674 && GET_MODE (XEXP (x, 0)) == SImode)
3676 type = (GET_CODE (x) == SIGN_EXTEND)
3677 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3678 index = XEXP (x, 0);
3679 shift = 0;
3681 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3682 else if (GET_CODE (x) == MULT
3683 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3684 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3685 && GET_MODE (XEXP (x, 0)) == DImode
3686 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3687 && CONST_INT_P (XEXP (x, 1)))
3689 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3690 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3691 index = XEXP (XEXP (x, 0), 0);
3692 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3694 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3695 else if (GET_CODE (x) == ASHIFT
3696 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3697 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3698 && GET_MODE (XEXP (x, 0)) == DImode
3699 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3700 && CONST_INT_P (XEXP (x, 1)))
3702 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3703 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3704 index = XEXP (XEXP (x, 0), 0);
3705 shift = INTVAL (XEXP (x, 1));
3707 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3708 else if ((GET_CODE (x) == SIGN_EXTRACT
3709 || GET_CODE (x) == ZERO_EXTRACT)
3710 && GET_MODE (x) == DImode
3711 && GET_CODE (XEXP (x, 0)) == MULT
3712 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3713 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3715 type = (GET_CODE (x) == SIGN_EXTRACT)
3716 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3717 index = XEXP (XEXP (x, 0), 0);
3718 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3719 if (INTVAL (XEXP (x, 1)) != 32 + shift
3720 || INTVAL (XEXP (x, 2)) != 0)
3721 shift = -1;
3723 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3724 (const_int 0xffffffff<<shift)) */
3725 else if (GET_CODE (x) == AND
3726 && GET_MODE (x) == DImode
3727 && GET_CODE (XEXP (x, 0)) == MULT
3728 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3729 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3730 && CONST_INT_P (XEXP (x, 1)))
3732 type = ADDRESS_REG_UXTW;
3733 index = XEXP (XEXP (x, 0), 0);
3734 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3735 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3736 shift = -1;
3738 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3739 else if ((GET_CODE (x) == SIGN_EXTRACT
3740 || GET_CODE (x) == ZERO_EXTRACT)
3741 && GET_MODE (x) == DImode
3742 && GET_CODE (XEXP (x, 0)) == ASHIFT
3743 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3744 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3746 type = (GET_CODE (x) == SIGN_EXTRACT)
3747 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3748 index = XEXP (XEXP (x, 0), 0);
3749 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3750 if (INTVAL (XEXP (x, 1)) != 32 + shift
3751 || INTVAL (XEXP (x, 2)) != 0)
3752 shift = -1;
3754 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3755 (const_int 0xffffffff<<shift)) */
3756 else if (GET_CODE (x) == AND
3757 && GET_MODE (x) == DImode
3758 && GET_CODE (XEXP (x, 0)) == ASHIFT
3759 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3760 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3761 && CONST_INT_P (XEXP (x, 1)))
3763 type = ADDRESS_REG_UXTW;
3764 index = XEXP (XEXP (x, 0), 0);
3765 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3766 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3767 shift = -1;
3769 /* (mult:P (reg:P) (const_int scale)) */
3770 else if (GET_CODE (x) == MULT
3771 && GET_MODE (x) == Pmode
3772 && GET_MODE (XEXP (x, 0)) == Pmode
3773 && CONST_INT_P (XEXP (x, 1)))
3775 type = ADDRESS_REG_REG;
3776 index = XEXP (x, 0);
3777 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3779 /* (ashift:P (reg:P) (const_int shift)) */
3780 else if (GET_CODE (x) == ASHIFT
3781 && GET_MODE (x) == Pmode
3782 && GET_MODE (XEXP (x, 0)) == Pmode
3783 && CONST_INT_P (XEXP (x, 1)))
3785 type = ADDRESS_REG_REG;
3786 index = XEXP (x, 0);
3787 shift = INTVAL (XEXP (x, 1));
3789 else
3790 return false;
3792 if (GET_CODE (index) == SUBREG)
3793 index = SUBREG_REG (index);
3795 if ((shift == 0 ||
3796 (shift > 0 && shift <= 3
3797 && (1 << shift) == GET_MODE_SIZE (mode)))
3798 && REG_P (index)
3799 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3801 info->type = type;
3802 info->offset = index;
3803 info->shift = shift;
3804 return true;
3807 return false;
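/* The index forms accepted above correspond to addresses such as

     ldr	x0, [x1, x2]		(shift == 0)
     ldr	x0, [x1, x2, lsl #3]	(shift == 3 for an 8-byte access)
     ldr	w0, [x1, w2, sxtw #2]	(sign-extended 32-bit index)

   where a non-zero shift must match the access size exactly.  */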
3810 bool
3811 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3813 return (offset >= -64 * GET_MODE_SIZE (mode)
3814 && offset < 64 * GET_MODE_SIZE (mode)
3815 && offset % GET_MODE_SIZE (mode) == 0);
3818 static inline bool
3819 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3820 HOST_WIDE_INT offset)
3822 return offset >= -256 && offset < 256;
3825 static inline bool
3826 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3828 return (offset >= 0
3829 && offset < 4096 * GET_MODE_SIZE (mode)
3830 && offset % GET_MODE_SIZE (mode) == 0);
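/* For a DImode (8-byte) access the three predicates above accept,
   respectively:

     7-bit signed, scaled	-512 .. 504 in steps of 8	(LDP/STP)
     9-bit signed, unscaled	-256 .. 255			(LDUR/STUR)
     12-bit unsigned, scaled	0 .. 32760 in steps of 8	(LDR/STR)  */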
3833 /* Return true if MODE is one of the modes for which we
3834 support LDP/STP operations. */
3836 static bool
3837 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3839 return mode == SImode || mode == DImode
3840 || mode == SFmode || mode == DFmode
3841 || (aarch64_vector_mode_supported_p (mode)
3842 && GET_MODE_SIZE (mode) == 8);
3845 /* Return true if REGNO is a virtual pointer register, or an eliminable
3846 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3847 include stack_pointer or hard_frame_pointer. */
3848 static bool
3849 virt_or_elim_regno_p (unsigned regno)
3851 return ((regno >= FIRST_VIRTUAL_REGISTER
3852 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3853 || regno == FRAME_POINTER_REGNUM
3854 || regno == ARG_POINTER_REGNUM);
3857 /* Return true if X is a valid address for machine mode MODE. If it is,
3858 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3859 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3861 static bool
3862 aarch64_classify_address (struct aarch64_address_info *info,
3863 rtx x, machine_mode mode,
3864 RTX_CODE outer_code, bool strict_p)
3866 enum rtx_code code = GET_CODE (x);
3867 rtx op0, op1;
3869 /* On BE, we use load/store pair for all large int mode load/stores. */
3870 bool load_store_pair_p = (outer_code == PARALLEL
3871 || (BYTES_BIG_ENDIAN
3872 && aarch64_vect_struct_mode_p (mode)));
3874 bool allow_reg_index_p =
3875 !load_store_pair_p
3876 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3877 && !aarch64_vect_struct_mode_p (mode);
3879 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3880 REG addressing. */
3881 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3882 && (code != POST_INC && code != REG))
3883 return false;
3885 switch (code)
3887 case REG:
3888 case SUBREG:
3889 info->type = ADDRESS_REG_IMM;
3890 info->base = x;
3891 info->offset = const0_rtx;
3892 return aarch64_base_register_rtx_p (x, strict_p);
3894 case PLUS:
3895 op0 = XEXP (x, 0);
3896 op1 = XEXP (x, 1);
3898 if (! strict_p
3899 && REG_P (op0)
3900 && virt_or_elim_regno_p (REGNO (op0))
3901 && CONST_INT_P (op1))
3903 info->type = ADDRESS_REG_IMM;
3904 info->base = op0;
3905 info->offset = op1;
3907 return true;
3910 if (GET_MODE_SIZE (mode) != 0
3911 && CONST_INT_P (op1)
3912 && aarch64_base_register_rtx_p (op0, strict_p))
3914 HOST_WIDE_INT offset = INTVAL (op1);
3916 info->type = ADDRESS_REG_IMM;
3917 info->base = op0;
3918 info->offset = op1;
3920 /* TImode and TFmode values are allowed in both pairs of X
3921 registers and individual Q registers. The available
3922 address modes are:
3923 X,X: 7-bit signed scaled offset
3924 Q: 9-bit signed offset
3925 We conservatively require an offset representable in either mode. */
3927 if (mode == TImode || mode == TFmode)
3928 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3929 && offset_9bit_signed_unscaled_p (mode, offset));
3931 /* A 7-bit offset check because OImode will emit an ldp/stp
3932 instruction (only big endian will get here).
3933 For ldp/stp instructions, the offset is scaled for the size of a
3934 single element of the pair. */
3935 if (mode == OImode)
3936 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3938 /* Three 9/12-bit offset checks because CImode will emit three
3939 ldr/str instructions (only big endian will get here). */
3940 if (mode == CImode)
3941 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3942 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3943 || offset_12bit_unsigned_scaled_p (V16QImode,
3944 offset + 32)));
3946 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3947 instructions (only big endian will get here). */
3948 if (mode == XImode)
3949 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3950 && aarch64_offset_7bit_signed_scaled_p (TImode,
3951 offset + 32));
3953 if (load_store_pair_p)
3954 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3955 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3956 else
3957 return (offset_9bit_signed_unscaled_p (mode, offset)
3958 || offset_12bit_unsigned_scaled_p (mode, offset));
3961 if (allow_reg_index_p)
3963 /* Look for base + (scaled/extended) index register. */
3964 if (aarch64_base_register_rtx_p (op0, strict_p)
3965 && aarch64_classify_index (info, op1, mode, strict_p))
3967 info->base = op0;
3968 return true;
3970 if (aarch64_base_register_rtx_p (op1, strict_p)
3971 && aarch64_classify_index (info, op0, mode, strict_p))
3973 info->base = op1;
3974 return true;
3978 return false;
3980 case POST_INC:
3981 case POST_DEC:
3982 case PRE_INC:
3983 case PRE_DEC:
3984 info->type = ADDRESS_REG_WB;
3985 info->base = XEXP (x, 0);
3986 info->offset = NULL_RTX;
3987 return aarch64_base_register_rtx_p (info->base, strict_p);
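  /* For example, a DImode POST_INC access ends up printed roughly as
     "ldr x0, [x1], 8" and a PRE_DEC access as "ldr x0, [x1, -8]!"; the
     offset is implied by the mode size, so only the base register needs
     validating here.  */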
3989 case POST_MODIFY:
3990 case PRE_MODIFY:
3991 info->type = ADDRESS_REG_WB;
3992 info->base = XEXP (x, 0);
3993 if (GET_CODE (XEXP (x, 1)) == PLUS
3994 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3995 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3996 && aarch64_base_register_rtx_p (info->base, strict_p))
3998 HOST_WIDE_INT offset;
3999 info->offset = XEXP (XEXP (x, 1), 1);
4000 offset = INTVAL (info->offset);
4002 /* TImode and TFmode values are allowed in both pairs of X
4003 registers and individual Q registers. The available
4004 address modes are:
4005 X,X: 7-bit signed scaled offset
4006 Q: 9-bit signed offset
4007 We conservatively require an offset representable in both modes.
4009 if (mode == TImode || mode == TFmode)
4010 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4011 && offset_9bit_signed_unscaled_p (mode, offset));
4013 if (load_store_pair_p)
4014 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4015 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4016 else
4017 return offset_9bit_signed_unscaled_p (mode, offset);
4019 return false;
4021 case CONST:
4022 case SYMBOL_REF:
4023 case LABEL_REF:
4024 /* load literal: pc-relative constant pool entry. Only supported
4025 for SI mode or larger. */
4026 info->type = ADDRESS_SYMBOLIC;
4028 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4030 rtx sym, addend;
4032 split_const (x, &sym, &addend);
4033 return ((GET_CODE (sym) == LABEL_REF
4034 || (GET_CODE (sym) == SYMBOL_REF
4035 && CONSTANT_POOL_ADDRESS_P (sym)
4036 && !aarch64_nopcrelative_literal_loads)));
4038 return false;
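  /* A LO_SUM address typically comes from the small code model sequence,
     e.g. "adrp x0, sym" followed by "ldr w1, [x0, #:lo12:sym]", so the
     low 12 bits must respect the alignment of the access, which is what
     the checks below enforce.  */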
4040 case LO_SUM:
4041 info->type = ADDRESS_LO_SUM;
4042 info->base = XEXP (x, 0);
4043 info->offset = XEXP (x, 1);
4044 if (allow_reg_index_p
4045 && aarch64_base_register_rtx_p (info->base, strict_p))
4047 rtx sym, offs;
4048 split_const (info->offset, &sym, &offs);
4049 if (GET_CODE (sym) == SYMBOL_REF
4050 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4052 /* The symbol and offset must be aligned to the access size. */
4053 unsigned int align;
4054 unsigned int ref_size;
4056 if (CONSTANT_POOL_ADDRESS_P (sym))
4057 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4058 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4060 tree exp = SYMBOL_REF_DECL (sym);
4061 align = TYPE_ALIGN (TREE_TYPE (exp));
4062 align = CONSTANT_ALIGNMENT (exp, align);
4064 else if (SYMBOL_REF_DECL (sym))
4065 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4066 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4067 && SYMBOL_REF_BLOCK (sym) != NULL)
4068 align = SYMBOL_REF_BLOCK (sym)->alignment;
4069 else
4070 align = BITS_PER_UNIT;
4072 ref_size = GET_MODE_SIZE (mode);
4073 if (ref_size == 0)
4074 ref_size = GET_MODE_SIZE (DImode);
4076 return ((INTVAL (offs) & (ref_size - 1)) == 0
4077 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4080 return false;
4082 default:
4083 return false;
4087 bool
4088 aarch64_symbolic_address_p (rtx x)
4090 rtx offset;
4092 split_const (x, &x, &offset);
4093 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4096 /* Classify the base of symbolic expression X. */
4098 enum aarch64_symbol_type
4099 aarch64_classify_symbolic_expression (rtx x)
4101 rtx offset;
4103 split_const (x, &x, &offset);
4104 return aarch64_classify_symbol (x, offset);
4108 /* Return TRUE if X is a legitimate address for accessing memory in
4109 mode MODE. */
4110 static bool
4111 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4113 struct aarch64_address_info addr;
4115 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4118 /* Return TRUE if X is a legitimate address for accessing memory in
4119 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4120 pair operation. */
4121 bool
4122 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4123 RTX_CODE outer_code, bool strict_p)
4125 struct aarch64_address_info addr;
4127 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4130 /* Return TRUE if rtx X is immediate constant 0.0 */
4131 bool
4132 aarch64_float_const_zero_rtx_p (rtx x)
4134 if (GET_MODE (x) == VOIDmode)
4135 return false;
4137 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4138 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4139 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4142 /* Return the fixed registers used for condition codes. */
4144 static bool
4145 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4147 *p1 = CC_REGNUM;
4148 *p2 = INVALID_REGNUM;
4149 return true;
4152 /* Emit call insn with PAT and do aarch64-specific handling. */
4154 void
4155 aarch64_emit_call_insn (rtx pat)
4157 rtx insn = emit_call_insn (pat);
4159 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4160 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4161 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4164 machine_mode
4165 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4167 /* All floating point compares return CCFP if it is an equality
4168 comparison, and CCFPE otherwise. */
4169 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4171 switch (code)
4173 case EQ:
4174 case NE:
4175 case UNORDERED:
4176 case ORDERED:
4177 case UNLT:
4178 case UNLE:
4179 case UNGT:
4180 case UNGE:
4181 case UNEQ:
4182 case LTGT:
4183 return CCFPmode;
4185 case LT:
4186 case LE:
4187 case GT:
4188 case GE:
4189 return CCFPEmode;
4191 default:
4192 gcc_unreachable ();
4196 /* Equality comparisons of short modes against zero can be performed
4197 using the TST instruction with the appropriate bitmask. */
4198 if (y == const0_rtx && REG_P (x)
4199 && (code == EQ || code == NE)
4200 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4201 return CC_NZmode;
4203 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4204 && y == const0_rtx
4205 && (code == EQ || code == NE || code == LT || code == GE)
4206 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4207 || GET_CODE (x) == NEG
4208 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4209 && CONST_INT_P (XEXP (x, 2)))))
4210 return CC_NZmode;
4212 /* A compare with a shifted operand. Because of canonicalization,
4213 the comparison will have to be swapped when we emit the assembly
4214 code. */
4215 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4216 && (REG_P (y) || GET_CODE (y) == SUBREG)
4217 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4218 || GET_CODE (x) == LSHIFTRT
4219 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4220 return CC_SWPmode;
4222 /* Similarly for a negated operand, but we can only do this for
4223 equalities. */
4224 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4225 && (REG_P (y) || GET_CODE (y) == SUBREG)
4226 && (code == EQ || code == NE)
4227 && GET_CODE (x) == NEG)
4228 return CC_Zmode;
4230 /* A test for unsigned overflow. */
4231 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4232 && code == NE
4233 && GET_CODE (x) == PLUS
4234 && GET_CODE (y) == ZERO_EXTEND)
4235 return CC_Cmode;
4237 /* For everything else, return CCmode. */
4238 return CCmode;
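  /* As an illustration of CC_SWPmode: a comparison such as
     (compare (ashift x 2) y) has to be output with the operands swapped,
     e.g. "cmp y, x, lsl #2", so the condition itself must also be
     reversed; aarch64_get_condition_code_1 below performs that mapping.  */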
4241 static int
4242 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4245 aarch64_get_condition_code (rtx x)
4247 machine_mode mode = GET_MODE (XEXP (x, 0));
4248 enum rtx_code comp_code = GET_CODE (x);
4250 if (GET_MODE_CLASS (mode) != MODE_CC)
4251 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4252 return aarch64_get_condition_code_1 (mode, comp_code);
4255 static int
4256 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4258 switch (mode)
4260 case CCFPmode:
4261 case CCFPEmode:
4262 switch (comp_code)
4264 case GE: return AARCH64_GE;
4265 case GT: return AARCH64_GT;
4266 case LE: return AARCH64_LS;
4267 case LT: return AARCH64_MI;
4268 case NE: return AARCH64_NE;
4269 case EQ: return AARCH64_EQ;
4270 case ORDERED: return AARCH64_VC;
4271 case UNORDERED: return AARCH64_VS;
4272 case UNLT: return AARCH64_LT;
4273 case UNLE: return AARCH64_LE;
4274 case UNGT: return AARCH64_HI;
4275 case UNGE: return AARCH64_PL;
4276 default: return -1;
4278 break;
4280 case CCmode:
4281 switch (comp_code)
4283 case NE: return AARCH64_NE;
4284 case EQ: return AARCH64_EQ;
4285 case GE: return AARCH64_GE;
4286 case GT: return AARCH64_GT;
4287 case LE: return AARCH64_LE;
4288 case LT: return AARCH64_LT;
4289 case GEU: return AARCH64_CS;
4290 case GTU: return AARCH64_HI;
4291 case LEU: return AARCH64_LS;
4292 case LTU: return AARCH64_CC;
4293 default: return -1;
4295 break;
4297 case CC_SWPmode:
4298 switch (comp_code)
4300 case NE: return AARCH64_NE;
4301 case EQ: return AARCH64_EQ;
4302 case GE: return AARCH64_LE;
4303 case GT: return AARCH64_LT;
4304 case LE: return AARCH64_GE;
4305 case LT: return AARCH64_GT;
4306 case GEU: return AARCH64_LS;
4307 case GTU: return AARCH64_CC;
4308 case LEU: return AARCH64_CS;
4309 case LTU: return AARCH64_HI;
4310 default: return -1;
4312 break;
4314 case CC_NZmode:
4315 switch (comp_code)
4317 case NE: return AARCH64_NE;
4318 case EQ: return AARCH64_EQ;
4319 case GE: return AARCH64_PL;
4320 case LT: return AARCH64_MI;
4321 default: return -1;
4323 break;
4325 case CC_Zmode:
4326 switch (comp_code)
4328 case NE: return AARCH64_NE;
4329 case EQ: return AARCH64_EQ;
4330 default: return -1;
4332 break;
4334 case CC_Cmode:
4335 switch (comp_code)
4337 case NE: return AARCH64_CS;
4338 case EQ: return AARCH64_CC;
4339 default: return -1;
4341 break;
4343 default:
4344 return -1;
4345 break;
4348 return -1;
4351 bool
4352 aarch64_const_vec_all_same_in_range_p (rtx x,
4353 HOST_WIDE_INT minval,
4354 HOST_WIDE_INT maxval)
4356 HOST_WIDE_INT firstval;
4357 int count, i;
4359 if (GET_CODE (x) != CONST_VECTOR
4360 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4361 return false;
4363 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4364 if (firstval < minval || firstval > maxval)
4365 return false;
4367 count = CONST_VECTOR_NUNITS (x);
4368 for (i = 1; i < count; i++)
4369 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4370 return false;
4372 return true;
4375 bool
4376 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4378 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4382 /* N Z C V. */
4383 #define AARCH64_CC_V 1
4384 #define AARCH64_CC_C (1 << 1)
4385 #define AARCH64_CC_Z (1 << 2)
4386 #define AARCH64_CC_N (1 << 3)
4388 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4389 static const int aarch64_nzcv_codes[] =
4391 0, /* EQ, Z == 1. */
4392 AARCH64_CC_Z, /* NE, Z == 0. */
4393 0, /* CS, C == 1. */
4394 AARCH64_CC_C, /* CC, C == 0. */
4395 0, /* MI, N == 1. */
4396 AARCH64_CC_N, /* PL, N == 0. */
4397 0, /* VS, V == 1. */
4398 AARCH64_CC_V, /* VC, V == 0. */
4399 0, /* HI, C == 1 && Z == 0. */
4400 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4401 AARCH64_CC_V, /* GE, N == V. */
4402 0, /* LT, N != V. */
4403 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4404 0, /* LE, !(Z == 0 && N == V). */
4405 0, /* AL, Any. */
4406 0 /* NV, Any. */
4409 static void
4410 aarch64_print_operand (FILE *f, rtx x, int code)
4412 switch (code)
4414 /* An integer or symbol address without a preceding # sign. */
4415 case 'c':
4416 switch (GET_CODE (x))
4418 case CONST_INT:
4419 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4420 break;
4422 case SYMBOL_REF:
4423 output_addr_const (f, x);
4424 break;
4426 case CONST:
4427 if (GET_CODE (XEXP (x, 0)) == PLUS
4428 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4430 output_addr_const (f, x);
4431 break;
4433 /* Fall through. */
4435 default:
4436 output_operand_lossage ("Unsupported operand for code '%c'", code);
4438 break;
4440 case 'e':
4441 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4443 int n;
4445 if (!CONST_INT_P (x)
4446 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4448 output_operand_lossage ("invalid operand for '%%%c'", code);
4449 return;
4452 switch (n)
4454 case 3:
4455 fputc ('b', f);
4456 break;
4457 case 4:
4458 fputc ('h', f);
4459 break;
4460 case 5:
4461 fputc ('w', f);
4462 break;
4463 default:
4464 output_operand_lossage ("invalid operand for '%%%c'", code);
4465 return;
4468 break;
4470 case 'p':
4472 int n;
4474 /* Print N such that 2^N == X. */
4475 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4477 output_operand_lossage ("invalid operand for '%%%c'", code);
4478 return;
4481 asm_fprintf (f, "%d", n);
4483 break;
4485 case 'P':
4486 /* Print the number of non-zero bits in X (a const_int). */
4487 if (!CONST_INT_P (x))
4489 output_operand_lossage ("invalid operand for '%%%c'", code);
4490 return;
4493 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4494 break;
4496 case 'H':
4497 /* Print the higher numbered register of a pair (TImode) of regs. */
4498 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4500 output_operand_lossage ("invalid operand for '%%%c'", code);
4501 return;
4504 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4505 break;
4507 case 'M':
4508 case 'm':
4510 int cond_code;
4511 /* Print a condition (eq, ne, etc) or its inverse. */
4513 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4514 if (x == const_true_rtx)
4516 if (code == 'M')
4517 fputs ("nv", f);
4518 return;
4521 if (!COMPARISON_P (x))
4523 output_operand_lossage ("invalid operand for '%%%c'", code);
4524 return;
4527 cond_code = aarch64_get_condition_code (x);
4528 gcc_assert (cond_code >= 0);
4529 if (code == 'M')
4530 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4531 fputs (aarch64_condition_codes[cond_code], f);
4533 break;
4535 case 'b':
4536 case 'h':
4537 case 's':
4538 case 'd':
4539 case 'q':
4540 /* Print a scalar FP/SIMD register name. */
4541 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4543 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4544 return;
4546 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4547 break;
4549 case 'S':
4550 case 'T':
4551 case 'U':
4552 case 'V':
4553 /* Print the first FP/SIMD register name in a list. */
4554 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4556 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4557 return;
4559 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4560 break;
4562 case 'R':
4563 /* Print a scalar FP/SIMD register name + 1. */
4564 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4566 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4567 return;
4569 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4570 break;
4572 case 'X':
4573 /* Print bottom 16 bits of integer constant in hex. */
4574 if (!CONST_INT_P (x))
4576 output_operand_lossage ("invalid operand for '%%%c'", code);
4577 return;
4579 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4580 break;
4582 case 'w':
4583 case 'x':
4584 /* Print a general register name or the zero register (32-bit or
4585 64-bit). */
4586 if (x == const0_rtx
4587 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4589 asm_fprintf (f, "%czr", code);
4590 break;
4593 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4595 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4596 break;
4599 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4601 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4602 break;
4605 /* Fall through */
4607 case 0:
4608 /* Print a normal operand. If it's a general register, then we
4609 assume DImode. */
4610 if (x == NULL)
4612 output_operand_lossage ("missing operand");
4613 return;
4616 switch (GET_CODE (x))
4618 case REG:
4619 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4620 break;
4622 case MEM:
4623 output_address (GET_MODE (x), XEXP (x, 0));
4624 break;
4626 case CONST:
4627 case LABEL_REF:
4628 case SYMBOL_REF:
4629 output_addr_const (asm_out_file, x);
4630 break;
4632 case CONST_INT:
4633 asm_fprintf (f, "%wd", INTVAL (x));
4634 break;
4636 case CONST_VECTOR:
4637 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4639 gcc_assert (
4640 aarch64_const_vec_all_same_in_range_p (x,
4641 HOST_WIDE_INT_MIN,
4642 HOST_WIDE_INT_MAX));
4643 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4645 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4647 fputc ('0', f);
4649 else
4650 gcc_unreachable ();
4651 break;
4653 case CONST_DOUBLE:
4654 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4655 be getting CONST_DOUBLEs holding integers. */
4656 gcc_assert (GET_MODE (x) != VOIDmode);
4657 if (aarch64_float_const_zero_rtx_p (x))
4659 fputc ('0', f);
4660 break;
4662 else if (aarch64_float_const_representable_p (x))
4664 #define buf_size 20
4665 char float_buf[buf_size] = {'\0'};
4666 real_to_decimal_for_mode (float_buf,
4667 CONST_DOUBLE_REAL_VALUE (x),
4668 buf_size, buf_size,
4669 1, GET_MODE (x));
4670 asm_fprintf (asm_out_file, "%s", float_buf);
4671 break;
4672 #undef buf_size
4674 output_operand_lossage ("invalid constant");
4675 return;
4676 default:
4677 output_operand_lossage ("invalid operand");
4678 return;
4680 break;
4682 case 'A':
4683 if (GET_CODE (x) == HIGH)
4684 x = XEXP (x, 0);
4686 switch (aarch64_classify_symbolic_expression (x))
4688 case SYMBOL_SMALL_GOT_4G:
4689 asm_fprintf (asm_out_file, ":got:");
4690 break;
4692 case SYMBOL_SMALL_TLSGD:
4693 asm_fprintf (asm_out_file, ":tlsgd:");
4694 break;
4696 case SYMBOL_SMALL_TLSDESC:
4697 asm_fprintf (asm_out_file, ":tlsdesc:");
4698 break;
4700 case SYMBOL_SMALL_TLSIE:
4701 asm_fprintf (asm_out_file, ":gottprel:");
4702 break;
4704 case SYMBOL_TLSLE24:
4705 asm_fprintf (asm_out_file, ":tprel:");
4706 break;
4708 case SYMBOL_TINY_GOT:
4709 gcc_unreachable ();
4710 break;
4712 default:
4713 break;
4715 output_addr_const (asm_out_file, x);
4716 break;
4718 case 'L':
4719 switch (aarch64_classify_symbolic_expression (x))
4721 case SYMBOL_SMALL_GOT_4G:
4722 asm_fprintf (asm_out_file, ":lo12:");
4723 break;
4725 case SYMBOL_SMALL_TLSGD:
4726 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4727 break;
4729 case SYMBOL_SMALL_TLSDESC:
4730 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4731 break;
4733 case SYMBOL_SMALL_TLSIE:
4734 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4735 break;
4737 case SYMBOL_TLSLE12:
4738 asm_fprintf (asm_out_file, ":tprel_lo12:");
4739 break;
4741 case SYMBOL_TLSLE24:
4742 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4743 break;
4745 case SYMBOL_TINY_GOT:
4746 asm_fprintf (asm_out_file, ":got:");
4747 break;
4749 case SYMBOL_TINY_TLSIE:
4750 asm_fprintf (asm_out_file, ":gottprel:");
4751 break;
4753 default:
4754 break;
4756 output_addr_const (asm_out_file, x);
4757 break;
4759 case 'G':
4761 switch (aarch64_classify_symbolic_expression (x))
4763 case SYMBOL_TLSLE24:
4764 asm_fprintf (asm_out_file, ":tprel_hi12:");
4765 break;
4766 default:
4767 break;
4769 output_addr_const (asm_out_file, x);
4770 break;
4772 case 'k':
4774 HOST_WIDE_INT cond_code;
4775 /* Print nzcv. */
4777 if (!CONST_INT_P (x))
4779 output_operand_lossage ("invalid operand for '%%%c'", code);
4780 return;
4783 cond_code = INTVAL (x);
4784 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4785 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4787 break;
4789 default:
4790 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4791 return;
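 /* For reference, these modifiers are used from the output templates in
    aarch64.md; for example "add\t%w0, %w1, %w2" prints the 32-bit register
    names, while "%x" selects the 64-bit names and plain "%0" falls through
    to the default case above.  */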
4795 static void
4796 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4798 struct aarch64_address_info addr;
4800 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4801 switch (addr.type)
4803 case ADDRESS_REG_IMM:
4804 if (addr.offset == const0_rtx)
4805 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4806 else
4807 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4808 INTVAL (addr.offset));
4809 return;
4811 case ADDRESS_REG_REG:
4812 if (addr.shift == 0)
4813 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4814 reg_names [REGNO (addr.offset)]);
4815 else
4816 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4817 reg_names [REGNO (addr.offset)], addr.shift);
4818 return;
4820 case ADDRESS_REG_UXTW:
4821 if (addr.shift == 0)
4822 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4823 REGNO (addr.offset) - R0_REGNUM);
4824 else
4825 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4826 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4827 return;
4829 case ADDRESS_REG_SXTW:
4830 if (addr.shift == 0)
4831 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4832 REGNO (addr.offset) - R0_REGNUM);
4833 else
4834 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4835 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4836 return;
4838 case ADDRESS_REG_WB:
4839 switch (GET_CODE (x))
4841 case PRE_INC:
4842 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4843 GET_MODE_SIZE (mode));
4844 return;
4845 case POST_INC:
4846 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4847 GET_MODE_SIZE (mode));
4848 return;
4849 case PRE_DEC:
4850 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4851 GET_MODE_SIZE (mode));
4852 return;
4853 case POST_DEC:
4854 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4855 GET_MODE_SIZE (mode));
4856 return;
4857 case PRE_MODIFY:
4858 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4859 INTVAL (addr.offset));
4860 return;
4861 case POST_MODIFY:
4862 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4863 INTVAL (addr.offset));
4864 return;
4865 default:
4866 break;
4868 break;
4870 case ADDRESS_LO_SUM:
4871 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4872 output_addr_const (f, addr.offset);
4873 asm_fprintf (f, "]");
4874 return;
4876 case ADDRESS_SYMBOLIC:
4877 break;
4880 output_addr_const (f, x);
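 /* The resulting syntax is, for example, "[x0]" for a plain register,
    "[x0, 16]" for register plus immediate, "[x0, x1, lsl 3]" for a scaled
    index, "[x0, w1, sxtw 2]" for an extended index and "[x0, #:lo12:sym]"
    for the LO_SUM form handled above.  */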
4883 bool
4884 aarch64_label_mentioned_p (rtx x)
4886 const char *fmt;
4887 int i;
4889 if (GET_CODE (x) == LABEL_REF)
4890 return true;
4892 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4893 referencing instruction, but they are constant offsets, not
4894 symbols. */
4895 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4896 return false;
4898 fmt = GET_RTX_FORMAT (GET_CODE (x));
4899 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4901 if (fmt[i] == 'E')
4903 int j;
4905 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4906 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4907 return 1;
4909 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4910 return 1;
4913 return 0;
4916 /* Implement REGNO_REG_CLASS. */
4918 enum reg_class
4919 aarch64_regno_regclass (unsigned regno)
4921 if (GP_REGNUM_P (regno))
4922 return GENERAL_REGS;
4924 if (regno == SP_REGNUM)
4925 return STACK_REG;
4927 if (regno == FRAME_POINTER_REGNUM
4928 || regno == ARG_POINTER_REGNUM)
4929 return POINTER_REGS;
4931 if (FP_REGNUM_P (regno))
4932 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4934 return NO_REGS;
4937 static rtx
4938 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4940 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4941 where mask is selected by alignment and size of the offset.
4942 We try to pick as large a range for the offset as possible to
4943 maximize the chance of a CSE. However, for aligned addresses
4944 we limit the range to 4k so that structures with different sized
4945 elements are likely to use the same base. We need to be careful
4946 not to split a CONST for some forms of address expression, otherwise
4947 it will generate sub-optimal code. */
4949 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4951 rtx base = XEXP (x, 0);
4952 rtx offset_rtx = XEXP (x, 1);
4953 HOST_WIDE_INT offset = INTVAL (offset_rtx);
4955 if (GET_CODE (base) == PLUS)
4957 rtx op0 = XEXP (base, 0);
4958 rtx op1 = XEXP (base, 1);
4960 /* Force any scaling into a temp for CSE. */
4961 op0 = force_reg (Pmode, op0);
4962 op1 = force_reg (Pmode, op1);
4964 /* Let the pointer register be in op0. */
4965 if (REG_POINTER (op1))
4966 std::swap (op0, op1);
4968 /* If the pointer is virtual or frame related, then we know that
4969 virtual register instantiation or register elimination is going
4970 to apply a second constant. We want the two constants folded
4971 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
4972 if (virt_or_elim_regno_p (REGNO (op0)))
4974 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
4975 NULL_RTX, true, OPTAB_DIRECT);
4976 return gen_rtx_PLUS (Pmode, base, op1);
4979 /* Otherwise, in order to encourage CSE (and thence loop strength
4980 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
4981 base = expand_binop (Pmode, add_optab, op0, op1,
4982 NULL_RTX, true, OPTAB_DIRECT);
4983 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
4986 /* Does it look like we'll need a load/store-pair operation? */
4987 HOST_WIDE_INT base_offset;
4988 if (GET_MODE_SIZE (mode) > 16
4989 || mode == TImode)
4990 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4991 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4992 /* For offsets that aren't a multiple of the access size, the limit is
4993 -256...255. */
4994 else if (offset & (GET_MODE_SIZE (mode) - 1))
4995 base_offset = (offset + 0x100) & ~0x1ff;
4996 else
4997 base_offset = offset & ~0xfff;
4999 if (base_offset != 0)
5001 base = plus_constant (Pmode, base, base_offset);
5002 base = force_operand (base, NULL_RTX);
5003 return plus_constant (Pmode, base, offset - base_offset);
5007 return x;
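 /* A rough example of the split above: for a DImode access at X + 0x2008
    the offset is a multiple of the access size, so base_offset becomes
    0x2000 and the address is rebuilt as (X + 0x2000) + 8, letting several
    nearby accesses share the X + 0x2000 base.  */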
5010 /* Return the reload icode required for a constant pool in mode. */
5011 static enum insn_code
5012 aarch64_constant_pool_reload_icode (machine_mode mode)
5014 switch (mode)
5016 case SFmode:
5017 return CODE_FOR_aarch64_reload_movcpsfdi;
5019 case DFmode:
5020 return CODE_FOR_aarch64_reload_movcpdfdi;
5022 case TFmode:
5023 return CODE_FOR_aarch64_reload_movcptfdi;
5025 case V8QImode:
5026 return CODE_FOR_aarch64_reload_movcpv8qidi;
5028 case V16QImode:
5029 return CODE_FOR_aarch64_reload_movcpv16qidi;
5031 case V4HImode:
5032 return CODE_FOR_aarch64_reload_movcpv4hidi;
5034 case V8HImode:
5035 return CODE_FOR_aarch64_reload_movcpv8hidi;
5037 case V2SImode:
5038 return CODE_FOR_aarch64_reload_movcpv2sidi;
5040 case V4SImode:
5041 return CODE_FOR_aarch64_reload_movcpv4sidi;
5043 case V2DImode:
5044 return CODE_FOR_aarch64_reload_movcpv2didi;
5046 case V2DFmode:
5047 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5049 default:
5050 gcc_unreachable ();
5053 gcc_unreachable ();
5055 static reg_class_t
5056 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5057 reg_class_t rclass,
5058 machine_mode mode,
5059 secondary_reload_info *sri)
5062 /* If we have to disable direct literal pool loads and stores because the
5063 function is too big, then we need a scratch register. */
5064 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5065 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5066 || targetm.vector_mode_supported_p (GET_MODE (x)))
5067 && aarch64_nopcrelative_literal_loads)
5069 sri->icode = aarch64_constant_pool_reload_icode (mode);
5070 return NO_REGS;
5073 /* Without the TARGET_SIMD instructions we cannot move a Q register
5074 to a Q register directly. We need a scratch. */
5075 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5076 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5077 && reg_class_subset_p (rclass, FP_REGS))
5079 if (mode == TFmode)
5080 sri->icode = CODE_FOR_aarch64_reload_movtf;
5081 else if (mode == TImode)
5082 sri->icode = CODE_FOR_aarch64_reload_movti;
5083 return NO_REGS;
5086 /* A TFmode or TImode memory access should be handled via an FP_REGS
5087 because AArch64 has richer addressing modes for LDR/STR instructions
5088 than LDP/STP instructions. */
5089 if (TARGET_FLOAT && rclass == GENERAL_REGS
5090 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5091 return FP_REGS;
5093 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5094 return GENERAL_REGS;
5096 return NO_REGS;
5099 static bool
5100 aarch64_can_eliminate (const int from, const int to)
5102 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5103 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5105 if (frame_pointer_needed)
5107 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5108 return true;
5109 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5110 return false;
5111 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5112 && !cfun->calls_alloca)
5113 return true;
5114 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5115 return true;
5117 return false;
5119 else
5121 /* If we decided that we didn't need a leaf frame pointer but then used
5122 LR in the function, then we'll want a frame pointer after all, so
5123 prevent this elimination to ensure a frame pointer is used. */
5124 if (to == STACK_POINTER_REGNUM
5125 && flag_omit_leaf_frame_pointer
5126 && df_regs_ever_live_p (LR_REGNUM))
5127 return false;
5130 return true;
5133 HOST_WIDE_INT
5134 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5136 aarch64_layout_frame ();
5138 if (to == HARD_FRAME_POINTER_REGNUM)
5140 if (from == ARG_POINTER_REGNUM)
5141 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5143 if (from == FRAME_POINTER_REGNUM)
5144 return (cfun->machine->frame.hard_fp_offset
5145 - cfun->machine->frame.saved_varargs_size);
5148 if (to == STACK_POINTER_REGNUM)
5150 if (from == FRAME_POINTER_REGNUM)
5151 return (cfun->machine->frame.frame_size
5152 - cfun->machine->frame.saved_varargs_size);
5155 return cfun->machine->frame.frame_size;
5158 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5159 previous frame. */
5162 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5164 if (count != 0)
5165 return const0_rtx;
5166 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5170 static void
5171 aarch64_asm_trampoline_template (FILE *f)
5173 if (TARGET_ILP32)
5175 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5176 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5178 else
5180 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5181 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5183 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5184 assemble_aligned_integer (4, const0_rtx);
5185 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5186 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
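 /* The emitted trampoline is therefore roughly two literal loads into the
    IP1 and static-chain registers, a "br" through IP1, a 4-byte pad and
    two pointer-sized slots which aarch64_trampoline_init below fills with
    the target function address and the static chain value.  */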
5189 static void
5190 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5192 rtx fnaddr, mem, a_tramp;
5193 const int tramp_code_sz = 16;
5195 /* Don't need to copy the trailing D-words; we fill those in below. */
5196 emit_block_move (m_tramp, assemble_trampoline_template (),
5197 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5198 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5199 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5200 if (GET_MODE (fnaddr) != ptr_mode)
5201 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5202 emit_move_insn (mem, fnaddr);
5204 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5205 emit_move_insn (mem, chain_value);
5207 /* XXX We should really define a "clear_cache" pattern and use
5208 gen_clear_cache(). */
5209 a_tramp = XEXP (m_tramp, 0);
5210 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5211 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5212 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5213 ptr_mode);
5216 static unsigned char
5217 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5219 switch (regclass)
5221 case CALLER_SAVE_REGS:
5222 case POINTER_REGS:
5223 case GENERAL_REGS:
5224 case ALL_REGS:
5225 case FP_REGS:
5226 case FP_LO_REGS:
5227 return
5228 aarch64_vector_mode_p (mode)
5229 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5230 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5231 case STACK_REG:
5232 return 1;
5234 case NO_REGS:
5235 return 0;
5237 default:
5238 break;
5240 gcc_unreachable ();
5243 static reg_class_t
5244 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5246 if (regclass == POINTER_REGS)
5247 return GENERAL_REGS;
5249 if (regclass == STACK_REG)
5251 if (REG_P(x)
5252 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5253 return regclass;
5255 return NO_REGS;
5258 /* If it's an integer immediate that MOVI can't handle, then
5259 FP_REGS is not an option, so we return NO_REGS instead. */
5260 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5261 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5262 return NO_REGS;
5264 /* Register elimination can result in a request for
5265 SP+constant->FP_REGS. We cannot support such operations, which
5266 use SP as source and an FP_REG as destination, so reject such
5267 requests outright. */
5268 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5270 rtx lhs = XEXP (x, 0);
5272 /* Look through a possible SUBREG introduced by ILP32. */
5273 if (GET_CODE (lhs) == SUBREG)
5274 lhs = SUBREG_REG (lhs);
5276 gcc_assert (REG_P (lhs));
5277 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5278 POINTER_REGS));
5279 return NO_REGS;
5282 return regclass;
5285 void
5286 aarch64_asm_output_labelref (FILE* f, const char *name)
5288 asm_fprintf (f, "%U%s", name);
5291 static void
5292 aarch64_elf_asm_constructor (rtx symbol, int priority)
5294 if (priority == DEFAULT_INIT_PRIORITY)
5295 default_ctor_section_asm_out_constructor (symbol, priority);
5296 else
5298 section *s;
5299 char buf[18];
5300 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5301 s = get_section (buf, SECTION_WRITE, NULL);
5302 switch_to_section (s);
5303 assemble_align (POINTER_SIZE);
5304 assemble_aligned_integer (POINTER_BYTES, symbol);
5308 static void
5309 aarch64_elf_asm_destructor (rtx symbol, int priority)
5311 if (priority == DEFAULT_INIT_PRIORITY)
5312 default_dtor_section_asm_out_destructor (symbol, priority);
5313 else
5315 section *s;
5316 char buf[18];
5317 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5318 s = get_section (buf, SECTION_WRITE, NULL);
5319 switch_to_section (s);
5320 assemble_align (POINTER_SIZE);
5321 assemble_aligned_integer (POINTER_BYTES, symbol);
5325 const char*
5326 aarch64_output_casesi (rtx *operands)
5328 char buf[100];
5329 char label[100];
5330 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5331 int index;
5332 static const char *const patterns[4][2] =
5335 "ldrb\t%w3, [%0,%w1,uxtw]",
5336 "add\t%3, %4, %w3, sxtb #2"
5339 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5340 "add\t%3, %4, %w3, sxth #2"
5343 "ldr\t%w3, [%0,%w1,uxtw #2]",
5344 "add\t%3, %4, %w3, sxtw #2"
5346 /* We assume that DImode is only generated when not optimizing and
5347 that we don't really need 64-bit address offsets. That would
5348 imply an object file with 8GB of code in a single function! */
5350 "ldr\t%w3, [%0,%w1,uxtw #2]",
5351 "add\t%3, %4, %w3, sxtw #2"
5355 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5357 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5359 gcc_assert (index >= 0 && index <= 3);
5361 /* Need to implement table size reduction, by changing the code below. */
5362 output_asm_insn (patterns[index][0], operands);
5363 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5364 snprintf (buf, sizeof (buf),
5365 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5366 output_asm_insn (buf, operands);
5367 output_asm_insn (patterns[index][1], operands);
5368 output_asm_insn ("br\t%3", operands);
5369 assemble_label (asm_out_file, label);
5370 return "";
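 /* For a HImode dispatch table the sequence output here is along the lines
    of "ldrh w3, [x0, w1, uxtw #1]", "adr x4, Lrtx<N>",
    "add x3, x4, w3, sxth #2", "br x3", followed by the Lrtx<N> label
    itself, with <N> the internal label number (the exact label spelling is
    up to ASM_GENERATE_INTERNAL_LABEL).  */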
5374 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5375 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5376 operator. */
5379 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5381 if (shift >= 0 && shift <= 3)
5383 int size;
5384 for (size = 8; size <= 32; size *= 2)
5386 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5387 if (mask == bits << shift)
5388 return size;
5391 return 0;
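 /* E.g. aarch64_uxt_size (1, 0x1fe) is 8, since 0x1fe == 0xff << 1, which
    matches the UXTB form of an extended-register operand shifted by one;
    a mask that is not a shifted 0xff/0xffff/0xffffffff yields 0.  */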
5394 /* Constant pools are per-function only when PC-relative
5395 literal loads are enabled or we are in the large memory
5396 model. */
5398 static inline bool
5399 aarch64_can_use_per_function_literal_pools_p (void)
5401 return (!aarch64_nopcrelative_literal_loads
5402 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5405 static bool
5406 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5408 /* FIXME: In an ideal world this would work similarly
5409 to the logic in aarch64_select_rtx_section, but that
5410 breaks bootstrap in gccgo. For now we work around
5411 this by returning false here. */
5412 return false;
5415 /* Select appropriate section for constants depending
5416 on where we place literal pools. */
5418 static section *
5419 aarch64_select_rtx_section (machine_mode mode,
5420 rtx x,
5421 unsigned HOST_WIDE_INT align)
5423 if (aarch64_can_use_per_function_literal_pools_p ())
5424 return function_section (current_function_decl);
5426 return default_elf_select_rtx_section (mode, x, align);
5429 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5430 void
5431 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5432 HOST_WIDE_INT offset)
5434 /* When using per-function literal pools, we must ensure that any code
5435 section is aligned to the minimal instruction length, lest we get
5436 errors from the assembler about "unaligned instructions". */
5437 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5438 ASM_OUTPUT_ALIGN (f, 2);
5441 /* Costs. */
5443 /* Helper function for rtx cost calculation. Strip a shift expression
5444 from X. Returns the inner operand if successful, or the original
5445 expression on failure. */
5446 static rtx
5447 aarch64_strip_shift (rtx x)
5449 rtx op = x;
5451 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5452 we can convert both to ROR during final output. */
5453 if ((GET_CODE (op) == ASHIFT
5454 || GET_CODE (op) == ASHIFTRT
5455 || GET_CODE (op) == LSHIFTRT
5456 || GET_CODE (op) == ROTATERT
5457 || GET_CODE (op) == ROTATE)
5458 && CONST_INT_P (XEXP (op, 1)))
5459 return XEXP (op, 0);
5461 if (GET_CODE (op) == MULT
5462 && CONST_INT_P (XEXP (op, 1))
5463 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5464 return XEXP (op, 0);
5466 return x;
5469 /* Helper function for rtx cost calculation. Strip an extend
5470 expression from X. Returns the inner operand if successful, or the
5471 original expression on failure. We deal with a number of possible
5472 canonicalization variations here. */
5473 static rtx
5474 aarch64_strip_extend (rtx x)
5476 rtx op = x;
5478 /* Zero and sign extraction of a widened value. */
5479 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5480 && XEXP (op, 2) == const0_rtx
5481 && GET_CODE (XEXP (op, 0)) == MULT
5482 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5483 XEXP (op, 1)))
5484 return XEXP (XEXP (op, 0), 0);
5486 /* It can also be represented (for zero-extend) as an AND with an
5487 immediate. */
5488 if (GET_CODE (op) == AND
5489 && GET_CODE (XEXP (op, 0)) == MULT
5490 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5491 && CONST_INT_P (XEXP (op, 1))
5492 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5493 INTVAL (XEXP (op, 1))) != 0)
5494 return XEXP (XEXP (op, 0), 0);
5496 /* Now handle extended register, as this may also have an optional
5497 left shift by 1..4. */
5498 if (GET_CODE (op) == ASHIFT
5499 && CONST_INT_P (XEXP (op, 1))
5500 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5501 op = XEXP (op, 0);
5503 if (GET_CODE (op) == ZERO_EXTEND
5504 || GET_CODE (op) == SIGN_EXTEND)
5505 op = XEXP (op, 0);
5507 if (op != x)
5508 return op;
5510 return x;
5513 /* Return true iff CODE is a shift supported in combination
5514 with arithmetic instructions. */
5516 static bool
5517 aarch64_shift_p (enum rtx_code code)
5519 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5522 /* Helper function for rtx cost calculation. Calculate the cost of
5523 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5524 Return the calculated cost of the expression, recursing manually in to
5525 operands where needed. */
5527 static int
5528 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5530 rtx op0, op1;
5531 const struct cpu_cost_table *extra_cost
5532 = aarch64_tune_params.insn_extra_cost;
5533 int cost = 0;
5534 bool compound_p = (outer == PLUS || outer == MINUS);
5535 machine_mode mode = GET_MODE (x);
5537 gcc_checking_assert (code == MULT);
5539 op0 = XEXP (x, 0);
5540 op1 = XEXP (x, 1);
5542 if (VECTOR_MODE_P (mode))
5543 mode = GET_MODE_INNER (mode);
5545 /* Integer multiply/fma. */
5546 if (GET_MODE_CLASS (mode) == MODE_INT)
5548 /* The multiply will be canonicalized as a shift; cost it as such. */
5549 if (aarch64_shift_p (GET_CODE (x))
5550 || (CONST_INT_P (op1)
5551 && exact_log2 (INTVAL (op1)) > 0))
5553 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5554 || GET_CODE (op0) == SIGN_EXTEND;
5555 if (speed)
5557 if (compound_p)
5559 if (REG_P (op1))
5560 /* ARITH + shift-by-register. */
5561 cost += extra_cost->alu.arith_shift_reg;
5562 else if (is_extend)
5563 /* ARITH + extended register. We don't have a cost field
5564 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5565 cost += extra_cost->alu.extend_arith;
5566 else
5567 /* ARITH + shift-by-immediate. */
5568 cost += extra_cost->alu.arith_shift;
5570 else
5571 /* LSL (immediate). */
5572 cost += extra_cost->alu.shift;
5575 /* Strip extends as we will have costed them in the case above. */
5576 if (is_extend)
5577 op0 = aarch64_strip_extend (op0);
5579 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5581 return cost;
5584 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5585 compound and let the below cases handle it. After all, MNEG is a
5586 special-case alias of MSUB. */
5587 if (GET_CODE (op0) == NEG)
5589 op0 = XEXP (op0, 0);
5590 compound_p = true;
5593 /* Integer multiplies or FMAs have zero/sign extending variants. */
5594 if ((GET_CODE (op0) == ZERO_EXTEND
5595 && GET_CODE (op1) == ZERO_EXTEND)
5596 || (GET_CODE (op0) == SIGN_EXTEND
5597 && GET_CODE (op1) == SIGN_EXTEND))
5599 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5600 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5602 if (speed)
5604 if (compound_p)
5605 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5606 cost += extra_cost->mult[0].extend_add;
5607 else
5608 /* MUL/SMULL/UMULL. */
5609 cost += extra_cost->mult[0].extend;
5612 return cost;
5615 /* This is either an integer multiply or a MADD. In both cases
5616 we want to recurse and cost the operands. */
5617 cost += rtx_cost (op0, mode, MULT, 0, speed);
5618 cost += rtx_cost (op1, mode, MULT, 1, speed);
5620 if (speed)
5622 if (compound_p)
5623 /* MADD/MSUB. */
5624 cost += extra_cost->mult[mode == DImode].add;
5625 else
5626 /* MUL. */
5627 cost += extra_cost->mult[mode == DImode].simple;
5630 return cost;
5632 else
5634 if (speed)
5636 /* Floating-point FMA/FMUL can also support negations of the
5637 operands, unless the rounding mode is upward or downward in
5638 which case FNMUL is different from FMUL with operand negation. */
5639 bool neg0 = GET_CODE (op0) == NEG;
5640 bool neg1 = GET_CODE (op1) == NEG;
5641 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5643 if (neg0)
5644 op0 = XEXP (op0, 0);
5645 if (neg1)
5646 op1 = XEXP (op1, 0);
5649 if (compound_p)
5650 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5651 cost += extra_cost->fp[mode == DFmode].fma;
5652 else
5653 /* FMUL/FNMUL. */
5654 cost += extra_cost->fp[mode == DFmode].mult;
5657 cost += rtx_cost (op0, mode, MULT, 0, speed);
5658 cost += rtx_cost (op1, mode, MULT, 1, speed);
5659 return cost;
5663 static int
5664 aarch64_address_cost (rtx x,
5665 machine_mode mode,
5666 addr_space_t as ATTRIBUTE_UNUSED,
5667 bool speed)
5669 enum rtx_code c = GET_CODE (x);
5670 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5671 struct aarch64_address_info info;
5672 int cost = 0;
5673 info.shift = 0;
5675 if (!aarch64_classify_address (&info, x, mode, c, false))
5677 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5679 /* This is a CONST or SYMBOL ref which will be split
5680 in a different way depending on the code model in use.
5681 Cost it through the generic infrastructure. */
5682 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5683 /* Divide through by the cost of one instruction to
5684 bring it to the same units as the address costs. */
5685 cost_symbol_ref /= COSTS_N_INSNS (1);
5686 /* The cost is then the cost of preparing the address,
5687 followed by an immediate (possibly 0) offset. */
5688 return cost_symbol_ref + addr_cost->imm_offset;
5690 else
5692 /* This is most likely a jump table from a case
5693 statement. */
5694 return addr_cost->register_offset;
5698 switch (info.type)
5700 case ADDRESS_LO_SUM:
5701 case ADDRESS_SYMBOLIC:
5702 case ADDRESS_REG_IMM:
5703 cost += addr_cost->imm_offset;
5704 break;
5706 case ADDRESS_REG_WB:
5707 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5708 cost += addr_cost->pre_modify;
5709 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5710 cost += addr_cost->post_modify;
5711 else
5712 gcc_unreachable ();
5714 break;
5716 case ADDRESS_REG_REG:
5717 cost += addr_cost->register_offset;
5718 break;
5720 case ADDRESS_REG_SXTW:
5721 cost += addr_cost->register_sextend;
5722 break;
5724 case ADDRESS_REG_UXTW:
5725 cost += addr_cost->register_zextend;
5726 break;
5728 default:
5729 gcc_unreachable ();
5733 if (info.shift > 0)
5735 /* For the sake of calculating the cost of the shifted register
5736 component, we can treat same sized modes in the same way. */
5737 switch (GET_MODE_BITSIZE (mode))
5739 case 16:
5740 cost += addr_cost->addr_scale_costs.hi;
5741 break;
5743 case 32:
5744 cost += addr_cost->addr_scale_costs.si;
5745 break;
5747 case 64:
5748 cost += addr_cost->addr_scale_costs.di;
5749 break;
5751 /* We can't tell, or this is a 128-bit vector. */
5752 default:
5753 cost += addr_cost->addr_scale_costs.ti;
5754 break;
5758 return cost;
5761 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5762 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5763 to be taken. */
5766 aarch64_branch_cost (bool speed_p, bool predictable_p)
5768 /* When optimizing for speed, use the cost of unpredictable branches. */
5769 const struct cpu_branch_cost *branch_costs =
5770 aarch64_tune_params.branch_costs;
5772 if (!speed_p || predictable_p)
5773 return branch_costs->predictable;
5774 else
5775 return branch_costs->unpredictable;
5778 /* Return true if the RTX X in mode MODE is a zero or sign extract
5779 usable in an ADD or SUB (extended register) instruction. */
5780 static bool
5781 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5783 /* Catch add with a sign extract.
5784 This is add_<optab><mode>_multp2. */
5785 if (GET_CODE (x) == SIGN_EXTRACT
5786 || GET_CODE (x) == ZERO_EXTRACT)
5788 rtx op0 = XEXP (x, 0);
5789 rtx op1 = XEXP (x, 1);
5790 rtx op2 = XEXP (x, 2);
5792 if (GET_CODE (op0) == MULT
5793 && CONST_INT_P (op1)
5794 && op2 == const0_rtx
5795 && CONST_INT_P (XEXP (op0, 1))
5796 && aarch64_is_extend_from_extract (mode,
5797 XEXP (op0, 1),
5798 op1))
5800 return true;
5803 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5804 No shift. */
5805 else if (GET_CODE (x) == SIGN_EXTEND
5806 || GET_CODE (x) == ZERO_EXTEND)
5807 return REG_P (XEXP (x, 0));
5809 return false;
5812 static bool
5813 aarch64_frint_unspec_p (unsigned int u)
5815 switch (u)
5817 case UNSPEC_FRINTZ:
5818 case UNSPEC_FRINTP:
5819 case UNSPEC_FRINTM:
5820 case UNSPEC_FRINTA:
5821 case UNSPEC_FRINTN:
5822 case UNSPEC_FRINTX:
5823 case UNSPEC_FRINTI:
5824 return true;
5826 default:
5827 return false;
5831 /* Return true iff X is an rtx that will match an extr instruction
5832 i.e. as described in the *extr<mode>5_insn family of patterns.
5833 OP0 and OP1 will be set to the operands of the shifts involved
5834 on success and will be NULL_RTX otherwise. */
5836 static bool
5837 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5839 rtx op0, op1;
5840 machine_mode mode = GET_MODE (x);
5842 *res_op0 = NULL_RTX;
5843 *res_op1 = NULL_RTX;
5845 if (GET_CODE (x) != IOR)
5846 return false;
5848 op0 = XEXP (x, 0);
5849 op1 = XEXP (x, 1);
5851 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5852 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5854 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5855 if (GET_CODE (op1) == ASHIFT)
5856 std::swap (op0, op1);
5858 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5859 return false;
5861 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5862 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5864 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5865 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5867 *res_op0 = XEXP (op0, 0);
5868 *res_op1 = XEXP (op1, 0);
5869 return true;
5873 return false;
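 /* For instance, in DImode (ior (ashift X (const_int 48))
    (lshiftrt Y (const_int 16))) matches: the shift amounts sum to 64, so
    *res_op0 becomes X and *res_op1 becomes Y, corresponding to an
    "extr Xd, X, Y, #16" style instruction.  */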
5876 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5877 storing it in *COST. Result is true if the total cost of the operation
5878 has now been calculated. */
5879 static bool
5880 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5882 rtx inner;
5883 rtx comparator;
5884 enum rtx_code cmpcode;
5886 if (COMPARISON_P (op0))
5888 inner = XEXP (op0, 0);
5889 comparator = XEXP (op0, 1);
5890 cmpcode = GET_CODE (op0);
5892 else
5894 inner = op0;
5895 comparator = const0_rtx;
5896 cmpcode = NE;
5899 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5901 /* Conditional branch. */
5902 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5903 return true;
5904 else
5906 if (cmpcode == NE || cmpcode == EQ)
5908 if (comparator == const0_rtx)
5910 /* TBZ/TBNZ/CBZ/CBNZ. */
5911 if (GET_CODE (inner) == ZERO_EXTRACT)
5912 /* TBZ/TBNZ. */
5913 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5914 ZERO_EXTRACT, 0, speed);
5915 else
5916 /* CBZ/CBNZ. */
5917 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5919 return true;
5922 else if (cmpcode == LT || cmpcode == GE)
5924 /* TBZ/TBNZ. */
5925 if (comparator == const0_rtx)
5926 return true;
5930 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5932 /* CCMP. */
5933 if (GET_CODE (op1) == COMPARE)
5935 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
5936 if (XEXP (op1, 1) == const0_rtx)
5937 *cost += 1;
5938 if (speed)
5940 machine_mode mode = GET_MODE (XEXP (op1, 0));
5941 const struct cpu_cost_table *extra_cost
5942 = aarch64_tune_params.insn_extra_cost;
5944 if (GET_MODE_CLASS (mode) == MODE_INT)
5945 *cost += extra_cost->alu.arith;
5946 else
5947 *cost += extra_cost->fp[mode == DFmode].compare;
5949 return true;
5952 /* It's a conditional operation based on the status flags,
5953 so it must be some flavor of CSEL. */
5955 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5956 if (GET_CODE (op1) == NEG
5957 || GET_CODE (op1) == NOT
5958 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5959 op1 = XEXP (op1, 0);
5960 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
5962 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
5963 op1 = XEXP (op1, 0);
5964 op2 = XEXP (op2, 0);
5967 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
5968 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
5969 return true;
5972 /* We don't know what this is, cost all operands. */
5973 return false;
5976 /* Check whether X is a bitfield operation of the form shift + extend that
5977 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
5978 operand to which the bitfield operation is applied. Otherwise return
5979 NULL_RTX. */
5981 static rtx
5982 aarch64_extend_bitfield_pattern_p (rtx x)
5984 rtx_code outer_code = GET_CODE (x);
5985 machine_mode outer_mode = GET_MODE (x);
5987 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
5988 && outer_mode != SImode && outer_mode != DImode)
5989 return NULL_RTX;
5991 rtx inner = XEXP (x, 0);
5992 rtx_code inner_code = GET_CODE (inner);
5993 machine_mode inner_mode = GET_MODE (inner);
5994 rtx op = NULL_RTX;
5996 switch (inner_code)
5998 case ASHIFT:
5999 if (CONST_INT_P (XEXP (inner, 1))
6000 && (inner_mode == QImode || inner_mode == HImode))
6001 op = XEXP (inner, 0);
6002 break;
6003 case LSHIFTRT:
6004 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6005 && (inner_mode == QImode || inner_mode == HImode))
6006 op = XEXP (inner, 0);
6007 break;
6008 case ASHIFTRT:
6009 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6010 && (inner_mode == QImode || inner_mode == HImode))
6011 op = XEXP (inner, 0);
6012 break;
6013 default:
6014 break;
6017 return op;
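 /* E.g. (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))) is
    recognised here and returns the inner register, since the combination
    maps onto a single UBFX of the low bits; the analogous sign-extending
    forms map onto SBFX/SBFIZ.  */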
6020 /* Calculate the cost of calculating X, storing it in *COST. Result
6021 is true if the total cost of the operation has now been calculated. */
6022 static bool
6023 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6024 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6026 rtx op0, op1, op2;
6027 const struct cpu_cost_table *extra_cost
6028 = aarch64_tune_params.insn_extra_cost;
6029 int code = GET_CODE (x);
6031 /* By default, assume that everything has equivalent cost to the
6032 cheapest instruction. Any additional costs are applied as a delta
6033 above this default. */
6034 *cost = COSTS_N_INSNS (1);
6036 switch (code)
6038 case SET:
6039 /* The cost depends entirely on the operands to SET. */
6040 *cost = 0;
6041 op0 = SET_DEST (x);
6042 op1 = SET_SRC (x);
6044 switch (GET_CODE (op0))
6046 case MEM:
6047 if (speed)
6049 rtx address = XEXP (op0, 0);
6050 if (VECTOR_MODE_P (mode))
6051 *cost += extra_cost->ldst.storev;
6052 else if (GET_MODE_CLASS (mode) == MODE_INT)
6053 *cost += extra_cost->ldst.store;
6054 else if (mode == SFmode)
6055 *cost += extra_cost->ldst.storef;
6056 else if (mode == DFmode)
6057 *cost += extra_cost->ldst.stored;
6059 *cost +=
6060 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6061 0, speed));
6064 *cost += rtx_cost (op1, mode, SET, 1, speed);
6065 return true;
6067 case SUBREG:
6068 if (! REG_P (SUBREG_REG (op0)))
6069 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6071 /* Fall through. */
6072 case REG:
6073 /* The cost is one per vector-register copied. */
6074 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6076 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6077 / GET_MODE_SIZE (V4SImode);
6078 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6080 /* const0_rtx is in general free, but we will use an
6081 instruction to set a register to 0. */
6082 else if (REG_P (op1) || op1 == const0_rtx)
6084 /* The cost is 1 per register copied. */
6085 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6086 / UNITS_PER_WORD;
6087 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6089 else
6090 /* Cost is just the cost of the RHS of the set. */
6091 *cost += rtx_cost (op1, mode, SET, 1, speed);
6092 return true;
6094 case ZERO_EXTRACT:
6095 case SIGN_EXTRACT:
6096 /* Bit-field insertion. Strip any redundant widening of
6097 the RHS to meet the width of the target. */
6098 if (GET_CODE (op1) == SUBREG)
6099 op1 = SUBREG_REG (op1);
6100 if ((GET_CODE (op1) == ZERO_EXTEND
6101 || GET_CODE (op1) == SIGN_EXTEND)
6102 && CONST_INT_P (XEXP (op0, 1))
6103 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6104 >= INTVAL (XEXP (op0, 1))))
6105 op1 = XEXP (op1, 0);
6107 if (CONST_INT_P (op1))
6109 /* MOV immediate is assumed to always be cheap. */
6110 *cost = COSTS_N_INSNS (1);
6112 else
6114 /* BFM. */
6115 if (speed)
6116 *cost += extra_cost->alu.bfi;
6117 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6120 return true;
6122 default:
6123 /* We can't make sense of this, assume default cost. */
6124 *cost = COSTS_N_INSNS (1);
6125 return false;
6127 return false;
6129 case CONST_INT:
6130 /* If an instruction can incorporate a constant within the
6131 instruction, the instruction's expression avoids calling
6132 rtx_cost() on the constant. If rtx_cost() is called on a
6133 constant, then it is usually because the constant must be
6134 moved into a register by one or more instructions.
6136 The exception is constant 0, which can be expressed
6137 as XZR/WZR and is therefore free. The exception to this is
6138 if we have (set (reg) (const0_rtx)) in which case we must cost
6139 the move. However, we can catch that when we cost the SET, so
6140 we don't need to consider that here. */
6141 if (x == const0_rtx)
6142 *cost = 0;
6143 else
6145 /* To an approximation, building any other constant is
6146 proportionally expensive to the number of instructions
6147 required to build that constant. This is true whether we
6148 are compiling for SPEED or otherwise. */
6149 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6150 (NULL_RTX, x, false, mode));
6152 return true;
6154 case CONST_DOUBLE:
6155 if (speed)
6157 /* mov[df,sf]_aarch64. */
6158 if (aarch64_float_const_representable_p (x))
6159 /* FMOV (scalar immediate). */
6160 *cost += extra_cost->fp[mode == DFmode].fpconst;
6161 else if (!aarch64_float_const_zero_rtx_p (x))
6163 /* This will be a load from memory. */
6164 if (mode == DFmode)
6165 *cost += extra_cost->ldst.loadd;
6166 else
6167 *cost += extra_cost->ldst.loadf;
6169 else
6170 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6171 or MOV v0.s[0], wzr - neither of which is modeled by the
6172 cost tables. Just use the default cost. */
6177 return true;
6179 case MEM:
6180 if (speed)
6182 /* For loads we want the base cost of a load, plus an
6183 approximation for the additional cost of the addressing
6184 mode. */
6185 rtx address = XEXP (x, 0);
6186 if (VECTOR_MODE_P (mode))
6187 *cost += extra_cost->ldst.loadv;
6188 else if (GET_MODE_CLASS (mode) == MODE_INT)
6189 *cost += extra_cost->ldst.load;
6190 else if (mode == SFmode)
6191 *cost += extra_cost->ldst.loadf;
6192 else if (mode == DFmode)
6193 *cost += extra_cost->ldst.loadd;
6195 *cost +=
6196 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6197 0, speed));
6200 return true;
6202 case NEG:
6203 op0 = XEXP (x, 0);
6205 if (VECTOR_MODE_P (mode))
6207 if (speed)
6209 /* FNEG. */
6210 *cost += extra_cost->vect.alu;
6212 return false;
6215 if (GET_MODE_CLASS (mode) == MODE_INT)
6217 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6218 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6220 /* CSETM. */
6221 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6222 return true;
6225 /* Cost this as SUB wzr, X. */
6226 op0 = CONST0_RTX (mode);
6227 op1 = XEXP (x, 0);
6228 goto cost_minus;
6231 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6233 /* Support (neg(fma...)) as a single instruction only if
6234 sign of zeros is unimportant. This matches the decision
6235 making in aarch64.md. */
6236 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6238 /* FNMADD. */
6239 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6240 return true;
6242 if (GET_CODE (op0) == MULT)
6244 /* FNMUL. */
6245 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6246 return true;
6248 if (speed)
6249 /* FNEG. */
6250 *cost += extra_cost->fp[mode == DFmode].neg;
6251 return false;
6254 return false;
6256 case CLRSB:
6257 case CLZ:
6258 if (speed)
6260 if (VECTOR_MODE_P (mode))
6261 *cost += extra_cost->vect.alu;
6262 else
6263 *cost += extra_cost->alu.clz;
6266 return false;
6268 case COMPARE:
6269 op0 = XEXP (x, 0);
6270 op1 = XEXP (x, 1);
6272 if (op1 == const0_rtx
6273 && GET_CODE (op0) == AND)
6275 x = op0;
6276 mode = GET_MODE (op0);
6277 goto cost_logic;
6280 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6282 /* TODO: A write to the CC flags possibly costs extra, this
6283 needs encoding in the cost tables. */
6285 mode = GET_MODE (op0);
6286 /* ANDS. */
6287 if (GET_CODE (op0) == AND)
6289 x = op0;
6290 goto cost_logic;
6293 if (GET_CODE (op0) == PLUS)
6295 /* ADDS (and CMN alias). */
6296 x = op0;
6297 goto cost_plus;
6300 if (GET_CODE (op0) == MINUS)
6302 /* SUBS. */
6303 x = op0;
6304 goto cost_minus;
6307 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6308 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6309 && CONST_INT_P (XEXP (op0, 2)))
6311 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6312 Handle it here directly rather than going to cost_logic
6313 since we know the immediate generated for the TST is valid
6314 so we can avoid creating an intermediate rtx for it only
6315 for costing purposes. */
6316 if (speed)
6317 *cost += extra_cost->alu.logical;
6319 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6320 ZERO_EXTRACT, 0, speed);
6321 return true;
6324 if (GET_CODE (op1) == NEG)
6326 /* CMN. */
6327 if (speed)
6328 *cost += extra_cost->alu.arith;
6330 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6331 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6332 return true;
6335 /* CMP.
6337 Compare can freely swap the order of operands, and
6338 canonicalization puts the more complex operation first.
6339 But the integer MINUS logic expects the shift/extend
6340 operation in op1. */
6341 if (! (REG_P (op0)
6342 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6344 op0 = XEXP (x, 1);
6345 op1 = XEXP (x, 0);
6347 goto cost_minus;
6350 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6352 /* FCMP. */
6353 if (speed)
6354 *cost += extra_cost->fp[mode == DFmode].compare;
6356 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6358 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6359 /* FCMP supports constant 0.0 for no extra cost. */
6360 return true;
6362 return false;
6365 if (VECTOR_MODE_P (mode))
6367 /* Vector compare. */
6368 if (speed)
6369 *cost += extra_cost->vect.alu;
6371 if (aarch64_float_const_zero_rtx_p (op1))
6373 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6374 cost. */
6375 return true;
6377 return false;
6379 return false;
6381 case MINUS:
6383 op0 = XEXP (x, 0);
6384 op1 = XEXP (x, 1);
6386 cost_minus:
6387 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6389 /* Detect valid immediates. */
6390 if ((GET_MODE_CLASS (mode) == MODE_INT
6391 || (GET_MODE_CLASS (mode) == MODE_CC
6392 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6393 && CONST_INT_P (op1)
6394 && aarch64_uimm12_shift (INTVAL (op1)))
6396 if (speed)
6397 /* SUB(S) (immediate). */
6398 *cost += extra_cost->alu.arith;
6399 return true;
6402 /* Look for SUB (extended register). */
6403 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6405 if (speed)
6406 *cost += extra_cost->alu.extend_arith;
6408 op1 = aarch64_strip_extend (op1);
6409 *cost += rtx_cost (op1, VOIDmode,
6410 (enum rtx_code) GET_CODE (op1), 0, speed);
6411 return true;
6414 rtx new_op1 = aarch64_strip_extend (op1);
6416 /* Cost this as an FMA-alike operation. */
6417 if ((GET_CODE (new_op1) == MULT
6418 || aarch64_shift_p (GET_CODE (new_op1)))
6419 && code != COMPARE)
6421 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6422 (enum rtx_code) code,
6423 speed);
6424 return true;
6427 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6429 if (speed)
6431 if (VECTOR_MODE_P (mode))
6433 /* Vector SUB. */
6434 *cost += extra_cost->vect.alu;
6436 else if (GET_MODE_CLASS (mode) == MODE_INT)
6438 /* SUB(S). */
6439 *cost += extra_cost->alu.arith;
6441 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6443 /* FSUB. */
6444 *cost += extra_cost->fp[mode == DFmode].addsub;
6447 return true;
6450 case PLUS:
6452 rtx new_op0;
6454 op0 = XEXP (x, 0);
6455 op1 = XEXP (x, 1);
6457 cost_plus:
6458 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6459 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6461 /* CSINC. */
6462 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6463 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6464 return true;
6467 if (GET_MODE_CLASS (mode) == MODE_INT
6468 && CONST_INT_P (op1)
6469 && aarch64_uimm12_shift (INTVAL (op1)))
6471 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6473 if (speed)
6474 /* ADD (immediate). */
6475 *cost += extra_cost->alu.arith;
6476 return true;
6479 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6481 /* Look for ADD (extended register). */
6482 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6484 if (speed)
6485 *cost += extra_cost->alu.extend_arith;
6487 op0 = aarch64_strip_extend (op0);
6488 *cost += rtx_cost (op0, VOIDmode,
6489 (enum rtx_code) GET_CODE (op0), 0, speed);
6490 return true;
6493 /* Strip any extend, leave shifts behind as we will
6494 cost them through mult_cost. */
6495 new_op0 = aarch64_strip_extend (op0);
6497 if (GET_CODE (new_op0) == MULT
6498 || aarch64_shift_p (GET_CODE (new_op0)))
6500 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6501 speed);
6502 return true;
6505 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6507 if (speed)
6509 if (VECTOR_MODE_P (mode))
6511 /* Vector ADD. */
6512 *cost += extra_cost->vect.alu;
6514 else if (GET_MODE_CLASS (mode) == MODE_INT)
6516 /* ADD. */
6517 *cost += extra_cost->alu.arith;
6519 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6521 /* FADD. */
6522 *cost += extra_cost->fp[mode == DFmode].addsub;
6525 return true;
6528 case BSWAP:
6529 *cost = COSTS_N_INSNS (1);
6531 if (speed)
6533 if (VECTOR_MODE_P (mode))
6534 *cost += extra_cost->vect.alu;
6535 else
6536 *cost += extra_cost->alu.rev;
6538 return false;
6540 case IOR:
6541 if (aarch_rev16_p (x))
6543 *cost = COSTS_N_INSNS (1);
6545 if (speed)
6547 if (VECTOR_MODE_P (mode))
6548 *cost += extra_cost->vect.alu;
6549 else
6550 *cost += extra_cost->alu.rev;
6552 return true;
6555 if (aarch64_extr_rtx_p (x, &op0, &op1))
6557 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6558 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6559 if (speed)
6560 *cost += extra_cost->alu.shift;
6562 return true;
6564 /* Fall through. */
6565 case XOR:
6566 case AND:
6567 cost_logic:
6568 op0 = XEXP (x, 0);
6569 op1 = XEXP (x, 1);
6571 if (VECTOR_MODE_P (mode))
6573 if (speed)
6574 *cost += extra_cost->vect.alu;
6575 return true;
6578 if (code == AND
6579 && GET_CODE (op0) == MULT
6580 && CONST_INT_P (XEXP (op0, 1))
6581 && CONST_INT_P (op1)
6582 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6583 INTVAL (op1)) != 0)
6585 /* This is a UBFM/SBFM. */
6586 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6587 if (speed)
6588 *cost += extra_cost->alu.bfx;
6589 return true;
6592 if (GET_MODE_CLASS (mode) == MODE_INT)
6594 /* We possibly get the immediate for free, this is not
6595 modelled. */
6596 if (CONST_INT_P (op1)
6597 && aarch64_bitmask_imm (INTVAL (op1), mode))
6599 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6601 if (speed)
6602 *cost += extra_cost->alu.logical;
6604 return true;
6606 else
6608 rtx new_op0 = op0;
6610 /* Handle ORN, EON, or BIC. */
6611 if (GET_CODE (op0) == NOT)
6612 op0 = XEXP (op0, 0);
6614 new_op0 = aarch64_strip_shift (op0);
6616 /* If we had a shift on op0 then this is a logical-shift-
6617 by-register/immediate operation. Otherwise, this is just
6618 a logical operation. */
6619 if (speed)
6621 if (new_op0 != op0)
6623 /* Shift by immediate. */
6624 if (CONST_INT_P (XEXP (op0, 1)))
6625 *cost += extra_cost->alu.log_shift;
6626 else
6627 *cost += extra_cost->alu.log_shift_reg;
6629 else
6630 *cost += extra_cost->alu.logical;
6633 /* In both cases we want to cost both operands. */
6634 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6635 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6637 return true;
6640 return false;
6642 case NOT:
6643 x = XEXP (x, 0);
6644 op0 = aarch64_strip_shift (x);
6646 if (VECTOR_MODE_P (mode))
6648 /* Vector NOT. */
6649 *cost += extra_cost->vect.alu;
6650 return false;
6653 /* MVN-shifted-reg. */
6654 if (op0 != x)
6656 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6658 if (speed)
6659 *cost += extra_cost->alu.log_shift;
6661 return true;
6663 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6664 Handle the second form here taking care that 'a' in the above can
6665 be a shift. */
6666 else if (GET_CODE (op0) == XOR)
6668 rtx newop0 = XEXP (op0, 0);
6669 rtx newop1 = XEXP (op0, 1);
6670 rtx op0_stripped = aarch64_strip_shift (newop0);
6672 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6673 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6675 if (speed)
6677 if (op0_stripped != newop0)
6678 *cost += extra_cost->alu.log_shift;
6679 else
6680 *cost += extra_cost->alu.logical;
6683 return true;
6685 /* MVN. */
6686 if (speed)
6687 *cost += extra_cost->alu.logical;
6689 return false;
6691 case ZERO_EXTEND:
6693 op0 = XEXP (x, 0);
6694 /* If a value is written in SI mode, then zero extended to DI
6695 mode, the operation will in general be free as a write to
6696 a 'w' register implicitly zeroes the upper bits of an 'x'
6697 register. However, if this is
6699 (set (reg) (zero_extend (reg)))
6701 we must cost the explicit register move. */
6702 if (mode == DImode
6703 && GET_MODE (op0) == SImode
6704 && outer == SET)
6706 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6708 if (!op_cost && speed)
6709 /* MOV. */
6710 *cost += extra_cost->alu.extend;
6711 else
6712 /* Free, the cost is that of the SI mode operation. */
6713 *cost = op_cost;
6715 return true;
6717 else if (MEM_P (op0))
6719 /* All loads can zero extend to any size for free. */
6720 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6721 return true;
6724 op0 = aarch64_extend_bitfield_pattern_p (x);
6725 if (op0)
6727 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6728 if (speed)
6729 *cost += extra_cost->alu.bfx;
6730 return true;
6733 if (speed)
6735 if (VECTOR_MODE_P (mode))
6737 /* UMOV. */
6738 *cost += extra_cost->vect.alu;
6740 else
6742 /* UXTB/UXTH. */
6743 *cost += extra_cost->alu.extend;
6746 return false;
6748 case SIGN_EXTEND:
6749 if (MEM_P (XEXP (x, 0)))
6751 /* LDRSH. */
6752 if (speed)
6754 rtx address = XEXP (XEXP (x, 0), 0);
6755 *cost += extra_cost->ldst.load_sign_extend;
6757 *cost +=
6758 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6759 0, speed));
6761 return true;
6764 op0 = aarch64_extend_bitfield_pattern_p (x);
6765 if (op0)
6767 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6768 if (speed)
6769 *cost += extra_cost->alu.bfx;
6770 return true;
6773 if (speed)
6775 if (VECTOR_MODE_P (mode))
6776 *cost += extra_cost->vect.alu;
6777 else
6778 *cost += extra_cost->alu.extend;
6780 return false;
6782 case ASHIFT:
6783 op0 = XEXP (x, 0);
6784 op1 = XEXP (x, 1);
6786 if (CONST_INT_P (op1))
6788 if (speed)
6790 if (VECTOR_MODE_P (mode))
6792 /* Vector shift (immediate). */
6793 *cost += extra_cost->vect.alu;
6795 else
6797 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6798 aliases. */
6799 *cost += extra_cost->alu.shift;
6803 /* We can incorporate zero/sign extend for free. */
6804 if (GET_CODE (op0) == ZERO_EXTEND
6805 || GET_CODE (op0) == SIGN_EXTEND)
6806 op0 = XEXP (op0, 0);
6808 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6809 return true;
6811 else
6813 if (speed)
6815 if (VECTOR_MODE_P (mode))
6817 /* Vector shift (register). */
6818 *cost += extra_cost->vect.alu;
6820 else
6822 /* LSLV. */
6823 *cost += extra_cost->alu.shift_reg;
6826 return false; /* All arguments need to be in registers. */
6829 case ROTATE:
6830 case ROTATERT:
6831 case LSHIFTRT:
6832 case ASHIFTRT:
6833 op0 = XEXP (x, 0);
6834 op1 = XEXP (x, 1);
6836 if (CONST_INT_P (op1))
6838 /* ASR (immediate) and friends. */
6839 if (speed)
6841 if (VECTOR_MODE_P (mode))
6842 *cost += extra_cost->vect.alu;
6843 else
6844 *cost += extra_cost->alu.shift;
6847 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6848 return true;
6850 else
6853 /* ASR (register) and friends. */
6854 if (speed)
6856 if (VECTOR_MODE_P (mode))
6857 *cost += extra_cost->vect.alu;
6858 else
6859 *cost += extra_cost->alu.shift_reg;
6861 return false; /* All arguments need to be in registers. */
6864 case SYMBOL_REF:
6866 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6867 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6869 /* LDR. */
6870 if (speed)
6871 *cost += extra_cost->ldst.load;
6873 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6874 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6876 /* ADRP, followed by ADD. */
6877 *cost += COSTS_N_INSNS (1);
6878 if (speed)
6879 *cost += 2 * extra_cost->alu.arith;
6881 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6882 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6884 /* ADR. */
6885 if (speed)
6886 *cost += extra_cost->alu.arith;
6889 if (flag_pic)
6891 /* One extra load instruction, after accessing the GOT. */
6892 *cost += COSTS_N_INSNS (1);
6893 if (speed)
6894 *cost += extra_cost->ldst.load;
6896 return true;
6898 case HIGH:
6899 case LO_SUM:
6900 /* ADRP/ADD (immediate). */
6901 if (speed)
6902 *cost += extra_cost->alu.arith;
6903 return true;
6905 case ZERO_EXTRACT:
6906 case SIGN_EXTRACT:
6907 /* UBFX/SBFX. */
6908 if (speed)
6910 if (VECTOR_MODE_P (mode))
6911 *cost += extra_cost->vect.alu;
6912 else
6913 *cost += extra_cost->alu.bfx;
6916 /* We can trust that the immediates used will be correct (there
6917 are no by-register forms), so we need only cost op0. */
6918 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
6919 return true;
6921 case MULT:
6922 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6923 /* aarch64_rtx_mult_cost always handles recursion to its
6924 operands. */
6925 return true;
6927 case MOD:
6928 /* We can expand signed mod by power of 2 using a NEGS, two parallel
6929 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
6930 an unconditional negate. This case should only ever be reached through
6931 the set_smod_pow2_cheap check in expmed.c. */
6932 if (CONST_INT_P (XEXP (x, 1))
6933 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
6934 && (mode == SImode || mode == DImode))
6936 /* We expand to 4 instructions. Reset the baseline. */
6937 *cost = COSTS_N_INSNS (4);
6939 if (speed)
6940 *cost += 2 * extra_cost->alu.logical
6941 + 2 * extra_cost->alu.arith;
6943 return true;
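/* For example (an illustrative reading of the code above, not a separate
   cost table entry): a signed x % 8 in SImode is costed as 4 insns, the
   NEGS/AND/AND/CSNEG sequence described in the comment, plus two logical
   and two arithmetic extra costs when optimizing for speed.  */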
6946 /* Fall through. */
6947 case UMOD:
6948 if (speed)
6950 if (VECTOR_MODE_P (mode))
6951 *cost += extra_cost->vect.alu;
6952 else if (GET_MODE_CLASS (mode) == MODE_INT)
6953 *cost += (extra_cost->mult[mode == DImode].add
6954 + extra_cost->mult[mode == DImode].idiv);
6955 else if (mode == DFmode)
6956 *cost += (extra_cost->fp[1].mult
6957 + extra_cost->fp[1].div);
6958 else if (mode == SFmode)
6959 *cost += (extra_cost->fp[0].mult
6960 + extra_cost->fp[0].div);
6962 return false; /* All arguments need to be in registers. */
6964 case DIV:
6965 case UDIV:
6966 case SQRT:
6967 if (speed)
6969 if (VECTOR_MODE_P (mode))
6970 *cost += extra_cost->vect.alu;
6971 else if (GET_MODE_CLASS (mode) == MODE_INT)
6972 /* There is no integer SQRT, so only DIV and UDIV can get
6973 here. */
6974 *cost += extra_cost->mult[mode == DImode].idiv;
6975 else
6976 *cost += extra_cost->fp[mode == DFmode].div;
6978 return false; /* All arguments need to be in registers. */
6980 case IF_THEN_ELSE:
6981 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6982 XEXP (x, 2), cost, speed);
6984 case EQ:
6985 case NE:
6986 case GT:
6987 case GTU:
6988 case LT:
6989 case LTU:
6990 case GE:
6991 case GEU:
6992 case LE:
6993 case LEU:
6995 return false; /* All arguments must be in registers. */
6997 case FMA:
6998 op0 = XEXP (x, 0);
6999 op1 = XEXP (x, 1);
7000 op2 = XEXP (x, 2);
7002 if (speed)
7004 if (VECTOR_MODE_P (mode))
7005 *cost += extra_cost->vect.alu;
7006 else
7007 *cost += extra_cost->fp[mode == DFmode].fma;
7010 /* FMSUB, FNMADD, and FNMSUB are free. */
7011 if (GET_CODE (op0) == NEG)
7012 op0 = XEXP (op0, 0);
7014 if (GET_CODE (op2) == NEG)
7015 op2 = XEXP (op2, 0);
7017 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7018 and the by-element operand as operand 0. */
7019 if (GET_CODE (op1) == NEG)
7020 op1 = XEXP (op1, 0);
7022 /* Catch vector-by-element operations. The by-element operand can
7023 either be (vec_duplicate (vec_select (x))) or just
7024 (vec_select (x)), depending on whether we are multiplying by
7025 a vector or a scalar.
7027 Canonicalization is not very good in these cases: FMA4 will put the
7028 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7029 if (GET_CODE (op0) == VEC_DUPLICATE)
7030 op0 = XEXP (op0, 0);
7031 else if (GET_CODE (op1) == VEC_DUPLICATE)
7032 op1 = XEXP (op1, 0);
7034 if (GET_CODE (op0) == VEC_SELECT)
7035 op0 = XEXP (op0, 0);
7036 else if (GET_CODE (op1) == VEC_SELECT)
7037 op1 = XEXP (op1, 0);
7039 /* If the remaining parameters are not registers,
7040 get the cost to put them into registers. */
7041 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7042 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7043 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7044 return true;
7046 case FLOAT:
7047 case UNSIGNED_FLOAT:
7048 if (speed)
7049 *cost += extra_cost->fp[mode == DFmode].fromint;
7050 return false;
7052 case FLOAT_EXTEND:
7053 if (speed)
7055 if (VECTOR_MODE_P (mode))
7057 /* Vector widen. */
7058 *cost += extra_cost->vect.alu;
7060 else
7061 *cost += extra_cost->fp[mode == DFmode].widen;
7063 return false;
7065 case FLOAT_TRUNCATE:
7066 if (speed)
7068 if (VECTOR_MODE_P (mode))
7070 /* Vector conversion. */
7071 *cost += extra_cost->vect.alu;
7073 else
7074 *cost += extra_cost->fp[mode == DFmode].narrow;
7076 return false;
7078 case FIX:
7079 case UNSIGNED_FIX:
7080 x = XEXP (x, 0);
7081 /* Strip the rounding part. They will all be implemented
7082 by the fcvt* family of instructions anyway. */
7083 if (GET_CODE (x) == UNSPEC)
7085 unsigned int uns_code = XINT (x, 1);
7087 if (uns_code == UNSPEC_FRINTA
7088 || uns_code == UNSPEC_FRINTM
7089 || uns_code == UNSPEC_FRINTN
7090 || uns_code == UNSPEC_FRINTP
7091 || uns_code == UNSPEC_FRINTZ)
7092 x = XVECEXP (x, 0, 0);
7095 if (speed)
7097 if (VECTOR_MODE_P (mode))
7098 *cost += extra_cost->vect.alu;
7099 else
7100 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7103 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7104 fixed-point fcvt. */
7105 if (GET_CODE (x) == MULT
7106 && ((VECTOR_MODE_P (mode)
7107 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7108 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7110 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7111 0, speed);
7112 return true;
7115 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7116 return true;
7118 case ABS:
7119 if (VECTOR_MODE_P (mode))
7121 /* ABS (vector). */
7122 if (speed)
7123 *cost += extra_cost->vect.alu;
7125 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7127 op0 = XEXP (x, 0);
7129 /* FABD, which is analogous to FADD. */
7130 if (GET_CODE (op0) == MINUS)
7132 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7133 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7134 if (speed)
7135 *cost += extra_cost->fp[mode == DFmode].addsub;
7137 return true;
7139 /* Simple FABS is analogous to FNEG. */
7140 if (speed)
7141 *cost += extra_cost->fp[mode == DFmode].neg;
7143 else
7145 /* Integer ABS will either be split to
7146 two arithmetic instructions, or will be an ABS
7147 (scalar), which we don't model. */
7148 *cost = COSTS_N_INSNS (2);
7149 if (speed)
7150 *cost += 2 * extra_cost->alu.arith;
7152 return false;
7154 case SMAX:
7155 case SMIN:
7156 if (speed)
7158 if (VECTOR_MODE_P (mode))
7159 *cost += extra_cost->vect.alu;
7160 else
7162 /* FMAXNM/FMINNM/FMAX/FMIN.
7163 TODO: This may not be accurate for all implementations, but
7164 we do not model this in the cost tables. */
7165 *cost += extra_cost->fp[mode == DFmode].addsub;
7168 return false;
7170 case UNSPEC:
7171 /* The floating point round to integer frint* instructions. */
7172 if (aarch64_frint_unspec_p (XINT (x, 1)))
7174 if (speed)
7175 *cost += extra_cost->fp[mode == DFmode].roundint;
7177 return false;
7180 if (XINT (x, 1) == UNSPEC_RBIT)
7182 if (speed)
7183 *cost += extra_cost->alu.rev;
7185 return false;
7187 break;
7189 case TRUNCATE:
7191 /* Decompose <su>muldi3_highpart. */
7192 if (/* (truncate:DI */
7193 mode == DImode
7194 /* (lshiftrt:TI */
7195 && GET_MODE (XEXP (x, 0)) == TImode
7196 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7197 /* (mult:TI */
7198 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7199 /* (ANY_EXTEND:TI (reg:DI))
7200 (ANY_EXTEND:TI (reg:DI))) */
7201 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7202 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7203 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7204 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7205 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7206 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7207 /* (const_int 64) */
7208 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7209 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7211 /* UMULH/SMULH. */
7212 if (speed)
7213 *cost += extra_cost->mult[mode == DImode].extend;
7214 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7215 mode, MULT, 0, speed);
7216 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7217 mode, MULT, 1, speed);
7218 return true;
7221 /* Fall through. */
7222 default:
7223 break;
7226 if (dump_file && (dump_flags & TDF_DETAILS))
7227 fprintf (dump_file,
7228 "\nFailed to cost RTX. Assuming default cost.\n");
7230 return true;
7233 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7234 calculated for X. This cost is stored in *COST. Returns true
7235 if the total cost of X was calculated. */
7236 static bool
7237 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7238 int param, int *cost, bool speed)
7240 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7242 if (dump_file && (dump_flags & TDF_DETAILS))
7244 print_rtl_single (dump_file, x);
7245 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7246 speed ? "Hot" : "Cold",
7247 *cost, result ? "final" : "partial");
7250 return result;
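/* For illustration (format taken from the fprintf above): with a pass dump
   enabled at TDF_DETAILS, a fully-costed expression compiled for speed
   would be followed by a line such as

     Hot cost: 4 (final)

   printed after the RTL of the expression.  */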
7253 static int
7254 aarch64_register_move_cost (machine_mode mode,
7255 reg_class_t from_i, reg_class_t to_i)
7257 enum reg_class from = (enum reg_class) from_i;
7258 enum reg_class to = (enum reg_class) to_i;
7259 const struct cpu_regmove_cost *regmove_cost
7260 = aarch64_tune_params.regmove_cost;
7262 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7263 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7264 to = GENERAL_REGS;
7266 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7267 from = GENERAL_REGS;
7269 /* Moving between GPR and stack cost is the same as GP2GP. */
7270 if ((from == GENERAL_REGS && to == STACK_REG)
7271 || (to == GENERAL_REGS && from == STACK_REG))
7272 return regmove_cost->GP2GP;
7274 /* To/From the stack register, we move via the gprs. */
7275 if (to == STACK_REG || from == STACK_REG)
7276 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7277 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7279 if (GET_MODE_SIZE (mode) == 16)
7281 /* 128-bit operations on general registers require 2 instructions. */
7282 if (from == GENERAL_REGS && to == GENERAL_REGS)
7283 return regmove_cost->GP2GP * 2;
7284 else if (from == GENERAL_REGS)
7285 return regmove_cost->GP2FP * 2;
7286 else if (to == GENERAL_REGS)
7287 return regmove_cost->FP2GP * 2;
7289 /* When AdvSIMD instructions are disabled it is not possible to move
7290 a 128-bit value directly between Q registers. This is handled in
7291 secondary reload. A general register is used as a scratch to move
7292 the upper DI value and the lower DI value is moved directly,
7293 hence the cost is the sum of three moves. */
7294 if (! TARGET_SIMD)
7295 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7297 return regmove_cost->FP2FP;
7300 if (from == GENERAL_REGS && to == GENERAL_REGS)
7301 return regmove_cost->GP2GP;
7302 else if (from == GENERAL_REGS)
7303 return regmove_cost->GP2FP;
7304 else if (to == GENERAL_REGS)
7305 return regmove_cost->FP2GP;
7307 return regmove_cost->FP2FP;
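/* Worked example (a reading of the code above, not of any particular cost
   table): for a 128-bit mode, a GENERAL_REGS<->GENERAL_REGS move is costed
   as 2 * GP2GP, a move to or from the FP registers as 2 * GP2FP or
   2 * FP2GP, and without TARGET_SIMD an FP<->FP move is costed as
   GP2FP + FP2GP + FP2FP, since it must bounce through a general register.  */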
7310 static int
7311 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7312 reg_class_t rclass ATTRIBUTE_UNUSED,
7313 bool in ATTRIBUTE_UNUSED)
7315 return aarch64_tune_params.memmov_cost;
7318 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7319 to optimize 1.0/sqrt. */
7321 static bool
7322 use_rsqrt_p (void)
7324 return (!flag_trapping_math
7325 && flag_unsafe_math_optimizations
7326 && ((aarch64_tune_params.extra_tuning_flags
7327 & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
7328 || flag_mrecip_low_precision_sqrt));
7331 /* Function to decide when to use the approximate reciprocal square root
7332 builtin. */
7334 static tree
7335 aarch64_builtin_reciprocal (tree fndecl)
7337 if (!use_rsqrt_p ())
7338 return NULL_TREE;
7339 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7342 typedef rtx (*rsqrte_type) (rtx, rtx);
7344 /* Select reciprocal square root initial estimate
7345 insn depending on machine mode. */
7347 rsqrte_type
7348 get_rsqrte_type (machine_mode mode)
7350 switch (mode)
7352 case DFmode: return gen_aarch64_rsqrte_df2;
7353 case SFmode: return gen_aarch64_rsqrte_sf2;
7354 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7355 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7356 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7357 default: gcc_unreachable ();
7361 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7363 /* Select reciprocal square root Newton-Raphson step
7364 insn depending on machine mode. */
7366 rsqrts_type
7367 get_rsqrts_type (machine_mode mode)
7369 switch (mode)
7371 case DFmode: return gen_aarch64_rsqrts_df3;
7372 case SFmode: return gen_aarch64_rsqrts_sf3;
7373 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7374 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7375 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7376 default: gcc_unreachable ();
7380 /* Emit instruction sequence to compute the reciprocal square root using the
7381 Newton-Raphson series. Iterate over the series twice for SF
7382 and thrice for DF. */
7384 void
7385 aarch64_emit_approx_rsqrt (rtx dst, rtx src)
7387 machine_mode mode = GET_MODE (src);
7388 gcc_assert (
7389 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7390 || mode == DFmode || mode == V2DFmode);
7392 rtx xsrc = gen_reg_rtx (mode);
7393 emit_move_insn (xsrc, src);
7394 rtx x0 = gen_reg_rtx (mode);
7396 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7398 bool double_mode = (mode == DFmode || mode == V2DFmode);
7400 int iterations = double_mode ? 3 : 2;
7402 /* Optionally iterate over the series one less time than otherwise. */
7403 if (flag_mrecip_low_precision_sqrt)
7404 iterations--;
7406 for (int i = 0; i < iterations; ++i)
7408 rtx x1 = gen_reg_rtx (mode);
7409 rtx x2 = gen_reg_rtx (mode);
7410 rtx x3 = gen_reg_rtx (mode);
7411 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7413 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7415 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7416 x0 = x1;
7419 emit_move_insn (dst, x0);
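/* Illustrative sketch, not part of the original source: the loop above is
   the standard Newton-Raphson refinement for 1/sqrt(a), where the frsqrts
   step computes (3 - a*b) / 2, i.e.

     x_{n+1} = x_n * (3 - a * x_n * x_n) / 2

   The same recurrence written in plain C, assuming some rough initial
   estimate E0 of 1/sqrt(A) (the names below are illustrative only):  */

static double
rsqrt_newton_raphson_sketch (double a, double e0, int iterations)
{
  double e = e0;
  for (int i = 0; i < iterations; ++i)
    /* One refinement step: e *= (3 - a*e*e) / 2.  */
    e = e * (3.0 - a * e * e) / 2.0;
  return e;
}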
7422 /* Return the number of instructions that can be issued per cycle. */
7423 static int
7424 aarch64_sched_issue_rate (void)
7426 return aarch64_tune_params.issue_rate;
7429 static int
7430 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7432 int issue_rate = aarch64_sched_issue_rate ();
7434 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7438 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7439 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7440 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7442 static int
7443 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7444 int ready_index)
7446 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7450 /* Vectorizer cost model target hooks. */
7452 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7453 static int
7454 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7455 tree vectype,
7456 int misalign ATTRIBUTE_UNUSED)
7458 unsigned elements;
7460 switch (type_of_cost)
7462 case scalar_stmt:
7463 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7465 case scalar_load:
7466 return aarch64_tune_params.vec_costs->scalar_load_cost;
7468 case scalar_store:
7469 return aarch64_tune_params.vec_costs->scalar_store_cost;
7471 case vector_stmt:
7472 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7474 case vector_load:
7475 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7477 case vector_store:
7478 return aarch64_tune_params.vec_costs->vec_store_cost;
7480 case vec_to_scalar:
7481 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7483 case scalar_to_vec:
7484 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7486 case unaligned_load:
7487 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7489 case unaligned_store:
7490 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7492 case cond_branch_taken:
7493 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7495 case cond_branch_not_taken:
7496 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7498 case vec_perm:
7499 return aarch64_tune_params.vec_costs->vec_permute_cost;
7501 case vec_promote_demote:
7502 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7504 case vec_construct:
7505 elements = TYPE_VECTOR_SUBPARTS (vectype);
7506 return elements / 2 + 1;
7508 default:
7509 gcc_unreachable ();
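/* Worked example for the vec_construct case above: a vector type with
   4 elements (e.g. V4SI) is costed as 4 / 2 + 1 = 3, i.e. roughly one
   statement per pair of elements plus one.  */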
7513 /* Implement targetm.vectorize.add_stmt_cost. */
7514 static unsigned
7515 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7516 struct _stmt_vec_info *stmt_info, int misalign,
7517 enum vect_cost_model_location where)
7519 unsigned *cost = (unsigned *) data;
7520 unsigned retval = 0;
7522 if (flag_vect_cost_model)
7524 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7525 int stmt_cost =
7526 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7528 /* Statements in an inner loop relative to the loop being
7529 vectorized are weighted more heavily. The value here is
7530 arbitrary and could potentially be improved with analysis. */
7531 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7532 count *= 50; /* FIXME */
7534 retval = (unsigned) (count * stmt_cost);
7535 cost[where] += retval;
7538 return retval;
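/* For example, with the weighting above a vect_body statement of cost 1
   that lies in an inner loop relative to the loop being vectorized is
   accumulated as 1 * 50 = 50 into the vect_body bucket.  */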
7541 static void initialize_aarch64_code_model (struct gcc_options *);
7543 /* Parse the TO_PARSE string and put the architecture struct that it
7544 selects into RES and the architectural features into ISA_FLAGS.
7545 Return an aarch64_parse_opt_result describing the parse result.
7546 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7548 static enum aarch64_parse_opt_result
7549 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7550 unsigned long *isa_flags)
7552 char *ext;
7553 const struct processor *arch;
7554 char *str = (char *) alloca (strlen (to_parse) + 1);
7555 size_t len;
7557 strcpy (str, to_parse);
7559 ext = strchr (str, '+');
7561 if (ext != NULL)
7562 len = ext - str;
7563 else
7564 len = strlen (str);
7566 if (len == 0)
7567 return AARCH64_PARSE_MISSING_ARG;
7570 /* Loop through the list of supported ARCHes to find a match. */
7571 for (arch = all_architectures; arch->name != NULL; arch++)
7573 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7575 unsigned long isa_temp = arch->flags;
7577 if (ext != NULL)
7579 /* TO_PARSE string contains at least one extension. */
7580 enum aarch64_parse_opt_result ext_res
7581 = aarch64_parse_extension (ext, &isa_temp);
7583 if (ext_res != AARCH64_PARSE_OK)
7584 return ext_res;
7586 /* Extension parsing was successful. Confirm the result
7587 arch and ISA flags. */
7588 *res = arch;
7589 *isa_flags = isa_temp;
7590 return AARCH64_PARSE_OK;
7594 /* ARCH name not found in list. */
7595 return AARCH64_PARSE_INVALID_ARG;
7598 /* Parse the TO_PARSE string and put the cpu it selects into RES and the
7599 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
7600 describing the parse result. If there is an error parsing, RES and
7601 ISA_FLAGS are left unchanged. */
7603 static enum aarch64_parse_opt_result
7604 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7605 unsigned long *isa_flags)
7607 char *ext;
7608 const struct processor *cpu;
7609 char *str = (char *) alloca (strlen (to_parse) + 1);
7610 size_t len;
7612 strcpy (str, to_parse);
7614 ext = strchr (str, '+');
7616 if (ext != NULL)
7617 len = ext - str;
7618 else
7619 len = strlen (str);
7621 if (len == 0)
7622 return AARCH64_PARSE_MISSING_ARG;
7625 /* Loop through the list of supported CPUs to find a match. */
7626 for (cpu = all_cores; cpu->name != NULL; cpu++)
7628 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7630 unsigned long isa_temp = cpu->flags;
7633 if (ext != NULL)
7635 /* TO_PARSE string contains at least one extension. */
7636 enum aarch64_parse_opt_result ext_res
7637 = aarch64_parse_extension (ext, &isa_temp);
7639 if (ext_res != AARCH64_PARSE_OK)
7640 return ext_res;
7642 /* Extension parsing was successful. Confirm the result
7643 cpu and ISA flags. */
7644 *res = cpu;
7645 *isa_flags = isa_temp;
7646 return AARCH64_PARSE_OK;
7650 /* CPU name not found in list. */
7651 return AARCH64_PARSE_INVALID_ARG;
7654 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7655 Return an aarch64_parse_opt_result describing the parse result.
7656 If the parsing fails, RES does not change. */
7658 static enum aarch64_parse_opt_result
7659 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7661 const struct processor *cpu;
7662 char *str = (char *) alloca (strlen (to_parse) + 1);
7664 strcpy (str, to_parse);
7666 /* Loop through the list of supported CPUs to find a match. */
7667 for (cpu = all_cores; cpu->name != NULL; cpu++)
7669 if (strcmp (cpu->name, str) == 0)
7671 *res = cpu;
7672 return AARCH64_PARSE_OK;
7676 /* CPU name not found in list. */
7677 return AARCH64_PARSE_INVALID_ARG;
7680 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7681 described in FLAG. If it is, return the index bit for that fusion type.
7682 If not, error (printing OPTION_NAME) and return zero. */
7684 static unsigned int
7685 aarch64_parse_one_option_token (const char *token,
7686 size_t length,
7687 const struct aarch64_flag_desc *flag,
7688 const char *option_name)
7690 for (; flag->name != NULL; flag++)
7692 if (length == strlen (flag->name)
7693 && !strncmp (flag->name, token, length))
7694 return flag->flag;
7697 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7698 return 0;
7701 /* Parse OPTION which is a comma-separated list of flags to enable.
7702 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7703 default state we inherit from the CPU tuning structures. OPTION_NAME
7704 gives the top-level option we are parsing in the -moverride string,
7705 for use in error messages. */
7707 static unsigned int
7708 aarch64_parse_boolean_options (const char *option,
7709 const struct aarch64_flag_desc *flags,
7710 unsigned int initial_state,
7711 const char *option_name)
7713 const char separator = '.';
7714 const char* specs = option;
7715 const char* ntoken = option;
7716 unsigned int found_flags = initial_state;
7718 while ((ntoken = strchr (specs, separator)))
7720 size_t token_length = ntoken - specs;
7721 unsigned token_ops = aarch64_parse_one_option_token (specs,
7722 token_length,
7723 flags,
7724 option_name);
7725 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7726 in the token stream, reset the supported operations. So:
7728 adrp+add.cmp+branch.none.adrp+add
7730 would have the result of turning on only adrp+add fusion. */
7731 if (!token_ops)
7732 found_flags = 0;
7734 found_flags |= token_ops;
7735 specs = ++ntoken;
7738 /* We ended with a trailing separator; report the ill-formed string. */
7739 if (!(*specs))
7741 error ("%s string ill-formed\n", option_name);
7742 return 0;
7745 /* We still have one more token to parse. */
7746 size_t token_length = strlen (specs);
7747 unsigned token_ops = aarch64_parse_one_option_token (specs,
7748 token_length,
7749 flags,
7750 option_name);
7751 if (!token_ops)
7752 found_flags = 0;
7754 found_flags |= token_ops;
7755 return found_flags;
7758 /* Support for overriding instruction fusion. */
7760 static void
7761 aarch64_parse_fuse_string (const char *fuse_string,
7762 struct tune_params *tune)
7764 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7765 aarch64_fusible_pairs,
7766 tune->fusible_ops,
7767 "fuse=");
7770 /* Support for overriding other tuning flags. */
7772 static void
7773 aarch64_parse_tune_string (const char *tune_string,
7774 struct tune_params *tune)
7776 tune->extra_tuning_flags
7777 = aarch64_parse_boolean_options (tune_string,
7778 aarch64_tuning_flags,
7779 tune->extra_tuning_flags,
7780 "tune=");
7783 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7784 we understand. If it is, extract the option string and hand it off to
7785 the appropriate function. */
7787 void
7788 aarch64_parse_one_override_token (const char* token,
7789 size_t length,
7790 struct tune_params *tune)
7792 const struct aarch64_tuning_override_function *fn
7793 = aarch64_tuning_override_functions;
7795 const char *option_part = strchr (token, '=');
7796 if (!option_part)
7798 error ("tuning string missing in option (%s)", token);
7799 return;
7802 /* Get the length of the option name. */
7803 length = option_part - token;
7804 /* Skip the '=' to get to the option string. */
7805 option_part++;
7807 for (; fn->name != NULL; fn++)
7809 if (!strncmp (fn->name, token, length))
7811 fn->parse_override (option_part, tune);
7812 return;
7816 error ("unknown tuning option (%s)",token);
7817 return;
7820 /* Validate and clamp the TLS size according to the selected code model. */
7822 static void
7823 initialize_aarch64_tls_size (struct gcc_options *opts)
7825 if (aarch64_tls_size == 0)
7826 aarch64_tls_size = 24;
7828 switch (opts->x_aarch64_cmodel_var)
7830 case AARCH64_CMODEL_TINY:
7831 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
7832 needs two instructions to address, so we clamp the size to 24. */
7833 if (aarch64_tls_size > 24)
7834 aarch64_tls_size = 24;
7835 break;
7836 case AARCH64_CMODEL_SMALL:
7837 /* The maximum TLS size allowed under small is 4G. */
7838 if (aarch64_tls_size > 32)
7839 aarch64_tls_size = 32;
7840 break;
7841 case AARCH64_CMODEL_LARGE:
7842 /* The maximum TLS size allowed under large is 16E.
7843 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
7844 if (aarch64_tls_size > 48)
7845 aarch64_tls_size = 48;
7846 break;
7847 default:
7848 gcc_unreachable ();
7851 return;
7854 /* Parse STRING looking for options in the format:
7855 string :: option:string
7856 option :: name=substring
7857 name :: {a-z}
7858 substring :: defined by option. */
7860 static void
7861 aarch64_parse_override_string (const char* input_string,
7862 struct tune_params* tune)
7864 const char separator = ':';
7865 size_t string_length = strlen (input_string) + 1;
7866 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7867 char *string = string_root;
7868 strncpy (string, input_string, string_length);
7869 string[string_length - 1] = '\0';
7871 char* ntoken = string;
7873 while ((ntoken = strchr (string, separator)))
7875 size_t token_length = ntoken - string;
7876 /* Make this substring look like a string. */
7877 *ntoken = '\0';
7878 aarch64_parse_one_override_token (string, token_length, tune);
7879 string = ++ntoken;
7882 /* One last option to parse. */
7883 aarch64_parse_one_override_token (string, strlen (string), tune);
7884 free (string_root);
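/* For illustration, an override string is a ':'-separated list of
   name=value options whose values are '.'-separated flag lists, e.g.
   (using the fusion pairs mentioned in the comments above):

     -moverride=fuse=adrp+add.cmp+branch

   which is split here into a single "fuse=..." token and handed to
   aarch64_parse_fuse_string via aarch64_parse_one_override_token.  */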
7888 static void
7889 aarch64_override_options_after_change_1 (struct gcc_options *opts)
7891 /* The logic here is that if we are disabling all frame pointer generation
7892 then we do not need to disable leaf frame pointer generation as a
7893 separate operation. But if we are *only* disabling leaf frame pointer
7894 generation then we set flag_omit_frame_pointer to true, but in
7895 aarch64_frame_pointer_required we return false only for leaf functions.
7897 PR 70044: We have to be careful about being called multiple times for the
7898 same function. Once we have decided to set flag_omit_frame_pointer just
7899 so that we can omit leaf frame pointers, we must then not interpret a
7900 second call as meaning that all frame pointer generation should be
7901 omitted. We do this by setting flag_omit_frame_pointer to a special,
7902 non-zero value. */
7903 if (opts->x_flag_omit_frame_pointer == 2)
7904 opts->x_flag_omit_frame_pointer = 0;
7906 if (opts->x_flag_omit_frame_pointer)
7907 opts->x_flag_omit_leaf_frame_pointer = false;
7908 else if (opts->x_flag_omit_leaf_frame_pointer)
7909 opts->x_flag_omit_frame_pointer = 2;
7911 /* If not optimizing for size, set the default
7912 alignment to what the target wants. */
7913 if (!opts->x_optimize_size)
7915 if (opts->x_align_loops <= 0)
7916 opts->x_align_loops = aarch64_tune_params.loop_align;
7917 if (opts->x_align_jumps <= 0)
7918 opts->x_align_jumps = aarch64_tune_params.jump_align;
7919 if (opts->x_align_functions <= 0)
7920 opts->x_align_functions = aarch64_tune_params.function_align;
7923 /* If nopcrelative_literal_loads is set on the command line, this
7924 implies that the user asked for PC relative literal loads. */
7925 if (opts->x_nopcrelative_literal_loads == 1)
7926 aarch64_nopcrelative_literal_loads = false;
7928 /* If it is not set on the command line, we default to no pc
7929 relative literal loads, unless the workaround for Cortex-A53
7930 erratum 843419 is in effect. */
7931 /* This is PR70113. When building the Linux kernel with
7932 CONFIG_ARM64_ERRATUM_843419, support for relocations
7933 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
7934 removed from the kernel to avoid loading objects with possibly
7935 offending sequences. With nopcrelative_literal_loads, we would
7936 generate such relocations, preventing the kernel build from
7937 succeeding. */
7938 if (opts->x_nopcrelative_literal_loads == 2
7939 && !TARGET_FIX_ERR_A53_843419)
7940 aarch64_nopcrelative_literal_loads = true;
7942 /* In the tiny memory model it makes no sense
7943 to disallow non PC relative literal pool loads
7944 as many other things will break anyway. */
7945 if (opts->x_nopcrelative_literal_loads
7946 && (aarch64_cmodel == AARCH64_CMODEL_TINY
7947 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
7948 aarch64_nopcrelative_literal_loads = false;
7951 /* 'Unpack' the internal tuning structs and update the options
7952 in OPTS. The caller must have set up selected_tune and selected_arch
7953 as all the other target-specific codegen decisions are
7954 derived from them. */
7956 void
7957 aarch64_override_options_internal (struct gcc_options *opts)
7959 aarch64_tune_flags = selected_tune->flags;
7960 aarch64_tune = selected_tune->sched_core;
7961 /* Make a copy of the tuning parameters attached to the core, which
7962 we may later overwrite. */
7963 aarch64_tune_params = *(selected_tune->tune);
7964 aarch64_architecture_version = selected_arch->architecture_version;
7966 if (opts->x_aarch64_override_tune_string)
7967 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
7968 &aarch64_tune_params);
7970 /* This target defaults to strict volatile bitfields. */
7971 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7972 opts->x_flag_strict_volatile_bitfields = 1;
7974 initialize_aarch64_code_model (opts);
7975 initialize_aarch64_tls_size (opts);
7977 int queue_depth = 0;
7978 switch (aarch64_tune_params.autoprefetcher_model)
7980 case tune_params::AUTOPREFETCHER_OFF:
7981 queue_depth = -1;
7982 break;
7983 case tune_params::AUTOPREFETCHER_WEAK:
7984 queue_depth = 0;
7985 break;
7986 case tune_params::AUTOPREFETCHER_STRONG:
7987 queue_depth = max_insn_queue_index + 1;
7988 break;
7989 default:
7990 gcc_unreachable ();
7993 /* We don't mind passing in global_options_set here as we don't use
7994 the *options_set structs anyway. */
7995 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
7996 queue_depth,
7997 opts->x_param_values,
7998 global_options_set.x_param_values);
8000 /* Set the L1 cache line size. */
8001 if (selected_cpu->tune->cache_line_size != 0)
8002 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8003 selected_cpu->tune->cache_line_size,
8004 opts->x_param_values,
8005 global_options_set.x_param_values);
8007 aarch64_override_options_after_change_1 (opts);
8010 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8011 specified in STR and throw errors if appropriate. Put the results, if
8012 they are valid, in RES and ISA_FLAGS. Return whether the option is
8013 valid. */
8015 static bool
8016 aarch64_validate_mcpu (const char *str, const struct processor **res,
8017 unsigned long *isa_flags)
8019 enum aarch64_parse_opt_result parse_res
8020 = aarch64_parse_cpu (str, res, isa_flags);
8022 if (parse_res == AARCH64_PARSE_OK)
8023 return true;
8025 switch (parse_res)
8027 case AARCH64_PARSE_MISSING_ARG:
8028 error ("missing cpu name in -mcpu=%qs", str);
8029 break;
8030 case AARCH64_PARSE_INVALID_ARG:
8031 error ("unknown value %qs for -mcpu", str);
8032 break;
8033 case AARCH64_PARSE_INVALID_FEATURE:
8034 error ("invalid feature modifier in -mcpu=%qs", str);
8035 break;
8036 default:
8037 gcc_unreachable ();
8040 return false;
8043 /* Validate a command-line -march option. Parse the arch and extensions
8044 (if any) specified in STR and throw errors if appropriate. Put the
8045 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8046 option is valid. */
8048 static bool
8049 aarch64_validate_march (const char *str, const struct processor **res,
8050 unsigned long *isa_flags)
8052 enum aarch64_parse_opt_result parse_res
8053 = aarch64_parse_arch (str, res, isa_flags);
8055 if (parse_res == AARCH64_PARSE_OK)
8056 return true;
8058 switch (parse_res)
8060 case AARCH64_PARSE_MISSING_ARG:
8061 error ("missing arch name in -march=%qs", str);
8062 break;
8063 case AARCH64_PARSE_INVALID_ARG:
8064 error ("unknown value %qs for -march", str);
8065 break;
8066 case AARCH64_PARSE_INVALID_FEATURE:
8067 error ("invalid feature modifier in -march=%qs", str);
8068 break;
8069 default:
8070 gcc_unreachable ();
8073 return false;
8076 /* Validate a command-line -mtune option. Parse the cpu
8077 specified in STR and throw errors if appropriate. Put the
8078 result, if it is valid, in RES. Return whether the option is
8079 valid. */
8081 static bool
8082 aarch64_validate_mtune (const char *str, const struct processor **res)
8084 enum aarch64_parse_opt_result parse_res
8085 = aarch64_parse_tune (str, res);
8087 if (parse_res == AARCH64_PARSE_OK)
8088 return true;
8090 switch (parse_res)
8092 case AARCH64_PARSE_MISSING_ARG:
8093 error ("missing cpu name in -mtune=%qs", str);
8094 break;
8095 case AARCH64_PARSE_INVALID_ARG:
8096 error ("unknown value %qs for -mtune", str);
8097 break;
8098 default:
8099 gcc_unreachable ();
8101 return false;
8104 /* Return the CPU corresponding to the enum CPU.
8105 If it doesn't specify a cpu, return the default. */
8107 static const struct processor *
8108 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8110 if (cpu != aarch64_none)
8111 return &all_cores[cpu];
8113 /* The & 0x3f is to extract the bottom 6 bits that encode the
8114 default cpu as selected by the --with-cpu GCC configure option
8115 in config.gcc.
8116 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8117 flags mechanism should be reworked to make it more sane. */
8118 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
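/* Note (inferred from the use here and from the ">> 6" in
   aarch64_override_options below): TARGET_CPU_DEFAULT appears to pack the
   default cpu ident in its bottom 6 bits and the configure-time ISA flags
   in the remaining bits.  */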
8121 /* Return the architecture corresponding to the enum ARCH.
8122 If it doesn't specify a valid architecture, return the default. */
8124 static const struct processor *
8125 aarch64_get_arch (enum aarch64_arch arch)
8127 if (arch != aarch64_no_arch)
8128 return &all_architectures[arch];
8130 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8132 return &all_architectures[cpu->arch];
8135 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8136 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8137 tuning structs. In particular it must set selected_tune and
8138 aarch64_isa_flags that define the available ISA features and tuning
8139 decisions. It must also set selected_arch as this will be used to
8140 output the .arch asm tags for each function. */
8142 static void
8143 aarch64_override_options (void)
8145 unsigned long cpu_isa = 0;
8146 unsigned long arch_isa = 0;
8147 aarch64_isa_flags = 0;
8149 bool valid_cpu = true;
8150 bool valid_tune = true;
8151 bool valid_arch = true;
8153 selected_cpu = NULL;
8154 selected_arch = NULL;
8155 selected_tune = NULL;
8157 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8158 If either of -march or -mtune is given, they override their
8159 respective component of -mcpu. */
8160 if (aarch64_cpu_string)
8161 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8162 &cpu_isa);
8164 if (aarch64_arch_string)
8165 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8166 &arch_isa);
8168 if (aarch64_tune_string)
8169 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8171 /* If the user did not specify a processor, choose the default
8172 one for them. This will be the CPU set during configuration using
8173 --with-cpu, otherwise it is "generic". */
8174 if (!selected_cpu)
8176 if (selected_arch)
8178 selected_cpu = &all_cores[selected_arch->ident];
8179 aarch64_isa_flags = arch_isa;
8180 explicit_arch = selected_arch->arch;
8182 else
8184 /* Get default configure-time CPU. */
8185 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8186 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8189 if (selected_tune)
8190 explicit_tune_core = selected_tune->ident;
8192 /* If both -mcpu and -march are specified check that they are architecturally
8193 compatible, warn if they're not and prefer the -march ISA flags. */
8194 else if (selected_arch)
8196 if (selected_arch->arch != selected_cpu->arch)
8198 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8199 all_architectures[selected_cpu->arch].name,
8200 selected_arch->name);
8202 aarch64_isa_flags = arch_isa;
8203 explicit_arch = selected_arch->arch;
8204 explicit_tune_core = selected_tune ? selected_tune->ident
8205 : selected_cpu->ident;
8207 else
8209 /* -mcpu but no -march. */
8210 aarch64_isa_flags = cpu_isa;
8211 explicit_tune_core = selected_tune ? selected_tune->ident
8212 : selected_cpu->ident;
8213 gcc_assert (selected_cpu);
8214 selected_arch = &all_architectures[selected_cpu->arch];
8215 explicit_arch = selected_arch->arch;
8218 /* Set the arch as well, as we will need it when outputting
8219 the .arch directive in assembly. */
8220 if (!selected_arch)
8222 gcc_assert (selected_cpu);
8223 selected_arch = &all_architectures[selected_cpu->arch];
8226 if (!selected_tune)
8227 selected_tune = selected_cpu;
8229 #ifndef HAVE_AS_MABI_OPTION
8230 /* The compiler may have been configured with 2.23.* binutils, which does
8231 not have support for ILP32. */
8232 if (TARGET_ILP32)
8233 error ("Assembler does not support -mabi=ilp32");
8234 #endif
8236 /* Make sure we properly set up the explicit options. */
8237 if ((aarch64_cpu_string && valid_cpu)
8238 || (aarch64_tune_string && valid_tune))
8239 gcc_assert (explicit_tune_core != aarch64_none);
8241 if ((aarch64_cpu_string && valid_cpu)
8242 || (aarch64_arch_string && valid_arch))
8243 gcc_assert (explicit_arch != aarch64_no_arch);
8245 aarch64_override_options_internal (&global_options);
8247 /* Save these options as the default ones in case we push and pop them later
8248 while processing functions with potential target attributes. */
8249 target_option_default_node = target_option_current_node
8250 = build_target_option_node (&global_options);
8252 aarch64_register_fma_steering ();
8256 /* Implement targetm.override_options_after_change. */
8258 static void
8259 aarch64_override_options_after_change (void)
8261 aarch64_override_options_after_change_1 (&global_options);
8264 static struct machine_function *
8265 aarch64_init_machine_status (void)
8267 struct machine_function *machine;
8268 machine = ggc_cleared_alloc<machine_function> ();
8269 return machine;
8272 void
8273 aarch64_init_expanders (void)
8275 init_machine_status = aarch64_init_machine_status;
8278 /* Select the code model to use, taking the PIC settings into account. */
8279 static void
8280 initialize_aarch64_code_model (struct gcc_options *opts)
8282 if (opts->x_flag_pic)
8284 switch (opts->x_aarch64_cmodel_var)
8286 case AARCH64_CMODEL_TINY:
8287 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8288 break;
8289 case AARCH64_CMODEL_SMALL:
8290 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8291 aarch64_cmodel = (flag_pic == 2
8292 ? AARCH64_CMODEL_SMALL_PIC
8293 : AARCH64_CMODEL_SMALL_SPIC);
8294 #else
8295 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8296 #endif
8297 break;
8298 case AARCH64_CMODEL_LARGE:
8299 sorry ("code model %qs with -f%s", "large",
8300 opts->x_flag_pic > 1 ? "PIC" : "pic");
8301 break;
8302 default:
8303 gcc_unreachable ();
8306 else
8307 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8310 /* Implement TARGET_OPTION_SAVE. */
8312 static void
8313 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8315 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8318 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8319 using the information saved in PTR. */
8321 static void
8322 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8324 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8325 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8326 opts->x_explicit_arch = ptr->x_explicit_arch;
8327 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8328 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8330 aarch64_override_options_internal (opts);
8333 /* Implement TARGET_OPTION_PRINT. */
8335 static void
8336 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8338 const struct processor *cpu
8339 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8340 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8341 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8342 std::string extension
8343 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8345 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8346 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8347 arch->name, extension.c_str ());
8350 static GTY(()) tree aarch64_previous_fndecl;
8352 void
8353 aarch64_reset_previous_fndecl (void)
8355 aarch64_previous_fndecl = NULL;
8358 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8359 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8360 make sure optab availability predicates are recomputed when necessary. */
8362 void
8363 aarch64_save_restore_target_globals (tree new_tree)
8365 if (TREE_TARGET_GLOBALS (new_tree))
8366 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8367 else if (new_tree == target_option_default_node)
8368 restore_target_globals (&default_target_globals);
8369 else
8370 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8373 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8374 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8375 of the function, if such exists. This function may be called multiple
8376 times on a single function so use aarch64_previous_fndecl to avoid
8377 setting up identical state. */
8379 static void
8380 aarch64_set_current_function (tree fndecl)
8382 if (!fndecl || fndecl == aarch64_previous_fndecl)
8383 return;
8385 tree old_tree = (aarch64_previous_fndecl
8386 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8387 : NULL_TREE);
8389 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8391 /* If current function has no attributes but the previous one did,
8392 use the default node. */
8393 if (!new_tree && old_tree)
8394 new_tree = target_option_default_node;
8396 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8397 the default have been handled by aarch64_save_restore_target_globals from
8398 aarch64_pragma_target_parse. */
8399 if (old_tree == new_tree)
8400 return;
8402 aarch64_previous_fndecl = fndecl;
8404 /* First set the target options. */
8405 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8407 aarch64_save_restore_target_globals (new_tree);
8410 /* Enum describing the various ways we can handle attributes.
8411 In many cases we can reuse the generic option handling machinery. */
8413 enum aarch64_attr_opt_type
8415 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8416 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8417 aarch64_attr_enum, /* Attribute sets an enum variable. */
8418 aarch64_attr_custom /* Attribute requires a custom handling function. */
8421 /* All the information needed to handle a target attribute.
8422 NAME is the name of the attribute.
8423 ATTR_TYPE specifies the type of behavior of the attribute as described
8424 in the definition of enum aarch64_attr_opt_type.
8425 ALLOW_NEG is true if the attribute supports a "no-" form.
8426 HANDLER is the function that takes the attribute string and whether
8427 it is a pragma or attribute and handles the option. It is needed only
8428 when the ATTR_TYPE is aarch64_attr_custom.
8429 OPT_NUM is the enum specifying the option that the attribute modifies.
8430 This is needed for attributes that mirror the behavior of a command-line
8431 option, that is, attributes whose ATTR_TYPE is aarch64_attr_mask,
8432 aarch64_attr_bool or aarch64_attr_enum. */
8434 struct aarch64_attribute_info
8436 const char *name;
8437 enum aarch64_attr_opt_type attr_type;
8438 bool allow_neg;
8439 bool (*handler) (const char *, const char *);
8440 enum opt_code opt_num;
8443 /* Handle the ARCH_STR argument to the arch= target attribute.
8444 PRAGMA_OR_ATTR is used in potential error messages. */
8446 static bool
8447 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8449 const struct processor *tmp_arch = NULL;
8450 enum aarch64_parse_opt_result parse_res
8451 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8453 if (parse_res == AARCH64_PARSE_OK)
8455 gcc_assert (tmp_arch);
8456 selected_arch = tmp_arch;
8457 explicit_arch = selected_arch->arch;
8458 return true;
8461 switch (parse_res)
8463 case AARCH64_PARSE_MISSING_ARG:
8464 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8465 break;
8466 case AARCH64_PARSE_INVALID_ARG:
8467 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8468 break;
8469 case AARCH64_PARSE_INVALID_FEATURE:
8470 error ("invalid feature modifier %qs for 'arch' target %s",
8471 str, pragma_or_attr);
8472 break;
8473 default:
8474 gcc_unreachable ();
8477 return false;
8480 /* Handle the argument CPU_STR to the cpu= target attribute.
8481 PRAGMA_OR_ATTR is used in potential error messages. */
8483 static bool
8484 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8486 const struct processor *tmp_cpu = NULL;
8487 enum aarch64_parse_opt_result parse_res
8488 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8490 if (parse_res == AARCH64_PARSE_OK)
8492 gcc_assert (tmp_cpu);
8493 selected_tune = tmp_cpu;
8494 explicit_tune_core = selected_tune->ident;
8496 selected_arch = &all_architectures[tmp_cpu->arch];
8497 explicit_arch = selected_arch->arch;
8498 return true;
8501 switch (parse_res)
8503 case AARCH64_PARSE_MISSING_ARG:
8504 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8505 break;
8506 case AARCH64_PARSE_INVALID_ARG:
8507 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8508 break;
8509 case AARCH64_PARSE_INVALID_FEATURE:
8510 error ("invalid feature modifier %qs for 'cpu' target %s",
8511 str, pragma_or_attr);
8512 break;
8513 default:
8514 gcc_unreachable ();
8517 return false;
8520 /* Handle the argument STR to the tune= target attribute.
8521 PRAGMA_OR_ATTR is used in potential error messages. */
8523 static bool
8524 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8526 const struct processor *tmp_tune = NULL;
8527 enum aarch64_parse_opt_result parse_res
8528 = aarch64_parse_tune (str, &tmp_tune);
8530 if (parse_res == AARCH64_PARSE_OK)
8532 gcc_assert (tmp_tune);
8533 selected_tune = tmp_tune;
8534 explicit_tune_core = selected_tune->ident;
8535 return true;
8538 switch (parse_res)
8540 case AARCH64_PARSE_INVALID_ARG:
8541 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8542 break;
8543 default:
8544 gcc_unreachable ();
8547 return false;
8550 /* Parse an architecture extensions target attribute string specified in STR.
8551 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8552 if successful. Update aarch64_isa_flags to reflect the ISA features
8553 modified.
8554 PRAGMA_OR_ATTR is used in potential error messages. */
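/* For illustration only (a sketch of the behavior, not part of the parsing
   machinery): "+nothing+simd" first clears every architectural feature bit
   and then turns on the feature bits that "+simd" implies, whereas
   "+fp+nosimd" adds the FP bits and then removes the SIMD bits on top of
   the current aarch64_isa_flags.  */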
8556 static bool
8557 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8559 enum aarch64_parse_opt_result parse_res;
8560 unsigned long isa_flags = aarch64_isa_flags;
8562 /* We allow "+nothing" at the beginning to clear out all architectural
8563 features if the user wants to handpick specific features. */
8564 if (strncmp ("+nothing", str, 8) == 0)
8566 isa_flags = 0;
8567 str += 8;
8570 parse_res = aarch64_parse_extension (str, &isa_flags);
8572 if (parse_res == AARCH64_PARSE_OK)
8574 aarch64_isa_flags = isa_flags;
8575 return true;
8578 switch (parse_res)
8580 case AARCH64_PARSE_MISSING_ARG:
8581 error ("missing feature modifier in target %s %qs",
8582 pragma_or_attr, str);
8583 break;
8585 case AARCH64_PARSE_INVALID_FEATURE:
8586 error ("invalid feature modifier in target %s %qs",
8587 pragma_or_attr, str);
8588 break;
8590 default:
8591 gcc_unreachable ();
8594 return false;
8597 /* The target attributes that we support. On top of these we also support just
8598 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8599 handled explicitly in aarch64_process_one_target_attr. */
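/* For illustration only: a sketch of how the attributes below can be spelled
   by users.  The declarations are hypothetical and are kept in an #if 0
   block so they are never compiled.  */
#if 0
__attribute__ ((target ("arch=armv8-a+crc")))            /* Custom handler.  */
void example_arch (void);
__attribute__ ((target ("no-omit-leaf-frame-pointer")))  /* Boolean, negated form.  */
void example_bool (void);
__attribute__ ((target ("cmodel=small")))                /* Enum-valued option.  */
void example_enum (void);
__attribute__ ((target ("+crc")))                        /* Bare ISA extension.  */
void example_isa (void);
#endif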
8601 static const struct aarch64_attribute_info aarch64_attributes[] =
8603 { "general-regs-only", aarch64_attr_mask, false, NULL,
8604 OPT_mgeneral_regs_only },
8605 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8606 OPT_mfix_cortex_a53_835769 },
8607 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8608 OPT_mfix_cortex_a53_843419 },
8609 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8610 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8611 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8612 OPT_momit_leaf_frame_pointer },
8613 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8614 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8615 OPT_march_ },
8616 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8617 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8618 OPT_mtune_ },
8619 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8622 /* Parse ARG_STR which contains the definition of one target attribute.
8623 Show appropriate errors if any or return true if the attribute is valid.
8624 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8625 we're processing a target attribute or pragma. */
8627 static bool
8628 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8630 bool invert = false;
8632 size_t len = strlen (arg_str);
8634 if (len == 0)
8636 error ("malformed target %s", pragma_or_attr);
8637 return false;
8640 char *str_to_check = (char *) alloca (len + 1);
8641 strcpy (str_to_check, arg_str);
8643 /* Skip leading whitespace. */
8644 while (*str_to_check == ' ' || *str_to_check == '\t')
8645 str_to_check++;
8647 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8648 It is easier to detect and handle it explicitly here rather than going
8649 through the machinery for the rest of the target attributes in this
8650 function. */
8651 if (*str_to_check == '+')
8652 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8654 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8656 invert = true;
8657 str_to_check += 3;
8659 char *arg = strchr (str_to_check, '=');
8661 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8662 and point ARG to "foo". */
8663 if (arg)
8665 *arg = '\0';
8666 arg++;
8668 const struct aarch64_attribute_info *p_attr;
8669 bool found = false;
8670 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8672 /* If the names don't match up, or the user has given an argument
8673 to an attribute that doesn't accept one, or didn't give an argument
8674 to an attribute that expects one, fail to match. */
8675 if (strcmp (str_to_check, p_attr->name) != 0)
8676 continue;
8678 found = true;
8679 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8680 || p_attr->attr_type == aarch64_attr_enum;
8682 if (attr_need_arg_p ^ (arg != NULL))
8684 error ("target %s %qs does not accept an argument",
8685 pragma_or_attr, str_to_check);
8686 return false;
8689 /* If the name matches but the attribute does not allow "no-" versions
8690 then we can't match. */
8691 if (invert && !p_attr->allow_neg)
8693 error ("target %s %qs does not allow a negated form",
8694 pragma_or_attr, str_to_check);
8695 return false;
8698 switch (p_attr->attr_type)
8700 /* Has a custom handler registered.
8701 For example, cpu=, arch=, tune=. */
8702 case aarch64_attr_custom:
8703 gcc_assert (p_attr->handler);
8704 if (!p_attr->handler (arg, pragma_or_attr))
8705 return false;
8706 break;
8708 /* Either set or unset a boolean option. */
8709 case aarch64_attr_bool:
8711 struct cl_decoded_option decoded;
8713 generate_option (p_attr->opt_num, NULL, !invert,
8714 CL_TARGET, &decoded);
8715 aarch64_handle_option (&global_options, &global_options_set,
8716 &decoded, input_location);
8717 break;
8719 /* Set or unset a bit in the target_flags. aarch64_handle_option
8720 should know what mask to apply given the option number. */
8721 case aarch64_attr_mask:
8723 struct cl_decoded_option decoded;
8724 /* We only need to specify the option number.
8725 aarch64_handle_option will know which mask to apply. */
8726 decoded.opt_index = p_attr->opt_num;
8727 decoded.value = !invert;
8728 aarch64_handle_option (&global_options, &global_options_set,
8729 &decoded, input_location);
8730 break;
8732 /* Use the option setting machinery to set an option to an enum. */
8733 case aarch64_attr_enum:
8735 gcc_assert (arg);
8736 bool valid;
8737 int value;
8738 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8739 &value, CL_TARGET);
8740 if (valid)
8742 set_option (&global_options, NULL, p_attr->opt_num, value,
8743 NULL, DK_UNSPECIFIED, input_location,
8744 global_dc);
8746 else
8748 error ("target %s %s=%s is not valid",
8749 pragma_or_attr, str_to_check, arg);
8751 break;
8753 default:
8754 gcc_unreachable ();
8758 /* If we reached here we either have found an attribute and validated
8759 it or didn't match any. If we matched an attribute but its arguments
8760 were malformed we will have returned false already. */
8761 return found;
8764 /* Count how many times the character C appears in
8765 NULL-terminated string STR. */
8767 static unsigned int
8768 num_occurences_in_str (char c, char *str)
8770 unsigned int res = 0;
8771 while (*str != '\0')
8773 if (*str == c)
8774 res++;
8776 str++;
8779 return res;
8782 /* Parse the tree in ARGS that contains the target attribute information
8783 and update the global target options space. PRAGMA_OR_ATTR is a string
8784 to be used in error messages, specifying whether this is processing
8785 a target attribute or a target pragma. */
8787 bool
8788 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
8790 if (TREE_CODE (args) == TREE_LIST)
8794 tree head = TREE_VALUE (args);
8795 if (head)
8797 if (!aarch64_process_target_attr (head, pragma_or_attr))
8798 return false;
8800 args = TREE_CHAIN (args);
8801 } while (args);
8803 return true;
8805 /* We expect to find a string to parse. */
8806 gcc_assert (TREE_CODE (args) == STRING_CST);
8808 size_t len = strlen (TREE_STRING_POINTER (args));
8809 char *str_to_check = (char *) alloca (len + 1);
8810 strcpy (str_to_check, TREE_STRING_POINTER (args));
8812 if (len == 0)
8814 error ("malformed target %s value", pragma_or_attr);
8815 return false;
8818 /* Used to catch empty entries between commas, e.g.
8819 attribute ((target ("attr1,,attr2"))). */
8820 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
8822 /* Handle multiple target attributes separated by ','. */
8823 char *token = strtok (str_to_check, ",");
8825 unsigned int num_attrs = 0;
8826 while (token)
8828 num_attrs++;
8829 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
8831 error ("target %s %qs is invalid", pragma_or_attr, token);
8832 return false;
8835 token = strtok (NULL, ",");
8838 if (num_attrs != num_commas + 1)
8840 error ("malformed target %s list %qs",
8841 pragma_or_attr, TREE_STRING_POINTER (args));
8842 return false;
8845 return true;
8848 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
8849 process attribute ((target ("..."))). */
8851 static bool
8852 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
8854 struct cl_target_option cur_target;
8855 bool ret;
8856 tree old_optimize;
8857 tree new_target, new_optimize;
8858 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8860 /* If what we're processing is the current pragma string then the
8861 target option node is already stored in target_option_current_node
8862 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
8863 having to re-parse the string. This is especially useful to keep
8864 arm_neon.h compile times down since that header contains a lot
8865 of intrinsics enclosed in pragmas. */
8866 if (!existing_target && args == current_target_pragma)
8868 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
8869 return true;
8871 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8873 old_optimize = build_optimization_node (&global_options);
8874 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8876 /* If the function changed the optimization levels as well as setting
8877 target options, start with the optimizations specified. */
8878 if (func_optimize && func_optimize != old_optimize)
8879 cl_optimization_restore (&global_options,
8880 TREE_OPTIMIZATION (func_optimize));
8882 /* Save the current target options to restore at the end. */
8883 cl_target_option_save (&cur_target, &global_options);
8885 /* If fndecl already has some target attributes applied to it, unpack
8886 them so that we add this attribute on top of them, rather than
8887 overwriting them. */
8888 if (existing_target)
8890 struct cl_target_option *existing_options
8891 = TREE_TARGET_OPTION (existing_target);
8893 if (existing_options)
8894 cl_target_option_restore (&global_options, existing_options);
8896 else
8897 cl_target_option_restore (&global_options,
8898 TREE_TARGET_OPTION (target_option_current_node));
8901 ret = aarch64_process_target_attr (args, "attribute");
8903 /* Set up any additional state. */
8904 if (ret)
8906 aarch64_override_options_internal (&global_options);
8907 /* Initialize SIMD builtins if we haven't already.
8908 Set current_target_pragma to NULL for the duration so that
8909 the builtin initialization code doesn't try to tag the functions
8910 being built with the attributes specified by any current pragma, thus
8911 going into an infinite recursion. */
8912 if (TARGET_SIMD)
8914 tree saved_current_target_pragma = current_target_pragma;
8915 current_target_pragma = NULL;
8916 aarch64_init_simd_builtins ();
8917 current_target_pragma = saved_current_target_pragma;
8919 new_target = build_target_option_node (&global_options);
8921 else
8922 new_target = NULL;
8924 new_optimize = build_optimization_node (&global_options);
8926 if (fndecl && ret)
8928 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
8930 if (old_optimize != new_optimize)
8931 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
8934 cl_target_option_restore (&global_options, &cur_target);
8936 if (old_optimize != new_optimize)
8937 cl_optimization_restore (&global_options,
8938 TREE_OPTIMIZATION (old_optimize));
8939 return ret;
8942 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
8943 tri-bool options (yes, no, don't care) and the default value is
8944 DEF, determine whether to reject inlining. */
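/* For example (purely illustrative): with DONT_CARE == 2 and DEF == 1,
   a callee value of 2 always permits inlining, a callee value of 1 is
   accepted against any caller (it equals the default), and a callee
   value of 0 is accepted only when the caller is 0 or 2.  */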
8946 static bool
8947 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
8948 int dont_care, int def)
8950 /* If the callee doesn't care, always allow inlining. */
8951 if (callee == dont_care)
8952 return true;
8954 /* If the caller doesn't care, always allow inlining. */
8955 if (caller == dont_care)
8956 return true;
8958 /* Otherwise, allow inlining if the callee and caller values agree,
8959 or if the callee is using the default value. */
8960 return (callee == caller || callee == def);
8963 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
8964 to inline CALLEE into CALLER based on target-specific info.
8965 Make sure that the caller and callee have compatible architectural
8966 features. Then go through the other possible target attributes
8967 and see if they can block inlining. Try not to reject always_inline
8968 callees unless they are incompatible architecturally. */
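/* For instance (an illustrative sketch only): a callee compiled with
   target ("+crc") cannot be inlined into a caller whose ISA flags lack CRC,
   because the callee's ISA flags must be a subset of the caller's; and a
   callee built without -mstrict-align may be inlined into a -mstrict-align
   caller, but not the other way around.  */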
8970 static bool
8971 aarch64_can_inline_p (tree caller, tree callee)
8973 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
8974 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
8976 /* If callee has no option attributes, then it is ok to inline. */
8977 if (!callee_tree)
8978 return true;
8980 struct cl_target_option *caller_opts
8981 = TREE_TARGET_OPTION (caller_tree ? caller_tree
8982 : target_option_default_node);
8984 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
8987 /* Callee's ISA flags should be a subset of the caller's. */
8988 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
8989 != callee_opts->x_aarch64_isa_flags)
8990 return false;
8992 /* Allow non-strict-aligned functions to be inlined into strict-aligned
8993 ones, but not the reverse. */
8994 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
8995 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
8996 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
8997 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
8998 return false;
9000 bool always_inline = lookup_attribute ("always_inline",
9001 DECL_ATTRIBUTES (callee));
9003 /* If the architectural features match up and the callee is always_inline
9004 then the other attributes don't matter. */
9005 if (always_inline)
9006 return true;
9008 if (caller_opts->x_aarch64_cmodel_var
9009 != callee_opts->x_aarch64_cmodel_var)
9010 return false;
9012 if (caller_opts->x_aarch64_tls_dialect
9013 != callee_opts->x_aarch64_tls_dialect)
9014 return false;
9016 /* Honour explicit requests to work around errata. */
9017 if (!aarch64_tribools_ok_for_inlining_p (
9018 caller_opts->x_aarch64_fix_a53_err835769,
9019 callee_opts->x_aarch64_fix_a53_err835769,
9020 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9021 return false;
9023 if (!aarch64_tribools_ok_for_inlining_p (
9024 caller_opts->x_aarch64_fix_a53_err843419,
9025 callee_opts->x_aarch64_fix_a53_err843419,
9026 2, TARGET_FIX_ERR_A53_843419))
9027 return false;
9029 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9030 caller and callee and they don't match up, reject inlining. */
9031 if (!aarch64_tribools_ok_for_inlining_p (
9032 caller_opts->x_flag_omit_leaf_frame_pointer,
9033 callee_opts->x_flag_omit_leaf_frame_pointer,
9034 2, 1))
9035 return false;
9037 /* If the callee has specific tuning overrides, respect them. */
9038 if (callee_opts->x_aarch64_override_tune_string != NULL
9039 && caller_opts->x_aarch64_override_tune_string == NULL)
9040 return false;
9042 /* If the user specified tuning override strings for the
9043 caller and callee and they don't match up, reject inlining.
9044 We just do a string compare here, we don't analyze the meaning
9045 of the string, as it would be too costly for little gain. */
9046 if (callee_opts->x_aarch64_override_tune_string
9047 && caller_opts->x_aarch64_override_tune_string
9048 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9049 caller_opts->x_aarch64_override_tune_string) != 0))
9050 return false;
9052 return true;
9055 /* Return true if SYMBOL_REF X binds locally. */
9057 static bool
9058 aarch64_symbol_binds_local_p (const_rtx x)
9060 return (SYMBOL_REF_DECL (x)
9061 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9062 : SYMBOL_REF_LOCAL_P (x));
9065 /* Return true if SYMBOL_REF X is thread-local. */
9066 static bool
9067 aarch64_tls_symbol_p (rtx x)
9069 if (! TARGET_HAVE_TLS)
9070 return false;
9072 if (GET_CODE (x) != SYMBOL_REF)
9073 return false;
9075 return SYMBOL_REF_TLS_MODEL (x) != 0;
9078 /* Classify a TLS symbol into one of the TLS kinds. */
9079 enum aarch64_symbol_type
9080 aarch64_classify_tls_symbol (rtx x)
9082 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9084 switch (tls_kind)
9086 case TLS_MODEL_GLOBAL_DYNAMIC:
9087 case TLS_MODEL_LOCAL_DYNAMIC:
9088 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9090 case TLS_MODEL_INITIAL_EXEC:
9091 switch (aarch64_cmodel)
9093 case AARCH64_CMODEL_TINY:
9094 case AARCH64_CMODEL_TINY_PIC:
9095 return SYMBOL_TINY_TLSIE;
9096 default:
9097 return SYMBOL_SMALL_TLSIE;
9100 case TLS_MODEL_LOCAL_EXEC:
9101 if (aarch64_tls_size == 12)
9102 return SYMBOL_TLSLE12;
9103 else if (aarch64_tls_size == 24)
9104 return SYMBOL_TLSLE24;
9105 else if (aarch64_tls_size == 32)
9106 return SYMBOL_TLSLE32;
9107 else if (aarch64_tls_size == 48)
9108 return SYMBOL_TLSLE48;
9109 else
9110 gcc_unreachable ();
9112 case TLS_MODEL_EMULATED:
9113 case TLS_MODEL_NONE:
9114 return SYMBOL_FORCE_TO_MEM;
9116 default:
9117 gcc_unreachable ();
9121 /* Return the method that should be used to access SYMBOL_REF or
9122 LABEL_REF X. */
9124 enum aarch64_symbol_type
9125 aarch64_classify_symbol (rtx x, rtx offset)
9127 if (GET_CODE (x) == LABEL_REF)
9129 switch (aarch64_cmodel)
9131 case AARCH64_CMODEL_LARGE:
9132 return SYMBOL_FORCE_TO_MEM;
9134 case AARCH64_CMODEL_TINY_PIC:
9135 case AARCH64_CMODEL_TINY:
9136 return SYMBOL_TINY_ABSOLUTE;
9138 case AARCH64_CMODEL_SMALL_SPIC:
9139 case AARCH64_CMODEL_SMALL_PIC:
9140 case AARCH64_CMODEL_SMALL:
9141 return SYMBOL_SMALL_ABSOLUTE;
9143 default:
9144 gcc_unreachable ();
9148 if (GET_CODE (x) == SYMBOL_REF)
9150 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
9152 /* This is alright even in PIC code as the constant
9153 pool reference is always PC relative and within
9154 the same translation unit. */
9155 if (nopcrelative_literal_loads
9156 && CONSTANT_POOL_ADDRESS_P (x))
9157 return SYMBOL_SMALL_ABSOLUTE;
9158 else
9159 return SYMBOL_FORCE_TO_MEM;
9162 if (aarch64_tls_symbol_p (x))
9163 return aarch64_classify_tls_symbol (x);
9165 switch (aarch64_cmodel)
9167 case AARCH64_CMODEL_TINY:
9168 /* When we retrieve a symbol + offset address, we have to make sure
9169 the offset does not cause overflow of the final address. But
9170 we have no way of knowing the address of the symbol at compile time,
9171 so we can't accurately say whether the distance between the PC and
9172 symbol + offset is outside the addressable range of +/-1M in the
9173 TINY code model. So we rely on images not being greater than 1M
9174 and cap the offset at 1M; anything beyond 1M will have to be
9175 loaded using an alternative mechanism. */
9176 if (SYMBOL_REF_WEAK (x)
9177 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9178 return SYMBOL_FORCE_TO_MEM;
9179 return SYMBOL_TINY_ABSOLUTE;
9181 case AARCH64_CMODEL_SMALL:
9182 /* Same reasoning as the tiny code model, but the offset cap here is
9183 4G. */
9184 if (SYMBOL_REF_WEAK (x)
9185 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9186 HOST_WIDE_INT_C (4294967264)))
9187 return SYMBOL_FORCE_TO_MEM;
9188 return SYMBOL_SMALL_ABSOLUTE;
9190 case AARCH64_CMODEL_TINY_PIC:
9191 if (!aarch64_symbol_binds_local_p (x))
9192 return SYMBOL_TINY_GOT;
9193 return SYMBOL_TINY_ABSOLUTE;
9195 case AARCH64_CMODEL_SMALL_SPIC:
9196 case AARCH64_CMODEL_SMALL_PIC:
9197 if (!aarch64_symbol_binds_local_p (x))
9198 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9199 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9200 return SYMBOL_SMALL_ABSOLUTE;
9202 default:
9203 gcc_unreachable ();
9207 /* By default push everything into the constant pool. */
9208 return SYMBOL_FORCE_TO_MEM;
9211 bool
9212 aarch64_constant_address_p (rtx x)
9214 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9217 bool
9218 aarch64_legitimate_pic_operand_p (rtx x)
9220 if (GET_CODE (x) == SYMBOL_REF
9221 || (GET_CODE (x) == CONST
9222 && GET_CODE (XEXP (x, 0)) == PLUS
9223 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9224 return false;
9226 return true;
9229 /* Return true if X holds either a quarter-precision floating-point constant
9230 (i.e. one representable as an FMOV immediate) or floating-point +0.0. */
9231 static bool
9232 aarch64_valid_floating_const (machine_mode mode, rtx x)
9234 if (!CONST_DOUBLE_P (x))
9235 return false;
9237 if (aarch64_float_const_zero_rtx_p (x))
9238 return true;
9240 /* Only 0.0 (handled above) is supported for TFmode; any other constant must be SFmode or DFmode. */
9241 if (!(mode == SFmode || mode == DFmode))
9242 return false;
9244 return aarch64_float_const_representable_p (x);
9247 static bool
9248 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9250 /* Do not allow vector struct mode constants. We could support
9251 0 and -1 easily, but they need support in aarch64-simd.md. */
9252 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9253 return false;
9255 /* This could probably go away because
9256 we now decompose CONST_INTs according to expand_mov_immediate. */
9257 if ((GET_CODE (x) == CONST_VECTOR
9258 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9259 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9260 return !targetm.cannot_force_const_mem (mode, x);
9262 if (GET_CODE (x) == HIGH
9263 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9264 return true;
9266 return aarch64_constant_address_p (x);
9270 aarch64_load_tp (rtx target)
9272 if (!target
9273 || GET_MODE (target) != Pmode
9274 || !register_operand (target, Pmode))
9275 target = gen_reg_rtx (Pmode);
9277 /* Can return in any reg. */
9278 emit_insn (gen_aarch64_load_tp_hard (target));
9279 return target;
9282 /* On AAPCS systems, this is the "struct __va_list". */
9283 static GTY(()) tree va_list_type;
9285 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9286 Return the type to use as __builtin_va_list.
9288 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9290 struct __va_list
9292 void *__stack;
9293 void *__gr_top;
9294 void *__vr_top;
9295 int __gr_offs;
9296 int __vr_offs;
9297 }; */
9299 static tree
9300 aarch64_build_builtin_va_list (void)
9302 tree va_list_name;
9303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9305 /* Create the type. */
9306 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9307 /* Give it the required name. */
9308 va_list_name = build_decl (BUILTINS_LOCATION,
9309 TYPE_DECL,
9310 get_identifier ("__va_list"),
9311 va_list_type);
9312 DECL_ARTIFICIAL (va_list_name) = 1;
9313 TYPE_NAME (va_list_type) = va_list_name;
9314 TYPE_STUB_DECL (va_list_type) = va_list_name;
9316 /* Create the fields. */
9317 f_stack = build_decl (BUILTINS_LOCATION,
9318 FIELD_DECL, get_identifier ("__stack"),
9319 ptr_type_node);
9320 f_grtop = build_decl (BUILTINS_LOCATION,
9321 FIELD_DECL, get_identifier ("__gr_top"),
9322 ptr_type_node);
9323 f_vrtop = build_decl (BUILTINS_LOCATION,
9324 FIELD_DECL, get_identifier ("__vr_top"),
9325 ptr_type_node);
9326 f_groff = build_decl (BUILTINS_LOCATION,
9327 FIELD_DECL, get_identifier ("__gr_offs"),
9328 integer_type_node);
9329 f_vroff = build_decl (BUILTINS_LOCATION,
9330 FIELD_DECL, get_identifier ("__vr_offs"),
9331 integer_type_node);
9333 /* Tell the tree-stdarg pass about our internal offset fields.
9334 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9335 purposes, to identify whether the code is updating the va_list internal
9336 offset fields in an irregular way. */
9337 va_list_gpr_counter_field = f_groff;
9338 va_list_fpr_counter_field = f_vroff;
9340 DECL_ARTIFICIAL (f_stack) = 1;
9341 DECL_ARTIFICIAL (f_grtop) = 1;
9342 DECL_ARTIFICIAL (f_vrtop) = 1;
9343 DECL_ARTIFICIAL (f_groff) = 1;
9344 DECL_ARTIFICIAL (f_vroff) = 1;
9346 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9347 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9348 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9349 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9350 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9352 TYPE_FIELDS (va_list_type) = f_stack;
9353 DECL_CHAIN (f_stack) = f_grtop;
9354 DECL_CHAIN (f_grtop) = f_vrtop;
9355 DECL_CHAIN (f_vrtop) = f_groff;
9356 DECL_CHAIN (f_groff) = f_vroff;
9358 /* Compute its layout. */
9359 layout_type (va_list_type);
9361 return va_list_type;
9364 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9365 static void
9366 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9368 const CUMULATIVE_ARGS *cum;
9369 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9370 tree stack, grtop, vrtop, groff, vroff;
9371 tree t;
9372 int gr_save_area_size = cfun->va_list_gpr_size;
9373 int vr_save_area_size = cfun->va_list_fpr_size;
9374 int vr_offset;
9376 cum = &crtl->args.info;
9377 if (cfun->va_list_gpr_size)
9378 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9379 cfun->va_list_gpr_size);
9380 if (cfun->va_list_fpr_size)
9381 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9382 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9384 if (!TARGET_FLOAT)
9386 gcc_assert (cum->aapcs_nvrn == 0);
9387 vr_save_area_size = 0;
9390 f_stack = TYPE_FIELDS (va_list_type_node);
9391 f_grtop = DECL_CHAIN (f_stack);
9392 f_vrtop = DECL_CHAIN (f_grtop);
9393 f_groff = DECL_CHAIN (f_vrtop);
9394 f_vroff = DECL_CHAIN (f_groff);
9396 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9397 NULL_TREE);
9398 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9399 NULL_TREE);
9400 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9401 NULL_TREE);
9402 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9403 NULL_TREE);
9404 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9405 NULL_TREE);
9407 /* Emit code to initialize STACK, which points to the next varargs stack
9408 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9409 by named arguments. STACK is 8-byte aligned. */
9410 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9411 if (cum->aapcs_stack_size > 0)
9412 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9413 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9414 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9416 /* Emit code to initialize GRTOP, the top of the GR save area.
9417 virtual_incoming_args_rtx should have been 16 byte aligned. */
9418 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9419 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9420 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9422 /* Emit code to initialize VRTOP, the top of the VR save area.
9423 This address is gr_save_area_bytes below GRTOP, rounded
9424 down to the next 16-byte boundary. */
9425 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9426 vr_offset = ROUND_UP (gr_save_area_size,
9427 STACK_BOUNDARY / BITS_PER_UNIT);
9429 if (vr_offset)
9430 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9431 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9432 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9434 /* Emit code to initialize GROFF, the offset from GRTOP of the
9435 next GPR argument. */
9436 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9437 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9438 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9440 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9441 of the next VR argument. */
9442 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9443 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9444 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
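/* A worked example (illustrative only, assuming the save areas are not
   limited by cfun->va_list_gpr_size/fpr_size): for void f (int n, ...)
   the single named argument consumes x0, so va_start records
   __gr_offs = -(8 - 1) * UNITS_PER_WORD = -56 and, when TARGET_FLOAT,
   __vr_offs = -8 * UNITS_PER_VREG = -128, with __gr_top and __vr_top
   pointing just past their respective register save areas.  */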
9447 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9449 static tree
9450 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9451 gimple_seq *post_p ATTRIBUTE_UNUSED)
9453 tree addr;
9454 bool indirect_p;
9455 bool is_ha; /* is HFA or HVA. */
9456 bool dw_align; /* double-word align. */
9457 machine_mode ag_mode = VOIDmode;
9458 int nregs;
9459 machine_mode mode;
9461 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9462 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9463 HOST_WIDE_INT size, rsize, adjust, align;
9464 tree t, u, cond1, cond2;
9466 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9467 if (indirect_p)
9468 type = build_pointer_type (type);
9470 mode = TYPE_MODE (type);
9472 f_stack = TYPE_FIELDS (va_list_type_node);
9473 f_grtop = DECL_CHAIN (f_stack);
9474 f_vrtop = DECL_CHAIN (f_grtop);
9475 f_groff = DECL_CHAIN (f_vrtop);
9476 f_vroff = DECL_CHAIN (f_groff);
9478 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9479 f_stack, NULL_TREE);
9480 size = int_size_in_bytes (type);
9481 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9483 dw_align = false;
9484 adjust = 0;
9485 if (aarch64_vfp_is_call_or_return_candidate (mode,
9486 type,
9487 &ag_mode,
9488 &nregs,
9489 &is_ha))
9491 /* TYPE passed in fp/simd registers. */
9492 if (!TARGET_FLOAT)
9493 aarch64_err_no_fpadvsimd (mode, "varargs");
9495 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9496 unshare_expr (valist), f_vrtop, NULL_TREE);
9497 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9498 unshare_expr (valist), f_vroff, NULL_TREE);
9500 rsize = nregs * UNITS_PER_VREG;
9502 if (is_ha)
9504 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9505 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9507 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9508 && size < UNITS_PER_VREG)
9510 adjust = UNITS_PER_VREG - size;
9513 else
9515 /* TYPE passed in general registers. */
9516 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9517 unshare_expr (valist), f_grtop, NULL_TREE);
9518 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9519 unshare_expr (valist), f_groff, NULL_TREE);
9520 rsize = ROUND_UP (size, UNITS_PER_WORD);
9521 nregs = rsize / UNITS_PER_WORD;
9523 if (align > 8)
9524 dw_align = true;
9526 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9527 && size < UNITS_PER_WORD)
9529 adjust = UNITS_PER_WORD - size;
9533 /* Get a local temporary for the field value. */
9534 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9536 /* Emit code to branch if off >= 0. */
9537 t = build2 (GE_EXPR, boolean_type_node, off,
9538 build_int_cst (TREE_TYPE (off), 0));
9539 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9541 if (dw_align)
9543 /* Emit: offs = (offs + 15) & -16. */
9544 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9545 build_int_cst (TREE_TYPE (off), 15));
9546 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9547 build_int_cst (TREE_TYPE (off), -16));
9548 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9550 else
9551 roundup = NULL;
9553 /* Update ap.__[g|v]r_offs */
9554 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9555 build_int_cst (TREE_TYPE (off), rsize));
9556 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9558 /* String up. */
9559 if (roundup)
9560 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9562 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9563 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9564 build_int_cst (TREE_TYPE (f_off), 0));
9565 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9567 /* String up: make sure the assignment happens before the use. */
9568 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9569 COND_EXPR_ELSE (cond1) = t;
9571 /* Prepare the trees handling the argument that is passed on the stack;
9572 the top-level node will be stored in ON_STACK. */
9573 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9574 if (align > 8)
9576 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9577 t = fold_convert (intDI_type_node, arg);
9578 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9579 build_int_cst (TREE_TYPE (t), 15));
9580 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9581 build_int_cst (TREE_TYPE (t), -16));
9582 t = fold_convert (TREE_TYPE (arg), t);
9583 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9585 else
9586 roundup = NULL;
9587 /* Advance ap.__stack */
9588 t = fold_convert (intDI_type_node, arg);
9589 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9590 build_int_cst (TREE_TYPE (t), size + 7));
9591 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9592 build_int_cst (TREE_TYPE (t), -8));
9593 t = fold_convert (TREE_TYPE (arg), t);
9594 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9595 /* String up roundup and advance. */
9596 if (roundup)
9597 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9598 /* String up with arg */
9599 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9600 /* Big-endianness related address adjustment. */
9601 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9602 && size < UNITS_PER_WORD)
9604 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9605 size_int (UNITS_PER_WORD - size));
9606 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9609 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9610 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9612 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9613 t = off;
9614 if (adjust)
9615 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9616 build_int_cst (TREE_TYPE (off), adjust));
9618 t = fold_convert (sizetype, t);
9619 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9621 if (is_ha)
9623 /* type ha; // treat as "struct {ftype field[n];}"
9624 ... [computing offs]
9625 for (i = 0; i < nregs; ++i, offs += 16)
9626 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9627 return ha; */
9628 int i;
9629 tree tmp_ha, field_t, field_ptr_t;
9631 /* Declare a local variable. */
9632 tmp_ha = create_tmp_var_raw (type, "ha");
9633 gimple_add_tmp_var (tmp_ha);
9635 /* Establish the base type. */
9636 switch (ag_mode)
9638 case SFmode:
9639 field_t = float_type_node;
9640 field_ptr_t = float_ptr_type_node;
9641 break;
9642 case DFmode:
9643 field_t = double_type_node;
9644 field_ptr_t = double_ptr_type_node;
9645 break;
9646 case TFmode:
9647 field_t = long_double_type_node;
9648 field_ptr_t = long_double_ptr_type_node;
9649 break;
9650 /* Half-precision and quad-precision types are not fully supported yet.
9651 Enable the following code once that support is complete; the correct
9652 type node for __fp16 * still needs to be found. */
9653 #if 0
9654 case HFmode:
9655 field_t = float_type_node;
9656 field_ptr_t = float_ptr_type_node;
9657 break;
9658 #endif
9659 case V2SImode:
9660 case V4SImode:
9662 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9663 field_t = build_vector_type_for_mode (innertype, ag_mode);
9664 field_ptr_t = build_pointer_type (field_t);
9666 break;
9667 default:
9668 gcc_assert (0);
9671 /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area). */
9672 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9673 addr = t;
9674 t = fold_convert (field_ptr_t, addr);
9675 t = build2 (MODIFY_EXPR, field_t,
9676 build1 (INDIRECT_REF, field_t, tmp_ha),
9677 build1 (INDIRECT_REF, field_t, t));
9679 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9680 for (i = 1; i < nregs; ++i)
9682 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9683 u = fold_convert (field_ptr_t, addr);
9684 u = build2 (MODIFY_EXPR, field_t,
9685 build2 (MEM_REF, field_t, tmp_ha,
9686 build_int_cst (field_ptr_t,
9687 (i *
9688 int_size_in_bytes (field_t)))),
9689 build1 (INDIRECT_REF, field_t, u));
9690 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9693 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9694 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9697 COND_EXPR_ELSE (cond2) = t;
9698 addr = fold_convert (build_pointer_type (type), cond1);
9699 addr = build_va_arg_indirect_ref (addr);
9701 if (indirect_p)
9702 addr = build_va_arg_indirect_ref (addr);
9704 return addr;
9707 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9709 static void
9710 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9711 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9712 int no_rtl)
9714 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9715 CUMULATIVE_ARGS local_cum;
9716 int gr_saved = cfun->va_list_gpr_size;
9717 int vr_saved = cfun->va_list_fpr_size;
9719 /* The caller has advanced CUM up to, but not beyond, the last named
9720 argument. Advance a local copy of CUM past the last "real" named
9721 argument, to find out how many registers are left over. */
9722 local_cum = *cum;
9723 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9725 /* Find out how many registers we need to save.
9726 Honor the tree-stdarg analysis results. */
9727 if (cfun->va_list_gpr_size)
9728 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
9729 cfun->va_list_gpr_size / UNITS_PER_WORD);
9730 if (cfun->va_list_fpr_size)
9731 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
9732 cfun->va_list_fpr_size / UNITS_PER_VREG);
9734 if (!TARGET_FLOAT)
9736 gcc_assert (local_cum.aapcs_nvrn == 0);
9737 vr_saved = 0;
9740 if (!no_rtl)
9742 if (gr_saved > 0)
9744 rtx ptr, mem;
9746 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9747 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9748 - gr_saved * UNITS_PER_WORD);
9749 mem = gen_frame_mem (BLKmode, ptr);
9750 set_mem_alias_set (mem, get_varargs_alias_set ());
9752 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9753 mem, gr_saved);
9755 if (vr_saved > 0)
9757 /* We can't use move_block_from_reg, because it will use
9758 the wrong mode, storing D regs only. */
9759 machine_mode mode = TImode;
9760 int off, i, vr_start;
9762 /* Set OFF to the offset from virtual_incoming_args_rtx of
9763 the first vector register. The VR save area lies below
9764 the GR one, and is aligned to 16 bytes. */
9765 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
9766 STACK_BOUNDARY / BITS_PER_UNIT);
9767 off -= vr_saved * UNITS_PER_VREG;
9769 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
9770 for (i = 0; i < vr_saved; ++i)
9772 rtx ptr, mem;
9774 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9775 mem = gen_frame_mem (mode, ptr);
9776 set_mem_alias_set (mem, get_varargs_alias_set ());
9777 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
9778 off += UNITS_PER_VREG;
9783 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9784 the complication of having crtl->args.pretend_args_size change. */
9785 cfun->machine->frame.saved_varargs_size
9786 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
9787 STACK_BOUNDARY / BITS_PER_UNIT)
9788 + vr_saved * UNITS_PER_VREG);
9791 static void
9792 aarch64_conditional_register_usage (void)
9794 int i;
9795 if (!TARGET_FLOAT)
9797 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9799 fixed_regs[i] = 1;
9800 call_used_regs[i] = 1;
9805 /* Walk down the type tree of TYPE counting consecutive base elements.
9806 If *MODEP is VOIDmode, then set it to the first valid floating point
9807 type. If a non-floating point type is found, or if a floating point
9808 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9809 otherwise return the count in the sub-tree. */
9810 static int
9811 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
9813 machine_mode mode;
9814 HOST_WIDE_INT size;
9816 switch (TREE_CODE (type))
9818 case REAL_TYPE:
9819 mode = TYPE_MODE (type);
9820 if (mode != DFmode && mode != SFmode && mode != TFmode)
9821 return -1;
9823 if (*modep == VOIDmode)
9824 *modep = mode;
9826 if (*modep == mode)
9827 return 1;
9829 break;
9831 case COMPLEX_TYPE:
9832 mode = TYPE_MODE (TREE_TYPE (type));
9833 if (mode != DFmode && mode != SFmode && mode != TFmode)
9834 return -1;
9836 if (*modep == VOIDmode)
9837 *modep = mode;
9839 if (*modep == mode)
9840 return 2;
9842 break;
9844 case VECTOR_TYPE:
9845 /* Use V2SImode and V4SImode as representatives of all 64-bit
9846 and 128-bit vector types. */
9847 size = int_size_in_bytes (type);
9848 switch (size)
9850 case 8:
9851 mode = V2SImode;
9852 break;
9853 case 16:
9854 mode = V4SImode;
9855 break;
9856 default:
9857 return -1;
9860 if (*modep == VOIDmode)
9861 *modep = mode;
9863 /* Vector modes are considered to be opaque: two vectors are
9864 equivalent for the purposes of being homogeneous aggregates
9865 if they are the same size. */
9866 if (*modep == mode)
9867 return 1;
9869 break;
9871 case ARRAY_TYPE:
9873 int count;
9874 tree index = TYPE_DOMAIN (type);
9876 /* Can't handle incomplete types nor sizes that are not
9877 fixed. */
9878 if (!COMPLETE_TYPE_P (type)
9879 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9880 return -1;
9882 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
9883 if (count == -1
9884 || !index
9885 || !TYPE_MAX_VALUE (index)
9886 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
9887 || !TYPE_MIN_VALUE (index)
9888 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
9889 || count < 0)
9890 return -1;
9892 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
9893 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
9895 /* There must be no padding. */
9896 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9897 return -1;
9899 return count;
9902 case RECORD_TYPE:
9904 int count = 0;
9905 int sub_count;
9906 tree field;
9908 /* Can't handle incomplete types nor sizes that are not
9909 fixed. */
9910 if (!COMPLETE_TYPE_P (type)
9911 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9912 return -1;
9914 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9916 if (TREE_CODE (field) != FIELD_DECL)
9917 continue;
9919 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9920 if (sub_count < 0)
9921 return -1;
9922 count += sub_count;
9925 /* There must be no padding. */
9926 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9927 return -1;
9929 return count;
9932 case UNION_TYPE:
9933 case QUAL_UNION_TYPE:
9935 /* These aren't very interesting except in a degenerate case. */
9936 int count = 0;
9937 int sub_count;
9938 tree field;
9940 /* Can't handle incomplete types nor sizes that are not
9941 fixed. */
9942 if (!COMPLETE_TYPE_P (type)
9943 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9944 return -1;
9946 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9948 if (TREE_CODE (field) != FIELD_DECL)
9949 continue;
9951 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9952 if (sub_count < 0)
9953 return -1;
9954 count = count > sub_count ? count : sub_count;
9957 /* There must be no padding. */
9958 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9959 return -1;
9961 return count;
9964 default:
9965 break;
9968 return -1;
9971 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
9972 type as described in AAPCS64 \S 4.1.2.
9974 See the comment above aarch64_composite_type_p for the notes on MODE. */
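/* For example (illustrative only): the arm_neon.h types int32x2_t (8 bytes)
   and int32x4_t (16 bytes) are short vectors in this sense, while a 32-byte
   GNU vector type is not.  */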
9976 static bool
9977 aarch64_short_vector_p (const_tree type,
9978 machine_mode mode)
9980 HOST_WIDE_INT size = -1;
9982 if (type && TREE_CODE (type) == VECTOR_TYPE)
9983 size = int_size_in_bytes (type);
9984 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
9985 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
9986 size = GET_MODE_SIZE (mode);
9988 return (size == 8 || size == 16);
9991 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
9992 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
9993 array types. The C99 floating-point complex types are also considered
9994 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
9995 types, which are GCC extensions and out of the scope of AAPCS64, are
9996 treated as composite types here as well.
9998 Note that MODE itself is not sufficient in determining whether a type
9999 is such a composite type or not. This is because
10000 stor-layout.c:compute_record_mode may have already changed the MODE
10001 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10002 structure with only one field may have its MODE set to the mode of the
10003 field. Also an integer mode whose size matches the size of the
10004 RECORD_TYPE type may be used to substitute the original mode
10005 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10006 solely relied on. */
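/* For example (illustrative only): struct { float x; } may have its mode
   set to SFmode by compute_record_mode, yet it is still a composite type
   for AAPCS64 purposes, which is why the TYPE check below takes precedence
   over the mode checks.  */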
10008 static bool
10009 aarch64_composite_type_p (const_tree type,
10010 machine_mode mode)
10012 if (aarch64_short_vector_p (type, mode))
10013 return false;
10015 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10016 return true;
10018 if (mode == BLKmode
10019 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10020 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10021 return true;
10023 return false;
10026 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10027 shall be passed or returned in simd/fp register(s) (providing these
10028 parameter passing registers are available).
10030 Upon successful return, *COUNT returns the number of needed registers,
10031 *BASE_MODE returns the mode of the individual register, and when IS_HA
10032 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10033 floating-point aggregate or a homogeneous short-vector aggregate. */
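/* For example (illustrative only): struct { double x, y; } is a homogeneous
   floating-point aggregate with *COUNT == 2 and *BASE_MODE == DFmode,
   whereas struct { float a; double b; } mixes base types and is not a
   candidate.  */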
10035 static bool
10036 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10037 const_tree type,
10038 machine_mode *base_mode,
10039 int *count,
10040 bool *is_ha)
10042 machine_mode new_mode = VOIDmode;
10043 bool composite_p = aarch64_composite_type_p (type, mode);
10045 if (is_ha != NULL) *is_ha = false;
10047 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10048 || aarch64_short_vector_p (type, mode))
10050 *count = 1;
10051 new_mode = mode;
10053 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10055 if (is_ha != NULL) *is_ha = true;
10056 *count = 2;
10057 new_mode = GET_MODE_INNER (mode);
10059 else if (type && composite_p)
10061 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10063 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10065 if (is_ha != NULL) *is_ha = true;
10066 *count = ag_count;
10068 else
10069 return false;
10071 else
10072 return false;
10074 *base_mode = new_mode;
10075 return true;
10078 /* Implement TARGET_STRUCT_VALUE_RTX. */
10080 static rtx
10081 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10082 int incoming ATTRIBUTE_UNUSED)
10084 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10087 /* Implements target hook vector_mode_supported_p. */
10088 static bool
10089 aarch64_vector_mode_supported_p (machine_mode mode)
10091 if (TARGET_SIMD
10092 && (mode == V4SImode || mode == V8HImode
10093 || mode == V16QImode || mode == V2DImode
10094 || mode == V2SImode || mode == V4HImode
10095 || mode == V8QImode || mode == V2SFmode
10096 || mode == V4SFmode || mode == V2DFmode
10097 || mode == V4HFmode || mode == V8HFmode
10098 || mode == V1DFmode))
10099 return true;
10101 return false;
10104 /* Return appropriate SIMD container
10105 for MODE within a vector of WIDTH bits. */
10106 static machine_mode
10107 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10109 gcc_assert (width == 64 || width == 128);
10110 if (TARGET_SIMD)
10112 if (width == 128)
10113 switch (mode)
10115 case DFmode:
10116 return V2DFmode;
10117 case SFmode:
10118 return V4SFmode;
10119 case SImode:
10120 return V4SImode;
10121 case HImode:
10122 return V8HImode;
10123 case QImode:
10124 return V16QImode;
10125 case DImode:
10126 return V2DImode;
10127 default:
10128 break;
10130 else
10131 switch (mode)
10133 case SFmode:
10134 return V2SFmode;
10135 case SImode:
10136 return V2SImode;
10137 case HImode:
10138 return V4HImode;
10139 case QImode:
10140 return V8QImode;
10141 default:
10142 break;
10145 return word_mode;
10148 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10149 static machine_mode
10150 aarch64_preferred_simd_mode (machine_mode mode)
10152 return aarch64_simd_container_mode (mode, 128);
10155 /* Return the bitmask of possible vector sizes for the vectorizer
10156 to iterate over. */
10157 static unsigned int
10158 aarch64_autovectorize_vector_sizes (void)
10160 return (16 | 8);
10163 /* Implement TARGET_MANGLE_TYPE. */
10165 static const char *
10166 aarch64_mangle_type (const_tree type)
10168 /* The AArch64 ABI documents say that "__va_list" has to be
10169 mangled as if it were in the "std" namespace. */
10170 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10171 return "St9__va_list";
10173 /* Half-precision float. */
10174 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10175 return "Dh";
10177 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10178 builtin types. */
10179 if (TYPE_NAME (type) != NULL)
10180 return aarch64_mangle_builtin_type (type);
10182 /* Use the default mangling. */
10183 return NULL;
10187 /* Return true if the rtx_insn contains a MEM RTX somewhere
10188 in it. */
10190 static bool
10191 has_memory_op (rtx_insn *mem_insn)
10193 subrtx_iterator::array_type array;
10194 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10195 if (MEM_P (*iter))
10196 return true;
10198 return false;
10201 /* Find the first rtx_insn before insn that will generate an assembly
10202 instruction. */
10204 static rtx_insn *
10205 aarch64_prev_real_insn (rtx_insn *insn)
10207 if (!insn)
10208 return NULL;
10212 insn = prev_real_insn (insn);
10214 while (insn && recog_memoized (insn) < 0);
10216 return insn;
10219 static bool
10220 is_madd_op (enum attr_type t1)
10222 unsigned int i;
10223 /* A number of these may be AArch32 only. */
10224 enum attr_type mlatypes[] = {
10225 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10226 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10227 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10230 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10232 if (t1 == mlatypes[i])
10233 return true;
10236 return false;
10239 /* Check if there is a register dependency between a load and the insn
10240 for which we hold recog_data. */
10242 static bool
10243 dep_between_memop_and_curr (rtx memop)
10245 rtx load_reg;
10246 int opno;
10248 gcc_assert (GET_CODE (memop) == SET);
10250 if (!REG_P (SET_DEST (memop)))
10251 return false;
10253 load_reg = SET_DEST (memop);
10254 for (opno = 1; opno < recog_data.n_operands; opno++)
10256 rtx operand = recog_data.operand[opno];
10257 if (REG_P (operand)
10258 && reg_overlap_mentioned_p (load_reg, operand))
10259 return true;
10262 return false;
10266 /* When working around the Cortex-A53 erratum 835769,
10267 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10268 instruction and has a preceding memory instruction such that a NOP
10269 should be inserted between them. */
10271 bool
10272 aarch64_madd_needs_nop (rtx_insn* insn)
10274 enum attr_type attr_type;
10275 rtx_insn *prev;
10276 rtx body;
10278 if (!TARGET_FIX_ERR_A53_835769)
10279 return false;
10281 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10282 return false;
10284 attr_type = get_attr_type (insn);
10285 if (!is_madd_op (attr_type))
10286 return false;
10288 prev = aarch64_prev_real_insn (insn);
10289 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10290 Restore recog state to INSN to avoid state corruption. */
10291 extract_constrain_insn_cached (insn);
10293 if (!prev || !has_memory_op (prev))
10294 return false;
10296 body = single_set (prev);
10298 /* If the previous insn is a memory op and there is no dependency between
10299 it and the DImode madd, emit a NOP between them. If body is NULL then we
10300 have a complex memory operation, probably a load/store pair.
10301 Be conservative for now and emit a NOP. */
10302 if (GET_MODE (recog_data.operand[0]) == DImode
10303 && (!body || !dep_between_memop_and_curr (body)))
10304 return true;
10306 return false;
10311 /* Implement FINAL_PRESCAN_INSN. */
10313 void
10314 aarch64_final_prescan_insn (rtx_insn *insn)
10316 if (aarch64_madd_needs_nop (insn))
10317 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
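/* Illustrative example (register choice is arbitrary) of the effect of the
   workaround above when -mfix-cortex-a53-835769 is enabled:

       ldr  x1, [x2]
       nop                      // between mem op and mult-accumulate
       madd x0, x3, x4, x1
*/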
10321 /* Return the equivalent letter for size. */
10322 static char
10323 sizetochar (int size)
10325 switch (size)
10327 case 64: return 'd';
10328 case 32: return 's';
10329 case 16: return 'h';
10330 case 8 : return 'b';
10331 default: gcc_unreachable ();
10335 /* Return true iff x is a uniform vector of floating-point
10336 constants, and the constant can be represented in
10337 quarter-precision form. Note, as aarch64_float_const_representable_p
10338 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10339 static bool
10340 aarch64_vect_float_const_representable_p (rtx x)
10342 rtx elt;
10343 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10344 && const_vec_duplicate_p (x, &elt)
10345 && aarch64_float_const_representable_p (elt));
10348 /* Return true for valid and false for invalid. */
10349 bool
10350 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10351 struct simd_immediate_info *info)
10353 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10354 matches = 1; \
10355 for (i = 0; i < idx; i += (STRIDE)) \
10356 if (!(TEST)) \
10357 matches = 0; \
10358 if (matches) \
10360 immtype = (CLASS); \
10361 elsize = (ELSIZE); \
10362 eshift = (SHIFT); \
10363 emvn = (NEG); \
10364 break; \
10367 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10368 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10369 unsigned char bytes[16];
10370 int immtype = -1, matches;
10371 unsigned int invmask = inverse ? 0xff : 0;
10372 int eshift, emvn;
10374 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10376 if (! (aarch64_simd_imm_zero_p (op, mode)
10377 || aarch64_vect_float_const_representable_p (op)))
10378 return false;
10380 if (info)
10382 info->value = CONST_VECTOR_ELT (op, 0);
10383 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10384 info->mvn = false;
10385 info->shift = 0;
10388 return true;
10391 /* Splat vector constant out into a byte vector. */
10392 for (i = 0; i < n_elts; i++)
10394 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10395 it must be laid out in the vector register in reverse order. */
10396 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10397 unsigned HOST_WIDE_INT elpart;
10399 gcc_assert (CONST_INT_P (el));
10400 elpart = INTVAL (el);
10402 for (unsigned int byte = 0; byte < innersize; byte++)
10404 bytes[idx++] = (elpart & 0xff) ^ invmask;
10405 elpart >>= BITS_PER_UNIT;
10410 /* Sanity check. */
10411 gcc_assert (idx == GET_MODE_SIZE (mode));
10415 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10416 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10418 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10419 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10421 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10422 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10424 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10425 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10427 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10429 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10431 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10432 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10434 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10435 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10437 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10438 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10440 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10441 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10443 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10445 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10447 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10448 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10450 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10451 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10453 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10454 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10456 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10457 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10459 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10461 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10462 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10464 while (0);
10466 if (immtype == -1)
10467 return false;
10469 if (info)
10471 info->element_width = elsize;
10472 info->mvn = emvn != 0;
10473 info->shift = eshift;
10475 unsigned HOST_WIDE_INT imm = 0;
10477 if (immtype >= 12 && immtype <= 15)
10478 info->msl = true;
10480 /* Un-invert bytes of recognized vector, if necessary. */
10481 if (invmask != 0)
10482 for (i = 0; i < idx; i++)
10483 bytes[i] ^= invmask;
10485 if (immtype == 17)
10487 /* FIXME: Broken on 32-bit H_W_I hosts. */
10488 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10490 for (i = 0; i < 8; i++)
10491 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10492 << (i * BITS_PER_UNIT);
10495 info->value = GEN_INT (imm);
10497 else
10499 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10500 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10502 /* Construct 'abcdefgh' because the assembler cannot handle
10503 generic constants. */
10504 if (info->mvn)
10505 imm = ~imm;
10506 imm = (imm >> info->shift) & 0xff;
10507 info->value = GEN_INT (imm);
10511 return true;
10512 #undef CHECK
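/* Illustrative worked example for the CHECK table above: a V4SImode splat of
   0x4500 expands to the byte pattern 00 45 00 00 repeated four times, which
   matches the (4, 32, 1, ...) case.  The result is elsize == 32, eshift == 8
   and mvn == false, and the reported immediate is 0x45 to be emitted with an
   LSL #8 shift (see aarch64_output_simd_mov_immediate).  */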
10515 /* Check if immediate shift constants are within range. */
10516 bool
10517 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10519 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10520 if (left)
10521 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10522 else
10523 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10526 /* Return true if X is a uniform vector where all elements
10527 are either the floating-point constant 0.0 or the
10528 integer constant 0. */
10529 bool
10530 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10532 return x == CONST0_RTX (mode);
10536 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10537 operation of width WIDTH at bit position POS. */
10540 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10542 gcc_assert (CONST_INT_P (width));
10543 gcc_assert (CONST_INT_P (pos));
10545 unsigned HOST_WIDE_INT mask
10546 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10547 return GEN_INT (mask << UINTVAL (pos));
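/* Illustrative example: a zero_extract of WIDTH 8 at POS 16 yields the mask
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. exactly the byte selected by the
   extract.  */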
10550 bool
10551 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10553 HOST_WIDE_INT imm = INTVAL (x);
10554 int i;
10556 for (i = 0; i < 8; i++)
10558 unsigned int byte = imm & 0xff;
10559 if (byte != 0xff && byte != 0)
10560 return false;
10561 imm >>= 8;
10564 return true;
10567 bool
10568 aarch64_mov_operand_p (rtx x, machine_mode mode)
10570 if (GET_CODE (x) == HIGH
10571 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10572 return true;
10574 if (CONST_INT_P (x))
10575 return true;
10577 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10578 return true;
10580 return aarch64_classify_symbolic_expression (x)
10581 == SYMBOL_TINY_ABSOLUTE;
10584 /* Return a const_int vector of VAL. */
10586 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10588 int nunits = GET_MODE_NUNITS (mode);
10589 rtvec v = rtvec_alloc (nunits);
10590 int i;
10592 for (i = 0; i < nunits; i++)
10593 RTVEC_ELT (v, i) = GEN_INT (val);
10595 return gen_rtx_CONST_VECTOR (mode, v);
10598 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10600 bool
10601 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10603 machine_mode vmode;
10605 gcc_assert (!VECTOR_MODE_P (mode));
10606 vmode = aarch64_preferred_simd_mode (mode);
10607 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10608 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10611 /* Construct and return a PARALLEL RTX vector with elements numbering the
10612 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10613 the vector - from the perspective of the architecture. This does not
10614 line up with GCC's perspective on lane numbers, so we end up with
10615 different masks depending on our target endian-ness. The diagram
10616 below may help. We must draw the distinction when building masks
10617 which select one half of the vector. An instruction selecting
10618 architectural low-lanes for a big-endian target must be described using
10619 a mask selecting GCC high-lanes.
10621 Big-Endian Little-Endian
10623 GCC 0 1 2 3 3 2 1 0
10624 | x | x | x | x | | x | x | x | x |
10625 Architecture 3 2 1 0 3 2 1 0
10627 Low Mask: { 2, 3 } { 0, 1 }
10628 High Mask: { 0, 1 } { 2, 3 }
10632 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10634 int nunits = GET_MODE_NUNITS (mode);
10635 rtvec v = rtvec_alloc (nunits / 2);
10636 int high_base = nunits / 2;
10637 int low_base = 0;
10638 int base;
10639 rtx t1;
10640 int i;
10642 if (BYTES_BIG_ENDIAN)
10643 base = high ? low_base : high_base;
10644 else
10645 base = high ? high_base : low_base;
10647 for (i = 0; i < nunits / 2; i++)
10648 RTVEC_ELT (v, i) = GEN_INT (base + i);
10650 t1 = gen_rtx_PARALLEL (mode, v);
10651 return t1;
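/* Illustrative example: for V4SImode with HIGH == true this returns
   (parallel [2 3]) on little-endian but (parallel [0 1]) on big-endian,
   matching the "High Mask" row of the diagram above.  */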
10654 /* Check OP for validity as a PARALLEL RTX vector with elements
10655 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10656 from the perspective of the architecture. See the diagram above
10657 aarch64_simd_vect_par_cnst_half for more details. */
10659 bool
10660 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10661 bool high)
10663 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10664 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10665 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10666 int i = 0;
10668 if (!VECTOR_MODE_P (mode))
10669 return false;
10671 if (count_op != count_ideal)
10672 return false;
10674 for (i = 0; i < count_ideal; i++)
10676 rtx elt_op = XVECEXP (op, 0, i);
10677 rtx elt_ideal = XVECEXP (ideal, 0, i);
10679 if (!CONST_INT_P (elt_op)
10680 || INTVAL (elt_ideal) != INTVAL (elt_op))
10681 return false;
10683 return true;
10686 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10687 HIGH (exclusive). */
10688 void
10689 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10690 const_tree exp)
10692 HOST_WIDE_INT lane;
10693 gcc_assert (CONST_INT_P (operand));
10694 lane = INTVAL (operand);
10696 if (lane < low || lane >= high)
10698 if (exp)
10699 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10700 else
10701 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10705 /* Return TRUE if OP is a valid vector addressing mode. */
10706 bool
10707 aarch64_simd_mem_operand_p (rtx op)
10709 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10710 || REG_P (XEXP (op, 0)));
10713 /* Emit a register copy from operand to operand, taking care not to
10714 early-clobber source registers in the process.
10716 COUNT is the number of components into which the copy needs to be
10717 decomposed. */
10718 void
10719 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10720 unsigned int count)
10722 unsigned int i;
10723 int rdest = REGNO (operands[0]);
10724 int rsrc = REGNO (operands[1]);
10726 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10727 || rdest < rsrc)
10728 for (i = 0; i < count; i++)
10729 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10730 gen_rtx_REG (mode, rsrc + i));
10731 else
10732 for (i = 0; i < count; i++)
10733 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10734 gen_rtx_REG (mode, rsrc + count - i - 1));
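/* Illustrative note: the copy direction only matters when the source and
   destination ranges overlap.  Copying a two-register value from V1-V2 to
   V2-V3 must move V2 into V3 before V1 clobbers V2, which is why the
   overlapping rdest > rsrc case above iterates from the highest component
   downwards.  */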
10737 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10738 one of VSTRUCT modes: OI, CI, or XI. */
10740 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10742 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10745 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10746 alignment of a vector to 128 bits. */
10747 static HOST_WIDE_INT
10748 aarch64_simd_vector_alignment (const_tree type)
10750 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10751 return MIN (align, 128);
10754 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10755 static bool
10756 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10758 if (is_packed)
10759 return false;
10761 /* We guarantee alignment for vectors up to 128-bits. */
10762 if (tree_int_cst_compare (TYPE_SIZE (type),
10763 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10764 return false;
10766 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10767 return true;
10770 /* If VALS is a vector constant that can be loaded into a register
10771 using DUP, generate instructions to do so and return an RTX to
10772 assign to the register. Otherwise return NULL_RTX. */
10773 static rtx
10774 aarch64_simd_dup_constant (rtx vals)
10776 machine_mode mode = GET_MODE (vals);
10777 machine_mode inner_mode = GET_MODE_INNER (mode);
10778 rtx x;
10780 if (!const_vec_duplicate_p (vals, &x))
10781 return NULL_RTX;
10783 /* We can load this constant by using DUP and a constant in a
10784 single general-purpose register. This will be cheaper than a vector
10785 load. */
10786 x = copy_to_mode_reg (inner_mode, x);
10787 return gen_rtx_VEC_DUPLICATE (mode, x);
10791 /* Generate code to load VALS, which is a PARALLEL containing only
10792 constants (for vec_init) or CONST_VECTOR, efficiently into a
10793 register. Returns an RTX to copy into the register, or NULL_RTX
10794 for a PARALLEL that can not be converted into a CONST_VECTOR. */
10795 static rtx
10796 aarch64_simd_make_constant (rtx vals)
10798 machine_mode mode = GET_MODE (vals);
10799 rtx const_dup;
10800 rtx const_vec = NULL_RTX;
10801 int n_elts = GET_MODE_NUNITS (mode);
10802 int n_const = 0;
10803 int i;
10805 if (GET_CODE (vals) == CONST_VECTOR)
10806 const_vec = vals;
10807 else if (GET_CODE (vals) == PARALLEL)
10809 /* A CONST_VECTOR must contain only CONST_INTs and
10810 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
10811 Only store valid constants in a CONST_VECTOR. */
10812 for (i = 0; i < n_elts; ++i)
10814 rtx x = XVECEXP (vals, 0, i);
10815 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10816 n_const++;
10818 if (n_const == n_elts)
10819 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
10821 else
10822 gcc_unreachable ();
10824 if (const_vec != NULL_RTX
10825 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
10826 /* Load using MOVI/MVNI. */
10827 return const_vec;
10828 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
10829 /* Loaded using DUP. */
10830 return const_dup;
10831 else if (const_vec != NULL_RTX)
10832 /* Load from constant pool. We can not take advantage of single-cycle
10833 LD1 because we need a PC-relative addressing mode. */
10834 return const_vec;
10835 else
10836 /* A PARALLEL containing something not valid inside CONST_VECTOR.
10837 We can not construct an initializer. */
10838 return NULL_RTX;
10841 /* Expand a vector initialisation sequence, such that TARGET is
10842 initialised to contain VALS. */
10844 void
10845 aarch64_expand_vector_init (rtx target, rtx vals)
10847 machine_mode mode = GET_MODE (target);
10848 machine_mode inner_mode = GET_MODE_INNER (mode);
10849 /* The number of vector elements. */
10850 int n_elts = GET_MODE_NUNITS (mode);
10851 /* The number of vector elements which are not constant. */
10852 int n_var = 0;
10853 rtx any_const = NULL_RTX;
10854 /* The first element of vals. */
10855 rtx v0 = XVECEXP (vals, 0, 0);
10856 bool all_same = true;
10858 /* Count the number of variable elements to initialise. */
10859 for (int i = 0; i < n_elts; ++i)
10861 rtx x = XVECEXP (vals, 0, i);
10862 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
10863 ++n_var;
10864 else
10865 any_const = x;
10867 all_same &= rtx_equal_p (x, v0);
10870 /* No variable elements, hand off to aarch64_simd_make_constant which knows
10871 how best to handle this. */
10872 if (n_var == 0)
10874 rtx constant = aarch64_simd_make_constant (vals);
10875 if (constant != NULL_RTX)
10877 emit_move_insn (target, constant);
10878 return;
10882 /* Splat a single non-constant element if we can. */
10883 if (all_same)
10885 rtx x = copy_to_mode_reg (inner_mode, v0);
10886 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
10887 return;
10890 /* Initialise a vector which is part-variable. We want to first try
10891 to build those lanes which are constant in the most efficient way we
10892 can. */
10893 if (n_var != n_elts)
10895 rtx copy = copy_rtx (vals);
10897 /* Load constant part of vector. We really don't care what goes into the
10898 parts we will overwrite, but we're more likely to be able to load the
10899 constant efficiently if it has fewer, larger, repeating parts
10900 (see aarch64_simd_valid_immediate). */
10901 for (int i = 0; i < n_elts; i++)
10903 rtx x = XVECEXP (vals, 0, i);
10904 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10905 continue;
10906 rtx subst = any_const;
10907 for (int bit = n_elts / 2; bit > 0; bit /= 2)
10909 /* Look in the copied vector, as more elements are const. */
10910 rtx test = XVECEXP (copy, 0, i ^ bit);
10911 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
10913 subst = test;
10914 break;
10917 XVECEXP (copy, 0, i) = subst;
10919 aarch64_expand_vector_init (target, copy);
10922 /* Insert the variable lanes directly. */
10924 enum insn_code icode = optab_handler (vec_set_optab, mode);
10925 gcc_assert (icode != CODE_FOR_nothing);
10927 for (int i = 0; i < n_elts; i++)
10929 rtx x = XVECEXP (vals, 0, i);
10930 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10931 continue;
10932 x = copy_to_mode_reg (inner_mode, x);
10933 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
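/* Illustrative example: initialising a V4SImode vector { x, 1, 2, 3 } with a
   variable x first materialises the constant vector { 2, 1, 2, 3 } (the
   i ^ bit search above lets lane 0 borrow the constant from lane 2), and
   then overwrites lane 0 with x through the vec_set pattern.  */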
10937 static unsigned HOST_WIDE_INT
10938 aarch64_shift_truncation_mask (machine_mode mode)
10940 return
10941 (!SHIFT_COUNT_TRUNCATED
10942 || aarch64_vector_mode_supported_p (mode)
10943 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
10946 /* Select a format to encode pointers in exception handling data. */
10948 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
10950 int type;
10951 switch (aarch64_cmodel)
10953 case AARCH64_CMODEL_TINY:
10954 case AARCH64_CMODEL_TINY_PIC:
10955 case AARCH64_CMODEL_SMALL:
10956 case AARCH64_CMODEL_SMALL_PIC:
10957 case AARCH64_CMODEL_SMALL_SPIC:
10958 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
10959 for everything. */
10960 type = DW_EH_PE_sdata4;
10961 break;
10962 default:
10963 /* No assumptions here. 8-byte relocs required. */
10964 type = DW_EH_PE_sdata8;
10965 break;
10967 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
10970 /* The last .arch and .tune assembly strings that we printed. */
10971 static std::string aarch64_last_printed_arch_string;
10972 static std::string aarch64_last_printed_tune_string;
10974 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
10975 by the function fndecl. */
10977 void
10978 aarch64_declare_function_name (FILE *stream, const char* name,
10979 tree fndecl)
10981 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10983 struct cl_target_option *targ_options;
10984 if (target_parts)
10985 targ_options = TREE_TARGET_OPTION (target_parts);
10986 else
10987 targ_options = TREE_TARGET_OPTION (target_option_current_node);
10988 gcc_assert (targ_options);
10990 const struct processor *this_arch
10991 = aarch64_get_arch (targ_options->x_explicit_arch);
10993 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
10994 std::string extension
10995 = aarch64_get_extension_string_for_isa_flags (isa_flags,
10996 this_arch->flags);
10997 /* Only update the assembler .arch string if it is distinct from the last
10998 such string we printed. */
10999 std::string to_print = this_arch->name + extension;
11000 if (to_print != aarch64_last_printed_arch_string)
11002 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11003 aarch64_last_printed_arch_string = to_print;
11006 /* Print the cpu name we're tuning for in the comments; it might be
11007 useful to readers of the generated asm. Do it only when it changes
11008 from function to function and verbose assembly is requested. */
11009 const struct processor *this_tune
11010 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11012 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11014 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11015 this_tune->name);
11016 aarch64_last_printed_tune_string = this_tune->name;
11019 /* Don't forget the type directive for ELF. */
11020 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11021 ASM_OUTPUT_LABEL (stream, name);
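/* Illustrative example: a function compiled with something like
     __attribute__ ((target ("arch=armv8-a+crc")))
   causes a fresh "\t.arch armv8-a+crc" directive to be printed before its
   label, while subsequent functions that resolve to the same architecture
   string print nothing thanks to the aarch64_last_printed_arch_string
   cache.  */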
11024 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11026 static void
11027 aarch64_start_file (void)
11029 struct cl_target_option *default_options
11030 = TREE_TARGET_OPTION (target_option_default_node);
11032 const struct processor *default_arch
11033 = aarch64_get_arch (default_options->x_explicit_arch);
11034 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11035 std::string extension
11036 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11037 default_arch->flags);
11039 aarch64_last_printed_arch_string = default_arch->name + extension;
11040 aarch64_last_printed_tune_string = "";
11041 asm_fprintf (asm_out_file, "\t.arch %s\n",
11042 aarch64_last_printed_arch_string.c_str ());
11044 default_file_start ();
11047 /* Emit load exclusive. */
11049 static void
11050 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11051 rtx mem, rtx model_rtx)
11053 rtx (*gen) (rtx, rtx, rtx);
11055 switch (mode)
11057 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11058 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11059 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11060 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11061 default:
11062 gcc_unreachable ();
11065 emit_insn (gen (rval, mem, model_rtx));
11068 /* Emit store exclusive. */
11070 static void
11071 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11072 rtx rval, rtx mem, rtx model_rtx)
11074 rtx (*gen) (rtx, rtx, rtx, rtx);
11076 switch (mode)
11078 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11079 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11080 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11081 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11082 default:
11083 gcc_unreachable ();
11086 emit_insn (gen (bval, rval, mem, model_rtx));
11089 /* Mark the previous jump instruction as unlikely. */
11091 static void
11092 aarch64_emit_unlikely_jump (rtx insn)
11094 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11096 insn = emit_jump_insn (insn);
11097 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11100 /* Expand a compare and swap pattern. */
11102 void
11103 aarch64_expand_compare_and_swap (rtx operands[])
11105 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11106 machine_mode mode, cmp_mode;
11107 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11108 int idx;
11109 gen_cas_fn gen;
11110 const gen_cas_fn split_cas[] =
11112 gen_aarch64_compare_and_swapqi,
11113 gen_aarch64_compare_and_swaphi,
11114 gen_aarch64_compare_and_swapsi,
11115 gen_aarch64_compare_and_swapdi
11117 const gen_cas_fn atomic_cas[] =
11119 gen_aarch64_compare_and_swapqi_lse,
11120 gen_aarch64_compare_and_swaphi_lse,
11121 gen_aarch64_compare_and_swapsi_lse,
11122 gen_aarch64_compare_and_swapdi_lse
11125 bval = operands[0];
11126 rval = operands[1];
11127 mem = operands[2];
11128 oldval = operands[3];
11129 newval = operands[4];
11130 is_weak = operands[5];
11131 mod_s = operands[6];
11132 mod_f = operands[7];
11133 mode = GET_MODE (mem);
11134 cmp_mode = mode;
11136 /* Normally the succ memory model must be stronger than fail, but in the
11137 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11138 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11140 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11141 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11142 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11144 switch (mode)
11146 case QImode:
11147 case HImode:
11148 /* For short modes, we're going to perform the comparison in SImode,
11149 so do the zero-extension now. */
11150 cmp_mode = SImode;
11151 rval = gen_reg_rtx (SImode);
11152 oldval = convert_modes (SImode, mode, oldval, true);
11153 /* Fall through. */
11155 case SImode:
11156 case DImode:
11157 /* Force the value into a register if needed. */
11158 if (!aarch64_plus_operand (oldval, mode))
11159 oldval = force_reg (cmp_mode, oldval);
11160 break;
11162 default:
11163 gcc_unreachable ();
11166 switch (mode)
11168 case QImode: idx = 0; break;
11169 case HImode: idx = 1; break;
11170 case SImode: idx = 2; break;
11171 case DImode: idx = 3; break;
11172 default:
11173 gcc_unreachable ();
11175 if (TARGET_LSE)
11176 gen = atomic_cas[idx];
11177 else
11178 gen = split_cas[idx];
11180 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11182 if (mode == QImode || mode == HImode)
11183 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11185 x = gen_rtx_REG (CCmode, CC_REGNUM);
11186 x = gen_rtx_EQ (SImode, x, const0_rtx);
11187 emit_insn (gen_rtx_SET (bval, x));
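/* Illustrative note: a user-level call such as
     __atomic_compare_exchange_n (&v, &expected, desired, 0,
                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
   reaches this expander.  With +lse it selects the single-instruction
   aarch64_compare_and_swap<mode>_lse patterns; otherwise the chosen pattern
   is later decomposed by aarch64_split_compare_and_swap below.  */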
11190 /* Test whether the target supports using an atomic load-operate instruction
11191 for operation CODE. Returns FALSE if the operation isn't supported by the
11192 architecture. */
11196 bool
11197 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11199 if (!TARGET_LSE)
11200 return false;
11202 switch (code)
11204 case SET:
11205 case AND:
11206 case IOR:
11207 case XOR:
11208 case MINUS:
11209 case PLUS:
11210 return true;
11211 default:
11212 return false;
11216 /* Emit a barrier appropriate for memory model MODEL at the end of a
11217 sequence implementing an atomic operation. */
11219 static void
11220 aarch64_emit_post_barrier (enum memmodel model)
11222 const enum memmodel base_model = memmodel_base (model);
11224 if (is_mm_sync (model)
11225 && (base_model == MEMMODEL_ACQUIRE
11226 || base_model == MEMMODEL_ACQ_REL
11227 || base_model == MEMMODEL_SEQ_CST))
11229 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11233 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11234 for the data in memory. EXPECTED is the value expected to be in memory.
11235 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11236 is the memory ordering to use. */
11238 void
11239 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11240 rtx expected, rtx desired,
11241 rtx model)
11243 rtx (*gen) (rtx, rtx, rtx, rtx);
11244 machine_mode mode;
11246 mode = GET_MODE (mem);
11248 switch (mode)
11250 case QImode: gen = gen_aarch64_atomic_casqi; break;
11251 case HImode: gen = gen_aarch64_atomic_cashi; break;
11252 case SImode: gen = gen_aarch64_atomic_cassi; break;
11253 case DImode: gen = gen_aarch64_atomic_casdi; break;
11254 default:
11255 gcc_unreachable ();
11258 /* Move the expected value into the CAS destination register. */
11259 emit_insn (gen_rtx_SET (rval, expected));
11261 /* Emit the CAS. */
11262 emit_insn (gen (rval, mem, desired, model));
11264 /* Compare the expected value with the value loaded by the CAS, to establish
11265 whether the swap was made. */
11266 aarch64_gen_compare_reg (EQ, rval, expected);
11269 /* Split a compare and swap pattern. */
11271 void
11272 aarch64_split_compare_and_swap (rtx operands[])
11274 rtx rval, mem, oldval, newval, scratch;
11275 machine_mode mode;
11276 bool is_weak;
11277 rtx_code_label *label1, *label2;
11278 rtx x, cond;
11279 enum memmodel model;
11280 rtx model_rtx;
11282 rval = operands[0];
11283 mem = operands[1];
11284 oldval = operands[2];
11285 newval = operands[3];
11286 is_weak = (operands[4] != const0_rtx);
11287 model_rtx = operands[5];
11288 scratch = operands[7];
11289 mode = GET_MODE (mem);
11290 model = memmodel_from_int (INTVAL (model_rtx));
11292 label1 = NULL;
11293 if (!is_weak)
11295 label1 = gen_label_rtx ();
11296 emit_label (label1);
11298 label2 = gen_label_rtx ();
11300 /* The initial load can be relaxed for a __sync operation since a final
11301 barrier will be emitted to stop code hoisting. */
11302 if (is_mm_sync (model))
11303 aarch64_emit_load_exclusive (mode, rval, mem,
11304 GEN_INT (MEMMODEL_RELAXED));
11305 else
11306 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11308 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11309 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11310 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11311 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11312 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11314 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11316 if (!is_weak)
11318 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11319 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11320 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11321 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11323 else
11325 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11326 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11327 emit_insn (gen_rtx_SET (cond, x));
11330 emit_label (label2);
11332 /* Emit any final barrier needed for a __sync operation. */
11333 if (is_mm_sync (model))
11334 aarch64_emit_post_barrier (model);
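/* Illustrative sketch (register allocation is arbitrary) of the sequence the
   split above produces for a strong SEQ_CST SImode compare-and-swap without
   LSE:

     .Lretry:
       ldaxr  w0, [x1]          // load-acquire exclusive of the current value
       cmp    w0, w2            // compare against the expected value
       bne    .Ldone            // mismatch: fail
       stlxr  w3, w4, [x1]      // store-release exclusive of the new value
       cbnz   w3, .Lretry       // lost the exclusive monitor: retry
     .Ldone:
*/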
11337 /* Emit a BIC instruction. */
11339 static void
11340 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11342 rtx shift_rtx = GEN_INT (shift);
11343 rtx (*gen) (rtx, rtx, rtx, rtx);
11345 switch (mode)
11347 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11348 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11349 default:
11350 gcc_unreachable ();
11353 emit_insn (gen (dst, s2, shift_rtx, s1));
11356 /* Emit an atomic swap. */
11358 static void
11359 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11360 rtx mem, rtx model)
11362 rtx (*gen) (rtx, rtx, rtx, rtx);
11364 switch (mode)
11366 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11367 case HImode: gen = gen_aarch64_atomic_swphi; break;
11368 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11369 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11370 default:
11371 gcc_unreachable ();
11374 emit_insn (gen (dst, mem, value, model));
11377 /* Operations supported by aarch64_emit_atomic_load_op. */
11379 enum aarch64_atomic_load_op_code
11381 AARCH64_LDOP_PLUS, /* A + B */
11382 AARCH64_LDOP_XOR, /* A ^ B */
11383 AARCH64_LDOP_OR, /* A | B */
11384 AARCH64_LDOP_BIC /* A & ~B */
11387 /* Emit an atomic load-operate. */
11389 static void
11390 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11391 machine_mode mode, rtx dst, rtx src,
11392 rtx mem, rtx model)
11394 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11395 const aarch64_atomic_load_op_fn plus[] =
11397 gen_aarch64_atomic_loadaddqi,
11398 gen_aarch64_atomic_loadaddhi,
11399 gen_aarch64_atomic_loadaddsi,
11400 gen_aarch64_atomic_loadadddi
11402 const aarch64_atomic_load_op_fn eor[] =
11404 gen_aarch64_atomic_loadeorqi,
11405 gen_aarch64_atomic_loadeorhi,
11406 gen_aarch64_atomic_loadeorsi,
11407 gen_aarch64_atomic_loadeordi
11409 const aarch64_atomic_load_op_fn ior[] =
11411 gen_aarch64_atomic_loadsetqi,
11412 gen_aarch64_atomic_loadsethi,
11413 gen_aarch64_atomic_loadsetsi,
11414 gen_aarch64_atomic_loadsetdi
11416 const aarch64_atomic_load_op_fn bic[] =
11418 gen_aarch64_atomic_loadclrqi,
11419 gen_aarch64_atomic_loadclrhi,
11420 gen_aarch64_atomic_loadclrsi,
11421 gen_aarch64_atomic_loadclrdi
11423 aarch64_atomic_load_op_fn gen;
11424 int idx = 0;
11426 switch (mode)
11428 case QImode: idx = 0; break;
11429 case HImode: idx = 1; break;
11430 case SImode: idx = 2; break;
11431 case DImode: idx = 3; break;
11432 default:
11433 gcc_unreachable ();
11436 switch (code)
11438 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11439 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11440 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11441 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11442 default:
11443 gcc_unreachable ();
11446 emit_insn (gen (dst, mem, src, model));
11449 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11450 location to store the data read from memory. OUT_RESULT is the location to
11451 store the result of the operation. MEM is the memory location to read and
11452 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11453 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11454 be NULL. */
11456 void
11457 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11458 rtx mem, rtx value, rtx model_rtx)
11460 machine_mode mode = GET_MODE (mem);
11461 machine_mode wmode = (mode == DImode ? DImode : SImode);
11462 const bool short_mode = (mode < SImode);
11463 aarch64_atomic_load_op_code ldop_code;
11464 rtx src;
11465 rtx x;
11467 if (out_data)
11468 out_data = gen_lowpart (mode, out_data);
11470 if (out_result)
11471 out_result = gen_lowpart (mode, out_result);
11473 /* Make sure the value is in a register, putting it into a destination
11474 register if it needs to be manipulated. */
11475 if (!register_operand (value, mode)
11476 || code == AND || code == MINUS)
11478 src = out_result ? out_result : out_data;
11479 emit_move_insn (src, gen_lowpart (mode, value));
11481 else
11482 src = value;
11483 gcc_assert (register_operand (src, mode));
11485 /* Preprocess the data for the operation as necessary. If the operation is
11486 a SET then emit a swap instruction and finish. */
11487 switch (code)
11489 case SET:
11490 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11491 return;
11493 case MINUS:
11494 /* Negate the value and treat it as a PLUS. */
11496 rtx neg_src;
11498 /* Resize the value if necessary. */
11499 if (short_mode)
11500 src = gen_lowpart (wmode, src);
11502 neg_src = gen_rtx_NEG (wmode, src);
11503 emit_insn (gen_rtx_SET (src, neg_src));
11505 if (short_mode)
11506 src = gen_lowpart (mode, src);
11508 /* Fall-through. */
11509 case PLUS:
11510 ldop_code = AARCH64_LDOP_PLUS;
11511 break;
11513 case IOR:
11514 ldop_code = AARCH64_LDOP_OR;
11515 break;
11517 case XOR:
11518 ldop_code = AARCH64_LDOP_XOR;
11519 break;
11521 case AND:
11523 rtx not_src;
11525 /* Resize the value if necessary. */
11526 if (short_mode)
11527 src = gen_lowpart (wmode, src);
11529 not_src = gen_rtx_NOT (wmode, src);
11530 emit_insn (gen_rtx_SET (src, not_src));
11532 if (short_mode)
11533 src = gen_lowpart (mode, src);
11535 ldop_code = AARCH64_LDOP_BIC;
11536 break;
11538 default:
11539 /* The operation can't be done with atomic instructions. */
11540 gcc_unreachable ();
11543 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11545 /* If necessary, calculate the data in memory after the update by redoing the
11546 operation from values in registers. */
11547 if (!out_result)
11548 return;
11550 if (short_mode)
11552 src = gen_lowpart (wmode, src);
11553 out_data = gen_lowpart (wmode, out_data);
11554 out_result = gen_lowpart (wmode, out_result);
11557 x = NULL_RTX;
11559 switch (code)
11561 case MINUS:
11562 case PLUS:
11563 x = gen_rtx_PLUS (wmode, out_data, src);
11564 break;
11565 case IOR:
11566 x = gen_rtx_IOR (wmode, out_data, src);
11567 break;
11568 case XOR:
11569 x = gen_rtx_XOR (wmode, out_data, src);
11570 break;
11571 case AND:
11572 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11573 return;
11574 default:
11575 gcc_unreachable ();
11578 emit_set_insn (out_result, x);
11580 return;
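/* Illustrative note: with LSE, __atomic_fetch_add maps directly to LDADD,
   __atomic_fetch_sub is handled above by negating the value and reusing
   LDADD, and __atomic_fetch_and becomes LDCLR on the complemented value,
   since the architecture provides an atomic "clear bits" rather than a plain
   atomic AND.  */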
11583 /* Split an atomic operation. */
11585 void
11586 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11587 rtx value, rtx model_rtx, rtx cond)
11589 machine_mode mode = GET_MODE (mem);
11590 machine_mode wmode = (mode == DImode ? DImode : SImode);
11591 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11592 const bool is_sync = is_mm_sync (model);
11593 rtx_code_label *label;
11594 rtx x;
11596 /* Split the atomic operation into a sequence. */
11597 label = gen_label_rtx ();
11598 emit_label (label);
11600 if (new_out)
11601 new_out = gen_lowpart (wmode, new_out);
11602 if (old_out)
11603 old_out = gen_lowpart (wmode, old_out);
11604 else
11605 old_out = new_out;
11606 value = simplify_gen_subreg (wmode, value, mode, 0);
11608 /* The initial load can be relaxed for a __sync operation since a final
11609 barrier will be emitted to stop code hoisting. */
11610 if (is_sync)
11611 aarch64_emit_load_exclusive (mode, old_out, mem,
11612 GEN_INT (MEMMODEL_RELAXED));
11613 else
11614 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11616 switch (code)
11618 case SET:
11619 new_out = value;
11620 break;
11622 case NOT:
11623 x = gen_rtx_AND (wmode, old_out, value);
11624 emit_insn (gen_rtx_SET (new_out, x));
11625 x = gen_rtx_NOT (wmode, new_out);
11626 emit_insn (gen_rtx_SET (new_out, x));
11627 break;
11629 case MINUS:
11630 if (CONST_INT_P (value))
11632 value = GEN_INT (-INTVAL (value));
11633 code = PLUS;
11635 /* Fall through. */
11637 default:
11638 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11639 emit_insn (gen_rtx_SET (new_out, x));
11640 break;
11643 aarch64_emit_store_exclusive (mode, cond, mem,
11644 gen_lowpart (mode, new_out), model_rtx);
11646 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11647 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11648 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11649 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11651 /* Emit any final barrier needed for a __sync operation. */
11652 if (is_sync)
11653 aarch64_emit_post_barrier (model);
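/* Illustrative sketch (register allocation is arbitrary) of the loop the
   split above produces for a relaxed __atomic_fetch_add on an int:

     .Lretry:
       ldxr   w0, [x2]          // old value
       add    w1, w0, w3        // new value
       stxr   w4, w1, [x2]      // try to store it back
       cbnz   w4, .Lretry       // exclusive store failed: retry
*/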
11656 static void
11657 aarch64_init_libfuncs (void)
11659 /* Half-precision float operations. The compiler handles all operations
11660 with NULL libfuncs by converting to SFmode. */
11662 /* Conversions. */
11663 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11664 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11666 /* Arithmetic. */
11667 set_optab_libfunc (add_optab, HFmode, NULL);
11668 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11669 set_optab_libfunc (smul_optab, HFmode, NULL);
11670 set_optab_libfunc (neg_optab, HFmode, NULL);
11671 set_optab_libfunc (sub_optab, HFmode, NULL);
11673 /* Comparisons. */
11674 set_optab_libfunc (eq_optab, HFmode, NULL);
11675 set_optab_libfunc (ne_optab, HFmode, NULL);
11676 set_optab_libfunc (lt_optab, HFmode, NULL);
11677 set_optab_libfunc (le_optab, HFmode, NULL);
11678 set_optab_libfunc (ge_optab, HFmode, NULL);
11679 set_optab_libfunc (gt_optab, HFmode, NULL);
11680 set_optab_libfunc (unord_optab, HFmode, NULL);
11683 /* Target hook for c_mode_for_suffix. */
11684 static machine_mode
11685 aarch64_c_mode_for_suffix (char suffix)
11687 if (suffix == 'q')
11688 return TFmode;
11690 return VOIDmode;
11693 /* We can only represent floating point constants which will fit in
11694 "quarter-precision" values. These values are characterised by
11695 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
11698 (-1)^s * (n/16) * 2^r
11700 Where:
11701 's' is the sign bit.
11702 'n' is an integer in the range 16 <= n <= 31.
11703 'r' is an integer in the range -3 <= r <= 4. */
11705 /* Return true iff X can be represented by a quarter-precision
11706 floating point immediate operand. Note, we cannot represent 0.0. */
11707 bool
11708 aarch64_float_const_representable_p (rtx x)
11710 /* This represents our current view of how many bits
11711 make up the mantissa. */
11712 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11713 int exponent;
11714 unsigned HOST_WIDE_INT mantissa, mask;
11715 REAL_VALUE_TYPE r, m;
11716 bool fail;
11718 if (!CONST_DOUBLE_P (x))
11719 return false;
11721 /* We don't support HFmode constants yet. */
11722 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11723 return false;
11725 r = *CONST_DOUBLE_REAL_VALUE (x);
11727 /* We cannot represent infinities, NaNs or +/-zero. We won't
11728 know if we have +zero until we analyse the mantissa, but we
11729 can reject the other invalid values. */
11730 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11731 || REAL_VALUE_MINUS_ZERO (r))
11732 return false;
11734 /* Extract exponent. */
11735 r = real_value_abs (&r);
11736 exponent = REAL_EXP (&r);
11738 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11739 highest (sign) bit, with a fixed binary point at bit point_pos.
11740 m1 holds the low part of the mantissa, m2 the high part.
11741 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11742 bits for the mantissa, this can fail (low bits will be lost). */
11743 real_ldexp (&m, &r, point_pos - exponent);
11744 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11746 /* If the low part of the mantissa has bits set we cannot represent
11747 the value. */
11748 if (w.elt (0) != 0)
11749 return false;
11750 /* We have rejected the lower HOST_WIDE_INT, so update our
11751 understanding of how many bits lie in the mantissa and
11752 look only at the high HOST_WIDE_INT. */
11753 mantissa = w.elt (1);
11754 point_pos -= HOST_BITS_PER_WIDE_INT;
11756 /* We can only represent values with a mantissa of the form 1.xxxx. */
11757 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11758 if ((mantissa & mask) != 0)
11759 return false;
11761 /* Having filtered unrepresentable values, we may now remove all
11762 but the highest 5 bits. */
11763 mantissa >>= point_pos - 5;
11765 /* We cannot represent the value 0.0, so reject it. This is handled
11766 elsewhere. */
11767 if (mantissa == 0)
11768 return false;
11770 /* Then, as bit 4 is always set, we can mask it off, leaving
11771 the mantissa in the range [0, 15]. */
11772 mantissa &= ~(1 << 4);
11773 gcc_assert (mantissa <= 15);
11775 /* GCC internally does not use IEEE754-like encoding (where normalized
11776 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
11777 Our mantissa values are shifted 4 places to the left relative to
11778 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11779 by 5 places to correct for GCC's representation. */
11780 exponent = 5 - exponent;
11782 return (exponent >= 0 && exponent <= 7);
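/* Illustrative worked examples of the encoding above: 0.5 is
   (-1)^0 * (16/16) * 2^-1 and 31.0 is (-1)^0 * (31/16) * 2^4, so both are
   representable and usable as FMOV (immediate) operands, whereas 0.1 has no
   such decomposition and 0.0 is rejected explicitly.  */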
11785 char*
11786 aarch64_output_simd_mov_immediate (rtx const_vector,
11787 machine_mode mode,
11788 unsigned width)
11790 bool is_valid;
11791 static char templ[40];
11792 const char *mnemonic;
11793 const char *shift_op;
11794 unsigned int lane_count = 0;
11795 char element_char;
11797 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
11799 /* This will return true to show const_vector is legal for use as an
11800 AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
11801 also update INFO to show how the immediate should be generated. */
11802 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
11803 gcc_assert (is_valid);
11805 element_char = sizetochar (info.element_width);
11806 lane_count = width / info.element_width;
11808 mode = GET_MODE_INNER (mode);
11809 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11811 gcc_assert (info.shift == 0 && ! info.mvn);
11812 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
11813 move immediate path. */
11814 if (aarch64_float_const_zero_rtx_p (info.value))
11815 info.value = GEN_INT (0);
11816 else
11818 const unsigned int buf_size = 20;
11819 char float_buf[buf_size] = {'\0'};
11820 real_to_decimal_for_mode (float_buf,
11821 CONST_DOUBLE_REAL_VALUE (info.value),
11822 buf_size, buf_size, 1, mode);
11824 if (lane_count == 1)
11825 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
11826 else
11827 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
11828 lane_count, element_char, float_buf);
11829 return templ;
11833 mnemonic = info.mvn ? "mvni" : "movi";
11834 shift_op = info.msl ? "msl" : "lsl";
11836 gcc_assert (CONST_INT_P (info.value));
11837 if (lane_count == 1)
11838 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
11839 mnemonic, UINTVAL (info.value));
11840 else if (info.shift)
11841 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
11842 ", %s %d", mnemonic, lane_count, element_char,
11843 UINTVAL (info.value), shift_op, info.shift);
11844 else
11845 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
11846 mnemonic, lane_count, element_char, UINTVAL (info.value));
11847 return templ;
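/* Illustrative example outputs (with operand 0 rendered as v0): a V4SImode
   splat of 0x4500 prints as
       movi  v0.4s, 0x45, lsl 8
   while a V2DFmode splat of 1.0 takes the floating-point path and prints
   roughly as
       fmov  v0.2d, 1.0e+0
   where the exact decimal rendering comes from real_to_decimal_for_mode.  */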
11850 char*
11851 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
11852 machine_mode mode)
11854 machine_mode vmode;
11856 gcc_assert (!VECTOR_MODE_P (mode));
11857 vmode = aarch64_simd_container_mode (mode, 64);
11858 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
11859 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
11862 /* Split operands into moves from op[1] + op[2] into op[0]. */
11864 void
11865 aarch64_split_combinev16qi (rtx operands[3])
11867 unsigned int dest = REGNO (operands[0]);
11868 unsigned int src1 = REGNO (operands[1]);
11869 unsigned int src2 = REGNO (operands[2]);
11870 machine_mode halfmode = GET_MODE (operands[1]);
11871 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
11872 rtx destlo, desthi;
11874 gcc_assert (halfmode == V16QImode);
11876 if (src1 == dest && src2 == dest + halfregs)
11878 /* No-op move. Can't split to nothing; emit something. */
11879 emit_note (NOTE_INSN_DELETED);
11880 return;
11883 /* Preserve register attributes for variable tracking. */
11884 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
11885 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
11886 GET_MODE_SIZE (halfmode));
11888 /* Special case of reversed high/low parts. */
11889 if (reg_overlap_mentioned_p (operands[2], destlo)
11890 && reg_overlap_mentioned_p (operands[1], desthi))
11892 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11893 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
11894 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11896 else if (!reg_overlap_mentioned_p (operands[2], destlo))
11898 /* Try to avoid unnecessary moves if part of the result
11899 is in the right place already. */
11900 if (src1 != dest)
11901 emit_move_insn (destlo, operands[1]);
11902 if (src2 != dest + halfregs)
11903 emit_move_insn (desthi, operands[2]);
11905 else
11907 if (src2 != dest + halfregs)
11908 emit_move_insn (desthi, operands[2]);
11909 if (src1 != dest)
11910 emit_move_insn (destlo, operands[1]);
11914 /* vec_perm support. */
11916 #define MAX_VECT_LEN 16
11918 struct expand_vec_perm_d
11920 rtx target, op0, op1;
11921 unsigned char perm[MAX_VECT_LEN];
11922 machine_mode vmode;
11923 unsigned char nelt;
11924 bool one_vector_p;
11925 bool testing_p;
11928 /* Generate a variable permutation. */
11930 static void
11931 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
11933 machine_mode vmode = GET_MODE (target);
11934 bool one_vector_p = rtx_equal_p (op0, op1);
11936 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
11937 gcc_checking_assert (GET_MODE (op0) == vmode);
11938 gcc_checking_assert (GET_MODE (op1) == vmode);
11939 gcc_checking_assert (GET_MODE (sel) == vmode);
11940 gcc_checking_assert (TARGET_SIMD);
11942 if (one_vector_p)
11944 if (vmode == V8QImode)
11946 /* Expand the argument to a V16QI mode by duplicating it. */
11947 rtx pair = gen_reg_rtx (V16QImode);
11948 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
11949 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11951 else
11953 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
11956 else
11958 rtx pair;
11960 if (vmode == V8QImode)
11962 pair = gen_reg_rtx (V16QImode);
11963 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
11964 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11966 else
11968 pair = gen_reg_rtx (OImode);
11969 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
11970 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
11975 void
11976 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
11978 machine_mode vmode = GET_MODE (target);
11979 unsigned int nelt = GET_MODE_NUNITS (vmode);
11980 bool one_vector_p = rtx_equal_p (op0, op1);
11981 rtx mask;
11983 /* The TBL instruction does not use a modulo index, so we must take care
11984 of that ourselves. */
11985 mask = aarch64_simd_gen_const_vector_dup (vmode,
11986 one_vector_p ? nelt - 1 : 2 * nelt - 1);
11987 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
11989 /* For big-endian, we also need to reverse the index within the vector
11990 (but not which vector). */
11991 if (BYTES_BIG_ENDIAN)
11993 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
11994 if (!one_vector_p)
11995 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
11996 sel = expand_simple_binop (vmode, XOR, sel, mask,
11997 NULL, 0, OPTAB_LIB_WIDEN);
11999 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
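/* Illustrative note: for a two-input V16QImode permute the selector is first
   ANDed with 31 so out-of-range indices wrap as the TBL semantics require;
   on big-endian each index is additionally XORed with 15 to translate GCC
   lane numbering into the byte order TBL expects.  */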
12002 /* Recognize patterns suitable for the TRN instructions. */
12003 static bool
12004 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12006 unsigned int i, odd, mask, nelt = d->nelt;
12007 rtx out, in0, in1, x;
12008 rtx (*gen) (rtx, rtx, rtx);
12009 machine_mode vmode = d->vmode;
12011 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12012 return false;
12014 /* Note that these are little-endian tests.
12015 We correct for big-endian later. */
12016 if (d->perm[0] == 0)
12017 odd = 0;
12018 else if (d->perm[0] == 1)
12019 odd = 1;
12020 else
12021 return false;
12022 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12024 for (i = 0; i < nelt; i += 2)
12026 if (d->perm[i] != i + odd)
12027 return false;
12028 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12029 return false;
12032 /* Success! */
12033 if (d->testing_p)
12034 return true;
12036 in0 = d->op0;
12037 in1 = d->op1;
12038 if (BYTES_BIG_ENDIAN)
12040 x = in0, in0 = in1, in1 = x;
12041 odd = !odd;
12043 out = d->target;
12045 if (odd)
12047 switch (vmode)
12049 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12050 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12051 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12052 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12053 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12054 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12055 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12056 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12057 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12058 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12059 default:
12060 return false;
12063 else
12065 switch (vmode)
12067 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12068 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12069 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12070 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12071 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12072 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12073 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12074 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12075 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12076 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12077 default:
12078 return false;
12082 emit_insn (gen (out, in0, in1));
12083 return true;
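/* Illustrative example: for V4SImode the little-endian selector
   { 0, 4, 2, 6 } passes the checks above with odd == 0 and is emitted as
   trn1, while { 1, 5, 3, 7 } is emitted as trn2.  */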
12086 /* Recognize patterns suitable for the UZP instructions. */
12087 static bool
12088 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12090 unsigned int i, odd, mask, nelt = d->nelt;
12091 rtx out, in0, in1, x;
12092 rtx (*gen) (rtx, rtx, rtx);
12093 machine_mode vmode = d->vmode;
12095 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12096 return false;
12098 /* Note that these are little-endian tests.
12099 We correct for big-endian later. */
12100 if (d->perm[0] == 0)
12101 odd = 0;
12102 else if (d->perm[0] == 1)
12103 odd = 1;
12104 else
12105 return false;
12106 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12108 for (i = 0; i < nelt; i++)
12110 unsigned elt = (i * 2 + odd) & mask;
12111 if (d->perm[i] != elt)
12112 return false;
12115 /* Success! */
12116 if (d->testing_p)
12117 return true;
12119 in0 = d->op0;
12120 in1 = d->op1;
12121 if (BYTES_BIG_ENDIAN)
12123 x = in0, in0 = in1, in1 = x;
12124 odd = !odd;
12126 out = d->target;
12128 if (odd)
12130 switch (vmode)
12132 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12133 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12134 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12135 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12136 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12137 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12138 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12139 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12140 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12141 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12142 default:
12143 return false;
12146 else
12148 switch (vmode)
12150 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12151 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12152 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12153 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12154 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12155 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12156 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12157 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12158 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12159 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12160 default:
12161 return false;
12165 emit_insn (gen (out, in0, in1));
12166 return true;
12169 /* Recognize patterns suitable for the ZIP instructions. */
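/* For example (illustrative, little-endian): with V4SImode and two input
   vectors, ZIP1 corresponds to the index vector {0, 4, 1, 5} (the low
   halves of the inputs interleaved) and ZIP2 to {2, 6, 3, 7} (the high
   halves interleaved).  */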
12170 static bool
12171 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12173 unsigned int i, high, mask, nelt = d->nelt;
12174 rtx out, in0, in1, x;
12175 rtx (*gen) (rtx, rtx, rtx);
12176 machine_mode vmode = d->vmode;
12178 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12179 return false;
12181 /* Note that these are little-endian tests.
12182 We correct for big-endian later. */
12183 high = nelt / 2;
12184 if (d->perm[0] == high)
12185 /* Do Nothing. */
12187 else if (d->perm[0] == 0)
12188 high = 0;
12189 else
12190 return false;
12191 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12193 for (i = 0; i < nelt / 2; i++)
12195 unsigned elt = (i + high) & mask;
12196 if (d->perm[i * 2] != elt)
12197 return false;
12198 elt = (elt + nelt) & mask;
12199 if (d->perm[i * 2 + 1] != elt)
12200 return false;
12203 /* Success! */
12204 if (d->testing_p)
12205 return true;
12207 in0 = d->op0;
12208 in1 = d->op1;
12209 if (BYTES_BIG_ENDIAN)
12211 x = in0, in0 = in1, in1 = x;
12212 high = !high;
12214 out = d->target;
12216 if (high)
12218 switch (vmode)
12220 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12221 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12222 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12223 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12224 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12225 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12226 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12227 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12228 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12229 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12230 default:
12231 return false;
12234 else
12236 switch (vmode)
12238 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12239 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12240 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12241 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12242 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12243 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12244 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12245 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12246 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12247 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12248 default:
12249 return false;
12253 emit_insn (gen (out, in0, in1));
12254 return true;
12257 /* Recognize patterns for the EXT insn. */
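/* For example (illustrative): with V4SImode, the index vector {1, 2, 3, 4}
   selects the three trailing elements of the first input followed by the
   first element of the second, and is matched below as EXT with an element
   offset of 1.  */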
12259 static bool
12260 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12262 unsigned int i, nelt = d->nelt;
12263 rtx (*gen) (rtx, rtx, rtx, rtx);
12264 rtx offset;
12266 unsigned int location = d->perm[0]; /* Always < nelt. */
12268 /* Check if the extracted indices are increasing by one. */
12269 for (i = 1; i < nelt; i++)
12271 unsigned int required = location + i;
12272 if (d->one_vector_p)
12274 /* We'll pass the same vector in twice, so allow indices to wrap. */
12275 required &= (nelt - 1);
12277 if (d->perm[i] != required)
12278 return false;
12281 switch (d->vmode)
12283 case V16QImode: gen = gen_aarch64_extv16qi; break;
12284 case V8QImode: gen = gen_aarch64_extv8qi; break;
12285 case V4HImode: gen = gen_aarch64_extv4hi; break;
12286 case V8HImode: gen = gen_aarch64_extv8hi; break;
12287 case V2SImode: gen = gen_aarch64_extv2si; break;
12288 case V4SImode: gen = gen_aarch64_extv4si; break;
12289 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12290 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12291 case V2DImode: gen = gen_aarch64_extv2di; break;
12292 case V2DFmode: gen = gen_aarch64_extv2df; break;
12293 default:
12294 return false;
12297 /* Success! */
12298 if (d->testing_p)
12299 return true;
12301 /* The case where (location == 0) is a no-op for both big- and little-endian,
12302 and is removed by the mid-end at optimization levels -O1 and higher. */
12304 if (BYTES_BIG_ENDIAN && (location != 0))
12306 /* After setup, we want the high elements of the first vector (stored
12307 at the LSB end of the register), and the low elements of the second
12308 vector (stored at the MSB end of the register). So swap. */
12309 std::swap (d->op0, d->op1);
12310 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12311 location = nelt - location;
12314 offset = GEN_INT (location);
12315 emit_insn (gen (d->target, d->op0, d->op1, offset));
12316 return true;
12319 /* Recognize patterns for the REV insns. */
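/* For example (illustrative): with V8HImode, the index vector
   {3, 2, 1, 0, 7, 6, 5, 4} reverses the elements within each 64-bit chunk;
   it is matched below with diff == 3 and emitted as REV64.  */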
12321 static bool
12322 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12324 unsigned int i, j, diff, nelt = d->nelt;
12325 rtx (*gen) (rtx, rtx);
12327 if (!d->one_vector_p)
12328 return false;
12330 diff = d->perm[0];
12331 switch (diff)
12333 case 7:
12334 switch (d->vmode)
12336 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12337 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12338 default:
12339 return false;
12341 break;
12342 case 3:
12343 switch (d->vmode)
12345 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12346 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12347 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12348 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12349 default:
12350 return false;
12352 break;
12353 case 1:
12354 switch (d->vmode)
12356 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12357 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12358 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12359 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12360 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12361 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12362 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12363 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12364 default:
12365 return false;
12367 break;
12368 default:
12369 return false;
12372 for (i = 0; i < nelt ; i += diff + 1)
12373 for (j = 0; j <= diff; j += 1)
12375 /* This is guaranteed to hold, as the value of diff
12376 is 7, 3 or 1 and there are enough elements in the
12377 vector to cover each group. A permutation mask whose
12378 first element (diff) is anything other than these values
12379 implies that something is wrong by the time we get here. */
12380 gcc_assert (i + j < nelt);
12381 if (d->perm[i + j] != i + diff - j)
12382 return false;
12385 /* Success! */
12386 if (d->testing_p)
12387 return true;
12389 emit_insn (gen (d->target, d->op0));
12390 return true;
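/* Recognize patterns suitable for the DUP (element) instruction: a splat of
   a single input lane, e.g. the index vector {2, 2, 2, 2} for V4SImode
   (illustrative).  */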
12393 static bool
12394 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12396 rtx (*gen) (rtx, rtx, rtx);
12397 rtx out = d->target;
12398 rtx in0;
12399 machine_mode vmode = d->vmode;
12400 unsigned int i, elt, nelt = d->nelt;
12401 rtx lane;
12403 elt = d->perm[0];
12404 for (i = 1; i < nelt; i++)
12406 if (elt != d->perm[i])
12407 return false;
12410 /* The generic preparation in aarch64_expand_vec_perm_const_1
12411 swaps the operand order and the permute indices if it finds
12412 d->perm[0] to be in the second operand. Thus, we can always
12413 use d->op0 and need not do any extra arithmetic to get the
12414 correct lane number. */
12415 in0 = d->op0;
12416 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12418 switch (vmode)
12420 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12421 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12422 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12423 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12424 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12425 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12426 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12427 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12428 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12429 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12430 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12431 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12432 default:
12433 return false;
12436 emit_insn (gen (out, in0, lane));
12437 return true;
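/* Fall back to a generic table-based permute: for V8QImode or V16QImode
   only, the permutation indices are materialised as a constant byte vector
   and used as the selector for a TBL-based expansion via
   aarch64_expand_vec_perm_1.  */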
12440 static bool
12441 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12443 rtx rperm[MAX_VECT_LEN], sel;
12444 machine_mode vmode = d->vmode;
12445 unsigned int i, nelt = d->nelt;
12447 if (d->testing_p)
12448 return true;
12450 /* Generic code will try constant permutation twice: once with the
12451 original mode and again with the elements lowered to QImode.
12452 So wait and don't do the selector expansion ourselves. */
12453 if (vmode != V8QImode && vmode != V16QImode)
12454 return false;
12456 for (i = 0; i < nelt; ++i)
12458 int nunits = GET_MODE_NUNITS (vmode);
12460 /* If big-endian and two vectors, we end up with a weird mixed-endian
12461 mode on NEON. Reverse the index within each word but not the word
12462 itself. */
12463 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12464 : d->perm[i]);
12466 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12467 sel = force_reg (vmode, sel);
12469 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12470 return true;
12473 static bool
12474 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12476 /* The pattern matching functions above are written to look for a small
12477 number to begin the sequence (0, 1, N/2). If we begin with an index
12478 from the second operand, we can swap the operands. */
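/* For example (illustrative): with V4SImode, the index vector {5, 1, 7, 3}
   starts with an index from the second operand; XOR-ing each index with
   nelt gives {1, 5, 3, 7}, which with the operands swapped is matched as
   TRN2 below.  */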
12479 if (d->perm[0] >= d->nelt)
12481 unsigned i, nelt = d->nelt;
12483 gcc_assert (nelt == (nelt & -nelt));
12484 for (i = 0; i < nelt; ++i)
12485 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12487 std::swap (d->op0, d->op1);
12490 if (TARGET_SIMD)
12492 if (aarch64_evpc_rev (d))
12493 return true;
12494 else if (aarch64_evpc_ext (d))
12495 return true;
12496 else if (aarch64_evpc_dup (d))
12497 return true;
12498 else if (aarch64_evpc_zip (d))
12499 return true;
12500 else if (aarch64_evpc_uzp (d))
12501 return true;
12502 else if (aarch64_evpc_trn (d))
12503 return true;
12504 return aarch64_evpc_tbl (d);
12506 return false;
12509 /* Expand a vec_perm_const pattern. */
12511 bool
12512 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12514 struct expand_vec_perm_d d;
12515 int i, nelt, which;
12517 d.target = target;
12518 d.op0 = op0;
12519 d.op1 = op1;
12521 d.vmode = GET_MODE (target);
12522 gcc_assert (VECTOR_MODE_P (d.vmode));
12523 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12524 d.testing_p = false;
12526 for (i = which = 0; i < nelt; ++i)
12528 rtx e = XVECEXP (sel, 0, i);
12529 int ei = INTVAL (e) & (2 * nelt - 1);
12530 which |= (ei < nelt ? 1 : 2);
12531 d.perm[i] = ei;
12534 switch (which)
12536 default:
12537 gcc_unreachable ();
12539 case 3:
12540 d.one_vector_p = false;
12541 if (!rtx_equal_p (op0, op1))
12542 break;
12544 /* The elements of PERM do not suggest that only the first operand
12545 is used, but both operands are identical. Allow easier matching
12546 of the permutation by folding the permutation into the single
12547 input vector. */
12548 /* Fall Through. */
12549 case 2:
12550 for (i = 0; i < nelt; ++i)
12551 d.perm[i] &= nelt - 1;
12552 d.op0 = op1;
12553 d.one_vector_p = true;
12554 break;
12556 case 1:
12557 d.op1 = op0;
12558 d.one_vector_p = true;
12559 break;
12562 return aarch64_expand_vec_perm_const_1 (&d);
12565 static bool
12566 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12567 const unsigned char *sel)
12569 struct expand_vec_perm_d d;
12570 unsigned int i, nelt, which;
12571 bool ret;
12573 d.vmode = vmode;
12574 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12575 d.testing_p = true;
12576 memcpy (d.perm, sel, nelt);
12578 /* Calculate whether all elements are in one vector. */
12579 for (i = which = 0; i < nelt; ++i)
12581 unsigned char e = d.perm[i];
12582 gcc_assert (e < 2 * nelt);
12583 which |= (e < nelt ? 1 : 2);
12586 /* If all elements are from the second vector, reindex as if from the
12587 first vector. */
12588 if (which == 2)
12589 for (i = 0; i < nelt; ++i)
12590 d.perm[i] -= nelt;
12592 /* Check whether the mask can be applied to a single vector. */
12593 d.one_vector_p = (which != 3);
12595 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12596 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12597 if (!d.one_vector_p)
12598 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12600 start_sequence ();
12601 ret = aarch64_expand_vec_perm_const_1 (&d);
12602 end_sequence ();
12604 return ret;
12608 rtx aarch64_reverse_mask (enum machine_mode mode)
12610 /* We have to reverse each vector because we don't have
12611 a permuted load that can reverse-load according to ABI rules. */
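/* For example (illustrative): for V4SImode this builds the byte selector
   {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}, reversing the
   bytes within each 32-bit element.  */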
12612 rtx mask;
12613 rtvec v = rtvec_alloc (16);
12614 int i, j;
12615 int nunits = GET_MODE_NUNITS (mode);
12616 int usize = GET_MODE_UNIT_SIZE (mode);
12618 gcc_assert (BYTES_BIG_ENDIAN);
12619 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12621 for (i = 0; i < nunits; i++)
12622 for (j = 0; j < usize; j++)
12623 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12624 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12625 return force_reg (V16QImode, mask);
12628 /* Implement MODES_TIEABLE_P. */
12630 bool
12631 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12633 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12634 return true;
12636 /* We specifically want to allow elements of "structure" modes to
12637 be tieable to the structure. This more general condition allows
12638 other rarer situations too. */
12639 if (TARGET_SIMD
12640 && aarch64_vector_mode_p (mode1)
12641 && aarch64_vector_mode_p (mode2))
12642 return true;
12644 return false;
12647 /* Return a new RTX holding the result of moving POINTER forward by
12648 AMOUNT bytes. */
12650 static rtx
12651 aarch64_move_pointer (rtx pointer, int amount)
12653 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12655 return adjust_automodify_address (pointer, GET_MODE (pointer),
12656 next, amount);
12659 /* Return a new RTX holding the result of moving POINTER forward by the
12660 size of the mode it points to. */
12662 static rtx
12663 aarch64_progress_pointer (rtx pointer)
12665 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12667 return aarch64_move_pointer (pointer, amount);
12670 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12671 MODE bytes. */
12673 static void
12674 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12675 machine_mode mode)
12677 rtx reg = gen_reg_rtx (mode);
12679 /* "Cast" the pointers to the correct mode. */
12680 *src = adjust_address (*src, mode, 0);
12681 *dst = adjust_address (*dst, mode, 0);
12682 /* Emit the memcpy. */
12683 emit_move_insn (reg, *src);
12684 emit_move_insn (*dst, reg);
12685 /* Move the pointers forward. */
12686 *src = aarch64_progress_pointer (*src);
12687 *dst = aarch64_progress_pointer (*dst);
12690 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12691 we succeed, otherwise return false. */
12693 bool
12694 aarch64_expand_movmem (rtx *operands)
12696 unsigned int n;
12697 rtx dst = operands[0];
12698 rtx src = operands[1];
12699 rtx base;
12700 bool speed_p = !optimize_function_for_size_p (cfun);
12702 /* When optimizing for size, give a better estimate of the length of a
12703 memcpy call, but use the default otherwise. */
12704 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12706 /* We can't do anything smart if the amount to copy is not constant. */
12707 if (!CONST_INT_P (operands[2]))
12708 return false;
12710 n = UINTVAL (operands[2]);
12712 /* Try to keep the number of instructions low. For cases below 16 bytes we
12713 need to make at most two moves. For cases above 16 bytes it will be one
12714 move for each 16 byte chunk, then at most two additional moves. */
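/* For example (illustrative): a 35-byte copy is estimated as
   35/16 + 2 == 4 moves; it is expanded below as two 16-byte (TImode)
   chunks followed by one overlapping 4-byte (SImode) copy covering the
   trailing 3 bytes.  */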
12715 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12716 return false;
12718 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12719 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12721 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12722 src = adjust_automodify_address (src, VOIDmode, base, 0);
12724 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12725 1-byte chunk. */
12726 if (n < 4)
12728 if (n >= 2)
12730 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12731 n -= 2;
12734 if (n == 1)
12735 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12737 return true;
12740 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
12741 4-byte chunk, partially overlapping with the previously copied chunk. */
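/* For example (illustrative): a 6-byte copy is done as one SImode copy of
   bytes 0-3 followed by a second SImode copy of bytes 2-5, overlapping the
   first copy by two bytes.  */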
12742 if (n < 8)
12744 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12745 n -= 4;
12746 if (n > 0)
12748 int move = n - 4;
12750 src = aarch64_move_pointer (src, move);
12751 dst = aarch64_move_pointer (dst, move);
12752 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12754 return true;
12757 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
12758 them, then (if applicable) an 8-byte chunk. */
12759 while (n >= 8)
12761 if (n / 16)
12763 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12764 n -= 16;
12766 else
12768 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12769 n -= 8;
12773 /* Finish the final bytes of the copy. We can always do this in one
12774 instruction. We either copy the exact amount we need, or partially
12775 overlap with the previous chunk we copied and copy 8-bytes. */
12776 if (n == 0)
12777 return true;
12778 else if (n == 1)
12779 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12780 else if (n == 2)
12781 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12782 else if (n == 4)
12783 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12784 else
12786 if (n == 3)
12788 src = aarch64_move_pointer (src, -1);
12789 dst = aarch64_move_pointer (dst, -1);
12790 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12792 else
12794 int move = n - 8;
12796 src = aarch64_move_pointer (src, move);
12797 dst = aarch64_move_pointer (dst, move);
12798 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12802 return true;
12805 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
12807 static unsigned HOST_WIDE_INT
12808 aarch64_asan_shadow_offset (void)
12810 return (HOST_WIDE_INT_1 << 36);
12813 static bool
12814 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
12815 unsigned int align,
12816 enum by_pieces_operation op,
12817 bool speed_p)
12819 /* STORE_BY_PIECES can be used when copying a constant string, but
12820 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
12821 For now we always fail this and let the move_by_pieces code copy
12822 the string from read-only memory. */
12823 if (op == STORE_BY_PIECES)
12824 return false;
12826 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
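/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare (CCMP) chain: PREP_SEQ receives the insns that
   prepare the operands, GEN_SEQ the compare itself, and the returned rtx
   describes the resulting condition on the CC register, or NULL_RTX if
   the comparison cannot be handled.  */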
12829 static rtx
12830 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
12831 int code, tree treeop0, tree treeop1)
12833 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
12834 rtx op0, op1;
12835 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12836 insn_code icode;
12837 struct expand_operand ops[4];
12839 start_sequence ();
12840 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12842 op_mode = GET_MODE (op0);
12843 if (op_mode == VOIDmode)
12844 op_mode = GET_MODE (op1);
12846 switch (op_mode)
12848 case QImode:
12849 case HImode:
12850 case SImode:
12851 cmp_mode = SImode;
12852 icode = CODE_FOR_cmpsi;
12853 break;
12855 case DImode:
12856 cmp_mode = DImode;
12857 icode = CODE_FOR_cmpdi;
12858 break;
12860 case SFmode:
12861 cmp_mode = SFmode;
12862 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
12863 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
12864 break;
12866 case DFmode:
12867 cmp_mode = DFmode;
12868 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
12869 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
12870 break;
12872 default:
12873 end_sequence ();
12874 return NULL_RTX;
12877 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
12878 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
12879 if (!op0 || !op1)
12881 end_sequence ();
12882 return NULL_RTX;
12884 *prep_seq = get_insns ();
12885 end_sequence ();
12887 create_fixed_operand (&ops[0], op0);
12888 create_fixed_operand (&ops[1], op1);
12890 start_sequence ();
12891 if (!maybe_expand_insn (icode, 2, ops))
12893 end_sequence ();
12894 return NULL_RTX;
12896 *gen_seq = get_insns ();
12897 end_sequence ();
12899 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
12900 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
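/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent comparison of a
   CCMP chain, combining it with PREV, the result of the previous
   comparison, according to BIT_CODE (AND or IOR).  Returns the rtx
   describing the combined condition, or NULL_RTX on failure.  */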
12903 static rtx
12904 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
12905 tree treeop0, tree treeop1, int bit_code)
12907 rtx op0, op1, target;
12908 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
12909 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12910 insn_code icode;
12911 struct expand_operand ops[6];
12912 int aarch64_cond;
12914 push_to_sequence ((rtx_insn*) *prep_seq);
12915 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12917 op_mode = GET_MODE (op0);
12918 if (op_mode == VOIDmode)
12919 op_mode = GET_MODE (op1);
12921 switch (op_mode)
12923 case QImode:
12924 case HImode:
12925 case SImode:
12926 cmp_mode = SImode;
12927 icode = CODE_FOR_ccmpsi;
12928 break;
12930 case DImode:
12931 cmp_mode = DImode;
12932 icode = CODE_FOR_ccmpdi;
12933 break;
12935 case SFmode:
12936 cmp_mode = SFmode;
12937 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
12938 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
12939 break;
12941 case DFmode:
12942 cmp_mode = DFmode;
12943 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
12944 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
12945 break;
12947 default:
12948 end_sequence ();
12949 return NULL_RTX;
12952 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12953 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12954 if (!op0 || !op1)
12956 end_sequence ();
12957 return NULL_RTX;
12959 *prep_seq = get_insns ();
12960 end_sequence ();
12962 target = gen_rtx_REG (cc_mode, CC_REGNUM);
12963 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
12965 if (bit_code != AND)
12967 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
12968 GET_MODE (XEXP (prev, 0))),
12969 VOIDmode, XEXP (prev, 0), const0_rtx);
12970 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
12973 create_fixed_operand (&ops[0], XEXP (prev, 0));
12974 create_fixed_operand (&ops[1], target);
12975 create_fixed_operand (&ops[2], op0);
12976 create_fixed_operand (&ops[3], op1);
12977 create_fixed_operand (&ops[4], prev);
12978 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
12980 push_to_sequence ((rtx_insn*) *gen_seq);
12981 if (!maybe_expand_insn (icode, 6, ops))
12983 end_sequence ();
12984 return NULL_RTX;
12987 *gen_seq = get_insns ();
12988 end_sequence ();
12990 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
12993 #undef TARGET_GEN_CCMP_FIRST
12994 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
12996 #undef TARGET_GEN_CCMP_NEXT
12997 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
12999 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13000 instruction fusion of some sort. */
13002 static bool
13003 aarch64_macro_fusion_p (void)
13005 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13009 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13010 should be kept together during scheduling. */
13012 static bool
13013 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13015 rtx set_dest;
13016 rtx prev_set = single_set (prev);
13017 rtx curr_set = single_set (curr);
13018 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13019 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13021 if (!aarch64_macro_fusion_p ())
13022 return false;
13024 if (simple_sets_p
13025 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13027 /* We are trying to match:
13028 prev (mov) == (set (reg r0) (const_int imm16))
13029 curr (movk) == (set (zero_extract (reg r0)
13030 (const_int 16)
13031 (const_int 16))
13032 (const_int imm16_1)) */
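/* In assembly terms (illustrative), this is a "mov w0, #imm16" immediately
   followed by a "movk w0, #imm16_1, lsl #16" writing the same register.  */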
13034 set_dest = SET_DEST (curr_set);
13036 if (GET_CODE (set_dest) == ZERO_EXTRACT
13037 && CONST_INT_P (SET_SRC (curr_set))
13038 && CONST_INT_P (SET_SRC (prev_set))
13039 && CONST_INT_P (XEXP (set_dest, 2))
13040 && INTVAL (XEXP (set_dest, 2)) == 16
13041 && REG_P (XEXP (set_dest, 0))
13042 && REG_P (SET_DEST (prev_set))
13043 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13045 return true;
13049 if (simple_sets_p
13050 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13053 /* We're trying to match:
13054 prev (adrp) == (set (reg r1)
13055 (high (symbol_ref ("SYM"))))
13056 curr (add) == (set (reg r0)
13057 (lo_sum (reg r1)
13058 (symbol_ref ("SYM"))))
13059 Note that r0 need not necessarily be the same as r1, especially
13060 during pre-regalloc scheduling. */
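/* In assembly terms (illustrative): "adrp x1, SYM" immediately followed by
   "add x0, x1, :lo12:SYM".  */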
13062 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13063 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13065 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13066 && REG_P (XEXP (SET_SRC (curr_set), 0))
13067 && REGNO (XEXP (SET_SRC (curr_set), 0))
13068 == REGNO (SET_DEST (prev_set))
13069 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13070 XEXP (SET_SRC (curr_set), 1)))
13071 return true;
13075 if (simple_sets_p
13076 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13079 /* We're trying to match:
13080 prev (movk) == (set (zero_extract (reg r0)
13081 (const_int 16)
13082 (const_int 32))
13083 (const_int imm16_1))
13084 curr (movk) == (set (zero_extract (reg r0)
13085 (const_int 16)
13086 (const_int 48))
13087 (const_int imm16_2)) */
13089 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13090 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13091 && REG_P (XEXP (SET_DEST (prev_set), 0))
13092 && REG_P (XEXP (SET_DEST (curr_set), 0))
13093 && REGNO (XEXP (SET_DEST (prev_set), 0))
13094 == REGNO (XEXP (SET_DEST (curr_set), 0))
13095 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13096 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13097 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13098 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13099 && CONST_INT_P (SET_SRC (prev_set))
13100 && CONST_INT_P (SET_SRC (curr_set)))
13101 return true;
13104 if (simple_sets_p
13105 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13107 /* We're trying to match:
13108 prev (adrp) == (set (reg r0)
13109 (high (symbol_ref ("SYM"))))
13110 curr (ldr) == (set (reg r1)
13111 (mem (lo_sum (reg r0)
13112 (symbol_ref ("SYM")))))
13114 curr (ldr) == (set (reg r1)
13115 (zero_extend (mem
13116 (lo_sum (reg r0)
13117 (symbol_ref ("SYM")))))) */
13118 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13119 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13121 rtx curr_src = SET_SRC (curr_set);
13123 if (GET_CODE (curr_src) == ZERO_EXTEND)
13124 curr_src = XEXP (curr_src, 0);
13126 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13127 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13128 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13129 == REGNO (SET_DEST (prev_set))
13130 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13131 XEXP (SET_SRC (prev_set), 0)))
13132 return true;
13136 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13137 && aarch_crypto_can_dual_issue (prev, curr))
13138 return true;
13140 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13141 && any_condjump_p (curr))
13143 enum attr_type prev_type = get_attr_type (prev);
13145 /* FIXME: this misses some instructions that ThunderX considers simple
13146 arithmetic; in particular, simple shifts are missed here. */
13147 if (prev_type == TYPE_ALUS_SREG
13148 || prev_type == TYPE_ALUS_IMM
13149 || prev_type == TYPE_LOGICS_REG
13150 || prev_type == TYPE_LOGICS_IMM)
13151 return true;
13154 return false;
13157 /* Return true iff the instruction fusion described by OP is enabled. */
13159 bool
13160 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13162 return (aarch64_tune_params.fusible_ops & op) != 0;
13165 /* If MEM is in the form [base+offset], extract the two parts of the
13166 address into BASE and OFFSET; otherwise return false
13167 after clearing BASE and OFFSET. */
13169 bool
13170 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13172 rtx addr;
13174 gcc_assert (MEM_P (mem));
13176 addr = XEXP (mem, 0);
13178 if (REG_P (addr))
13180 *base = addr;
13181 *offset = const0_rtx;
13182 return true;
13185 if (GET_CODE (addr) == PLUS
13186 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13188 *base = XEXP (addr, 0);
13189 *offset = XEXP (addr, 1);
13190 return true;
13193 *base = NULL_RTX;
13194 *offset = NULL_RTX;
13196 return false;
13199 /* Types for scheduling fusion. */
13200 enum sched_fusion_type
13202 SCHED_FUSION_NONE = 0,
13203 SCHED_FUSION_LD_SIGN_EXTEND,
13204 SCHED_FUSION_LD_ZERO_EXTEND,
13205 SCHED_FUSION_LD,
13206 SCHED_FUSION_ST,
13207 SCHED_FUSION_NUM
13210 /* If INSN is a load or store whose address is in the form [base+offset],
13211 extract the two parts into BASE and OFFSET. Return the scheduling
13212 fusion type of this INSN. */
13214 static enum sched_fusion_type
13215 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13217 rtx x, dest, src;
13218 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13220 gcc_assert (INSN_P (insn));
13221 x = PATTERN (insn);
13222 if (GET_CODE (x) != SET)
13223 return SCHED_FUSION_NONE;
13225 src = SET_SRC (x);
13226 dest = SET_DEST (x);
13228 machine_mode dest_mode = GET_MODE (dest);
13230 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13231 return SCHED_FUSION_NONE;
13233 if (GET_CODE (src) == SIGN_EXTEND)
13235 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13236 src = XEXP (src, 0);
13237 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13238 return SCHED_FUSION_NONE;
13240 else if (GET_CODE (src) == ZERO_EXTEND)
13242 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13243 src = XEXP (src, 0);
13244 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13245 return SCHED_FUSION_NONE;
13248 if (GET_CODE (src) == MEM && REG_P (dest))
13249 extract_base_offset_in_addr (src, base, offset);
13250 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13252 fusion = SCHED_FUSION_ST;
13253 extract_base_offset_in_addr (dest, base, offset);
13255 else
13256 return SCHED_FUSION_NONE;
13258 if (*base == NULL_RTX || *offset == NULL_RTX)
13259 fusion = SCHED_FUSION_NONE;
13261 return fusion;
13264 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13266 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13267 and PRI are only calculated for these instructions. For other instructions,
13268 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
13269 other instruction types can be added by returning different priorities.
13271 It's important that irrelevant instructions get the largest FUSION_PRI. */
13273 static void
13274 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13275 int *fusion_pri, int *pri)
13277 int tmp, off_val;
13278 rtx base, offset;
13279 enum sched_fusion_type fusion;
13281 gcc_assert (INSN_P (insn));
13283 tmp = max_pri - 1;
13284 fusion = fusion_load_store (insn, &base, &offset);
13285 if (fusion == SCHED_FUSION_NONE)
13287 *pri = tmp;
13288 *fusion_pri = tmp;
13289 return;
13292 /* Set FUSION_PRI according to fusion type and base register. */
13293 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13295 /* Calculate PRI. */
13296 tmp /= 2;
13298 /* INSN with smaller offset goes first. */
13299 off_val = (int)(INTVAL (offset));
13300 if (off_val >= 0)
13301 tmp -= (off_val & 0xfffff);
13302 else
13303 tmp += ((- off_val) & 0xfffff);
13305 *pri = tmp;
13306 return;
13309 /* Given OPERANDS of consecutive load/store, check if we can merge
13310 them into ldp/stp. LOAD is true if they are load instructions.
13311 MODE is the mode of memory operands. */
13313 bool
13314 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13315 enum machine_mode mode)
13317 HOST_WIDE_INT offval_1, offval_2, msize;
13318 enum reg_class rclass_1, rclass_2;
13319 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13321 if (load)
13323 mem_1 = operands[1];
13324 mem_2 = operands[3];
13325 reg_1 = operands[0];
13326 reg_2 = operands[2];
13327 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13328 if (REGNO (reg_1) == REGNO (reg_2))
13329 return false;
13331 else
13333 mem_1 = operands[0];
13334 mem_2 = operands[2];
13335 reg_1 = operands[1];
13336 reg_2 = operands[3];
13339 /* The mems cannot be volatile. */
13340 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13341 return false;
13343 /* Check if the addresses are in the form of [base+offset]. */
13344 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13345 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13346 return false;
13347 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13348 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13349 return false;
13351 /* Check if the bases are the same. */
13352 if (!rtx_equal_p (base_1, base_2))
13353 return false;
13355 offval_1 = INTVAL (offset_1);
13356 offval_2 = INTVAL (offset_2);
13357 msize = GET_MODE_SIZE (mode);
13358 /* Check if the offsets are consecutive. */
13359 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13360 return false;
13362 /* Check if the addresses are clobbered by load. */
13363 if (load)
13365 if (reg_mentioned_p (reg_1, mem_1))
13366 return false;
13368 /* In increasing order, the last load can clobber the address. */
13369 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13370 return false;
13373 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13374 rclass_1 = FP_REGS;
13375 else
13376 rclass_1 = GENERAL_REGS;
13378 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13379 rclass_2 = FP_REGS;
13380 else
13381 rclass_2 = GENERAL_REGS;
13383 /* Check if the registers are of the same class. */
13384 if (rclass_1 != rclass_2)
13385 return false;
13387 return true;
13390 /* Given OPERANDS of consecutive load/store, check if we can merge
13391 them into ldp/stp by adjusting the offset. LOAD is true if they
13392 are load instructions. MODE is the mode of memory operands.
13394 Given the following consecutive stores:
13396 str w1, [xb, 0x100]
13397 str w1, [xb, 0x104]
13398 str w1, [xb, 0x108]
13399 str w1, [xb, 0x10c]
13401 Though the offsets are out of the range supported by stp, we can
13402 still pair them after adjusting the offset, like:
13404 add scratch, xb, 0x100
13405 stp w1, w1, [scratch]
13406 stp w1, w1, [scratch, 0x8]
13408 The peephole patterns detecting this opportunity should guarantee
13409 the scratch register is available. */
13411 bool
13412 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13413 enum machine_mode mode)
13415 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13416 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13417 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13418 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13420 if (load)
13422 reg_1 = operands[0];
13423 mem_1 = operands[1];
13424 reg_2 = operands[2];
13425 mem_2 = operands[3];
13426 reg_3 = operands[4];
13427 mem_3 = operands[5];
13428 reg_4 = operands[6];
13429 mem_4 = operands[7];
13430 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13431 && REG_P (reg_3) && REG_P (reg_4));
13432 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13433 return false;
13435 else
13437 mem_1 = operands[0];
13438 reg_1 = operands[1];
13439 mem_2 = operands[2];
13440 reg_2 = operands[3];
13441 mem_3 = operands[4];
13442 reg_3 = operands[5];
13443 mem_4 = operands[6];
13444 reg_4 = operands[7];
13446 /* Skip if the memory operand is by itself valid for ldp/stp. */
13447 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13448 return false;
13450 /* The mems cannot be volatile. */
13451 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13452 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13453 return false;
13455 /* Check if the addresses are in the form of [base+offset]. */
13456 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13457 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13458 return false;
13459 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13460 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13461 return false;
13462 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13463 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13464 return false;
13465 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13466 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13467 return false;
13469 /* Check if the bases are the same. */
13470 if (!rtx_equal_p (base_1, base_2)
13471 || !rtx_equal_p (base_2, base_3)
13472 || !rtx_equal_p (base_3, base_4))
13473 return false;
13475 offval_1 = INTVAL (offset_1);
13476 offval_2 = INTVAL (offset_2);
13477 offval_3 = INTVAL (offset_3);
13478 offval_4 = INTVAL (offset_4);
13479 msize = GET_MODE_SIZE (mode);
13480 /* Check if the offsets are consecutive. */
13481 if ((offval_1 != (offval_2 + msize)
13482 || offval_1 != (offval_3 + msize * 2)
13483 || offval_1 != (offval_4 + msize * 3))
13484 && (offval_4 != (offval_3 + msize)
13485 || offval_4 != (offval_2 + msize * 2)
13486 || offval_4 != (offval_1 + msize * 3)))
13487 return false;
13489 /* Check if the addresses are clobbered by load. */
13490 if (load)
13492 if (reg_mentioned_p (reg_1, mem_1)
13493 || reg_mentioned_p (reg_2, mem_2)
13494 || reg_mentioned_p (reg_3, mem_3))
13495 return false;
13497 /* In increasing order, the last load can clobber the address. */
13498 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13499 return false;
13502 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13503 rclass_1 = FP_REGS;
13504 else
13505 rclass_1 = GENERAL_REGS;
13507 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13508 rclass_2 = FP_REGS;
13509 else
13510 rclass_2 = GENERAL_REGS;
13512 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13513 rclass_3 = FP_REGS;
13514 else
13515 rclass_3 = GENERAL_REGS;
13517 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13518 rclass_4 = FP_REGS;
13519 else
13520 rclass_4 = GENERAL_REGS;
13522 /* Check if the registers are of the same class. */
13523 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13524 return false;
13526 return true;
13529 /* Given OPERANDS of consecutive load/store, this function pairs them
13530 into ldp/stp after adjusting the offset. It depends on the fact
13531 that addresses of load/store instructions are in increasing order.
13532 MODE is the mode of the memory operands. CODE is the rtl operator
13533 that should be applied to all memory operands; it is SIGN_EXTEND,
13534 ZERO_EXTEND or UNKNOWN. */
13536 bool
13537 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13538 enum machine_mode mode, RTX_CODE code)
13540 rtx base, offset, t1, t2;
13541 rtx mem_1, mem_2, mem_3, mem_4;
13542 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13544 if (load)
13546 mem_1 = operands[1];
13547 mem_2 = operands[3];
13548 mem_3 = operands[5];
13549 mem_4 = operands[7];
13551 else
13553 mem_1 = operands[0];
13554 mem_2 = operands[2];
13555 mem_3 = operands[4];
13556 mem_4 = operands[6];
13557 gcc_assert (code == UNKNOWN);
13560 extract_base_offset_in_addr (mem_1, &base, &offset);
13561 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13563 /* Adjust the offset so that it fits in an ldp/stp instruction. */
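/* For example (illustrative): for SImode accesses (msize == 4,
   stp_off_limit == 0x100) starting at offset 0x104, this computes
   new_off == 4 and adj_off == 0x100, so the scratch register is set to
   base + 0x100 and the two ldp/stp instructions use offsets 4 and 12
   from it.  */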
13564 msize = GET_MODE_SIZE (mode);
13565 stp_off_limit = msize * 0x40;
13566 off_val = INTVAL (offset);
13567 abs_off = (off_val < 0) ? -off_val : off_val;
13568 new_off = abs_off % stp_off_limit;
13569 adj_off = abs_off - new_off;
13571 /* Further adjust to make sure all offsets are OK. */
13572 if ((new_off + msize * 2) >= stp_off_limit)
13574 adj_off += stp_off_limit;
13575 new_off -= stp_off_limit;
13578 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13579 if (adj_off >= 0x1000)
13580 return false;
13582 if (off_val < 0)
13584 adj_off = -adj_off;
13585 new_off = -new_off;
13588 /* Create new memory references. */
13589 mem_1 = change_address (mem_1, VOIDmode,
13590 plus_constant (DImode, operands[8], new_off));
13592 /* Check if the adjusted address is OK for ldp/stp. */
13593 if (!aarch64_mem_pair_operand (mem_1, mode))
13594 return false;
13596 msize = GET_MODE_SIZE (mode);
13597 mem_2 = change_address (mem_2, VOIDmode,
13598 plus_constant (DImode,
13599 operands[8],
13600 new_off + msize));
13601 mem_3 = change_address (mem_3, VOIDmode,
13602 plus_constant (DImode,
13603 operands[8],
13604 new_off + msize * 2));
13605 mem_4 = change_address (mem_4, VOIDmode,
13606 plus_constant (DImode,
13607 operands[8],
13608 new_off + msize * 3));
13610 if (code == ZERO_EXTEND)
13612 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13613 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13614 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13615 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13617 else if (code == SIGN_EXTEND)
13619 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13620 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13621 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13622 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13625 if (load)
13627 operands[1] = mem_1;
13628 operands[3] = mem_2;
13629 operands[5] = mem_3;
13630 operands[7] = mem_4;
13632 else
13634 operands[0] = mem_1;
13635 operands[2] = mem_2;
13636 operands[4] = mem_3;
13637 operands[6] = mem_4;
13640 /* Emit adjusting instruction. */
13641 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13642 /* Emit ldp/stp instructions. */
13643 t1 = gen_rtx_SET (operands[0], operands[1]);
13644 t2 = gen_rtx_SET (operands[2], operands[3]);
13645 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13646 t1 = gen_rtx_SET (operands[4], operands[5]);
13647 t2 = gen_rtx_SET (operands[6], operands[7]);
13648 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13649 return true;
13652 /* Return true if a pseudo register should be created and used to hold
13653 the GOT address for PIC code. */
13655 bool
13656 aarch64_use_pseudo_pic_reg (void)
13658 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13661 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13663 static int
13664 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13666 switch (XINT (x, 1))
13668 case UNSPEC_GOTSMALLPIC:
13669 case UNSPEC_GOTSMALLPIC28K:
13670 case UNSPEC_GOTTINYPIC:
13671 return 0;
13672 default:
13673 break;
13676 return default_unspec_may_trap_p (x, flags);
13680 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13681 return the log2 of that value. Otherwise return -1. */
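/* For example (illustrative): 8.0 yields 3, while 0.5, -4.0 and 3.0 all
   yield -1.  */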
13684 int aarch64_fpconst_pow_of_2 (rtx x)
13686 const REAL_VALUE_TYPE *r;
13688 if (!CONST_DOUBLE_P (x))
13689 return -1;
13691 r = CONST_DOUBLE_REAL_VALUE (x);
13693 if (REAL_VALUE_NEGATIVE (*r)
13694 || REAL_VALUE_ISNAN (*r)
13695 || REAL_VALUE_ISINF (*r)
13696 || !real_isinteger (r, DFmode))
13697 return -1;
13699 return exact_log2 (real_to_integer (r));
13702 /* If X is a vector of equal CONST_DOUBLE values and that value is
13703 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13706 int aarch64_vec_fpconst_pow_of_2 (rtx x)
13708 if (GET_CODE (x) != CONST_VECTOR)
13709 return -1;
13711 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13712 return -1;
13714 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13715 if (firstval <= 0)
13716 return -1;
13718 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13719 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13720 return -1;
13722 return firstval;
13725 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13726 static tree
13727 aarch64_promoted_type (const_tree t)
13729 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13730 return float_type_node;
13731 return NULL_TREE;
13734 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13736 static bool
13737 aarch64_optab_supported_p (int op, machine_mode, machine_mode,
13738 optimization_type opt_type)
13740 switch (op)
13742 case rsqrt_optab:
13743 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
13745 default:
13746 return true;
13750 #undef TARGET_ADDRESS_COST
13751 #define TARGET_ADDRESS_COST aarch64_address_cost
13753 /* This hook determines whether unnamed bitfields affect the alignment
13754 of the containing structure. The hook returns true if the structure
13755 should inherit the alignment requirements of an unnamed bitfield's
13756 type. */
13757 #undef TARGET_ALIGN_ANON_BITFIELD
13758 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
13760 #undef TARGET_ASM_ALIGNED_DI_OP
13761 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
13763 #undef TARGET_ASM_ALIGNED_HI_OP
13764 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
13766 #undef TARGET_ASM_ALIGNED_SI_OP
13767 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
13769 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
13770 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
13771 hook_bool_const_tree_hwi_hwi_const_tree_true
13773 #undef TARGET_ASM_FILE_START
13774 #define TARGET_ASM_FILE_START aarch64_start_file
13776 #undef TARGET_ASM_OUTPUT_MI_THUNK
13777 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
13779 #undef TARGET_ASM_SELECT_RTX_SECTION
13780 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
13782 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
13783 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
13785 #undef TARGET_BUILD_BUILTIN_VA_LIST
13786 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
13788 #undef TARGET_CALLEE_COPIES
13789 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
13791 #undef TARGET_CAN_ELIMINATE
13792 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
13794 #undef TARGET_CAN_INLINE_P
13795 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
13797 #undef TARGET_CANNOT_FORCE_CONST_MEM
13798 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
13800 #undef TARGET_CASE_VALUES_THRESHOLD
13801 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
13803 #undef TARGET_CONDITIONAL_REGISTER_USAGE
13804 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
13806 /* Only the least significant bit is used for initialization guard
13807 variables. */
13808 #undef TARGET_CXX_GUARD_MASK_BIT
13809 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
13811 #undef TARGET_C_MODE_FOR_SUFFIX
13812 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
13814 #ifdef TARGET_BIG_ENDIAN_DEFAULT
13815 #undef TARGET_DEFAULT_TARGET_FLAGS
13816 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
13817 #endif
13819 #undef TARGET_CLASS_MAX_NREGS
13820 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
13822 #undef TARGET_BUILTIN_DECL
13823 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
13825 #undef TARGET_BUILTIN_RECIPROCAL
13826 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
13828 #undef TARGET_EXPAND_BUILTIN
13829 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
13831 #undef TARGET_EXPAND_BUILTIN_VA_START
13832 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
13834 #undef TARGET_FOLD_BUILTIN
13835 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
13837 #undef TARGET_FUNCTION_ARG
13838 #define TARGET_FUNCTION_ARG aarch64_function_arg
13840 #undef TARGET_FUNCTION_ARG_ADVANCE
13841 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
13843 #undef TARGET_FUNCTION_ARG_BOUNDARY
13844 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
13846 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
13847 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
13849 #undef TARGET_FUNCTION_VALUE
13850 #define TARGET_FUNCTION_VALUE aarch64_function_value
13852 #undef TARGET_FUNCTION_VALUE_REGNO_P
13853 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
13855 #undef TARGET_FRAME_POINTER_REQUIRED
13856 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
13858 #undef TARGET_GIMPLE_FOLD_BUILTIN
13859 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
13861 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
13862 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
13864 #undef TARGET_INIT_BUILTINS
13865 #define TARGET_INIT_BUILTINS aarch64_init_builtins
13867 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
13868 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
13869 aarch64_ira_change_pseudo_allocno_class
13871 #undef TARGET_LEGITIMATE_ADDRESS_P
13872 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
13874 #undef TARGET_LEGITIMATE_CONSTANT_P
13875 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
13877 #undef TARGET_LIBGCC_CMP_RETURN_MODE
13878 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
13880 #undef TARGET_LRA_P
13881 #define TARGET_LRA_P hook_bool_void_true
13883 #undef TARGET_MANGLE_TYPE
13884 #define TARGET_MANGLE_TYPE aarch64_mangle_type
13886 #undef TARGET_MEMORY_MOVE_COST
13887 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
13889 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
13890 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
13892 #undef TARGET_MUST_PASS_IN_STACK
13893 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
13895 /* This target hook should return true if accesses to volatile bitfields
13896 should use the narrowest mode possible. It should return false if these
13897 accesses should use the bitfield container type. */
13898 #undef TARGET_NARROW_VOLATILE_BITFIELD
13899 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
13901 #undef TARGET_OPTION_OVERRIDE
13902 #define TARGET_OPTION_OVERRIDE aarch64_override_options
13904 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
13905 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
13906 aarch64_override_options_after_change
13908 #undef TARGET_OPTION_SAVE
13909 #define TARGET_OPTION_SAVE aarch64_option_save
13911 #undef TARGET_OPTION_RESTORE
13912 #define TARGET_OPTION_RESTORE aarch64_option_restore
13914 #undef TARGET_OPTION_PRINT
13915 #define TARGET_OPTION_PRINT aarch64_option_print
13917 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
13918 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
13920 #undef TARGET_SET_CURRENT_FUNCTION
13921 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
13923 #undef TARGET_PASS_BY_REFERENCE
13924 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
13926 #undef TARGET_PREFERRED_RELOAD_CLASS
13927 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
13929 #undef TARGET_SCHED_REASSOCIATION_WIDTH
13930 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
13932 #undef TARGET_PROMOTED_TYPE
13933 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
13935 #undef TARGET_SECONDARY_RELOAD
13936 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
13938 #undef TARGET_SHIFT_TRUNCATION_MASK
13939 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
13941 #undef TARGET_SETUP_INCOMING_VARARGS
13942 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
13944 #undef TARGET_STRUCT_VALUE_RTX
13945 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
13947 #undef TARGET_REGISTER_MOVE_COST
13948 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
13950 #undef TARGET_RETURN_IN_MEMORY
13951 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
13953 #undef TARGET_RETURN_IN_MSB
13954 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
13956 #undef TARGET_RTX_COSTS
13957 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
13959 #undef TARGET_SCHED_ISSUE_RATE
13960 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
13962 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
13963 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
13964 aarch64_sched_first_cycle_multipass_dfa_lookahead
13966 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
13967 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
13968 aarch64_first_cycle_multipass_dfa_lookahead_guard
13970 #undef TARGET_TRAMPOLINE_INIT
13971 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
13973 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
13974 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
13976 #undef TARGET_VECTOR_MODE_SUPPORTED_P
13977 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
13979 #undef TARGET_ARRAY_MODE_SUPPORTED_P
13980 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
13982 #undef TARGET_VECTORIZE_ADD_STMT_COST
13983 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
13985 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
13986 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
13987 aarch64_builtin_vectorization_cost
13989 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
13990 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
13992 #undef TARGET_VECTORIZE_BUILTINS
13993 #define TARGET_VECTORIZE_BUILTINS
13995 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
13996 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
13997 aarch64_builtin_vectorized_function
13999 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14000 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14001 aarch64_autovectorize_vector_sizes
14003 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14004 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14005 aarch64_atomic_assign_expand_fenv
14007 /* Section anchor support. */
14009 #undef TARGET_MIN_ANCHOR_OFFSET
14010 #define TARGET_MIN_ANCHOR_OFFSET -256
14012 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14013 byte offset; we can do much more for larger data types, but have no way
14014 to determine the size of the access. We assume accesses are aligned. */
14015 #undef TARGET_MAX_ANCHOR_OFFSET
14016 #define TARGET_MAX_ANCHOR_OFFSET 4095
14018 #undef TARGET_VECTOR_ALIGNMENT
14019 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14021 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14022 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14023 aarch64_simd_vector_alignment_reachable
14025 /* vec_perm support. */
14027 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14028 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14029 aarch64_vectorize_vec_perm_const_ok
14031 #undef TARGET_INIT_LIBFUNCS
14032 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14034 #undef TARGET_FIXED_CONDITION_CODE_REGS
14035 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14037 #undef TARGET_FLAGS_REGNUM
14038 #define TARGET_FLAGS_REGNUM CC_REGNUM
14040 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14041 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14043 #undef TARGET_ASAN_SHADOW_OFFSET
14044 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14046 #undef TARGET_LEGITIMIZE_ADDRESS
14047 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14049 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14050 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14051 aarch64_use_by_pieces_infrastructure_p
14053 #undef TARGET_CAN_USE_DOLOOP_P
14054 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14056 #undef TARGET_SCHED_MACRO_FUSION_P
14057 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14059 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14060 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14062 #undef TARGET_SCHED_FUSION_PRIORITY
14063 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14065 #undef TARGET_UNSPEC_MAY_TRAP_P
14066 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14068 #undef TARGET_USE_PSEUDO_PIC_REG
14069 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14071 #undef TARGET_PRINT_OPERAND
14072 #define TARGET_PRINT_OPERAND aarch64_print_operand
14074 #undef TARGET_PRINT_OPERAND_ADDRESS
14075 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14077 #undef TARGET_OPTAB_SUPPORTED_P
14078 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14080 #undef TARGET_OMIT_STRUCT_RETURN_REG
14081 #define TARGET_OMIT_STRUCT_RETURN_REG true
14083 struct gcc_target targetm = TARGET_INITIALIZER;
14085 #include "gt-aarch64.h"