1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
25 #include "insn-codes.h"
27 #include "insn-attr.h"
31 #include "double-int.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
44 #include "dominance.h"
50 #include "cfgcleanup.h"
52 #include "basic-block.h"
54 #include "hard-reg-set.h"
59 #include "statistics.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
72 #include "target-def.h"
73 #include "targhooks.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
84 #include "gimple-expr.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
106 A simple base register plus immediate offset.
109 A base register indexed by immediate offset with writeback.
112 A base register indexed by (optionally scaled) register.
115 A base register indexed by (optionally scaled) zero-extended register.
118 A base register indexed by (optionally scaled) sign-extended register.
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
124 A constant symbolic address, in pc-relative literal pool. */
126 enum aarch64_address_type
{
136 struct aarch64_address_info
{
137 enum aarch64_address_type type
;
141 enum aarch64_symbol_type symbol_type
;
144 struct simd_immediate_info
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel
;
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
161 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
164 machine_mode
*, int *,
166 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
167 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode
);
170 static unsigned bit_count (unsigned HOST_WIDE_INT
);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
172 const unsigned char *sel
);
173 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version
;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune
= cortexa53
;
181 /* The current tuning set. */
182 const struct tune_params
*aarch64_tune_params
;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags
= 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags
= 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table
=
202 0, /* register_offset */
203 0, /* register_extend */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table
=
217 0, /* register_offset */
218 0, /* register_extend */
222 static const struct cpu_addrcost_table xgene1_addrcost_table
=
232 0, /* register_offset */
233 1, /* register_extend */
237 static const struct cpu_regmove_cost generic_regmove_cost
=
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
267 static const struct cpu_regmove_cost thunderx_regmove_cost
=
275 static const struct cpu_regmove_cost xgene1_regmove_cost
=
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost
=
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost
=
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost
=
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
343 static const struct tune_params generic_tunings
=
345 &cortexa57_extra_costs
,
346 &generic_addrcost_table
,
347 &generic_regmove_cost
,
348 &generic_vector_cost
,
351 AARCH64_FUSE_NOTHING
, /* fuseable_ops */
352 8, /* function_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings
=
362 &cortexa53_extra_costs
,
363 &generic_addrcost_table
,
364 &cortexa53_regmove_cost
,
365 &generic_vector_cost
,
368 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fuseable_ops */
370 8, /* function_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings
=
380 &cortexa57_extra_costs
,
381 &cortexa57_addrcost_table
,
382 &cortexa57_regmove_cost
,
383 &cortexa57_vector_cost
,
386 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
388 16, /* function_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings
=
398 &thunderx_extra_costs
,
399 &generic_addrcost_table
,
400 &thunderx_regmove_cost
,
401 &generic_vector_cost
,
404 AARCH64_FUSE_CMP_BRANCH
, /* fuseable_ops */
405 8, /* function_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings
=
416 &xgene1_addrcost_table
,
417 &xgene1_regmove_cost
,
421 AARCH64_FUSE_NOTHING
, /* fuseable_ops */
422 16, /* function_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
433 const char *const name
;
434 enum aarch64_processor core
;
436 unsigned architecture_version
;
437 const unsigned long flags
;
438 const struct tune_params
*const tune
;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores
[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
448 {"generic", cortexa53
, "8", 8, AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
449 {NULL
, aarch64_none
, NULL
, 0, 0, NULL
}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures
[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
459 {NULL
, aarch64_none
, NULL
, 0, 0, NULL
}
462 /* Target specification. These are populated as commandline arguments
463 are processed, or NULL if not specified. */
464 static const struct processor
*selected_arch
;
465 static const struct processor
*selected_cpu
;
466 static const struct processor
*selected_tune
;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name
;
474 const unsigned long flags_on
;
475 const unsigned long flags_off
;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions
[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode
;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks
[AARCH64_NUM_BITMASKS
];
498 typedef enum aarch64_cond_code
500 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
501 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
502 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes
[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED
)
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
523 enum machine_mode mode
)
525 if (VECTOR_MODE_P (mode
))
526 return aarch64_tune_params
->vec_reassoc_width
;
527 if (INTEGRAL_MODE_P (mode
))
528 return aarch64_tune_params
->int_reassoc_width
;
529 if (FLOAT_MODE_P (mode
))
530 return aarch64_tune_params
->fp_reassoc_width
;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
536 aarch64_dbx_register_number (unsigned regno
)
538 if (GP_REGNUM_P (regno
))
539 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
540 else if (regno
== SP_REGNUM
)
541 return AARCH64_DWARF_SP
;
542 else if (FP_REGNUM_P (regno
))
543 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS
;
550 /* Return TRUE if MODE is any of the large INT modes. */
552 aarch64_vect_struct_mode_p (machine_mode mode
)
554 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
557 /* Return TRUE if MODE is any of the vector modes. */
559 aarch64_vector_mode_p (machine_mode mode
)
561 return aarch64_vector_mode_supported_p (mode
)
562 || aarch64_vect_struct_mode_p (mode
);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
567 aarch64_array_mode_supported_p (machine_mode mode
,
568 unsigned HOST_WIDE_INT nelems
)
571 && AARCH64_VALID_SIMD_QREG_MODE (mode
)
572 && (nelems
>= 2 && nelems
<= 4))
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
583 switch (aarch64_regno_regclass (regno
))
587 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
589 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
599 if (GET_MODE_CLASS (mode
) == MODE_CC
)
600 return regno
== CC_REGNUM
;
602 if (regno
== SP_REGNUM
)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode
== Pmode
|| mode
== ptr_mode
;
608 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
609 return mode
== Pmode
;
611 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
614 if (FP_REGNUM_P (regno
))
616 if (aarch64_vect_struct_mode_p (mode
))
618 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
628 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
631 /* Handle modes that fit within single registers. */
632 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
634 if (GET_MODE_SIZE (mode
) >= 4)
639 /* Fall back to generic for multi-reg and very large modes. */
641 return choose_hard_reg_mode (regno
, nregs
, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (ie called via a register). */
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (ie called via a register). */
655 aarch64_is_long_call_p (rtx sym
)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
666 aarch64_is_extend_from_extract (machine_mode mode
, rtx mult_imm
,
669 HOST_WIDE_INT mult_val
, extract_val
;
671 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
674 mult_val
= INTVAL (mult_imm
);
675 extract_val
= INTVAL (extract_imm
);
678 && extract_val
< GET_MODE_BITSIZE (mode
)
679 && exact_log2 (extract_val
& ~7) > 0
680 && (extract_val
& 7) <= 4
681 && mult_val
== (1 << (extract_val
& 7)))
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
690 emit_set_insn (rtx x
, rtx y
)
692 return emit_insn (gen_rtx_SET (VOIDmode
, x
, y
));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
700 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
701 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
703 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc
;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc
)
715 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc
;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr
)
724 enum tls_model tls_kind
= TLS_MODEL_NONE
;
727 if (GET_CODE (addr
) == CONST
)
729 split_const (addr
, &sym
, &addend
);
730 if (GET_CODE (sym
) == SYMBOL_REF
)
731 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
733 else if (GET_CODE (addr
) == SYMBOL_REF
)
734 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
739 /* We'll allow lo_sum's in addresses in our legitimate addresses
740 so that combine would take care of combining addresses where
741 necessary, but for generation purposes, we'll generate the address
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
758 add dest, tmp, #:tlsgd_lo12:imm
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
782 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
783 enum aarch64_symbol_type type
)
787 case SYMBOL_SMALL_ABSOLUTE
:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
791 machine_mode mode
= GET_MODE (dest
);
793 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
795 if (can_create_pseudo_p ())
796 tmp_reg
= gen_reg_rtx (mode
);
798 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
799 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
803 case SYMBOL_TINY_ABSOLUTE
:
804 emit_insn (gen_rtx_SET (Pmode
, dest
, imm
));
807 case SYMBOL_SMALL_GOT
:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. in the memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
817 machine_mode mode
= GET_MODE (dest
);
819 if (can_create_pseudo_p ())
820 tmp_reg
= gen_reg_rtx (mode
);
822 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
823 if (mode
== ptr_mode
)
826 emit_insn (gen_ldr_got_small_di (dest
, tmp_reg
, imm
));
828 emit_insn (gen_ldr_got_small_si (dest
, tmp_reg
, imm
));
832 gcc_assert (mode
== Pmode
);
833 emit_insn (gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
));
839 case SYMBOL_SMALL_TLSGD
:
842 rtx result
= gen_rtx_REG (Pmode
, R0_REGNUM
);
845 aarch64_emit_call_insn (gen_tlsgd_small (result
, imm
));
846 insns
= get_insns ();
849 RTL_CONST_CALL_P (insns
) = 1;
850 emit_libcall_block (insns
, dest
, result
, imm
);
854 case SYMBOL_SMALL_TLSDESC
:
856 machine_mode mode
= GET_MODE (dest
);
857 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
860 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
865 emit_insn (gen_tlsdesc_small_si (imm
));
867 emit_insn (gen_tlsdesc_small_di (imm
));
868 tp
= aarch64_load_tp (NULL
);
871 tp
= gen_lowpart (mode
, tp
);
873 emit_insn (gen_rtx_SET (mode
, dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
878 case SYMBOL_SMALL_GOTTPREL
:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. in the memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode
= GET_MODE (dest
);
888 rtx tmp_reg
= gen_reg_rtx (mode
);
889 rtx tp
= aarch64_load_tp (NULL
);
891 if (mode
== ptr_mode
)
894 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
897 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
898 tp
= gen_lowpart (mode
, tp
);
903 gcc_assert (mode
== Pmode
);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
907 emit_insn (gen_rtx_SET (mode
, dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
912 case SYMBOL_SMALL_TPREL
:
914 rtx tp
= aarch64_load_tp (NULL
);
916 if (GET_MODE (dest
) != Pmode
)
917 tp
= gen_lowpart (GET_MODE (dest
), tp
);
919 emit_insn (gen_tlsle_small (dest
, tp
, imm
));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
924 case SYMBOL_TINY_GOT
:
925 emit_insn (gen_ldr_got_tiny (dest
, imm
));
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
939 aarch64_emit_move (rtx dest
, rtx src
)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest
, src
)
943 : emit_move_insn_1 (dest
, src
));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
953 aarch64_split_128bit_move (rtx dst
, rtx src
)
958 machine_mode mode
= GET_MODE (dst
);
960 gcc_assert (mode
== TImode
|| mode
== TFmode
);
961 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
962 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
964 if (REG_P (dst
) && REG_P (src
))
966 int src_regno
= REGNO (src
);
967 int dst_regno
= REGNO (dst
);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
972 src_lo
= gen_lowpart (word_mode
, src
);
973 src_hi
= gen_highpart (word_mode
, src
);
977 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
978 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
982 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
983 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
987 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
989 dst_lo
= gen_lowpart (word_mode
, dst
);
990 dst_hi
= gen_highpart (word_mode
, dst
);
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1006 dst_lo
= gen_lowpart (word_mode
, dst
);
1007 dst_hi
= gen_highpart (word_mode
, dst
);
1008 src_lo
= gen_lowpart (word_mode
, src
);
1009 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1014 aarch64_emit_move (dst_hi
, src_hi
);
1015 aarch64_emit_move (dst_lo
, src_lo
);
1019 aarch64_emit_move (dst_lo
, src_lo
);
1020 aarch64_emit_move (dst_hi
, src_hi
);
1025 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1027 return (! REG_P (src
)
1028 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1031 /* Split a complex SIMD combine. */
1034 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1036 machine_mode src_mode
= GET_MODE (src1
);
1037 machine_mode dst_mode
= GET_MODE (dst
);
1039 gcc_assert (VECTOR_MODE_P (dst_mode
));
1041 if (REG_P (dst
) && REG_P (src1
) && REG_P (src2
))
1043 rtx (*gen
) (rtx
, rtx
, rtx
);
1048 gen
= gen_aarch64_simd_combinev8qi
;
1051 gen
= gen_aarch64_simd_combinev4hi
;
1054 gen
= gen_aarch64_simd_combinev2si
;
1057 gen
= gen_aarch64_simd_combinev2sf
;
1060 gen
= gen_aarch64_simd_combinedi
;
1063 gen
= gen_aarch64_simd_combinedf
;
1069 emit_insn (gen (dst
, src1
, src2
));
1074 /* Split a complex SIMD move. */
1077 aarch64_split_simd_move (rtx dst
, rtx src
)
1079 machine_mode src_mode
= GET_MODE (src
);
1080 machine_mode dst_mode
= GET_MODE (dst
);
1082 gcc_assert (VECTOR_MODE_P (dst_mode
));
1084 if (REG_P (dst
) && REG_P (src
))
1086 rtx (*gen
) (rtx
, rtx
);
1088 gcc_assert (VECTOR_MODE_P (src_mode
));
1093 gen
= gen_aarch64_split_simd_movv16qi
;
1096 gen
= gen_aarch64_split_simd_movv8hi
;
1099 gen
= gen_aarch64_split_simd_movv4si
;
1102 gen
= gen_aarch64_split_simd_movv2di
;
1105 gen
= gen_aarch64_split_simd_movv4sf
;
1108 gen
= gen_aarch64_split_simd_movv2df
;
1114 emit_insn (gen (dst
, src
));
1120 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode
, value
);
1126 x
= aarch64_emit_move (x
, value
);
1133 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1135 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high
= GEN_INT (offset
);
1142 high
= aarch64_force_temporary (mode
, temp
, high
);
1143 reg
= aarch64_force_temporary (mode
, temp
,
1144 gen_rtx_PLUS (mode
, high
, reg
));
1146 return plus_constant (mode
, reg
, offset
);
1150 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1153 unsigned HOST_WIDE_INT mask
;
1156 unsigned HOST_WIDE_INT val
;
1159 int one_match
, zero_match
, first_not_ffff_match
;
1162 if (CONST_INT_P (imm
) && aarch64_move_imm (INTVAL (imm
), mode
))
1165 emit_insn (gen_rtx_SET (VOIDmode
, dest
, imm
));
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1177 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1178 GEN_INT (INTVAL (imm
) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm
) >> 16) & 0xffff)));
1186 /* Remaining cases are all for DImode. */
1189 subtargets
= optimize
&& can_create_pseudo_p ();
1194 first_not_ffff_match
= -1;
1196 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1198 if ((val
& mask
) == mask
)
1202 if (first_not_ffff_match
< 0)
1203 first_not_ffff_match
= i
;
1204 if ((val
& mask
) == 0)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask
= 0xffffll
<< first_not_ffff_match
;
1215 emit_insn (gen_rtx_SET (VOIDmode
, dest
, GEN_INT (val
| mask
)));
1216 emit_insn (gen_insv_immdi (dest
, GEN_INT (first_not_ffff_match
),
1217 GEN_INT ((val
>> first_not_ffff_match
)
1224 if (zero_match
== 2)
1225 goto simple_sequence
;
1227 mask
= 0x0ffff0000UL
;
1228 for (i
= 16; i
< 64; i
+= 16, mask
<<= 16)
1230 HOST_WIDE_INT comp
= mask
& ~(mask
- 1);
1232 if (aarch64_uimm12_shift (val
- (val
& mask
)))
1236 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1237 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1238 GEN_INT (val
& mask
)));
1239 emit_insn (gen_adddi3 (dest
, subtarget
,
1240 GEN_INT (val
- (val
& mask
))));
1245 else if (aarch64_uimm12_shift (-(val
- ((val
+ comp
) & mask
))))
1249 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1250 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1251 GEN_INT ((val
+ comp
) & mask
)));
1252 emit_insn (gen_adddi3 (dest
, subtarget
,
1253 GEN_INT (val
- ((val
+ comp
) & mask
))));
1258 else if (aarch64_uimm12_shift (val
- ((val
- comp
) | ~mask
)))
1262 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1263 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1264 GEN_INT ((val
- comp
) | ~mask
)));
1265 emit_insn (gen_adddi3 (dest
, subtarget
,
1266 GEN_INT (val
- ((val
- comp
) | ~mask
))));
1271 else if (aarch64_uimm12_shift (-(val
- (val
| ~mask
))))
1275 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1276 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1277 GEN_INT (val
| ~mask
)));
1278 emit_insn (gen_adddi3 (dest
, subtarget
,
1279 GEN_INT (val
- (val
| ~mask
))));
1286 /* See if we can do it by arithmetically combining two
1288 for (i
= 0; i
< AARCH64_NUM_BITMASKS
; i
++)
1293 if (aarch64_uimm12_shift (val
- aarch64_bitmasks
[i
])
1294 || aarch64_uimm12_shift (-val
+ aarch64_bitmasks
[i
]))
1298 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1299 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1300 GEN_INT (aarch64_bitmasks
[i
])));
1301 emit_insn (gen_adddi3 (dest
, subtarget
,
1302 GEN_INT (val
- aarch64_bitmasks
[i
])));
1308 for (j
= 0; j
< 64; j
+= 16, mask
<<= 16)
1310 if ((aarch64_bitmasks
[i
] & ~mask
) == (val
& ~mask
))
1314 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1315 GEN_INT (aarch64_bitmasks
[i
])));
1316 emit_insn (gen_insv_immdi (dest
, GEN_INT (j
),
1317 GEN_INT ((val
>> j
) & 0xffff)));
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i
= 0; i
< AARCH64_NUM_BITMASKS
; i
++)
1328 if ((aarch64_bitmasks
[i
] & val
) == aarch64_bitmasks
[i
])
1332 for (j
= i
+ 1; j
< AARCH64_NUM_BITMASKS
; j
++)
1333 if (val
== (aarch64_bitmasks
[i
] | aarch64_bitmasks
[j
]))
1337 subtarget
= subtargets
? gen_reg_rtx (mode
) : dest
;
1338 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1339 GEN_INT (aarch64_bitmasks
[i
])));
1340 emit_insn (gen_iordi3 (dest
, subtarget
,
1341 GEN_INT (aarch64_bitmasks
[j
])));
1347 else if ((val
& aarch64_bitmasks
[i
]) == val
)
1351 for (j
= i
+ 1; j
< AARCH64_NUM_BITMASKS
; j
++)
1352 if (val
== (aarch64_bitmasks
[j
] & aarch64_bitmasks
[i
]))
1356 subtarget
= subtargets
? gen_reg_rtx (mode
) : dest
;
1357 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1358 GEN_INT (aarch64_bitmasks
[j
])));
1359 emit_insn (gen_anddi3 (dest
, subtarget
,
1360 GEN_INT (aarch64_bitmasks
[i
])));
1368 if (one_match
> zero_match
)
1370 /* Set either first three quarters or all but the third. */
1371 mask
= 0xffffll
<< (16 - first_not_ffff_match
);
1373 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1374 GEN_INT (val
| mask
| 0xffffffff00000000ull
)));
1377 /* Now insert other two quarters. */
1378 for (i
= first_not_ffff_match
+ 16, mask
<<= (first_not_ffff_match
<< 1);
1379 i
< 64; i
+= 16, mask
<<= 16)
1381 if ((val
& mask
) != mask
)
1384 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1385 GEN_INT ((val
>> i
) & 0xffff)));
1395 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1397 if ((val
& mask
) != 0)
1402 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1403 GEN_INT (val
& mask
)));
1410 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1411 GEN_INT ((val
>> i
) & 0xffff)));
/* NOTE(review): extraction-damaged text — the leading integers are the
   original file's line numbers and many original lines (braces, the
   `switch (sty)` header, returns) were dropped.  Comments below annotate
   only what the surviving tokens show; confirm against upstream GCC.  */
/* Expand a move of IMM (symbol, label, const, HIGH, or integer) into DEST.
   DEST must be SImode or DImode (asserted below).  Symbolic operands are
   classified and loaded appropriately; plain integers fall through to
   aarch64_internal_mov_immediate at the end.  */
1422 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1424 machine_mode mode
= GET_MODE (dest
);
1426 gcc_assert (mode
== SImode
|| mode
== DImode
);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm
) == SYMBOL_REF
1430 || GET_CODE (imm
) == LABEL_REF
1431 || GET_CODE (imm
) == CONST
)
1433 rtx mem
, base
, offset
;
1434 enum aarch64_symbol_type sty
;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm
, &base
, &offset
);
1440 sty
= aarch64_classify_symbol (base
, offset
, SYMBOL_CONTEXT_ADR
);
/* Case: symbol must be materialised via the constant pool.  If a non-zero
   offset cannot be forced to memory, add it separately via a temporary.  */
1443 case SYMBOL_FORCE_TO_MEM
:
1444 if (offset
!= const0_rtx
1445 && targetm
.cannot_force_const_mem (mode
, imm
))
1447 gcc_assert (can_create_pseudo_p ());
1448 base
= aarch64_force_temporary (mode
, dest
, base
);
1449 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1450 aarch64_emit_move (dest
, base
);
/* Force the constant into the pool in ptr_mode; zero-extend the MEM if
   DEST is wider than ptr_mode (ILP32 case — confirm).  */
1453 mem
= force_const_mem (ptr_mode
, imm
);
1455 if (mode
!= ptr_mode
)
1456 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
1457 emit_insn (gen_rtx_SET (VOIDmode
, dest
, mem
));
/* TLS and GOT-relative symbol kinds: any addend must be added separately,
   since the relocation applies to the bare symbol only.  */
1460 case SYMBOL_SMALL_TLSGD
:
1461 case SYMBOL_SMALL_TLSDESC
:
1462 case SYMBOL_SMALL_GOTTPREL
:
1463 case SYMBOL_SMALL_GOT
:
1464 case SYMBOL_TINY_GOT
:
1465 if (offset
!= const0_rtx
)
1467 gcc_assert(can_create_pseudo_p ());
1468 base
= aarch64_force_temporary (mode
, dest
, base
);
1469 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1470 aarch64_emit_move (dest
, base
);
/* Directly addressable symbol kinds: emit the appropriate load sequence.  */
1475 case SYMBOL_SMALL_TPREL
:
1476 case SYMBOL_SMALL_ABSOLUTE
:
1477 case SYMBOL_TINY_ABSOLUTE
:
1478 aarch64_load_symref_appropriately (dest
, imm
, sty
);
/* Non-CONST_INT, non-symbol immediates: HIGH parts are emitted as-is,
   anything else goes through the constant pool.  */
1486 if (!CONST_INT_P (imm
))
1488 if (GET_CODE (imm
) == HIGH
)
1489 emit_insn (gen_rtx_SET (VOIDmode
, dest
, imm
));
1492 rtx mem
= force_const_mem (mode
, imm
);
1494 emit_insn (gen_rtx_SET (VOIDmode
, dest
, mem
));
/* Plain integer immediate: defer to the MOVZ/MOVN/MOVK expander.  */
1500 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  Both parameters are unused;
   the surviving comment says sibcalls are currently always allowed
   (the `return true;` line was lost in extraction — confirm upstream).  */
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
1505 tree exp ATTRIBUTE_UNUSED
)
1507 /* Currently, always true. */
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
/* Implement TARGET_PASS_BY_REFERENCE (AAPCS64): decide whether an argument
   of MODE/TYPE is passed by reference.  Size is taken from TYPE for BLKmode,
   otherwise from the mode; the final rule passes by reference anything
   larger than 2 registers unless it is an HFA/HVA candidate.
   NOTE(review): several parameter lines and branches were dropped by the
   extraction (mode/type/size declarations are not visible here).  */
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
1517 bool named ATTRIBUTE_UNUSED
)
1520 machine_mode dummymode
;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size
= (mode
== BLKmode
&& type
)
1525 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type
&& AGGREGATE_TYPE_P (type
))
1530 size
= int_size_in_bytes (type
);
1533 /* Variable sized arguments are always returned by reference. */
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogenous floating point
1546 return size
> 2 * UNITS_PER_WORD
;
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
/* Return true if VALTYPE is returned padded into the most significant
   bits (big-endian composite return rule).  Always false on little-endian;
   false for non-composites, for sizes outside (0,16], and for HFA/HVA
   candidates, which live in the low bits of fp/simd registers.  */
1551 aarch64_return_in_msb (const_tree valtype
)
1553 machine_mode dummy_mode
;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN
)
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
1563 || int_size_in_bytes (valtype
) <= 0
1564 || int_size_in_bytes (valtype
) > 16)
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
1572 &dummy_mode
, &dummy_int
, NULL
))
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
/* Implement TARGET_FUNCTION_VALUE: compute the RTX for a function's return
   value of TYPE.  Integral types are promoted; MSB-returned composites get
   an integer mode big enough for the rounded-up size; HFA/HVA candidates
   are returned in V0..Vn (a PARALLEL of EXPR_LISTs for multi-register
   aggregates); everything else comes back in R0.
   NOTE(review): extraction dropped unsignedp/count/i/par declarations.  */
1582 aarch64_function_value (const_tree type
, const_tree func
,
1583 bool outgoing ATTRIBUTE_UNUSED
)
1588 machine_mode ag_mode
;
1590 mode
= TYPE_MODE (type
);
1591 if (INTEGRAL_TYPE_P (type
))
1592 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
/* Big-endian MSB-padded return: widen to a whole number of words.  */
1594 if (aarch64_return_in_msb (type
))
1596 HOST_WIDE_INT size
= int_size_in_bytes (type
);
1598 if (size
% UNITS_PER_WORD
!= 0)
1600 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
1601 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
/* HFA/HVA: single register for scalars, PARALLEL over V0+i otherwise.  */
1605 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
1606 &ag_mode
, &count
, NULL
))
1608 if (!aarch64_composite_type_p (type
, mode
))
1610 gcc_assert (count
== 1 && mode
== ag_mode
);
1611 return gen_rtx_REG (mode
, V0_REGNUM
);
1618 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
1619 for (i
= 0; i
< count
; i
++)
1621 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
1622 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
1623 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
1624 XVECEXP (par
, 0, i
) = tmp
;
/* General-register return.  */
1630 return gen_rtx_REG (mode
, R0_REGNUM
);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
/* Implement TARGET_FUNCTION_VALUE_REGNO_P: true if REGNO can carry part of
   a function's return value — X0/X1 for integers and small structs, or
   V0..V(HA_MAX_NUM_FLDS-1) when fp/simd registers are available.  */
1638 aarch64_function_value_regno_p (const unsigned int regno
)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
1649 return !TARGET_GENERAL_REGS_ONLY
;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
/* Implement TARGET_RETURN_IN_MEMORY: scalars and HFA/HVA candidates are
   returned in registers; other types are returned in memory when their
   size is negative (variable) or exceeds two registers.  */
1664 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
1667 machine_mode ag_mode
;
1670 if (!AGGREGATE_TYPE_P (type
)
1671 && TREE_CODE (type
) != COMPLEX_TYPE
1672 && TREE_CODE (type
) != VECTOR_TYPE
)
1673 /* Simple scalar types always returned in registers. */
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
1683 /* Types larger than 2 registers returned in memory. */
1684 size
= int_size_in_bytes (type
);
1685 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
/* Thin wrapper: test whether an argument of MODE/TYPE is an fp/simd
   (HFA/HVA) candidate, recording the element mode in the cumulative-args
   state and the register count in *NREGS (trailing arguments lost in
   extraction — confirm upstream).  */
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
1690 const_tree type
, int *nregs
)
1692 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1693 return aarch64_vfp_is_call_or_return_candidate (mode
,
1695 &pcum
->aapcs_vfp_rmode
,
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
/* Compute the AAPCS64 natural alignment (in bits) of an argument of
   MODE/TYPE: TYPE_ALIGN when the type's own mode matches MODE, otherwise
   the mode's alignment; mode alignment also when TYPE is absent or
   zero-sized (the guarding branches were dropped by extraction).  */
1706 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
1708 unsigned int alignment
;
1712 if (!integer_zerop (TYPE_SIZE (type
)))
1714 if (TYPE_MODE (type
) == mode
)
1715 alignment
= TYPE_ALIGN (type
);
1717 alignment
= GET_MODE_ALIGNMENT (mode
);
1723 alignment
= GET_MODE_ALIGNMENT (mode
);
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
/* Layout one argument per the AAPCS64 rules (rule numbers C.1-C.14 refer
   to that document).  Tries SIMD/FP registers first, then general
   registers, then the stack; results are recorded in *PCUM
   (aapcs_reg / aapcs_nextnvrn / aapcs_nextncrn / aapcs_stack_words).
   NOTE(review): extraction-damaged — `size`, `i`, `par` declarations and
   several closing braces/else branches are missing from this view.  */
1732 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
1734 bool named ATTRIBUTE_UNUSED
)
1736 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1737 int ncrn
, nvrn
, nregs
;
1738 bool allocate_ncrn
, allocate_nvrn
;
1741 /* We need to do this once per argument. */
1742 if (pcum
->aapcs_arg_processed
)
1745 pcum
->aapcs_arg_processed
= true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1749 = AARCH64_ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
1752 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
1753 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
1758 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn
= pcum
->aapcs_nvrn
;
1763 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1764 and homogenous short-vector aggregates (HVA). */
1767 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
1769 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
1770 if (!aarch64_composite_type_p (type
, mode
))
1772 gcc_assert (nregs
== 1);
1773 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
/* Composite HFA/HVA: build a PARALLEL of (reg, byte-offset) pairs.  */
1779 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
1780 for (i
= 0; i
< nregs
; i
++)
1782 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
1783 V0_REGNUM
+ nvrn
+ i
);
1784 tmp
= gen_rtx_EXPR_LIST
1786 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
1787 XVECEXP (par
, 0, i
) = tmp
;
1789 pcum
->aapcs_reg
= par
;
1795 /* C.3 NSRN is set to 8. */
1796 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
/* General-register path (C.6-C.9).  */
1801 ncrn
= pcum
->aapcs_ncrn
;
1802 nregs
= size
/ UNITS_PER_WORD
;
1804 /* C6 - C9. though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely general registers. */
1807 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
1809 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
1811 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
1815 if (nregs
== 2 && alignment
== 16 * BITS_PER_UNIT
&& ncrn
% 2)
1818 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
1825 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
/* Two-word non-integer argument: PARALLEL over word-sized pieces.  */
1832 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
1833 for (i
= 0; i
< nregs
; i
++)
1835 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
1836 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
1837 GEN_INT (i
* UNITS_PER_WORD
));
1838 XVECEXP (par
, 0, i
) = tmp
;
1840 pcum
->aapcs_reg
= par
;
1843 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
/* C.11: no general registers left — NGRN is set to 8.  */
1848 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1853 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
1854 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
1855 pcum
->aapcs_stack_size
= AARCH64_ROUND_UP (pcum
->aapcs_stack_size
,
1856 16 / UNITS_PER_WORD
);
1860 /* Implement TARGET_FUNCTION_ARG. */
/* Implement TARGET_FUNCTION_ARG: lay out the current argument (VOIDmode is
   the end-of-args sentinel — its early-return line was lost in extraction)
   and return the register RTX chosen by aarch64_layout_arg, or the cached
   NULL when the argument goes on the stack.  */
1863 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
1864 const_tree type
, bool named
)
1866 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1867 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
1869 if (mode
== VOIDmode
)
1872 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
1873 return pcum
->aapcs_reg
;
/* Initialize *PCUM for scanning a function's arguments: zero all register
   and stack counters, select the AAPCS64 variant, and mark no argument as
   processed yet.  All other parameters are unused.  */
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
1878 const_tree fntype ATTRIBUTE_UNUSED
,
1879 rtx libname ATTRIBUTE_UNUSED
,
1880 const_tree fndecl ATTRIBUTE_UNUSED
,
1881 unsigned n_named ATTRIBUTE_UNUSED
)
1883 pcum
->aapcs_ncrn
= 0;
1884 pcum
->aapcs_nvrn
= 0;
1885 pcum
->aapcs_nextncrn
= 0;
1886 pcum
->aapcs_nextnvrn
= 0;
1887 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
1888 pcum
->aapcs_reg
= NULL_RTX
;
1889 pcum
->aapcs_arg_processed
= false;
1890 pcum
->aapcs_stack_words
= 0;
1891 pcum
->aapcs_stack_size
= 0;
/* Implement TARGET_FUNCTION_ARG_ADVANCE: commit the layout of the current
   argument — exactly one of register/stack must have been chosen (XOR
   assert) — then promote the "next" counters to current and reset the
   per-argument state.  NOTE(review): the mode/type/named parameter lines
   were dropped by extraction.  */
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
1902 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1903 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
1905 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
1906 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
1907 != (pcum
->aapcs_stack_words
!= 0));
1908 pcum
->aapcs_arg_processed
= false;
1909 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
1910 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
1911 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
1912 pcum
->aapcs_stack_words
= 0;
1913 pcum
->aapcs_reg
= NULL_RTX
;
/* True if REGNO may hold a function argument: X0..X7 or V0..V7.  */
1918 aarch64_function_arg_regno_p (unsigned regno
)
1920 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
1921 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
/* Implement FUNCTION_ARG_BOUNDARY: clamp the argument's natural alignment
   to [PARM_BOUNDARY, STACK_BOUNDARY] (the final `return alignment;` was
   lost in extraction — confirm upstream).  */
1932 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
1934 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
1936 if (alignment
< PARM_BOUNDARY
)
1937 alignment
= PARM_BOUNDARY
;
1938 if (alignment
> STACK_BOUNDARY
)
1939 alignment
= STACK_BOUNDARY
;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
/* For FUNCTION_ARG_PADDING: decide whether a stack argument of MODE/TYPE
   is padded upward (data at the low address of the slot).  Little-endian
   is always upward; big-endian pads scalars downward and composites
   upward (AAPCS64 B.4/C.3/C.5/C.14).  */
1953 aarch64_pad_arg_upward (machine_mode mode
, const_tree type
)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN
)
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1964 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
1965 || POINTER_TYPE_P (type
))
1966 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 assuming the block is in the memory, padding upward means that
1978 the last element is padded after its highest significant byte,
1979 while in downward padding, the last element is padded at the
1980 its least significant byte side.
1982 Small aggregates and small complex types are always padded
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
/* For BLOCK_REG_PADDING: padding direction for the last element of a block
   move between registers and memory.  Small (< 2 words) big-endian
   composites are padded upward; otherwise default to !BYTES_BIG_ENDIAN.  */
1996 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
1997 bool first ATTRIBUTE_UNUSED
)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
2003 HOST_WIDE_INT size
= (type
? int_size_in_bytes (type
)
2004 : GET_MODE_SIZE (mode
));
2005 if (size
< 2 * UNITS_PER_WORD
)
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN
;
/* Implement TARGET_LIBGCC_CMP_RETURN_MODE.  Only the signature survived
   extraction; the body (a single mode return upstream) is missing —
   confirm against upstream GCC before relying on this.  */
2014 aarch64_libgcc_cmp_return_mode (void)
/* Implement TARGET_FRAME_POINTER_REQUIRED: re-enable the frame pointer
   (disabled by -fomit-leaf-frame-pointer) when the function is not a leaf
   or LR is live.  The return statements were dropped by extraction.  */
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl
->is_leaf
|| df_regs_ever_live_p (LR_REGNUM
)))
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
/* Lay out the stack frame: mark callee-saved registers that need slots,
   assign byte offsets, pick the two write-back (stp/ldp pre/post-index)
   candidate registers, and compute saved_regs_size / hard_fp_offset /
   frame_size, all cached in cfun->machine->frame.  Idempotent after
   reload (early return via frame.laid_out).
   NOTE(review): extraction-damaged — `regno` declaration, braces and a
   few assignment RHS lines (e.g. for the EH data registers) are missing
   from this view; confirm details against upstream GCC.  */
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset
= 0;
2042 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun
->machine
->frame
.wb_candidate1
= FIRST_PSEUDO_REGISTER
;
2049 cfun
->machine
->frame
.wb_candidate2
= FIRST_PSEUDO_REGISTER
;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2053 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2055 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2056 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl
->calls_eh_return
)
2060 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2061 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2066 if (df_regs_ever_live_p (regno
)
2067 && (regno
== R30_REGNUM
2068 || !call_used_regs
[regno
]))
2069 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2071 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2072 if (df_regs_ever_live_p (regno
)
2073 && !call_used_regs
[regno
])
2074 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2076 if (frame_pointer_needed
)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2080 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2081 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2082 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2083 cfun
->machine
->frame
.hardfp_offset
= 2 * UNITS_PER_WORD
;
2084 offset
+= 2 * UNITS_PER_WORD
;
2087 /* Now assign stack slots for them. */
2088 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2089 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2091 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2092 if (cfun
->machine
->frame
.wb_candidate1
== FIRST_PSEUDO_REGISTER
)
2093 cfun
->machine
->frame
.wb_candidate1
= regno
;
2094 else if (cfun
->machine
->frame
.wb_candidate2
== FIRST_PSEUDO_REGISTER
)
2095 cfun
->machine
->frame
.wb_candidate2
= regno
;
2096 offset
+= UNITS_PER_WORD
;
/* FP/SIMD save slots; wb_candidate2 is only taken from the vector bank
   when wb_candidate1 is also a vector register (pair must share mode).  */
2099 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2100 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2102 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2103 if (cfun
->machine
->frame
.wb_candidate1
== FIRST_PSEUDO_REGISTER
)
2104 cfun
->machine
->frame
.wb_candidate1
= regno
;
2105 else if (cfun
->machine
->frame
.wb_candidate2
== FIRST_PSEUDO_REGISTER
2106 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2107 cfun
->machine
->frame
.wb_candidate2
= regno
;
2108 offset
+= UNITS_PER_WORD
;
/* Round the save area up to the stack boundary and record totals.  */
2111 cfun
->machine
->frame
.padding0
=
2112 (AARCH64_ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
) - offset
);
2113 offset
= AARCH64_ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2115 cfun
->machine
->frame
.saved_regs_size
= offset
;
2117 cfun
->machine
->frame
.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun
->machine
->frame
.saved_varargs_size
2120 + cfun
->machine
->frame
.saved_regs_size
,
2121 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2123 cfun
->machine
->frame
.frame_size
2124 = AARCH64_ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2125 + crtl
->outgoing_args_size
,
2126 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2128 cfun
->machine
->frame
.laid_out
= true;
/* True if REGNO has been assigned a (non-negative) save slot offset by
   aarch64_layout_frame, i.e. it is saved in the prologue.  */
2132 aarch64_register_saved_on_entry (int regno
)
2134 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Advance REGNO to the next callee-saved register in [REGNO, LIMIT] that
   actually has a save slot (the increment/return lines were dropped by
   extraction).  */
2138 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
2140 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
/* Push REGNO (in MODE) with a pre-modify store that also decrements the
   stack pointer by ADJUSTMENT, and mark the insn frame-related.  */
2146 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
2147 HOST_WIDE_INT adjustment
)
2149 rtx base_rtx
= stack_pointer_rtx
;
2152 reg
= gen_rtx_REG (mode
, regno
);
2153 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
2154 plus_constant (Pmode
, base_rtx
, -adjustment
));
2155 mem
= gen_rtx_MEM (mode
, mem
);
2157 insn
= emit_move_insn (mem
, reg
);
2158 RTX_FRAME_RELATED_P (insn
) = 1;
/* Build a store-pair-with-writeback insn (DI or DF variant, selected by a
   switch on MODE whose header was lost in extraction) that stores REG and
   REG2 while pre-decrementing BASE by ADJUSTMENT.  */
2162 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2163 HOST_WIDE_INT adjustment
)
2168 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
2169 GEN_INT (-adjustment
),
2170 GEN_INT (UNITS_PER_WORD
- adjustment
));
2172 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
2173 GEN_INT (-adjustment
),
2174 GEN_INT (UNITS_PER_WORD
- adjustment
));
/* Push the pair REGNO1/REGNO2 (in MODE) with writeback of ADJUSTMENT on
   the stack pointer; mark the parallel's parts and the insn itself
   frame-related for CFI generation.  */
2181 aarch64_pushwb_pair_reg (machine_mode mode
, unsigned regno1
,
2182 unsigned regno2
, HOST_WIDE_INT adjustment
)
2185 rtx reg1
= gen_rtx_REG (mode
, regno1
);
2186 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2188 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn
) = 1;
/* Build a load-pair-with-writeback insn (DI or DF variant; the selecting
   switch header was lost in extraction) that loads REG and REG2 and
   post-increments BASE by ADJUSTMENT.  */
2196 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2197 HOST_WIDE_INT adjustment
)
2202 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
2203 GEN_INT (UNITS_PER_WORD
));
2205 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
2206 GEN_INT (UNITS_PER_WORD
));
/* Build a plain store-pair insn (DI or DF variant per MODE; the selecting
   switch and the reg2/mem2 parameter line were lost in extraction).  */
2213 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
2219 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
2222 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Build a plain load-pair insn (DI or DF variant per MODE; the selecting
   switch and the trailing parameter line were lost in extraction).  */
2230 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
2236 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
2239 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
/* Emit prologue saves of callee-saved registers in [START, LIMIT] (MODE is
   DImode for GP regs, DFmode for FP regs) at START_OFFSET from SP, using
   store-pair insns when two consecutive save slots are adjacent, single
   moves otherwise.  SKIP_WB skips the registers already saved by the
   write-back push.  NOTE(review): extraction dropped the regno/regno2/
   reg/mem declarations, the loop condition line, and several braces.  */
2248 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
2249 unsigned start
, unsigned limit
, bool skip_wb
)
2252 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
2253 ? gen_frame_mem
: gen_rtx_MEM
);
2257 for (regno
= aarch64_next_callee_save (start
, limit
);
2259 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
2262 HOST_WIDE_INT offset
;
/* Skip the write-back candidates when SKIP_WB (guard's `if (skip_wb`
   line missing from this view).  */
2265 && (regno
== cfun
->machine
->frame
.wb_candidate1
2266 || regno
== cfun
->machine
->frame
.wb_candidate2
))
2269 reg
= gen_rtx_REG (mode
, regno
);
2270 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
2271 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
2274 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
/* Pair the store when the next saved register's slot is exactly one
   word above this one.  */
2277 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
2278 == cfun
->machine
->frame
.reg_offset
[regno2
]))
2281 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2284 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
2285 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
2287 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts, are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
2298 insn
= emit_move_insn (mem
, reg
);
2300 RTX_FRAME_RELATED_P (insn
) = 1;
/* Emit epilogue restores of callee-saved registers in [START, LIMIT],
   mirroring aarch64_save_callee_saves: load pairs when slots are adjacent,
   single loads otherwise.  Each restored register is recorded as a
   REG_CFA_RESTORE note in *CFI_OPS for the caller to attach.
   NOTE(review): extraction dropped the regno/regno2/reg/mem declarations,
   the loop condition, and the skip_wb guard's opening line.  */
2305 aarch64_restore_callee_saves (machine_mode mode
,
2306 HOST_WIDE_INT start_offset
, unsigned start
,
2307 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
2309 rtx base_rtx
= stack_pointer_rtx
;
2310 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
2311 ? gen_frame_mem
: gen_rtx_MEM
);
2314 HOST_WIDE_INT offset
;
2316 for (regno
= aarch64_next_callee_save (start
, limit
);
2318 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
/* Skip registers restored by the write-back pop when SKIP_WB.  */
2323 && (regno
== cfun
->machine
->frame
.wb_candidate1
2324 || regno
== cfun
->machine
->frame
.wb_candidate2
))
2327 reg
= gen_rtx_REG (mode
, regno
);
2328 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
2329 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
2331 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
/* Pair the load when the next slot is exactly one word above.  */
2334 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
2335 == cfun
->machine
->frame
.reg_offset
[regno2
]))
2337 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2340 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
2341 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
2342 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
2344 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
2348 emit_move_insn (reg
, mem
);
2349 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size
, offset
;
2406 HOST_WIDE_INT fp_offset
; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset
;
2410 aarch64_layout_frame ();
2412 offset
= frame_size
= cfun
->machine
->frame
.frame_size
;
2413 hard_fp_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2414 fp_offset
= frame_size
- hard_fp_offset
;
2416 if (flag_stack_usage_info
)
2417 current_function_static_stack_size
= frame_size
;
2419 /* Store pairs and load pairs have a range only -512 to 504. */
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2428 offset
= hard_fp_offset
;
2430 offset
= cfun
->machine
->frame
.saved_regs_size
;
2432 frame_size
-= (offset
+ crtl
->outgoing_args_size
);
2435 if (frame_size
>= 0x1000000)
2437 rtx op0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2438 emit_move_insn (op0
, GEN_INT (-frame_size
));
2439 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
, op0
));
2441 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2442 gen_rtx_SET (VOIDmode
, stack_pointer_rtx
,
2443 plus_constant (Pmode
, stack_pointer_rtx
,
2445 RTX_FRAME_RELATED_P (insn
) = 1;
2447 else if (frame_size
> 0)
2449 int hi_ofs
= frame_size
& 0xfff000;
2450 int lo_ofs
= frame_size
& 0x000fff;
2454 insn
= emit_insn (gen_add2_insn
2455 (stack_pointer_rtx
, GEN_INT (-hi_ofs
)));
2456 RTX_FRAME_RELATED_P (insn
) = 1;
2460 insn
= emit_insn (gen_add2_insn
2461 (stack_pointer_rtx
, GEN_INT (-lo_ofs
)));
2462 RTX_FRAME_RELATED_P (insn
) = 1;
2471 bool skip_wb
= false;
2473 if (frame_pointer_needed
)
2479 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2480 GEN_INT (-offset
)));
2481 RTX_FRAME_RELATED_P (insn
) = 1;
2483 aarch64_save_callee_saves (DImode
, fp_offset
, R29_REGNUM
,
2487 aarch64_pushwb_pair_reg (DImode
, R29_REGNUM
, R30_REGNUM
, offset
);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
2493 GEN_INT (fp_offset
)));
2494 RTX_FRAME_RELATED_P (insn
) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
2499 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
2500 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
2503 || reg1
== FIRST_PSEUDO_REGISTER
2504 || (reg2
== FIRST_PSEUDO_REGISTER
2507 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2508 GEN_INT (-offset
)));
2509 RTX_FRAME_RELATED_P (insn
) = 1;
2513 machine_mode mode1
= (reg1
<= R30_REGNUM
) ? DImode
: DFmode
;
2517 if (reg2
== FIRST_PSEUDO_REGISTER
)
2518 aarch64_pushwb_single_reg (mode1
, reg1
, offset
);
2520 aarch64_pushwb_pair_reg (mode1
, reg1
, reg2
, offset
);
2524 aarch64_save_callee_saves (DImode
, fp_offset
, R0_REGNUM
, R30_REGNUM
,
2526 aarch64_save_callee_saves (DFmode
, fp_offset
, V0_REGNUM
, V31_REGNUM
,
2530 /* when offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size
> -1)
2534 if (crtl
->outgoing_args_size
> 0)
2536 insn
= emit_insn (gen_add2_insn
2538 GEN_INT (- crtl
->outgoing_args_size
)));
2539 RTX_FRAME_RELATED_P (insn
) = 1;
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee saved stack is empty, which
2547 means no restore actions are need. The pro_and_epilogue will use
2548 this to check whether shrink-wrapping opt is feasible. */
/* Return true if a simple_return insn can be used: only after reload, and
   only when the laid-out frame is empty (nothing to restore), enabling
   shrink-wrapping.  */
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed
)
2559 aarch64_layout_frame ();
2561 return cfun
->machine
->frame
.frame_size
== 0;
2564 /* Generate the epilogue instructions for returning from a function. */
/* Emit the function epilogue (mirror of the prologue): undo the
   outgoing-args adjustment (or restore SP from FP after alloca), restore
   callee-saves with CFA-restore notes, pop the write-back pair, undo the
   large initial SP decrease, reset the CFA, handle the EH-return stack
   adjustment, and (unless FOR_SIBCALL) emit the return jump.
   NOTE(review): extraction dropped many lines (insn/cfi_ops/dyn_adj
   declarations, several condition and argument tails); annotations below
   follow the surviving text only — confirm upstream.  */
2566 aarch64_expand_epilogue (bool for_sibcall
)
2568 HOST_WIDE_INT frame_size
, offset
;
2569 HOST_WIDE_INT fp_offset
;
2570 HOST_WIDE_INT hard_fp_offset
;
2572 /* We need to add memory barrier to prevent read from deallocated stack. */
2573 bool need_barrier_p
= (get_frame_size () != 0
2574 || cfun
->machine
->frame
.saved_varargs_size
);
2576 aarch64_layout_frame ();
2578 offset
= frame_size
= cfun
->machine
->frame
.frame_size
;
2579 hard_fp_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2580 fp_offset
= frame_size
- hard_fp_offset
;
2582 /* Store pairs and load pairs have a range only -512 to 504. */
2585 offset
= hard_fp_offset
;
2587 offset
= cfun
->machine
->frame
.saved_regs_size
;
2589 frame_size
-= (offset
+ crtl
->outgoing_args_size
);
2591 if (!frame_pointer_needed
&& crtl
->outgoing_args_size
> 0)
2593 insn
= emit_insn (gen_add2_insn
2595 GEN_INT (crtl
->outgoing_args_size
)));
2596 RTX_FRAME_RELATED_P (insn
) = 1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl
->outgoing_args_size
|| cfun
->calls_alloca
))
2609 if (cfun
->calls_alloca
)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
))
;
2612 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
2613 hard_frame_pointer_rtx
,
2615 offset
= offset
- fp_offset
;
/* Restore callee-saves, collecting REG_CFA_RESTORE notes in cfi_ops.  */
2620 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
2621 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
2622 bool skip_wb
= true;
2625 if (frame_pointer_needed
)
2628 || reg1
== FIRST_PSEUDO_REGISTER
2629 || (reg2
== FIRST_PSEUDO_REGISTER
2633 aarch64_restore_callee_saves (DImode
, fp_offset
, R0_REGNUM
, R30_REGNUM
,
2635 aarch64_restore_callee_saves (DFmode
, fp_offset
, V0_REGNUM
, V31_REGNUM
,
2639 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
/* Pop the write-back pair (or a single register via post-modify).  */
2643 machine_mode mode1
= (reg1
<= R30_REGNUM
) ? DImode
: DFmode
;
2644 rtx rreg1
= gen_rtx_REG (mode1
, reg1
);
2646 cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, rreg1
, cfi_ops
);
2647 if (reg2
== FIRST_PSEUDO_REGISTER
)
2649 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
2650 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
2651 mem
= gen_rtx_MEM (mode1
, mem
);
2652 insn
= emit_move_insn (rreg1
, mem
);
2656 rtx rreg2
= gen_rtx_REG (mode1
, reg2
);
2658 cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, rreg2
, cfi_ops
);
2659 insn
= emit_insn (aarch64_gen_loadwb_pair
2660 (mode1
, stack_pointer_rtx
, rreg1
,
2666 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa
= stack_pointer_rtx
;
2673 new_cfa
= plus_constant (Pmode
, new_cfa
, frame_size
);
2674 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
2675 REG_NOTES (insn
) = cfi_ops
;
2676 RTX_FRAME_RELATED_P (insn
) = 1;
/* Undo the large initial SP decrease, mirroring the prologue split.  */
2682 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
2684 if (frame_size
>= 0x1000000)
2686 rtx op0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2687 emit_move_insn (op0
, GEN_INT (frame_size
));
2688 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
, op0
));
2692 int hi_ofs
= frame_size
& 0xfff000;
2693 int lo_ofs
= frame_size
& 0x000fff;
2695 if (hi_ofs
&& lo_ofs
)
2697 insn
= emit_insn (gen_add2_insn
2698 (stack_pointer_rtx
, GEN_INT (hi_ofs
)));
2699 RTX_FRAME_RELATED_P (insn
) = 1;
2700 frame_size
= lo_ofs
;
2702 insn
= emit_insn (gen_add2_insn
2703 (stack_pointer_rtx
, GEN_INT (frame_size
)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_pointer_rtx
);
2708 RTX_FRAME_RELATED_P (insn
) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl
->calls_eh_return
)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
2722 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
2724 emit_jump_insn (ret_rtx
);
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory be the
2729 return register). */
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset
;
2735 aarch64_layout_frame ();
2737 fp_offset
= cfun
->machine
->frame
.frame_size
2738 - cfun
->machine
->frame
.hard_fp_offset
;
2740 if (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] < 0)
2741 return gen_rtx_REG (DImode
, LR_REGNUM
);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed
)
2757 return gen_frame_mem (DImode
,
2758 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
2760 return gen_frame_mem (DImode
,
2761 plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode
,
2768 plus_constant (Pmode
,
2771 + cfun
->machine
->frame
.saved_regs_size
2772 - 2 * UNITS_PER_WORD
));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2781 aarch64_build_constant (int regnum
, HOST_WIDE_INT val
, bool generate
)
2785 if (aarch64_bitmask_imm (val
, DImode
))
2788 emit_move_insn (gen_rtx_REG (Pmode
, regnum
), GEN_INT (val
));
2796 HOST_WIDE_INT valp
= val
>> 16;
2800 for (i
= 16; i
< 64; i
+= 16)
2802 valm
= (valp
& 0xffff);
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the fewest
2817 number of instructions, preferring MOVZ instructions when they are both
2819 if (ncount
< zcount
)
2822 emit_move_insn (gen_rtx_REG (Pmode
, regnum
),
2823 GEN_INT (val
| ~(HOST_WIDE_INT
) 0xffff));
2830 emit_move_insn (gen_rtx_REG (Pmode
, regnum
),
2831 GEN_INT (val
& 0xffff));
2838 for (i
= 16; i
< 64; i
+= 16)
2840 if ((val
& 0xffff) != tval
)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode
, regnum
),
2845 GEN_INT (val
& 0xffff)));
2855 aarch64_add_constant (int regnum
, int scratchreg
, HOST_WIDE_INT delta
)
2857 HOST_WIDE_INT mdelta
= delta
;
2858 rtx this_rtx
= gen_rtx_REG (Pmode
, regnum
);
2859 rtx scratch_rtx
= gen_rtx_REG (Pmode
, scratchreg
);
2864 if (mdelta
>= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg
, delta
, true);
2867 emit_insn (gen_add3_insn (this_rtx
, this_rtx
, scratch_rtx
));
2869 else if (mdelta
> 0)
2873 emit_insn (gen_rtx_SET (Pmode
, scratch_rtx
, GEN_INT (mdelta
/ 4096)));
2874 rtx shift
= gen_rtx_ASHIFT (Pmode
, scratch_rtx
, GEN_INT (12));
2876 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2877 gen_rtx_MINUS (Pmode
, this_rtx
, shift
)));
2879 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2880 gen_rtx_PLUS (Pmode
, this_rtx
, shift
)));
2882 if (mdelta
% 4096 != 0)
2884 scratch_rtx
= GEN_INT ((delta
< 0 ? -1 : 1) * (mdelta
% 4096));
2885 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2886 gen_rtx_PLUS (Pmode
, this_rtx
, scratch_rtx
)));
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2894 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
2895 HOST_WIDE_INT delta
,
2896 HOST_WIDE_INT vcall_offset
,
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm where the this pointer maybe bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno
= R0_REGNUM
;
2904 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
2907 reload_completed
= 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END
);
2910 if (vcall_offset
== 0)
2911 aarch64_add_constant (this_regno
, IP1_REGNUM
, delta
);
2914 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
2916 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
2917 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2918 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
2923 if (delta
>= -256 && delta
< 256)
2924 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
2925 plus_constant (Pmode
, this_rtx
, delta
));
2927 aarch64_add_constant (this_regno
, IP1_REGNUM
, delta
);
2930 if (Pmode
== ptr_mode
)
2931 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
2933 aarch64_emit_move (temp0
,
2934 gen_rtx_ZERO_EXTEND (Pmode
,
2935 gen_rtx_MEM (ptr_mode
, addr
)));
2937 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
2938 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
2941 (void) aarch64_build_constant (IP1_REGNUM
, vcall_offset
, true);
2942 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
2945 if (Pmode
== ptr_mode
)
2946 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
2948 aarch64_emit_move (temp1
,
2949 gen_rtx_SIGN_EXTEND (Pmode
,
2950 gen_rtx_MEM (ptr_mode
, addr
)));
2952 emit_insn (gen_add2_insn (this_rtx
, temp1
));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function
))
2958 assemble_external (function
);
2959 TREE_USED (function
) = 1;
2961 funexp
= XEXP (DECL_RTL (function
), 0);
2962 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
2963 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
2964 SIBLING_CALL_P (insn
) = 1;
2966 insn
= get_insns ();
2967 shorten_branches (insn
);
2968 final_start_function (insn
, file
, 1);
2969 final (insn
, file
, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed
= 0;
2977 aarch64_tls_referenced_p (rtx x
)
2979 if (!TARGET_HAVE_TLS
)
2981 subrtx_iterator::array_type array
;
2982 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
2984 const_rtx x
= *iter
;
2985 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
2990 iter
.skip_subrtxes ();
/* qsort/bsearch comparison function for the aarch64_bitmasks table:
   three-way compare of two unsigned HOST_WIDE_INT immediates.  */
static int
aarch64_bitmasks_cmp (const void *i1, const void *i2)
{
  const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
  const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;

  if (*imm1 < *imm2)
    return -1;
  if (*imm1 > *imm2)
    return +1;
  return 0;
}
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask
, imm
;
3014 unsigned int log_e
, e
, s
, r
;
3015 unsigned int nimms
= 0;
3017 for (log_e
= 1; log_e
<= 6; log_e
++)
3021 mask
= ~(HOST_WIDE_INT
) 0;
3023 mask
= ((HOST_WIDE_INT
) 1 << e
) - 1;
3024 for (s
= 1; s
< e
; s
++)
3026 for (r
= 0; r
< e
; r
++)
3028 /* set s consecutive bits to 1 (s < 64) */
3029 imm
= ((unsigned HOST_WIDE_INT
)1 << s
) - 1;
3030 /* rotate right by r */
3032 imm
= ((imm
>> r
) | (imm
<< (e
- r
))) & mask
;
3033 /* replicate the constant depending on SIMD size */
3035 case 1: imm
|= (imm
<< 2);
3036 case 2: imm
|= (imm
<< 4);
3037 case 3: imm
|= (imm
<< 8);
3038 case 4: imm
|= (imm
<< 16);
3039 case 5: imm
|= (imm
<< 32);
3045 gcc_assert (nimms
< AARCH64_NUM_BITMASKS
);
3046 aarch64_bitmasks
[nimms
++] = imm
;
3051 gcc_assert (nimms
== AARCH64_NUM_BITMASKS
);
3052 qsort (aarch64_bitmasks
, nimms
, sizeof (aarch64_bitmasks
[0]),
3053 aarch64_bitmasks_cmp
);
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
static bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
	  );
}
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      /* 64-bit: also allow the 16-bit chunk in the two high positions.  */
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
3089 /* Return true if val is a valid bitmask immediate. */
3091 aarch64_bitmask_imm (HOST_WIDE_INT val
, machine_mode mode
)
3093 if (GET_MODE_SIZE (mode
) < 8)
3095 /* Replicate bit pattern. */
3096 val
&= (HOST_WIDE_INT
) 0xffffffff;
3099 return bsearch (&val
, aarch64_bitmasks
, AARCH64_NUM_BITMASKS
,
3100 sizeof (aarch64_bitmasks
[0]), aarch64_bitmasks_cmp
) != NULL
;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3107 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
3109 if (aarch64_movw_imm (val
, mode
) || aarch64_movw_imm (~val
, mode
))
3111 return aarch64_bitmask_imm (val
, mode
);
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
3119 if (GET_CODE (x
) == HIGH
)
3122 split_const (x
, &base
, &offset
);
3123 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
3125 if (aarch64_classify_symbol (base
, offset
, SYMBOL_CONTEXT_ADR
)
3126 != SYMBOL_FORCE_TO_MEM
)
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 to aarch64_expand_mov_immediate to handle it properly. */
3131 return mode
!= ptr_mode
;
3134 return aarch64_tls_referenced_p (x
);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3141 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
3143 if (!HARD_REGISTER_NUM_P (regno
))
3151 regno
= reg_renumber
[regno
];
3153 return GP_REGNUM_P (regno
);
3156 /* Return true if register REGNO is a valid base register for mode MODE.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3160 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
3162 if (!HARD_REGISTER_NUM_P (regno
))
3170 regno
= reg_renumber
[regno
];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno
)
3177 || regno
== SP_REGNUM
3178 || regno
== FRAME_POINTER_REGNUM
3179 || regno
== ARG_POINTER_REGNUM
);
3182 /* Return true if X is a valid base register for mode MODE.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3186 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
3188 if (!strict_p
&& GET_CODE (x
) == SUBREG
)
3191 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3198 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
3199 machine_mode mode
, bool strict_p
)
3201 enum aarch64_address_type type
;
3206 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
3207 && GET_MODE (x
) == Pmode
)
3209 type
= ADDRESS_REG_REG
;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x
) == SIGN_EXTEND
3215 || GET_CODE (x
) == ZERO_EXTEND
)
3216 && GET_MODE (x
) == DImode
3217 && GET_MODE (XEXP (x
, 0)) == SImode
)
3219 type
= (GET_CODE (x
) == SIGN_EXTEND
)
3220 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3221 index
= XEXP (x
, 0);
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x
) == MULT
3226 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3228 && GET_MODE (XEXP (x
, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x
, 1)))
3232 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3233 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3234 index
= XEXP (XEXP (x
, 0), 0);
3235 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x
) == ASHIFT
3239 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3241 && GET_MODE (XEXP (x
, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x
, 1)))
3245 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3246 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3247 index
= XEXP (XEXP (x
, 0), 0);
3248 shift
= INTVAL (XEXP (x
, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x
) == SIGN_EXTRACT
3252 || GET_CODE (x
) == ZERO_EXTRACT
)
3253 && GET_MODE (x
) == DImode
3254 && GET_CODE (XEXP (x
, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3258 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3259 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3260 index
= XEXP (XEXP (x
, 0), 0);
3261 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3262 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3263 || INTVAL (XEXP (x
, 2)) != 0)
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x
) == AND
3269 && GET_MODE (x
) == DImode
3270 && GET_CODE (XEXP (x
, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3273 && CONST_INT_P (XEXP (x
, 1)))
3275 type
= ADDRESS_REG_UXTW
;
3276 index
= XEXP (XEXP (x
, 0), 0);
3277 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3278 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x
) == SIGN_EXTRACT
3283 || GET_CODE (x
) == ZERO_EXTRACT
)
3284 && GET_MODE (x
) == DImode
3285 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3289 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3290 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3291 index
= XEXP (XEXP (x
, 0), 0);
3292 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3293 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3294 || INTVAL (XEXP (x
, 2)) != 0)
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x
) == AND
3300 && GET_MODE (x
) == DImode
3301 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3304 && CONST_INT_P (XEXP (x
, 1)))
3306 type
= ADDRESS_REG_UXTW
;
3307 index
= XEXP (XEXP (x
, 0), 0);
3308 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3309 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x
) == MULT
3314 && GET_MODE (x
) == Pmode
3315 && GET_MODE (XEXP (x
, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x
, 1)))
3318 type
= ADDRESS_REG_REG
;
3319 index
= XEXP (x
, 0);
3320 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x
) == ASHIFT
3324 && GET_MODE (x
) == Pmode
3325 && GET_MODE (XEXP (x
, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x
, 1)))
3328 type
= ADDRESS_REG_REG
;
3329 index
= XEXP (x
, 0);
3330 shift
= INTVAL (XEXP (x
, 1));
3335 if (GET_CODE (index
) == SUBREG
)
3336 index
= SUBREG_REG (index
);
3339 (shift
> 0 && shift
<= 3
3340 && (1 << shift
) == GET_MODE_SIZE (mode
)))
3342 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
3345 info
->offset
= index
;
3346 info
->shift
= shift
;
/* Return true if OFFSET is a signed 7-bit value scaled by the size of
   MODE (the ldp/stp offset range).  */
static bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= -64 * GET_MODE_SIZE (mode)
	  && offset < 64 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}
/* Return true if OFFSET fits the unscaled signed 9-bit immediate range
   of ldur/stur ([-256, 255]); the mode is irrelevant for unscaled
   addressing.  */
static bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
			       HOST_WIDE_INT offset)
{
  return offset >= -256 && offset < 256;
}
/* Return true if OFFSET is an unsigned 12-bit value scaled by the size
   of MODE (the ldr/str unsigned-offset range).  */
static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= 0
	  && offset < 4096 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3381 aarch64_classify_address (struct aarch64_address_info
*info
,
3382 rtx x
, machine_mode mode
,
3383 RTX_CODE outer_code
, bool strict_p
)
3385 enum rtx_code code
= GET_CODE (x
);
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p
= (outer_code
== PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode
)));
3393 bool allow_reg_index_p
=
3395 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
3396 && !aarch64_vect_struct_mode_p (mode
);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3400 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
3401 && (code
!= POST_INC
&& code
!= REG
))
3408 info
->type
= ADDRESS_REG_IMM
;
3410 info
->offset
= const0_rtx
;
3411 return aarch64_base_register_rtx_p (x
, strict_p
);
3419 && (op0
== virtual_stack_vars_rtx
3420 || op0
== frame_pointer_rtx
3421 || op0
== arg_pointer_rtx
)
3422 && CONST_INT_P (op1
))
3424 info
->type
= ADDRESS_REG_IMM
;
3431 if (GET_MODE_SIZE (mode
) != 0
3432 && CONST_INT_P (op1
)
3433 && aarch64_base_register_rtx_p (op0
, strict_p
))
3435 HOST_WIDE_INT offset
= INTVAL (op1
);
3437 info
->type
= ADDRESS_REG_IMM
;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode
== TImode
|| mode
== TFmode
)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
3450 && offset_9bit_signed_unscaled_p (mode
, offset
));
3452 /* A 7bit offset check because OImode will emit a ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3457 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
3459 /* Three 9/12 bit offsets checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
3463 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode
,
3467 /* Two 7bit offsets checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode
,
3474 if (load_store_pair_p
)
3475 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
3478 return (offset_9bit_signed_unscaled_p (mode
, offset
)
3479 || offset_12bit_unsigned_scaled_p (mode
, offset
));
3482 if (allow_reg_index_p
)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0
, strict_p
)
3486 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
3491 if (aarch64_base_register_rtx_p (op1
, strict_p
)
3492 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
3505 info
->type
= ADDRESS_REG_WB
;
3506 info
->base
= XEXP (x
, 0);
3507 info
->offset
= NULL_RTX
;
3508 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
3512 info
->type
= ADDRESS_REG_WB
;
3513 info
->base
= XEXP (x
, 0);
3514 if (GET_CODE (XEXP (x
, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
3517 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
3519 HOST_WIDE_INT offset
;
3520 info
->offset
= XEXP (XEXP (x
, 1), 1);
3521 offset
= INTVAL (info
->offset
);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode
== TImode
|| mode
== TFmode
)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
3532 && offset_9bit_signed_unscaled_p (mode
, offset
));
3534 if (load_store_pair_p
)
3535 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
3538 return offset_9bit_signed_unscaled_p (mode
, offset
);
3545 /* load literal: pc-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info
->type
= ADDRESS_SYMBOLIC
;
3549 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
3553 split_const (x
, &sym
, &addend
);
3554 return (GET_CODE (sym
) == LABEL_REF
3555 || (GET_CODE (sym
) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym
)));
3561 info
->type
= ADDRESS_LO_SUM
;
3562 info
->base
= XEXP (x
, 0);
3563 info
->offset
= XEXP (x
, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
3568 split_const (info
->offset
, &sym
, &offs
);
3569 if (GET_CODE (sym
) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym
, offs
, SYMBOL_CONTEXT_MEM
)
3571 == SYMBOL_SMALL_ABSOLUTE
))
3573 /* The symbol and offset must be aligned to the access size. */
3575 unsigned int ref_size
;
3577 if (CONSTANT_POOL_ADDRESS_P (sym
))
3578 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
3581 tree exp
= SYMBOL_REF_DECL (sym
);
3582 align
= TYPE_ALIGN (TREE_TYPE (exp
));
3583 align
= CONSTANT_ALIGNMENT (exp
, align
);
3585 else if (SYMBOL_REF_DECL (sym
))
3586 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
3588 && SYMBOL_REF_BLOCK (sym
) != NULL
)
3589 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
3591 align
= BITS_PER_UNIT
;
3593 ref_size
= GET_MODE_SIZE (mode
);
3595 ref_size
= GET_MODE_SIZE (DImode
);
3597 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
3598 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
3609 aarch64_symbolic_address_p (rtx x
)
3613 split_const (x
, &x
, &offset
);
3614 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
3617 /* Classify the base of symbolic expression X, given that X appears in
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x
,
3622 enum aarch64_symbol_context context
)
3626 split_const (x
, &x
, &offset
);
3627 return aarch64_classify_symbol (x
, offset
, context
);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3634 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
3636 struct aarch64_address_info addr
;
3638 return aarch64_classify_address (&addr
, x
, mode
, MEM
, strict_p
);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3645 aarch64_legitimate_address_p (machine_mode mode
, rtx x
,
3646 RTX_CODE outer_code
, bool strict_p
)
3648 struct aarch64_address_info addr
;
3650 return aarch64_classify_address (&addr
, x
, mode
, outer_code
, strict_p
);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3655 aarch64_float_const_zero_rtx_p (rtx x
)
3659 if (GET_MODE (x
) == VOIDmode
)
3662 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
3663 if (REAL_VALUE_MINUS_ZERO (r
))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
3665 return REAL_VALUES_EQUAL (r
, dconst0
);
3668 /* Return the fixed registers used for condition codes. */
3671 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
3674 *p2
= INVALID_REGNUM
;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3681 aarch64_emit_call_insn (rtx pat
)
3683 rtx insn
= emit_call_insn (pat
);
3685 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
3686 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
3687 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
3691 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
3722 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3724 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
3725 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
3726 || GET_CODE (x
) == NEG
))
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3732 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3733 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
3734 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
3735 || GET_CODE (x
) == LSHIFTRT
3736 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
3739 /* Similarly for a negated operand, but we can only do this for
3741 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3742 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
3743 && (code
== EQ
|| code
== NE
)
3744 && GET_CODE (x
) == NEG
)
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x
) == QImode
|| GET_MODE (x
) == HImode
)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code
== GT
|| code
== GE
|| code
== LE
|| code
== LT
)
3753 ? CC_SESWPmode
: CC_ZESWPmode
);
3755 /* For everything else, return CCmode. */
3760 aarch64_get_condition_code_1 (enum machine_mode
, enum rtx_code
);
3763 aarch64_get_condition_code (rtx x
)
3765 machine_mode mode
= GET_MODE (XEXP (x
, 0));
3766 enum rtx_code comp_code
= GET_CODE (x
);
3768 if (GET_MODE_CLASS (mode
) != MODE_CC
)
3769 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
3770 return aarch64_get_condition_code_1 (mode
, comp_code
);
3774 aarch64_get_condition_code_1 (enum machine_mode mode
, enum rtx_code comp_code
)
3776 int ne
= -1, eq
= -1;
3783 case GE
: return AARCH64_GE
;
3784 case GT
: return AARCH64_GT
;
3785 case LE
: return AARCH64_LS
;
3786 case LT
: return AARCH64_MI
;
3787 case NE
: return AARCH64_NE
;
3788 case EQ
: return AARCH64_EQ
;
3789 case ORDERED
: return AARCH64_VC
;
3790 case UNORDERED
: return AARCH64_VS
;
3791 case UNLT
: return AARCH64_LT
;
3792 case UNLE
: return AARCH64_LE
;
3793 case UNGT
: return AARCH64_HI
;
3794 case UNGE
: return AARCH64_PL
;
3852 case NE
: return AARCH64_NE
;
3853 case EQ
: return AARCH64_EQ
;
3854 case GE
: return AARCH64_GE
;
3855 case GT
: return AARCH64_GT
;
3856 case LE
: return AARCH64_LE
;
3857 case LT
: return AARCH64_LT
;
3858 case GEU
: return AARCH64_CS
;
3859 case GTU
: return AARCH64_HI
;
3860 case LEU
: return AARCH64_LS
;
3861 case LTU
: return AARCH64_CC
;
3871 case NE
: return AARCH64_NE
;
3872 case EQ
: return AARCH64_EQ
;
3873 case GE
: return AARCH64_LE
;
3874 case GT
: return AARCH64_LT
;
3875 case LE
: return AARCH64_GE
;
3876 case LT
: return AARCH64_GT
;
3877 case GEU
: return AARCH64_LS
;
3878 case GTU
: return AARCH64_CC
;
3879 case LEU
: return AARCH64_CS
;
3880 case LTU
: return AARCH64_HI
;
3888 case NE
: return AARCH64_NE
;
3889 case EQ
: return AARCH64_EQ
;
3890 case GE
: return AARCH64_PL
;
3891 case LT
: return AARCH64_MI
;
3899 case NE
: return AARCH64_NE
;
3900 case EQ
: return AARCH64_EQ
;
3910 if (comp_code
== NE
)
3913 if (comp_code
== EQ
)
3920 aarch64_const_vec_all_same_in_range_p (rtx x
,
3921 HOST_WIDE_INT minval
,
3922 HOST_WIDE_INT maxval
)
3924 HOST_WIDE_INT firstval
;
3927 if (GET_CODE (x
) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_INT
)
3931 firstval
= INTVAL (CONST_VECTOR_ELT (x
, 0));
3932 if (firstval
< minval
|| firstval
> maxval
)
3935 count
= CONST_VECTOR_NUNITS (x
);
3936 for (i
= 1; i
< count
; i
++)
3937 if (INTVAL (CONST_VECTOR_ELT (x
, i
)) != firstval
)
3944 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
3946 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
3950 bit_count (unsigned HOST_WIDE_INT value
)
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  The first code is for AND op and the other
   is for IOR op.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[][2] =
{
  {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
  {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
  {AARCH64_CC_C, 0}, /* CS, C == 1.  */
  {0, AARCH64_CC_C}, /* CC, C == 0.  */
  {AARCH64_CC_N, 0}, /* MI, N == 1.  */
  {0, AARCH64_CC_N}, /* PL, N == 0.  */
  {AARCH64_CC_V, 0}, /* VS, V == 1.  */
  {0, AARCH64_CC_V}, /* VC, V == 0.  */
  {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
  {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
  {0, AARCH64_CC_V}, /* GE, N == V.  */
  {AARCH64_CC_V, 0}, /* LT, N != V.  */
  {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
  {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
  {0, 0}, /* AL, Any.  */
  {0, 0}, /* NV, Any.  */
};
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode
)
4033 aarch64_print_operand (FILE *f
, rtx x
, char code
)
4037 /* An integer or symbol address without a preceding # sign. */
4039 switch (GET_CODE (x
))
4042 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
4046 output_addr_const (f
, x
);
4050 if (GET_CODE (XEXP (x
, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
4053 output_addr_const (f
, x
);
4059 output_operand_lossage ("Unsupported operand for code '%c'", code
);
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4068 if (!CONST_INT_P (x
)
4069 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code
);
4087 output_operand_lossage ("invalid operand for '%%%c'", code
);
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code
);
4104 asm_fprintf (f
, "%d", n
);
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x
))
4112 output_operand_lossage ("invalid operand for '%%%c'", code
);
4116 asm_fprintf (f
, "%u", bit_count (INTVAL (x
)));
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code
);
4127 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x
== const_true_rtx
)
4139 if (!COMPARISON_P (x
))
4141 output_operand_lossage ("invalid operand for '%%%c'", code
);
4145 cond_code
= aarch64_get_condition_code (x
);
4146 gcc_assert (cond_code
>= 0);
4147 fputs (aarch64_condition_codes
[cond_code
], f
);
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x
== const_true_rtx
)
4163 if (!COMPARISON_P (x
))
4165 output_operand_lossage ("invalid operand for '%%%c'", code
);
4168 cond_code
= aarch64_get_condition_code (x
);
4169 gcc_assert (cond_code
>= 0);
4170 fputs (aarch64_condition_codes
[AARCH64_INVERSE_CONDITION_CODE
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4186 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4199 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4209 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x
))
4216 output_operand_lossage ("invalid operand for '%%%c'", code
);
4219 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
4224 /* Print a general register name or the zero register (32-bit or
4227 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
4229 asm_fprintf (f
, "%czr", code
);
4233 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
4235 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
4239 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
4241 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
4248 /* Print a normal operand, if it's a general register, then we
4252 output_operand_lossage ("missing operand");
4256 switch (GET_CODE (x
))
4259 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
4263 aarch64_memory_reference_mode
= GET_MODE (x
);
4264 output_address (XEXP (x
, 0));
4269 output_addr_const (asm_out_file
, x
);
4273 asm_fprintf (f
, "%wd", INTVAL (x
));
4277 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
4280 aarch64_const_vec_all_same_in_range_p (x
,
4282 HOST_WIDE_INT_MAX
));
4283 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
4285 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x
) == VOIDmode
)
4298 else if (aarch64_float_const_zero_rtx_p (x
))
4303 else if (aarch64_float_const_representable_p (x
))
4306 char float_buf
[buf_size
] = {'\0'};
4308 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
4309 real_to_decimal_for_mode (float_buf
, &r
,
4312 asm_fprintf (asm_out_file
, "%s", float_buf
);
4316 output_operand_lossage ("invalid constant");
4319 output_operand_lossage ("invalid operand");
4325 if (GET_CODE (x
) == HIGH
)
4328 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4330 case SYMBOL_SMALL_GOT
:
4331 asm_fprintf (asm_out_file
, ":got:");
4334 case SYMBOL_SMALL_TLSGD
:
4335 asm_fprintf (asm_out_file
, ":tlsgd:");
4338 case SYMBOL_SMALL_TLSDESC
:
4339 asm_fprintf (asm_out_file
, ":tlsdesc:");
4342 case SYMBOL_SMALL_GOTTPREL
:
4343 asm_fprintf (asm_out_file
, ":gottprel:");
4346 case SYMBOL_SMALL_TPREL
:
4347 asm_fprintf (asm_out_file
, ":tprel:");
4350 case SYMBOL_TINY_GOT
:
4357 output_addr_const (asm_out_file
, x
);
4361 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4363 case SYMBOL_SMALL_GOT
:
4364 asm_fprintf (asm_out_file
, ":lo12:");
4367 case SYMBOL_SMALL_TLSGD
:
4368 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
4371 case SYMBOL_SMALL_TLSDESC
:
4372 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
4375 case SYMBOL_SMALL_GOTTPREL
:
4376 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
4379 case SYMBOL_SMALL_TPREL
:
4380 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
4383 case SYMBOL_TINY_GOT
:
4384 asm_fprintf (asm_out_file
, ":got:");
4390 output_addr_const (asm_out_file
, x
);
4395 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4397 case SYMBOL_SMALL_TPREL
:
4398 asm_fprintf (asm_out_file
, ":tprel_hi12:");
4403 output_addr_const (asm_out_file
, x
);
4411 if (!COMPARISON_P (x
))
4413 output_operand_lossage ("invalid operand for '%%%c'", code
);
4417 cond_code
= aarch64_get_condition_code_1 (CCmode
, GET_CODE (x
));
4418 gcc_assert (cond_code
>= 0);
4419 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
][0]);
4428 if (!COMPARISON_P (x
))
4430 output_operand_lossage ("invalid operand for '%%%c'", code
);
4434 cond_code
= aarch64_get_condition_code_1 (CCmode
, GET_CODE (x
));
4435 gcc_assert (cond_code
>= 0);
4436 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
][1]);
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
4447 aarch64_print_operand_address (FILE *f
, rtx x
)
4449 struct aarch64_address_info addr
;
4451 if (aarch64_classify_address (&addr
, x
, aarch64_memory_reference_mode
,
4455 case ADDRESS_REG_IMM
:
4456 if (addr
.offset
== const0_rtx
)
4457 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
4459 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
4460 INTVAL (addr
.offset
));
4463 case ADDRESS_REG_REG
:
4464 if (addr
.shift
== 0)
4465 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
4466 reg_names
[REGNO (addr
.offset
)]);
4468 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
4469 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
4472 case ADDRESS_REG_UXTW
:
4473 if (addr
.shift
== 0)
4474 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
4475 REGNO (addr
.offset
) - R0_REGNUM
);
4477 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
4478 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4481 case ADDRESS_REG_SXTW
:
4482 if (addr
.shift
== 0)
4483 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
4484 REGNO (addr
.offset
) - R0_REGNUM
);
4486 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
4487 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4490 case ADDRESS_REG_WB
:
4491 switch (GET_CODE (x
))
4494 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4498 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4502 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4506 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4510 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
4511 INTVAL (addr
.offset
));
4514 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
4515 INTVAL (addr
.offset
));
4522 case ADDRESS_LO_SUM
:
4523 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
4524 output_addr_const (f
, addr
.offset
);
4525 asm_fprintf (f
, "]");
4528 case ADDRESS_SYMBOLIC
:
4532 output_addr_const (f
, x
);
4536 aarch64_label_mentioned_p (rtx x
)
4541 if (GET_CODE (x
) == LABEL_REF
)
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4547 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
4550 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
4551 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
4557 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
4561 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
4568 /* Implement REGNO_REG_CLASS. */
4571 aarch64_regno_regclass (unsigned regno
)
4573 if (GP_REGNUM_P (regno
))
4574 return GENERAL_REGS
;
4576 if (regno
== SP_REGNUM
)
4579 if (regno
== FRAME_POINTER_REGNUM
4580 || regno
== ARG_POINTER_REGNUM
)
4581 return POINTER_REGS
;
4583 if (FP_REGNUM_P (regno
))
4584 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
4590 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
4599 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
4601 HOST_WIDE_INT offset
= INTVAL (XEXP (x
, 1));
4602 HOST_WIDE_INT base_offset
;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode
) > 16
4607 base_offset
= ((offset
+ 64 * GET_MODE_SIZE (mode
))
4608 & ~((128 * GET_MODE_SIZE (mode
)) - 1));
4609 /* For offsets aren't a multiple of the access size, the limit is
4611 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
4612 base_offset
= (offset
+ 0x100) & ~0x1ff;
4614 base_offset
= offset
& ~0xfff;
4616 if (base_offset
== 0)
4619 offset
-= base_offset
;
4620 rtx base_reg
= gen_reg_rtx (Pmode
);
4621 rtx val
= force_operand (plus_constant (Pmode
, XEXP (x
, 0), base_offset
),
4623 emit_move_insn (base_reg
, val
);
4624 x
= plus_constant (Pmode
, base_reg
, offset
);
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx
*x_p
,
4636 int opnum
, int type
,
4637 int ind_levels ATTRIBUTE_UNUSED
)
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode
)
4643 && GET_CODE (x
) == PLUS
4644 && REG_P (XEXP (x
, 0))
4645 && CONST_INT_P (XEXP (x
, 1)))
4649 push_reload (orig_rtx
, NULL_RTX
, x_p
, NULL
,
4650 BASE_REG_CLASS
, GET_MODE (x
), VOIDmode
, 0, 0,
4651 opnum
, (enum reload_type
) type
);
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x
) == PLUS
4657 && GET_CODE (XEXP (x
, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x
, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4660 && CONST_INT_P (XEXP (x
, 1)))
4662 push_reload (XEXP (x
, 0), NULL_RTX
, &XEXP (x
, 0), NULL
,
4663 BASE_REG_CLASS
, GET_MODE (x
), VOIDmode
, 0, 0,
4664 opnum
, (enum reload_type
) type
);
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with 12 bit offset field. */
4672 if (GET_CODE (x
) == PLUS
4673 && REG_P (XEXP (x
, 0))
4674 && CONST_INT_P (XEXP (x
, 1))
4675 && HARD_REGISTER_P (XEXP (x
, 0))
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x
, 0)), true))
4680 HOST_WIDE_INT val
= INTVAL (XEXP (x
, 1));
4681 HOST_WIDE_INT low
= val
& 0xfff;
4682 HOST_WIDE_INT high
= val
- low
;
4685 machine_mode xmode
= GET_MODE (x
);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode
== DImode
|| xmode
== SImode
);
4690 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4691 BLKmode alignment. */
4692 if (GET_MODE_SIZE (mode
) == 0)
4695 offs
= low
% GET_MODE_SIZE (mode
);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4700 if (aarch64_uimm12_shift (high
+ offs
))
4709 offs
= GET_MODE_SIZE (mode
) - offs
;
4711 high
= high
+ (low
& 0x1000) - offs
;
4716 /* Check for overflow. */
4717 if (high
+ low
!= val
)
4720 cst
= GEN_INT (high
);
4721 if (!aarch64_uimm12_shift (high
))
4722 cst
= force_const_mem (xmode
, cst
);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x
= gen_rtx_PLUS (xmode
,
4731 gen_rtx_PLUS (xmode
, XEXP (x
, 0), cst
),
4734 push_reload (XEXP (x
, 0), NULL_RTX
, &XEXP (x
, 0), NULL
,
4735 BASE_REG_CLASS
, xmode
, VOIDmode
, 0, 0,
4736 opnum
, (enum reload_type
) type
);
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
4748 secondary_reload_info
*sri
)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
4753 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass
, FP_REGS
))
4757 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
4758 else if (mode
== TImode
)
4759 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
4763 /* A TFmode or TImode memory access should be handled via an FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY
&& rclass
== GENERAL_REGS
4767 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
4770 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
4771 return GENERAL_REGS
;
4777 aarch64_can_eliminate (const int from
, const int to
)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed
)
4784 if (from
== ARG_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
4786 if (from
== ARG_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
)
4788 if (from
== FRAME_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
4789 && !cfun
->calls_alloca
)
4791 if (from
== FRAME_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to
== STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM
))
4811 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
4813 aarch64_layout_frame ();
4815 if (to
== HARD_FRAME_POINTER_REGNUM
)
4817 if (from
== ARG_POINTER_REGNUM
)
4818 return cfun
->machine
->frame
.frame_size
- crtl
->outgoing_args_size
;
4820 if (from
== FRAME_POINTER_REGNUM
)
4821 return (cfun
->machine
->frame
.hard_fp_offset
4822 - cfun
->machine
->frame
.saved_varargs_size
);
4825 if (to
== STACK_POINTER_REGNUM
)
4827 if (from
== FRAME_POINTER_REGNUM
)
4828 return (cfun
->machine
->frame
.frame_size
4829 - cfun
->machine
->frame
.saved_varargs_size
);
4832 return cfun
->machine
->frame
.frame_size
;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4839 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
4843 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
4848 aarch64_asm_trampoline_template (FILE *f
)
4852 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
4853 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
4857 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
4858 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
4860 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
4861 assemble_aligned_integer (4, const0_rtx
);
4862 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
4863 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
4867 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
4869 rtx fnaddr
, mem
, a_tramp
;
4870 const int tramp_code_sz
= 16;
4872 /* Don't need to copy the trailing D-words, we fill those in below. */
4873 emit_block_move (m_tramp
, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
4875 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
4876 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
4877 if (GET_MODE (fnaddr
) != ptr_mode
)
4878 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
4879 emit_move_insn (mem
, fnaddr
);
4881 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
4882 emit_move_insn (mem
, chain_value
);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp
= XEXP (m_tramp
, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
4888 LCT_NORMAL
, VOIDmode
, 2, a_tramp
, ptr_mode
,
4889 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
4898 case CALLER_SAVE_REGS
:
4905 aarch64_vector_mode_p (mode
) ? (GET_MODE_SIZE (mode
) + 15) / 16 :
4906 (GET_MODE_SIZE (mode
) + 7) / 8;
4920 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
4922 if (regclass
== POINTER_REGS
)
4923 return GENERAL_REGS
;
4925 if (regclass
== STACK_REG
)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
4937 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
4940 /* Register eliminiation can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations which
4942 use SP as source and an FP_REG as destination, so reject out
4944 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
4946 rtx lhs
= XEXP (x
, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs
) == SUBREG
)
4950 lhs
= SUBREG_REG (lhs
);
4952 gcc_assert (REG_P (lhs
));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Implement ASM_OUTPUT_LABELREF.  Print NAME to stream F, letting
   asm_fprintf's %U apply the user-label prefix.  */

void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
4968 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
4970 if (priority
== DEFAULT_INIT_PRIORITY
)
4971 default_ctor_section_asm_out_constructor (symbol
, priority
);
4976 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
4977 s
= get_section (buf
, SECTION_WRITE
, NULL
);
4978 switch_to_section (s
);
4979 assemble_align (POINTER_SIZE
);
4980 assemble_aligned_integer (POINTER_BYTES
, symbol
);
4985 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
4987 if (priority
== DEFAULT_INIT_PRIORITY
)
4988 default_dtor_section_asm_out_destructor (symbol
, priority
);
4993 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
4994 s
= get_section (buf
, SECTION_WRITE
, NULL
);
4995 switch_to_section (s
);
4996 assemble_align (POINTER_SIZE
);
4997 assemble_aligned_integer (POINTER_BYTES
, symbol
);
5002 aarch64_output_casesi (rtx
*operands
)
5006 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5008 static const char *const patterns
[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
5033 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
5035 gcc_assert (index
>= 0 && index
<= 3);
5037 /* Need to implement table size reduction, by chaning the code below. */
5038 output_asm_insn (patterns
[index
][0], operands
);
5039 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
5040 snprintf (buf
, sizeof (buf
),
5041 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
5042 output_asm_insn (buf
, operands
);
5043 output_asm_insn (patterns
[index
][1], operands
);
5044 output_asm_insn ("br\t%3", operands
);
5045 assemble_label (asm_out_file
, label
);
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5055 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
5057 if (shift
>= 0 && shift
<= 3)
5060 for (size
= 8; size
<= 32; size
*= 2)
5062 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
5063 if (mask
== bits
<< shift
)
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED
,
5072 const_rtx x ATTRIBUTE_UNUSED
)
5074 /* We can't use blocks for constants when we're using a per-function
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED
,
5081 rtx x ATTRIBUTE_UNUSED
,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED
)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl
);
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
5095 aarch64_strip_shift (rtx x
)
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op
) == ASHIFT
5102 || GET_CODE (op
) == ASHIFTRT
5103 || GET_CODE (op
) == LSHIFTRT
5104 || GET_CODE (op
) == ROTATERT
5105 || GET_CODE (op
) == ROTATE
)
5106 && CONST_INT_P (XEXP (op
, 1)))
5107 return XEXP (op
, 0);
5109 if (GET_CODE (op
) == MULT
5110 && CONST_INT_P (XEXP (op
, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
5112 return XEXP (op
, 0);
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
5122 aarch64_strip_extend (rtx x
)
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
5128 && XEXP (op
, 2) == const0_rtx
5129 && GET_CODE (XEXP (op
, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
5132 return XEXP (XEXP (op
, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5136 if (GET_CODE (op
) == AND
5137 && GET_CODE (XEXP (op
, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
5139 && CONST_INT_P (XEXP (op
, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
5141 INTVAL (XEXP (op
, 1))) != 0)
5142 return XEXP (XEXP (op
, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op
) == ASHIFT
5147 && CONST_INT_P (XEXP (op
, 1))
5148 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
5151 if (GET_CODE (op
) == ZERO_EXTEND
5152 || GET_CODE (op
) == SIGN_EXTEND
)
5161 /* Helper function for rtx cost calculation. Calculate the cost of
5162 a MULT, which may be part of a multiply-accumulate rtx. Return
5163 the calculated cost of the expression, recursing manually in to
5164 operands where needed. */
5167 aarch64_rtx_mult_cost (rtx x
, int code
, int outer
, bool speed
)
5170 const struct cpu_cost_table
*extra_cost
5171 = aarch64_tune_params
->insn_extra_cost
;
5173 bool maybe_fma
= (outer
== PLUS
|| outer
== MINUS
);
5174 machine_mode mode
= GET_MODE (x
);
5176 gcc_checking_assert (code
== MULT
);
5181 if (VECTOR_MODE_P (mode
))
5182 mode
= GET_MODE_INNER (mode
);
5184 /* Integer multiply/fma. */
5185 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5187 /* The multiply will be canonicalized as a shift, cost it as such. */
5188 if (CONST_INT_P (op1
)
5189 && exact_log2 (INTVAL (op1
)) > 0)
5194 /* ADD (shifted register). */
5195 cost
+= extra_cost
->alu
.arith_shift
;
5197 /* LSL (immediate). */
5198 cost
+= extra_cost
->alu
.shift
;
5201 cost
+= rtx_cost (op0
, GET_CODE (op0
), 0, speed
);
5206 /* Integer multiplies or FMAs have zero/sign extending variants. */
5207 if ((GET_CODE (op0
) == ZERO_EXTEND
5208 && GET_CODE (op1
) == ZERO_EXTEND
)
5209 || (GET_CODE (op0
) == SIGN_EXTEND
5210 && GET_CODE (op1
) == SIGN_EXTEND
))
5212 cost
+= rtx_cost (XEXP (op0
, 0), MULT
, 0, speed
)
5213 + rtx_cost (XEXP (op1
, 0), MULT
, 1, speed
);
5218 /* MADD/SMADDL/UMADDL. */
5219 cost
+= extra_cost
->mult
[0].extend_add
;
5221 /* MUL/SMULL/UMULL. */
5222 cost
+= extra_cost
->mult
[0].extend
;
5228 /* This is either an integer multiply or an FMA. In both cases
5229 we want to recurse and cost the operands. */
5230 cost
+= rtx_cost (op0
, MULT
, 0, speed
)
5231 + rtx_cost (op1
, MULT
, 1, speed
);
5237 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
5240 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
5249 /* Floating-point FMA/FMUL can also support negations of the
5251 if (GET_CODE (op0
) == NEG
)
5252 op0
= XEXP (op0
, 0);
5253 if (GET_CODE (op1
) == NEG
)
5254 op1
= XEXP (op1
, 0);
5257 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5258 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
5261 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
5264 cost
+= rtx_cost (op0
, MULT
, 0, speed
)
5265 + rtx_cost (op1
, MULT
, 1, speed
);
5271 aarch64_address_cost (rtx x
,
5273 addr_space_t as ATTRIBUTE_UNUSED
,
5276 enum rtx_code c
= GET_CODE (x
);
5277 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
->addr_cost
;
5278 struct aarch64_address_info info
;
5282 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
5284 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
5286 /* This is a CONST or SYMBOL ref which will be split
5287 in a different way depending on the code model in use.
5288 Cost it through the generic infrastructure. */
5289 int cost_symbol_ref
= rtx_cost (x
, MEM
, 1, speed
);
5290 /* Divide through by the cost of one instruction to
5291 bring it to the same units as the address costs. */
5292 cost_symbol_ref
/= COSTS_N_INSNS (1);
5293 /* The cost is then the cost of preparing the address,
5294 followed by an immediate (possibly 0) offset. */
5295 return cost_symbol_ref
+ addr_cost
->imm_offset
;
5299 /* This is most likely a jump table from a case
5301 return addr_cost
->register_offset
;
5307 case ADDRESS_LO_SUM
:
5308 case ADDRESS_SYMBOLIC
:
5309 case ADDRESS_REG_IMM
:
5310 cost
+= addr_cost
->imm_offset
;
5313 case ADDRESS_REG_WB
:
5314 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
5315 cost
+= addr_cost
->pre_modify
;
5316 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
5317 cost
+= addr_cost
->post_modify
;
5323 case ADDRESS_REG_REG
:
5324 cost
+= addr_cost
->register_offset
;
5327 case ADDRESS_REG_UXTW
:
5328 case ADDRESS_REG_SXTW
:
5329 cost
+= addr_cost
->register_extend
;
5339 /* For the sake of calculating the cost of the shifted register
5340 component, we can treat same sized modes in the same way. */
5341 switch (GET_MODE_BITSIZE (mode
))
5344 cost
+= addr_cost
->addr_scale_costs
.hi
;
5348 cost
+= addr_cost
->addr_scale_costs
.si
;
5352 cost
+= addr_cost
->addr_scale_costs
.di
;
5355 /* We can't tell, or this is a 128-bit vector. */
5357 cost
+= addr_cost
->addr_scale_costs
.ti
;
5365 /* Return true if the RTX X in mode MODE is a zero or sign extract
5366 usable in an ADD or SUB (extended register) instruction. */
5368 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
5370 /* Catch add with a sign extract.
5371 This is add_<optab><mode>_multp2. */
5372 if (GET_CODE (x
) == SIGN_EXTRACT
5373 || GET_CODE (x
) == ZERO_EXTRACT
)
5375 rtx op0
= XEXP (x
, 0);
5376 rtx op1
= XEXP (x
, 1);
5377 rtx op2
= XEXP (x
, 2);
5379 if (GET_CODE (op0
) == MULT
5380 && CONST_INT_P (op1
)
5381 && op2
== const0_rtx
5382 && CONST_INT_P (XEXP (op0
, 1))
5383 && aarch64_is_extend_from_extract (mode
,
5395 aarch64_frint_unspec_p (unsigned int u
)
5413 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5414 storing it in *COST. Result is true if the total cost of the operation
5415 has now been calculated. */
5417 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
5421 enum rtx_code cmpcode
;
5423 if (COMPARISON_P (op0
))
5425 inner
= XEXP (op0
, 0);
5426 comparator
= XEXP (op0
, 1);
5427 cmpcode
= GET_CODE (op0
);
5432 comparator
= const0_rtx
;
5436 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
5438 /* Conditional branch. */
5439 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
5443 if (cmpcode
== NE
|| cmpcode
== EQ
)
5445 if (comparator
== const0_rtx
)
5447 /* TBZ/TBNZ/CBZ/CBNZ. */
5448 if (GET_CODE (inner
) == ZERO_EXTRACT
)
5450 *cost
+= rtx_cost (XEXP (inner
, 0), ZERO_EXTRACT
,
5454 *cost
+= rtx_cost (inner
, cmpcode
, 0, speed
);
5459 else if (cmpcode
== LT
|| cmpcode
== GE
)
5462 if (comparator
== const0_rtx
)
5467 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
5469 /* It's a conditional operation based on the status flags,
5470 so it must be some flavor of CSEL. */
5472 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5473 if (GET_CODE (op1
) == NEG
5474 || GET_CODE (op1
) == NOT
5475 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
5476 op1
= XEXP (op1
, 0);
5478 *cost
+= rtx_cost (op1
, IF_THEN_ELSE
, 1, speed
);
5479 *cost
+= rtx_cost (op2
, IF_THEN_ELSE
, 2, speed
);
5483 /* We don't know what this is, cost all operands. */
5487 /* Calculate the cost of calculating X, storing it in *COST. Result
5488 is true if the total cost of the operation has now been calculated. */
5490 aarch64_rtx_costs (rtx x
, int code
, int outer ATTRIBUTE_UNUSED
,
5491 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
5494 const struct cpu_cost_table
*extra_cost
5495 = aarch64_tune_params
->insn_extra_cost
;
5496 machine_mode mode
= GET_MODE (x
);
5498 /* By default, assume that everything has equivalent cost to the
5499 cheapest instruction. Any additional costs are applied as a delta
5500 above this default. */
5501 *cost
= COSTS_N_INSNS (1);
5503 /* TODO: The cost infrastructure currently does not handle
5504 vector operations. Assume that all vector operations
5505 are equally expensive. */
5506 if (VECTOR_MODE_P (mode
))
5509 *cost
+= extra_cost
->vect
.alu
;
5516 /* The cost depends entirely on the operands to SET. */
5521 switch (GET_CODE (op0
))
5526 rtx address
= XEXP (op0
, 0);
5527 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5528 *cost
+= extra_cost
->ldst
.store
;
5529 else if (mode
== SFmode
)
5530 *cost
+= extra_cost
->ldst
.storef
;
5531 else if (mode
== DFmode
)
5532 *cost
+= extra_cost
->ldst
.stored
;
5535 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
5539 *cost
+= rtx_cost (op1
, SET
, 1, speed
);
5543 if (! REG_P (SUBREG_REG (op0
)))
5544 *cost
+= rtx_cost (SUBREG_REG (op0
), SET
, 0, speed
);
5548 /* const0_rtx is in general free, but we will use an
5549 instruction to set a register to 0. */
5550 if (REG_P (op1
) || op1
== const0_rtx
)
5552 /* The cost is 1 per register copied. */
5553 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
5555 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
5558 /* Cost is just the cost of the RHS of the set. */
5559 *cost
+= rtx_cost (op1
, SET
, 1, speed
);
5564 /* Bit-field insertion. Strip any redundant widening of
5565 the RHS to meet the width of the target. */
5566 if (GET_CODE (op1
) == SUBREG
)
5567 op1
= SUBREG_REG (op1
);
5568 if ((GET_CODE (op1
) == ZERO_EXTEND
5569 || GET_CODE (op1
) == SIGN_EXTEND
)
5570 && CONST_INT_P (XEXP (op0
, 1))
5571 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1
, 0)))
5572 >= INTVAL (XEXP (op0
, 1))))
5573 op1
= XEXP (op1
, 0);
5575 if (CONST_INT_P (op1
))
5577 /* MOV immediate is assumed to always be cheap. */
5578 *cost
= COSTS_N_INSNS (1);
5584 *cost
+= extra_cost
->alu
.bfi
;
5585 *cost
+= rtx_cost (op1
, (enum rtx_code
) code
, 1, speed
);
5591 /* We can't make sense of this, assume default cost. */
5592 *cost
= COSTS_N_INSNS (1);
5598 /* If an instruction can incorporate a constant within the
5599 instruction, the instruction's expression avoids calling
5600 rtx_cost() on the constant. If rtx_cost() is called on a
5601 constant, then it is usually because the constant must be
5602 moved into a register by one or more instructions.
5604 The exception is constant 0, which can be expressed
5605 as XZR/WZR and is therefore free. The exception to this is
5606 if we have (set (reg) (const0_rtx)) in which case we must cost
5607 the move. However, we can catch that when we cost the SET, so
5608 we don't need to consider that here. */
5609 if (x
== const0_rtx
)
5613 /* To an approximation, building any other constant is
5614 proportionally expensive to the number of instructions
5615 required to build that constant. This is true whether we
5616 are compiling for SPEED or otherwise. */
5617 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
5618 (NULL_RTX
, x
, false, mode
));
5625 /* mov[df,sf]_aarch64. */
5626 if (aarch64_float_const_representable_p (x
))
5627 /* FMOV (scalar immediate). */
5628 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
5629 else if (!aarch64_float_const_zero_rtx_p (x
))
5631 /* This will be a load from memory. */
5633 *cost
+= extra_cost
->ldst
.loadd
;
5635 *cost
+= extra_cost
->ldst
.loadf
;
5638 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5639 or MOV v0.s[0], wzr - neither of which are modeled by the
5640 cost tables. Just use the default cost. */
5650 /* For loads we want the base cost of a load, plus an
5651 approximation for the additional cost of the addressing
5653 rtx address
= XEXP (x
, 0);
5654 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5655 *cost
+= extra_cost
->ldst
.load
;
5656 else if (mode
== SFmode
)
5657 *cost
+= extra_cost
->ldst
.loadf
;
5658 else if (mode
== DFmode
)
5659 *cost
+= extra_cost
->ldst
.loadd
;
5662 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
5671 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
5673 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
5674 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
5677 *cost
+= rtx_cost (XEXP (op0
, 0), NEG
, 0, speed
);
5681 /* Cost this as SUB wzr, X. */
5682 op0
= CONST0_RTX (GET_MODE (x
));
5687 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
5689 /* Support (neg(fma...)) as a single instruction only if
5690 sign of zeros is unimportant. This matches the decision
5691 making in aarch64.md. */
5692 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
5695 *cost
= rtx_cost (op0
, NEG
, 0, speed
);
5700 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
5709 *cost
+= extra_cost
->alu
.clz
;
5717 if (op1
== const0_rtx
5718 && GET_CODE (op0
) == AND
)
5724 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
5726 /* TODO: A write to the CC flags possibly costs extra, this
5727 needs encoding in the cost tables. */
5729 /* CC_ZESWPmode supports zero extend for free. */
5730 if (GET_MODE (x
) == CC_ZESWPmode
&& GET_CODE (op0
) == ZERO_EXTEND
)
5731 op0
= XEXP (op0
, 0);
5734 if (GET_CODE (op0
) == AND
)
5740 if (GET_CODE (op0
) == PLUS
)
5742 /* ADDS (and CMN alias). */
5747 if (GET_CODE (op0
) == MINUS
)
5754 if (GET_CODE (op1
) == NEG
)
5758 *cost
+= extra_cost
->alu
.arith
;
5760 *cost
+= rtx_cost (op0
, COMPARE
, 0, speed
);
5761 *cost
+= rtx_cost (XEXP (op1
, 0), NEG
, 1, speed
);
5767 Compare can freely swap the order of operands, and
5768 canonicalization puts the more complex operation first.
5769 But the integer MINUS logic expects the shift/extend
5770 operation in op1. */
5772 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
5780 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
5784 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
5786 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
5788 /* FCMP supports constant 0.0 for no extra cost. */
5802 /* Detect valid immediates. */
5803 if ((GET_MODE_CLASS (mode
) == MODE_INT
5804 || (GET_MODE_CLASS (mode
) == MODE_CC
5805 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
5806 && CONST_INT_P (op1
)
5807 && aarch64_uimm12_shift (INTVAL (op1
)))
5809 *cost
+= rtx_cost (op0
, MINUS
, 0, speed
);
5812 /* SUB(S) (immediate). */
5813 *cost
+= extra_cost
->alu
.arith
;
5818 /* Look for SUB (extended register). */
5819 if (aarch64_rtx_arith_op_extract_p (op1
, mode
))
5822 *cost
+= extra_cost
->alu
.extend_arith
;
5824 *cost
+= rtx_cost (XEXP (XEXP (op1
, 0), 0),
5825 (enum rtx_code
) GET_CODE (op1
),
5830 rtx new_op1
= aarch64_strip_extend (op1
);
5832 /* Cost this as an FMA-alike operation. */
5833 if ((GET_CODE (new_op1
) == MULT
5834 || GET_CODE (new_op1
) == ASHIFT
)
5837 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
5838 (enum rtx_code
) code
,
5840 *cost
+= rtx_cost (op0
, MINUS
, 0, speed
);
5844 *cost
+= rtx_cost (new_op1
, MINUS
, 1, speed
);
5848 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5850 *cost
+= extra_cost
->alu
.arith
;
5851 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
5853 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
5866 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
5867 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
5870 *cost
+= rtx_cost (XEXP (op0
, 0), PLUS
, 0, speed
);
5871 *cost
+= rtx_cost (op1
, PLUS
, 1, speed
);
5875 if (GET_MODE_CLASS (mode
) == MODE_INT
5876 && CONST_INT_P (op1
)
5877 && aarch64_uimm12_shift (INTVAL (op1
)))
5879 *cost
+= rtx_cost (op0
, PLUS
, 0, speed
);
5882 /* ADD (immediate). */
5883 *cost
+= extra_cost
->alu
.arith
;
5887 /* Look for ADD (extended register). */
5888 if (aarch64_rtx_arith_op_extract_p (op0
, mode
))
5891 *cost
+= extra_cost
->alu
.extend_arith
;
5893 *cost
+= rtx_cost (XEXP (XEXP (op0
, 0), 0),
5894 (enum rtx_code
) GET_CODE (op0
),
5899 /* Strip any extend, leave shifts behind as we will
5900 cost them through mult_cost. */
5901 new_op0
= aarch64_strip_extend (op0
);
5903 if (GET_CODE (new_op0
) == MULT
5904 || GET_CODE (new_op0
) == ASHIFT
)
5906 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
5908 *cost
+= rtx_cost (op1
, PLUS
, 1, speed
);
5912 *cost
+= (rtx_cost (new_op0
, PLUS
, 0, speed
)
5913 + rtx_cost (op1
, PLUS
, 1, speed
));
5917 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5919 *cost
+= extra_cost
->alu
.arith
;
5920 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
5922 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
5928 *cost
= COSTS_N_INSNS (1);
5931 *cost
+= extra_cost
->alu
.rev
;
5936 if (aarch_rev16_p (x
))
5938 *cost
= COSTS_N_INSNS (1);
5941 *cost
+= extra_cost
->alu
.rev
;
5953 && GET_CODE (op0
) == MULT
5954 && CONST_INT_P (XEXP (op0
, 1))
5955 && CONST_INT_P (op1
)
5956 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
5959 /* This is a UBFM/SBFM. */
5960 *cost
+= rtx_cost (XEXP (op0
, 0), ZERO_EXTRACT
, 0, speed
);
5962 *cost
+= extra_cost
->alu
.bfx
;
5966 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
5968 /* We possibly get the immediate for free, this is not
5970 if (CONST_INT_P (op1
)
5971 && aarch64_bitmask_imm (INTVAL (op1
), GET_MODE (x
)))
5973 *cost
+= rtx_cost (op0
, (enum rtx_code
) code
, 0, speed
);
5976 *cost
+= extra_cost
->alu
.logical
;
5984 /* Handle ORN, EON, or BIC. */
5985 if (GET_CODE (op0
) == NOT
)
5986 op0
= XEXP (op0
, 0);
5988 new_op0
= aarch64_strip_shift (op0
);
5990 /* If we had a shift on op0 then this is a logical-shift-
5991 by-register/immediate operation. Otherwise, this is just
5992 a logical operation. */
5997 /* Shift by immediate. */
5998 if (CONST_INT_P (XEXP (op0
, 1)))
5999 *cost
+= extra_cost
->alu
.log_shift
;
6001 *cost
+= extra_cost
->alu
.log_shift_reg
;
6004 *cost
+= extra_cost
->alu
.logical
;
6007 /* In both cases we want to cost both operands. */
6008 *cost
+= rtx_cost (new_op0
, (enum rtx_code
) code
, 0, speed
)
6009 + rtx_cost (op1
, (enum rtx_code
) code
, 1, speed
);
6019 *cost
+= extra_cost
->alu
.logical
;
6021 /* The logical instruction could have the shifted register form,
6022 but the cost is the same if the shift is processed as a separate
6023 instruction, so we don't bother with it here. */
6029 /* If a value is written in SI mode, then zero extended to DI
6030 mode, the operation will in general be free as a write to
6031 a 'w' register implicitly zeroes the upper bits of an 'x'
6032 register. However, if this is
6034 (set (reg) (zero_extend (reg)))
6036 we must cost the explicit register move. */
6038 && GET_MODE (op0
) == SImode
6041 int op_cost
= rtx_cost (XEXP (x
, 0), ZERO_EXTEND
, 0, speed
);
6043 if (!op_cost
&& speed
)
6045 *cost
+= extra_cost
->alu
.extend
;
6047 /* Free, the cost is that of the SI mode operation. */
6052 else if (MEM_P (XEXP (x
, 0)))
6054 /* All loads can zero extend to any size for free. */
6055 *cost
= rtx_cost (XEXP (x
, 0), ZERO_EXTEND
, param
, speed
);
6061 *cost
+= extra_cost
->alu
.extend
;
6066 if (MEM_P (XEXP (x
, 0)))
6071 rtx address
= XEXP (XEXP (x
, 0), 0);
6072 *cost
+= extra_cost
->ldst
.load_sign_extend
;
6075 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6082 *cost
+= extra_cost
->alu
.extend
;
6089 if (CONST_INT_P (op1
))
6091 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
6094 *cost
+= extra_cost
->alu
.shift
;
6096 /* We can incorporate zero/sign extend for free. */
6097 if (GET_CODE (op0
) == ZERO_EXTEND
6098 || GET_CODE (op0
) == SIGN_EXTEND
)
6099 op0
= XEXP (op0
, 0);
6101 *cost
+= rtx_cost (op0
, ASHIFT
, 0, speed
);
6108 *cost
+= extra_cost
->alu
.shift_reg
;
6110 return false; /* All arguments need to be in registers. */
6120 if (CONST_INT_P (op1
))
6122 /* ASR (immediate) and friends. */
6124 *cost
+= extra_cost
->alu
.shift
;
6126 *cost
+= rtx_cost (op0
, (enum rtx_code
) code
, 0, speed
);
6132 /* ASR (register) and friends. */
6134 *cost
+= extra_cost
->alu
.shift_reg
;
6136 return false; /* All arguments need to be in registers. */
6141 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
)
6145 *cost
+= extra_cost
->ldst
.load
;
6147 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
6148 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
6150 /* ADRP, followed by ADD. */
6151 *cost
+= COSTS_N_INSNS (1);
6153 *cost
+= 2 * extra_cost
->alu
.arith
;
6155 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
6156 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
6160 *cost
+= extra_cost
->alu
.arith
;
6165 /* One extra load instruction, after accessing the GOT. */
6166 *cost
+= COSTS_N_INSNS (1);
6168 *cost
+= extra_cost
->ldst
.load
;
6174 /* ADRP/ADD (immediate). */
6176 *cost
+= extra_cost
->alu
.arith
;
6183 *cost
+= extra_cost
->alu
.bfx
;
6185 /* We can trust that the immediates used will be correct (there
6186 are no by-register forms), so we need only cost op0. */
6187 *cost
+= rtx_cost (XEXP (x
, 0), (enum rtx_code
) code
, 0, speed
);
6191 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
6192 /* aarch64_rtx_mult_cost always handles recursion to its
6200 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
6201 *cost
+= (extra_cost
->mult
[GET_MODE (x
) == DImode
].add
6202 + extra_cost
->mult
[GET_MODE (x
) == DImode
].idiv
);
6203 else if (GET_MODE (x
) == DFmode
)
6204 *cost
+= (extra_cost
->fp
[1].mult
6205 + extra_cost
->fp
[1].div
);
6206 else if (GET_MODE (x
) == SFmode
)
6207 *cost
+= (extra_cost
->fp
[0].mult
6208 + extra_cost
->fp
[0].div
);
6210 return false; /* All arguments need to be in registers. */
6217 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6218 /* There is no integer SQRT, so only DIV and UDIV can get
6220 *cost
+= extra_cost
->mult
[mode
== DImode
].idiv
;
6222 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
6224 return false; /* All arguments need to be in registers. */
6227 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
6228 XEXP (x
, 2), cost
, speed
);
6241 return false; /* All arguments must be in registers. */
6249 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6251 /* FMSUB, FNMADD, and FNMSUB are free. */
6252 if (GET_CODE (op0
) == NEG
)
6253 op0
= XEXP (op0
, 0);
6255 if (GET_CODE (op2
) == NEG
)
6256 op2
= XEXP (op2
, 0);
6258 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6259 and the by-element operand as operand 0. */
6260 if (GET_CODE (op1
) == NEG
)
6261 op1
= XEXP (op1
, 0);
6263 /* Catch vector-by-element operations. The by-element operand can
6264 either be (vec_duplicate (vec_select (x))) or just
6265 (vec_select (x)), depending on whether we are multiplying by
6266 a vector or a scalar.
6268 Canonicalization is not very good in these cases, FMA4 will put the
6269 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6270 if (GET_CODE (op0
) == VEC_DUPLICATE
)
6271 op0
= XEXP (op0
, 0);
6272 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
6273 op1
= XEXP (op1
, 0);
6275 if (GET_CODE (op0
) == VEC_SELECT
)
6276 op0
= XEXP (op0
, 0);
6277 else if (GET_CODE (op1
) == VEC_SELECT
)
6278 op1
= XEXP (op1
, 0);
6280 /* If the remaining parameters are not registers,
6281 get the cost to put them into registers. */
6282 *cost
+= rtx_cost (op0
, FMA
, 0, speed
);
6283 *cost
+= rtx_cost (op1
, FMA
, 1, speed
);
6284 *cost
+= rtx_cost (op2
, FMA
, 2, speed
);
6289 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
6292 case FLOAT_TRUNCATE
:
6294 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
6300 /* Strip the rounding part. They will all be implemented
6301 by the fcvt* family of instructions anyway. */
6302 if (GET_CODE (x
) == UNSPEC
)
6304 unsigned int uns_code
= XINT (x
, 1);
6306 if (uns_code
== UNSPEC_FRINTA
6307 || uns_code
== UNSPEC_FRINTM
6308 || uns_code
== UNSPEC_FRINTN
6309 || uns_code
== UNSPEC_FRINTP
6310 || uns_code
== UNSPEC_FRINTZ
)
6311 x
= XVECEXP (x
, 0, 0);
6315 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
6317 *cost
+= rtx_cost (x
, (enum rtx_code
) code
, 0, speed
);
6321 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6323 /* FABS and FNEG are analogous. */
6325 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
6329 /* Integer ABS will either be split to
6330 two arithmetic instructions, or will be an ABS
6331 (scalar), which we don't model. */
6332 *cost
= COSTS_N_INSNS (2);
6334 *cost
+= 2 * extra_cost
->alu
.arith
;
6342 /* FMAXNM/FMINNM/FMAX/FMIN.
6343 TODO: This may not be accurate for all implementations, but
6344 we do not model this in the cost tables. */
6345 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
6350 /* The floating point round to integer frint* instructions. */
6351 if (aarch64_frint_unspec_p (XINT (x
, 1)))
6354 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
6359 if (XINT (x
, 1) == UNSPEC_RBIT
)
6362 *cost
+= extra_cost
->alu
.rev
;
6370 /* Decompose <su>muldi3_highpart. */
6371 if (/* (truncate:DI */
6374 && GET_MODE (XEXP (x
, 0)) == TImode
6375 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
6377 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
6378 /* (ANY_EXTEND:TI (reg:DI))
6379 (ANY_EXTEND:TI (reg:DI))) */
6380 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
6381 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
6382 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
6383 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
6384 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
6385 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
6386 /* (const_int 64) */
6387 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6388 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
6392 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
6393 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
6395 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
6405 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
6407 "\nFailed to cost RTX. Assuming default cost.\n");
6412 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6413 calculated for X. This cost is stored in *COST. Returns true
6414 if the total cost of X was calculated. */
6416 aarch64_rtx_costs_wrapper (rtx x
, int code
, int outer
,
6417 int param
, int *cost
, bool speed
)
6419 bool result
= aarch64_rtx_costs (x
, code
, outer
, param
, cost
, speed
);
6421 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
6423 print_rtl_single (dump_file
, x
);
6424 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
6425 speed
? "Hot" : "Cold",
6426 *cost
, result
? "final" : "partial");
6433 aarch64_register_move_cost (machine_mode mode
,
6434 reg_class_t from_i
, reg_class_t to_i
)
6436 enum reg_class from
= (enum reg_class
) from_i
;
6437 enum reg_class to
= (enum reg_class
) to_i
;
6438 const struct cpu_regmove_cost
*regmove_cost
6439 = aarch64_tune_params
->regmove_cost
;
6441 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6442 if (to
== CALLER_SAVE_REGS
|| to
== POINTER_REGS
)
6445 if (from
== CALLER_SAVE_REGS
|| from
== POINTER_REGS
)
6446 from
= GENERAL_REGS
;
6448 /* Moving between GPR and stack cost is the same as GP2GP. */
6449 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
6450 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
6451 return regmove_cost
->GP2GP
;
6453 /* To/From the stack register, we move via the gprs. */
6454 if (to
== STACK_REG
|| from
== STACK_REG
)
6455 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
6456 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
6458 if (GET_MODE_SIZE (mode
) == 16)
6460 /* 128-bit operations on general registers require 2 instructions. */
6461 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
6462 return regmove_cost
->GP2GP
* 2;
6463 else if (from
== GENERAL_REGS
)
6464 return regmove_cost
->GP2FP
* 2;
6465 else if (to
== GENERAL_REGS
)
6466 return regmove_cost
->FP2GP
* 2;
6468 /* When AdvSIMD instructions are disabled it is not possible to move
6469 a 128-bit value directly between Q registers. This is handled in
6470 secondary reload. A general register is used as a scratch to move
6471 the upper DI value and the lower DI value is moved directly,
6472 hence the cost is the sum of three moves. */
6474 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
6476 return regmove_cost
->FP2FP
;
6479 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
6480 return regmove_cost
->GP2GP
;
6481 else if (from
== GENERAL_REGS
)
6482 return regmove_cost
->GP2FP
;
6483 else if (to
== GENERAL_REGS
)
6484 return regmove_cost
->FP2GP
;
6486 return regmove_cost
->FP2FP
;
6490 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
6491 reg_class_t rclass ATTRIBUTE_UNUSED
,
6492 bool in ATTRIBUTE_UNUSED
)
6494 return aarch64_tune_params
->memmov_cost
;
6497 /* Return the number of instructions that can be issued per cycle. */
6499 aarch64_sched_issue_rate (void)
6501 return aarch64_tune_params
->issue_rate
;
6505 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6507 int issue_rate
= aarch64_sched_issue_rate ();
6509 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
6512 /* Vectorizer cost model target hooks. */
6514 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6516 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
6518 int misalign ATTRIBUTE_UNUSED
)
6522 switch (type_of_cost
)
6525 return aarch64_tune_params
->vec_costs
->scalar_stmt_cost
;
6528 return aarch64_tune_params
->vec_costs
->scalar_load_cost
;
6531 return aarch64_tune_params
->vec_costs
->scalar_store_cost
;
6534 return aarch64_tune_params
->vec_costs
->vec_stmt_cost
;
6537 return aarch64_tune_params
->vec_costs
->vec_align_load_cost
;
6540 return aarch64_tune_params
->vec_costs
->vec_store_cost
;
6543 return aarch64_tune_params
->vec_costs
->vec_to_scalar_cost
;
6546 return aarch64_tune_params
->vec_costs
->scalar_to_vec_cost
;
6548 case unaligned_load
:
6549 return aarch64_tune_params
->vec_costs
->vec_unalign_load_cost
;
6551 case unaligned_store
:
6552 return aarch64_tune_params
->vec_costs
->vec_unalign_store_cost
;
6554 case cond_branch_taken
:
6555 return aarch64_tune_params
->vec_costs
->cond_taken_branch_cost
;
6557 case cond_branch_not_taken
:
6558 return aarch64_tune_params
->vec_costs
->cond_not_taken_branch_cost
;
6561 case vec_promote_demote
:
6562 return aarch64_tune_params
->vec_costs
->vec_stmt_cost
;
6565 elements
= TYPE_VECTOR_SUBPARTS (vectype
);
6566 return elements
/ 2 + 1;
6573 /* Implement targetm.vectorize.add_stmt_cost. */
6575 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
6576 struct _stmt_vec_info
*stmt_info
, int misalign
,
6577 enum vect_cost_model_location where
)
6579 unsigned *cost
= (unsigned *) data
;
6580 unsigned retval
= 0;
6582 if (flag_vect_cost_model
)
6584 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
6586 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
6588 /* Statements in an inner loop relative to the loop being
6589 vectorized are weighted more heavily. The value here is
6590 a function (linear for now) of the loop nest level. */
6591 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
6593 loop_vec_info loop_info
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6594 struct loop
*loop
= LOOP_VINFO_LOOP (loop_info
);
6595 unsigned nest_level
= loop_depth (loop
);
6597 count
*= nest_level
;
6600 retval
= (unsigned) (count
* stmt_cost
);
6601 cost
[where
] += retval
;
6607 static void initialize_aarch64_code_model (void);
6609 /* Parse the architecture extension string. */
6612 aarch64_parse_extension (char *str
)
6614 /* The extension string is parsed left to right. */
6615 const struct aarch64_option_extension
*opt
= NULL
;
6617 /* Flag to say whether we are adding or removing an extension. */
6618 int adding_ext
= -1;
6620 while (str
!= NULL
&& *str
!= 0)
6626 ext
= strchr (str
, '+');
6633 if (len
>= 2 && strncmp (str
, "no", 2) == 0)
6644 error ("missing feature modifier after %qs", adding_ext
? "+"
6649 /* Scan over the extensions table trying to find an exact match. */
6650 for (opt
= all_extensions
; opt
->name
!= NULL
; opt
++)
6652 if (strlen (opt
->name
) == len
&& strncmp (opt
->name
, str
, len
) == 0)
6654 /* Add or remove the extension. */
6656 aarch64_isa_flags
|= opt
->flags_on
;
6658 aarch64_isa_flags
&= ~(opt
->flags_off
);
6663 if (opt
->name
== NULL
)
6665 /* Extension not found in list. */
6666 error ("unknown feature modifier %qs", str
);
6676 /* Parse the ARCH string. */
6679 aarch64_parse_arch (void)
6682 const struct processor
*arch
;
6683 char *str
= (char *) alloca (strlen (aarch64_arch_string
) + 1);
6686 strcpy (str
, aarch64_arch_string
);
6688 ext
= strchr (str
, '+');
6697 error ("missing arch name in -march=%qs", str
);
6701 /* Loop through the list of supported ARCHs to find a match. */
6702 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
6704 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
6706 selected_arch
= arch
;
6707 aarch64_isa_flags
= selected_arch
->flags
;
6710 selected_cpu
= &all_cores
[selected_arch
->core
];
6714 /* ARCH string contains at least one extension. */
6715 aarch64_parse_extension (ext
);
6718 if (strcmp (selected_arch
->arch
, selected_cpu
->arch
))
6720 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6721 selected_cpu
->name
, selected_arch
->name
);
6728 /* ARCH name not found in list. */
6729 error ("unknown value %qs for -march", str
);
6733 /* Parse the CPU string. */
6736 aarch64_parse_cpu (void)
6739 const struct processor
*cpu
;
6740 char *str
= (char *) alloca (strlen (aarch64_cpu_string
) + 1);
6743 strcpy (str
, aarch64_cpu_string
);
6745 ext
= strchr (str
, '+');
6754 error ("missing cpu name in -mcpu=%qs", str
);
6758 /* Loop through the list of supported CPUs to find a match. */
6759 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
6761 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
6764 aarch64_isa_flags
= selected_cpu
->flags
;
6768 /* CPU string contains at least one extension. */
6769 aarch64_parse_extension (ext
);
6776 /* CPU name not found in list. */
6777 error ("unknown value %qs for -mcpu", str
);
6781 /* Parse the TUNE string. */
6784 aarch64_parse_tune (void)
6786 const struct processor
*cpu
;
6787 char *str
= (char *) alloca (strlen (aarch64_tune_string
) + 1);
6788 strcpy (str
, aarch64_tune_string
);
6790 /* Loop through the list of supported CPUs to find a match. */
6791 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
6793 if (strcmp (cpu
->name
, str
) == 0)
6795 selected_tune
= cpu
;
6800 /* CPU name not found in list. */
6801 error ("unknown value %qs for -mtune", str
);
6806 /* Implement TARGET_OPTION_OVERRIDE. */
6809 aarch64_override_options (void)
6811 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6812 If either of -march or -mtune is given, they override their
6813 respective component of -mcpu.
6815 So, first parse AARCH64_CPU_STRING, then the others, be careful
6816 with -march as, if -mcpu is not present on the command line, march
6817 must set a sensible default CPU. */
6818 if (aarch64_cpu_string
)
6820 aarch64_parse_cpu ();
6823 if (aarch64_arch_string
)
6825 aarch64_parse_arch ();
6828 if (aarch64_tune_string
)
6830 aarch64_parse_tune ();
6833 #ifndef HAVE_AS_MABI_OPTION
6834 /* The compiler may have been configured with 2.23.* binutils, which does
6835 not have support for ILP32. */
6837 error ("Assembler does not support -mabi=ilp32");
6840 initialize_aarch64_code_model ();
6842 aarch64_build_bitmask_table ();
6844 /* This target defaults to strict volatile bitfields. */
6845 if (flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
6846 flag_strict_volatile_bitfields
= 1;
6848 /* If the user did not specify a processor, choose the default
6849 one for them. This will be the CPU set during configuration using
6850 --with-cpu, otherwise it is "generic". */
6853 selected_cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
6854 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
6857 gcc_assert (selected_cpu
);
6860 selected_tune
= selected_cpu
;
6862 aarch64_tune_flags
= selected_tune
->flags
;
6863 aarch64_tune
= selected_tune
->core
;
6864 aarch64_tune_params
= selected_tune
->tune
;
6865 aarch64_architecture_version
= selected_cpu
->architecture_version
;
6867 if (aarch64_fix_a53_err835769
== 2)
6869 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6870 aarch64_fix_a53_err835769
= 1;
6872 aarch64_fix_a53_err835769
= 0;
6876 /* If not opzimizing for size, set the default
6877 alignment to what the target wants */
6880 if (align_loops
<= 0)
6881 align_loops
= aarch64_tune_params
->loop_align
;
6882 if (align_jumps
<= 0)
6883 align_jumps
= aarch64_tune_params
->jump_align
;
6884 if (align_functions
<= 0)
6885 align_functions
= aarch64_tune_params
->function_align
;
6888 if (AARCH64_TUNE_FMA_STEERING
)
6889 aarch64_register_fma_steering ();
6891 aarch64_override_options_after_change ();
6894 /* Implement targetm.override_options_after_change. */
6897 aarch64_override_options_after_change (void)
6899 if (flag_omit_frame_pointer
)
6900 flag_omit_leaf_frame_pointer
= false;
6901 else if (flag_omit_leaf_frame_pointer
)
6902 flag_omit_frame_pointer
= true;
6905 static struct machine_function
*
6906 aarch64_init_machine_status (void)
6908 struct machine_function
*machine
;
6909 machine
= ggc_cleared_alloc
<machine_function
> ();
6914 aarch64_init_expanders (void)
6916 init_machine_status
= aarch64_init_machine_status
;
6919 /* A checking mechanism for the implementation of the various code models. */
6921 initialize_aarch64_code_model (void)
6925 switch (aarch64_cmodel_var
)
6927 case AARCH64_CMODEL_TINY
:
6928 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
6930 case AARCH64_CMODEL_SMALL
:
6931 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
6933 case AARCH64_CMODEL_LARGE
:
6934 sorry ("code model %qs with -f%s", "large",
6935 flag_pic
> 1 ? "PIC" : "pic");
6941 aarch64_cmodel
= aarch64_cmodel_var
;
6944 /* Return true if SYMBOL_REF X binds locally. */
6947 aarch64_symbol_binds_local_p (const_rtx x
)
6949 return (SYMBOL_REF_DECL (x
)
6950 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
6951 : SYMBOL_REF_LOCAL_P (x
));
6954 /* Return true if SYMBOL_REF X is thread local */
6956 aarch64_tls_symbol_p (rtx x
)
6958 if (! TARGET_HAVE_TLS
)
6961 if (GET_CODE (x
) != SYMBOL_REF
)
6964 return SYMBOL_REF_TLS_MODEL (x
) != 0;
6967 /* Classify a TLS symbol into one of the TLS kinds. */
6968 enum aarch64_symbol_type
6969 aarch64_classify_tls_symbol (rtx x
)
6971 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
6975 case TLS_MODEL_GLOBAL_DYNAMIC
:
6976 case TLS_MODEL_LOCAL_DYNAMIC
:
6977 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
6979 case TLS_MODEL_INITIAL_EXEC
:
6980 return SYMBOL_SMALL_GOTTPREL
;
6982 case TLS_MODEL_LOCAL_EXEC
:
6983 return SYMBOL_SMALL_TPREL
;
6985 case TLS_MODEL_EMULATED
:
6986 case TLS_MODEL_NONE
:
6987 return SYMBOL_FORCE_TO_MEM
;
6994 /* Return the method that should be used to access SYMBOL_REF or
6995 LABEL_REF X in context CONTEXT. */
6997 enum aarch64_symbol_type
6998 aarch64_classify_symbol (rtx x
, rtx offset
,
6999 enum aarch64_symbol_context context ATTRIBUTE_UNUSED
)
7001 if (GET_CODE (x
) == LABEL_REF
)
7003 switch (aarch64_cmodel
)
7005 case AARCH64_CMODEL_LARGE
:
7006 return SYMBOL_FORCE_TO_MEM
;
7008 case AARCH64_CMODEL_TINY_PIC
:
7009 case AARCH64_CMODEL_TINY
:
7010 return SYMBOL_TINY_ABSOLUTE
;
7012 case AARCH64_CMODEL_SMALL_PIC
:
7013 case AARCH64_CMODEL_SMALL
:
7014 return SYMBOL_SMALL_ABSOLUTE
;
7021 if (GET_CODE (x
) == SYMBOL_REF
)
7023 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
)
7024 return SYMBOL_FORCE_TO_MEM
;
7026 if (aarch64_tls_symbol_p (x
))
7027 return aarch64_classify_tls_symbol (x
);
7029 switch (aarch64_cmodel
)
7031 case AARCH64_CMODEL_TINY
:
7032 /* When we retreive symbol + offset address, we have to make sure
7033 the offset does not cause overflow of the final address. But
7034 we have no way of knowing the address of symbol at compile time
7035 so we can't accurately say if the distance between the PC and
7036 symbol + offset is outside the addressible range of +/-1M in the
7037 TINY code model. So we rely on images not being greater than
7038 1M and cap the offset at 1M and anything beyond 1M will have to
7039 be loaded using an alternative mechanism. */
7040 if (SYMBOL_REF_WEAK (x
)
7041 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
7042 return SYMBOL_FORCE_TO_MEM
;
7043 return SYMBOL_TINY_ABSOLUTE
;
7045 case AARCH64_CMODEL_SMALL
:
7046 /* Same reasoning as the tiny code model, but the offset cap here is
7048 if (SYMBOL_REF_WEAK (x
)
7049 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
7050 HOST_WIDE_INT_C (4294967264)))
7051 return SYMBOL_FORCE_TO_MEM
;
7052 return SYMBOL_SMALL_ABSOLUTE
;
7054 case AARCH64_CMODEL_TINY_PIC
:
7055 if (!aarch64_symbol_binds_local_p (x
))
7056 return SYMBOL_TINY_GOT
;
7057 return SYMBOL_TINY_ABSOLUTE
;
7059 case AARCH64_CMODEL_SMALL_PIC
:
7060 if (!aarch64_symbol_binds_local_p (x
))
7061 return SYMBOL_SMALL_GOT
;
7062 return SYMBOL_SMALL_ABSOLUTE
;
7069 /* By default push everything into the constant pool. */
7070 return SYMBOL_FORCE_TO_MEM
;
7074 aarch64_constant_address_p (rtx x
)
7076 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
7080 aarch64_legitimate_pic_operand_p (rtx x
)
7082 if (GET_CODE (x
) == SYMBOL_REF
7083 || (GET_CODE (x
) == CONST
7084 && GET_CODE (XEXP (x
, 0)) == PLUS
7085 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
7091 /* Return true if X holds either a quarter-precision or
7092 floating-point +0.0 constant. */
7094 aarch64_valid_floating_const (machine_mode mode
, rtx x
)
7096 if (!CONST_DOUBLE_P (x
))
7099 /* TODO: We could handle moving 0.0 to a TFmode register,
7100 but first we would like to refactor the movtf_aarch64
7101 to be more amicable to split moves properly and
7102 correctly gate on TARGET_SIMD. For now - reject all
7103 constants which are not to SFmode or DFmode registers. */
7104 if (!(mode
== SFmode
|| mode
== DFmode
))
7107 if (aarch64_float_const_zero_rtx_p (x
))
7109 return aarch64_float_const_representable_p (x
);
7113 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
7115 /* Do not allow vector struct mode constants. We could support
7116 0 and -1 easily, but they need support in aarch64-simd.md. */
7117 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
7120 /* This could probably go away because
7121 we now decompose CONST_INTs according to expand_mov_immediate. */
7122 if ((GET_CODE (x
) == CONST_VECTOR
7123 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
7124 || CONST_INT_P (x
) || aarch64_valid_floating_const (mode
, x
))
7125 return !targetm
.cannot_force_const_mem (mode
, x
);
7127 if (GET_CODE (x
) == HIGH
7128 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
7131 return aarch64_constant_address_p (x
);
7135 aarch64_load_tp (rtx target
)
7138 || GET_MODE (target
) != Pmode
7139 || !register_operand (target
, Pmode
))
7140 target
= gen_reg_rtx (Pmode
);
7142 /* Can return in any reg. */
7143 emit_insn (gen_aarch64_load_tp_hard (target
));
7147 /* On AAPCS systems, this is the "struct __va_list". */
7148 static GTY(()) tree va_list_type
;
7150 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7151 Return the type to use as __builtin_va_list.
7153 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7165 aarch64_build_builtin_va_list (void)
7168 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7170 /* Create the type. */
7171 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
7172 /* Give it the required name. */
7173 va_list_name
= build_decl (BUILTINS_LOCATION
,
7175 get_identifier ("__va_list"),
7177 DECL_ARTIFICIAL (va_list_name
) = 1;
7178 TYPE_NAME (va_list_type
) = va_list_name
;
7179 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
7181 /* Create the fields. */
7182 f_stack
= build_decl (BUILTINS_LOCATION
,
7183 FIELD_DECL
, get_identifier ("__stack"),
7185 f_grtop
= build_decl (BUILTINS_LOCATION
,
7186 FIELD_DECL
, get_identifier ("__gr_top"),
7188 f_vrtop
= build_decl (BUILTINS_LOCATION
,
7189 FIELD_DECL
, get_identifier ("__vr_top"),
7191 f_groff
= build_decl (BUILTINS_LOCATION
,
7192 FIELD_DECL
, get_identifier ("__gr_offs"),
7194 f_vroff
= build_decl (BUILTINS_LOCATION
,
7195 FIELD_DECL
, get_identifier ("__vr_offs"),
7198 DECL_ARTIFICIAL (f_stack
) = 1;
7199 DECL_ARTIFICIAL (f_grtop
) = 1;
7200 DECL_ARTIFICIAL (f_vrtop
) = 1;
7201 DECL_ARTIFICIAL (f_groff
) = 1;
7202 DECL_ARTIFICIAL (f_vroff
) = 1;
7204 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
7205 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
7206 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
7207 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
7208 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
7210 TYPE_FIELDS (va_list_type
) = f_stack
;
7211 DECL_CHAIN (f_stack
) = f_grtop
;
7212 DECL_CHAIN (f_grtop
) = f_vrtop
;
7213 DECL_CHAIN (f_vrtop
) = f_groff
;
7214 DECL_CHAIN (f_groff
) = f_vroff
;
7216 /* Compute its layout. */
7217 layout_type (va_list_type
);
7219 return va_list_type
;
7222 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7224 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
7226 const CUMULATIVE_ARGS
*cum
;
7227 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7228 tree stack
, grtop
, vrtop
, groff
, vroff
;
7230 int gr_save_area_size
;
7231 int vr_save_area_size
;
7234 cum
= &crtl
->args
.info
;
7236 = (NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
;
7238 = (NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
) * UNITS_PER_VREG
;
7240 if (TARGET_GENERAL_REGS_ONLY
)
7242 if (cum
->aapcs_nvrn
> 0)
7243 sorry ("%qs and floating point or vector arguments",
7244 "-mgeneral-regs-only");
7245 vr_save_area_size
= 0;
7248 f_stack
= TYPE_FIELDS (va_list_type_node
);
7249 f_grtop
= DECL_CHAIN (f_stack
);
7250 f_vrtop
= DECL_CHAIN (f_grtop
);
7251 f_groff
= DECL_CHAIN (f_vrtop
);
7252 f_vroff
= DECL_CHAIN (f_groff
);
7254 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
7256 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
7258 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
7260 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
7262 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
7265 /* Emit code to initialize STACK, which points to the next varargs stack
7266 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7267 by named arguments. STACK is 8-byte aligned. */
7268 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
7269 if (cum
->aapcs_stack_size
> 0)
7270 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
7271 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
7272 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7274 /* Emit code to initialize GRTOP, the top of the GR save area.
7275 virtual_incoming_args_rtx should have been 16 byte aligned. */
7276 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
7277 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
7278 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7280 /* Emit code to initialize VRTOP, the top of the VR save area.
7281 This address is gr_save_area_bytes below GRTOP, rounded
7282 down to the next 16-byte boundary. */
7283 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
7284 vr_offset
= AARCH64_ROUND_UP (gr_save_area_size
,
7285 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7288 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
7289 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
7290 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7292 /* Emit code to initialize GROFF, the offset from GRTOP of the
7293 next GPR argument. */
7294 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
7295 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
7296 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7298 /* Likewise emit code to initialize VROFF, the offset from FTOP
7299 of the next VR argument. */
7300 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
7301 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
7302 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7305 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7308 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
7309 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
7313 bool is_ha
; /* is HFA or HVA. */
7314 bool dw_align
; /* double-word align. */
7315 machine_mode ag_mode
= VOIDmode
;
7319 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7320 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
7321 HOST_WIDE_INT size
, rsize
, adjust
, align
;
7322 tree t
, u
, cond1
, cond2
;
7324 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
7326 type
= build_pointer_type (type
);
7328 mode
= TYPE_MODE (type
);
7330 f_stack
= TYPE_FIELDS (va_list_type_node
);
7331 f_grtop
= DECL_CHAIN (f_stack
);
7332 f_vrtop
= DECL_CHAIN (f_grtop
);
7333 f_groff
= DECL_CHAIN (f_vrtop
);
7334 f_vroff
= DECL_CHAIN (f_groff
);
7336 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
7337 f_stack
, NULL_TREE
);
7338 size
= int_size_in_bytes (type
);
7339 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
7343 if (aarch64_vfp_is_call_or_return_candidate (mode
,
7349 /* TYPE passed in fp/simd registers. */
7350 if (TARGET_GENERAL_REGS_ONLY
)
7351 sorry ("%qs and floating point or vector arguments",
7352 "-mgeneral-regs-only");
7354 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
7355 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
7356 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
7357 unshare_expr (valist
), f_vroff
, NULL_TREE
);
7359 rsize
= nregs
* UNITS_PER_VREG
;
7363 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
7364 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
7366 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7367 && size
< UNITS_PER_VREG
)
7369 adjust
= UNITS_PER_VREG
- size
;
7374 /* TYPE passed in general registers. */
7375 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
7376 unshare_expr (valist
), f_grtop
, NULL_TREE
);
7377 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
7378 unshare_expr (valist
), f_groff
, NULL_TREE
);
7379 rsize
= (size
+ UNITS_PER_WORD
- 1) & -UNITS_PER_WORD
;
7380 nregs
= rsize
/ UNITS_PER_WORD
;
7385 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7386 && size
< UNITS_PER_WORD
)
7388 adjust
= UNITS_PER_WORD
- size
;
7392 /* Get a local temporary for the field value. */
7393 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
7395 /* Emit code to branch if off >= 0. */
7396 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
7397 build_int_cst (TREE_TYPE (off
), 0));
7398 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
7402 /* Emit: offs = (offs + 15) & -16. */
7403 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
7404 build_int_cst (TREE_TYPE (off
), 15));
7405 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
7406 build_int_cst (TREE_TYPE (off
), -16));
7407 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
7412 /* Update ap.__[g|v]r_offs */
7413 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
7414 build_int_cst (TREE_TYPE (off
), rsize
));
7415 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
7419 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
7421 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7422 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
7423 build_int_cst (TREE_TYPE (f_off
), 0));
7424 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
7426 /* String up: make sure the assignment happens before the use. */
7427 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
7428 COND_EXPR_ELSE (cond1
) = t
;
7430 /* Prepare the trees handling the argument that is passed on the stack;
7431 the top level node will store in ON_STACK. */
7432 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
7435 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7436 t
= fold_convert (intDI_type_node
, arg
);
7437 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
7438 build_int_cst (TREE_TYPE (t
), 15));
7439 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
7440 build_int_cst (TREE_TYPE (t
), -16));
7441 t
= fold_convert (TREE_TYPE (arg
), t
);
7442 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
7446 /* Advance ap.__stack */
7447 t
= fold_convert (intDI_type_node
, arg
);
7448 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
7449 build_int_cst (TREE_TYPE (t
), size
+ 7));
7450 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
7451 build_int_cst (TREE_TYPE (t
), -8));
7452 t
= fold_convert (TREE_TYPE (arg
), t
);
7453 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
7454 /* String up roundup and advance. */
7456 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
7457 /* String up with arg */
7458 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
7459 /* Big-endianness related address adjustment. */
7460 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7461 && size
< UNITS_PER_WORD
)
7463 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
7464 size_int (UNITS_PER_WORD
- size
));
7465 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
7468 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
7469 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
7471 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7474 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
7475 build_int_cst (TREE_TYPE (off
), adjust
));
7477 t
= fold_convert (sizetype
, t
);
7478 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
7482 /* type ha; // treat as "struct {ftype field[n];}"
7483 ... [computing offs]
7484 for (i = 0; i <nregs; ++i, offs += 16)
7485 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7488 tree tmp_ha
, field_t
, field_ptr_t
;
7490 /* Declare a local variable. */
7491 tmp_ha
= create_tmp_var_raw (type
, "ha");
7492 gimple_add_tmp_var (tmp_ha
);
7494 /* Establish the base type. */
7498 field_t
= float_type_node
;
7499 field_ptr_t
= float_ptr_type_node
;
7502 field_t
= double_type_node
;
7503 field_ptr_t
= double_ptr_type_node
;
7506 field_t
= long_double_type_node
;
7507 field_ptr_t
= long_double_ptr_type_node
;
7509 /* The half precision and quad precision are not fully supported yet. Enable
7510 the following code after the support is complete. Need to find the correct
7511 type node for __fp16 *. */
7514 field_t
= float_type_node
;
7515 field_ptr_t
= float_ptr_type_node
;
7521 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
7522 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
7523 field_ptr_t
= build_pointer_type (field_t
);
7530 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7531 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
7533 t
= fold_convert (field_ptr_t
, addr
);
7534 t
= build2 (MODIFY_EXPR
, field_t
,
7535 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
7536 build1 (INDIRECT_REF
, field_t
, t
));
7538 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7539 for (i
= 1; i
< nregs
; ++i
)
7541 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
7542 u
= fold_convert (field_ptr_t
, addr
);
7543 u
= build2 (MODIFY_EXPR
, field_t
,
7544 build2 (MEM_REF
, field_t
, tmp_ha
,
7545 build_int_cst (field_ptr_t
,
7547 int_size_in_bytes (field_t
)))),
7548 build1 (INDIRECT_REF
, field_t
, u
));
7549 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
7552 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
7553 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
7556 COND_EXPR_ELSE (cond2
) = t
;
7557 addr
= fold_convert (build_pointer_type (type
), cond1
);
7558 addr
= build_va_arg_indirect_ref (addr
);
7561 addr
= build_va_arg_indirect_ref (addr
);
7566 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7569 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
7570 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
7573 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
7574 CUMULATIVE_ARGS local_cum
;
7575 int gr_saved
, vr_saved
;
7577 /* The caller has advanced CUM up to, but not beyond, the last named
7578 argument. Advance a local copy of CUM past the last "real" named
7579 argument, to find out how many registers are left over. */
7581 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
7583 /* Found out how many registers we need to save. */
7584 gr_saved
= NUM_ARG_REGS
- local_cum
.aapcs_ncrn
;
7585 vr_saved
= NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
;
7587 if (TARGET_GENERAL_REGS_ONLY
)
7589 if (local_cum
.aapcs_nvrn
> 0)
7590 sorry ("%qs and floating point or vector arguments",
7591 "-mgeneral-regs-only");
7601 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7602 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
7603 - gr_saved
* UNITS_PER_WORD
);
7604 mem
= gen_frame_mem (BLKmode
, ptr
);
7605 set_mem_alias_set (mem
, get_varargs_alias_set ());
7607 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
7612 /* We can't use move_block_from_reg, because it will use
7613 the wrong mode, storing D regs only. */
7614 machine_mode mode
= TImode
;
7617 /* Set OFF to the offset from virtual_incoming_args_rtx of
7618 the first vector register. The VR save area lies below
7619 the GR one, and is aligned to 16 bytes. */
7620 off
= -AARCH64_ROUND_UP (gr_saved
* UNITS_PER_WORD
,
7621 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7622 off
-= vr_saved
* UNITS_PER_VREG
;
7624 for (i
= local_cum
.aapcs_nvrn
; i
< NUM_FP_ARG_REGS
; ++i
)
7628 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
7629 mem
= gen_frame_mem (mode
, ptr
);
7630 set_mem_alias_set (mem
, get_varargs_alias_set ());
7631 aarch64_emit_move (mem
, gen_rtx_REG (mode
, V0_REGNUM
+ i
));
7632 off
+= UNITS_PER_VREG
;
7637 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7638 any complication of having crtl->args.pretend_args_size changed. */
7639 cfun
->machine
->frame
.saved_varargs_size
7640 = (AARCH64_ROUND_UP (gr_saved
* UNITS_PER_WORD
,
7641 STACK_BOUNDARY
/ BITS_PER_UNIT
)
7642 + vr_saved
* UNITS_PER_VREG
);
7646 aarch64_conditional_register_usage (void)
7651 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
7654 call_used_regs
[i
] = 1;
7659 /* Walk down the type tree of TYPE counting consecutive base elements.
7660 If *MODEP is VOIDmode, then set it to the first valid floating point
7661 type. If a non-floating point type is found, or if a floating point
7662 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7663 otherwise return the count in the sub-tree. */
7665 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
7670 switch (TREE_CODE (type
))
7673 mode
= TYPE_MODE (type
);
7674 if (mode
!= DFmode
&& mode
!= SFmode
&& mode
!= TFmode
)
7677 if (*modep
== VOIDmode
)
7686 mode
= TYPE_MODE (TREE_TYPE (type
));
7687 if (mode
!= DFmode
&& mode
!= SFmode
&& mode
!= TFmode
)
7690 if (*modep
== VOIDmode
)
7699 /* Use V2SImode and V4SImode as representatives of all 64-bit
7700 and 128-bit vector types. */
7701 size
= int_size_in_bytes (type
);
7714 if (*modep
== VOIDmode
)
7717 /* Vector modes are considered to be opaque: two vectors are
7718 equivalent for the purposes of being homogeneous aggregates
7719 if they are the same size. */
7728 tree index
= TYPE_DOMAIN (type
);
7730 /* Can't handle incomplete types nor sizes that are not
7732 if (!COMPLETE_TYPE_P (type
)
7733 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7736 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
7739 || !TYPE_MAX_VALUE (index
)
7740 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
7741 || !TYPE_MIN_VALUE (index
)
7742 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
7746 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
7747 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
7749 /* There must be no padding. */
7750 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7762 /* Can't handle incomplete types nor sizes that are not
7764 if (!COMPLETE_TYPE_P (type
)
7765 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7768 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
7770 if (TREE_CODE (field
) != FIELD_DECL
)
7773 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
7779 /* There must be no padding. */
7780 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7787 case QUAL_UNION_TYPE
:
7789 /* These aren't very interesting except in a degenerate case. */
7794 /* Can't handle incomplete types nor sizes that are not
7796 if (!COMPLETE_TYPE_P (type
)
7797 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7800 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
7802 if (TREE_CODE (field
) != FIELD_DECL
)
7805 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
7808 count
= count
> sub_count
? count
: sub_count
;
7811 /* There must be no padding. */
7812 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7825 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7826 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7827 array types. The C99 floating-point complex types are also considered
7828 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7829 types, which are GCC extensions and out of the scope of AAPCS64, are
7830 treated as composite types here as well.
7832 Note that MODE itself is not sufficient in determining whether a type
7833 is such a composite type or not. This is because
7834 stor-layout.c:compute_record_mode may have already changed the MODE
7835 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7836 structure with only one field may have its MODE set to the mode of the
7837 field. Also an integer mode whose size matches the size of the
7838 RECORD_TYPE type may be used to substitute the original mode
7839 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7840 solely relied on. */
7843 aarch64_composite_type_p (const_tree type
,
7846 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
7850 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
7851 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
7857 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7858 type as described in AAPCS64 \S 4.1.2.
7860 See the comment above aarch64_composite_type_p for the notes on MODE. */
7863 aarch64_short_vector_p (const_tree type
,
7866 HOST_WIDE_INT size
= -1;
7868 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
7869 size
= int_size_in_bytes (type
);
7870 else if (!aarch64_composite_type_p (type
, mode
)
7871 && (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
7872 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
))
7873 size
= GET_MODE_SIZE (mode
);
7875 return (size
== 8 || size
== 16) ? true : false;
7878 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7879 shall be passed or returned in simd/fp register(s) (providing these
7880 parameter passing registers are available).
7882 Upon successful return, *COUNT returns the number of needed registers,
7883 *BASE_MODE returns the mode of the individual register and when IS_HAF
7884 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7885 floating-point aggregate or a homogeneous short-vector aggregate. */
7888 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
7890 machine_mode
*base_mode
,
7894 machine_mode new_mode
= VOIDmode
;
7895 bool composite_p
= aarch64_composite_type_p (type
, mode
);
7897 if (is_ha
!= NULL
) *is_ha
= false;
7899 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7900 || aarch64_short_vector_p (type
, mode
))
7905 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
7907 if (is_ha
!= NULL
) *is_ha
= true;
7909 new_mode
= GET_MODE_INNER (mode
);
7911 else if (type
&& composite_p
)
7913 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
7915 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
7917 if (is_ha
!= NULL
) *is_ha
= true;
7926 *base_mode
= new_mode
;
7930 /* Implement TARGET_STRUCT_VALUE_RTX. */
7933 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
7934 int incoming ATTRIBUTE_UNUSED
)
7936 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
7939 /* Implements target hook vector_mode_supported_p. */
7941 aarch64_vector_mode_supported_p (machine_mode mode
)
7944 && (mode
== V4SImode
|| mode
== V8HImode
7945 || mode
== V16QImode
|| mode
== V2DImode
7946 || mode
== V2SImode
|| mode
== V4HImode
7947 || mode
== V8QImode
|| mode
== V2SFmode
7948 || mode
== V4SFmode
|| mode
== V2DFmode
7949 || mode
== V1DFmode
))
7955 /* Return appropriate SIMD container
7956 for MODE within a vector of WIDTH bits. */
7958 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
7960 gcc_assert (width
== 64 || width
== 128);
7999 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8001 aarch64_preferred_simd_mode (machine_mode mode
)
8003 return aarch64_simd_container_mode (mode
, 128);
/* Return the bitmask of possible vector sizes for the vectorizer
   to iterate over: both 16-byte and 8-byte vectors.  */
static unsigned int
aarch64_autovectorize_vector_sizes (void)
{
  return (16 | 8);
}
8014 /* Implement TARGET_MANGLE_TYPE. */
8017 aarch64_mangle_type (const_tree type
)
8019 /* The AArch64 ABI documents say that "__va_list" has to be
8020 managled as if it is in the "std" namespace. */
8021 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
8022 return "St9__va_list";
8024 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8026 if (TYPE_NAME (type
) != NULL
)
8027 return aarch64_mangle_builtin_type (type
);
8029 /* Use the default mangling. */
8034 /* Return true if the rtx_insn contains a MEM RTX somewhere
8038 has_memory_op (rtx_insn
*mem_insn
)
8040 subrtx_iterator::array_type array
;
8041 FOR_EACH_SUBRTX (iter
, array
, PATTERN (mem_insn
), ALL
)
8048 /* Find the first rtx_insn before insn that will generate an assembly
8052 aarch64_prev_real_insn (rtx_insn
*insn
)
8059 insn
= prev_real_insn (insn
);
8061 while (insn
&& recog_memoized (insn
) < 0);
8067 is_madd_op (enum attr_type t1
)
8070 /* A number of these may be AArch32 only. */
8071 enum attr_type mlatypes
[] = {
8072 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
8073 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
8074 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
8077 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
8079 if (t1
== mlatypes
[i
])
8086 /* Check if there is a register dependency between a load and the insn
8087 for which we hold recog_data. */
8090 dep_between_memop_and_curr (rtx memop
)
8095 gcc_assert (GET_CODE (memop
) == SET
);
8097 if (!REG_P (SET_DEST (memop
)))
8100 load_reg
= SET_DEST (memop
);
8101 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
8103 rtx operand
= recog_data
.operand
[opno
];
8105 && reg_overlap_mentioned_p (load_reg
, operand
))
8113 /* When working around the Cortex-A53 erratum 835769,
8114 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8115 instruction and has a preceding memory instruction such that a NOP
8116 should be inserted between them. */
8119 aarch64_madd_needs_nop (rtx_insn
* insn
)
8121 enum attr_type attr_type
;
8125 if (!aarch64_fix_a53_err835769
)
8128 if (recog_memoized (insn
) < 0)
8131 attr_type
= get_attr_type (insn
);
8132 if (!is_madd_op (attr_type
))
8135 prev
= aarch64_prev_real_insn (insn
);
8136 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8137 Restore recog state to INSN to avoid state corruption. */
8138 extract_constrain_insn_cached (insn
);
8140 if (!prev
|| !has_memory_op (prev
))
8143 body
= single_set (prev
);
8145 /* If the previous insn is a memory op and there is no dependency between
8146 it and the DImode madd, emit a NOP between them. If body is NULL then we
8147 have a complex memory operation, probably a load/store pair.
8148 Be conservative for now and emit a NOP. */
8149 if (GET_MODE (recog_data
.operand
[0]) == DImode
8150 && (!body
|| !dep_between_memop_and_curr (body
)))
8158 /* Implement FINAL_PRESCAN_INSN. */
8161 aarch64_final_prescan_insn (rtx_insn
*insn
)
8163 if (aarch64_madd_needs_nop (insn
))
8164 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
8168 /* Return the equivalent letter for size. */
8170 sizetochar (int size
)
8174 case 64: return 'd';
8175 case 32: return 's';
8176 case 16: return 'h';
8177 case 8 : return 'b';
8178 default: gcc_unreachable ();
8182 /* Return true iff x is a uniform vector of floating-point
8183 constants, and the constant can be represented in
8184 quarter-precision form. Note, as aarch64_float_const_representable
8185 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8187 aarch64_vect_float_const_representable_p (rtx x
)
8190 REAL_VALUE_TYPE r0
, ri
;
8193 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
8196 x0
= CONST_VECTOR_ELT (x
, 0);
8197 if (!CONST_DOUBLE_P (x0
))
8200 REAL_VALUE_FROM_CONST_DOUBLE (r0
, x0
);
8202 for (i
= 1; i
< CONST_VECTOR_NUNITS (x
); i
++)
8204 xi
= CONST_VECTOR_ELT (x
, i
);
8205 if (!CONST_DOUBLE_P (xi
))
8208 REAL_VALUE_FROM_CONST_DOUBLE (ri
, xi
);
8209 if (!REAL_VALUES_EQUAL (r0
, ri
))
8213 return aarch64_float_const_representable_p (x0
);
8216 /* Return true for valid and false for invalid. */
8218 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
8219 struct simd_immediate_info
*info
)
8221 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8223 for (i = 0; i < idx; i += (STRIDE)) \
8228 immtype = (CLASS); \
8229 elsize = (ELSIZE); \
8235 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
8236 unsigned int innersize
= GET_MODE_SIZE (GET_MODE_INNER (mode
));
8237 unsigned char bytes
[16];
8238 int immtype
= -1, matches
;
8239 unsigned int invmask
= inverse
? 0xff : 0;
8242 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
8244 if (! (aarch64_simd_imm_zero_p (op
, mode
)
8245 || aarch64_vect_float_const_representable_p (op
)))
8250 info
->value
= CONST_VECTOR_ELT (op
, 0);
8251 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
8259 /* Splat vector constant out into a byte vector. */
8260 for (i
= 0; i
< n_elts
; i
++)
8262 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8263 it must be laid out in the vector register in reverse order. */
8264 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
8265 unsigned HOST_WIDE_INT elpart
;
8266 unsigned int part
, parts
;
8268 if (CONST_INT_P (el
))
8270 elpart
= INTVAL (el
);
8273 else if (GET_CODE (el
) == CONST_DOUBLE
)
8275 elpart
= CONST_DOUBLE_LOW (el
);
8281 for (part
= 0; part
< parts
; part
++)
8284 for (byte
= 0; byte
< innersize
; byte
++)
8286 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
8287 elpart
>>= BITS_PER_UNIT
;
8289 if (GET_CODE (el
) == CONST_DOUBLE
)
8290 elpart
= CONST_DOUBLE_HIGH (el
);
8295 gcc_assert (idx
== GET_MODE_SIZE (mode
));
8299 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
8300 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
8302 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
8303 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
8305 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8306 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
8308 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8309 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
8311 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
8313 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
8315 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
8316 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
8318 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
8319 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
8321 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8322 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
8324 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8325 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
8327 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
8329 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
8331 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
8332 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
8334 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
8335 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
8337 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8338 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
8340 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8341 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
8343 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
8345 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
8346 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
8355 info
->element_width
= elsize
;
8356 info
->mvn
= emvn
!= 0;
8357 info
->shift
= eshift
;
8359 unsigned HOST_WIDE_INT imm
= 0;
8361 if (immtype
>= 12 && immtype
<= 15)
8364 /* Un-invert bytes of recognized vector, if necessary. */
8366 for (i
= 0; i
< idx
; i
++)
8367 bytes
[i
] ^= invmask
;
8371 /* FIXME: Broken on 32-bit H_W_I hosts. */
8372 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
8374 for (i
= 0; i
< 8; i
++)
8375 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
8376 << (i
* BITS_PER_UNIT
);
8379 info
->value
= GEN_INT (imm
);
8383 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
8384 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
8386 /* Construct 'abcdefgh' because the assembler cannot handle
8387 generic constants. */
8390 imm
= (imm
>> info
->shift
) & 0xff;
8391 info
->value
= GEN_INT (imm
);
8399 /* Check of immediate shift constants are within range. */
8401 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
8403 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
8405 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
8407 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
8410 /* Return true if X is a uniform vector where all elements
8411 are either the floating-point constant 0.0 or the
8412 integer constant 0. */
8414 aarch64_simd_imm_zero_p (rtx x
, machine_mode mode
)
8416 return x
== CONST0_RTX (mode
);
8420 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
8422 HOST_WIDE_INT imm
= INTVAL (x
);
8425 for (i
= 0; i
< 8; i
++)
8427 unsigned int byte
= imm
& 0xff;
8428 if (byte
!= 0xff && byte
!= 0)
8437 aarch64_mov_operand_p (rtx x
,
8438 enum aarch64_symbol_context context
,
8441 if (GET_CODE (x
) == HIGH
8442 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
8445 if (CONST_INT_P (x
))
8448 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
8451 return aarch64_classify_symbolic_expression (x
, context
)
8452 == SYMBOL_TINY_ABSOLUTE
;
8455 /* Return a const_int vector of VAL. */
8457 aarch64_simd_gen_const_vector_dup (machine_mode mode
, int val
)
8459 int nunits
= GET_MODE_NUNITS (mode
);
8460 rtvec v
= rtvec_alloc (nunits
);
8463 for (i
=0; i
< nunits
; i
++)
8464 RTVEC_ELT (v
, i
) = GEN_INT (val
);
8466 return gen_rtx_CONST_VECTOR (mode
, v
);
8469 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8472 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, machine_mode mode
)
8476 gcc_assert (!VECTOR_MODE_P (mode
));
8477 vmode
= aarch64_preferred_simd_mode (mode
);
8478 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
8479 return aarch64_simd_valid_immediate (op_v
, vmode
, false, NULL
);
8482 /* Construct and return a PARALLEL RTX vector with elements numbering the
8483 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8484 the vector - from the perspective of the architecture. This does not
8485 line up with GCC's perspective on lane numbers, so we end up with
8486 different masks depending on our target endian-ness. The diagram
8487 below may help. We must draw the distinction when building masks
8488 which select one half of the vector. An instruction selecting
8489 architectural low-lanes for a big-endian target, must be described using
8490 a mask selecting GCC high-lanes.
8492 Big-Endian Little-Endian
8495 | x | x | x | x | | x | x | x | x |
8496 Architecture 3 2 1 0 3 2 1 0
8498 Low Mask: { 2, 3 } { 0, 1 }
8499 High Mask: { 0, 1 } { 2, 3 }
8503 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
8505 int nunits
= GET_MODE_NUNITS (mode
);
8506 rtvec v
= rtvec_alloc (nunits
/ 2);
8507 int high_base
= nunits
/ 2;
8513 if (BYTES_BIG_ENDIAN
)
8514 base
= high
? low_base
: high_base
;
8516 base
= high
? high_base
: low_base
;
8518 for (i
= 0; i
< nunits
/ 2; i
++)
8519 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
8521 t1
= gen_rtx_PARALLEL (mode
, v
);
8525 /* Check OP for validity as a PARALLEL RTX vector with elements
8526 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8527 from the perspective of the architecture. See the diagram above
8528 aarch64_simd_vect_par_cnst_half for more details. */
8531 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
8534 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
8535 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
8536 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
8539 if (!VECTOR_MODE_P (mode
))
8542 if (count_op
!= count_ideal
)
8545 for (i
= 0; i
< count_ideal
; i
++)
8547 rtx elt_op
= XVECEXP (op
, 0, i
);
8548 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
8550 if (!CONST_INT_P (elt_op
)
8551 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
8557 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8558 HIGH (exclusive). */
8560 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
8564 gcc_assert (CONST_INT_P (operand
));
8565 lane
= INTVAL (operand
);
8567 if (lane
< low
|| lane
>= high
)
8570 error ("%Klane %ld out of range %ld - %ld", exp
, lane
, low
, high
- 1);
8572 error ("lane %ld out of range %ld - %ld", lane
, low
, high
- 1);
8576 /* Return TRUE if OP is a valid vector addressing mode. */
8578 aarch64_simd_mem_operand_p (rtx op
)
8580 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
8581 || REG_P (XEXP (op
, 0)));
8584 /* Emit a register copy from operand to operand, taking care not to
8585 early-clobber source registers in the process.
8587 COUNT is the number of components into which the copy needs to be
8590 aarch64_simd_emit_reg_reg_move (rtx
*operands
, enum machine_mode mode
,
8594 int rdest
= REGNO (operands
[0]);
8595 int rsrc
= REGNO (operands
[1]);
8597 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
8599 for (i
= 0; i
< count
; i
++)
8600 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
8601 gen_rtx_REG (mode
, rsrc
+ i
));
8603 for (i
= 0; i
< count
; i
++)
8604 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
8605 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
8608 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8609 one of VSTRUCT modes: OI, CI or XI. */
8611 aarch64_simd_attr_length_move (rtx_insn
*insn
)
8615 extract_insn_cached (insn
);
8617 if (REG_P (recog_data
.operand
[0]) && REG_P (recog_data
.operand
[1]))
8619 mode
= GET_MODE (recog_data
.operand
[0]);
8635 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8636 one of VSTRUCT modes: OI, CI, EI, or XI. */
8638 aarch64_simd_attr_length_rglist (enum machine_mode mode
)
8640 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
8643 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8644 alignment of a vector to 128 bits. */
8645 static HOST_WIDE_INT
8646 aarch64_simd_vector_alignment (const_tree type
)
8648 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
8649 return MIN (align
, 128);
8652 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8654 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
8659 /* We guarantee alignment for vectors up to 128-bits. */
8660 if (tree_int_cst_compare (TYPE_SIZE (type
),
8661 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
8664 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8668 /* If VALS is a vector constant that can be loaded into a register
8669 using DUP, generate instructions to do so and return an RTX to
8670 assign to the register. Otherwise return NULL_RTX. */
8672 aarch64_simd_dup_constant (rtx vals
)
8674 machine_mode mode
= GET_MODE (vals
);
8675 machine_mode inner_mode
= GET_MODE_INNER (mode
);
8676 int n_elts
= GET_MODE_NUNITS (mode
);
8677 bool all_same
= true;
8681 if (GET_CODE (vals
) != CONST_VECTOR
)
8684 for (i
= 1; i
< n_elts
; ++i
)
8686 x
= CONST_VECTOR_ELT (vals
, i
);
8687 if (!rtx_equal_p (x
, CONST_VECTOR_ELT (vals
, 0)))
8694 /* We can load this constant by using DUP and a constant in a
8695 single ARM register. This will be cheaper than a vector
8697 x
= copy_to_mode_reg (inner_mode
, CONST_VECTOR_ELT (vals
, 0));
8698 return gen_rtx_VEC_DUPLICATE (mode
, x
);
8702 /* Generate code to load VALS, which is a PARALLEL containing only
8703 constants (for vec_init) or CONST_VECTOR, efficiently into a
8704 register. Returns an RTX to copy into the register, or NULL_RTX
8705 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8707 aarch64_simd_make_constant (rtx vals
)
8709 machine_mode mode
= GET_MODE (vals
);
8711 rtx const_vec
= NULL_RTX
;
8712 int n_elts
= GET_MODE_NUNITS (mode
);
8716 if (GET_CODE (vals
) == CONST_VECTOR
)
8718 else if (GET_CODE (vals
) == PARALLEL
)
8720 /* A CONST_VECTOR must contain only CONST_INTs and
8721 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8722 Only store valid constants in a CONST_VECTOR. */
8723 for (i
= 0; i
< n_elts
; ++i
)
8725 rtx x
= XVECEXP (vals
, 0, i
);
8726 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8729 if (n_const
== n_elts
)
8730 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
8735 if (const_vec
!= NULL_RTX
8736 && aarch64_simd_valid_immediate (const_vec
, mode
, false, NULL
))
8737 /* Load using MOVI/MVNI. */
8739 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
8740 /* Loaded using DUP. */
8742 else if (const_vec
!= NULL_RTX
)
8743 /* Load from constant pool. We can not take advantage of single-cycle
8744 LD1 because we need a PC-relative addressing mode. */
8747 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8748 We can not construct an initializer. */
8753 aarch64_expand_vector_init (rtx target
, rtx vals
)
8755 machine_mode mode
= GET_MODE (target
);
8756 machine_mode inner_mode
= GET_MODE_INNER (mode
);
8757 int n_elts
= GET_MODE_NUNITS (mode
);
8759 rtx any_const
= NULL_RTX
;
8760 bool all_same
= true;
8762 for (int i
= 0; i
< n_elts
; ++i
)
8764 rtx x
= XVECEXP (vals
, 0, i
);
8765 if (!CONST_INT_P (x
) && !CONST_DOUBLE_P (x
))
8770 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
8776 rtx constant
= aarch64_simd_make_constant (vals
);
8777 if (constant
!= NULL_RTX
)
8779 emit_move_insn (target
, constant
);
8784 /* Splat a single non-constant element if we can. */
8787 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, 0));
8788 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
8792 /* Half the fields (or less) are non-constant. Load constant then overwrite
8793 varying fields. Hope that this is more efficient than using the stack. */
8794 if (n_var
<= n_elts
/2)
8796 rtx copy
= copy_rtx (vals
);
8798 /* Load constant part of vector. We really don't care what goes into the
8799 parts we will overwrite, but we're more likely to be able to load the
8800 constant efficiently if it has fewer, larger, repeating parts
8801 (see aarch64_simd_valid_immediate). */
8802 for (int i
= 0; i
< n_elts
; i
++)
8804 rtx x
= XVECEXP (vals
, 0, i
);
8805 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8807 rtx subst
= any_const
;
8808 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
8810 /* Look in the copied vector, as more elements are const. */
8811 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
8812 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
8818 XVECEXP (copy
, 0, i
) = subst
;
8820 aarch64_expand_vector_init (target
, copy
);
8822 /* Insert variables. */
8823 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
8824 gcc_assert (icode
!= CODE_FOR_nothing
);
8826 for (int i
= 0; i
< n_elts
; i
++)
8828 rtx x
= XVECEXP (vals
, 0, i
);
8829 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8831 x
= copy_to_mode_reg (inner_mode
, x
);
8832 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
8837 /* Construct the vector in memory one field at a time
8838 and load the whole vector. */
8839 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
8840 for (int i
= 0; i
< n_elts
; i
++)
8841 emit_move_insn (adjust_address_nv (mem
, inner_mode
,
8842 i
* GET_MODE_SIZE (inner_mode
)),
8843 XVECEXP (vals
, 0, i
));
8844 emit_move_insn (target
, mem
);
8848 static unsigned HOST_WIDE_INT
8849 aarch64_shift_truncation_mask (machine_mode mode
)
8852 (aarch64_vector_mode_supported_p (mode
)
8853 || aarch64_vect_struct_mode_p (mode
)) ? 0 : (GET_MODE_BITSIZE (mode
) - 1);
8856 #ifndef TLS_SECTION_ASM_FLAG
8857 #define TLS_SECTION_ASM_FLAG 'T'
8861 aarch64_elf_asm_named_section (const char *name
, unsigned int flags
,
8862 tree decl ATTRIBUTE_UNUSED
)
8864 char flagchars
[10], *f
= flagchars
;
8866 /* If we have already declared this section, we can use an
8867 abbreviated form to switch back to it -- unless this section is
8868 part of a COMDAT groups, in which case GAS requires the full
8869 declaration every time. */
8870 if (!(HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8871 && (flags
& SECTION_DECLARED
))
8873 fprintf (asm_out_file
, "\t.section\t%s\n", name
);
8877 if (!(flags
& SECTION_DEBUG
))
8879 if (flags
& SECTION_WRITE
)
8881 if (flags
& SECTION_CODE
)
8883 if (flags
& SECTION_SMALL
)
8885 if (flags
& SECTION_MERGE
)
8887 if (flags
& SECTION_STRINGS
)
8889 if (flags
& SECTION_TLS
)
8890 *f
++ = TLS_SECTION_ASM_FLAG
;
8891 if (HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8895 fprintf (asm_out_file
, "\t.section\t%s,\"%s\"", name
, flagchars
);
8897 if (!(flags
& SECTION_NOTYPE
))
8902 if (flags
& SECTION_BSS
)
8907 #ifdef TYPE_OPERAND_FMT
8908 format
= "," TYPE_OPERAND_FMT
;
8913 fprintf (asm_out_file
, format
, type
);
8915 if (flags
& SECTION_ENTSIZE
)
8916 fprintf (asm_out_file
, ",%d", flags
& SECTION_ENTSIZE
);
8917 if (HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8919 if (TREE_CODE (decl
) == IDENTIFIER_NODE
)
8920 fprintf (asm_out_file
, ",%s,comdat", IDENTIFIER_POINTER (decl
));
8922 fprintf (asm_out_file
, ",%s,comdat",
8923 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl
)));
8927 putc ('\n', asm_out_file
);
8930 /* Select a format to encode pointers in exception handling data. */
8932 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
8935 switch (aarch64_cmodel
)
8937 case AARCH64_CMODEL_TINY
:
8938 case AARCH64_CMODEL_TINY_PIC
:
8939 case AARCH64_CMODEL_SMALL
:
8940 case AARCH64_CMODEL_SMALL_PIC
:
8941 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8943 type
= DW_EH_PE_sdata4
;
8946 /* No assumptions here. 8-byte relocs required. */
8947 type
= DW_EH_PE_sdata8
;
8950 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
8953 /* Emit load exclusive. */
8956 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
8957 rtx mem
, rtx model_rtx
)
8959 rtx (*gen
) (rtx
, rtx
, rtx
);
8963 case QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
8964 case HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
8965 case SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
8966 case DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
8971 emit_insn (gen (rval
, mem
, model_rtx
));
8974 /* Emit store exclusive. */
8977 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
8978 rtx rval
, rtx mem
, rtx model_rtx
)
8980 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
8984 case QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
8985 case HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
8986 case SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
8987 case DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
8992 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
8995 /* Mark the previous jump instruction as unlikely. */
8998 aarch64_emit_unlikely_jump (rtx insn
)
9000 int very_unlikely
= REG_BR_PROB_BASE
/ 100 - 1;
9002 insn
= emit_jump_insn (insn
);
9003 add_int_reg_note (insn
, REG_BR_PROB
, very_unlikely
);
9006 /* Expand a compare and swap pattern. */
9009 aarch64_expand_compare_and_swap (rtx operands
[])
9011 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
9012 machine_mode mode
, cmp_mode
;
9013 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
9018 oldval
= operands
[3];
9019 newval
= operands
[4];
9020 is_weak
= operands
[5];
9021 mod_s
= operands
[6];
9022 mod_f
= operands
[7];
9023 mode
= GET_MODE (mem
);
9026 /* Normally the succ memory model must be stronger than fail, but in the
9027 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9028 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9030 if (INTVAL (mod_f
) == MEMMODEL_ACQUIRE
9031 && INTVAL (mod_s
) == MEMMODEL_RELEASE
)
9032 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
9038 /* For short modes, we're going to perform the comparison in SImode,
9039 so do the zero-extension now. */
9041 rval
= gen_reg_rtx (SImode
);
9042 oldval
= convert_modes (SImode
, mode
, oldval
, true);
9047 /* Force the value into a register if needed. */
9048 if (!aarch64_plus_operand (oldval
, mode
))
9049 oldval
= force_reg (cmp_mode
, oldval
);
9058 case QImode
: gen
= gen_atomic_compare_and_swapqi_1
; break;
9059 case HImode
: gen
= gen_atomic_compare_and_swaphi_1
; break;
9060 case SImode
: gen
= gen_atomic_compare_and_swapsi_1
; break;
9061 case DImode
: gen
= gen_atomic_compare_and_swapdi_1
; break;
9066 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
9068 if (mode
== QImode
|| mode
== HImode
)
9069 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
9071 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
9072 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
9073 emit_insn (gen_rtx_SET (VOIDmode
, bval
, x
));
9076 /* Split a compare and swap pattern. */
9079 aarch64_split_compare_and_swap (rtx operands
[])
9081 rtx rval
, mem
, oldval
, newval
, scratch
;
9084 rtx_code_label
*label1
, *label2
;
9089 oldval
= operands
[2];
9090 newval
= operands
[3];
9091 is_weak
= (operands
[4] != const0_rtx
);
9092 scratch
= operands
[7];
9093 mode
= GET_MODE (mem
);
9098 label1
= gen_label_rtx ();
9099 emit_label (label1
);
9101 label2
= gen_label_rtx ();
9103 aarch64_emit_load_exclusive (mode
, rval
, mem
, operands
[5]);
9105 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
9106 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
9107 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9108 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
9109 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9111 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, operands
[5]);
9115 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
9116 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9117 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
9118 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9122 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
9123 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
9124 emit_insn (gen_rtx_SET (VOIDmode
, cond
, x
));
9127 emit_label (label2
);
9130 /* Split an atomic operation. */
9133 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
9134 rtx value
, rtx model_rtx
, rtx cond
)
9136 machine_mode mode
= GET_MODE (mem
);
9137 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
9138 rtx_code_label
*label
;
9141 label
= gen_label_rtx ();
9145 new_out
= gen_lowpart (wmode
, new_out
);
9147 old_out
= gen_lowpart (wmode
, old_out
);
9150 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
9152 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
9161 x
= gen_rtx_AND (wmode
, old_out
, value
);
9162 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9163 x
= gen_rtx_NOT (wmode
, new_out
);
9164 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9168 if (CONST_INT_P (value
))
9170 value
= GEN_INT (-INTVAL (value
));
9176 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
9177 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9181 aarch64_emit_store_exclusive (mode
, cond
, mem
,
9182 gen_lowpart (mode
, new_out
), model_rtx
);
9184 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
9185 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9186 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
9187 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9191 aarch64_print_extension (void)
9193 const struct aarch64_option_extension
*opt
= NULL
;
9195 for (opt
= all_extensions
; opt
->name
!= NULL
; opt
++)
9196 if ((aarch64_isa_flags
& opt
->flags_on
) == opt
->flags_on
)
9197 asm_fprintf (asm_out_file
, "+%s", opt
->name
);
9199 asm_fprintf (asm_out_file
, "\n");
9203 aarch64_start_file (void)
9207 asm_fprintf (asm_out_file
, "\t.arch %s", selected_arch
->name
);
9208 aarch64_print_extension ();
9210 else if (selected_cpu
)
9212 const char *truncated_name
9213 = aarch64_rewrite_selected_cpu (selected_cpu
->name
);
9214 asm_fprintf (asm_out_file
, "\t.cpu %s", truncated_name
);
9215 aarch64_print_extension ();
9217 default_file_start();
9220 /* Target hook for c_mode_for_suffix. */
9222 aarch64_c_mode_for_suffix (char suffix
)
9230 /* We can only represent floating point constants which will fit in
9231 "quarter-precision" values. These values are characterised by
9232 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
9235 (-1)^s * (n/16) * 2^r
9238 's' is the sign bit.
9239 'n' is an integer in the range 16 <= n <= 31.
9240 'r' is an integer in the range -3 <= r <= 4. */
9242 /* Return true iff X can be represented by a quarter-precision
9243 floating point immediate operand X. Note, we cannot represent 0.0. */
9245 aarch64_float_const_representable_p (rtx x
)
9247 /* This represents our current view of how many bits
9248 make up the mantissa. */
9249 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
9251 unsigned HOST_WIDE_INT mantissa
, mask
;
9252 REAL_VALUE_TYPE r
, m
;
9255 if (!CONST_DOUBLE_P (x
))
9258 if (GET_MODE (x
) == VOIDmode
)
9261 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
9263 /* We cannot represent infinities, NaNs or +/-zero. We won't
9264 know if we have +zero until we analyse the mantissa, but we
9265 can reject the other invalid values. */
9266 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
9267 || REAL_VALUE_MINUS_ZERO (r
))
9270 /* Extract exponent. */
9271 r
= real_value_abs (&r
);
9272 exponent
= REAL_EXP (&r
);
9274 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9275 highest (sign) bit, with a fixed binary point at bit point_pos.
9276 m1 holds the low part of the mantissa, m2 the high part.
9277 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9278 bits for the mantissa, this can fail (low bits will be lost). */
9279 real_ldexp (&m
, &r
, point_pos
- exponent
);
9280 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
9282 /* If the low part of the mantissa has bits set we cannot represent
9286 /* We have rejected the lower HOST_WIDE_INT, so update our
9287 understanding of how many bits lie in the mantissa and
9288 look only at the high HOST_WIDE_INT. */
9289 mantissa
= w
.elt (1);
9290 point_pos
-= HOST_BITS_PER_WIDE_INT
;
9292 /* We can only represent values with a mantissa of the form 1.xxxx. */
9293 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
9294 if ((mantissa
& mask
) != 0)
9297 /* Having filtered unrepresentable values, we may now remove all
9298 but the highest 5 bits. */
9299 mantissa
>>= point_pos
- 5;
9301 /* We cannot represent the value 0.0, so reject it. This is handled
9306 /* Then, as bit 4 is always set, we can mask it off, leaving
9307 the mantissa in the range [0, 15]. */
9308 mantissa
&= ~(1 << 4);
9309 gcc_assert (mantissa
<= 15);
9311 /* GCC internally does not use IEEE754-like encoding (where normalized
9312 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9313 Our mantissa values are shifted 4 places to the left relative to
9314 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9315 by 5 places to correct for GCC's representation. */
9316 exponent
= 5 - exponent
;
9318 return (exponent
>= 0 && exponent
<= 7);
9322 aarch64_output_simd_mov_immediate (rtx const_vector
,
9327 static char templ
[40];
9328 const char *mnemonic
;
9329 const char *shift_op
;
9330 unsigned int lane_count
= 0;
9333 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
9335 /* This will return true to show const_vector is legal for use as either
9336 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9337 also update INFO to show how the immediate should be generated. */
9338 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
9339 gcc_assert (is_valid
);
9341 element_char
= sizetochar (info
.element_width
);
9342 lane_count
= width
/ info
.element_width
;
9344 mode
= GET_MODE_INNER (mode
);
9345 if (mode
== SFmode
|| mode
== DFmode
)
9347 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
9348 if (aarch64_float_const_zero_rtx_p (info
.value
))
9349 info
.value
= GEN_INT (0);
9354 REAL_VALUE_FROM_CONST_DOUBLE (r
, info
.value
);
9355 char float_buf
[buf_size
] = {'\0'};
9356 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
, 1, mode
);
9359 if (lane_count
== 1)
9360 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
9362 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
9363 lane_count
, element_char
, float_buf
);
9368 mnemonic
= info
.mvn
? "mvni" : "movi";
9369 shift_op
= info
.msl
? "msl" : "lsl";
9371 if (lane_count
== 1)
9372 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
9373 mnemonic
, UINTVAL (info
.value
));
9374 else if (info
.shift
)
9375 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9376 ", %s %d", mnemonic
, lane_count
, element_char
,
9377 UINTVAL (info
.value
), shift_op
, info
.shift
);
9379 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
9380 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
9385 aarch64_output_scalar_simd_mov_immediate (rtx immediate
,
9390 gcc_assert (!VECTOR_MODE_P (mode
));
9391 vmode
= aarch64_simd_container_mode (mode
, 64);
9392 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
9393 return aarch64_output_simd_mov_immediate (v_op
, vmode
, 64);
9396 /* Split operands into moves from op[1] + op[2] into op[0]. */
9399 aarch64_split_combinev16qi (rtx operands
[3])
9401 unsigned int dest
= REGNO (operands
[0]);
9402 unsigned int src1
= REGNO (operands
[1]);
9403 unsigned int src2
= REGNO (operands
[2]);
9404 machine_mode halfmode
= GET_MODE (operands
[1]);
9405 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
9408 gcc_assert (halfmode
== V16QImode
);
9410 if (src1
== dest
&& src2
== dest
+ halfregs
)
9412 /* No-op move. Can't split to nothing; emit something. */
9413 emit_note (NOTE_INSN_DELETED
);
9417 /* Preserve register attributes for variable tracking. */
9418 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
9419 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
9420 GET_MODE_SIZE (halfmode
));
9422 /* Special case of reversed high/low parts. */
9423 if (reg_overlap_mentioned_p (operands
[2], destlo
)
9424 && reg_overlap_mentioned_p (operands
[1], desthi
))
9426 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
9427 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
9428 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
9430 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
9432 /* Try to avoid unnecessary moves if part of the result
9433 is in the right place already. */
9435 emit_move_insn (destlo
, operands
[1]);
9436 if (src2
!= dest
+ halfregs
)
9437 emit_move_insn (desthi
, operands
[2]);
9441 if (src2
!= dest
+ halfregs
)
9442 emit_move_insn (desthi
, operands
[2]);
9444 emit_move_insn (destlo
, operands
[1]);
9448 /* vec_perm support. */
9450 #define MAX_VECT_LEN 16
9452 struct expand_vec_perm_d
9454 rtx target
, op0
, op1
;
9455 unsigned char perm
[MAX_VECT_LEN
];
9462 /* Generate a variable permutation. */
9465 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
9467 machine_mode vmode
= GET_MODE (target
);
9468 bool one_vector_p
= rtx_equal_p (op0
, op1
);
9470 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
9471 gcc_checking_assert (GET_MODE (op0
) == vmode
);
9472 gcc_checking_assert (GET_MODE (op1
) == vmode
);
9473 gcc_checking_assert (GET_MODE (sel
) == vmode
);
9474 gcc_checking_assert (TARGET_SIMD
);
9478 if (vmode
== V8QImode
)
9480 /* Expand the argument to a V16QI mode by duplicating it. */
9481 rtx pair
= gen_reg_rtx (V16QImode
);
9482 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
9483 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
9487 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
9494 if (vmode
== V8QImode
)
9496 pair
= gen_reg_rtx (V16QImode
);
9497 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
9498 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
9502 pair
= gen_reg_rtx (OImode
);
9503 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
9504 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
9510 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
9512 machine_mode vmode
= GET_MODE (target
);
9513 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
9514 bool one_vector_p
= rtx_equal_p (op0
, op1
);
9517 /* The TBL instruction does not use a modulo index, so we must take care
9518 of that ourselves. */
9519 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
9520 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9521 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
9523 /* For big-endian, we also need to reverse the index within the vector
9524 (but not which vector). */
9525 if (BYTES_BIG_ENDIAN
)
9527 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9529 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
9530 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
9531 NULL
, 0, OPTAB_LIB_WIDEN
);
9533 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
9536 /* Recognize patterns suitable for the TRN instructions. */
9538 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
9540 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
9541 rtx out
, in0
, in1
, x
;
9542 rtx (*gen
) (rtx
, rtx
, rtx
);
9543 machine_mode vmode
= d
->vmode
;
9545 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9548 /* Note that these are little-endian tests.
9549 We correct for big-endian later. */
9550 if (d
->perm
[0] == 0)
9552 else if (d
->perm
[0] == 1)
9556 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9558 for (i
= 0; i
< nelt
; i
+= 2)
9560 if (d
->perm
[i
] != i
+ odd
)
9562 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
9572 if (BYTES_BIG_ENDIAN
)
9574 x
= in0
, in0
= in1
, in1
= x
;
9583 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
9584 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
9585 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
9586 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
9587 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
9588 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
9589 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
9590 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
9591 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
9592 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
9601 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
9602 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
9603 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
9604 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
9605 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
9606 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
9607 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
9608 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
9609 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
9610 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
9616 emit_insn (gen (out
, in0
, in1
));
9620 /* Recognize patterns suitable for the UZP instructions. */
9622 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
9624 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
9625 rtx out
, in0
, in1
, x
;
9626 rtx (*gen
) (rtx
, rtx
, rtx
);
9627 machine_mode vmode
= d
->vmode
;
9629 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9632 /* Note that these are little-endian tests.
9633 We correct for big-endian later. */
9634 if (d
->perm
[0] == 0)
9636 else if (d
->perm
[0] == 1)
9640 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9642 for (i
= 0; i
< nelt
; i
++)
9644 unsigned elt
= (i
* 2 + odd
) & mask
;
9645 if (d
->perm
[i
] != elt
)
9655 if (BYTES_BIG_ENDIAN
)
9657 x
= in0
, in0
= in1
, in1
= x
;
9666 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
9667 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
9668 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
9669 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
9670 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
9671 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
9672 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
9673 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
9674 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
9675 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
9684 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
9685 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
9686 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
9687 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
9688 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
9689 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
9690 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
9691 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
9692 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
9693 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
9699 emit_insn (gen (out
, in0
, in1
));
9703 /* Recognize patterns suitable for the ZIP instructions. */
9705 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
9707 unsigned int i
, high
, mask
, nelt
= d
->nelt
;
9708 rtx out
, in0
, in1
, x
;
9709 rtx (*gen
) (rtx
, rtx
, rtx
);
9710 machine_mode vmode
= d
->vmode
;
9712 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9715 /* Note that these are little-endian tests.
9716 We correct for big-endian later. */
9718 if (d
->perm
[0] == high
)
9721 else if (d
->perm
[0] == 0)
9725 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9727 for (i
= 0; i
< nelt
/ 2; i
++)
9729 unsigned elt
= (i
+ high
) & mask
;
9730 if (d
->perm
[i
* 2] != elt
)
9732 elt
= (elt
+ nelt
) & mask
;
9733 if (d
->perm
[i
* 2 + 1] != elt
)
9743 if (BYTES_BIG_ENDIAN
)
9745 x
= in0
, in0
= in1
, in1
= x
;
9754 case V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
9755 case V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
9756 case V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
9757 case V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
9758 case V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
9759 case V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
9760 case V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
9761 case V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
9762 case V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
9763 case V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
9772 case V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
9773 case V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
9774 case V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
9775 case V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
9776 case V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
9777 case V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
9778 case V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
9779 case V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
9780 case V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
9781 case V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
9787 emit_insn (gen (out
, in0
, in1
));
9791 /* Recognize patterns for the EXT insn. */
9794 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
9796 unsigned int i
, nelt
= d
->nelt
;
9797 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
9800 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
9802 /* Check if the extracted indices are increasing by one. */
9803 for (i
= 1; i
< nelt
; i
++)
9805 unsigned int required
= location
+ i
;
9806 if (d
->one_vector_p
)
9808 /* We'll pass the same vector in twice, so allow indices to wrap. */
9809 required
&= (nelt
- 1);
9811 if (d
->perm
[i
] != required
)
9817 case V16QImode
: gen
= gen_aarch64_extv16qi
; break;
9818 case V8QImode
: gen
= gen_aarch64_extv8qi
; break;
9819 case V4HImode
: gen
= gen_aarch64_extv4hi
; break;
9820 case V8HImode
: gen
= gen_aarch64_extv8hi
; break;
9821 case V2SImode
: gen
= gen_aarch64_extv2si
; break;
9822 case V4SImode
: gen
= gen_aarch64_extv4si
; break;
9823 case V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
9824 case V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
9825 case V2DImode
: gen
= gen_aarch64_extv2di
; break;
9826 case V2DFmode
: gen
= gen_aarch64_extv2df
; break;
9835 /* The case where (location == 0) is a no-op for both big- and little-endian,
9836 and is removed by the mid-end at optimization levels -O1 and higher. */
9838 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
9840 /* After setup, we want the high elements of the first vector (stored
9841 at the LSB end of the register), and the low elements of the second
9842 vector (stored at the MSB end of the register). So swap. */
9843 std::swap (d
->op0
, d
->op1
);
9844 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9845 location
= nelt
- location
;
9848 offset
= GEN_INT (location
);
9849 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
9853 /* Recognize patterns for the REV insns. */
9856 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
9858 unsigned int i
, j
, diff
, nelt
= d
->nelt
;
9859 rtx (*gen
) (rtx
, rtx
);
9861 if (!d
->one_vector_p
)
9870 case V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
9871 case V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
9879 case V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
9880 case V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
9881 case V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
9882 case V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
9890 case V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
9891 case V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
9892 case V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
9893 case V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
9894 case V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
9895 case V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
9896 case V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
9897 case V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
9906 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
9907 for (j
= 0; j
<= diff
; j
+= 1)
9909 /* This is guaranteed to be true as the value of diff
9910 is 7, 3, 1 and we should have enough elements in the
9911 queue to generate this. Getting a vector mask with a
9912 value of diff other than these values implies that
9913 something is wrong by the time we get here. */
9914 gcc_assert (i
+ j
< nelt
);
9915 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
9923 emit_insn (gen (d
->target
, d
->op0
));
9928 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
9930 rtx (*gen
) (rtx
, rtx
, rtx
);
9931 rtx out
= d
->target
;
9933 machine_mode vmode
= d
->vmode
;
9934 unsigned int i
, elt
, nelt
= d
->nelt
;
9938 for (i
= 1; i
< nelt
; i
++)
9940 if (elt
!= d
->perm
[i
])
9944 /* The generic preparation in aarch64_expand_vec_perm_const_1
9945 swaps the operand order and the permute indices if it finds
9946 d->perm[0] to be in the second operand. Thus, we can always
9947 use d->op0 and need not do any extra arithmetic to get the
9948 correct lane number. */
9950 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
9954 case V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
9955 case V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
9956 case V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
9957 case V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
9958 case V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
9959 case V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
9960 case V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
9961 case V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
9962 case V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
9963 case V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
9968 emit_insn (gen (out
, in0
, lane
));
9973 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
9975 rtx rperm
[MAX_VECT_LEN
], sel
;
9976 machine_mode vmode
= d
->vmode
;
9977 unsigned int i
, nelt
= d
->nelt
;
9982 /* Generic code will try constant permutation twice. Once with the
9983 original mode and again with the elements lowered to QImode.
9984 So wait and don't do the selector expansion ourselves. */
9985 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
9988 for (i
= 0; i
< nelt
; ++i
)
9990 int nunits
= GET_MODE_NUNITS (vmode
);
9992 /* If big-endian and two vectors we end up with a weird mixed-endian
9993 mode on NEON. Reverse the index within each word but not the word
9995 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? d
->perm
[i
] ^ (nunits
- 1)
9998 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
9999 sel
= force_reg (vmode
, sel
);
10001 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
10006 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
10008 /* The pattern matching functions above are written to look for a small
10009 number to begin the sequence (0, 1, N/2). If we begin with an index
10010 from the second operand, we can swap the operands. */
10011 if (d
->perm
[0] >= d
->nelt
)
10013 unsigned i
, nelt
= d
->nelt
;
10015 gcc_assert (nelt
== (nelt
& -nelt
));
10016 for (i
= 0; i
< nelt
; ++i
)
10017 d
->perm
[i
] ^= nelt
; /* Keep the same index, but in the other vector. */
10019 std::swap (d
->op0
, d
->op1
);
10024 if (aarch64_evpc_rev (d
))
10026 else if (aarch64_evpc_ext (d
))
10028 else if (aarch64_evpc_dup (d
))
10030 else if (aarch64_evpc_zip (d
))
10032 else if (aarch64_evpc_uzp (d
))
10034 else if (aarch64_evpc_trn (d
))
10036 return aarch64_evpc_tbl (d
);
10041 /* Expand a vec_perm_const pattern. */
10044 aarch64_expand_vec_perm_const (rtx target
, rtx op0
, rtx op1
, rtx sel
)
10046 struct expand_vec_perm_d d
;
10047 int i
, nelt
, which
;
10053 d
.vmode
= GET_MODE (target
);
10054 gcc_assert (VECTOR_MODE_P (d
.vmode
));
10055 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
10056 d
.testing_p
= false;
10058 for (i
= which
= 0; i
< nelt
; ++i
)
10060 rtx e
= XVECEXP (sel
, 0, i
);
10061 int ei
= INTVAL (e
) & (2 * nelt
- 1);
10062 which
|= (ei
< nelt
? 1 : 2);
10069 gcc_unreachable ();
10072 d
.one_vector_p
= false;
10073 if (!rtx_equal_p (op0
, op1
))
10076 /* The elements of PERM do not suggest that only the first operand
10077 is used, but both operands are identical. Allow easier matching
10078 of the permutation by folding the permutation into the single
10080 /* Fall Through. */
10082 for (i
= 0; i
< nelt
; ++i
)
10083 d
.perm
[i
] &= nelt
- 1;
10085 d
.one_vector_p
= true;
10090 d
.one_vector_p
= true;
10094 return aarch64_expand_vec_perm_const_1 (&d
);
10098 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
10099 const unsigned char *sel
)
10101 struct expand_vec_perm_d d
;
10102 unsigned int i
, nelt
, which
;
10106 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
10107 d
.testing_p
= true;
10108 memcpy (d
.perm
, sel
, nelt
);
10110 /* Calculate whether all elements are in one vector. */
10111 for (i
= which
= 0; i
< nelt
; ++i
)
10113 unsigned char e
= d
.perm
[i
];
10114 gcc_assert (e
< 2 * nelt
);
10115 which
|= (e
< nelt
? 1 : 2);
10118 /* If all elements are from the second vector, reindex as if from the
10121 for (i
= 0; i
< nelt
; ++i
)
10124 /* Check whether the mask can be applied to a single vector. */
10125 d
.one_vector_p
= (which
!= 3);
10127 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
10128 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
10129 if (!d
.one_vector_p
)
10130 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
10133 ret
= aarch64_expand_vec_perm_const_1 (&d
);
10140 aarch64_reverse_mask (enum machine_mode mode
)
10142 /* We have to reverse each vector because we dont have
10143 a permuted load that can reverse-load according to ABI rules. */
10145 rtvec v
= rtvec_alloc (16);
10147 int nunits
= GET_MODE_NUNITS (mode
);
10148 int usize
= GET_MODE_UNIT_SIZE (mode
);
10150 gcc_assert (BYTES_BIG_ENDIAN
);
10151 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
10153 for (i
= 0; i
< nunits
; i
++)
10154 for (j
= 0; j
< usize
; j
++)
10155 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
10156 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
10157 return force_reg (V16QImode
, mask
);
10160 /* Implement MODES_TIEABLE_P. */
10163 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
10165 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
10168 /* We specifically want to allow elements of "structure" modes to
10169 be tieable to the structure. This more general condition allows
10170 other rarer situations too. */
10172 && aarch64_vector_mode_p (mode1
)
10173 && aarch64_vector_mode_p (mode2
))
10179 /* Return a new RTX holding the result of moving POINTER forward by
10183 aarch64_move_pointer (rtx pointer
, int amount
)
10185 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
10187 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
10191 /* Return a new RTX holding the result of moving POINTER forward by the
10192 size of the mode it points to. */
10195 aarch64_progress_pointer (rtx pointer
)
10197 HOST_WIDE_INT amount
= GET_MODE_SIZE (GET_MODE (pointer
));
10199 return aarch64_move_pointer (pointer
, amount
);
10202 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10206 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
10209 rtx reg
= gen_reg_rtx (mode
);
10211 /* "Cast" the pointers to the correct mode. */
10212 *src
= adjust_address (*src
, mode
, 0);
10213 *dst
= adjust_address (*dst
, mode
, 0);
10214 /* Emit the memcpy. */
10215 emit_move_insn (reg
, *src
);
10216 emit_move_insn (*dst
, reg
);
10217 /* Move the pointers forward. */
10218 *src
= aarch64_progress_pointer (*src
);
10219 *dst
= aarch64_progress_pointer (*dst
);
10222 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10223 we succeed, otherwise return false. */
10226 aarch64_expand_movmem (rtx
*operands
)
10229 rtx dst
= operands
[0];
10230 rtx src
= operands
[1];
10232 bool speed_p
= !optimize_function_for_size_p (cfun
);
10234 /* When optimizing for size, give a better estimate of the length of a
10235 memcpy call, but use the default otherwise. */
10236 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
10238 /* We can't do anything smart if the amount to copy is not constant. */
10239 if (!CONST_INT_P (operands
[2]))
10242 n
= UINTVAL (operands
[2]);
10244 /* Try to keep the number of instructions low. For cases below 16 bytes we
10245 need to make at most two moves. For cases above 16 bytes it will be one
10246 move for each 16 byte chunk, then at most two additional moves. */
10247 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
10250 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
10251 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
10253 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
10254 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
10256 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10262 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
10267 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
10272 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10273 4-byte chunk, partially overlapping with the previously copied chunk. */
10276 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10282 src
= aarch64_move_pointer (src
, move
);
10283 dst
= aarch64_move_pointer (dst
, move
);
10284 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10289 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10290 them, then (if applicable) an 8-byte chunk. */
10295 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
10300 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
10305 /* Finish the final bytes of the copy. We can always do this in one
10306 instruction. We either copy the exact amount we need, or partially
10307 overlap with the previous chunk we copied and copy 8-bytes. */
10311 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
10313 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
10315 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10320 src
= aarch64_move_pointer (src
, -1);
10321 dst
= aarch64_move_pointer (dst
, -1);
10322 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10328 src
= aarch64_move_pointer (src
, move
);
10329 dst
= aarch64_move_pointer (dst
, move
);
10330 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
10337 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10339 static unsigned HOST_WIDE_INT
10340 aarch64_asan_shadow_offset (void)
10342 return (HOST_WIDE_INT_1
<< 36);
10346 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size
,
10347 unsigned int align
,
10348 enum by_pieces_operation op
,
10351 /* STORE_BY_PIECES can be used when copying a constant string, but
10352 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10353 For now we always fail this and let the move_by_pieces code copy
10354 the string from read-only memory. */
10355 if (op
== STORE_BY_PIECES
)
10358 return default_use_by_pieces_infrastructure_p (size
, align
, op
, speed_p
);
10361 static enum machine_mode
10362 aarch64_code_to_ccmode (enum rtx_code code
)
10385 return CC_DLEUmode
;
10388 return CC_DLTUmode
;
10391 return CC_DGEUmode
;
10394 return CC_DGTUmode
;
10402 aarch64_gen_ccmp_first (rtx
*prep_seq
, rtx
*gen_seq
,
10403 int code
, tree treeop0
, tree treeop1
)
10405 enum machine_mode op_mode
, cmp_mode
, cc_mode
;
10406 rtx op0
, op1
, cmp
, target
;
10407 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
10408 enum insn_code icode
;
10409 struct expand_operand ops
[4];
10411 cc_mode
= aarch64_code_to_ccmode ((enum rtx_code
) code
);
10412 if (cc_mode
== CCmode
)
10416 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
10418 op_mode
= GET_MODE (op0
);
10419 if (op_mode
== VOIDmode
)
10420 op_mode
= GET_MODE (op1
);
10428 icode
= CODE_FOR_cmpsi
;
10433 icode
= CODE_FOR_cmpdi
;
10441 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
10442 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
10448 *prep_seq
= get_insns ();
10451 cmp
= gen_rtx_fmt_ee ((enum rtx_code
) code
, cmp_mode
, op0
, op1
);
10452 target
= gen_rtx_REG (CCmode
, CC_REGNUM
);
10454 create_output_operand (&ops
[0], target
, CCmode
);
10455 create_fixed_operand (&ops
[1], cmp
);
10456 create_fixed_operand (&ops
[2], op0
);
10457 create_fixed_operand (&ops
[3], op1
);
10460 if (!maybe_expand_insn (icode
, 4, ops
))
10465 *gen_seq
= get_insns ();
10468 return gen_rtx_REG (cc_mode
, CC_REGNUM
);
10472 aarch64_gen_ccmp_next (rtx
*prep_seq
, rtx
*gen_seq
, rtx prev
, int cmp_code
,
10473 tree treeop0
, tree treeop1
, int bit_code
)
10475 rtx op0
, op1
, cmp0
, cmp1
, target
;
10476 enum machine_mode op_mode
, cmp_mode
, cc_mode
;
10477 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
10478 enum insn_code icode
= CODE_FOR_ccmp_andsi
;
10479 struct expand_operand ops
[6];
10481 cc_mode
= aarch64_code_to_ccmode ((enum rtx_code
) cmp_code
);
10482 if (cc_mode
== CCmode
)
10485 push_to_sequence ((rtx_insn
*) *prep_seq
);
10486 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
10488 op_mode
= GET_MODE (op0
);
10489 if (op_mode
== VOIDmode
)
10490 op_mode
= GET_MODE (op1
);
10498 icode
= (enum rtx_code
) bit_code
== AND
? CODE_FOR_ccmp_andsi
10499 : CODE_FOR_ccmp_iorsi
;
10504 icode
= (enum rtx_code
) bit_code
== AND
? CODE_FOR_ccmp_anddi
10505 : CODE_FOR_ccmp_iordi
;
10513 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
10514 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
10520 *prep_seq
= get_insns ();
10523 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
10524 cmp1
= gen_rtx_fmt_ee ((enum rtx_code
) cmp_code
, cmp_mode
, op0
, op1
);
10525 cmp0
= gen_rtx_fmt_ee (NE
, cmp_mode
, prev
, const0_rtx
);
10527 create_fixed_operand (&ops
[0], prev
);
10528 create_fixed_operand (&ops
[1], target
);
10529 create_fixed_operand (&ops
[2], op0
);
10530 create_fixed_operand (&ops
[3], op1
);
10531 create_fixed_operand (&ops
[4], cmp0
);
10532 create_fixed_operand (&ops
[5], cmp1
);
10534 push_to_sequence ((rtx_insn
*) *gen_seq
);
10535 if (!maybe_expand_insn (icode
, 6, ops
))
10541 *gen_seq
= get_insns ();
10547 #undef TARGET_GEN_CCMP_FIRST
10548 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10550 #undef TARGET_GEN_CCMP_NEXT
10551 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10553 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10554 instruction fusion of some sort. */
10557 aarch64_macro_fusion_p (void)
10559 return aarch64_tune_params
->fuseable_ops
!= AARCH64_FUSE_NOTHING
;
10563 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10564 should be kept together during scheduling. */
10567 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
10570 rtx prev_set
= single_set (prev
);
10571 rtx curr_set
= single_set (curr
);
10572 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10573 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
10575 if (!aarch64_macro_fusion_p ())
10579 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_MOV_MOVK
))
10581 /* We are trying to match:
10582 prev (mov) == (set (reg r0) (const_int imm16))
10583 curr (movk) == (set (zero_extract (reg r0)
10586 (const_int imm16_1)) */
10588 set_dest
= SET_DEST (curr_set
);
10590 if (GET_CODE (set_dest
) == ZERO_EXTRACT
10591 && CONST_INT_P (SET_SRC (curr_set
))
10592 && CONST_INT_P (SET_SRC (prev_set
))
10593 && CONST_INT_P (XEXP (set_dest
, 2))
10594 && INTVAL (XEXP (set_dest
, 2)) == 16
10595 && REG_P (XEXP (set_dest
, 0))
10596 && REG_P (SET_DEST (prev_set
))
10597 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
10604 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_ADRP_ADD
))
10607 /* We're trying to match:
10608 prev (adrp) == (set (reg r1)
10609 (high (symbol_ref ("SYM"))))
10610 curr (add) == (set (reg r0)
10612 (symbol_ref ("SYM"))))
10613 Note that r0 need not necessarily be the same as r1, especially
10614 during pre-regalloc scheduling. */
10616 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
10617 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
10619 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
10620 && REG_P (XEXP (SET_SRC (curr_set
), 0))
10621 && REGNO (XEXP (SET_SRC (curr_set
), 0))
10622 == REGNO (SET_DEST (prev_set
))
10623 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
10624 XEXP (SET_SRC (curr_set
), 1)))
10630 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_MOVK_MOVK
))
10633 /* We're trying to match:
10634 prev (movk) == (set (zero_extract (reg r0)
10637 (const_int imm16_1))
10638 curr (movk) == (set (zero_extract (reg r0)
10641 (const_int imm16_2)) */
10643 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
10644 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
10645 && REG_P (XEXP (SET_DEST (prev_set
), 0))
10646 && REG_P (XEXP (SET_DEST (curr_set
), 0))
10647 && REGNO (XEXP (SET_DEST (prev_set
), 0))
10648 == REGNO (XEXP (SET_DEST (curr_set
), 0))
10649 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
10650 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
10651 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
10652 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
10653 && CONST_INT_P (SET_SRC (prev_set
))
10654 && CONST_INT_P (SET_SRC (curr_set
)))
10659 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_ADRP_LDR
))
10661 /* We're trying to match:
10662 prev (adrp) == (set (reg r0)
10663 (high (symbol_ref ("SYM"))))
10664 curr (ldr) == (set (reg r1)
10665 (mem (lo_sum (reg r0)
10666 (symbol_ref ("SYM")))))
10668 curr (ldr) == (set (reg r1)
10671 (symbol_ref ("SYM")))))) */
10672 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
10673 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
10675 rtx curr_src
= SET_SRC (curr_set
);
10677 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
10678 curr_src
= XEXP (curr_src
, 0);
10680 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
10681 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
10682 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
10683 == REGNO (SET_DEST (prev_set
))
10684 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
10685 XEXP (SET_SRC (prev_set
), 0)))
10690 if ((aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_CMP_BRANCH
)
10691 && any_condjump_p (curr
))
10693 enum attr_type prev_type
= get_attr_type (prev
);
10695 /* FIXME: this misses some which is considered simple arthematic
10696 instructions for ThunderX. Simple shifts are missed here. */
10697 if (prev_type
== TYPE_ALUS_SREG
10698 || prev_type
== TYPE_ALUS_IMM
10699 || prev_type
== TYPE_LOGICS_REG
10700 || prev_type
== TYPE_LOGICS_IMM
)
10707 /* If MEM is in the form of [base+offset], extract the two parts
10708 of address and set to BASE and OFFSET, otherwise return false
10709 after clearing BASE and OFFSET. */
10712 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
10716 gcc_assert (MEM_P (mem
));
10718 addr
= XEXP (mem
, 0);
10723 *offset
= const0_rtx
;
10727 if (GET_CODE (addr
) == PLUS
10728 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
10730 *base
= XEXP (addr
, 0);
10731 *offset
= XEXP (addr
, 1);
10736 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
10752 /* If INSN is a load or store of address in the form of [base+offset],
10753 extract the two parts and set to BASE and OFFSET. Return scheduling
10754 fusion type this INSN is. */
10756 static enum sched_fusion_type
10757 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
10760 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
10762 gcc_assert (INSN_P (insn
));
10763 x
= PATTERN (insn
);
10764 if (GET_CODE (x
) != SET
)
10765 return SCHED_FUSION_NONE
;
10768 dest
= SET_DEST (x
);
10770 if (GET_MODE (dest
) != SImode
&& GET_MODE (dest
) != DImode
10771 && GET_MODE (dest
) != SFmode
&& GET_MODE (dest
) != DFmode
)
10772 return SCHED_FUSION_NONE
;
10774 if (GET_CODE (src
) == SIGN_EXTEND
)
10776 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
10777 src
= XEXP (src
, 0);
10778 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
10779 return SCHED_FUSION_NONE
;
10781 else if (GET_CODE (src
) == ZERO_EXTEND
)
10783 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
10784 src
= XEXP (src
, 0);
10785 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
10786 return SCHED_FUSION_NONE
;
10789 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
10790 extract_base_offset_in_addr (src
, base
, offset
);
10791 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
10793 fusion
= SCHED_FUSION_ST
;
10794 extract_base_offset_in_addr (dest
, base
, offset
);
10797 return SCHED_FUSION_NONE
;
10799 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
10800 fusion
= SCHED_FUSION_NONE
;
10805 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10807 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
10808 and PRI are only calculated for these instructions. For other instruction,
10809 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10810 type instruction fusion can be added by returning different priorities.
10812 It's important that irrelevant instructions get the largest FUSION_PRI. */
10815 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
10816 int *fusion_pri
, int *pri
)
10820 enum sched_fusion_type fusion
;
10822 gcc_assert (INSN_P (insn
));
10825 fusion
= fusion_load_store (insn
, &base
, &offset
);
10826 if (fusion
== SCHED_FUSION_NONE
)
10833 /* Set FUSION_PRI according to fusion type and base register. */
10834 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
10836 /* Calculate PRI. */
10839 /* INSN with smaller offset goes first. */
10840 off_val
= (int)(INTVAL (offset
));
10842 tmp
-= (off_val
& 0xfffff);
10844 tmp
+= ((- off_val
) & 0xfffff);
10850 /* Given OPERANDS of consecutive load/store, check if we can merge
10851 them into ldp/stp. LOAD is true if they are load instructions.
10852 MODE is the mode of memory operands. */
10855 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
10856 enum machine_mode mode
)
10858 HOST_WIDE_INT offval_1
, offval_2
, msize
;
10859 enum reg_class rclass_1
, rclass_2
;
10860 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
10864 mem_1
= operands
[1];
10865 mem_2
= operands
[3];
10866 reg_1
= operands
[0];
10867 reg_2
= operands
[2];
10868 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
10869 if (REGNO (reg_1
) == REGNO (reg_2
))
10874 mem_1
= operands
[0];
10875 mem_2
= operands
[2];
10876 reg_1
= operands
[1];
10877 reg_2
= operands
[3];
10880 /* The mems cannot be volatile. */
10881 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
10884 /* Check if the addresses are in the form of [base+offset]. */
10885 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
10886 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
10888 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
10889 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
10892 /* Check if the bases are same. */
10893 if (!rtx_equal_p (base_1
, base_2
))
10896 offval_1
= INTVAL (offset_1
);
10897 offval_2
= INTVAL (offset_2
);
10898 msize
= GET_MODE_SIZE (mode
);
10899 /* Check if the offsets are consecutive. */
10900 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
10903 /* Check if the addresses are clobbered by load. */
10906 if (reg_mentioned_p (reg_1
, mem_1
))
10909 /* In increasing order, the last load can clobber the address. */
10910 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
10914 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
10915 rclass_1
= FP_REGS
;
10917 rclass_1
= GENERAL_REGS
;
10919 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
10920 rclass_2
= FP_REGS
;
10922 rclass_2
= GENERAL_REGS
;
10924 /* Check if the registers are of same class. */
10925 if (rclass_1
!= rclass_2
)
10931 /* Given OPERANDS of consecutive load/store, check if we can merge
10932 them into ldp/stp by adjusting the offset. LOAD is true if they
10933 are load instructions. MODE is the mode of memory operands.
10935 Given below consecutive stores:
10937 str w1, [xb, 0x100]
10938 str w1, [xb, 0x104]
10939 str w1, [xb, 0x108]
10940 str w1, [xb, 0x10c]
10942 Though the offsets are out of the range supported by stp, we can
10943 still pair them after adjusting the offset, like:
10945 add scratch, xb, 0x100
10946 stp w1, w1, [scratch]
10947 stp w1, w1, [scratch, 0x8]
10949 The peephole patterns detecting this opportunity should guarantee
10950 the scratch register is avaliable. */
10953 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
10954 enum machine_mode mode
)
10956 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
10957 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
10958 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
10959 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
10963 reg_1
= operands
[0];
10964 mem_1
= operands
[1];
10965 reg_2
= operands
[2];
10966 mem_2
= operands
[3];
10967 reg_3
= operands
[4];
10968 mem_3
= operands
[5];
10969 reg_4
= operands
[6];
10970 mem_4
= operands
[7];
10971 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
10972 && REG_P (reg_3
) && REG_P (reg_4
));
10973 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
10978 mem_1
= operands
[0];
10979 reg_1
= operands
[1];
10980 mem_2
= operands
[2];
10981 reg_2
= operands
[3];
10982 mem_3
= operands
[4];
10983 reg_3
= operands
[5];
10984 mem_4
= operands
[6];
10985 reg_4
= operands
[7];
10987 /* Skip if memory operand is by itslef valid for ldp/stp. */
10988 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
10991 /* The mems cannot be volatile. */
10992 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
10993 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
10996 /* Check if the addresses are in the form of [base+offset]. */
10997 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
10998 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
11000 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
11001 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
11003 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
11004 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
11006 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
11007 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
11010 /* Check if the bases are same. */
11011 if (!rtx_equal_p (base_1
, base_2
)
11012 || !rtx_equal_p (base_2
, base_3
)
11013 || !rtx_equal_p (base_3
, base_4
))
11016 offval_1
= INTVAL (offset_1
);
11017 offval_2
= INTVAL (offset_2
);
11018 offval_3
= INTVAL (offset_3
);
11019 offval_4
= INTVAL (offset_4
);
11020 msize
= GET_MODE_SIZE (mode
);
11021 /* Check if the offsets are consecutive. */
11022 if ((offval_1
!= (offval_2
+ msize
)
11023 || offval_1
!= (offval_3
+ msize
* 2)
11024 || offval_1
!= (offval_4
+ msize
* 3))
11025 && (offval_4
!= (offval_3
+ msize
)
11026 || offval_4
!= (offval_2
+ msize
* 2)
11027 || offval_4
!= (offval_1
+ msize
* 3)))
11030 /* Check if the addresses are clobbered by load. */
11033 if (reg_mentioned_p (reg_1
, mem_1
)
11034 || reg_mentioned_p (reg_2
, mem_2
)
11035 || reg_mentioned_p (reg_3
, mem_3
))
11038 /* In increasing order, the last load can clobber the address. */
11039 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
11043 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
11044 rclass_1
= FP_REGS
;
11046 rclass_1
= GENERAL_REGS
;
11048 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
11049 rclass_2
= FP_REGS
;
11051 rclass_2
= GENERAL_REGS
;
11053 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
11054 rclass_3
= FP_REGS
;
11056 rclass_3
= GENERAL_REGS
;
11058 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
11059 rclass_4
= FP_REGS
;
11061 rclass_4
= GENERAL_REGS
;
11063 /* Check if the registers are of same class. */
11064 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
11070 /* Given OPERANDS of consecutive load/store, this function pairs them
11071 into ldp/stp after adjusting the offset. It depends on the fact
11072 that addresses of load/store instructions are in increasing order.
11073 MODE is the mode of memory operands. CODE is the rtl operator
11074 which should be applied to all memory operands, it's SIGN_EXTEND,
11075 ZERO_EXTEND or UNKNOWN. */
11078 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
11079 enum machine_mode mode
, RTX_CODE code
)
11081 rtx base
, offset
, t1
, t2
;
11082 rtx mem_1
, mem_2
, mem_3
, mem_4
;
11083 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
11087 mem_1
= operands
[1];
11088 mem_2
= operands
[3];
11089 mem_3
= operands
[5];
11090 mem_4
= operands
[7];
11094 mem_1
= operands
[0];
11095 mem_2
= operands
[2];
11096 mem_3
= operands
[4];
11097 mem_4
= operands
[6];
11098 gcc_assert (code
== UNKNOWN
);
11101 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
11102 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
11104 /* Adjust offset thus it can fit in ldp/stp instruction. */
11105 msize
= GET_MODE_SIZE (mode
);
11106 stp_off_limit
= msize
* 0x40;
11107 off_val
= INTVAL (offset
);
11108 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
11109 new_off
= abs_off
% stp_off_limit
;
11110 adj_off
= abs_off
- new_off
;
11112 /* Further adjust to make sure all offsets are OK. */
11113 if ((new_off
+ msize
* 2) >= stp_off_limit
)
11115 adj_off
+= stp_off_limit
;
11116 new_off
-= stp_off_limit
;
11119 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11120 if (adj_off
>= 0x1000)
11125 adj_off
= -adj_off
;
11126 new_off
= -new_off
;
11129 /* Create new memory references. */
11130 mem_1
= change_address (mem_1
, VOIDmode
,
11131 plus_constant (DImode
, operands
[8], new_off
));
11133 /* Check if the adjusted address is OK for ldp/stp. */
11134 if (!aarch64_mem_pair_operand (mem_1
, mode
))
11137 msize
= GET_MODE_SIZE (mode
);
11138 mem_2
= change_address (mem_2
, VOIDmode
,
11139 plus_constant (DImode
,
11142 mem_3
= change_address (mem_3
, VOIDmode
,
11143 plus_constant (DImode
,
11145 new_off
+ msize
* 2));
11146 mem_4
= change_address (mem_4
, VOIDmode
,
11147 plus_constant (DImode
,
11149 new_off
+ msize
* 3));
11151 if (code
== ZERO_EXTEND
)
11153 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
11154 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
11155 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
11156 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
11158 else if (code
== SIGN_EXTEND
)
11160 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
11161 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
11162 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
11163 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
11168 operands
[1] = mem_1
;
11169 operands
[3] = mem_2
;
11170 operands
[5] = mem_3
;
11171 operands
[7] = mem_4
;
11175 operands
[0] = mem_1
;
11176 operands
[2] = mem_2
;
11177 operands
[4] = mem_3
;
11178 operands
[6] = mem_4
;
11181 /* Emit adjusting instruction. */
11182 emit_insn (gen_rtx_SET (VOIDmode
, operands
[8],
11183 plus_constant (DImode
, base
, adj_off
)));
11184 /* Emit ldp/stp instructions. */
11185 t1
= gen_rtx_SET (VOIDmode
, operands
[0], operands
[1]);
11186 t2
= gen_rtx_SET (VOIDmode
, operands
[2], operands
[3]);
11187 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
11188 t1
= gen_rtx_SET (VOIDmode
, operands
[4], operands
[5]);
11189 t2
= gen_rtx_SET (VOIDmode
, operands
[6], operands
[7]);
11190 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Target hook overrides for AArch64; each pair replaces the default
   from target-def.h before TARGET_INITIALIZER is expanded below.  */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook will determine whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11472 struct gcc_target targetm
= TARGET_INITIALIZER
;
11474 #include "gt-aarch64.h"