1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
25 #include "insn-codes.h"
27 #include "insn-attr.h"
31 #include "double-int.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
44 #include "dominance.h"
50 #include "cfgcleanup.h"
52 #include "basic-block.h"
54 #include "hard-reg-set.h"
59 #include "statistics.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
72 #include "target-def.h"
73 #include "targhooks.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
84 #include "gimple-expr.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
99 /* Defined for convenience. */
100 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
102 /* Classifies an address.
105 A simple base register plus immediate offset.
108 A base register indexed by immediate offset with writeback.
111 A base register indexed by (optionally scaled) register.
114 A base register indexed by (optionally scaled) zero-extended register.
117 A base register indexed by (optionally scaled) sign-extended register.
120 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 A constant symbolic address, in pc-relative literal pool. */
125 enum aarch64_address_type
{
135 struct aarch64_address_info
{
136 enum aarch64_address_type type
;
140 enum aarch64_symbol_type symbol_type
;
143 struct simd_immediate_info
152 /* The current code model. */
153 enum aarch64_code_model aarch64_cmodel
;
156 #undef TARGET_HAVE_TLS
157 #define TARGET_HAVE_TLS 1
160 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
161 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
163 machine_mode
*, int *,
165 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
166 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
167 static void aarch64_override_options_after_change (void);
168 static bool aarch64_vector_mode_supported_p (machine_mode
);
169 static unsigned bit_count (unsigned HOST_WIDE_INT
);
170 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
171 const unsigned char *sel
);
172 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
174 /* Major revision number of the ARM Architecture implemented by the target. */
175 unsigned aarch64_architecture_version
;
177 /* The processor for which instructions should be scheduled. */
178 enum aarch64_processor aarch64_tune
= cortexa53
;
180 /* The current tuning set. */
181 const struct tune_params
*aarch64_tune_params
;
183 /* Mask to specify which instructions we are allowed to generate. */
184 unsigned long aarch64_isa_flags
= 0;
186 /* Mask to specify which instruction scheduling options should be used. */
187 unsigned long aarch64_tune_flags
= 0;
189 /* Tuning parameters. */
191 static const struct cpu_addrcost_table generic_addrcost_table
=
201 0, /* register_offset */
202 0, /* register_extend */
206 static const struct cpu_addrcost_table cortexa57_addrcost_table
=
216 0, /* register_offset */
217 0, /* register_extend */
221 static const struct cpu_addrcost_table xgene1_addrcost_table
=
231 0, /* register_offset */
232 1, /* register_extend */
236 static const struct cpu_regmove_cost generic_regmove_cost
=
239 /* Avoid the use of slow int<->fp moves for spilling by setting
240 their cost higher than memmov_cost. */
246 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
249 /* Avoid the use of slow int<->fp moves for spilling by setting
250 their cost higher than memmov_cost. */
256 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
259 /* Avoid the use of slow int<->fp moves for spilling by setting
260 their cost higher than memmov_cost. */
266 static const struct cpu_regmove_cost thunderx_regmove_cost
=
274 static const struct cpu_regmove_cost xgene1_regmove_cost
=
277 /* Avoid the use of slow int<->fp moves for spilling by setting
278 their cost higher than memmov_cost. */
284 /* Generic costs for vector insn classes. */
285 static const struct cpu_vector_cost generic_vector_cost
=
287 1, /* scalar_stmt_cost */
288 1, /* scalar_load_cost */
289 1, /* scalar_store_cost */
290 1, /* vec_stmt_cost */
291 1, /* vec_to_scalar_cost */
292 1, /* scalar_to_vec_cost */
293 1, /* vec_align_load_cost */
294 1, /* vec_unalign_load_cost */
295 1, /* vec_unalign_store_cost */
296 1, /* vec_store_cost */
297 3, /* cond_taken_branch_cost */
298 1 /* cond_not_taken_branch_cost */
301 /* Generic costs for vector insn classes. */
302 static const struct cpu_vector_cost cortexa57_vector_cost
=
304 1, /* scalar_stmt_cost */
305 4, /* scalar_load_cost */
306 1, /* scalar_store_cost */
307 3, /* vec_stmt_cost */
308 8, /* vec_to_scalar_cost */
309 8, /* scalar_to_vec_cost */
310 5, /* vec_align_load_cost */
311 5, /* vec_unalign_load_cost */
312 1, /* vec_unalign_store_cost */
313 1, /* vec_store_cost */
314 1, /* cond_taken_branch_cost */
315 1 /* cond_not_taken_branch_cost */
318 /* Generic costs for vector insn classes. */
319 static const struct cpu_vector_cost xgene1_vector_cost
=
321 1, /* scalar_stmt_cost */
322 5, /* scalar_load_cost */
323 1, /* scalar_store_cost */
324 2, /* vec_stmt_cost */
325 4, /* vec_to_scalar_cost */
326 4, /* scalar_to_vec_cost */
327 10, /* vec_align_load_cost */
328 10, /* vec_unalign_load_cost */
329 2, /* vec_unalign_store_cost */
330 2, /* vec_store_cost */
331 2, /* cond_taken_branch_cost */
332 1 /* cond_not_taken_branch_cost */
335 #define AARCH64_FUSE_NOTHING (0)
336 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
337 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
338 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
339 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
340 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
342 static const struct tune_params generic_tunings
=
344 &cortexa57_extra_costs
,
345 &generic_addrcost_table
,
346 &generic_regmove_cost
,
347 &generic_vector_cost
,
350 AARCH64_FUSE_NOTHING
, /* fuseable_ops */
351 8, /* function_align. */
354 2, /* int_reassoc_width. */
355 4, /* fp_reassoc_width. */
356 1 /* vec_reassoc_width. */
359 static const struct tune_params cortexa53_tunings
=
361 &cortexa53_extra_costs
,
362 &generic_addrcost_table
,
363 &cortexa53_regmove_cost
,
364 &generic_vector_cost
,
367 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
368 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fuseable_ops */
369 8, /* function_align. */
372 2, /* int_reassoc_width. */
373 4, /* fp_reassoc_width. */
374 1 /* vec_reassoc_width. */
377 static const struct tune_params cortexa57_tunings
=
379 &cortexa57_extra_costs
,
380 &cortexa57_addrcost_table
,
381 &cortexa57_regmove_cost
,
382 &cortexa57_vector_cost
,
385 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
386 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
387 16, /* function_align. */
390 2, /* int_reassoc_width. */
391 4, /* fp_reassoc_width. */
392 1 /* vec_reassoc_width. */
395 static const struct tune_params thunderx_tunings
=
397 &thunderx_extra_costs
,
398 &generic_addrcost_table
,
399 &thunderx_regmove_cost
,
400 &generic_vector_cost
,
403 AARCH64_FUSE_CMP_BRANCH
, /* fuseable_ops */
404 8, /* function_align. */
407 2, /* int_reassoc_width. */
408 4, /* fp_reassoc_width. */
409 1 /* vec_reassoc_width. */
412 static const struct tune_params xgene1_tunings
=
415 &xgene1_addrcost_table
,
416 &xgene1_regmove_cost
,
420 AARCH64_FUSE_NOTHING
, /* fuseable_ops */
421 16, /* function_align. */
423 16, /* loop_align. */
424 2, /* int_reassoc_width. */
425 4, /* fp_reassoc_width. */
426 1 /* vec_reassoc_width. */
429 /* A processor implementing AArch64. */
432 const char *const name
;
433 enum aarch64_processor core
;
435 unsigned architecture_version
;
436 const unsigned long flags
;
437 const struct tune_params
*const tune
;
440 /* Processor cores implementing AArch64. */
441 static const struct processor all_cores
[] =
443 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
444 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
445 #include "aarch64-cores.def"
447 {"generic", cortexa53
, "8", 8, AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
448 {NULL
, aarch64_none
, NULL
, 0, 0, NULL
}
451 /* Architectures implementing AArch64. */
452 static const struct processor all_architectures
[] =
454 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
455 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
456 #include "aarch64-arches.def"
458 {NULL
, aarch64_none
, NULL
, 0, 0, NULL
}
461 /* Target specification. These are populated as commandline arguments
462 are processed, or NULL if not specified. */
463 static const struct processor
*selected_arch
;
464 static const struct processor
*selected_cpu
;
465 static const struct processor
*selected_tune
;
467 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
469 /* An ISA extension in the co-processor and main instruction set space. */
470 struct aarch64_option_extension
472 const char *const name
;
473 const unsigned long flags_on
;
474 const unsigned long flags_off
;
477 /* ISA extensions in AArch64. */
478 static const struct aarch64_option_extension all_extensions
[] =
480 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
481 {NAME, FLAGS_ON, FLAGS_OFF},
482 #include "aarch64-option-extensions.def"
483 #undef AARCH64_OPT_EXTENSION
487 /* Used to track the size of an address when generating a pre/post
488 increment address. */
489 static machine_mode aarch64_memory_reference_mode
;
491 /* A table of valid AArch64 "bitmask immediate" values for
492 logical instructions. */
494 #define AARCH64_NUM_BITMASKS 5334
495 static unsigned HOST_WIDE_INT aarch64_bitmasks
[AARCH64_NUM_BITMASKS
];
497 typedef enum aarch64_cond_code
499 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
500 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
501 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
505 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
507 /* The condition codes of the processor, and the inverse function. */
508 static const char * const aarch64_condition_codes
[] =
510 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
511 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED
)
521 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
522 enum machine_mode mode
)
524 if (VECTOR_MODE_P (mode
))
525 return aarch64_tune_params
->vec_reassoc_width
;
526 if (INTEGRAL_MODE_P (mode
))
527 return aarch64_tune_params
->int_reassoc_width
;
528 if (FLOAT_MODE_P (mode
))
529 return aarch64_tune_params
->fp_reassoc_width
;
533 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 aarch64_dbx_register_number (unsigned regno
)
537 if (GP_REGNUM_P (regno
))
538 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
539 else if (regno
== SP_REGNUM
)
540 return AARCH64_DWARF_SP
;
541 else if (FP_REGNUM_P (regno
))
542 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
544 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
545 equivalent DWARF register. */
546 return DWARF_FRAME_REGISTERS
;
549 /* Return TRUE if MODE is any of the large INT modes. */
551 aarch64_vect_struct_mode_p (machine_mode mode
)
553 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
556 /* Return TRUE if MODE is any of the vector modes. */
558 aarch64_vector_mode_p (machine_mode mode
)
560 return aarch64_vector_mode_supported_p (mode
)
561 || aarch64_vect_struct_mode_p (mode
);
564 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 aarch64_array_mode_supported_p (machine_mode mode
,
567 unsigned HOST_WIDE_INT nelems
)
570 && AARCH64_VALID_SIMD_QREG_MODE (mode
)
571 && (nelems
>= 2 && nelems
<= 4))
577 /* Implement HARD_REGNO_NREGS. */
580 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
582 switch (aarch64_regno_regclass (regno
))
586 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
588 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
593 /* Implement HARD_REGNO_MODE_OK. */
596 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
598 if (GET_MODE_CLASS (mode
) == MODE_CC
)
599 return regno
== CC_REGNUM
;
601 if (regno
== SP_REGNUM
)
602 /* The purpose of comparing with ptr_mode is to support the
603 global register variable associated with the stack pointer
604 register via the syntax of asm ("wsp") in ILP32. */
605 return mode
== Pmode
|| mode
== ptr_mode
;
607 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
608 return mode
== Pmode
;
610 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
613 if (FP_REGNUM_P (regno
))
615 if (aarch64_vect_struct_mode_p (mode
))
617 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
625 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
630 /* Handle modes that fit within single registers. */
631 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
633 if (GET_MODE_SIZE (mode
) >= 4)
638 /* Fall back to generic for multi-reg and very large modes. */
640 return choose_hard_reg_mode (regno
, nregs
, false);
643 /* Return true if calls to DECL should be treated as
644 long-calls (ie called via a register). */
646 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
651 /* Return true if calls to symbol-ref SYM should be treated as
652 long-calls (ie called via a register). */
654 aarch64_is_long_call_p (rtx sym
)
656 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
659 /* Return true if the offsets to a zero/sign-extract operation
660 represent an expression that matches an extend operation. The
661 operands represent the paramters from
663 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
665 aarch64_is_extend_from_extract (machine_mode mode
, rtx mult_imm
,
668 HOST_WIDE_INT mult_val
, extract_val
;
670 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
673 mult_val
= INTVAL (mult_imm
);
674 extract_val
= INTVAL (extract_imm
);
677 && extract_val
< GET_MODE_BITSIZE (mode
)
678 && exact_log2 (extract_val
& ~7) > 0
679 && (extract_val
& 7) <= 4
680 && mult_val
== (1 << (extract_val
& 7)))
686 /* Emit an insn that's a simple single-set. Both the operands must be
687 known to be valid. */
689 emit_set_insn (rtx x
, rtx y
)
691 return emit_insn (gen_rtx_SET (VOIDmode
, x
, y
));
694 /* X and Y are two things to compare using CODE. Emit the compare insn and
695 return the rtx for register 0 in the proper mode. */
697 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
699 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
700 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
702 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
706 /* Build the SYMBOL_REF for __tls_get_addr. */
708 static GTY(()) rtx tls_get_addr_libfunc
;
711 aarch64_tls_get_addr (void)
713 if (!tls_get_addr_libfunc
)
714 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
715 return tls_get_addr_libfunc
;
718 /* Return the TLS model to use for ADDR. */
720 static enum tls_model
721 tls_symbolic_operand_type (rtx addr
)
723 enum tls_model tls_kind
= TLS_MODEL_NONE
;
726 if (GET_CODE (addr
) == CONST
)
728 split_const (addr
, &sym
, &addend
);
729 if (GET_CODE (sym
) == SYMBOL_REF
)
730 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
732 else if (GET_CODE (addr
) == SYMBOL_REF
)
733 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
738 /* We'll allow lo_sum's in addresses in our legitimate addresses
739 so that combine would take care of combining addresses where
740 necessary, but for generation purposes, we'll generate the address
743 tmp = hi (symbol_ref); adrp x1, foo
744 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
748 adrp x1, :got:foo adrp tmp, :tlsgd:foo
749 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
753 Load TLS symbol, depending on TLS mechanism and TLS access model.
755 Global Dynamic - Traditional TLS:
757 add dest, tmp, #:tlsgd_lo12:imm
760 Global Dynamic - TLS Descriptors:
761 adrp dest, :tlsdesc:imm
762 ldr tmp, [dest, #:tlsdesc_lo12:imm]
763 add dest, dest, #:tlsdesc_lo12:imm
770 adrp tmp, :gottprel:imm
771 ldr dest, [tmp, #:gottprel_lo12:imm]
776 add t0, tp, #:tprel_hi12:imm, lsl #12
777 add t0, t0, #:tprel_lo12_nc:imm
781 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
782 enum aarch64_symbol_type type
)
786 case SYMBOL_SMALL_ABSOLUTE
:
788 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 machine_mode mode
= GET_MODE (dest
);
792 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
794 if (can_create_pseudo_p ())
795 tmp_reg
= gen_reg_rtx (mode
);
797 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
798 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
802 case SYMBOL_TINY_ABSOLUTE
:
803 emit_insn (gen_rtx_SET (Pmode
, dest
, imm
));
806 case SYMBOL_SMALL_GOT
:
808 /* In ILP32, the mode of dest can be either SImode or DImode,
809 while the got entry is always of SImode size. The mode of
810 dest depends on how dest is used: if dest is assigned to a
811 pointer (e.g. in the memory), it has SImode; it may have
812 DImode if dest is dereferenced to access the memeory.
813 This is why we have to handle three different ldr_got_small
814 patterns here (two patterns for ILP32). */
816 machine_mode mode
= GET_MODE (dest
);
818 if (can_create_pseudo_p ())
819 tmp_reg
= gen_reg_rtx (mode
);
821 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
822 if (mode
== ptr_mode
)
825 emit_insn (gen_ldr_got_small_di (dest
, tmp_reg
, imm
));
827 emit_insn (gen_ldr_got_small_si (dest
, tmp_reg
, imm
));
831 gcc_assert (mode
== Pmode
);
832 emit_insn (gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
));
838 case SYMBOL_SMALL_TLSGD
:
841 rtx result
= gen_rtx_REG (Pmode
, R0_REGNUM
);
844 aarch64_emit_call_insn (gen_tlsgd_small (result
, imm
));
845 insns
= get_insns ();
848 RTL_CONST_CALL_P (insns
) = 1;
849 emit_libcall_block (insns
, dest
, result
, imm
);
853 case SYMBOL_SMALL_TLSDESC
:
855 machine_mode mode
= GET_MODE (dest
);
856 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
859 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
861 /* In ILP32, the got entry is always of SImode size. Unlike
862 small GOT, the dest is fixed at reg 0. */
864 emit_insn (gen_tlsdesc_small_si (imm
));
866 emit_insn (gen_tlsdesc_small_di (imm
));
867 tp
= aarch64_load_tp (NULL
);
870 tp
= gen_lowpart (mode
, tp
);
872 emit_insn (gen_rtx_SET (mode
, dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
873 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
877 case SYMBOL_SMALL_GOTTPREL
:
879 /* In ILP32, the mode of dest can be either SImode or DImode,
880 while the got entry is always of SImode size. The mode of
881 dest depends on how dest is used: if dest is assigned to a
882 pointer (e.g. in the memory), it has SImode; it may have
883 DImode if dest is dereferenced to access the memeory.
884 This is why we have to handle three different tlsie_small
885 patterns here (two patterns for ILP32). */
886 machine_mode mode
= GET_MODE (dest
);
887 rtx tmp_reg
= gen_reg_rtx (mode
);
888 rtx tp
= aarch64_load_tp (NULL
);
890 if (mode
== ptr_mode
)
893 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
896 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
897 tp
= gen_lowpart (mode
, tp
);
902 gcc_assert (mode
== Pmode
);
903 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
906 emit_insn (gen_rtx_SET (mode
, dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
907 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
911 case SYMBOL_SMALL_TPREL
:
913 rtx tp
= aarch64_load_tp (NULL
);
915 if (GET_MODE (dest
) != Pmode
)
916 tp
= gen_lowpart (GET_MODE (dest
), tp
);
918 emit_insn (gen_tlsle_small (dest
, tp
, imm
));
919 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
923 case SYMBOL_TINY_GOT
:
924 emit_insn (gen_ldr_got_tiny (dest
, imm
));
932 /* Emit a move from SRC to DEST. Assume that the move expanders can
933 handle all moves if !can_create_pseudo_p (). The distinction is
934 important because, unlike emit_move_insn, the move expanders know
935 how to force Pmode objects into the constant pool even when the
936 constant pool address is not itself legitimate. */
938 aarch64_emit_move (rtx dest
, rtx src
)
940 return (can_create_pseudo_p ()
941 ? emit_move_insn (dest
, src
)
942 : emit_move_insn_1 (dest
, src
));
945 /* Split a 128-bit move operation into two 64-bit move operations,
946 taking care to handle partial overlap of register to register
947 copies. Special cases are needed when moving between GP regs and
948 FP regs. SRC can be a register, constant or memory; DST a register
949 or memory. If either operand is memory it must not have any side
952 aarch64_split_128bit_move (rtx dst
, rtx src
)
957 machine_mode mode
= GET_MODE (dst
);
959 gcc_assert (mode
== TImode
|| mode
== TFmode
);
960 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
961 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
963 if (REG_P (dst
) && REG_P (src
))
965 int src_regno
= REGNO (src
);
966 int dst_regno
= REGNO (dst
);
968 /* Handle FP <-> GP regs. */
969 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
971 src_lo
= gen_lowpart (word_mode
, src
);
972 src_hi
= gen_highpart (word_mode
, src
);
976 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
977 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
981 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
982 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
986 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
988 dst_lo
= gen_lowpart (word_mode
, dst
);
989 dst_hi
= gen_highpart (word_mode
, dst
);
993 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
994 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
998 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
999 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1005 dst_lo
= gen_lowpart (word_mode
, dst
);
1006 dst_hi
= gen_highpart (word_mode
, dst
);
1007 src_lo
= gen_lowpart (word_mode
, src
);
1008 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1010 /* At most one pairing may overlap. */
1011 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1013 aarch64_emit_move (dst_hi
, src_hi
);
1014 aarch64_emit_move (dst_lo
, src_lo
);
1018 aarch64_emit_move (dst_lo
, src_lo
);
1019 aarch64_emit_move (dst_hi
, src_hi
);
1024 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1026 return (! REG_P (src
)
1027 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1030 /* Split a complex SIMD combine. */
1033 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1035 machine_mode src_mode
= GET_MODE (src1
);
1036 machine_mode dst_mode
= GET_MODE (dst
);
1038 gcc_assert (VECTOR_MODE_P (dst_mode
));
1040 if (REG_P (dst
) && REG_P (src1
) && REG_P (src2
))
1042 rtx (*gen
) (rtx
, rtx
, rtx
);
1047 gen
= gen_aarch64_simd_combinev8qi
;
1050 gen
= gen_aarch64_simd_combinev4hi
;
1053 gen
= gen_aarch64_simd_combinev2si
;
1056 gen
= gen_aarch64_simd_combinev2sf
;
1059 gen
= gen_aarch64_simd_combinedi
;
1062 gen
= gen_aarch64_simd_combinedf
;
1068 emit_insn (gen (dst
, src1
, src2
));
1073 /* Split a complex SIMD move. */
1076 aarch64_split_simd_move (rtx dst
, rtx src
)
1078 machine_mode src_mode
= GET_MODE (src
);
1079 machine_mode dst_mode
= GET_MODE (dst
);
1081 gcc_assert (VECTOR_MODE_P (dst_mode
));
1083 if (REG_P (dst
) && REG_P (src
))
1085 rtx (*gen
) (rtx
, rtx
);
1087 gcc_assert (VECTOR_MODE_P (src_mode
));
1092 gen
= gen_aarch64_split_simd_movv16qi
;
1095 gen
= gen_aarch64_split_simd_movv8hi
;
1098 gen
= gen_aarch64_split_simd_movv4si
;
1101 gen
= gen_aarch64_split_simd_movv2di
;
1104 gen
= gen_aarch64_split_simd_movv4sf
;
1107 gen
= gen_aarch64_split_simd_movv2df
;
1113 emit_insn (gen (dst
, src
));
1119 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1121 if (can_create_pseudo_p ())
1122 return force_reg (mode
, value
);
1125 x
= aarch64_emit_move (x
, value
);
1132 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1134 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1137 /* Load the full offset into a register. This
1138 might be improvable in the future. */
1139 high
= GEN_INT (offset
);
1141 high
= aarch64_force_temporary (mode
, temp
, high
);
1142 reg
= aarch64_force_temporary (mode
, temp
,
1143 gen_rtx_PLUS (mode
, high
, reg
));
1145 return plus_constant (mode
, reg
, offset
);
1149 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1152 unsigned HOST_WIDE_INT mask
;
1155 unsigned HOST_WIDE_INT val
;
1158 int one_match
, zero_match
, first_not_ffff_match
;
1161 if (CONST_INT_P (imm
) && aarch64_move_imm (INTVAL (imm
), mode
))
1164 emit_insn (gen_rtx_SET (VOIDmode
, dest
, imm
));
1171 /* We know we can't do this in 1 insn, and we must be able to do it
1172 in two; so don't mess around looking for sequences that don't buy
1176 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1177 GEN_INT (INTVAL (imm
) & 0xffff)));
1178 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1179 GEN_INT ((INTVAL (imm
) >> 16) & 0xffff)));
1185 /* Remaining cases are all for DImode. */
1188 subtargets
= optimize
&& can_create_pseudo_p ();
1193 first_not_ffff_match
= -1;
1195 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1197 if ((val
& mask
) == mask
)
1201 if (first_not_ffff_match
< 0)
1202 first_not_ffff_match
= i
;
1203 if ((val
& mask
) == 0)
1210 /* Set one of the quarters and then insert back into result. */
1211 mask
= 0xffffll
<< first_not_ffff_match
;
1214 emit_insn (gen_rtx_SET (VOIDmode
, dest
, GEN_INT (val
| mask
)));
1215 emit_insn (gen_insv_immdi (dest
, GEN_INT (first_not_ffff_match
),
1216 GEN_INT ((val
>> first_not_ffff_match
)
1223 if (zero_match
== 2)
1224 goto simple_sequence
;
1226 mask
= 0x0ffff0000UL
;
1227 for (i
= 16; i
< 64; i
+= 16, mask
<<= 16)
1229 HOST_WIDE_INT comp
= mask
& ~(mask
- 1);
1231 if (aarch64_uimm12_shift (val
- (val
& mask
)))
1235 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1236 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1237 GEN_INT (val
& mask
)));
1238 emit_insn (gen_adddi3 (dest
, subtarget
,
1239 GEN_INT (val
- (val
& mask
))));
1244 else if (aarch64_uimm12_shift (-(val
- ((val
+ comp
) & mask
))))
1248 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1249 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1250 GEN_INT ((val
+ comp
) & mask
)));
1251 emit_insn (gen_adddi3 (dest
, subtarget
,
1252 GEN_INT (val
- ((val
+ comp
) & mask
))));
1257 else if (aarch64_uimm12_shift (val
- ((val
- comp
) | ~mask
)))
1261 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1262 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1263 GEN_INT ((val
- comp
) | ~mask
)));
1264 emit_insn (gen_adddi3 (dest
, subtarget
,
1265 GEN_INT (val
- ((val
- comp
) | ~mask
))));
1270 else if (aarch64_uimm12_shift (-(val
- (val
| ~mask
))))
1274 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1275 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1276 GEN_INT (val
| ~mask
)));
1277 emit_insn (gen_adddi3 (dest
, subtarget
,
1278 GEN_INT (val
- (val
| ~mask
))));
1285 /* See if we can do it by arithmetically combining two
1287 for (i
= 0; i
< AARCH64_NUM_BITMASKS
; i
++)
1292 if (aarch64_uimm12_shift (val
- aarch64_bitmasks
[i
])
1293 || aarch64_uimm12_shift (-val
+ aarch64_bitmasks
[i
]))
1297 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1298 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1299 GEN_INT (aarch64_bitmasks
[i
])));
1300 emit_insn (gen_adddi3 (dest
, subtarget
,
1301 GEN_INT (val
- aarch64_bitmasks
[i
])));
1307 for (j
= 0; j
< 64; j
+= 16, mask
<<= 16)
1309 if ((aarch64_bitmasks
[i
] & ~mask
) == (val
& ~mask
))
1313 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1314 GEN_INT (aarch64_bitmasks
[i
])));
1315 emit_insn (gen_insv_immdi (dest
, GEN_INT (j
),
1316 GEN_INT ((val
>> j
) & 0xffff)));
1324 /* See if we can do it by logically combining two immediates. */
1325 for (i
= 0; i
< AARCH64_NUM_BITMASKS
; i
++)
1327 if ((aarch64_bitmasks
[i
] & val
) == aarch64_bitmasks
[i
])
1331 for (j
= i
+ 1; j
< AARCH64_NUM_BITMASKS
; j
++)
1332 if (val
== (aarch64_bitmasks
[i
] | aarch64_bitmasks
[j
]))
1336 subtarget
= subtargets
? gen_reg_rtx (mode
) : dest
;
1337 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1338 GEN_INT (aarch64_bitmasks
[i
])));
1339 emit_insn (gen_iordi3 (dest
, subtarget
,
1340 GEN_INT (aarch64_bitmasks
[j
])));
1346 else if ((val
& aarch64_bitmasks
[i
]) == val
)
1350 for (j
= i
+ 1; j
< AARCH64_NUM_BITMASKS
; j
++)
1351 if (val
== (aarch64_bitmasks
[j
] & aarch64_bitmasks
[i
]))
1355 subtarget
= subtargets
? gen_reg_rtx (mode
) : dest
;
1356 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1357 GEN_INT (aarch64_bitmasks
[j
])));
1358 emit_insn (gen_anddi3 (dest
, subtarget
,
1359 GEN_INT (aarch64_bitmasks
[i
])));
1367 if (one_match
> zero_match
)
1369 /* Set either first three quarters or all but the third. */
1370 mask
= 0xffffll
<< (16 - first_not_ffff_match
);
1372 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1373 GEN_INT (val
| mask
| 0xffffffff00000000ull
)));
1376 /* Now insert other two quarters. */
1377 for (i
= first_not_ffff_match
+ 16, mask
<<= (first_not_ffff_match
<< 1);
1378 i
< 64; i
+= 16, mask
<<= 16)
1380 if ((val
& mask
) != mask
)
1383 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1384 GEN_INT ((val
>> i
) & 0xffff)));
1394 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1396 if ((val
& mask
) != 0)
1401 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1402 GEN_INT (val
& mask
)));
1409 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1410 GEN_INT ((val
>> i
) & 0xffff)));
1421 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1423 machine_mode mode
= GET_MODE (dest
);
1425 gcc_assert (mode
== SImode
|| mode
== DImode
);
1427 /* Check on what type of symbol it is. */
1428 if (GET_CODE (imm
) == SYMBOL_REF
1429 || GET_CODE (imm
) == LABEL_REF
1430 || GET_CODE (imm
) == CONST
)
1432 rtx mem
, base
, offset
;
1433 enum aarch64_symbol_type sty
;
1435 /* If we have (const (plus symbol offset)), separate out the offset
1436 before we start classifying the symbol. */
1437 split_const (imm
, &base
, &offset
);
1439 sty
= aarch64_classify_symbol (base
, offset
, SYMBOL_CONTEXT_ADR
);
1442 case SYMBOL_FORCE_TO_MEM
:
1443 if (offset
!= const0_rtx
1444 && targetm
.cannot_force_const_mem (mode
, imm
))
1446 gcc_assert (can_create_pseudo_p ());
1447 base
= aarch64_force_temporary (mode
, dest
, base
);
1448 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1449 aarch64_emit_move (dest
, base
);
1452 mem
= force_const_mem (ptr_mode
, imm
);
1454 if (mode
!= ptr_mode
)
1455 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
1456 emit_insn (gen_rtx_SET (VOIDmode
, dest
, mem
));
1459 case SYMBOL_SMALL_TLSGD
:
1460 case SYMBOL_SMALL_TLSDESC
:
1461 case SYMBOL_SMALL_GOTTPREL
:
1462 case SYMBOL_SMALL_GOT
:
1463 case SYMBOL_TINY_GOT
:
1464 if (offset
!= const0_rtx
)
1466 gcc_assert(can_create_pseudo_p ());
1467 base
= aarch64_force_temporary (mode
, dest
, base
);
1468 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1469 aarch64_emit_move (dest
, base
);
1474 case SYMBOL_SMALL_TPREL
:
1475 case SYMBOL_SMALL_ABSOLUTE
:
1476 case SYMBOL_TINY_ABSOLUTE
:
1477 aarch64_load_symref_appropriately (dest
, imm
, sty
);
1485 if (!CONST_INT_P (imm
))
1487 if (GET_CODE (imm
) == HIGH
)
1488 emit_insn (gen_rtx_SET (VOIDmode
, dest
, imm
));
1491 rtx mem
= force_const_mem (mode
, imm
);
1493 emit_insn (gen_rtx_SET (VOIDmode
, dest
, mem
));
1499 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  Both parameters are unused
   and the visible comment says sibling calls are always permitted.
   NOTE(review): lossy extraction — return statement and braces are
   missing from this view; code below kept byte-identical.  */
1503 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
1504 tree exp ATTRIBUTE_UNUSED
)
1506 /* Currently, always true. */
1510 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
1516 bool named ATTRIBUTE_UNUSED
)
1519 machine_mode dummymode
;
1522 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1523 size
= (mode
== BLKmode
&& type
)
1524 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
1526 /* Aggregates are passed by reference based on their size. */
1527 if (type
&& AGGREGATE_TYPE_P (type
))
1529 size
= int_size_in_bytes (type
);
1532 /* Variable sized arguments are always returned by reference. */
1536 /* Can this be a candidate to be passed in fp/simd register(s)? */
1537 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
1542 /* Arguments which are variable sized or larger than 2 registers are
1543 passed by reference unless they are a homogenous floating point
1545 return size
> 2 * UNITS_PER_WORD
;
1548 /* Return TRUE if VALTYPE is padded to its least significant bits. */
/* Decide whether a value of type VALTYPE is returned padded into the
   most-significant bits of its register.  Visible logic: bails out on
   little-endian targets, then requires a composite type of size
   (0, 16] bytes, and excludes HFA/HVA candidates (which always live in
   the least significant bits of fp/simd registers).
   NOTE(review): lossy extraction — return statements and braces are
   missing from this view; code below kept byte-identical.  */
1550 aarch64_return_in_msb (const_tree valtype
)
1552 machine_mode dummy_mode
;
1555 /* Never happens in little-endian mode. */
1556 if (!BYTES_BIG_ENDIAN
)
1559 /* Only composite types smaller than or equal to 16 bytes can
1560 be potentially returned in registers. */
1561 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
1562 || int_size_in_bytes (valtype
) <= 0
1563 || int_size_in_bytes (valtype
) > 16)
1566 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1567 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1568 is always passed/returned in the least significant bits of fp/simd
1570 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
1571 &dummy_mode
, &dummy_int
, NULL
))
1577 /* Implement TARGET_FUNCTION_VALUE.
1578 Define how to find the value returned by a function. */
1581 aarch64_function_value (const_tree type
, const_tree func
,
1582 bool outgoing ATTRIBUTE_UNUSED
)
1587 machine_mode ag_mode
;
1589 mode
= TYPE_MODE (type
);
1590 if (INTEGRAL_TYPE_P (type
))
1591 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
1593 if (aarch64_return_in_msb (type
))
1595 HOST_WIDE_INT size
= int_size_in_bytes (type
);
1597 if (size
% UNITS_PER_WORD
!= 0)
1599 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
1600 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
1604 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
1605 &ag_mode
, &count
, NULL
))
1607 if (!aarch64_composite_type_p (type
, mode
))
1609 gcc_assert (count
== 1 && mode
== ag_mode
);
1610 return gen_rtx_REG (mode
, V0_REGNUM
);
1617 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
1618 for (i
= 0; i
< count
; i
++)
1620 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
1621 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
1622 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
1623 XVECEXP (par
, 0, i
) = tmp
;
1629 return gen_rtx_REG (mode
, R0_REGNUM
);
1632 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1633 Return true if REGNO is the number of a hard register in which the values
1634 of called function may come back. */
/* Implement TARGET_FUNCTION_VALUE_REGNO_P: REGNO may carry a return
   value if it is R0/R1 (up to 16 bytes in general registers) or one of
   the first HA_MAX_NUM_FLDS fp/simd registers — the latter only when
   fp/simd registers are available (!TARGET_GENERAL_REGS_ONLY).
   NOTE(review): lossy extraction — some return statements/braces are
   missing from this view; code below kept byte-identical.  */
1637 aarch64_function_value_regno_p (const unsigned int regno
)
1639 /* Maximum of 16 bytes can be returned in the general registers. Examples
1640 of 16-byte return values are: 128-bit integers and 16-byte small
1641 structures (excluding homogeneous floating-point aggregates). */
1642 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
1645 /* Up to four fp/simd registers can return a function value, e.g. a
1646 homogeneous floating-point aggregate having four members. */
1647 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
1648 return !TARGET_GENERAL_REGS_ONLY
;
1653 /* Implement TARGET_RETURN_IN_MEMORY.
1655 If the type T of the result of a function is such that
1657 would require that arg be passed as a value in a register (or set of
1658 registers) according to the parameter passing rules, then the result
1659 is returned in the same registers as would be used for such an
/* Implement TARGET_RETURN_IN_MEMORY.  Visible logic: plain scalars
   (non-aggregate, non-complex, non-vector) are returned in registers;
   after an fp/simd candidate check, anything whose size is unknown
   (< 0) or larger than two machine words goes in memory.
   NOTE(review): lossy extraction — branch bodies and braces are
   missing from this view; code below kept byte-identical.  */
1663 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
1666 machine_mode ag_mode
;
1669 if (!AGGREGATE_TYPE_P (type
)
1670 && TREE_CODE (type
) != COMPLEX_TYPE
1671 && TREE_CODE (type
) != VECTOR_TYPE
)
1672 /* Simple scalar types always returned in registers. */
1675 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
1682 /* Types larger than 2 registers returned in memory. */
1683 size
= int_size_in_bytes (type
);
1684 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
/* Thin wrapper: query whether an argument of MODE/TYPE can be passed
   in fp/simd registers, caching the element mode into
   pcum->aapcs_vfp_rmode.  NREGS presumably receives the register
   count — TODO confirm (the forwarding call's trailing arguments are
   lost in this extraction).  Code below kept byte-identical.  */
1688 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
1689 const_tree type
, int *nregs
)
1691 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1692 return aarch64_vfp_is_call_or_return_candidate (mode
,
1694 &pcum
->aapcs_vfp_rmode
,
1699 /* Given MODE and TYPE of a function argument, return the alignment in
1700 bits. The idea is to suppress any stronger alignment requested by
1701 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1702 This is a helper function for local use only. */
/* Compute the AAPCS64 natural alignment (in bits) of an argument of
   MODE and TYPE (see the rule comment above this function in the
   original file).  Visible logic: use TYPE_ALIGN when the type's mode
   matches MODE, otherwise fall back to GET_MODE_ALIGNMENT.
   NOTE(review): lossy extraction — braces/else lines and the return
   statement are missing from this view; code kept byte-identical.  */
1705 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
1707 unsigned int alignment
;
1711 if (!integer_zerop (TYPE_SIZE (type
)))
1713 if (TYPE_MODE (type
) == mode
)
1714 alignment
= TYPE_ALIGN (type
);
1716 alignment
= GET_MODE_ALIGNMENT (mode
);
1722 alignment
= GET_MODE_ALIGNMENT (mode
);
1727 /* Layout a function argument according to the AAPCS64 rules. The rule
1728 numbers refer to the rule numbers in the AAPCS64. */
1731 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
1733 bool named ATTRIBUTE_UNUSED
)
1735 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1736 int ncrn
, nvrn
, nregs
;
1737 bool allocate_ncrn
, allocate_nvrn
;
1740 /* We need to do this once per argument. */
1741 if (pcum
->aapcs_arg_processed
)
1744 pcum
->aapcs_arg_processed
= true;
1746 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1748 = AARCH64_ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
1751 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
1752 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
1757 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1758 The following code thus handles passing by SIMD/FP registers first. */
1760 nvrn
= pcum
->aapcs_nvrn
;
1762 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1763 and homogenous short-vector aggregates (HVA). */
1766 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
1768 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
1769 if (!aarch64_composite_type_p (type
, mode
))
1771 gcc_assert (nregs
== 1);
1772 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
1778 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
1779 for (i
= 0; i
< nregs
; i
++)
1781 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
1782 V0_REGNUM
+ nvrn
+ i
);
1783 tmp
= gen_rtx_EXPR_LIST
1785 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
1786 XVECEXP (par
, 0, i
) = tmp
;
1788 pcum
->aapcs_reg
= par
;
1794 /* C.3 NSRN is set to 8. */
1795 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
1800 ncrn
= pcum
->aapcs_ncrn
;
1801 nregs
= size
/ UNITS_PER_WORD
;
1803 /* C6 - C9. though the sign and zero extension semantics are
1804 handled elsewhere. This is the case where the argument fits
1805 entirely general registers. */
1806 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
1808 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
1810 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
1812 /* C.8 if the argument has an alignment of 16 then the NGRN is
1813 rounded up to the next even number. */
1814 if (nregs
== 2 && alignment
== 16 * BITS_PER_UNIT
&& ncrn
% 2)
1817 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
1819 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1820 A reg is still generated for it, but the caller should be smart
1821 enough not to use it. */
1822 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
1824 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
1831 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
1832 for (i
= 0; i
< nregs
; i
++)
1834 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
1835 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
1836 GEN_INT (i
* UNITS_PER_WORD
));
1837 XVECEXP (par
, 0, i
) = tmp
;
1839 pcum
->aapcs_reg
= par
;
1842 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
1847 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
1849 /* The argument is passed on stack; record the needed number of words for
1850 this argument and align the total size if necessary. */
1852 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
1853 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
1854 pcum
->aapcs_stack_size
= AARCH64_ROUND_UP (pcum
->aapcs_stack_size
,
1855 16 / UNITS_PER_WORD
);
1859 /* Implement TARGET_FUNCTION_ARG. */
/* Implement TARGET_FUNCTION_ARG: lay out the next argument (delegated
   to aarch64_layout_arg) and return the rtx it was assigned, or
   NULL_RTX if it goes on the stack (aapcs_reg).  Asserts the PCS
   variant is base AAPCS64; the VOIDmode early-out's body is lost in
   this extraction.  Code below kept byte-identical.  */
1862 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
1863 const_tree type
, bool named
)
1865 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1866 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
1868 if (mode
== VOIDmode
)
1871 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
1872 return pcum
->aapcs_reg
;
/* Initialize PCUM for a fresh argument scan: zero the current and
   next general/fp register counters, select the AAPCS64 PCS variant,
   and clear the per-argument state (reg, processed flag, stack word
   counts).  FNTYPE/LIBNAME/FNDECL/N_NAMED are unused here.
   NOTE(review): lossy extraction — braces are missing from this view;
   code below kept byte-identical.  */
1876 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
1877 const_tree fntype ATTRIBUTE_UNUSED
,
1878 rtx libname ATTRIBUTE_UNUSED
,
1879 const_tree fndecl ATTRIBUTE_UNUSED
,
1880 unsigned n_named ATTRIBUTE_UNUSED
)
1882 pcum
->aapcs_ncrn
= 0;
1883 pcum
->aapcs_nvrn
= 0;
1884 pcum
->aapcs_nextncrn
= 0;
1885 pcum
->aapcs_nextnvrn
= 0;
1886 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
1887 pcum
->aapcs_reg
= NULL_RTX
;
1888 pcum
->aapcs_arg_processed
= false;
1889 pcum
->aapcs_stack_words
= 0;
1890 pcum
->aapcs_stack_size
= 0;
/* Implement TARGET_FUNCTION_ARG_ADVANCE: commit the layout of the
   current argument and move on.  The assert checks that the argument
   got exactly one home — a register (aapcs_reg) XOR stack words.
   Then the "next" register counters become current, pending stack
   words are folded into aapcs_stack_size, and per-argument state is
   reset for the following argument.
   NOTE(review): lossy extraction — the parameter tail on original
   lines 1897-1900 and the braces are missing from this view; code
   below kept byte-identical.  */
1896 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
1901 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1902 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
1904 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
1905 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
1906 != (pcum
->aapcs_stack_words
!= 0));
1907 pcum
->aapcs_arg_processed
= false;
1908 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
1909 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
1910 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
1911 pcum
->aapcs_stack_words
= 0;
1912 pcum
->aapcs_reg
= NULL_RTX
;
/* Return true if REGNO is used for passing arguments: a general
   register below R0 + NUM_ARG_REGS or an fp/simd register below
   V0 + NUM_FP_ARG_REGS.  Code below kept byte-identical (lossy
   extraction; braces missing from this view).  */
1917 aarch64_function_arg_regno_p (unsigned regno
)
1919 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
1920 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
1923 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1924 PARM_BOUNDARY bits of alignment, but will be given anything up
1925 to STACK_BOUNDARY bits if the type requires it. This makes sure
1926 that both before and after the layout of each argument, the Next
1927 Stacked Argument Address (NSAA) will have a minimum alignment of
/* Implement FUNCTION_ARG_BOUNDARY (see the comment block above):
   clamp the argument's natural alignment into
   [PARM_BOUNDARY, STACK_BOUNDARY] bits.
   NOTE(review): lossy extraction — the final `return alignment;` and
   braces are missing from this view; code kept byte-identical.  */
1931 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
1933 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
1935 if (alignment
< PARM_BOUNDARY
)
1936 alignment
= PARM_BOUNDARY
;
1937 if (alignment
> STACK_BOUNDARY
)
1938 alignment
= STACK_BOUNDARY
;
1942 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1944 Return true if an argument passed on the stack should be padded upwards,
1945 i.e. if the least-significant byte of the stack slot has useful data.
1947 Small aggregate types are placed in the lowest memory address.
1949 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
/* Decide stack-argument padding direction (see the rule comment
   above): little-endian always pads upward; big-endian pads
   integral/float/pointer data downward and everything else upward.
   NOTE(review): lossy extraction — the return statements and the head
   of the conditional expression are missing from this view; code
   below kept byte-identical.  */
1952 aarch64_pad_arg_upward (machine_mode mode
, const_tree type
)
1954 /* On little-endian targets, the least significant byte of every stack
1955 argument is passed at the lowest byte address of the stack slot. */
1956 if (!BYTES_BIG_ENDIAN
)
1959 /* Otherwise, integral, floating-point and pointer types are padded downward:
1960 the least significant byte of a stack argument is passed at the highest
1961 byte address of the stack slot. */
1963 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
1964 || POINTER_TYPE_P (type
))
1965 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
1968 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1972 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1974 It specifies padding for the last (may also be the only)
1975 element of a block move between registers and memory. If
1976 assuming the block is in the memory, padding upward means that
1977 the last element is padded after its highest significant byte,
1978 while in downward padding, the last element is padded at the
1979 its least significant byte side.
1981 Small aggregates and small complex types are always padded
1984 We don't need to worry about homogeneous floating-point or
1985 short-vector aggregates; their move is not affected by the
1986 padding direction determined here. Regardless of endianness,
1987 each element of such an aggregate is put in the least
1988 significant bits of a fp/simd register.
1990 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1991 register has useful data, and return the opposite if the most
1992 significant byte does. */
/* Decide register padding direction for BLOCK_REG_PADDING (contract
   described in the comment block above): on big-endian, small
   composites (< 2 words) are padded upward; otherwise the default
   !BYTES_BIG_ENDIAN direction applies.
   NOTE(review): lossy extraction — braces and the upward-return
   inside the big-endian branch are missing from this view; code
   below kept byte-identical.  */
1995 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
1996 bool first ATTRIBUTE_UNUSED
)
1999 /* Small composite types are always padded upward. */
2000 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
2002 HOST_WIDE_INT size
= (type
? int_size_in_bytes (type
)
2003 : GET_MODE_SIZE (mode
));
2004 if (size
< 2 * UNITS_PER_WORD
)
2008 /* Otherwise, use the default padding. */
2009 return !BYTES_BIG_ENDIAN
;
/* Mode in which libgcc comparison helpers return their result.
   NOTE(review): the entire body of this function was lost in the
   extraction — only the signature line survives.  */
2013 aarch64_libgcc_cmp_return_mode (void)
/* Implement TARGET_FRAME_POINTER_REQUIRED.  Visible logic: when
   -fomit-leaf-frame-pointer removed the frame pointer by default,
   reinstate it for non-leaf functions or when LR is live.
   NOTE(review): lossy extraction — return statements and braces are
   missing from this view; code below kept byte-identical.  */
2019 aarch64_frame_pointer_required (void)
2021 /* In aarch64_override_options_after_change
2022 flag_omit_leaf_frame_pointer turns off the frame pointer by
2023 default. Turn it back on now if we've not got a leaf
2025 if (flag_omit_leaf_frame_pointer
2026 && (!crtl
->is_leaf
|| df_regs_ever_live_p (LR_REGNUM
)))
2032 /* Mark the registers that need to be saved by the callee and calculate
2033 the size of the callee-saved registers area and frame record (both FP
2034 and LR may be omitted). */
2036 aarch64_layout_frame (void)
2038 HOST_WIDE_INT offset
= 0;
2041 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2044 #define SLOT_NOT_REQUIRED (-2)
2045 #define SLOT_REQUIRED (-1)
2047 cfun
->machine
->frame
.wb_candidate1
= FIRST_PSEUDO_REGISTER
;
2048 cfun
->machine
->frame
.wb_candidate2
= FIRST_PSEUDO_REGISTER
;
2050 /* First mark all the registers that really need to be saved... */
2051 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2052 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2054 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2055 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2057 /* ... that includes the eh data registers (if needed)... */
2058 if (crtl
->calls_eh_return
)
2059 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2060 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2063 /* ... and any callee saved register that dataflow says is live. */
2064 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2065 if (df_regs_ever_live_p (regno
)
2066 && (regno
== R30_REGNUM
2067 || !call_used_regs
[regno
]))
2068 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2070 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2071 if (df_regs_ever_live_p (regno
)
2072 && !call_used_regs
[regno
])
2073 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2075 if (frame_pointer_needed
)
2077 /* FP and LR are placed in the linkage record. */
2078 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2079 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2080 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2081 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2082 cfun
->machine
->frame
.hardfp_offset
= 2 * UNITS_PER_WORD
;
2083 offset
+= 2 * UNITS_PER_WORD
;
2086 /* Now assign stack slots for them. */
2087 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2088 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2090 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2091 if (cfun
->machine
->frame
.wb_candidate1
== FIRST_PSEUDO_REGISTER
)
2092 cfun
->machine
->frame
.wb_candidate1
= regno
;
2093 else if (cfun
->machine
->frame
.wb_candidate2
== FIRST_PSEUDO_REGISTER
)
2094 cfun
->machine
->frame
.wb_candidate2
= regno
;
2095 offset
+= UNITS_PER_WORD
;
2098 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2099 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2101 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2102 if (cfun
->machine
->frame
.wb_candidate1
== FIRST_PSEUDO_REGISTER
)
2103 cfun
->machine
->frame
.wb_candidate1
= regno
;
2104 else if (cfun
->machine
->frame
.wb_candidate2
== FIRST_PSEUDO_REGISTER
2105 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2106 cfun
->machine
->frame
.wb_candidate2
= regno
;
2107 offset
+= UNITS_PER_WORD
;
2110 cfun
->machine
->frame
.padding0
=
2111 (AARCH64_ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
) - offset
);
2112 offset
= AARCH64_ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2114 cfun
->machine
->frame
.saved_regs_size
= offset
;
2116 cfun
->machine
->frame
.hard_fp_offset
2117 = AARCH64_ROUND_UP (cfun
->machine
->frame
.saved_varargs_size
2119 + cfun
->machine
->frame
.saved_regs_size
,
2120 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2122 cfun
->machine
->frame
.frame_size
2123 = AARCH64_ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2124 + crtl
->outgoing_args_size
,
2125 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2127 cfun
->machine
->frame
.laid_out
= true;
/* True if REGNO was assigned a callee-save slot by
   aarch64_layout_frame (reg_offset >= 0; negative values are the
   SLOT_NOT_REQUIRED/SLOT_REQUIRED sentinels defined there).
   Code kept byte-identical (lossy extraction; braces missing).  */
2131 aarch64_register_saved_on_entry (int regno
)
2133 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Advance REGNO to the next callee-saved register in [REGNO, LIMIT]
   that actually has a save slot.  NOTE(review): lossy extraction —
   the loop body (increment) and the return are missing from this
   view; code below kept byte-identical.  */
2137 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
2139 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
/* Push a single register REGNO of MODE with write-back: store through
   a PRE_MODIFY of the stack pointer by -ADJUSTMENT and mark the insn
   frame-related for CFI generation.
   NOTE(review): lossy extraction — local declarations (reg/mem/insn)
   and braces are missing from this view; code kept byte-identical.  */
2145 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
2146 HOST_WIDE_INT adjustment
)
2148 rtx base_rtx
= stack_pointer_rtx
;
2151 reg
= gen_rtx_REG (mode
, regno
);
2152 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
2153 plus_constant (Pmode
, base_rtx
, -adjustment
));
2154 mem
= gen_rtx_MEM (mode
, mem
);
2156 insn
= emit_move_insn (mem
, reg
);
2157 RTX_FRAME_RELATED_P (insn
) = 1;
/* Build a store-pair-with-writeback insn for REG/REG2 at BASE,
   pre-decrementing by ADJUSTMENT; the DI variant handles integer
   registers and the DF variant fp registers.  NOTE(review): lossy
   extraction — the mode switch/case lines selecting between the two
   returns are missing from this view; code kept byte-identical.  */
2161 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2162 HOST_WIDE_INT adjustment
)
2167 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
2168 GEN_INT (-adjustment
),
2169 GEN_INT (UNITS_PER_WORD
- adjustment
));
2171 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
2172 GEN_INT (-adjustment
),
2173 GEN_INT (UNITS_PER_WORD
- adjustment
));
/* Push the register pair REGNO1/REGNO2 with stack write-back of
   ADJUSTMENT bytes, then mark the parallel's parts (elements 1 and 2)
   and the insn itself frame-related so the CFI covers both stores.
   NOTE(review): lossy extraction — the trailing arguments of the
   aarch64_gen_storewb_pair call and braces are missing from this
   view; code kept byte-identical.  */
2180 aarch64_pushwb_pair_reg (machine_mode mode
, unsigned regno1
,
2181 unsigned regno2
, HOST_WIDE_INT adjustment
)
2184 rtx reg1
= gen_rtx_REG (mode
, regno1
);
2185 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2187 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
2189 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
2191 RTX_FRAME_RELATED_P (insn
) = 1;
/* Build a load-pair-with-writeback insn (the epilogue counterpart of
   aarch64_gen_storewb_pair), post-incrementing BASE by ADJUSTMENT;
   DI variant for integer registers, DF for fp.  NOTE(review): lossy
   extraction — the mode switch/case lines are missing from this view;
   code kept byte-identical.  */
2195 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2196 HOST_WIDE_INT adjustment
)
2201 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
2202 GEN_INT (UNITS_PER_WORD
));
2204 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
2205 GEN_INT (UNITS_PER_WORD
));
/* Build a plain (no write-back) store-pair insn for REG1->MEM1 and
   REG2->MEM2; DI variant for integer registers, DF for fp.
   NOTE(review): lossy extraction — the reg2 parameter tail and the
   mode switch/case lines are missing from this view; code kept
   byte-identical.  */
2212 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
2218 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
2221 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Build a plain (no write-back) load-pair insn for MEM1->REG1 and
   MEM2->REG2; DI variant for integer registers, DF for fp.
   NOTE(review): lossy extraction — the mem2 parameter tail and the
   mode switch/case lines are missing from this view; code kept
   byte-identical.  */
2229 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
2235 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
2238 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
2247 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
2248 unsigned start
, unsigned limit
, bool skip_wb
)
2251 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
2252 ? gen_frame_mem
: gen_rtx_MEM
);
2256 for (regno
= aarch64_next_callee_save (start
, limit
);
2258 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
2261 HOST_WIDE_INT offset
;
2264 && (regno
== cfun
->machine
->frame
.wb_candidate1
2265 || regno
== cfun
->machine
->frame
.wb_candidate2
))
2268 reg
= gen_rtx_REG (mode
, regno
);
2269 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
2270 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
2273 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
2276 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
2277 == cfun
->machine
->frame
.reg_offset
[regno2
]))
2280 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2283 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
2284 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
2286 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
2289 /* The first part of a frame-related parallel insn is
2290 always assumed to be relevant to the frame
2291 calculations; subsequent parts, are only
2292 frame-related if explicitly marked. */
2293 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
2297 insn
= emit_move_insn (mem
, reg
);
2299 RTX_FRAME_RELATED_P (insn
) = 1;
2304 aarch64_restore_callee_saves (machine_mode mode
,
2305 HOST_WIDE_INT start_offset
, unsigned start
,
2306 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
2308 rtx base_rtx
= stack_pointer_rtx
;
2309 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
2310 ? gen_frame_mem
: gen_rtx_MEM
);
2313 HOST_WIDE_INT offset
;
2315 for (regno
= aarch64_next_callee_save (start
, limit
);
2317 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
2322 && (regno
== cfun
->machine
->frame
.wb_candidate1
2323 || regno
== cfun
->machine
->frame
.wb_candidate2
))
2326 reg
= gen_rtx_REG (mode
, regno
);
2327 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
2328 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
2330 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
2333 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
2334 == cfun
->machine
->frame
.reg_offset
[regno2
]))
2336 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2339 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
2340 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
2341 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
2343 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
2347 emit_move_insn (reg
, mem
);
2348 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
2352 /* AArch64 stack frames generated by this compiler look like:
2354 +-------------------------------+
2356 | incoming stack arguments |
2358 +-------------------------------+
2359 | | <-- incoming stack pointer (aligned)
2360 | callee-allocated save area |
2361 | for register varargs |
2363 +-------------------------------+
2364 | local variables | <-- frame_pointer_rtx
2366 +-------------------------------+
2368 +-------------------------------+ |
2369 | callee-saved registers | | frame.saved_regs_size
2370 +-------------------------------+ |
2372 +-------------------------------+ |
2373 | FP' | / <- hard_frame_pointer_rtx (aligned)
2374 +-------------------------------+
2375 | dynamic allocation |
2376 +-------------------------------+
2378 +-------------------------------+
2379 | outgoing stack arguments | <-- arg_pointer
2381 +-------------------------------+
2382 | | <-- stack_pointer_rtx (aligned)
2384 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2385 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2388 /* Generate the prologue instructions for entry into a function.
2389 Establish the stack frame by decreasing the stack pointer with a
2390 properly calculated size and, if necessary, create a frame record
2391 filled with the values of LR and previous frame pointer. The
2392 current FP is also set up if it is in use. */
2395 aarch64_expand_prologue (void)
2397 /* sub sp, sp, #<frame_size>
2398 stp {fp, lr}, [sp, #<frame_size> - 16]
2399 add fp, sp, #<frame_size> - hardfp_offset
2400 stp {cs_reg}, [fp, #-16] etc.
2402 sub sp, sp, <final_adjustment_if_any>
2404 HOST_WIDE_INT frame_size
, offset
;
2405 HOST_WIDE_INT fp_offset
; /* Offset from hard FP to SP. */
2406 HOST_WIDE_INT hard_fp_offset
;
2409 aarch64_layout_frame ();
2411 offset
= frame_size
= cfun
->machine
->frame
.frame_size
;
2412 hard_fp_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2413 fp_offset
= frame_size
- hard_fp_offset
;
2415 if (flag_stack_usage_info
)
2416 current_function_static_stack_size
= frame_size
;
2418 /* Store pairs and load pairs have a range only -512 to 504. */
2421 /* When the frame has a large size, an initial decrease is done on
2422 the stack pointer to jump over the callee-allocated save area for
2423 register varargs, the local variable area and/or the callee-saved
2424 register area. This will allow the pre-index write-back
2425 store pair instructions to be used for setting up the stack frame
2427 offset
= hard_fp_offset
;
2429 offset
= cfun
->machine
->frame
.saved_regs_size
;
2431 frame_size
-= (offset
+ crtl
->outgoing_args_size
);
2434 if (frame_size
>= 0x1000000)
2436 rtx op0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2437 emit_move_insn (op0
, GEN_INT (-frame_size
));
2438 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
, op0
));
2440 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2441 gen_rtx_SET (VOIDmode
, stack_pointer_rtx
,
2442 plus_constant (Pmode
, stack_pointer_rtx
,
2444 RTX_FRAME_RELATED_P (insn
) = 1;
2446 else if (frame_size
> 0)
2448 int hi_ofs
= frame_size
& 0xfff000;
2449 int lo_ofs
= frame_size
& 0x000fff;
2453 insn
= emit_insn (gen_add2_insn
2454 (stack_pointer_rtx
, GEN_INT (-hi_ofs
)));
2455 RTX_FRAME_RELATED_P (insn
) = 1;
2459 insn
= emit_insn (gen_add2_insn
2460 (stack_pointer_rtx
, GEN_INT (-lo_ofs
)));
2461 RTX_FRAME_RELATED_P (insn
) = 1;
2470 bool skip_wb
= false;
2472 if (frame_pointer_needed
)
2478 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2479 GEN_INT (-offset
)));
2480 RTX_FRAME_RELATED_P (insn
) = 1;
2482 aarch64_save_callee_saves (DImode
, fp_offset
, R29_REGNUM
,
2486 aarch64_pushwb_pair_reg (DImode
, R29_REGNUM
, R30_REGNUM
, offset
);
2488 /* Set up frame pointer to point to the location of the
2489 previous frame pointer on the stack. */
2490 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
2492 GEN_INT (fp_offset
)));
2493 RTX_FRAME_RELATED_P (insn
) = 1;
2494 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
2498 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
2499 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
2502 || reg1
== FIRST_PSEUDO_REGISTER
2503 || (reg2
== FIRST_PSEUDO_REGISTER
2506 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2507 GEN_INT (-offset
)));
2508 RTX_FRAME_RELATED_P (insn
) = 1;
2512 machine_mode mode1
= (reg1
<= R30_REGNUM
) ? DImode
: DFmode
;
2516 if (reg2
== FIRST_PSEUDO_REGISTER
)
2517 aarch64_pushwb_single_reg (mode1
, reg1
, offset
);
2519 aarch64_pushwb_pair_reg (mode1
, reg1
, reg2
, offset
);
2523 aarch64_save_callee_saves (DImode
, fp_offset
, R0_REGNUM
, R30_REGNUM
,
2525 aarch64_save_callee_saves (DFmode
, fp_offset
, V0_REGNUM
, V31_REGNUM
,
2529 /* when offset >= 512,
2530 sub sp, sp, #<outgoing_args_size> */
2531 if (frame_size
> -1)
2533 if (crtl
->outgoing_args_size
> 0)
2535 insn
= emit_insn (gen_add2_insn
2537 GEN_INT (- crtl
->outgoing_args_size
)));
2538 RTX_FRAME_RELATED_P (insn
) = 1;
2543 /* Return TRUE if we can use a simple_return insn.
2545 This function checks whether the callee saved stack is empty, which
2546 means no restore actions are need. The pro_and_epilogue will use
2547 this to check whether shrink-wrapping opt is feasible. */
/* Return true if a bare `ret` (simple_return) suffices: only valid
   after reload, once the frame is laid out and known to be empty
   (frame_size == 0).  Used to gate shrink-wrapping (see the comment
   block above).  NOTE(review): lossy extraction — the early-return
   body and braces are missing from this view; code kept
   byte-identical.  */
2550 aarch64_use_return_insn_p (void)
2552 if (!reload_completed
)
2558 aarch64_layout_frame ();
2560 return cfun
->machine
->frame
.frame_size
== 0;
2563 /* Generate the epilogue instructions for returning from a function. */
2565 aarch64_expand_epilogue (bool for_sibcall
)
2567 HOST_WIDE_INT frame_size
, offset
;
2568 HOST_WIDE_INT fp_offset
;
2569 HOST_WIDE_INT hard_fp_offset
;
2571 /* We need to add memory barrier to prevent read from deallocated stack. */
2572 bool need_barrier_p
= (get_frame_size () != 0
2573 || cfun
->machine
->frame
.saved_varargs_size
);
2575 aarch64_layout_frame ();
2577 offset
= frame_size
= cfun
->machine
->frame
.frame_size
;
2578 hard_fp_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2579 fp_offset
= frame_size
- hard_fp_offset
;
2581 /* Store pairs and load pairs have a range only -512 to 504. */
2584 offset
= hard_fp_offset
;
2586 offset
= cfun
->machine
->frame
.saved_regs_size
;
2588 frame_size
-= (offset
+ crtl
->outgoing_args_size
);
2590 if (!frame_pointer_needed
&& crtl
->outgoing_args_size
> 0)
2592 insn
= emit_insn (gen_add2_insn
2594 GEN_INT (crtl
->outgoing_args_size
)));
2595 RTX_FRAME_RELATED_P (insn
) = 1;
2601 /* If there were outgoing arguments or we've done dynamic stack
2602 allocation, then restore the stack pointer from the frame
2603 pointer. This is at most one insn and more efficient than using
2604 GCC's internal mechanism. */
2605 if (frame_pointer_needed
2606 && (crtl
->outgoing_args_size
|| cfun
->calls_alloca
))
2608 if (cfun
->calls_alloca
)
2609 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
2611 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
2612 hard_frame_pointer_rtx
,
2614 offset
= offset
- fp_offset
;
2619 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
2620 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
2621 bool skip_wb
= true;
2624 if (frame_pointer_needed
)
2627 || reg1
== FIRST_PSEUDO_REGISTER
2628 || (reg2
== FIRST_PSEUDO_REGISTER
2632 aarch64_restore_callee_saves (DImode
, fp_offset
, R0_REGNUM
, R30_REGNUM
,
2634 aarch64_restore_callee_saves (DFmode
, fp_offset
, V0_REGNUM
, V31_REGNUM
,
2638 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
2642 machine_mode mode1
= (reg1
<= R30_REGNUM
) ? DImode
: DFmode
;
2643 rtx rreg1
= gen_rtx_REG (mode1
, reg1
);
2645 cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, rreg1
, cfi_ops
);
2646 if (reg2
== FIRST_PSEUDO_REGISTER
)
2648 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
2649 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
2650 mem
= gen_rtx_MEM (mode1
, mem
);
2651 insn
= emit_move_insn (rreg1
, mem
);
2655 rtx rreg2
= gen_rtx_REG (mode1
, reg2
);
2657 cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, rreg2
, cfi_ops
);
2658 insn
= emit_insn (aarch64_gen_loadwb_pair
2659 (mode1
, stack_pointer_rtx
, rreg1
,
2665 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2669 /* Reset the CFA to be SP + FRAME_SIZE. */
2670 rtx new_cfa
= stack_pointer_rtx
;
2672 new_cfa
= plus_constant (Pmode
, new_cfa
, frame_size
);
2673 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
2674 REG_NOTES (insn
) = cfi_ops
;
2675 RTX_FRAME_RELATED_P (insn
) = 1;
2681 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
2683 if (frame_size
>= 0x1000000)
2685 rtx op0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2686 emit_move_insn (op0
, GEN_INT (frame_size
));
2687 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
, op0
));
2691 int hi_ofs
= frame_size
& 0xfff000;
2692 int lo_ofs
= frame_size
& 0x000fff;
2694 if (hi_ofs
&& lo_ofs
)
2696 insn
= emit_insn (gen_add2_insn
2697 (stack_pointer_rtx
, GEN_INT (hi_ofs
)));
2698 RTX_FRAME_RELATED_P (insn
) = 1;
2699 frame_size
= lo_ofs
;
2701 insn
= emit_insn (gen_add2_insn
2702 (stack_pointer_rtx
, GEN_INT (frame_size
)));
2705 /* Reset the CFA to be SP + 0. */
2706 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_pointer_rtx
);
2707 RTX_FRAME_RELATED_P (insn
) = 1;
2710 /* Stack adjustment for exception handler. */
2711 if (crtl
->calls_eh_return
)
2713 /* We need to unwind the stack by the offset computed by
2714 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2715 to be SP; letting the CFA move during this adjustment
2716 is just as correct as retaining the CFA from the body
2717 of the function. Therefore, do nothing special. */
2718 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
2721 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
2723 emit_jump_insn (ret_rtx
);
2726 /* Return the place to copy the exception unwinding return address to.
2727 This will probably be a stack slot, but could (in theory be the
2728 return register). */
2730 aarch64_final_eh_return_addr (void)
2732 HOST_WIDE_INT fp_offset
;
2734 aarch64_layout_frame ();
2736 fp_offset
= cfun
->machine
->frame
.frame_size
2737 - cfun
->machine
->frame
.hard_fp_offset
;
2739 if (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] < 0)
2740 return gen_rtx_REG (DImode
, LR_REGNUM
);
2742 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2743 result in a store to save LR introduced by builtin_eh_return () being
2744 incorrectly deleted because the alias is not detected.
2745 So in the calculation of the address to copy the exception unwinding
2746 return address to, we note 2 cases.
2747 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2748 we return a SP-relative location since all the addresses are SP-relative
2749 in this case. This prevents the store from being optimized away.
2750 If the fp_offset is not 0, then the addresses will be FP-relative and
2751 therefore we return a FP-relative location. */
2753 if (frame_pointer_needed
)
2756 return gen_frame_mem (DImode
,
2757 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
2759 return gen_frame_mem (DImode
,
2760 plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
));
2763 /* If FP is not needed, we calculate the location of LR, which would be
2764 at the top of the saved registers block. */
2766 return gen_frame_mem (DImode
,
2767 plus_constant (Pmode
,
2770 + cfun
->machine
->frame
.saved_regs_size
2771 - 2 * UNITS_PER_WORD
));
2774 /* Possibly output code to build up a constant in a register. For
2775 the benefit of the costs infrastructure, returns the number of
2776 instructions which would be emitted. GENERATE inhibits or
2777 enables code generation. */
2780 aarch64_build_constant (int regnum
, HOST_WIDE_INT val
, bool generate
)
2784 if (aarch64_bitmask_imm (val
, DImode
))
2787 emit_move_insn (gen_rtx_REG (Pmode
, regnum
), GEN_INT (val
));
2795 HOST_WIDE_INT valp
= val
>> 16;
2799 for (i
= 16; i
< 64; i
+= 16)
2801 valm
= (valp
& 0xffff);
2812 /* zcount contains the number of additional MOVK instructions
2813 required if the constant is built up with an initial MOVZ instruction,
2814 while ncount is the number of MOVK instructions required if starting
2815 with a MOVN instruction. Choose the sequence that yields the fewest
2816 number of instructions, preferring MOVZ instructions when they are both
2818 if (ncount
< zcount
)
2821 emit_move_insn (gen_rtx_REG (Pmode
, regnum
),
2822 GEN_INT (val
| ~(HOST_WIDE_INT
) 0xffff));
2829 emit_move_insn (gen_rtx_REG (Pmode
, regnum
),
2830 GEN_INT (val
& 0xffff));
2837 for (i
= 16; i
< 64; i
+= 16)
2839 if ((val
& 0xffff) != tval
)
2842 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode
, regnum
),
2844 GEN_INT (val
& 0xffff)));
2854 aarch64_add_constant (int regnum
, int scratchreg
, HOST_WIDE_INT delta
)
2856 HOST_WIDE_INT mdelta
= delta
;
2857 rtx this_rtx
= gen_rtx_REG (Pmode
, regnum
);
2858 rtx scratch_rtx
= gen_rtx_REG (Pmode
, scratchreg
);
2863 if (mdelta
>= 4096 * 4096)
2865 (void) aarch64_build_constant (scratchreg
, delta
, true);
2866 emit_insn (gen_add3_insn (this_rtx
, this_rtx
, scratch_rtx
));
2868 else if (mdelta
> 0)
2872 emit_insn (gen_rtx_SET (Pmode
, scratch_rtx
, GEN_INT (mdelta
/ 4096)));
2873 rtx shift
= gen_rtx_ASHIFT (Pmode
, scratch_rtx
, GEN_INT (12));
2875 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2876 gen_rtx_MINUS (Pmode
, this_rtx
, shift
)));
2878 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2879 gen_rtx_PLUS (Pmode
, this_rtx
, shift
)));
2881 if (mdelta
% 4096 != 0)
2883 scratch_rtx
= GEN_INT ((delta
< 0 ? -1 : 1) * (mdelta
% 4096));
2884 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2885 gen_rtx_PLUS (Pmode
, this_rtx
, scratch_rtx
)));
2890 /* Output code to add DELTA to the first argument, and then jump
2891 to FUNCTION. Used for C++ multiple inheritance. */
2893 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
2894 HOST_WIDE_INT delta
,
2895 HOST_WIDE_INT vcall_offset
,
2898 /* The this pointer is always in x0. Note that this differs from
2899 Arm where the this pointer maybe bumped to r1 if r0 is required
2900 to return a pointer to an aggregate. On AArch64 a result value
2901 pointer will be in x8. */
2902 int this_regno
= R0_REGNUM
;
2903 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
2906 reload_completed
= 1;
2907 emit_note (NOTE_INSN_PROLOGUE_END
);
2909 if (vcall_offset
== 0)
2910 aarch64_add_constant (this_regno
, IP1_REGNUM
, delta
);
2913 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
2915 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
2916 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2917 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
2922 if (delta
>= -256 && delta
< 256)
2923 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
2924 plus_constant (Pmode
, this_rtx
, delta
));
2926 aarch64_add_constant (this_regno
, IP1_REGNUM
, delta
);
2929 if (Pmode
== ptr_mode
)
2930 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
2932 aarch64_emit_move (temp0
,
2933 gen_rtx_ZERO_EXTEND (Pmode
,
2934 gen_rtx_MEM (ptr_mode
, addr
)));
2936 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
2937 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
2940 (void) aarch64_build_constant (IP1_REGNUM
, vcall_offset
, true);
2941 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
2944 if (Pmode
== ptr_mode
)
2945 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
2947 aarch64_emit_move (temp1
,
2948 gen_rtx_SIGN_EXTEND (Pmode
,
2949 gen_rtx_MEM (ptr_mode
, addr
)));
2951 emit_insn (gen_add2_insn (this_rtx
, temp1
));
2954 /* Generate a tail call to the target function. */
2955 if (!TREE_USED (function
))
2957 assemble_external (function
);
2958 TREE_USED (function
) = 1;
2960 funexp
= XEXP (DECL_RTL (function
), 0);
2961 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
2962 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
2963 SIBLING_CALL_P (insn
) = 1;
2965 insn
= get_insns ();
2966 shorten_branches (insn
);
2967 final_start_function (insn
, file
, 1);
2968 final (insn
, file
, 1);
2969 final_end_function ();
2971 /* Stop pretending to be a post-reload pass. */
2972 reload_completed
= 0;
2976 aarch64_tls_referenced_p (rtx x
)
2978 if (!TARGET_HAVE_TLS
)
2980 subrtx_iterator::array_type array
;
2981 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
2983 const_rtx x
= *iter
;
2984 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
2986 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2987 TLS offsets, not real symbol references. */
2988 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
2989 iter
.skip_subrtxes ();
2996 aarch64_bitmasks_cmp (const void *i1
, const void *i2
)
2998 const unsigned HOST_WIDE_INT
*imm1
= (const unsigned HOST_WIDE_INT
*) i1
;
2999 const unsigned HOST_WIDE_INT
*imm2
= (const unsigned HOST_WIDE_INT
*) i2
;
3010 aarch64_build_bitmask_table (void)
3012 unsigned HOST_WIDE_INT mask
, imm
;
3013 unsigned int log_e
, e
, s
, r
;
3014 unsigned int nimms
= 0;
3016 for (log_e
= 1; log_e
<= 6; log_e
++)
3020 mask
= ~(HOST_WIDE_INT
) 0;
3022 mask
= ((HOST_WIDE_INT
) 1 << e
) - 1;
3023 for (s
= 1; s
< e
; s
++)
3025 for (r
= 0; r
< e
; r
++)
3027 /* set s consecutive bits to 1 (s < 64) */
3028 imm
= ((unsigned HOST_WIDE_INT
)1 << s
) - 1;
3029 /* rotate right by r */
3031 imm
= ((imm
>> r
) | (imm
<< (e
- r
))) & mask
;
3032 /* replicate the constant depending on SIMD size */
3034 case 1: imm
|= (imm
<< 2);
3035 case 2: imm
|= (imm
<< 4);
3036 case 3: imm
|= (imm
<< 8);
3037 case 4: imm
|= (imm
<< 16);
3038 case 5: imm
|= (imm
<< 32);
3044 gcc_assert (nimms
< AARCH64_NUM_BITMASKS
);
3045 aarch64_bitmasks
[nimms
++] = imm
;
3050 gcc_assert (nimms
== AARCH64_NUM_BITMASKS
);
3051 qsort (aarch64_bitmasks
, nimms
, sizeof (aarch64_bitmasks
[0]),
3052 aarch64_bitmasks_cmp
);
3056 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3057 a left shift of 0 or 12 bits. */
3059 aarch64_uimm12_shift (HOST_WIDE_INT val
)
3061 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
3062 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
3067 /* Return true if val is an immediate that can be loaded into a
3068 register by a MOVZ instruction. */
3070 aarch64_movw_imm (HOST_WIDE_INT val
, machine_mode mode
)
3072 if (GET_MODE_SIZE (mode
) > 4)
3074 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
3075 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
3080 /* Ignore sign extension. */
3081 val
&= (HOST_WIDE_INT
) 0xffffffff;
3083 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
3084 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
3088 /* Return true if val is a valid bitmask immediate. */
3090 aarch64_bitmask_imm (HOST_WIDE_INT val
, machine_mode mode
)
3092 if (GET_MODE_SIZE (mode
) < 8)
3094 /* Replicate bit pattern. */
3095 val
&= (HOST_WIDE_INT
) 0xffffffff;
3098 return bsearch (&val
, aarch64_bitmasks
, AARCH64_NUM_BITMASKS
,
3099 sizeof (aarch64_bitmasks
[0]), aarch64_bitmasks_cmp
) != NULL
;
3103 /* Return true if val is an immediate that can be loaded into a
3104 register in a single instruction. */
3106 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
3108 if (aarch64_movw_imm (val
, mode
) || aarch64_movw_imm (~val
, mode
))
3110 return aarch64_bitmask_imm (val
, mode
);
3114 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
3118 if (GET_CODE (x
) == HIGH
)
3121 split_const (x
, &base
, &offset
);
3122 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
3124 if (aarch64_classify_symbol (base
, offset
, SYMBOL_CONTEXT_ADR
)
3125 != SYMBOL_FORCE_TO_MEM
)
3128 /* Avoid generating a 64-bit relocation in ILP32; leave
3129 to aarch64_expand_mov_immediate to handle it properly. */
3130 return mode
!= ptr_mode
;
3133 return aarch64_tls_referenced_p (x
);
3136 /* Return true if register REGNO is a valid index register.
3137 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
3142 if (!HARD_REGISTER_NUM_P (regno
))
3150 regno
= reg_renumber
[regno
];
3152 return GP_REGNUM_P (regno
);
3155 /* Return true if register REGNO is a valid base register for mode MODE.
3156 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
3161 if (!HARD_REGISTER_NUM_P (regno
))
3169 regno
= reg_renumber
[regno
];
3172 /* The fake registers will be eliminated to either the stack or
3173 hard frame pointer, both of which are usually valid base registers.
3174 Reload deals with the cases where the eliminated form isn't valid. */
3175 return (GP_REGNUM_P (regno
)
3176 || regno
== SP_REGNUM
3177 || regno
== FRAME_POINTER_REGNUM
3178 || regno
== ARG_POINTER_REGNUM
);
3181 /* Return true if X is a valid base register for mode MODE.
3182 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
3187 if (!strict_p
&& GET_CODE (x
) == SUBREG
)
3190 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
3193 /* Return true if address offset is a valid index. If it is, fill in INFO
3194 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
3198 machine_mode mode
, bool strict_p
)
3200 enum aarch64_address_type type
;
3205 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
3206 && GET_MODE (x
) == Pmode
)
3208 type
= ADDRESS_REG_REG
;
3212 /* (sign_extend:DI (reg:SI)) */
3213 else if ((GET_CODE (x
) == SIGN_EXTEND
3214 || GET_CODE (x
) == ZERO_EXTEND
)
3215 && GET_MODE (x
) == DImode
3216 && GET_MODE (XEXP (x
, 0)) == SImode
)
3218 type
= (GET_CODE (x
) == SIGN_EXTEND
)
3219 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3220 index
= XEXP (x
, 0);
3223 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3224 else if (GET_CODE (x
) == MULT
3225 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3226 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3227 && GET_MODE (XEXP (x
, 0)) == DImode
3228 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3229 && CONST_INT_P (XEXP (x
, 1)))
3231 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3232 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3233 index
= XEXP (XEXP (x
, 0), 0);
3234 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3236 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3237 else if (GET_CODE (x
) == ASHIFT
3238 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3239 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3240 && GET_MODE (XEXP (x
, 0)) == DImode
3241 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3242 && CONST_INT_P (XEXP (x
, 1)))
3244 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3245 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3246 index
= XEXP (XEXP (x
, 0), 0);
3247 shift
= INTVAL (XEXP (x
, 1));
3249 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3250 else if ((GET_CODE (x
) == SIGN_EXTRACT
3251 || GET_CODE (x
) == ZERO_EXTRACT
)
3252 && GET_MODE (x
) == DImode
3253 && GET_CODE (XEXP (x
, 0)) == MULT
3254 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3255 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3257 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3258 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3259 index
= XEXP (XEXP (x
, 0), 0);
3260 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3261 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3262 || INTVAL (XEXP (x
, 2)) != 0)
3265 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3266 (const_int 0xffffffff<<shift)) */
3267 else if (GET_CODE (x
) == AND
3268 && GET_MODE (x
) == DImode
3269 && GET_CODE (XEXP (x
, 0)) == MULT
3270 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3271 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3272 && CONST_INT_P (XEXP (x
, 1)))
3274 type
= ADDRESS_REG_UXTW
;
3275 index
= XEXP (XEXP (x
, 0), 0);
3276 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3277 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3280 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3281 else if ((GET_CODE (x
) == SIGN_EXTRACT
3282 || GET_CODE (x
) == ZERO_EXTRACT
)
3283 && GET_MODE (x
) == DImode
3284 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3285 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3286 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3288 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3289 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3290 index
= XEXP (XEXP (x
, 0), 0);
3291 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3292 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3293 || INTVAL (XEXP (x
, 2)) != 0)
3296 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3297 (const_int 0xffffffff<<shift)) */
3298 else if (GET_CODE (x
) == AND
3299 && GET_MODE (x
) == DImode
3300 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3301 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3302 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3303 && CONST_INT_P (XEXP (x
, 1)))
3305 type
= ADDRESS_REG_UXTW
;
3306 index
= XEXP (XEXP (x
, 0), 0);
3307 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3308 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3311 /* (mult:P (reg:P) (const_int scale)) */
3312 else if (GET_CODE (x
) == MULT
3313 && GET_MODE (x
) == Pmode
3314 && GET_MODE (XEXP (x
, 0)) == Pmode
3315 && CONST_INT_P (XEXP (x
, 1)))
3317 type
= ADDRESS_REG_REG
;
3318 index
= XEXP (x
, 0);
3319 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3321 /* (ashift:P (reg:P) (const_int shift)) */
3322 else if (GET_CODE (x
) == ASHIFT
3323 && GET_MODE (x
) == Pmode
3324 && GET_MODE (XEXP (x
, 0)) == Pmode
3325 && CONST_INT_P (XEXP (x
, 1)))
3327 type
= ADDRESS_REG_REG
;
3328 index
= XEXP (x
, 0);
3329 shift
= INTVAL (XEXP (x
, 1));
3334 if (GET_CODE (index
) == SUBREG
)
3335 index
= SUBREG_REG (index
);
3338 (shift
> 0 && shift
<= 3
3339 && (1 << shift
) == GET_MODE_SIZE (mode
)))
3341 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
3344 info
->offset
= index
;
3345 info
->shift
= shift
;
3353 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, HOST_WIDE_INT offset
)
3355 return (offset
>= -64 * GET_MODE_SIZE (mode
)
3356 && offset
< 64 * GET_MODE_SIZE (mode
)
3357 && offset
% GET_MODE_SIZE (mode
) == 0);
3361 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
3362 HOST_WIDE_INT offset
)
3364 return offset
>= -256 && offset
< 256;
3368 offset_12bit_unsigned_scaled_p (machine_mode mode
, HOST_WIDE_INT offset
)
3371 && offset
< 4096 * GET_MODE_SIZE (mode
)
3372 && offset
% GET_MODE_SIZE (mode
) == 0);
3375 /* Return true if X is a valid address for machine mode MODE. If it is,
3376 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3377 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 aarch64_classify_address (struct aarch64_address_info
*info
,
3381 rtx x
, machine_mode mode
,
3382 RTX_CODE outer_code
, bool strict_p
)
3384 enum rtx_code code
= GET_CODE (x
);
3387 /* On BE, we use load/store pair for all large int mode load/stores. */
3388 bool load_store_pair_p
= (outer_code
== PARALLEL
3389 || (BYTES_BIG_ENDIAN
3390 && aarch64_vect_struct_mode_p (mode
)));
3392 bool allow_reg_index_p
=
3394 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
3395 && !aarch64_vect_struct_mode_p (mode
);
3397 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
3400 && (code
!= POST_INC
&& code
!= REG
))
3407 info
->type
= ADDRESS_REG_IMM
;
3409 info
->offset
= const0_rtx
;
3410 return aarch64_base_register_rtx_p (x
, strict_p
);
3418 && (op0
== virtual_stack_vars_rtx
3419 || op0
== frame_pointer_rtx
3420 || op0
== arg_pointer_rtx
)
3421 && CONST_INT_P (op1
))
3423 info
->type
= ADDRESS_REG_IMM
;
3430 if (GET_MODE_SIZE (mode
) != 0
3431 && CONST_INT_P (op1
)
3432 && aarch64_base_register_rtx_p (op0
, strict_p
))
3434 HOST_WIDE_INT offset
= INTVAL (op1
);
3436 info
->type
= ADDRESS_REG_IMM
;
3440 /* TImode and TFmode values are allowed in both pairs of X
3441 registers and individual Q registers. The available
3443 X,X: 7-bit signed scaled offset
3444 Q: 9-bit signed offset
3445 We conservatively require an offset representable in either mode.
3447 if (mode
== TImode
|| mode
== TFmode
)
3448 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
3449 && offset_9bit_signed_unscaled_p (mode
, offset
));
3451 /* A 7bit offset check because OImode will emit a ldp/stp
3452 instruction (only big endian will get here).
3453 For ldp/stp instructions, the offset is scaled for the size of a
3454 single element of the pair. */
3456 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
3458 /* Three 9/12 bit offsets checks because CImode will emit three
3459 ldr/str instructions (only big endian will get here). */
3461 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
3462 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
3463 || offset_12bit_unsigned_scaled_p (V16QImode
,
3466 /* Two 7bit offsets checks because XImode will emit two ldp/stp
3467 instructions (only big endian will get here). */
3469 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
3470 && aarch64_offset_7bit_signed_scaled_p (TImode
,
3473 if (load_store_pair_p
)
3474 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
3475 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
3477 return (offset_9bit_signed_unscaled_p (mode
, offset
)
3478 || offset_12bit_unsigned_scaled_p (mode
, offset
));
3481 if (allow_reg_index_p
)
3483 /* Look for base + (scaled/extended) index register. */
3484 if (aarch64_base_register_rtx_p (op0
, strict_p
)
3485 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
3490 if (aarch64_base_register_rtx_p (op1
, strict_p
)
3491 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
3504 info
->type
= ADDRESS_REG_WB
;
3505 info
->base
= XEXP (x
, 0);
3506 info
->offset
= NULL_RTX
;
3507 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
3511 info
->type
= ADDRESS_REG_WB
;
3512 info
->base
= XEXP (x
, 0);
3513 if (GET_CODE (XEXP (x
, 1)) == PLUS
3514 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
3515 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
3516 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
3518 HOST_WIDE_INT offset
;
3519 info
->offset
= XEXP (XEXP (x
, 1), 1);
3520 offset
= INTVAL (info
->offset
);
3522 /* TImode and TFmode values are allowed in both pairs of X
3523 registers and individual Q registers. The available
3525 X,X: 7-bit signed scaled offset
3526 Q: 9-bit signed offset
3527 We conservatively require an offset representable in either mode.
3529 if (mode
== TImode
|| mode
== TFmode
)
3530 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
3531 && offset_9bit_signed_unscaled_p (mode
, offset
));
3533 if (load_store_pair_p
)
3534 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
3535 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
3537 return offset_9bit_signed_unscaled_p (mode
, offset
);
3544 /* load literal: pc-relative constant pool entry. Only supported
3545 for SI mode or larger. */
3546 info
->type
= ADDRESS_SYMBOLIC
;
3548 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
3552 split_const (x
, &sym
, &addend
);
3553 return (GET_CODE (sym
) == LABEL_REF
3554 || (GET_CODE (sym
) == SYMBOL_REF
3555 && CONSTANT_POOL_ADDRESS_P (sym
)));
3560 info
->type
= ADDRESS_LO_SUM
;
3561 info
->base
= XEXP (x
, 0);
3562 info
->offset
= XEXP (x
, 1);
3563 if (allow_reg_index_p
3564 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
3567 split_const (info
->offset
, &sym
, &offs
);
3568 if (GET_CODE (sym
) == SYMBOL_REF
3569 && (aarch64_classify_symbol (sym
, offs
, SYMBOL_CONTEXT_MEM
)
3570 == SYMBOL_SMALL_ABSOLUTE
))
3572 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int ref_size
;
3576 if (CONSTANT_POOL_ADDRESS_P (sym
))
3577 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
3578 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
3580 tree exp
= SYMBOL_REF_DECL (sym
);
3581 align
= TYPE_ALIGN (TREE_TYPE (exp
));
3582 align
= CONSTANT_ALIGNMENT (exp
, align
);
3584 else if (SYMBOL_REF_DECL (sym
))
3585 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
3586 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
3587 && SYMBOL_REF_BLOCK (sym
) != NULL
)
3588 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
3590 align
= BITS_PER_UNIT
;
3592 ref_size
= GET_MODE_SIZE (mode
);
3594 ref_size
= GET_MODE_SIZE (DImode
);
3596 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
3597 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
3608 aarch64_symbolic_address_p (rtx x
)
3612 split_const (x
, &x
, &offset
);
3613 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
3616 /* Classify the base of symbolic expression X, given that X appears in
3619 enum aarch64_symbol_type
3620 aarch64_classify_symbolic_expression (rtx x
,
3621 enum aarch64_symbol_context context
)
3625 split_const (x
, &x
, &offset
);
3626 return aarch64_classify_symbol (x
, offset
, context
);
3630 /* Return TRUE if X is a legitimate address for accessing memory in
3633 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
3635 struct aarch64_address_info addr
;
3637 return aarch64_classify_address (&addr
, x
, mode
, MEM
, strict_p
);
3640 /* Return TRUE if X is a legitimate address for accessing memory in
3641 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3644 aarch64_legitimate_address_p (machine_mode mode
, rtx x
,
3645 RTX_CODE outer_code
, bool strict_p
)
3647 struct aarch64_address_info addr
;
3649 return aarch64_classify_address (&addr
, x
, mode
, outer_code
, strict_p
);
3652 /* Return TRUE if rtx X is immediate constant 0.0 */
3654 aarch64_float_const_zero_rtx_p (rtx x
)
3658 if (GET_MODE (x
) == VOIDmode
)
3661 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
3662 if (REAL_VALUE_MINUS_ZERO (r
))
3663 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
3664 return REAL_VALUES_EQUAL (r
, dconst0
);
3667 /* Return the fixed registers used for condition codes. */
3670 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
3673 *p2
= INVALID_REGNUM
;
3677 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 aarch64_emit_call_insn (rtx pat
)
3682 rtx insn
= emit_call_insn (pat
);
3684 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
3685 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
3686 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
3690 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
3692 /* All floating point compares return CCFP if it is an equality
3693 comparison, and CCFPE otherwise. */
3694 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
3721 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3723 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
3724 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
3725 || GET_CODE (x
) == NEG
))
3728 /* A compare with a shifted operand. Because of canonicalization,
3729 the comparison will have to be swapped when we emit the assembly
3731 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3732 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
3733 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
3734 || GET_CODE (x
) == LSHIFTRT
3735 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
3738 /* Similarly for a negated operand, but we can only do this for
3740 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3741 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
3742 && (code
== EQ
|| code
== NE
)
3743 && GET_CODE (x
) == NEG
)
3746 /* A compare of a mode narrower than SI mode against zero can be done
3747 by extending the value in the comparison. */
3748 if ((GET_MODE (x
) == QImode
|| GET_MODE (x
) == HImode
)
3750 /* Only use sign-extension if we really need it. */
3751 return ((code
== GT
|| code
== GE
|| code
== LE
|| code
== LT
)
3752 ? CC_SESWPmode
: CC_ZESWPmode
);
3754 /* For everything else, return CCmode. */
3759 aarch64_get_condition_code_1 (enum machine_mode
, enum rtx_code
);
3762 aarch64_get_condition_code (rtx x
)
3764 machine_mode mode
= GET_MODE (XEXP (x
, 0));
3765 enum rtx_code comp_code
= GET_CODE (x
);
3767 if (GET_MODE_CLASS (mode
) != MODE_CC
)
3768 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
3769 return aarch64_get_condition_code_1 (mode
, comp_code
);
3773 aarch64_get_condition_code_1 (enum machine_mode mode
, enum rtx_code comp_code
)
3775 int ne
= -1, eq
= -1;
3782 case GE
: return AARCH64_GE
;
3783 case GT
: return AARCH64_GT
;
3784 case LE
: return AARCH64_LS
;
3785 case LT
: return AARCH64_MI
;
3786 case NE
: return AARCH64_NE
;
3787 case EQ
: return AARCH64_EQ
;
3788 case ORDERED
: return AARCH64_VC
;
3789 case UNORDERED
: return AARCH64_VS
;
3790 case UNLT
: return AARCH64_LT
;
3791 case UNLE
: return AARCH64_LE
;
3792 case UNGT
: return AARCH64_HI
;
3793 case UNGE
: return AARCH64_PL
;
3851 case NE
: return AARCH64_NE
;
3852 case EQ
: return AARCH64_EQ
;
3853 case GE
: return AARCH64_GE
;
3854 case GT
: return AARCH64_GT
;
3855 case LE
: return AARCH64_LE
;
3856 case LT
: return AARCH64_LT
;
3857 case GEU
: return AARCH64_CS
;
3858 case GTU
: return AARCH64_HI
;
3859 case LEU
: return AARCH64_LS
;
3860 case LTU
: return AARCH64_CC
;
3870 case NE
: return AARCH64_NE
;
3871 case EQ
: return AARCH64_EQ
;
3872 case GE
: return AARCH64_LE
;
3873 case GT
: return AARCH64_LT
;
3874 case LE
: return AARCH64_GE
;
3875 case LT
: return AARCH64_GT
;
3876 case GEU
: return AARCH64_LS
;
3877 case GTU
: return AARCH64_CC
;
3878 case LEU
: return AARCH64_CS
;
3879 case LTU
: return AARCH64_HI
;
3887 case NE
: return AARCH64_NE
;
3888 case EQ
: return AARCH64_EQ
;
3889 case GE
: return AARCH64_PL
;
3890 case LT
: return AARCH64_MI
;
3898 case NE
: return AARCH64_NE
;
3899 case EQ
: return AARCH64_EQ
;
3909 if (comp_code
== NE
)
3912 if (comp_code
== EQ
)
3919 aarch64_const_vec_all_same_in_range_p (rtx x
,
3920 HOST_WIDE_INT minval
,
3921 HOST_WIDE_INT maxval
)
3923 HOST_WIDE_INT firstval
;
3926 if (GET_CODE (x
) != CONST_VECTOR
3927 || GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_INT
)
3930 firstval
= INTVAL (CONST_VECTOR_ELT (x
, 0));
3931 if (firstval
< minval
|| firstval
> maxval
)
3934 count
= CONST_VECTOR_NUNITS (x
);
3935 for (i
= 1; i
< count
; i
++)
3936 if (INTVAL (CONST_VECTOR_ELT (x
, i
)) != firstval
)
3943 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
3945 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
3949 bit_count (unsigned HOST_WIDE_INT value
)
/* Bit positions of the N/Z/C/V flags in a ccmp #nzcv immediate.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  The first code is for AND op and the other
   is for IOR op.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[][2] =
{
  {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
  {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
  {AARCH64_CC_C, 0}, /* CS, C == 1.  */
  {0, AARCH64_CC_C}, /* CC, C == 0.  */
  {AARCH64_CC_N, 0}, /* MI, N == 1.  */
  {0, AARCH64_CC_N}, /* PL, N == 0.  */
  {AARCH64_CC_V, 0}, /* VS, V == 1.  */
  {0, AARCH64_CC_V}, /* VC, V == 0.  */
  {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
  {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
  {0, AARCH64_CC_V}, /* GE, N == V.  */
  {AARCH64_CC_V, 0}, /* LT, N != V.  */
  {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
  {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
  {0, 0}, /* AL, Any.  */
  {0, 0}, /* NV, Any.  */
};
3991 aarch64_ccmp_mode_to_code (enum machine_mode mode
)
4032 aarch64_print_operand (FILE *f
, rtx x
, char code
)
4036 /* An integer or symbol address without a preceding # sign. */
4038 switch (GET_CODE (x
))
4041 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
4045 output_addr_const (f
, x
);
4049 if (GET_CODE (XEXP (x
, 0)) == PLUS
4050 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
4052 output_addr_const (f
, x
);
4058 output_operand_lossage ("Unsupported operand for code '%c'", code
);
4063 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4067 if (!CONST_INT_P (x
)
4068 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
4070 output_operand_lossage ("invalid operand for '%%%c'", code
);
4086 output_operand_lossage ("invalid operand for '%%%c'", code
);
4096 /* Print N such that 2^N == X. */
4097 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
4099 output_operand_lossage ("invalid operand for '%%%c'", code
);
4103 asm_fprintf (f
, "%d", n
);
4108 /* Print the number of non-zero bits in X (a const_int). */
4109 if (!CONST_INT_P (x
))
4111 output_operand_lossage ("invalid operand for '%%%c'", code
);
4115 asm_fprintf (f
, "%u", bit_count (INTVAL (x
)));
4119 /* Print the higher numbered register of a pair (TImode) of regs. */
4120 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
4122 output_operand_lossage ("invalid operand for '%%%c'", code
);
4126 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
4132 /* Print a condition (eq, ne, etc). */
4134 /* CONST_TRUE_RTX means always -- that's the default. */
4135 if (x
== const_true_rtx
)
4138 if (!COMPARISON_P (x
))
4140 output_operand_lossage ("invalid operand for '%%%c'", code
);
4144 cond_code
= aarch64_get_condition_code (x
);
4145 gcc_assert (cond_code
>= 0);
4146 fputs (aarch64_condition_codes
[cond_code
], f
);
4153 /* Print the inverse of a condition (eq <-> ne, etc). */
4155 /* CONST_TRUE_RTX means never -- that's the default. */
4156 if (x
== const_true_rtx
)
4162 if (!COMPARISON_P (x
))
4164 output_operand_lossage ("invalid operand for '%%%c'", code
);
4167 cond_code
= aarch64_get_condition_code (x
);
4168 gcc_assert (cond_code
>= 0);
4169 fputs (aarch64_condition_codes
[AARCH64_INVERSE_CONDITION_CODE
4179 /* Print a scalar FP/SIMD register name. */
4180 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4182 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4185 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
4192 /* Print the first FP/SIMD register name in a list. */
4193 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4195 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4198 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
4202 /* Print a scalar FP/SIMD register name + 1. */
4203 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4205 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4208 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
4212 /* Print bottom 16 bits of integer constant in hex. */
4213 if (!CONST_INT_P (x
))
4215 output_operand_lossage ("invalid operand for '%%%c'", code
);
4218 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
4223 /* Print a general register name or the zero register (32-bit or
4226 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
4228 asm_fprintf (f
, "%czr", code
);
4232 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
4234 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
4238 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
4240 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
4247 /* Print a normal operand, if it's a general register, then we
4251 output_operand_lossage ("missing operand");
4255 switch (GET_CODE (x
))
4258 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
4262 aarch64_memory_reference_mode
= GET_MODE (x
);
4263 output_address (XEXP (x
, 0));
4268 output_addr_const (asm_out_file
, x
);
4272 asm_fprintf (f
, "%wd", INTVAL (x
));
4276 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
4279 aarch64_const_vec_all_same_in_range_p (x
,
4281 HOST_WIDE_INT_MAX
));
4282 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
4284 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
4293 /* CONST_DOUBLE can represent a double-width integer.
4294 In this case, the mode of x is VOIDmode. */
4295 if (GET_MODE (x
) == VOIDmode
)
4297 else if (aarch64_float_const_zero_rtx_p (x
))
4302 else if (aarch64_float_const_representable_p (x
))
4305 char float_buf
[buf_size
] = {'\0'};
4307 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
4308 real_to_decimal_for_mode (float_buf
, &r
,
4311 asm_fprintf (asm_out_file
, "%s", float_buf
);
4315 output_operand_lossage ("invalid constant");
4318 output_operand_lossage ("invalid operand");
4324 if (GET_CODE (x
) == HIGH
)
4327 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4329 case SYMBOL_SMALL_GOT
:
4330 asm_fprintf (asm_out_file
, ":got:");
4333 case SYMBOL_SMALL_TLSGD
:
4334 asm_fprintf (asm_out_file
, ":tlsgd:");
4337 case SYMBOL_SMALL_TLSDESC
:
4338 asm_fprintf (asm_out_file
, ":tlsdesc:");
4341 case SYMBOL_SMALL_GOTTPREL
:
4342 asm_fprintf (asm_out_file
, ":gottprel:");
4345 case SYMBOL_SMALL_TPREL
:
4346 asm_fprintf (asm_out_file
, ":tprel:");
4349 case SYMBOL_TINY_GOT
:
4356 output_addr_const (asm_out_file
, x
);
4360 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4362 case SYMBOL_SMALL_GOT
:
4363 asm_fprintf (asm_out_file
, ":lo12:");
4366 case SYMBOL_SMALL_TLSGD
:
4367 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
4370 case SYMBOL_SMALL_TLSDESC
:
4371 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
4374 case SYMBOL_SMALL_GOTTPREL
:
4375 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
4378 case SYMBOL_SMALL_TPREL
:
4379 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
4382 case SYMBOL_TINY_GOT
:
4383 asm_fprintf (asm_out_file
, ":got:");
4389 output_addr_const (asm_out_file
, x
);
4394 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4396 case SYMBOL_SMALL_TPREL
:
4397 asm_fprintf (asm_out_file
, ":tprel_hi12:");
4402 output_addr_const (asm_out_file
, x
);
4410 if (!COMPARISON_P (x
))
4412 output_operand_lossage ("invalid operand for '%%%c'", code
);
4416 cond_code
= aarch64_get_condition_code_1 (CCmode
, GET_CODE (x
));
4417 gcc_assert (cond_code
>= 0);
4418 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
][0]);
4427 if (!COMPARISON_P (x
))
4429 output_operand_lossage ("invalid operand for '%%%c'", code
);
4433 cond_code
= aarch64_get_condition_code_1 (CCmode
, GET_CODE (x
));
4434 gcc_assert (cond_code
>= 0);
4435 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
][1]);
4440 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
4446 aarch64_print_operand_address (FILE *f
, rtx x
)
4448 struct aarch64_address_info addr
;
4450 if (aarch64_classify_address (&addr
, x
, aarch64_memory_reference_mode
,
4454 case ADDRESS_REG_IMM
:
4455 if (addr
.offset
== const0_rtx
)
4456 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
4458 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
4459 INTVAL (addr
.offset
));
4462 case ADDRESS_REG_REG
:
4463 if (addr
.shift
== 0)
4464 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
4465 reg_names
[REGNO (addr
.offset
)]);
4467 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
4468 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
4471 case ADDRESS_REG_UXTW
:
4472 if (addr
.shift
== 0)
4473 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
4474 REGNO (addr
.offset
) - R0_REGNUM
);
4476 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
4477 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4480 case ADDRESS_REG_SXTW
:
4481 if (addr
.shift
== 0)
4482 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
4483 REGNO (addr
.offset
) - R0_REGNUM
);
4485 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
4486 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4489 case ADDRESS_REG_WB
:
4490 switch (GET_CODE (x
))
4493 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
4494 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4497 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
4498 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4501 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
4502 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4505 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
4506 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4509 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
4510 INTVAL (addr
.offset
));
4513 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
4514 INTVAL (addr
.offset
));
4521 case ADDRESS_LO_SUM
:
4522 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
4523 output_addr_const (f
, addr
.offset
);
4524 asm_fprintf (f
, "]");
4527 case ADDRESS_SYMBOLIC
:
4531 output_addr_const (f
, x
);
4535 aarch64_label_mentioned_p (rtx x
)
4540 if (GET_CODE (x
) == LABEL_REF
)
4543 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4544 referencing instruction, but they are constant offsets, not
4546 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
4549 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
4550 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
4556 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
4557 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
4560 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
4567 /* Implement REGNO_REG_CLASS. */
4570 aarch64_regno_regclass (unsigned regno
)
4572 if (GP_REGNUM_P (regno
))
4573 return GENERAL_REGS
;
4575 if (regno
== SP_REGNUM
)
4578 if (regno
== FRAME_POINTER_REGNUM
4579 || regno
== ARG_POINTER_REGNUM
)
4580 return POINTER_REGS
;
4582 if (FP_REGNUM_P (regno
))
4583 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
4589 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
4591 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4592 where mask is selected by alignment and size of the offset.
4593 We try to pick as large a range for the offset as possible to
4594 maximize the chance of a CSE. However, for aligned addresses
4595 we limit the range to 4k so that structures with different sized
4596 elements are likely to use the same base. */
4598 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
4600 HOST_WIDE_INT offset
= INTVAL (XEXP (x
, 1));
4601 HOST_WIDE_INT base_offset
;
4603 /* Does it look like we'll need a load/store-pair operation? */
4604 if (GET_MODE_SIZE (mode
) > 16
4606 base_offset
= ((offset
+ 64 * GET_MODE_SIZE (mode
))
4607 & ~((128 * GET_MODE_SIZE (mode
)) - 1));
4608 /* For offsets aren't a multiple of the access size, the limit is
4610 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
4611 base_offset
= (offset
+ 0x100) & ~0x1ff;
4613 base_offset
= offset
& ~0xfff;
4615 if (base_offset
== 0)
4618 offset
-= base_offset
;
4619 rtx base_reg
= gen_reg_rtx (Pmode
);
4620 rtx val
= force_operand (plus_constant (Pmode
, XEXP (x
, 0), base_offset
),
4622 emit_move_insn (base_reg
, val
);
4623 x
= plus_constant (Pmode
, base_reg
, offset
);
4629 /* Try a machine-dependent way of reloading an illegitimate address
4630 operand. If we find one, push the reload and return the new rtx. */
4633 aarch64_legitimize_reload_address (rtx
*x_p
,
4635 int opnum
, int type
,
4636 int ind_levels ATTRIBUTE_UNUSED
)
4640 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4641 if (aarch64_vect_struct_mode_p (mode
)
4642 && GET_CODE (x
) == PLUS
4643 && REG_P (XEXP (x
, 0))
4644 && CONST_INT_P (XEXP (x
, 1)))
4648 push_reload (orig_rtx
, NULL_RTX
, x_p
, NULL
,
4649 BASE_REG_CLASS
, GET_MODE (x
), VOIDmode
, 0, 0,
4650 opnum
, (enum reload_type
) type
);
4654 /* We must recognize output that we have already generated ourselves. */
4655 if (GET_CODE (x
) == PLUS
4656 && GET_CODE (XEXP (x
, 0)) == PLUS
4657 && REG_P (XEXP (XEXP (x
, 0), 0))
4658 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4659 && CONST_INT_P (XEXP (x
, 1)))
4661 push_reload (XEXP (x
, 0), NULL_RTX
, &XEXP (x
, 0), NULL
,
4662 BASE_REG_CLASS
, GET_MODE (x
), VOIDmode
, 0, 0,
4663 opnum
, (enum reload_type
) type
);
4667 /* We wish to handle large displacements off a base register by splitting
4668 the addend across an add and the mem insn. This can cut the number of
4669 extra insns needed from 3 to 1. It is only useful for load/store of a
4670 single register with 12 bit offset field. */
4671 if (GET_CODE (x
) == PLUS
4672 && REG_P (XEXP (x
, 0))
4673 && CONST_INT_P (XEXP (x
, 1))
4674 && HARD_REGISTER_P (XEXP (x
, 0))
4677 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x
, 0)), true))
4679 HOST_WIDE_INT val
= INTVAL (XEXP (x
, 1));
4680 HOST_WIDE_INT low
= val
& 0xfff;
4681 HOST_WIDE_INT high
= val
- low
;
4684 machine_mode xmode
= GET_MODE (x
);
4686 /* In ILP32, xmode can be either DImode or SImode. */
4687 gcc_assert (xmode
== DImode
|| xmode
== SImode
);
4689 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4690 BLKmode alignment. */
4691 if (GET_MODE_SIZE (mode
) == 0)
4694 offs
= low
% GET_MODE_SIZE (mode
);
4696 /* Align misaligned offset by adjusting high part to compensate. */
4699 if (aarch64_uimm12_shift (high
+ offs
))
4708 offs
= GET_MODE_SIZE (mode
) - offs
;
4710 high
= high
+ (low
& 0x1000) - offs
;
4715 /* Check for overflow. */
4716 if (high
+ low
!= val
)
4719 cst
= GEN_INT (high
);
4720 if (!aarch64_uimm12_shift (high
))
4721 cst
= force_const_mem (xmode
, cst
);
4723 /* Reload high part into base reg, leaving the low part
4724 in the mem instruction.
4725 Note that replacing this gen_rtx_PLUS with plus_constant is
4726 wrong in this case because we rely on the
4727 (plus (plus reg c1) c2) structure being preserved so that
4728 XEXP (*p, 0) in push_reload below uses the correct term. */
4729 x
= gen_rtx_PLUS (xmode
,
4730 gen_rtx_PLUS (xmode
, XEXP (x
, 0), cst
),
4733 push_reload (XEXP (x
, 0), NULL_RTX
, &XEXP (x
, 0), NULL
,
4734 BASE_REG_CLASS
, xmode
, VOIDmode
, 0, 0,
4735 opnum
, (enum reload_type
) type
);
4744 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
4747 secondary_reload_info
*sri
)
4749 /* Without the TARGET_SIMD instructions we cannot move a Q register
4750 to a Q register directly. We need a scratch. */
4751 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
4752 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
4753 && reg_class_subset_p (rclass
, FP_REGS
))
4756 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
4757 else if (mode
== TImode
)
4758 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
4762 /* A TFmode or TImode memory access should be handled via an FP_REGS
4763 because AArch64 has richer addressing modes for LDR/STR instructions
4764 than LDP/STP instructions. */
4765 if (!TARGET_GENERAL_REGS_ONLY
&& rclass
== GENERAL_REGS
4766 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
4769 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
4770 return GENERAL_REGS
;
4776 aarch64_can_eliminate (const int from
, const int to
)
4778 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4779 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4781 if (frame_pointer_needed
)
4783 if (from
== ARG_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
4785 if (from
== ARG_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
)
4787 if (from
== FRAME_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
4788 && !cfun
->calls_alloca
)
4790 if (from
== FRAME_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
4797 /* If we decided that we didn't need a leaf frame pointer but then used
4798 LR in the function, then we'll want a frame pointer after all, so
4799 prevent this elimination to ensure a frame pointer is used. */
4800 if (to
== STACK_POINTER_REGNUM
4801 && flag_omit_leaf_frame_pointer
4802 && df_regs_ever_live_p (LR_REGNUM
))
4810 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
4812 aarch64_layout_frame ();
4814 if (to
== HARD_FRAME_POINTER_REGNUM
)
4816 if (from
== ARG_POINTER_REGNUM
)
4817 return cfun
->machine
->frame
.frame_size
- crtl
->outgoing_args_size
;
4819 if (from
== FRAME_POINTER_REGNUM
)
4820 return (cfun
->machine
->frame
.hard_fp_offset
4821 - cfun
->machine
->frame
.saved_varargs_size
);
4824 if (to
== STACK_POINTER_REGNUM
)
4826 if (from
== FRAME_POINTER_REGNUM
)
4827 return (cfun
->machine
->frame
.frame_size
4828 - cfun
->machine
->frame
.saved_varargs_size
);
4831 return cfun
->machine
->frame
.frame_size
;
4834 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4838 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
4842 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
4847 aarch64_asm_trampoline_template (FILE *f
)
4851 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
4852 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
4856 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
4857 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
4859 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
4860 assemble_aligned_integer (4, const0_rtx
);
4861 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
4862 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
4866 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
4868 rtx fnaddr
, mem
, a_tramp
;
4869 const int tramp_code_sz
= 16;
4871 /* Don't need to copy the trailing D-words, we fill those in below. */
4872 emit_block_move (m_tramp
, assemble_trampoline_template (),
4873 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
4874 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
4875 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
4876 if (GET_MODE (fnaddr
) != ptr_mode
)
4877 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
4878 emit_move_insn (mem
, fnaddr
);
4880 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
4881 emit_move_insn (mem
, chain_value
);
4883 /* XXX We should really define a "clear_cache" pattern and use
4884 gen_clear_cache(). */
4885 a_tramp
= XEXP (m_tramp
, 0);
4886 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
4887 LCT_NORMAL
, VOIDmode
, 2, a_tramp
, ptr_mode
,
4888 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
4892 static unsigned char
4893 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
4897 case CALLER_SAVE_REGS
:
4904 aarch64_vector_mode_p (mode
) ? (GET_MODE_SIZE (mode
) + 15) / 16 :
4905 (GET_MODE_SIZE (mode
) + 7) / 8;
4919 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
4921 if (regclass
== POINTER_REGS
)
4922 return GENERAL_REGS
;
4924 if (regclass
== STACK_REG
)
4927 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
4933 /* If it's an integer immediate that MOVI can't handle, then
4934 FP_REGS is not an option, so we return NO_REGS instead. */
4935 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
4936 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
4939 /* Register eliminiation can result in a request for
4940 SP+constant->FP_REGS. We cannot support such operations which
4941 use SP as source and an FP_REG as destination, so reject out
4943 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
4945 rtx lhs
= XEXP (x
, 0);
4947 /* Look through a possible SUBREG introduced by ILP32. */
4948 if (GET_CODE (lhs
) == SUBREG
)
4949 lhs
= SUBREG_REG (lhs
);
4951 gcc_assert (REG_P (lhs
));
4952 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
4961 aarch64_asm_output_labelref (FILE* f
, const char *name
)
4963 asm_fprintf (f
, "%U%s", name
);
4967 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
4969 if (priority
== DEFAULT_INIT_PRIORITY
)
4970 default_ctor_section_asm_out_constructor (symbol
, priority
);
4975 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
4976 s
= get_section (buf
, SECTION_WRITE
, NULL
);
4977 switch_to_section (s
);
4978 assemble_align (POINTER_SIZE
);
4979 assemble_aligned_integer (POINTER_BYTES
, symbol
);
4984 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
4986 if (priority
== DEFAULT_INIT_PRIORITY
)
4987 default_dtor_section_asm_out_destructor (symbol
, priority
);
4992 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
4993 s
= get_section (buf
, SECTION_WRITE
, NULL
);
4994 switch_to_section (s
);
4995 assemble_align (POINTER_SIZE
);
4996 assemble_aligned_integer (POINTER_BYTES
, symbol
);
5001 aarch64_output_casesi (rtx
*operands
)
5005 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5007 static const char *const patterns
[4][2] =
5010 "ldrb\t%w3, [%0,%w1,uxtw]",
5011 "add\t%3, %4, %w3, sxtb #2"
5014 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5015 "add\t%3, %4, %w3, sxth #2"
5018 "ldr\t%w3, [%0,%w1,uxtw #2]",
5019 "add\t%3, %4, %w3, sxtw #2"
5021 /* We assume that DImode is only generated when not optimizing and
5022 that we don't really need 64-bit address offsets. That would
5023 imply an object file with 8GB of code in a single function! */
5025 "ldr\t%w3, [%0,%w1,uxtw #2]",
5026 "add\t%3, %4, %w3, sxtw #2"
5030 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
5032 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
5034 gcc_assert (index
>= 0 && index
<= 3);
5036 /* Need to implement table size reduction, by chaning the code below. */
5037 output_asm_insn (patterns
[index
][0], operands
);
5038 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
5039 snprintf (buf
, sizeof (buf
),
5040 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
5041 output_asm_insn (buf
, operands
);
5042 output_asm_insn (patterns
[index
][1], operands
);
5043 output_asm_insn ("br\t%3", operands
);
5044 assemble_label (asm_out_file
, label
);
5049 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5050 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5054 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
5056 if (shift
>= 0 && shift
<= 3)
5059 for (size
= 8; size
<= 32; size
*= 2)
5061 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
5062 if (mask
== bits
<< shift
)
5070 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED
,
5071 const_rtx x ATTRIBUTE_UNUSED
)
5073 /* We can't use blocks for constants when we're using a per-function
5079 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED
,
5080 rtx x ATTRIBUTE_UNUSED
,
5081 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED
)
5083 /* Force all constant pool entries into the current function section. */
5084 return function_section (current_function_decl
);
5090 /* Helper function for rtx cost calculation. Strip a shift expression
5091 from X. Returns the inner operand if successful, or the original
5092 expression on failure. */
5094 aarch64_strip_shift (rtx x
)
5098 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5099 we can convert both to ROR during final output. */
5100 if ((GET_CODE (op
) == ASHIFT
5101 || GET_CODE (op
) == ASHIFTRT
5102 || GET_CODE (op
) == LSHIFTRT
5103 || GET_CODE (op
) == ROTATERT
5104 || GET_CODE (op
) == ROTATE
)
5105 && CONST_INT_P (XEXP (op
, 1)))
5106 return XEXP (op
, 0);
5108 if (GET_CODE (op
) == MULT
5109 && CONST_INT_P (XEXP (op
, 1))
5110 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
5111 return XEXP (op
, 0);
5116 /* Helper function for rtx cost calculation. Strip an extend
5117 expression from X. Returns the inner operand if successful, or the
5118 original expression on failure. We deal with a number of possible
5119 canonicalization variations here. */
5121 aarch64_strip_extend (rtx x
)
5125 /* Zero and sign extraction of a widened value. */
5126 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
5127 && XEXP (op
, 2) == const0_rtx
5128 && GET_CODE (XEXP (op
, 0)) == MULT
5129 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
5131 return XEXP (XEXP (op
, 0), 0);
5133 /* It can also be represented (for zero-extend) as an AND with an
5135 if (GET_CODE (op
) == AND
5136 && GET_CODE (XEXP (op
, 0)) == MULT
5137 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
5138 && CONST_INT_P (XEXP (op
, 1))
5139 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
5140 INTVAL (XEXP (op
, 1))) != 0)
5141 return XEXP (XEXP (op
, 0), 0);
5143 /* Now handle extended register, as this may also have an optional
5144 left shift by 1..4. */
5145 if (GET_CODE (op
) == ASHIFT
5146 && CONST_INT_P (XEXP (op
, 1))
5147 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
5150 if (GET_CODE (op
) == ZERO_EXTEND
5151 || GET_CODE (op
) == SIGN_EXTEND
)
5160 /* Helper function for rtx cost calculation. Calculate the cost of
5161 a MULT, which may be part of a multiply-accumulate rtx. Return
5162 the calculated cost of the expression, recursing manually in to
5163 operands where needed. */
5166 aarch64_rtx_mult_cost (rtx x
, int code
, int outer
, bool speed
)
5169 const struct cpu_cost_table
*extra_cost
5170 = aarch64_tune_params
->insn_extra_cost
;
5172 bool maybe_fma
= (outer
== PLUS
|| outer
== MINUS
);
5173 machine_mode mode
= GET_MODE (x
);
5175 gcc_checking_assert (code
== MULT
);
5180 if (VECTOR_MODE_P (mode
))
5181 mode
= GET_MODE_INNER (mode
);
5183 /* Integer multiply/fma. */
5184 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5186 /* The multiply will be canonicalized as a shift, cost it as such. */
5187 if (CONST_INT_P (op1
)
5188 && exact_log2 (INTVAL (op1
)) > 0)
5193 /* ADD (shifted register). */
5194 cost
+= extra_cost
->alu
.arith_shift
;
5196 /* LSL (immediate). */
5197 cost
+= extra_cost
->alu
.shift
;
5200 cost
+= rtx_cost (op0
, GET_CODE (op0
), 0, speed
);
5205 /* Integer multiplies or FMAs have zero/sign extending variants. */
5206 if ((GET_CODE (op0
) == ZERO_EXTEND
5207 && GET_CODE (op1
) == ZERO_EXTEND
)
5208 || (GET_CODE (op0
) == SIGN_EXTEND
5209 && GET_CODE (op1
) == SIGN_EXTEND
))
5211 cost
+= rtx_cost (XEXP (op0
, 0), MULT
, 0, speed
)
5212 + rtx_cost (XEXP (op1
, 0), MULT
, 1, speed
);
5217 /* MADD/SMADDL/UMADDL. */
5218 cost
+= extra_cost
->mult
[0].extend_add
;
5220 /* MUL/SMULL/UMULL. */
5221 cost
+= extra_cost
->mult
[0].extend
;
5227 /* This is either an integer multiply or an FMA. In both cases
5228 we want to recurse and cost the operands. */
5229 cost
+= rtx_cost (op0
, MULT
, 0, speed
)
5230 + rtx_cost (op1
, MULT
, 1, speed
);
5236 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
5239 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
5248 /* Floating-point FMA/FMUL can also support negations of the
5250 if (GET_CODE (op0
) == NEG
)
5251 op0
= XEXP (op0
, 0);
5252 if (GET_CODE (op1
) == NEG
)
5253 op1
= XEXP (op1
, 0);
5256 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5257 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
5260 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
5263 cost
+= rtx_cost (op0
, MULT
, 0, speed
)
5264 + rtx_cost (op1
, MULT
, 1, speed
);
5270 aarch64_address_cost (rtx x
,
5272 addr_space_t as ATTRIBUTE_UNUSED
,
5275 enum rtx_code c
= GET_CODE (x
);
5276 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
->addr_cost
;
5277 struct aarch64_address_info info
;
5281 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
5283 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
5285 /* This is a CONST or SYMBOL ref which will be split
5286 in a different way depending on the code model in use.
5287 Cost it through the generic infrastructure. */
5288 int cost_symbol_ref
= rtx_cost (x
, MEM
, 1, speed
);
5289 /* Divide through by the cost of one instruction to
5290 bring it to the same units as the address costs. */
5291 cost_symbol_ref
/= COSTS_N_INSNS (1);
5292 /* The cost is then the cost of preparing the address,
5293 followed by an immediate (possibly 0) offset. */
5294 return cost_symbol_ref
+ addr_cost
->imm_offset
;
5298 /* This is most likely a jump table from a case
5300 return addr_cost
->register_offset
;
5306 case ADDRESS_LO_SUM
:
5307 case ADDRESS_SYMBOLIC
:
5308 case ADDRESS_REG_IMM
:
5309 cost
+= addr_cost
->imm_offset
;
5312 case ADDRESS_REG_WB
:
5313 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
5314 cost
+= addr_cost
->pre_modify
;
5315 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
5316 cost
+= addr_cost
->post_modify
;
5322 case ADDRESS_REG_REG
:
5323 cost
+= addr_cost
->register_offset
;
5326 case ADDRESS_REG_UXTW
:
5327 case ADDRESS_REG_SXTW
:
5328 cost
+= addr_cost
->register_extend
;
5338 /* For the sake of calculating the cost of the shifted register
5339 component, we can treat same sized modes in the same way. */
5340 switch (GET_MODE_BITSIZE (mode
))
5343 cost
+= addr_cost
->addr_scale_costs
.hi
;
5347 cost
+= addr_cost
->addr_scale_costs
.si
;
5351 cost
+= addr_cost
->addr_scale_costs
.di
;
5354 /* We can't tell, or this is a 128-bit vector. */
5356 cost
+= addr_cost
->addr_scale_costs
.ti
;
5364 /* Return true if the RTX X in mode MODE is a zero or sign extract
5365 usable in an ADD or SUB (extended register) instruction. */
5367 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
5369 /* Catch add with a sign extract.
5370 This is add_<optab><mode>_multp2. */
5371 if (GET_CODE (x
) == SIGN_EXTRACT
5372 || GET_CODE (x
) == ZERO_EXTRACT
)
5374 rtx op0
= XEXP (x
, 0);
5375 rtx op1
= XEXP (x
, 1);
5376 rtx op2
= XEXP (x
, 2);
5378 if (GET_CODE (op0
) == MULT
5379 && CONST_INT_P (op1
)
5380 && op2
== const0_rtx
5381 && CONST_INT_P (XEXP (op0
, 1))
5382 && aarch64_is_extend_from_extract (mode
,
5394 aarch64_frint_unspec_p (unsigned int u
)
5412 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5413 storing it in *COST. Result is true if the total cost of the operation
5414 has now been calculated. */
5416 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
5420 enum rtx_code cmpcode
;
5422 if (COMPARISON_P (op0
))
5424 inner
= XEXP (op0
, 0);
5425 comparator
= XEXP (op0
, 1);
5426 cmpcode
= GET_CODE (op0
);
5431 comparator
= const0_rtx
;
5435 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
5437 /* Conditional branch. */
5438 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
5442 if (cmpcode
== NE
|| cmpcode
== EQ
)
5444 if (comparator
== const0_rtx
)
5446 /* TBZ/TBNZ/CBZ/CBNZ. */
5447 if (GET_CODE (inner
) == ZERO_EXTRACT
)
5449 *cost
+= rtx_cost (XEXP (inner
, 0), ZERO_EXTRACT
,
5453 *cost
+= rtx_cost (inner
, cmpcode
, 0, speed
);
5458 else if (cmpcode
== LT
|| cmpcode
== GE
)
5461 if (comparator
== const0_rtx
)
5466 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
5468 /* It's a conditional operation based on the status flags,
5469 so it must be some flavor of CSEL. */
5471 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5472 if (GET_CODE (op1
) == NEG
5473 || GET_CODE (op1
) == NOT
5474 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
5475 op1
= XEXP (op1
, 0);
5477 *cost
+= rtx_cost (op1
, IF_THEN_ELSE
, 1, speed
);
5478 *cost
+= rtx_cost (op2
, IF_THEN_ELSE
, 2, speed
);
5482 /* We don't know what this is, cost all operands. */
5486 /* Calculate the cost of calculating X, storing it in *COST. Result
5487 is true if the total cost of the operation has now been calculated. */
5489 aarch64_rtx_costs (rtx x
, int code
, int outer ATTRIBUTE_UNUSED
,
5490 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
5493 const struct cpu_cost_table
*extra_cost
5494 = aarch64_tune_params
->insn_extra_cost
;
5495 machine_mode mode
= GET_MODE (x
);
5497 /* By default, assume that everything has equivalent cost to the
5498 cheapest instruction. Any additional costs are applied as a delta
5499 above this default. */
5500 *cost
= COSTS_N_INSNS (1);
5502 /* TODO: The cost infrastructure currently does not handle
5503 vector operations. Assume that all vector operations
5504 are equally expensive. */
5505 if (VECTOR_MODE_P (mode
))
5508 *cost
+= extra_cost
->vect
.alu
;
5515 /* The cost depends entirely on the operands to SET. */
5520 switch (GET_CODE (op0
))
5525 rtx address
= XEXP (op0
, 0);
5526 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5527 *cost
+= extra_cost
->ldst
.store
;
5528 else if (mode
== SFmode
)
5529 *cost
+= extra_cost
->ldst
.storef
;
5530 else if (mode
== DFmode
)
5531 *cost
+= extra_cost
->ldst
.stored
;
5534 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
5538 *cost
+= rtx_cost (op1
, SET
, 1, speed
);
5542 if (! REG_P (SUBREG_REG (op0
)))
5543 *cost
+= rtx_cost (SUBREG_REG (op0
), SET
, 0, speed
);
5547 /* const0_rtx is in general free, but we will use an
5548 instruction to set a register to 0. */
5549 if (REG_P (op1
) || op1
== const0_rtx
)
5551 /* The cost is 1 per register copied. */
5552 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
5554 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
5557 /* Cost is just the cost of the RHS of the set. */
5558 *cost
+= rtx_cost (op1
, SET
, 1, speed
);
5563 /* Bit-field insertion. Strip any redundant widening of
5564 the RHS to meet the width of the target. */
5565 if (GET_CODE (op1
) == SUBREG
)
5566 op1
= SUBREG_REG (op1
);
5567 if ((GET_CODE (op1
) == ZERO_EXTEND
5568 || GET_CODE (op1
) == SIGN_EXTEND
)
5569 && CONST_INT_P (XEXP (op0
, 1))
5570 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1
, 0)))
5571 >= INTVAL (XEXP (op0
, 1))))
5572 op1
= XEXP (op1
, 0);
5574 if (CONST_INT_P (op1
))
5576 /* MOV immediate is assumed to always be cheap. */
5577 *cost
= COSTS_N_INSNS (1);
5583 *cost
+= extra_cost
->alu
.bfi
;
5584 *cost
+= rtx_cost (op1
, (enum rtx_code
) code
, 1, speed
);
5590 /* We can't make sense of this, assume default cost. */
5591 *cost
= COSTS_N_INSNS (1);
5597 /* If an instruction can incorporate a constant within the
5598 instruction, the instruction's expression avoids calling
5599 rtx_cost() on the constant. If rtx_cost() is called on a
5600 constant, then it is usually because the constant must be
5601 moved into a register by one or more instructions.
5603 The exception is constant 0, which can be expressed
5604 as XZR/WZR and is therefore free. The exception to this is
5605 if we have (set (reg) (const0_rtx)) in which case we must cost
5606 the move. However, we can catch that when we cost the SET, so
5607 we don't need to consider that here. */
5608 if (x
== const0_rtx
)
5612 /* To an approximation, building any other constant is
5613 proportionally expensive to the number of instructions
5614 required to build that constant. This is true whether we
5615 are compiling for SPEED or otherwise. */
5616 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
5617 (NULL_RTX
, x
, false, mode
));
5624 /* mov[df,sf]_aarch64. */
5625 if (aarch64_float_const_representable_p (x
))
5626 /* FMOV (scalar immediate). */
5627 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
5628 else if (!aarch64_float_const_zero_rtx_p (x
))
5630 /* This will be a load from memory. */
5632 *cost
+= extra_cost
->ldst
.loadd
;
5634 *cost
+= extra_cost
->ldst
.loadf
;
5637 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5638 or MOV v0.s[0], wzr - neither of which are modeled by the
5639 cost tables. Just use the default cost. */
5649 /* For loads we want the base cost of a load, plus an
5650 approximation for the additional cost of the addressing
5652 rtx address
= XEXP (x
, 0);
5653 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5654 *cost
+= extra_cost
->ldst
.load
;
5655 else if (mode
== SFmode
)
5656 *cost
+= extra_cost
->ldst
.loadf
;
5657 else if (mode
== DFmode
)
5658 *cost
+= extra_cost
->ldst
.loadd
;
5661 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
5670 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
5672 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
5673 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
5676 *cost
+= rtx_cost (XEXP (op0
, 0), NEG
, 0, speed
);
5680 /* Cost this as SUB wzr, X. */
5681 op0
= CONST0_RTX (GET_MODE (x
));
5686 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
5688 /* Support (neg(fma...)) as a single instruction only if
5689 sign of zeros is unimportant. This matches the decision
5690 making in aarch64.md. */
5691 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
5694 *cost
= rtx_cost (op0
, NEG
, 0, speed
);
5699 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
5708 *cost
+= extra_cost
->alu
.clz
;
5716 if (op1
== const0_rtx
5717 && GET_CODE (op0
) == AND
)
5723 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
5725 /* TODO: A write to the CC flags possibly costs extra, this
5726 needs encoding in the cost tables. */
5728 /* CC_ZESWPmode supports zero extend for free. */
5729 if (GET_MODE (x
) == CC_ZESWPmode
&& GET_CODE (op0
) == ZERO_EXTEND
)
5730 op0
= XEXP (op0
, 0);
5733 if (GET_CODE (op0
) == AND
)
5739 if (GET_CODE (op0
) == PLUS
)
5741 /* ADDS (and CMN alias). */
5746 if (GET_CODE (op0
) == MINUS
)
5753 if (GET_CODE (op1
) == NEG
)
5757 *cost
+= extra_cost
->alu
.arith
;
5759 *cost
+= rtx_cost (op0
, COMPARE
, 0, speed
);
5760 *cost
+= rtx_cost (XEXP (op1
, 0), NEG
, 1, speed
);
5766 Compare can freely swap the order of operands, and
5767 canonicalization puts the more complex operation first.
5768 But the integer MINUS logic expects the shift/extend
5769 operation in op1. */
5771 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
5779 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
5783 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
5785 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
5787 /* FCMP supports constant 0.0 for no extra cost. */
5801 /* Detect valid immediates. */
5802 if ((GET_MODE_CLASS (mode
) == MODE_INT
5803 || (GET_MODE_CLASS (mode
) == MODE_CC
5804 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
5805 && CONST_INT_P (op1
)
5806 && aarch64_uimm12_shift (INTVAL (op1
)))
5808 *cost
+= rtx_cost (op0
, MINUS
, 0, speed
);
5811 /* SUB(S) (immediate). */
5812 *cost
+= extra_cost
->alu
.arith
;
5817 /* Look for SUB (extended register). */
5818 if (aarch64_rtx_arith_op_extract_p (op1
, mode
))
5821 *cost
+= extra_cost
->alu
.arith_shift
;
5823 *cost
+= rtx_cost (XEXP (XEXP (op1
, 0), 0),
5824 (enum rtx_code
) GET_CODE (op1
),
5829 rtx new_op1
= aarch64_strip_extend (op1
);
5831 /* Cost this as an FMA-alike operation. */
5832 if ((GET_CODE (new_op1
) == MULT
5833 || GET_CODE (new_op1
) == ASHIFT
)
5836 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
5837 (enum rtx_code
) code
,
5839 *cost
+= rtx_cost (op0
, MINUS
, 0, speed
);
5843 *cost
+= rtx_cost (new_op1
, MINUS
, 1, speed
);
5847 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5849 *cost
+= extra_cost
->alu
.arith
;
5850 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
5852 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
5865 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
5866 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
5869 *cost
+= rtx_cost (XEXP (op0
, 0), PLUS
, 0, speed
);
5870 *cost
+= rtx_cost (op1
, PLUS
, 1, speed
);
5874 if (GET_MODE_CLASS (mode
) == MODE_INT
5875 && CONST_INT_P (op1
)
5876 && aarch64_uimm12_shift (INTVAL (op1
)))
5878 *cost
+= rtx_cost (op0
, PLUS
, 0, speed
);
5881 /* ADD (immediate). */
5882 *cost
+= extra_cost
->alu
.arith
;
5886 /* Look for ADD (extended register). */
5887 if (aarch64_rtx_arith_op_extract_p (op0
, mode
))
5890 *cost
+= extra_cost
->alu
.arith_shift
;
5892 *cost
+= rtx_cost (XEXP (XEXP (op0
, 0), 0),
5893 (enum rtx_code
) GET_CODE (op0
),
5898 /* Strip any extend, leave shifts behind as we will
5899 cost them through mult_cost. */
5900 new_op0
= aarch64_strip_extend (op0
);
5902 if (GET_CODE (new_op0
) == MULT
5903 || GET_CODE (new_op0
) == ASHIFT
)
5905 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
5907 *cost
+= rtx_cost (op1
, PLUS
, 1, speed
);
5911 *cost
+= (rtx_cost (new_op0
, PLUS
, 0, speed
)
5912 + rtx_cost (op1
, PLUS
, 1, speed
));
5916 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5918 *cost
+= extra_cost
->alu
.arith
;
5919 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
5921 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
5927 *cost
= COSTS_N_INSNS (1);
5930 *cost
+= extra_cost
->alu
.rev
;
5935 if (aarch_rev16_p (x
))
5937 *cost
= COSTS_N_INSNS (1);
5940 *cost
+= extra_cost
->alu
.rev
;
5952 && GET_CODE (op0
) == MULT
5953 && CONST_INT_P (XEXP (op0
, 1))
5954 && CONST_INT_P (op1
)
5955 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
5958 /* This is a UBFM/SBFM. */
5959 *cost
+= rtx_cost (XEXP (op0
, 0), ZERO_EXTRACT
, 0, speed
);
5961 *cost
+= extra_cost
->alu
.bfx
;
5965 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
5967 /* We possibly get the immediate for free, this is not
5969 if (CONST_INT_P (op1
)
5970 && aarch64_bitmask_imm (INTVAL (op1
), GET_MODE (x
)))
5972 *cost
+= rtx_cost (op0
, (enum rtx_code
) code
, 0, speed
);
5975 *cost
+= extra_cost
->alu
.logical
;
5983 /* Handle ORN, EON, or BIC. */
5984 if (GET_CODE (op0
) == NOT
)
5985 op0
= XEXP (op0
, 0);
5987 new_op0
= aarch64_strip_shift (op0
);
5989 /* If we had a shift on op0 then this is a logical-shift-
5990 by-register/immediate operation. Otherwise, this is just
5991 a logical operation. */
5996 /* Shift by immediate. */
5997 if (CONST_INT_P (XEXP (op0
, 1)))
5998 *cost
+= extra_cost
->alu
.log_shift
;
6000 *cost
+= extra_cost
->alu
.log_shift_reg
;
6003 *cost
+= extra_cost
->alu
.logical
;
6006 /* In both cases we want to cost both operands. */
6007 *cost
+= rtx_cost (new_op0
, (enum rtx_code
) code
, 0, speed
)
6008 + rtx_cost (op1
, (enum rtx_code
) code
, 1, speed
);
6018 *cost
+= extra_cost
->alu
.logical
;
6020 /* The logical instruction could have the shifted register form,
6021 but the cost is the same if the shift is processed as a separate
6022 instruction, so we don't bother with it here. */
6028 /* If a value is written in SI mode, then zero extended to DI
6029 mode, the operation will in general be free as a write to
6030 a 'w' register implicitly zeroes the upper bits of an 'x'
6031 register. However, if this is
6033 (set (reg) (zero_extend (reg)))
6035 we must cost the explicit register move. */
6037 && GET_MODE (op0
) == SImode
6040 int op_cost
= rtx_cost (XEXP (x
, 0), ZERO_EXTEND
, 0, speed
);
6042 if (!op_cost
&& speed
)
6044 *cost
+= extra_cost
->alu
.extend
;
6046 /* Free, the cost is that of the SI mode operation. */
6051 else if (MEM_P (XEXP (x
, 0)))
6053 /* All loads can zero extend to any size for free. */
6054 *cost
= rtx_cost (XEXP (x
, 0), ZERO_EXTEND
, param
, speed
);
6060 *cost
+= extra_cost
->alu
.extend
;
6065 if (MEM_P (XEXP (x
, 0)))
6070 rtx address
= XEXP (XEXP (x
, 0), 0);
6071 *cost
+= extra_cost
->ldst
.load_sign_extend
;
6074 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6081 *cost
+= extra_cost
->alu
.extend
;
6088 if (CONST_INT_P (op1
))
6090 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
6093 *cost
+= extra_cost
->alu
.shift
;
6095 /* We can incorporate zero/sign extend for free. */
6096 if (GET_CODE (op0
) == ZERO_EXTEND
6097 || GET_CODE (op0
) == SIGN_EXTEND
)
6098 op0
= XEXP (op0
, 0);
6100 *cost
+= rtx_cost (op0
, ASHIFT
, 0, speed
);
6107 *cost
+= extra_cost
->alu
.shift_reg
;
6109 return false; /* All arguments need to be in registers. */
6119 if (CONST_INT_P (op1
))
6121 /* ASR (immediate) and friends. */
6123 *cost
+= extra_cost
->alu
.shift
;
6125 *cost
+= rtx_cost (op0
, (enum rtx_code
) code
, 0, speed
);
6131 /* ASR (register) and friends. */
6133 *cost
+= extra_cost
->alu
.shift_reg
;
6135 return false; /* All arguments need to be in registers. */
6140 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
)
6144 *cost
+= extra_cost
->ldst
.load
;
6146 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
6147 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
6149 /* ADRP, followed by ADD. */
6150 *cost
+= COSTS_N_INSNS (1);
6152 *cost
+= 2 * extra_cost
->alu
.arith
;
6154 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
6155 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
6159 *cost
+= extra_cost
->alu
.arith
;
6164 /* One extra load instruction, after accessing the GOT. */
6165 *cost
+= COSTS_N_INSNS (1);
6167 *cost
+= extra_cost
->ldst
.load
;
6173 /* ADRP/ADD (immediate). */
6175 *cost
+= extra_cost
->alu
.arith
;
6182 *cost
+= extra_cost
->alu
.bfx
;
6184 /* We can trust that the immediates used will be correct (there
6185 are no by-register forms), so we need only cost op0. */
6186 *cost
+= rtx_cost (XEXP (x
, 0), (enum rtx_code
) code
, 0, speed
);
6190 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
6191 /* aarch64_rtx_mult_cost always handles recursion to its
6199 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
6200 *cost
+= (extra_cost
->mult
[GET_MODE (x
) == DImode
].add
6201 + extra_cost
->mult
[GET_MODE (x
) == DImode
].idiv
);
6202 else if (GET_MODE (x
) == DFmode
)
6203 *cost
+= (extra_cost
->fp
[1].mult
6204 + extra_cost
->fp
[1].div
);
6205 else if (GET_MODE (x
) == SFmode
)
6206 *cost
+= (extra_cost
->fp
[0].mult
6207 + extra_cost
->fp
[0].div
);
6209 return false; /* All arguments need to be in registers. */
6216 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6217 /* There is no integer SQRT, so only DIV and UDIV can get
6219 *cost
+= extra_cost
->mult
[mode
== DImode
].idiv
;
6221 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
6223 return false; /* All arguments need to be in registers. */
6226 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
6227 XEXP (x
, 2), cost
, speed
);
6240 return false; /* All arguments must be in registers. */
6248 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6250 /* FMSUB, FNMADD, and FNMSUB are free. */
6251 if (GET_CODE (op0
) == NEG
)
6252 op0
= XEXP (op0
, 0);
6254 if (GET_CODE (op2
) == NEG
)
6255 op2
= XEXP (op2
, 0);
6257 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6258 and the by-element operand as operand 0. */
6259 if (GET_CODE (op1
) == NEG
)
6260 op1
= XEXP (op1
, 0);
6262 /* Catch vector-by-element operations. The by-element operand can
6263 either be (vec_duplicate (vec_select (x))) or just
6264 (vec_select (x)), depending on whether we are multiplying by
6265 a vector or a scalar.
6267 Canonicalization is not very good in these cases, FMA4 will put the
6268 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6269 if (GET_CODE (op0
) == VEC_DUPLICATE
)
6270 op0
= XEXP (op0
, 0);
6271 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
6272 op1
= XEXP (op1
, 0);
6274 if (GET_CODE (op0
) == VEC_SELECT
)
6275 op0
= XEXP (op0
, 0);
6276 else if (GET_CODE (op1
) == VEC_SELECT
)
6277 op1
= XEXP (op1
, 0);
6279 /* If the remaining parameters are not registers,
6280 get the cost to put them into registers. */
6281 *cost
+= rtx_cost (op0
, FMA
, 0, speed
);
6282 *cost
+= rtx_cost (op1
, FMA
, 1, speed
);
6283 *cost
+= rtx_cost (op2
, FMA
, 2, speed
);
6288 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
6291 case FLOAT_TRUNCATE
:
6293 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
6299 /* Strip the rounding part. They will all be implemented
6300 by the fcvt* family of instructions anyway. */
6301 if (GET_CODE (x
) == UNSPEC
)
6303 unsigned int uns_code
= XINT (x
, 1);
6305 if (uns_code
== UNSPEC_FRINTA
6306 || uns_code
== UNSPEC_FRINTM
6307 || uns_code
== UNSPEC_FRINTN
6308 || uns_code
== UNSPEC_FRINTP
6309 || uns_code
== UNSPEC_FRINTZ
)
6310 x
= XVECEXP (x
, 0, 0);
6314 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
6316 *cost
+= rtx_cost (x
, (enum rtx_code
) code
, 0, speed
);
6320 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6322 /* FABS and FNEG are analogous. */
6324 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
6328 /* Integer ABS will either be split to
6329 two arithmetic instructions, or will be an ABS
6330 (scalar), which we don't model. */
6331 *cost
= COSTS_N_INSNS (2);
6333 *cost
+= 2 * extra_cost
->alu
.arith
;
6341 /* FMAXNM/FMINNM/FMAX/FMIN.
6342 TODO: This may not be accurate for all implementations, but
6343 we do not model this in the cost tables. */
6344 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
6349 /* The floating point round to integer frint* instructions. */
6350 if (aarch64_frint_unspec_p (XINT (x
, 1)))
6353 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
6358 if (XINT (x
, 1) == UNSPEC_RBIT
)
6361 *cost
+= extra_cost
->alu
.rev
;
6369 /* Decompose <su>muldi3_highpart. */
6370 if (/* (truncate:DI */
6373 && GET_MODE (XEXP (x
, 0)) == TImode
6374 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
6376 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
6377 /* (ANY_EXTEND:TI (reg:DI))
6378 (ANY_EXTEND:TI (reg:DI))) */
6379 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
6380 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
6381 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
6382 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
6383 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
6384 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
6385 /* (const_int 64) */
6386 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6387 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
6391 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
6392 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
6394 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
6404 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
6406 "\nFailed to cost RTX. Assuming default cost.\n");
6411 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6412 calculated for X. This cost is stored in *COST. Returns true
6413 if the total cost of X was calculated. */
6415 aarch64_rtx_costs_wrapper (rtx x
, int code
, int outer
,
6416 int param
, int *cost
, bool speed
)
6418 bool result
= aarch64_rtx_costs (x
, code
, outer
, param
, cost
, speed
);
6420 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
6422 print_rtl_single (dump_file
, x
);
6423 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
6424 speed
? "Hot" : "Cold",
6425 *cost
, result
? "final" : "partial");
6432 aarch64_register_move_cost (machine_mode mode
,
6433 reg_class_t from_i
, reg_class_t to_i
)
6435 enum reg_class from
= (enum reg_class
) from_i
;
6436 enum reg_class to
= (enum reg_class
) to_i
;
6437 const struct cpu_regmove_cost
*regmove_cost
6438 = aarch64_tune_params
->regmove_cost
;
6440 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6441 if (to
== CALLER_SAVE_REGS
|| to
== POINTER_REGS
)
6444 if (from
== CALLER_SAVE_REGS
|| from
== POINTER_REGS
)
6445 from
= GENERAL_REGS
;
6447 /* Moving between GPR and stack cost is the same as GP2GP. */
6448 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
6449 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
6450 return regmove_cost
->GP2GP
;
6452 /* To/From the stack register, we move via the gprs. */
6453 if (to
== STACK_REG
|| from
== STACK_REG
)
6454 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
6455 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
6457 if (GET_MODE_SIZE (mode
) == 16)
6459 /* 128-bit operations on general registers require 2 instructions. */
6460 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
6461 return regmove_cost
->GP2GP
* 2;
6462 else if (from
== GENERAL_REGS
)
6463 return regmove_cost
->GP2FP
* 2;
6464 else if (to
== GENERAL_REGS
)
6465 return regmove_cost
->FP2GP
* 2;
6467 /* When AdvSIMD instructions are disabled it is not possible to move
6468 a 128-bit value directly between Q registers. This is handled in
6469 secondary reload. A general register is used as a scratch to move
6470 the upper DI value and the lower DI value is moved directly,
6471 hence the cost is the sum of three moves. */
6473 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
6475 return regmove_cost
->FP2FP
;
6478 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
6479 return regmove_cost
->GP2GP
;
6480 else if (from
== GENERAL_REGS
)
6481 return regmove_cost
->GP2FP
;
6482 else if (to
== GENERAL_REGS
)
6483 return regmove_cost
->FP2GP
;
6485 return regmove_cost
->FP2FP
;
6489 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
6490 reg_class_t rclass ATTRIBUTE_UNUSED
,
6491 bool in ATTRIBUTE_UNUSED
)
6493 return aarch64_tune_params
->memmov_cost
;
6496 /* Return the number of instructions that can be issued per cycle. */
6498 aarch64_sched_issue_rate (void)
6500 return aarch64_tune_params
->issue_rate
;
6504 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6506 int issue_rate
= aarch64_sched_issue_rate ();
6508 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
6511 /* Vectorizer cost model target hooks. */
6513 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6515 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
6517 int misalign ATTRIBUTE_UNUSED
)
6521 switch (type_of_cost
)
6524 return aarch64_tune_params
->vec_costs
->scalar_stmt_cost
;
6527 return aarch64_tune_params
->vec_costs
->scalar_load_cost
;
6530 return aarch64_tune_params
->vec_costs
->scalar_store_cost
;
6533 return aarch64_tune_params
->vec_costs
->vec_stmt_cost
;
6536 return aarch64_tune_params
->vec_costs
->vec_align_load_cost
;
6539 return aarch64_tune_params
->vec_costs
->vec_store_cost
;
6542 return aarch64_tune_params
->vec_costs
->vec_to_scalar_cost
;
6545 return aarch64_tune_params
->vec_costs
->scalar_to_vec_cost
;
6547 case unaligned_load
:
6548 return aarch64_tune_params
->vec_costs
->vec_unalign_load_cost
;
6550 case unaligned_store
:
6551 return aarch64_tune_params
->vec_costs
->vec_unalign_store_cost
;
6553 case cond_branch_taken
:
6554 return aarch64_tune_params
->vec_costs
->cond_taken_branch_cost
;
6556 case cond_branch_not_taken
:
6557 return aarch64_tune_params
->vec_costs
->cond_not_taken_branch_cost
;
6560 case vec_promote_demote
:
6561 return aarch64_tune_params
->vec_costs
->vec_stmt_cost
;
6564 elements
= TYPE_VECTOR_SUBPARTS (vectype
);
6565 return elements
/ 2 + 1;
6572 /* Implement targetm.vectorize.add_stmt_cost. */
6574 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
6575 struct _stmt_vec_info
*stmt_info
, int misalign
,
6576 enum vect_cost_model_location where
)
6578 unsigned *cost
= (unsigned *) data
;
6579 unsigned retval
= 0;
6581 if (flag_vect_cost_model
)
6583 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
6585 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
6587 /* Statements in an inner loop relative to the loop being
6588 vectorized are weighted more heavily. The value here is
6589 a function (linear for now) of the loop nest level. */
6590 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
6592 loop_vec_info loop_info
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6593 struct loop
*loop
= LOOP_VINFO_LOOP (loop_info
);
6594 unsigned nest_level
= loop_depth (loop
);
6596 count
*= nest_level
;
6599 retval
= (unsigned) (count
* stmt_cost
);
6600 cost
[where
] += retval
;
6606 static void initialize_aarch64_code_model (void);
6608 /* Parse the architecture extension string. */
6611 aarch64_parse_extension (char *str
)
6613 /* The extension string is parsed left to right. */
6614 const struct aarch64_option_extension
*opt
= NULL
;
6616 /* Flag to say whether we are adding or removing an extension. */
6617 int adding_ext
= -1;
6619 while (str
!= NULL
&& *str
!= 0)
6625 ext
= strchr (str
, '+');
6632 if (len
>= 2 && strncmp (str
, "no", 2) == 0)
6643 error ("missing feature modifier after %qs", adding_ext
? "+"
6648 /* Scan over the extensions table trying to find an exact match. */
6649 for (opt
= all_extensions
; opt
->name
!= NULL
; opt
++)
6651 if (strlen (opt
->name
) == len
&& strncmp (opt
->name
, str
, len
) == 0)
6653 /* Add or remove the extension. */
6655 aarch64_isa_flags
|= opt
->flags_on
;
6657 aarch64_isa_flags
&= ~(opt
->flags_off
);
6662 if (opt
->name
== NULL
)
6664 /* Extension not found in list. */
6665 error ("unknown feature modifier %qs", str
);
6675 /* Parse the ARCH string. */
6678 aarch64_parse_arch (void)
6681 const struct processor
*arch
;
6682 char *str
= (char *) alloca (strlen (aarch64_arch_string
) + 1);
6685 strcpy (str
, aarch64_arch_string
);
6687 ext
= strchr (str
, '+');
6696 error ("missing arch name in -march=%qs", str
);
6700 /* Loop through the list of supported ARCHs to find a match. */
6701 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
6703 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
6705 selected_arch
= arch
;
6706 aarch64_isa_flags
= selected_arch
->flags
;
6709 selected_cpu
= &all_cores
[selected_arch
->core
];
6713 /* ARCH string contains at least one extension. */
6714 aarch64_parse_extension (ext
);
6717 if (strcmp (selected_arch
->arch
, selected_cpu
->arch
))
6719 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6720 selected_cpu
->name
, selected_arch
->name
);
6727 /* ARCH name not found in list. */
6728 error ("unknown value %qs for -march", str
);
6732 /* Parse the CPU string. */
6735 aarch64_parse_cpu (void)
6738 const struct processor
*cpu
;
6739 char *str
= (char *) alloca (strlen (aarch64_cpu_string
) + 1);
6742 strcpy (str
, aarch64_cpu_string
);
6744 ext
= strchr (str
, '+');
6753 error ("missing cpu name in -mcpu=%qs", str
);
6757 /* Loop through the list of supported CPUs to find a match. */
6758 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
6760 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
6763 aarch64_isa_flags
= selected_cpu
->flags
;
6767 /* CPU string contains at least one extension. */
6768 aarch64_parse_extension (ext
);
6775 /* CPU name not found in list. */
6776 error ("unknown value %qs for -mcpu", str
);
6780 /* Parse the TUNE string. */
6783 aarch64_parse_tune (void)
6785 const struct processor
*cpu
;
6786 char *str
= (char *) alloca (strlen (aarch64_tune_string
) + 1);
6787 strcpy (str
, aarch64_tune_string
);
6789 /* Loop through the list of supported CPUs to find a match. */
6790 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
6792 if (strcmp (cpu
->name
, str
) == 0)
6794 selected_tune
= cpu
;
6799 /* CPU name not found in list. */
6800 error ("unknown value %qs for -mtune", str
);
6805 /* Implement TARGET_OPTION_OVERRIDE. */
6808 aarch64_override_options (void)
6810 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6811 If either of -march or -mtune is given, they override their
6812 respective component of -mcpu.
6814 So, first parse AARCH64_CPU_STRING, then the others, be careful
6815 with -march as, if -mcpu is not present on the command line, march
6816 must set a sensible default CPU. */
6817 if (aarch64_cpu_string
)
6819 aarch64_parse_cpu ();
6822 if (aarch64_arch_string
)
6824 aarch64_parse_arch ();
6827 if (aarch64_tune_string
)
6829 aarch64_parse_tune ();
6832 #ifndef HAVE_AS_MABI_OPTION
6833 /* The compiler may have been configured with 2.23.* binutils, which does
6834 not have support for ILP32. */
6836 error ("Assembler does not support -mabi=ilp32");
6839 initialize_aarch64_code_model ();
6841 aarch64_build_bitmask_table ();
6843 /* This target defaults to strict volatile bitfields. */
6844 if (flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
6845 flag_strict_volatile_bitfields
= 1;
6847 /* If the user did not specify a processor, choose the default
6848 one for them. This will be the CPU set during configuration using
6849 --with-cpu, otherwise it is "generic". */
6852 selected_cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
6853 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
6856 gcc_assert (selected_cpu
);
6859 selected_tune
= selected_cpu
;
6861 aarch64_tune_flags
= selected_tune
->flags
;
6862 aarch64_tune
= selected_tune
->core
;
6863 aarch64_tune_params
= selected_tune
->tune
;
6864 aarch64_architecture_version
= selected_cpu
->architecture_version
;
6866 if (aarch64_fix_a53_err835769
== 2)
6868 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6869 aarch64_fix_a53_err835769
= 1;
6871 aarch64_fix_a53_err835769
= 0;
6875 /* If not opzimizing for size, set the default
6876 alignment to what the target wants */
6879 if (align_loops
<= 0)
6880 align_loops
= aarch64_tune_params
->loop_align
;
6881 if (align_jumps
<= 0)
6882 align_jumps
= aarch64_tune_params
->jump_align
;
6883 if (align_functions
<= 0)
6884 align_functions
= aarch64_tune_params
->function_align
;
6887 aarch64_override_options_after_change ();
6890 /* Implement targetm.override_options_after_change. */
6893 aarch64_override_options_after_change (void)
6895 if (flag_omit_frame_pointer
)
6896 flag_omit_leaf_frame_pointer
= false;
6897 else if (flag_omit_leaf_frame_pointer
)
6898 flag_omit_frame_pointer
= true;
6901 static struct machine_function
*
6902 aarch64_init_machine_status (void)
6904 struct machine_function
*machine
;
6905 machine
= ggc_cleared_alloc
<machine_function
> ();
6910 aarch64_init_expanders (void)
6912 init_machine_status
= aarch64_init_machine_status
;
6915 /* A checking mechanism for the implementation of the various code models. */
6917 initialize_aarch64_code_model (void)
6921 switch (aarch64_cmodel_var
)
6923 case AARCH64_CMODEL_TINY
:
6924 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
6926 case AARCH64_CMODEL_SMALL
:
6927 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
6929 case AARCH64_CMODEL_LARGE
:
6930 sorry ("code model %qs with -f%s", "large",
6931 flag_pic
> 1 ? "PIC" : "pic");
6937 aarch64_cmodel
= aarch64_cmodel_var
;
6940 /* Return true if SYMBOL_REF X binds locally. */
6943 aarch64_symbol_binds_local_p (const_rtx x
)
6945 return (SYMBOL_REF_DECL (x
)
6946 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
6947 : SYMBOL_REF_LOCAL_P (x
));
6950 /* Return true if SYMBOL_REF X is thread local */
6952 aarch64_tls_symbol_p (rtx x
)
6954 if (! TARGET_HAVE_TLS
)
6957 if (GET_CODE (x
) != SYMBOL_REF
)
6960 return SYMBOL_REF_TLS_MODEL (x
) != 0;
6963 /* Classify a TLS symbol into one of the TLS kinds. */
6964 enum aarch64_symbol_type
6965 aarch64_classify_tls_symbol (rtx x
)
6967 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
6971 case TLS_MODEL_GLOBAL_DYNAMIC
:
6972 case TLS_MODEL_LOCAL_DYNAMIC
:
6973 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
6975 case TLS_MODEL_INITIAL_EXEC
:
6976 return SYMBOL_SMALL_GOTTPREL
;
6978 case TLS_MODEL_LOCAL_EXEC
:
6979 return SYMBOL_SMALL_TPREL
;
6981 case TLS_MODEL_EMULATED
:
6982 case TLS_MODEL_NONE
:
6983 return SYMBOL_FORCE_TO_MEM
;
6990 /* Return the method that should be used to access SYMBOL_REF or
6991 LABEL_REF X in context CONTEXT. */
6993 enum aarch64_symbol_type
6994 aarch64_classify_symbol (rtx x
, rtx offset
,
6995 enum aarch64_symbol_context context ATTRIBUTE_UNUSED
)
6997 if (GET_CODE (x
) == LABEL_REF
)
6999 switch (aarch64_cmodel
)
7001 case AARCH64_CMODEL_LARGE
:
7002 return SYMBOL_FORCE_TO_MEM
;
7004 case AARCH64_CMODEL_TINY_PIC
:
7005 case AARCH64_CMODEL_TINY
:
7006 return SYMBOL_TINY_ABSOLUTE
;
7008 case AARCH64_CMODEL_SMALL_PIC
:
7009 case AARCH64_CMODEL_SMALL
:
7010 return SYMBOL_SMALL_ABSOLUTE
;
7017 if (GET_CODE (x
) == SYMBOL_REF
)
7019 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
)
7020 return SYMBOL_FORCE_TO_MEM
;
7022 if (aarch64_tls_symbol_p (x
))
7023 return aarch64_classify_tls_symbol (x
);
7025 switch (aarch64_cmodel
)
7027 case AARCH64_CMODEL_TINY
:
7028 /* When we retreive symbol + offset address, we have to make sure
7029 the offset does not cause overflow of the final address. But
7030 we have no way of knowing the address of symbol at compile time
7031 so we can't accurately say if the distance between the PC and
7032 symbol + offset is outside the addressible range of +/-1M in the
7033 TINY code model. So we rely on images not being greater than
7034 1M and cap the offset at 1M and anything beyond 1M will have to
7035 be loaded using an alternative mechanism. */
7036 if (SYMBOL_REF_WEAK (x
)
7037 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
7038 return SYMBOL_FORCE_TO_MEM
;
7039 return SYMBOL_TINY_ABSOLUTE
;
7041 case AARCH64_CMODEL_SMALL
:
7042 /* Same reasoning as the tiny code model, but the offset cap here is
7044 if (SYMBOL_REF_WEAK (x
)
7045 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
7046 HOST_WIDE_INT_C (4294967264)))
7047 return SYMBOL_FORCE_TO_MEM
;
7048 return SYMBOL_SMALL_ABSOLUTE
;
7050 case AARCH64_CMODEL_TINY_PIC
:
7051 if (!aarch64_symbol_binds_local_p (x
))
7052 return SYMBOL_TINY_GOT
;
7053 return SYMBOL_TINY_ABSOLUTE
;
7055 case AARCH64_CMODEL_SMALL_PIC
:
7056 if (!aarch64_symbol_binds_local_p (x
))
7057 return SYMBOL_SMALL_GOT
;
7058 return SYMBOL_SMALL_ABSOLUTE
;
7065 /* By default push everything into the constant pool. */
7066 return SYMBOL_FORCE_TO_MEM
;
7070 aarch64_constant_address_p (rtx x
)
7072 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
7076 aarch64_legitimate_pic_operand_p (rtx x
)
7078 if (GET_CODE (x
) == SYMBOL_REF
7079 || (GET_CODE (x
) == CONST
7080 && GET_CODE (XEXP (x
, 0)) == PLUS
7081 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
7087 /* Return true if X holds either a quarter-precision or
7088 floating-point +0.0 constant. */
7090 aarch64_valid_floating_const (machine_mode mode
, rtx x
)
7092 if (!CONST_DOUBLE_P (x
))
7095 /* TODO: We could handle moving 0.0 to a TFmode register,
7096 but first we would like to refactor the movtf_aarch64
7097 to be more amicable to split moves properly and
7098 correctly gate on TARGET_SIMD. For now - reject all
7099 constants which are not to SFmode or DFmode registers. */
7100 if (!(mode
== SFmode
|| mode
== DFmode
))
7103 if (aarch64_float_const_zero_rtx_p (x
))
7105 return aarch64_float_const_representable_p (x
);
7109 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
7111 /* Do not allow vector struct mode constants. We could support
7112 0 and -1 easily, but they need support in aarch64-simd.md. */
7113 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
7116 /* This could probably go away because
7117 we now decompose CONST_INTs according to expand_mov_immediate. */
7118 if ((GET_CODE (x
) == CONST_VECTOR
7119 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
7120 || CONST_INT_P (x
) || aarch64_valid_floating_const (mode
, x
))
7121 return !targetm
.cannot_force_const_mem (mode
, x
);
7123 if (GET_CODE (x
) == HIGH
7124 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
7127 return aarch64_constant_address_p (x
);
7131 aarch64_load_tp (rtx target
)
7134 || GET_MODE (target
) != Pmode
7135 || !register_operand (target
, Pmode
))
7136 target
= gen_reg_rtx (Pmode
);
7138 /* Can return in any reg. */
7139 emit_insn (gen_aarch64_load_tp_hard (target
));
7143 /* On AAPCS systems, this is the "struct __va_list". */
7144 static GTY(()) tree va_list_type
;
7146 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7147 Return the type to use as __builtin_va_list.
7149 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7161 aarch64_build_builtin_va_list (void)
7164 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7166 /* Create the type. */
7167 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
7168 /* Give it the required name. */
7169 va_list_name
= build_decl (BUILTINS_LOCATION
,
7171 get_identifier ("__va_list"),
7173 DECL_ARTIFICIAL (va_list_name
) = 1;
7174 TYPE_NAME (va_list_type
) = va_list_name
;
7175 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
7177 /* Create the fields. */
7178 f_stack
= build_decl (BUILTINS_LOCATION
,
7179 FIELD_DECL
, get_identifier ("__stack"),
7181 f_grtop
= build_decl (BUILTINS_LOCATION
,
7182 FIELD_DECL
, get_identifier ("__gr_top"),
7184 f_vrtop
= build_decl (BUILTINS_LOCATION
,
7185 FIELD_DECL
, get_identifier ("__vr_top"),
7187 f_groff
= build_decl (BUILTINS_LOCATION
,
7188 FIELD_DECL
, get_identifier ("__gr_offs"),
7190 f_vroff
= build_decl (BUILTINS_LOCATION
,
7191 FIELD_DECL
, get_identifier ("__vr_offs"),
7194 DECL_ARTIFICIAL (f_stack
) = 1;
7195 DECL_ARTIFICIAL (f_grtop
) = 1;
7196 DECL_ARTIFICIAL (f_vrtop
) = 1;
7197 DECL_ARTIFICIAL (f_groff
) = 1;
7198 DECL_ARTIFICIAL (f_vroff
) = 1;
7200 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
7201 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
7202 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
7203 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
7204 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
7206 TYPE_FIELDS (va_list_type
) = f_stack
;
7207 DECL_CHAIN (f_stack
) = f_grtop
;
7208 DECL_CHAIN (f_grtop
) = f_vrtop
;
7209 DECL_CHAIN (f_vrtop
) = f_groff
;
7210 DECL_CHAIN (f_groff
) = f_vroff
;
7212 /* Compute its layout. */
7213 layout_type (va_list_type
);
7215 return va_list_type
;
7218 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7220 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
7222 const CUMULATIVE_ARGS
*cum
;
7223 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7224 tree stack
, grtop
, vrtop
, groff
, vroff
;
7226 int gr_save_area_size
;
7227 int vr_save_area_size
;
7230 cum
= &crtl
->args
.info
;
7232 = (NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
;
7234 = (NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
) * UNITS_PER_VREG
;
7236 if (TARGET_GENERAL_REGS_ONLY
)
7238 if (cum
->aapcs_nvrn
> 0)
7239 sorry ("%qs and floating point or vector arguments",
7240 "-mgeneral-regs-only");
7241 vr_save_area_size
= 0;
7244 f_stack
= TYPE_FIELDS (va_list_type_node
);
7245 f_grtop
= DECL_CHAIN (f_stack
);
7246 f_vrtop
= DECL_CHAIN (f_grtop
);
7247 f_groff
= DECL_CHAIN (f_vrtop
);
7248 f_vroff
= DECL_CHAIN (f_groff
);
7250 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
7252 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
7254 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
7256 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
7258 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
7261 /* Emit code to initialize STACK, which points to the next varargs stack
7262 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7263 by named arguments. STACK is 8-byte aligned. */
7264 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
7265 if (cum
->aapcs_stack_size
> 0)
7266 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
7267 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
7268 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7270 /* Emit code to initialize GRTOP, the top of the GR save area.
7271 virtual_incoming_args_rtx should have been 16 byte aligned. */
7272 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
7273 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
7274 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7276 /* Emit code to initialize VRTOP, the top of the VR save area.
7277 This address is gr_save_area_bytes below GRTOP, rounded
7278 down to the next 16-byte boundary. */
7279 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
7280 vr_offset
= AARCH64_ROUND_UP (gr_save_area_size
,
7281 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7284 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
7285 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
7286 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7288 /* Emit code to initialize GROFF, the offset from GRTOP of the
7289 next GPR argument. */
7290 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
7291 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
7292 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7294 /* Likewise emit code to initialize VROFF, the offset from FTOP
7295 of the next VR argument. */
7296 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
7297 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
7298 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7301 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7304 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
7305 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
7309 bool is_ha
; /* is HFA or HVA. */
7310 bool dw_align
; /* double-word align. */
7311 machine_mode ag_mode
= VOIDmode
;
7315 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7316 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
7317 HOST_WIDE_INT size
, rsize
, adjust
, align
;
7318 tree t
, u
, cond1
, cond2
;
7320 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
7322 type
= build_pointer_type (type
);
7324 mode
= TYPE_MODE (type
);
7326 f_stack
= TYPE_FIELDS (va_list_type_node
);
7327 f_grtop
= DECL_CHAIN (f_stack
);
7328 f_vrtop
= DECL_CHAIN (f_grtop
);
7329 f_groff
= DECL_CHAIN (f_vrtop
);
7330 f_vroff
= DECL_CHAIN (f_groff
);
7332 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
7333 f_stack
, NULL_TREE
);
7334 size
= int_size_in_bytes (type
);
7335 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
7339 if (aarch64_vfp_is_call_or_return_candidate (mode
,
7345 /* TYPE passed in fp/simd registers. */
7346 if (TARGET_GENERAL_REGS_ONLY
)
7347 sorry ("%qs and floating point or vector arguments",
7348 "-mgeneral-regs-only");
7350 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
7351 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
7352 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
7353 unshare_expr (valist
), f_vroff
, NULL_TREE
);
7355 rsize
= nregs
* UNITS_PER_VREG
;
7359 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
7360 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
7362 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7363 && size
< UNITS_PER_VREG
)
7365 adjust
= UNITS_PER_VREG
- size
;
7370 /* TYPE passed in general registers. */
7371 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
7372 unshare_expr (valist
), f_grtop
, NULL_TREE
);
7373 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
7374 unshare_expr (valist
), f_groff
, NULL_TREE
);
7375 rsize
= (size
+ UNITS_PER_WORD
- 1) & -UNITS_PER_WORD
;
7376 nregs
= rsize
/ UNITS_PER_WORD
;
7381 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7382 && size
< UNITS_PER_WORD
)
7384 adjust
= UNITS_PER_WORD
- size
;
7388 /* Get a local temporary for the field value. */
7389 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
7391 /* Emit code to branch if off >= 0. */
7392 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
7393 build_int_cst (TREE_TYPE (off
), 0));
7394 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
7398 /* Emit: offs = (offs + 15) & -16. */
7399 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
7400 build_int_cst (TREE_TYPE (off
), 15));
7401 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
7402 build_int_cst (TREE_TYPE (off
), -16));
7403 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
7408 /* Update ap.__[g|v]r_offs */
7409 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
7410 build_int_cst (TREE_TYPE (off
), rsize
));
7411 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
7415 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
7417 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7418 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
7419 build_int_cst (TREE_TYPE (f_off
), 0));
7420 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
7422 /* String up: make sure the assignment happens before the use. */
7423 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
7424 COND_EXPR_ELSE (cond1
) = t
;
7426 /* Prepare the trees handling the argument that is passed on the stack;
7427 the top level node will store in ON_STACK. */
7428 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
7431 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7432 t
= fold_convert (intDI_type_node
, arg
);
7433 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
7434 build_int_cst (TREE_TYPE (t
), 15));
7435 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
7436 build_int_cst (TREE_TYPE (t
), -16));
7437 t
= fold_convert (TREE_TYPE (arg
), t
);
7438 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
7442 /* Advance ap.__stack */
7443 t
= fold_convert (intDI_type_node
, arg
);
7444 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
7445 build_int_cst (TREE_TYPE (t
), size
+ 7));
7446 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
7447 build_int_cst (TREE_TYPE (t
), -8));
7448 t
= fold_convert (TREE_TYPE (arg
), t
);
7449 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
7450 /* String up roundup and advance. */
7452 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
7453 /* String up with arg */
7454 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
7455 /* Big-endianness related address adjustment. */
7456 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7457 && size
< UNITS_PER_WORD
)
7459 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
7460 size_int (UNITS_PER_WORD
- size
));
7461 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
7464 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
7465 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
7467 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7470 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
7471 build_int_cst (TREE_TYPE (off
), adjust
));
7473 t
= fold_convert (sizetype
, t
);
7474 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
7478 /* type ha; // treat as "struct {ftype field[n];}"
7479 ... [computing offs]
7480 for (i = 0; i <nregs; ++i, offs += 16)
7481 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7484 tree tmp_ha
, field_t
, field_ptr_t
;
7486 /* Declare a local variable. */
7487 tmp_ha
= create_tmp_var_raw (type
, "ha");
7488 gimple_add_tmp_var (tmp_ha
);
7490 /* Establish the base type. */
7494 field_t
= float_type_node
;
7495 field_ptr_t
= float_ptr_type_node
;
7498 field_t
= double_type_node
;
7499 field_ptr_t
= double_ptr_type_node
;
7502 field_t
= long_double_type_node
;
7503 field_ptr_t
= long_double_ptr_type_node
;
7505 /* The half precision and quad precision are not fully supported yet. Enable
7506 the following code after the support is complete. Need to find the correct
7507 type node for __fp16 *. */
7510 field_t
= float_type_node
;
7511 field_ptr_t
= float_ptr_type_node
;
7517 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
7518 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
7519 field_ptr_t
= build_pointer_type (field_t
);
7526 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7527 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
7529 t
= fold_convert (field_ptr_t
, addr
);
7530 t
= build2 (MODIFY_EXPR
, field_t
,
7531 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
7532 build1 (INDIRECT_REF
, field_t
, t
));
7534 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7535 for (i
= 1; i
< nregs
; ++i
)
7537 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
7538 u
= fold_convert (field_ptr_t
, addr
);
7539 u
= build2 (MODIFY_EXPR
, field_t
,
7540 build2 (MEM_REF
, field_t
, tmp_ha
,
7541 build_int_cst (field_ptr_t
,
7543 int_size_in_bytes (field_t
)))),
7544 build1 (INDIRECT_REF
, field_t
, u
));
7545 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
7548 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
7549 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
7552 COND_EXPR_ELSE (cond2
) = t
;
7553 addr
= fold_convert (build_pointer_type (type
), cond1
);
7554 addr
= build_va_arg_indirect_ref (addr
);
7557 addr
= build_va_arg_indirect_ref (addr
);
7562 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7565 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
7566 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
7569 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
7570 CUMULATIVE_ARGS local_cum
;
7571 int gr_saved
, vr_saved
;
7573 /* The caller has advanced CUM up to, but not beyond, the last named
7574 argument. Advance a local copy of CUM past the last "real" named
7575 argument, to find out how many registers are left over. */
7577 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
7579 /* Found out how many registers we need to save. */
7580 gr_saved
= NUM_ARG_REGS
- local_cum
.aapcs_ncrn
;
7581 vr_saved
= NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
;
7583 if (TARGET_GENERAL_REGS_ONLY
)
7585 if (local_cum
.aapcs_nvrn
> 0)
7586 sorry ("%qs and floating point or vector arguments",
7587 "-mgeneral-regs-only");
7597 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7598 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
7599 - gr_saved
* UNITS_PER_WORD
);
7600 mem
= gen_frame_mem (BLKmode
, ptr
);
7601 set_mem_alias_set (mem
, get_varargs_alias_set ());
7603 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
7608 /* We can't use move_block_from_reg, because it will use
7609 the wrong mode, storing D regs only. */
7610 machine_mode mode
= TImode
;
7613 /* Set OFF to the offset from virtual_incoming_args_rtx of
7614 the first vector register. The VR save area lies below
7615 the GR one, and is aligned to 16 bytes. */
7616 off
= -AARCH64_ROUND_UP (gr_saved
* UNITS_PER_WORD
,
7617 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7618 off
-= vr_saved
* UNITS_PER_VREG
;
7620 for (i
= local_cum
.aapcs_nvrn
; i
< NUM_FP_ARG_REGS
; ++i
)
7624 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
7625 mem
= gen_frame_mem (mode
, ptr
);
7626 set_mem_alias_set (mem
, get_varargs_alias_set ());
7627 aarch64_emit_move (mem
, gen_rtx_REG (mode
, V0_REGNUM
+ i
));
7628 off
+= UNITS_PER_VREG
;
7633 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7634 any complication of having crtl->args.pretend_args_size changed. */
7635 cfun
->machine
->frame
.saved_varargs_size
7636 = (AARCH64_ROUND_UP (gr_saved
* UNITS_PER_WORD
,
7637 STACK_BOUNDARY
/ BITS_PER_UNIT
)
7638 + vr_saved
* UNITS_PER_VREG
);
7642 aarch64_conditional_register_usage (void)
7647 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
7650 call_used_regs
[i
] = 1;
7655 /* Walk down the type tree of TYPE counting consecutive base elements.
7656 If *MODEP is VOIDmode, then set it to the first valid floating point
7657 type. If a non-floating point type is found, or if a floating point
7658 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7659 otherwise return the count in the sub-tree. */
7661 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
7666 switch (TREE_CODE (type
))
7669 mode
= TYPE_MODE (type
);
7670 if (mode
!= DFmode
&& mode
!= SFmode
&& mode
!= TFmode
)
7673 if (*modep
== VOIDmode
)
7682 mode
= TYPE_MODE (TREE_TYPE (type
));
7683 if (mode
!= DFmode
&& mode
!= SFmode
&& mode
!= TFmode
)
7686 if (*modep
== VOIDmode
)
7695 /* Use V2SImode and V4SImode as representatives of all 64-bit
7696 and 128-bit vector types. */
7697 size
= int_size_in_bytes (type
);
7710 if (*modep
== VOIDmode
)
7713 /* Vector modes are considered to be opaque: two vectors are
7714 equivalent for the purposes of being homogeneous aggregates
7715 if they are the same size. */
7724 tree index
= TYPE_DOMAIN (type
);
7726 /* Can't handle incomplete types nor sizes that are not
7728 if (!COMPLETE_TYPE_P (type
)
7729 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7732 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
7735 || !TYPE_MAX_VALUE (index
)
7736 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
7737 || !TYPE_MIN_VALUE (index
)
7738 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
7742 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
7743 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
7745 /* There must be no padding. */
7746 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7758 /* Can't handle incomplete types nor sizes that are not
7760 if (!COMPLETE_TYPE_P (type
)
7761 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7764 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
7766 if (TREE_CODE (field
) != FIELD_DECL
)
7769 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
7775 /* There must be no padding. */
7776 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7783 case QUAL_UNION_TYPE
:
7785 /* These aren't very interesting except in a degenerate case. */
7790 /* Can't handle incomplete types nor sizes that are not
7792 if (!COMPLETE_TYPE_P (type
)
7793 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7796 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
7798 if (TREE_CODE (field
) != FIELD_DECL
)
7801 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
7804 count
= count
> sub_count
? count
: sub_count
;
7807 /* There must be no padding. */
7808 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7821 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7822 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7823 array types. The C99 floating-point complex types are also considered
7824 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7825 types, which are GCC extensions and out of the scope of AAPCS64, are
7826 treated as composite types here as well.
7828 Note that MODE itself is not sufficient in determining whether a type
7829 is such a composite type or not. This is because
7830 stor-layout.c:compute_record_mode may have already changed the MODE
7831 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7832 structure with only one field may have its MODE set to the mode of the
7833 field. Also an integer mode whose size matches the size of the
7834 RECORD_TYPE type may be used to substitute the original mode
7835 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7836 solely relied on. */
7839 aarch64_composite_type_p (const_tree type
,
7842 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
7846 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
7847 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
7853 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7854 type as described in AAPCS64 \S 4.1.2.
7856 See the comment above aarch64_composite_type_p for the notes on MODE. */
7859 aarch64_short_vector_p (const_tree type
,
7862 HOST_WIDE_INT size
= -1;
7864 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
7865 size
= int_size_in_bytes (type
);
7866 else if (!aarch64_composite_type_p (type
, mode
)
7867 && (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
7868 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
))
7869 size
= GET_MODE_SIZE (mode
);
7871 return (size
== 8 || size
== 16) ? true : false;
7874 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7875 shall be passed or returned in simd/fp register(s) (providing these
7876 parameter passing registers are available).
7878 Upon successful return, *COUNT returns the number of needed registers,
7879 *BASE_MODE returns the mode of the individual register and when IS_HAF
7880 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7881 floating-point aggregate or a homogeneous short-vector aggregate. */
7884 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
7886 machine_mode
*base_mode
,
7890 machine_mode new_mode
= VOIDmode
;
7891 bool composite_p
= aarch64_composite_type_p (type
, mode
);
7893 if (is_ha
!= NULL
) *is_ha
= false;
7895 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7896 || aarch64_short_vector_p (type
, mode
))
7901 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
7903 if (is_ha
!= NULL
) *is_ha
= true;
7905 new_mode
= GET_MODE_INNER (mode
);
7907 else if (type
&& composite_p
)
7909 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
7911 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
7913 if (is_ha
!= NULL
) *is_ha
= true;
7922 *base_mode
= new_mode
;
7926 /* Implement TARGET_STRUCT_VALUE_RTX. */
7929 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
7930 int incoming ATTRIBUTE_UNUSED
)
7932 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
7935 /* Implements target hook vector_mode_supported_p. */
7937 aarch64_vector_mode_supported_p (machine_mode mode
)
7940 && (mode
== V4SImode
|| mode
== V8HImode
7941 || mode
== V16QImode
|| mode
== V2DImode
7942 || mode
== V2SImode
|| mode
== V4HImode
7943 || mode
== V8QImode
|| mode
== V2SFmode
7944 || mode
== V4SFmode
|| mode
== V2DFmode
7945 || mode
== V1DFmode
))
7951 /* Return appropriate SIMD container
7952 for MODE within a vector of WIDTH bits. */
7954 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
7956 gcc_assert (width
== 64 || width
== 128);
7995 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7997 aarch64_preferred_simd_mode (machine_mode mode
)
7999 return aarch64_simd_container_mode (mode
, 128);
8002 /* Return the bitmask of possible vector sizes for the vectorizer
8005 aarch64_autovectorize_vector_sizes (void)
8010 /* Implement TARGET_MANGLE_TYPE. */
8013 aarch64_mangle_type (const_tree type
)
8015 /* The AArch64 ABI documents say that "__va_list" has to be
8016 managled as if it is in the "std" namespace. */
8017 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
8018 return "St9__va_list";
8020 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8022 if (TYPE_NAME (type
) != NULL
)
8023 return aarch64_mangle_builtin_type (type
);
8025 /* Use the default mangling. */
8030 /* Return true if the rtx_insn contains a MEM RTX somewhere
8034 has_memory_op (rtx_insn
*mem_insn
)
8036 subrtx_iterator::array_type array
;
8037 FOR_EACH_SUBRTX (iter
, array
, PATTERN (mem_insn
), ALL
)
8044 /* Find the first rtx_insn before insn that will generate an assembly
8048 aarch64_prev_real_insn (rtx_insn
*insn
)
8055 insn
= prev_real_insn (insn
);
8057 while (insn
&& recog_memoized (insn
) < 0);
8063 is_madd_op (enum attr_type t1
)
8066 /* A number of these may be AArch32 only. */
8067 enum attr_type mlatypes
[] = {
8068 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
8069 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
8070 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
8073 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
8075 if (t1
== mlatypes
[i
])
8082 /* Check if there is a register dependency between a load and the insn
8083 for which we hold recog_data. */
8086 dep_between_memop_and_curr (rtx memop
)
8091 gcc_assert (GET_CODE (memop
) == SET
);
8093 if (!REG_P (SET_DEST (memop
)))
8096 load_reg
= SET_DEST (memop
);
8097 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
8099 rtx operand
= recog_data
.operand
[opno
];
8101 && reg_overlap_mentioned_p (load_reg
, operand
))
8109 /* When working around the Cortex-A53 erratum 835769,
8110 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8111 instruction and has a preceding memory instruction such that a NOP
8112 should be inserted between them. */
8115 aarch64_madd_needs_nop (rtx_insn
* insn
)
8117 enum attr_type attr_type
;
8121 if (!aarch64_fix_a53_err835769
)
8124 if (recog_memoized (insn
) < 0)
8127 attr_type
= get_attr_type (insn
);
8128 if (!is_madd_op (attr_type
))
8131 prev
= aarch64_prev_real_insn (insn
);
8132 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8133 Restore recog state to INSN to avoid state corruption. */
8134 extract_constrain_insn_cached (insn
);
8136 if (!prev
|| !has_memory_op (prev
))
8139 body
= single_set (prev
);
8141 /* If the previous insn is a memory op and there is no dependency between
8142 it and the DImode madd, emit a NOP between them. If body is NULL then we
8143 have a complex memory operation, probably a load/store pair.
8144 Be conservative for now and emit a NOP. */
8145 if (GET_MODE (recog_data
.operand
[0]) == DImode
8146 && (!body
|| !dep_between_memop_and_curr (body
)))
8154 /* Implement FINAL_PRESCAN_INSN. */
8157 aarch64_final_prescan_insn (rtx_insn
*insn
)
8159 if (aarch64_madd_needs_nop (insn
))
8160 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
8164 /* Return the equivalent letter for size. */
8166 sizetochar (int size
)
8170 case 64: return 'd';
8171 case 32: return 's';
8172 case 16: return 'h';
8173 case 8 : return 'b';
8174 default: gcc_unreachable ();
8178 /* Return true iff x is a uniform vector of floating-point
8179 constants, and the constant can be represented in
8180 quarter-precision form. Note, as aarch64_float_const_representable
8181 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8183 aarch64_vect_float_const_representable_p (rtx x
)
8186 REAL_VALUE_TYPE r0
, ri
;
8189 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
8192 x0
= CONST_VECTOR_ELT (x
, 0);
8193 if (!CONST_DOUBLE_P (x0
))
8196 REAL_VALUE_FROM_CONST_DOUBLE (r0
, x0
);
8198 for (i
= 1; i
< CONST_VECTOR_NUNITS (x
); i
++)
8200 xi
= CONST_VECTOR_ELT (x
, i
);
8201 if (!CONST_DOUBLE_P (xi
))
8204 REAL_VALUE_FROM_CONST_DOUBLE (ri
, xi
);
8205 if (!REAL_VALUES_EQUAL (r0
, ri
))
8209 return aarch64_float_const_representable_p (x0
);
8212 /* Return true for valid and false for invalid. */
8214 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
8215 struct simd_immediate_info
*info
)
8217 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8219 for (i = 0; i < idx; i += (STRIDE)) \
8224 immtype = (CLASS); \
8225 elsize = (ELSIZE); \
8231 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
8232 unsigned int innersize
= GET_MODE_SIZE (GET_MODE_INNER (mode
));
8233 unsigned char bytes
[16];
8234 int immtype
= -1, matches
;
8235 unsigned int invmask
= inverse
? 0xff : 0;
8238 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
8240 if (! (aarch64_simd_imm_zero_p (op
, mode
)
8241 || aarch64_vect_float_const_representable_p (op
)))
8246 info
->value
= CONST_VECTOR_ELT (op
, 0);
8247 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
8255 /* Splat vector constant out into a byte vector. */
8256 for (i
= 0; i
< n_elts
; i
++)
8258 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8259 it must be laid out in the vector register in reverse order. */
8260 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
8261 unsigned HOST_WIDE_INT elpart
;
8262 unsigned int part
, parts
;
8264 if (CONST_INT_P (el
))
8266 elpart
= INTVAL (el
);
8269 else if (GET_CODE (el
) == CONST_DOUBLE
)
8271 elpart
= CONST_DOUBLE_LOW (el
);
8277 for (part
= 0; part
< parts
; part
++)
8280 for (byte
= 0; byte
< innersize
; byte
++)
8282 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
8283 elpart
>>= BITS_PER_UNIT
;
8285 if (GET_CODE (el
) == CONST_DOUBLE
)
8286 elpart
= CONST_DOUBLE_HIGH (el
);
8291 gcc_assert (idx
== GET_MODE_SIZE (mode
));
8295 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
8296 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
8298 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
8299 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
8301 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8302 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
8304 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8305 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
8307 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
8309 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
8311 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
8312 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
8314 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
8315 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
8317 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8318 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
8320 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8321 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
8323 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
8325 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
8327 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
8328 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
8330 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
8331 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
8333 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8334 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
8336 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8337 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
8339 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
8341 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
8342 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
8351 info
->element_width
= elsize
;
8352 info
->mvn
= emvn
!= 0;
8353 info
->shift
= eshift
;
8355 unsigned HOST_WIDE_INT imm
= 0;
8357 if (immtype
>= 12 && immtype
<= 15)
8360 /* Un-invert bytes of recognized vector, if necessary. */
8362 for (i
= 0; i
< idx
; i
++)
8363 bytes
[i
] ^= invmask
;
8367 /* FIXME: Broken on 32-bit H_W_I hosts. */
8368 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
8370 for (i
= 0; i
< 8; i
++)
8371 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
8372 << (i
* BITS_PER_UNIT
);
8375 info
->value
= GEN_INT (imm
);
8379 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
8380 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
8382 /* Construct 'abcdefgh' because the assembler cannot handle
8383 generic constants. */
8386 imm
= (imm
>> info
->shift
) & 0xff;
8387 info
->value
= GEN_INT (imm
);
8395 /* Check of immediate shift constants are within range. */
8397 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
8399 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
8401 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
8403 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
8406 /* Return true if X is a uniform vector where all elements
8407 are either the floating-point constant 0.0 or the
8408 integer constant 0. */
8410 aarch64_simd_imm_zero_p (rtx x
, machine_mode mode
)
8412 return x
== CONST0_RTX (mode
);
8416 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
8418 HOST_WIDE_INT imm
= INTVAL (x
);
8421 for (i
= 0; i
< 8; i
++)
8423 unsigned int byte
= imm
& 0xff;
8424 if (byte
!= 0xff && byte
!= 0)
8433 aarch64_mov_operand_p (rtx x
,
8434 enum aarch64_symbol_context context
,
8437 if (GET_CODE (x
) == HIGH
8438 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
8441 if (CONST_INT_P (x
))
8444 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
8447 return aarch64_classify_symbolic_expression (x
, context
)
8448 == SYMBOL_TINY_ABSOLUTE
;
8451 /* Return a const_int vector of VAL. */
8453 aarch64_simd_gen_const_vector_dup (machine_mode mode
, int val
)
8455 int nunits
= GET_MODE_NUNITS (mode
);
8456 rtvec v
= rtvec_alloc (nunits
);
8459 for (i
=0; i
< nunits
; i
++)
8460 RTVEC_ELT (v
, i
) = GEN_INT (val
);
8462 return gen_rtx_CONST_VECTOR (mode
, v
);
8465 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8468 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, machine_mode mode
)
8472 gcc_assert (!VECTOR_MODE_P (mode
));
8473 vmode
= aarch64_preferred_simd_mode (mode
);
8474 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
8475 return aarch64_simd_valid_immediate (op_v
, vmode
, false, NULL
);
8478 /* Construct and return a PARALLEL RTX vector with elements numbering the
8479 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8480 the vector - from the perspective of the architecture. This does not
8481 line up with GCC's perspective on lane numbers, so we end up with
8482 different masks depending on our target endian-ness. The diagram
8483 below may help. We must draw the distinction when building masks
8484 which select one half of the vector. An instruction selecting
8485 architectural low-lanes for a big-endian target, must be described using
8486 a mask selecting GCC high-lanes.
8488 Big-Endian Little-Endian
8491 | x | x | x | x | | x | x | x | x |
8492 Architecture 3 2 1 0 3 2 1 0
8494 Low Mask: { 2, 3 } { 0, 1 }
8495 High Mask: { 0, 1 } { 2, 3 }
8499 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
8501 int nunits
= GET_MODE_NUNITS (mode
);
8502 rtvec v
= rtvec_alloc (nunits
/ 2);
8503 int high_base
= nunits
/ 2;
8509 if (BYTES_BIG_ENDIAN
)
8510 base
= high
? low_base
: high_base
;
8512 base
= high
? high_base
: low_base
;
8514 for (i
= 0; i
< nunits
/ 2; i
++)
8515 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
8517 t1
= gen_rtx_PARALLEL (mode
, v
);
8521 /* Check OP for validity as a PARALLEL RTX vector with elements
8522 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8523 from the perspective of the architecture. See the diagram above
8524 aarch64_simd_vect_par_cnst_half for more details. */
8527 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
8530 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
8531 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
8532 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
8535 if (!VECTOR_MODE_P (mode
))
8538 if (count_op
!= count_ideal
)
8541 for (i
= 0; i
< count_ideal
; i
++)
8543 rtx elt_op
= XVECEXP (op
, 0, i
);
8544 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
8546 if (!CONST_INT_P (elt_op
)
8547 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
8553 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8554 HIGH (exclusive). */
8556 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
8560 gcc_assert (CONST_INT_P (operand
));
8561 lane
= INTVAL (operand
);
8563 if (lane
< low
|| lane
>= high
)
8566 error ("%Klane %ld out of range %ld - %ld", exp
, lane
, low
, high
- 1);
8568 error ("lane %ld out of range %ld - %ld", lane
, low
, high
- 1);
8572 /* Emit code to place a AdvSIMD pair result in memory locations (with equal
8575 aarch64_simd_emit_pair_result_insn (machine_mode mode
,
8576 rtx (*intfn
) (rtx
, rtx
, rtx
), rtx destaddr
,
8579 rtx mem
= gen_rtx_MEM (mode
, destaddr
);
8580 rtx tmp1
= gen_reg_rtx (mode
);
8581 rtx tmp2
= gen_reg_rtx (mode
);
8583 emit_insn (intfn (tmp1
, op1
, tmp2
));
8585 emit_move_insn (mem
, tmp1
);
8586 mem
= adjust_address (mem
, mode
, GET_MODE_SIZE (mode
));
8587 emit_move_insn (mem
, tmp2
);
8590 /* Return TRUE if OP is a valid vector addressing mode. */
8592 aarch64_simd_mem_operand_p (rtx op
)
8594 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
8595 || REG_P (XEXP (op
, 0)));
8598 /* Emit a register copy from operand to operand, taking care not to
8599 early-clobber source registers in the process.
8601 COUNT is the number of components into which the copy needs to be
8604 aarch64_simd_emit_reg_reg_move (rtx
*operands
, enum machine_mode mode
,
8608 int rdest
= REGNO (operands
[0]);
8609 int rsrc
= REGNO (operands
[1]);
8611 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
8613 for (i
= 0; i
< count
; i
++)
8614 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
8615 gen_rtx_REG (mode
, rsrc
+ i
));
8617 for (i
= 0; i
< count
; i
++)
8618 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
8619 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
8622 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8623 one of VSTRUCT modes: OI, CI or XI. */
8625 aarch64_simd_attr_length_move (rtx_insn
*insn
)
8629 extract_insn_cached (insn
);
8631 if (REG_P (recog_data
.operand
[0]) && REG_P (recog_data
.operand
[1]))
8633 mode
= GET_MODE (recog_data
.operand
[0]);
8649 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8650 one of VSTRUCT modes: OI, CI, EI, or XI. */
8652 aarch64_simd_attr_length_rglist (enum machine_mode mode
)
8654 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
8657 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8658 alignment of a vector to 128 bits. */
8659 static HOST_WIDE_INT
8660 aarch64_simd_vector_alignment (const_tree type
)
8662 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
8663 return MIN (align
, 128);
8666 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8668 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
8673 /* We guarantee alignment for vectors up to 128-bits. */
8674 if (tree_int_cst_compare (TYPE_SIZE (type
),
8675 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
8678 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8682 /* If VALS is a vector constant that can be loaded into a register
8683 using DUP, generate instructions to do so and return an RTX to
8684 assign to the register. Otherwise return NULL_RTX. */
8686 aarch64_simd_dup_constant (rtx vals
)
8688 machine_mode mode
= GET_MODE (vals
);
8689 machine_mode inner_mode
= GET_MODE_INNER (mode
);
8690 int n_elts
= GET_MODE_NUNITS (mode
);
8691 bool all_same
= true;
8695 if (GET_CODE (vals
) != CONST_VECTOR
)
8698 for (i
= 1; i
< n_elts
; ++i
)
8700 x
= CONST_VECTOR_ELT (vals
, i
);
8701 if (!rtx_equal_p (x
, CONST_VECTOR_ELT (vals
, 0)))
8708 /* We can load this constant by using DUP and a constant in a
8709 single ARM register. This will be cheaper than a vector
8711 x
= copy_to_mode_reg (inner_mode
, CONST_VECTOR_ELT (vals
, 0));
8712 return gen_rtx_VEC_DUPLICATE (mode
, x
);
8716 /* Generate code to load VALS, which is a PARALLEL containing only
8717 constants (for vec_init) or CONST_VECTOR, efficiently into a
8718 register. Returns an RTX to copy into the register, or NULL_RTX
8719 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8721 aarch64_simd_make_constant (rtx vals
)
8723 machine_mode mode
= GET_MODE (vals
);
8725 rtx const_vec
= NULL_RTX
;
8726 int n_elts
= GET_MODE_NUNITS (mode
);
8730 if (GET_CODE (vals
) == CONST_VECTOR
)
8732 else if (GET_CODE (vals
) == PARALLEL
)
8734 /* A CONST_VECTOR must contain only CONST_INTs and
8735 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8736 Only store valid constants in a CONST_VECTOR. */
8737 for (i
= 0; i
< n_elts
; ++i
)
8739 rtx x
= XVECEXP (vals
, 0, i
);
8740 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8743 if (n_const
== n_elts
)
8744 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
8749 if (const_vec
!= NULL_RTX
8750 && aarch64_simd_valid_immediate (const_vec
, mode
, false, NULL
))
8751 /* Load using MOVI/MVNI. */
8753 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
8754 /* Loaded using DUP. */
8756 else if (const_vec
!= NULL_RTX
)
8757 /* Load from constant pool. We can not take advantage of single-cycle
8758 LD1 because we need a PC-relative addressing mode. */
8761 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8762 We can not construct an initializer. */
8767 aarch64_expand_vector_init (rtx target
, rtx vals
)
8769 machine_mode mode
= GET_MODE (target
);
8770 machine_mode inner_mode
= GET_MODE_INNER (mode
);
8771 int n_elts
= GET_MODE_NUNITS (mode
);
8772 int n_var
= 0, one_var
= -1;
8773 bool all_same
= true;
8777 x
= XVECEXP (vals
, 0, 0);
8778 if (!CONST_INT_P (x
) && !CONST_DOUBLE_P (x
))
8779 n_var
= 1, one_var
= 0;
8781 for (i
= 1; i
< n_elts
; ++i
)
8783 x
= XVECEXP (vals
, 0, i
);
8784 if (!CONST_INT_P (x
) && !CONST_DOUBLE_P (x
))
8785 ++n_var
, one_var
= i
;
8787 if (!rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
8793 rtx constant
= aarch64_simd_make_constant (vals
);
8794 if (constant
!= NULL_RTX
)
8796 emit_move_insn (target
, constant
);
8801 /* Splat a single non-constant element if we can. */
8804 x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, 0));
8805 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
8809 /* One field is non-constant. Load constant then overwrite varying
8810 field. This is more efficient than using the stack. */
8813 rtx copy
= copy_rtx (vals
);
8814 rtx index
= GEN_INT (one_var
);
8815 enum insn_code icode
;
8817 /* Load constant part of vector, substitute neighboring value for
8819 XVECEXP (copy
, 0, one_var
) = XVECEXP (vals
, 0, one_var
^ 1);
8820 aarch64_expand_vector_init (target
, copy
);
8822 /* Insert variable. */
8823 x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, one_var
));
8824 icode
= optab_handler (vec_set_optab
, mode
);
8825 gcc_assert (icode
!= CODE_FOR_nothing
);
8826 emit_insn (GEN_FCN (icode
) (target
, x
, index
));
8830 /* Construct the vector in memory one field at a time
8831 and load the whole vector. */
8832 mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
8833 for (i
= 0; i
< n_elts
; i
++)
8834 emit_move_insn (adjust_address_nv (mem
, inner_mode
,
8835 i
* GET_MODE_SIZE (inner_mode
)),
8836 XVECEXP (vals
, 0, i
));
8837 emit_move_insn (target
, mem
);
8841 static unsigned HOST_WIDE_INT
8842 aarch64_shift_truncation_mask (machine_mode mode
)
8845 (aarch64_vector_mode_supported_p (mode
)
8846 || aarch64_vect_struct_mode_p (mode
)) ? 0 : (GET_MODE_BITSIZE (mode
) - 1);
8849 #ifndef TLS_SECTION_ASM_FLAG
8850 #define TLS_SECTION_ASM_FLAG 'T'
8854 aarch64_elf_asm_named_section (const char *name
, unsigned int flags
,
8855 tree decl ATTRIBUTE_UNUSED
)
8857 char flagchars
[10], *f
= flagchars
;
8859 /* If we have already declared this section, we can use an
8860 abbreviated form to switch back to it -- unless this section is
8861 part of a COMDAT groups, in which case GAS requires the full
8862 declaration every time. */
8863 if (!(HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8864 && (flags
& SECTION_DECLARED
))
8866 fprintf (asm_out_file
, "\t.section\t%s\n", name
);
8870 if (!(flags
& SECTION_DEBUG
))
8872 if (flags
& SECTION_WRITE
)
8874 if (flags
& SECTION_CODE
)
8876 if (flags
& SECTION_SMALL
)
8878 if (flags
& SECTION_MERGE
)
8880 if (flags
& SECTION_STRINGS
)
8882 if (flags
& SECTION_TLS
)
8883 *f
++ = TLS_SECTION_ASM_FLAG
;
8884 if (HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8888 fprintf (asm_out_file
, "\t.section\t%s,\"%s\"", name
, flagchars
);
8890 if (!(flags
& SECTION_NOTYPE
))
8895 if (flags
& SECTION_BSS
)
8900 #ifdef TYPE_OPERAND_FMT
8901 format
= "," TYPE_OPERAND_FMT
;
8906 fprintf (asm_out_file
, format
, type
);
8908 if (flags
& SECTION_ENTSIZE
)
8909 fprintf (asm_out_file
, ",%d", flags
& SECTION_ENTSIZE
);
8910 if (HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8912 if (TREE_CODE (decl
) == IDENTIFIER_NODE
)
8913 fprintf (asm_out_file
, ",%s,comdat", IDENTIFIER_POINTER (decl
));
8915 fprintf (asm_out_file
, ",%s,comdat",
8916 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl
)));
8920 putc ('\n', asm_out_file
);
8923 /* Select a format to encode pointers in exception handling data. */
8925 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
8928 switch (aarch64_cmodel
)
8930 case AARCH64_CMODEL_TINY
:
8931 case AARCH64_CMODEL_TINY_PIC
:
8932 case AARCH64_CMODEL_SMALL
:
8933 case AARCH64_CMODEL_SMALL_PIC
:
8934 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8936 type
= DW_EH_PE_sdata4
;
8939 /* No assumptions here. 8-byte relocs required. */
8940 type
= DW_EH_PE_sdata8
;
8943 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
8946 /* Emit load exclusive. */
8949 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
8950 rtx mem
, rtx model_rtx
)
8952 rtx (*gen
) (rtx
, rtx
, rtx
);
8956 case QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
8957 case HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
8958 case SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
8959 case DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
8964 emit_insn (gen (rval
, mem
, model_rtx
));
8967 /* Emit store exclusive. */
8970 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
8971 rtx rval
, rtx mem
, rtx model_rtx
)
8973 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
8977 case QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
8978 case HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
8979 case SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
8980 case DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
8985 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
8988 /* Mark the previous jump instruction as unlikely. */
8991 aarch64_emit_unlikely_jump (rtx insn
)
8993 int very_unlikely
= REG_BR_PROB_BASE
/ 100 - 1;
8995 insn
= emit_jump_insn (insn
);
8996 add_int_reg_note (insn
, REG_BR_PROB
, very_unlikely
);
8999 /* Expand a compare and swap pattern. */
9002 aarch64_expand_compare_and_swap (rtx operands
[])
9004 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
9005 machine_mode mode
, cmp_mode
;
9006 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
9011 oldval
= operands
[3];
9012 newval
= operands
[4];
9013 is_weak
= operands
[5];
9014 mod_s
= operands
[6];
9015 mod_f
= operands
[7];
9016 mode
= GET_MODE (mem
);
9019 /* Normally the succ memory model must be stronger than fail, but in the
9020 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9021 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9023 if (INTVAL (mod_f
) == MEMMODEL_ACQUIRE
9024 && INTVAL (mod_s
) == MEMMODEL_RELEASE
)
9025 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
9031 /* For short modes, we're going to perform the comparison in SImode,
9032 so do the zero-extension now. */
9034 rval
= gen_reg_rtx (SImode
);
9035 oldval
= convert_modes (SImode
, mode
, oldval
, true);
9040 /* Force the value into a register if needed. */
9041 if (!aarch64_plus_operand (oldval
, mode
))
9042 oldval
= force_reg (cmp_mode
, oldval
);
9051 case QImode
: gen
= gen_atomic_compare_and_swapqi_1
; break;
9052 case HImode
: gen
= gen_atomic_compare_and_swaphi_1
; break;
9053 case SImode
: gen
= gen_atomic_compare_and_swapsi_1
; break;
9054 case DImode
: gen
= gen_atomic_compare_and_swapdi_1
; break;
9059 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
9061 if (mode
== QImode
|| mode
== HImode
)
9062 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
9064 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
9065 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
9066 emit_insn (gen_rtx_SET (VOIDmode
, bval
, x
));
9069 /* Split a compare and swap pattern. */
9072 aarch64_split_compare_and_swap (rtx operands
[])
9074 rtx rval
, mem
, oldval
, newval
, scratch
;
9077 rtx_code_label
*label1
, *label2
;
9082 oldval
= operands
[2];
9083 newval
= operands
[3];
9084 is_weak
= (operands
[4] != const0_rtx
);
9085 scratch
= operands
[7];
9086 mode
= GET_MODE (mem
);
9091 label1
= gen_label_rtx ();
9092 emit_label (label1
);
9094 label2
= gen_label_rtx ();
9096 aarch64_emit_load_exclusive (mode
, rval
, mem
, operands
[5]);
9098 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
9099 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
9100 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9101 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
9102 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9104 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, operands
[5]);
9108 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
9109 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9110 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
9111 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9115 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
9116 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
9117 emit_insn (gen_rtx_SET (VOIDmode
, cond
, x
));
9120 emit_label (label2
);
9123 /* Split an atomic operation. */
9126 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
9127 rtx value
, rtx model_rtx
, rtx cond
)
9129 machine_mode mode
= GET_MODE (mem
);
9130 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
9131 rtx_code_label
*label
;
9134 label
= gen_label_rtx ();
9138 new_out
= gen_lowpart (wmode
, new_out
);
9140 old_out
= gen_lowpart (wmode
, old_out
);
9143 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
9145 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
9154 x
= gen_rtx_AND (wmode
, old_out
, value
);
9155 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9156 x
= gen_rtx_NOT (wmode
, new_out
);
9157 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9161 if (CONST_INT_P (value
))
9163 value
= GEN_INT (-INTVAL (value
));
9169 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
9170 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9174 aarch64_emit_store_exclusive (mode
, cond
, mem
,
9175 gen_lowpart (mode
, new_out
), model_rtx
);
9177 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
9178 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9179 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
9180 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9184 aarch64_print_extension (void)
9186 const struct aarch64_option_extension
*opt
= NULL
;
9188 for (opt
= all_extensions
; opt
->name
!= NULL
; opt
++)
9189 if ((aarch64_isa_flags
& opt
->flags_on
) == opt
->flags_on
)
9190 asm_fprintf (asm_out_file
, "+%s", opt
->name
);
9192 asm_fprintf (asm_out_file
, "\n");
9196 aarch64_start_file (void)
9200 asm_fprintf (asm_out_file
, "\t.arch %s", selected_arch
->name
);
9201 aarch64_print_extension ();
9203 else if (selected_cpu
)
9205 const char *truncated_name
9206 = aarch64_rewrite_selected_cpu (selected_cpu
->name
);
9207 asm_fprintf (asm_out_file
, "\t.cpu %s", truncated_name
);
9208 aarch64_print_extension ();
9210 default_file_start();
9213 /* Target hook for c_mode_for_suffix. */
9215 aarch64_c_mode_for_suffix (char suffix
)
9223 /* We can only represent floating point constants which will fit in
9224 "quarter-precision" values. These values are characterised by
9225 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
9228 (-1)^s * (n/16) * 2^r
9231 's' is the sign bit.
9232 'n' is an integer in the range 16 <= n <= 31.
9233 'r' is an integer in the range -3 <= r <= 4. */
9235 /* Return true iff X can be represented by a quarter-precision
9236 floating point immediate operand X. Note, we cannot represent 0.0. */
9238 aarch64_float_const_representable_p (rtx x
)
9240 /* This represents our current view of how many bits
9241 make up the mantissa. */
9242 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
9244 unsigned HOST_WIDE_INT mantissa
, mask
;
9245 REAL_VALUE_TYPE r
, m
;
9248 if (!CONST_DOUBLE_P (x
))
9251 if (GET_MODE (x
) == VOIDmode
)
9254 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
9256 /* We cannot represent infinities, NaNs or +/-zero. We won't
9257 know if we have +zero until we analyse the mantissa, but we
9258 can reject the other invalid values. */
9259 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
9260 || REAL_VALUE_MINUS_ZERO (r
))
9263 /* Extract exponent. */
9264 r
= real_value_abs (&r
);
9265 exponent
= REAL_EXP (&r
);
9267 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9268 highest (sign) bit, with a fixed binary point at bit point_pos.
9269 m1 holds the low part of the mantissa, m2 the high part.
9270 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9271 bits for the mantissa, this can fail (low bits will be lost). */
9272 real_ldexp (&m
, &r
, point_pos
- exponent
);
9273 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
9275 /* If the low part of the mantissa has bits set we cannot represent
9279 /* We have rejected the lower HOST_WIDE_INT, so update our
9280 understanding of how many bits lie in the mantissa and
9281 look only at the high HOST_WIDE_INT. */
9282 mantissa
= w
.elt (1);
9283 point_pos
-= HOST_BITS_PER_WIDE_INT
;
9285 /* We can only represent values with a mantissa of the form 1.xxxx. */
9286 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
9287 if ((mantissa
& mask
) != 0)
9290 /* Having filtered unrepresentable values, we may now remove all
9291 but the highest 5 bits. */
9292 mantissa
>>= point_pos
- 5;
9294 /* We cannot represent the value 0.0, so reject it. This is handled
9299 /* Then, as bit 4 is always set, we can mask it off, leaving
9300 the mantissa in the range [0, 15]. */
9301 mantissa
&= ~(1 << 4);
9302 gcc_assert (mantissa
<= 15);
9304 /* GCC internally does not use IEEE754-like encoding (where normalized
9305 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9306 Our mantissa values are shifted 4 places to the left relative to
9307 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9308 by 5 places to correct for GCC's representation. */
9309 exponent
= 5 - exponent
;
9311 return (exponent
>= 0 && exponent
<= 7);
9315 aarch64_output_simd_mov_immediate (rtx const_vector
,
9320 static char templ
[40];
9321 const char *mnemonic
;
9322 const char *shift_op
;
9323 unsigned int lane_count
= 0;
9326 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
9328 /* This will return true to show const_vector is legal for use as either
9329 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9330 also update INFO to show how the immediate should be generated. */
9331 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
9332 gcc_assert (is_valid
);
9334 element_char
= sizetochar (info
.element_width
);
9335 lane_count
= width
/ info
.element_width
;
9337 mode
= GET_MODE_INNER (mode
);
9338 if (mode
== SFmode
|| mode
== DFmode
)
9340 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
9341 if (aarch64_float_const_zero_rtx_p (info
.value
))
9342 info
.value
= GEN_INT (0);
9347 REAL_VALUE_FROM_CONST_DOUBLE (r
, info
.value
);
9348 char float_buf
[buf_size
] = {'\0'};
9349 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
, 1, mode
);
9352 if (lane_count
== 1)
9353 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
9355 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
9356 lane_count
, element_char
, float_buf
);
9361 mnemonic
= info
.mvn
? "mvni" : "movi";
9362 shift_op
= info
.msl
? "msl" : "lsl";
9364 if (lane_count
== 1)
9365 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
9366 mnemonic
, UINTVAL (info
.value
));
9367 else if (info
.shift
)
9368 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9369 ", %s %d", mnemonic
, lane_count
, element_char
,
9370 UINTVAL (info
.value
), shift_op
, info
.shift
);
9372 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
9373 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
9378 aarch64_output_scalar_simd_mov_immediate (rtx immediate
,
9383 gcc_assert (!VECTOR_MODE_P (mode
));
9384 vmode
= aarch64_simd_container_mode (mode
, 64);
9385 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
9386 return aarch64_output_simd_mov_immediate (v_op
, vmode
, 64);
9389 /* Split operands into moves from op[1] + op[2] into op[0]. */
9392 aarch64_split_combinev16qi (rtx operands
[3])
9394 unsigned int dest
= REGNO (operands
[0]);
9395 unsigned int src1
= REGNO (operands
[1]);
9396 unsigned int src2
= REGNO (operands
[2]);
9397 machine_mode halfmode
= GET_MODE (operands
[1]);
9398 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
9401 gcc_assert (halfmode
== V16QImode
);
9403 if (src1
== dest
&& src2
== dest
+ halfregs
)
9405 /* No-op move. Can't split to nothing; emit something. */
9406 emit_note (NOTE_INSN_DELETED
);
9410 /* Preserve register attributes for variable tracking. */
9411 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
9412 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
9413 GET_MODE_SIZE (halfmode
));
9415 /* Special case of reversed high/low parts. */
9416 if (reg_overlap_mentioned_p (operands
[2], destlo
)
9417 && reg_overlap_mentioned_p (operands
[1], desthi
))
9419 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
9420 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
9421 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
9423 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
9425 /* Try to avoid unnecessary moves if part of the result
9426 is in the right place already. */
9428 emit_move_insn (destlo
, operands
[1]);
9429 if (src2
!= dest
+ halfregs
)
9430 emit_move_insn (desthi
, operands
[2]);
9434 if (src2
!= dest
+ halfregs
)
9435 emit_move_insn (desthi
, operands
[2]);
9437 emit_move_insn (destlo
, operands
[1]);
9441 /* vec_perm support. */
9443 #define MAX_VECT_LEN 16
9445 struct expand_vec_perm_d
9447 rtx target
, op0
, op1
;
9448 unsigned char perm
[MAX_VECT_LEN
];
9455 /* Generate a variable permutation. */
9458 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
9460 machine_mode vmode
= GET_MODE (target
);
9461 bool one_vector_p
= rtx_equal_p (op0
, op1
);
9463 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
9464 gcc_checking_assert (GET_MODE (op0
) == vmode
);
9465 gcc_checking_assert (GET_MODE (op1
) == vmode
);
9466 gcc_checking_assert (GET_MODE (sel
) == vmode
);
9467 gcc_checking_assert (TARGET_SIMD
);
9471 if (vmode
== V8QImode
)
9473 /* Expand the argument to a V16QI mode by duplicating it. */
9474 rtx pair
= gen_reg_rtx (V16QImode
);
9475 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
9476 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
9480 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
9487 if (vmode
== V8QImode
)
9489 pair
= gen_reg_rtx (V16QImode
);
9490 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
9491 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
9495 pair
= gen_reg_rtx (OImode
);
9496 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
9497 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
9503 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
9505 machine_mode vmode
= GET_MODE (target
);
9506 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
9507 bool one_vector_p
= rtx_equal_p (op0
, op1
);
9510 /* The TBL instruction does not use a modulo index, so we must take care
9511 of that ourselves. */
9512 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
9513 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9514 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
9516 /* For big-endian, we also need to reverse the index within the vector
9517 (but not which vector). */
9518 if (BYTES_BIG_ENDIAN
)
9520 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9522 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
9523 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
9524 NULL
, 0, OPTAB_LIB_WIDEN
);
9526 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
9529 /* Recognize patterns suitable for the TRN instructions. */
9531 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
9533 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
9534 rtx out
, in0
, in1
, x
;
9535 rtx (*gen
) (rtx
, rtx
, rtx
);
9536 machine_mode vmode
= d
->vmode
;
9538 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9541 /* Note that these are little-endian tests.
9542 We correct for big-endian later. */
9543 if (d
->perm
[0] == 0)
9545 else if (d
->perm
[0] == 1)
9549 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9551 for (i
= 0; i
< nelt
; i
+= 2)
9553 if (d
->perm
[i
] != i
+ odd
)
9555 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
9565 if (BYTES_BIG_ENDIAN
)
9567 x
= in0
, in0
= in1
, in1
= x
;
9576 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
9577 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
9578 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
9579 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
9580 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
9581 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
9582 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
9583 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
9584 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
9585 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
9594 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
9595 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
9596 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
9597 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
9598 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
9599 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
9600 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
9601 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
9602 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
9603 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
9609 emit_insn (gen (out
, in0
, in1
));
9613 /* Recognize patterns suitable for the UZP instructions. */
9615 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
9617 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
9618 rtx out
, in0
, in1
, x
;
9619 rtx (*gen
) (rtx
, rtx
, rtx
);
9620 machine_mode vmode
= d
->vmode
;
9622 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9625 /* Note that these are little-endian tests.
9626 We correct for big-endian later. */
9627 if (d
->perm
[0] == 0)
9629 else if (d
->perm
[0] == 1)
9633 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9635 for (i
= 0; i
< nelt
; i
++)
9637 unsigned elt
= (i
* 2 + odd
) & mask
;
9638 if (d
->perm
[i
] != elt
)
9648 if (BYTES_BIG_ENDIAN
)
9650 x
= in0
, in0
= in1
, in1
= x
;
9659 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
9660 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
9661 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
9662 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
9663 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
9664 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
9665 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
9666 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
9667 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
9668 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
9677 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
9678 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
9679 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
9680 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
9681 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
9682 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
9683 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
9684 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
9685 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
9686 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
9692 emit_insn (gen (out
, in0
, in1
));
9696 /* Recognize patterns suitable for the ZIP instructions. */
9698 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
9700 unsigned int i
, high
, mask
, nelt
= d
->nelt
;
9701 rtx out
, in0
, in1
, x
;
9702 rtx (*gen
) (rtx
, rtx
, rtx
);
9703 machine_mode vmode
= d
->vmode
;
9705 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9708 /* Note that these are little-endian tests.
9709 We correct for big-endian later. */
9711 if (d
->perm
[0] == high
)
9714 else if (d
->perm
[0] == 0)
9718 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9720 for (i
= 0; i
< nelt
/ 2; i
++)
9722 unsigned elt
= (i
+ high
) & mask
;
9723 if (d
->perm
[i
* 2] != elt
)
9725 elt
= (elt
+ nelt
) & mask
;
9726 if (d
->perm
[i
* 2 + 1] != elt
)
9736 if (BYTES_BIG_ENDIAN
)
9738 x
= in0
, in0
= in1
, in1
= x
;
9747 case V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
9748 case V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
9749 case V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
9750 case V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
9751 case V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
9752 case V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
9753 case V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
9754 case V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
9755 case V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
9756 case V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
9765 case V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
9766 case V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
9767 case V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
9768 case V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
9769 case V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
9770 case V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
9771 case V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
9772 case V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
9773 case V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
9774 case V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
9780 emit_insn (gen (out
, in0
, in1
));
9784 /* Recognize patterns for the EXT insn. */
9787 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
9789 unsigned int i
, nelt
= d
->nelt
;
9790 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
9793 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
9795 /* Check if the extracted indices are increasing by one. */
9796 for (i
= 1; i
< nelt
; i
++)
9798 unsigned int required
= location
+ i
;
9799 if (d
->one_vector_p
)
9801 /* We'll pass the same vector in twice, so allow indices to wrap. */
9802 required
&= (nelt
- 1);
9804 if (d
->perm
[i
] != required
)
9810 case V16QImode
: gen
= gen_aarch64_extv16qi
; break;
9811 case V8QImode
: gen
= gen_aarch64_extv8qi
; break;
9812 case V4HImode
: gen
= gen_aarch64_extv4hi
; break;
9813 case V8HImode
: gen
= gen_aarch64_extv8hi
; break;
9814 case V2SImode
: gen
= gen_aarch64_extv2si
; break;
9815 case V4SImode
: gen
= gen_aarch64_extv4si
; break;
9816 case V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
9817 case V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
9818 case V2DImode
: gen
= gen_aarch64_extv2di
; break;
9819 case V2DFmode
: gen
= gen_aarch64_extv2df
; break;
9828 /* The case where (location == 0) is a no-op for both big- and little-endian,
9829 and is removed by the mid-end at optimization levels -O1 and higher. */
9831 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
9833 /* After setup, we want the high elements of the first vector (stored
9834 at the LSB end of the register), and the low elements of the second
9835 vector (stored at the MSB end of the register). So swap. */
9836 std::swap (d
->op0
, d
->op1
);
9837 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9838 location
= nelt
- location
;
9841 offset
= GEN_INT (location
);
9842 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
9846 /* Recognize patterns for the REV insns. */
9849 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
9851 unsigned int i
, j
, diff
, nelt
= d
->nelt
;
9852 rtx (*gen
) (rtx
, rtx
);
9854 if (!d
->one_vector_p
)
9863 case V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
9864 case V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
9872 case V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
9873 case V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
9874 case V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
9875 case V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
9883 case V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
9884 case V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
9885 case V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
9886 case V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
9887 case V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
9888 case V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
9889 case V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
9890 case V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
9899 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
9900 for (j
= 0; j
<= diff
; j
+= 1)
9902 /* This is guaranteed to be true as the value of diff
9903 is 7, 3, 1 and we should have enough elements in the
9904 queue to generate this. Getting a vector mask with a
9905 value of diff other than these values implies that
9906 something is wrong by the time we get here. */
9907 gcc_assert (i
+ j
< nelt
);
9908 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
9916 emit_insn (gen (d
->target
, d
->op0
));
9921 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
9923 rtx (*gen
) (rtx
, rtx
, rtx
);
9924 rtx out
= d
->target
;
9926 machine_mode vmode
= d
->vmode
;
9927 unsigned int i
, elt
, nelt
= d
->nelt
;
9931 for (i
= 1; i
< nelt
; i
++)
9933 if (elt
!= d
->perm
[i
])
9937 /* The generic preparation in aarch64_expand_vec_perm_const_1
9938 swaps the operand order and the permute indices if it finds
9939 d->perm[0] to be in the second operand. Thus, we can always
9940 use d->op0 and need not do any extra arithmetic to get the
9941 correct lane number. */
9943 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
9947 case V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
9948 case V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
9949 case V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
9950 case V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
9951 case V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
9952 case V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
9953 case V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
9954 case V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
9955 case V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
9956 case V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
9961 emit_insn (gen (out
, in0
, lane
));
9966 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
9968 rtx rperm
[MAX_VECT_LEN
], sel
;
9969 machine_mode vmode
= d
->vmode
;
9970 unsigned int i
, nelt
= d
->nelt
;
9975 /* Generic code will try constant permutation twice. Once with the
9976 original mode and again with the elements lowered to QImode.
9977 So wait and don't do the selector expansion ourselves. */
9978 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
9981 for (i
= 0; i
< nelt
; ++i
)
9983 int nunits
= GET_MODE_NUNITS (vmode
);
9985 /* If big-endian and two vectors we end up with a weird mixed-endian
9986 mode on NEON. Reverse the index within each word but not the word
9988 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? d
->perm
[i
] ^ (nunits
- 1)
9991 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
9992 sel
= force_reg (vmode
, sel
);
9994 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
9999 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
10001 /* The pattern matching functions above are written to look for a small
10002 number to begin the sequence (0, 1, N/2). If we begin with an index
10003 from the second operand, we can swap the operands. */
10004 if (d
->perm
[0] >= d
->nelt
)
10006 unsigned i
, nelt
= d
->nelt
;
10008 gcc_assert (nelt
== (nelt
& -nelt
));
10009 for (i
= 0; i
< nelt
; ++i
)
10010 d
->perm
[i
] ^= nelt
; /* Keep the same index, but in the other vector. */
10012 std::swap (d
->op0
, d
->op1
);
10017 if (aarch64_evpc_rev (d
))
10019 else if (aarch64_evpc_ext (d
))
10021 else if (aarch64_evpc_dup (d
))
10023 else if (aarch64_evpc_zip (d
))
10025 else if (aarch64_evpc_uzp (d
))
10027 else if (aarch64_evpc_trn (d
))
10029 return aarch64_evpc_tbl (d
);
10034 /* Expand a vec_perm_const pattern. */
10037 aarch64_expand_vec_perm_const (rtx target
, rtx op0
, rtx op1
, rtx sel
)
10039 struct expand_vec_perm_d d
;
10040 int i
, nelt
, which
;
10046 d
.vmode
= GET_MODE (target
);
10047 gcc_assert (VECTOR_MODE_P (d
.vmode
));
10048 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
10049 d
.testing_p
= false;
10051 for (i
= which
= 0; i
< nelt
; ++i
)
10053 rtx e
= XVECEXP (sel
, 0, i
);
10054 int ei
= INTVAL (e
) & (2 * nelt
- 1);
10055 which
|= (ei
< nelt
? 1 : 2);
10062 gcc_unreachable ();
10065 d
.one_vector_p
= false;
10066 if (!rtx_equal_p (op0
, op1
))
10069 /* The elements of PERM do not suggest that only the first operand
10070 is used, but both operands are identical. Allow easier matching
10071 of the permutation by folding the permutation into the single
10073 /* Fall Through. */
10075 for (i
= 0; i
< nelt
; ++i
)
10076 d
.perm
[i
] &= nelt
- 1;
10078 d
.one_vector_p
= true;
10083 d
.one_vector_p
= true;
10087 return aarch64_expand_vec_perm_const_1 (&d
);
10091 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
10092 const unsigned char *sel
)
10094 struct expand_vec_perm_d d
;
10095 unsigned int i
, nelt
, which
;
10099 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
10100 d
.testing_p
= true;
10101 memcpy (d
.perm
, sel
, nelt
);
10103 /* Calculate whether all elements are in one vector. */
10104 for (i
= which
= 0; i
< nelt
; ++i
)
10106 unsigned char e
= d
.perm
[i
];
10107 gcc_assert (e
< 2 * nelt
);
10108 which
|= (e
< nelt
? 1 : 2);
10111 /* If all elements are from the second vector, reindex as if from the
10114 for (i
= 0; i
< nelt
; ++i
)
10117 /* Check whether the mask can be applied to a single vector. */
10118 d
.one_vector_p
= (which
!= 3);
10120 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
10121 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
10122 if (!d
.one_vector_p
)
10123 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
10126 ret
= aarch64_expand_vec_perm_const_1 (&d
);
10133 aarch64_reverse_mask (enum machine_mode mode
)
10135 /* We have to reverse each vector because we dont have
10136 a permuted load that can reverse-load according to ABI rules. */
10138 rtvec v
= rtvec_alloc (16);
10140 int nunits
= GET_MODE_NUNITS (mode
);
10141 int usize
= GET_MODE_UNIT_SIZE (mode
);
10143 gcc_assert (BYTES_BIG_ENDIAN
);
10144 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
10146 for (i
= 0; i
< nunits
; i
++)
10147 for (j
= 0; j
< usize
; j
++)
10148 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
10149 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
10150 return force_reg (V16QImode
, mask
);
10153 /* Implement MODES_TIEABLE_P. */
10156 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
10158 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
10161 /* We specifically want to allow elements of "structure" modes to
10162 be tieable to the structure. This more general condition allows
10163 other rarer situations too. */
10165 && aarch64_vector_mode_p (mode1
)
10166 && aarch64_vector_mode_p (mode2
))
10172 /* Return a new RTX holding the result of moving POINTER forward by
10176 aarch64_move_pointer (rtx pointer
, int amount
)
10178 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
10180 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
10184 /* Return a new RTX holding the result of moving POINTER forward by the
10185 size of the mode it points to. */
10188 aarch64_progress_pointer (rtx pointer
)
10190 HOST_WIDE_INT amount
= GET_MODE_SIZE (GET_MODE (pointer
));
10192 return aarch64_move_pointer (pointer
, amount
);
10195 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10199 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
10202 rtx reg
= gen_reg_rtx (mode
);
10204 /* "Cast" the pointers to the correct mode. */
10205 *src
= adjust_address (*src
, mode
, 0);
10206 *dst
= adjust_address (*dst
, mode
, 0);
10207 /* Emit the memcpy. */
10208 emit_move_insn (reg
, *src
);
10209 emit_move_insn (*dst
, reg
);
10210 /* Move the pointers forward. */
10211 *src
= aarch64_progress_pointer (*src
);
10212 *dst
= aarch64_progress_pointer (*dst
);
10215 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10216 we succeed, otherwise return false. */
10219 aarch64_expand_movmem (rtx
*operands
)
10222 rtx dst
= operands
[0];
10223 rtx src
= operands
[1];
10225 bool speed_p
= !optimize_function_for_size_p (cfun
);
10227 /* When optimizing for size, give a better estimate of the length of a
10228 memcpy call, but use the default otherwise. */
10229 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
10231 /* We can't do anything smart if the amount to copy is not constant. */
10232 if (!CONST_INT_P (operands
[2]))
10235 n
= UINTVAL (operands
[2]);
10237 /* Try to keep the number of instructions low. For cases below 16 bytes we
10238 need to make at most two moves. For cases above 16 bytes it will be one
10239 move for each 16 byte chunk, then at most two additional moves. */
10240 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
10243 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
10244 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
10246 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
10247 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
10249 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10255 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
10260 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
10265 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10266 4-byte chunk, partially overlapping with the previously copied chunk. */
10269 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10275 src
= aarch64_move_pointer (src
, move
);
10276 dst
= aarch64_move_pointer (dst
, move
);
10277 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10282 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10283 them, then (if applicable) an 8-byte chunk. */
10288 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
10293 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
10298 /* Finish the final bytes of the copy. We can always do this in one
10299 instruction. We either copy the exact amount we need, or partially
10300 overlap with the previous chunk we copied and copy 8-bytes. */
10304 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
10306 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
10308 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10313 src
= aarch64_move_pointer (src
, -1);
10314 dst
= aarch64_move_pointer (dst
, -1);
10315 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10321 src
= aarch64_move_pointer (src
, move
);
10322 dst
= aarch64_move_pointer (dst
, move
);
10323 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
10330 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10332 static unsigned HOST_WIDE_INT
10333 aarch64_asan_shadow_offset (void)
10335 return (HOST_WIDE_INT_1
<< 36);
10339 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size
,
10340 unsigned int align
,
10341 enum by_pieces_operation op
,
10344 /* STORE_BY_PIECES can be used when copying a constant string, but
10345 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10346 For now we always fail this and let the move_by_pieces code copy
10347 the string from read-only memory. */
10348 if (op
== STORE_BY_PIECES
)
10351 return default_use_by_pieces_infrastructure_p (size
, align
, op
, speed_p
);
10354 static enum machine_mode
10355 aarch64_code_to_ccmode (enum rtx_code code
)
10378 return CC_DLEUmode
;
10381 return CC_DLTUmode
;
10384 return CC_DGEUmode
;
10387 return CC_DGTUmode
;
10395 aarch64_gen_ccmp_first (rtx
*prep_seq
, rtx
*gen_seq
,
10396 int code
, tree treeop0
, tree treeop1
)
10398 enum machine_mode op_mode
, cmp_mode
, cc_mode
;
10399 rtx op0
, op1
, cmp
, target
;
10400 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
10401 enum insn_code icode
;
10402 struct expand_operand ops
[4];
10404 cc_mode
= aarch64_code_to_ccmode ((enum rtx_code
) code
);
10405 if (cc_mode
== CCmode
)
10409 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
10411 op_mode
= GET_MODE (op0
);
10412 if (op_mode
== VOIDmode
)
10413 op_mode
= GET_MODE (op1
);
10421 icode
= CODE_FOR_cmpsi
;
10426 icode
= CODE_FOR_cmpdi
;
10434 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
10435 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
10441 *prep_seq
= get_insns ();
10444 cmp
= gen_rtx_fmt_ee ((enum rtx_code
) code
, cmp_mode
, op0
, op1
);
10445 target
= gen_rtx_REG (CCmode
, CC_REGNUM
);
10447 create_output_operand (&ops
[0], target
, CCmode
);
10448 create_fixed_operand (&ops
[1], cmp
);
10449 create_fixed_operand (&ops
[2], op0
);
10450 create_fixed_operand (&ops
[3], op1
);
10453 if (!maybe_expand_insn (icode
, 4, ops
))
10458 *gen_seq
= get_insns ();
10461 return gen_rtx_REG (cc_mode
, CC_REGNUM
);
10465 aarch64_gen_ccmp_next (rtx
*prep_seq
, rtx
*gen_seq
, rtx prev
, int cmp_code
,
10466 tree treeop0
, tree treeop1
, int bit_code
)
10468 rtx op0
, op1
, cmp0
, cmp1
, target
;
10469 enum machine_mode op_mode
, cmp_mode
, cc_mode
;
10470 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
10471 enum insn_code icode
= CODE_FOR_ccmp_andsi
;
10472 struct expand_operand ops
[6];
10474 cc_mode
= aarch64_code_to_ccmode ((enum rtx_code
) cmp_code
);
10475 if (cc_mode
== CCmode
)
10478 push_to_sequence ((rtx_insn
*) *prep_seq
);
10479 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
10481 op_mode
= GET_MODE (op0
);
10482 if (op_mode
== VOIDmode
)
10483 op_mode
= GET_MODE (op1
);
10491 icode
= (enum rtx_code
) bit_code
== AND
? CODE_FOR_ccmp_andsi
10492 : CODE_FOR_ccmp_iorsi
;
10497 icode
= (enum rtx_code
) bit_code
== AND
? CODE_FOR_ccmp_anddi
10498 : CODE_FOR_ccmp_iordi
;
10506 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
10507 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
10513 *prep_seq
= get_insns ();
10516 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
10517 cmp1
= gen_rtx_fmt_ee ((enum rtx_code
) cmp_code
, cmp_mode
, op0
, op1
);
10518 cmp0
= gen_rtx_fmt_ee (NE
, cmp_mode
, prev
, const0_rtx
);
10520 create_fixed_operand (&ops
[0], prev
);
10521 create_fixed_operand (&ops
[1], target
);
10522 create_fixed_operand (&ops
[2], op0
);
10523 create_fixed_operand (&ops
[3], op1
);
10524 create_fixed_operand (&ops
[4], cmp0
);
10525 create_fixed_operand (&ops
[5], cmp1
);
10527 push_to_sequence ((rtx_insn
*) *gen_seq
);
10528 if (!maybe_expand_insn (icode
, 6, ops
))
10534 *gen_seq
= get_insns ();
10540 #undef TARGET_GEN_CCMP_FIRST
10541 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10543 #undef TARGET_GEN_CCMP_NEXT
10544 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10546 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10547 instruction fusion of some sort. */
10550 aarch64_macro_fusion_p (void)
10552 return aarch64_tune_params
->fuseable_ops
!= AARCH64_FUSE_NOTHING
;
10556 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10557 should be kept together during scheduling. */
10560 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
10563 rtx prev_set
= single_set (prev
);
10564 rtx curr_set
= single_set (curr
);
10565 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10566 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
10568 if (!aarch64_macro_fusion_p ())
10572 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_MOV_MOVK
))
10574 /* We are trying to match:
10575 prev (mov) == (set (reg r0) (const_int imm16))
10576 curr (movk) == (set (zero_extract (reg r0)
10579 (const_int imm16_1)) */
10581 set_dest
= SET_DEST (curr_set
);
10583 if (GET_CODE (set_dest
) == ZERO_EXTRACT
10584 && CONST_INT_P (SET_SRC (curr_set
))
10585 && CONST_INT_P (SET_SRC (prev_set
))
10586 && CONST_INT_P (XEXP (set_dest
, 2))
10587 && INTVAL (XEXP (set_dest
, 2)) == 16
10588 && REG_P (XEXP (set_dest
, 0))
10589 && REG_P (SET_DEST (prev_set
))
10590 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
10597 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_ADRP_ADD
))
10600 /* We're trying to match:
10601 prev (adrp) == (set (reg r1)
10602 (high (symbol_ref ("SYM"))))
10603 curr (add) == (set (reg r0)
10605 (symbol_ref ("SYM"))))
10606 Note that r0 need not necessarily be the same as r1, especially
10607 during pre-regalloc scheduling. */
10609 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
10610 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
10612 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
10613 && REG_P (XEXP (SET_SRC (curr_set
), 0))
10614 && REGNO (XEXP (SET_SRC (curr_set
), 0))
10615 == REGNO (SET_DEST (prev_set
))
10616 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
10617 XEXP (SET_SRC (curr_set
), 1)))
10623 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_MOVK_MOVK
))
10626 /* We're trying to match:
10627 prev (movk) == (set (zero_extract (reg r0)
10630 (const_int imm16_1))
10631 curr (movk) == (set (zero_extract (reg r0)
10634 (const_int imm16_2)) */
10636 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
10637 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
10638 && REG_P (XEXP (SET_DEST (prev_set
), 0))
10639 && REG_P (XEXP (SET_DEST (curr_set
), 0))
10640 && REGNO (XEXP (SET_DEST (prev_set
), 0))
10641 == REGNO (XEXP (SET_DEST (curr_set
), 0))
10642 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
10643 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
10644 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
10645 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
10646 && CONST_INT_P (SET_SRC (prev_set
))
10647 && CONST_INT_P (SET_SRC (curr_set
)))
10652 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_ADRP_LDR
))
10654 /* We're trying to match:
10655 prev (adrp) == (set (reg r0)
10656 (high (symbol_ref ("SYM"))))
10657 curr (ldr) == (set (reg r1)
10658 (mem (lo_sum (reg r0)
10659 (symbol_ref ("SYM")))))
10661 curr (ldr) == (set (reg r1)
10664 (symbol_ref ("SYM")))))) */
10665 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
10666 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
10668 rtx curr_src
= SET_SRC (curr_set
);
10670 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
10671 curr_src
= XEXP (curr_src
, 0);
10673 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
10674 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
10675 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
10676 == REGNO (SET_DEST (prev_set
))
10677 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
10678 XEXP (SET_SRC (prev_set
), 0)))
10683 if ((aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_CMP_BRANCH
)
10684 && any_condjump_p (curr
))
10686 enum attr_type prev_type
= get_attr_type (prev
);
10688 /* FIXME: this misses some which is considered simple arthematic
10689 instructions for ThunderX. Simple shifts are missed here. */
10690 if (prev_type
== TYPE_ALUS_SREG
10691 || prev_type
== TYPE_ALUS_IMM
10692 || prev_type
== TYPE_LOGICS_REG
10693 || prev_type
== TYPE_LOGICS_IMM
)
10700 /* If MEM is in the form of [base+offset], extract the two parts
10701 of address and set to BASE and OFFSET, otherwise return false
10702 after clearing BASE and OFFSET. */
10705 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
10709 gcc_assert (MEM_P (mem
));
10711 addr
= XEXP (mem
, 0);
10716 *offset
= const0_rtx
;
10720 if (GET_CODE (addr
) == PLUS
10721 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
10723 *base
= XEXP (addr
, 0);
10724 *offset
= XEXP (addr
, 1);
10729 *offset
= NULL_RTX
;
/* Types for scheduling fusion.
   NOTE(review): reconstructed from a garbled extraction; only the
   first three enumerators were visible — SCHED_FUSION_LD/ST are
   inferred from their uses in fusion_load_store below, and the NUM
   sentinel is inferred.  Verify against upstream.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
10745 /* If INSN is a load or store of address in the form of [base+offset],
10746 extract the two parts and set to BASE and OFFSET. Return scheduling
10747 fusion type this INSN is. */
10749 static enum sched_fusion_type
10750 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
10753 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
10755 gcc_assert (INSN_P (insn
));
10756 x
= PATTERN (insn
);
10757 if (GET_CODE (x
) != SET
)
10758 return SCHED_FUSION_NONE
;
10761 dest
= SET_DEST (x
);
10763 if (GET_MODE (dest
) != SImode
&& GET_MODE (dest
) != DImode
10764 && GET_MODE (dest
) != SFmode
&& GET_MODE (dest
) != DFmode
)
10765 return SCHED_FUSION_NONE
;
10767 if (GET_CODE (src
) == SIGN_EXTEND
)
10769 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
10770 src
= XEXP (src
, 0);
10771 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
10772 return SCHED_FUSION_NONE
;
10774 else if (GET_CODE (src
) == ZERO_EXTEND
)
10776 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
10777 src
= XEXP (src
, 0);
10778 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
10779 return SCHED_FUSION_NONE
;
10782 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
10783 extract_base_offset_in_addr (src
, base
, offset
);
10784 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
10786 fusion
= SCHED_FUSION_ST
;
10787 extract_base_offset_in_addr (dest
, base
, offset
);
10790 return SCHED_FUSION_NONE
;
10792 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
10793 fusion
= SCHED_FUSION_NONE
;
10798 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10800 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
10801 and PRI are only calculated for these instructions. For other instruction,
10802 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10803 type instruction fusion can be added by returning different priorities.
10805 It's important that irrelevant instructions get the largest FUSION_PRI. */
10808 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
10809 int *fusion_pri
, int *pri
)
10813 enum sched_fusion_type fusion
;
10815 gcc_assert (INSN_P (insn
));
10818 fusion
= fusion_load_store (insn
, &base
, &offset
);
10819 if (fusion
== SCHED_FUSION_NONE
)
10826 /* Set FUSION_PRI according to fusion type and base register. */
10827 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
10829 /* Calculate PRI. */
10832 /* INSN with smaller offset goes first. */
10833 off_val
= (int)(INTVAL (offset
));
10835 tmp
-= (off_val
& 0xfffff);
10837 tmp
+= ((- off_val
) & 0xfffff);
10843 /* Given OPERANDS of consecutive load/store, check if we can merge
10844 them into ldp/stp. LOAD is true if they are load instructions.
10845 MODE is the mode of memory operands. */
10848 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
10849 enum machine_mode mode
)
10851 HOST_WIDE_INT offval_1
, offval_2
, msize
;
10852 enum reg_class rclass_1
, rclass_2
;
10853 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
10857 mem_1
= operands
[1];
10858 mem_2
= operands
[3];
10859 reg_1
= operands
[0];
10860 reg_2
= operands
[2];
10861 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
10862 if (REGNO (reg_1
) == REGNO (reg_2
))
10867 mem_1
= operands
[0];
10868 mem_2
= operands
[2];
10869 reg_1
= operands
[1];
10870 reg_2
= operands
[3];
10873 /* The mems cannot be volatile. */
10874 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
10877 /* Check if the addresses are in the form of [base+offset]. */
10878 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
10879 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
10881 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
10882 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
10885 /* Check if the bases are same. */
10886 if (!rtx_equal_p (base_1
, base_2
))
10889 offval_1
= INTVAL (offset_1
);
10890 offval_2
= INTVAL (offset_2
);
10891 msize
= GET_MODE_SIZE (mode
);
10892 /* Check if the offsets are consecutive. */
10893 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
10896 /* Check if the addresses are clobbered by load. */
10899 if (reg_mentioned_p (reg_1
, mem_1
))
10902 /* In increasing order, the last load can clobber the address. */
10903 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
10907 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
10908 rclass_1
= FP_REGS
;
10910 rclass_1
= GENERAL_REGS
;
10912 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
10913 rclass_2
= FP_REGS
;
10915 rclass_2
= GENERAL_REGS
;
10917 /* Check if the registers are of same class. */
10918 if (rclass_1
!= rclass_2
)
10924 /* Given OPERANDS of consecutive load/store, check if we can merge
10925 them into ldp/stp by adjusting the offset. LOAD is true if they
10926 are load instructions. MODE is the mode of memory operands.
10928 Given below consecutive stores:
10930 str w1, [xb, 0x100]
10931 str w1, [xb, 0x104]
10932 str w1, [xb, 0x108]
10933 str w1, [xb, 0x10c]
10935 Though the offsets are out of the range supported by stp, we can
10936 still pair them after adjusting the offset, like:
10938 add scratch, xb, 0x100
10939 stp w1, w1, [scratch]
10940 stp w1, w1, [scratch, 0x8]
10942 The peephole patterns detecting this opportunity should guarantee
10943 the scratch register is avaliable. */
10946 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
10947 enum machine_mode mode
)
10949 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
10950 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
10951 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
10952 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
10956 reg_1
= operands
[0];
10957 mem_1
= operands
[1];
10958 reg_2
= operands
[2];
10959 mem_2
= operands
[3];
10960 reg_3
= operands
[4];
10961 mem_3
= operands
[5];
10962 reg_4
= operands
[6];
10963 mem_4
= operands
[7];
10964 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
10965 && REG_P (reg_3
) && REG_P (reg_4
));
10966 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
10971 mem_1
= operands
[0];
10972 reg_1
= operands
[1];
10973 mem_2
= operands
[2];
10974 reg_2
= operands
[3];
10975 mem_3
= operands
[4];
10976 reg_3
= operands
[5];
10977 mem_4
= operands
[6];
10978 reg_4
= operands
[7];
10980 /* Skip if memory operand is by itslef valid for ldp/stp. */
10981 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
10984 /* The mems cannot be volatile. */
10985 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
10986 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
10989 /* Check if the addresses are in the form of [base+offset]. */
10990 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
10991 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
10993 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
10994 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
10996 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
10997 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
10999 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
11000 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
11003 /* Check if the bases are same. */
11004 if (!rtx_equal_p (base_1
, base_2
)
11005 || !rtx_equal_p (base_2
, base_3
)
11006 || !rtx_equal_p (base_3
, base_4
))
11009 offval_1
= INTVAL (offset_1
);
11010 offval_2
= INTVAL (offset_2
);
11011 offval_3
= INTVAL (offset_3
);
11012 offval_4
= INTVAL (offset_4
);
11013 msize
= GET_MODE_SIZE (mode
);
11014 /* Check if the offsets are consecutive. */
11015 if ((offval_1
!= (offval_2
+ msize
)
11016 || offval_1
!= (offval_3
+ msize
* 2)
11017 || offval_1
!= (offval_4
+ msize
* 3))
11018 && (offval_4
!= (offval_3
+ msize
)
11019 || offval_4
!= (offval_2
+ msize
* 2)
11020 || offval_4
!= (offval_1
+ msize
* 3)))
11023 /* Check if the addresses are clobbered by load. */
11026 if (reg_mentioned_p (reg_1
, mem_1
)
11027 || reg_mentioned_p (reg_2
, mem_2
)
11028 || reg_mentioned_p (reg_3
, mem_3
))
11031 /* In increasing order, the last load can clobber the address. */
11032 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
11036 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
11037 rclass_1
= FP_REGS
;
11039 rclass_1
= GENERAL_REGS
;
11041 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
11042 rclass_2
= FP_REGS
;
11044 rclass_2
= GENERAL_REGS
;
11046 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
11047 rclass_3
= FP_REGS
;
11049 rclass_3
= GENERAL_REGS
;
11051 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
11052 rclass_4
= FP_REGS
;
11054 rclass_4
= GENERAL_REGS
;
11056 /* Check if the registers are of same class. */
11057 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
11063 /* Given OPERANDS of consecutive load/store, this function pairs them
11064 into ldp/stp after adjusting the offset. It depends on the fact
11065 that addresses of load/store instructions are in increasing order.
11066 MODE is the mode of memory operands. CODE is the rtl operator
11067 which should be applied to all memory operands, it's SIGN_EXTEND,
11068 ZERO_EXTEND or UNKNOWN. */
11071 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
11072 enum machine_mode mode
, RTX_CODE code
)
11074 rtx base
, offset
, t1
, t2
;
11075 rtx mem_1
, mem_2
, mem_3
, mem_4
;
11076 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
11080 mem_1
= operands
[1];
11081 mem_2
= operands
[3];
11082 mem_3
= operands
[5];
11083 mem_4
= operands
[7];
11087 mem_1
= operands
[0];
11088 mem_2
= operands
[2];
11089 mem_3
= operands
[4];
11090 mem_4
= operands
[6];
11091 gcc_assert (code
== UNKNOWN
);
11094 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
11095 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
11097 /* Adjust offset thus it can fit in ldp/stp instruction. */
11098 msize
= GET_MODE_SIZE (mode
);
11099 stp_off_limit
= msize
* 0x40;
11100 off_val
= INTVAL (offset
);
11101 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
11102 new_off
= abs_off
% stp_off_limit
;
11103 adj_off
= abs_off
- new_off
;
11105 /* Further adjust to make sure all offsets are OK. */
11106 if ((new_off
+ msize
* 2) >= stp_off_limit
)
11108 adj_off
+= stp_off_limit
;
11109 new_off
-= stp_off_limit
;
11112 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11113 if (adj_off
>= 0x1000)
11118 adj_off
= -adj_off
;
11119 new_off
= -new_off
;
11122 /* Create new memory references. */
11123 mem_1
= change_address (mem_1
, VOIDmode
,
11124 plus_constant (DImode
, operands
[8], new_off
));
11126 /* Check if the adjusted address is OK for ldp/stp. */
11127 if (!aarch64_mem_pair_operand (mem_1
, mode
))
11130 msize
= GET_MODE_SIZE (mode
);
11131 mem_2
= change_address (mem_2
, VOIDmode
,
11132 plus_constant (DImode
,
11135 mem_3
= change_address (mem_3
, VOIDmode
,
11136 plus_constant (DImode
,
11138 new_off
+ msize
* 2));
11139 mem_4
= change_address (mem_4
, VOIDmode
,
11140 plus_constant (DImode
,
11142 new_off
+ msize
* 3));
11144 if (code
== ZERO_EXTEND
)
11146 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
11147 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
11148 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
11149 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
11151 else if (code
== SIGN_EXTEND
)
11153 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
11154 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
11155 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
11156 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
11161 operands
[1] = mem_1
;
11162 operands
[3] = mem_2
;
11163 operands
[5] = mem_3
;
11164 operands
[7] = mem_4
;
11168 operands
[0] = mem_1
;
11169 operands
[2] = mem_2
;
11170 operands
[4] = mem_3
;
11171 operands
[6] = mem_4
;
11174 /* Emit adjusting instruction. */
11175 emit_insn (gen_rtx_SET (VOIDmode
, operands
[8],
11176 plus_constant (DImode
, base
, adj_off
)));
11177 /* Emit ldp/stp instructions. */
11178 t1
= gen_rtx_SET (VOIDmode
, operands
[0], operands
[1]);
11179 t2
= gen_rtx_SET (VOIDmode
, operands
[2], operands
[3]);
11180 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
11181 t1
= gen_rtx_SET (VOIDmode
, operands
[4], operands
[5]);
11182 t2
= gen_rtx_SET (VOIDmode
, operands
[6], operands
[7]);
11183 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Target hook overrides: each pair removes the default definition and
   installs the AArch64 implementation that TARGET_INITIALIZER gathers
   into the targetm vector at the end of this file.  */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

/* The single global target vector, assembled from the hook macros above.  */
struct gcc_target targetm = TARGET_INITIALIZER;

/* Garbage-collector roots generated for this file by gengtype.  */
#include "gt-aarch64.h"