* config/aarch64/aarch64.c (aarch64_legitimize_address): New function.
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "df.h"
35 #include "hard-reg-set.h"
36 #include "output.h"
37 #include "expr.h"
38 #include "reload.h"
39 #include "toplev.h"
40 #include "target.h"
41 #include "target-def.h"
42 #include "targhooks.h"
43 #include "ggc.h"
44 #include "function.h"
45 #include "tm_p.h"
46 #include "recog.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "hash-table.h"
50 #include "vec.h"
51 #include "basic-block.h"
52 #include "tree-ssa-alias.h"
53 #include "internal-fn.h"
54 #include "gimple-fold.h"
55 #include "tree-eh.h"
56 #include "gimple-expr.h"
57 #include "is-a.h"
58 #include "gimple.h"
59 #include "gimplify.h"
60 #include "optabs.h"
61 #include "dwarf2.h"
62 #include "cfgloop.h"
63 #include "tree-vectorizer.h"
64 #include "config/arm/aarch-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
68 /* Defined for convenience. */
69 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
71 /* Classifies an address.
73 ADDRESS_REG_IMM
74 A simple base register plus immediate offset.
76 ADDRESS_REG_WB
77 A base register indexed by immediate offset with writeback.
79 ADDRESS_REG_REG
80 A base register indexed by (optionally scaled) register.
82 ADDRESS_REG_UXTW
83 A base register indexed by (optionally scaled) zero-extended register.
85 ADDRESS_REG_SXTW
86 A base register indexed by (optionally scaled) sign-extended register.
88 ADDRESS_LO_SUM
89 A LO_SUM rtx with a base register and "LO12" symbol relocation.
91 ADDRESS_SYMBOLIC
92 A constant symbolic address, in pc-relative literal pool. */
94 enum aarch64_address_type {
95 ADDRESS_REG_IMM,
96 ADDRESS_REG_WB,
97 ADDRESS_REG_REG,
98 ADDRESS_REG_UXTW,
99 ADDRESS_REG_SXTW,
100 ADDRESS_LO_SUM,
101 ADDRESS_SYMBOLIC
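/* For illustration (example registers and symbols only), typical
   assembly forms that fall into each class:
     ADDRESS_REG_IMM    ldr  x0, [x1, #16]
     ADDRESS_REG_WB     ldr  x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr  x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr  x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr  x0, [x1, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr  x0, .LC0   (pc-relative literal load).  */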
104 struct aarch64_address_info {
105 enum aarch64_address_type type;
106 rtx base;
107 rtx offset;
108 int shift;
109 enum aarch64_symbol_type symbol_type;
112 struct simd_immediate_info
114 rtx value;
115 int shift;
116 int element_width;
117 bool mvn;
118 bool msl;
121 /* The current code model. */
122 enum aarch64_code_model aarch64_cmodel;
124 #ifdef HAVE_AS_TLS
125 #undef TARGET_HAVE_TLS
126 #define TARGET_HAVE_TLS 1
127 #endif
129 static bool aarch64_lra_p (void);
130 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
131 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
132 const_tree,
133 enum machine_mode *, int *,
134 bool *);
135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_override_options_after_change (void);
138 static bool aarch64_vector_mode_supported_p (enum machine_mode);
139 static unsigned bit_count (unsigned HOST_WIDE_INT);
140 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
141 const unsigned char *sel);
142 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
144 /* The processor for which instructions should be scheduled. */
145 enum aarch64_processor aarch64_tune = cortexa53;
147 /* The current tuning set. */
148 const struct tune_params *aarch64_tune_params;
150 /* Mask to specify which instructions we are allowed to generate. */
151 unsigned long aarch64_isa_flags = 0;
153 /* Mask to specify which instruction scheduling options should be used. */
154 unsigned long aarch64_tune_flags = 0;
156 /* Tuning parameters. */
158 #if HAVE_DESIGNATED_INITIALIZERS
159 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
160 #else
161 #define NAMED_PARAM(NAME, VAL) (VAL)
162 #endif
164 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
165 __extension__
166 #endif
168 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
169 __extension__
170 #endif
171 static const struct cpu_addrcost_table generic_addrcost_table =
173 #if HAVE_DESIGNATED_INITIALIZERS
174 .addr_scale_costs =
175 #endif
177 NAMED_PARAM (hi, 0),
178 NAMED_PARAM (si, 0),
179 NAMED_PARAM (di, 0),
180 NAMED_PARAM (ti, 0),
182 NAMED_PARAM (pre_modify, 0),
183 NAMED_PARAM (post_modify, 0),
184 NAMED_PARAM (register_offset, 0),
185 NAMED_PARAM (register_extend, 0),
186 NAMED_PARAM (imm_offset, 0)
189 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
190 __extension__
191 #endif
192 static const struct cpu_addrcost_table cortexa57_addrcost_table =
194 #if HAVE_DESIGNATED_INITIALIZERS
195 .addr_scale_costs =
196 #endif
198 NAMED_PARAM (hi, 1),
199 NAMED_PARAM (si, 0),
200 NAMED_PARAM (di, 0),
201 NAMED_PARAM (ti, 1),
203 NAMED_PARAM (pre_modify, 0),
204 NAMED_PARAM (post_modify, 0),
205 NAMED_PARAM (register_offset, 0),
206 NAMED_PARAM (register_extend, 0),
207 NAMED_PARAM (imm_offset, 0),
210 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
211 __extension__
212 #endif
213 static const struct cpu_regmove_cost generic_regmove_cost =
215 NAMED_PARAM (GP2GP, 1),
216 NAMED_PARAM (GP2FP, 2),
217 NAMED_PARAM (FP2GP, 2),
218 NAMED_PARAM (FP2FP, 2)
221 static const struct cpu_regmove_cost cortexa57_regmove_cost =
223 NAMED_PARAM (GP2GP, 1),
224 /* Avoid the use of slow int<->fp moves for spilling by setting
225 their cost higher than memmov_cost. */
226 NAMED_PARAM (GP2FP, 5),
227 NAMED_PARAM (FP2GP, 5),
228 NAMED_PARAM (FP2FP, 2)
231 static const struct cpu_regmove_cost cortexa53_regmove_cost =
233 NAMED_PARAM (GP2GP, 1),
234 /* Avoid the use of slow int<->fp moves for spilling by setting
235 their cost higher than memmov_cost. */
236 NAMED_PARAM (GP2FP, 5),
237 NAMED_PARAM (FP2GP, 5),
238 NAMED_PARAM (FP2FP, 2)
241 /* Generic costs for vector insn classes. */
242 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
243 __extension__
244 #endif
245 static const struct cpu_vector_cost generic_vector_cost =
247 NAMED_PARAM (scalar_stmt_cost, 1),
248 NAMED_PARAM (scalar_load_cost, 1),
249 NAMED_PARAM (scalar_store_cost, 1),
250 NAMED_PARAM (vec_stmt_cost, 1),
251 NAMED_PARAM (vec_to_scalar_cost, 1),
252 NAMED_PARAM (scalar_to_vec_cost, 1),
253 NAMED_PARAM (vec_align_load_cost, 1),
254 NAMED_PARAM (vec_unalign_load_cost, 1),
255 NAMED_PARAM (vec_unalign_store_cost, 1),
256 NAMED_PARAM (vec_store_cost, 1),
257 NAMED_PARAM (cond_taken_branch_cost, 3),
258 NAMED_PARAM (cond_not_taken_branch_cost, 1)
261 /* Costs for vector insn classes for Cortex-A57. */
262 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
263 __extension__
264 #endif
265 static const struct cpu_vector_cost cortexa57_vector_cost =
267 NAMED_PARAM (scalar_stmt_cost, 1),
268 NAMED_PARAM (scalar_load_cost, 4),
269 NAMED_PARAM (scalar_store_cost, 1),
270 NAMED_PARAM (vec_stmt_cost, 3),
271 NAMED_PARAM (vec_to_scalar_cost, 8),
272 NAMED_PARAM (scalar_to_vec_cost, 8),
273 NAMED_PARAM (vec_align_load_cost, 5),
274 NAMED_PARAM (vec_unalign_load_cost, 5),
275 NAMED_PARAM (vec_unalign_store_cost, 1),
276 NAMED_PARAM (vec_store_cost, 1),
277 NAMED_PARAM (cond_taken_branch_cost, 1),
278 NAMED_PARAM (cond_not_taken_branch_cost, 1)
281 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
282 __extension__
283 #endif
284 static const struct tune_params generic_tunings =
286 &cortexa57_extra_costs,
287 &generic_addrcost_table,
288 &generic_regmove_cost,
289 &generic_vector_cost,
290 NAMED_PARAM (memmov_cost, 4),
291 NAMED_PARAM (issue_rate, 2)
294 static const struct tune_params cortexa53_tunings =
296 &cortexa53_extra_costs,
297 &generic_addrcost_table,
298 &cortexa53_regmove_cost,
299 &generic_vector_cost,
300 NAMED_PARAM (memmov_cost, 4),
301 NAMED_PARAM (issue_rate, 2)
304 static const struct tune_params cortexa57_tunings =
306 &cortexa57_extra_costs,
307 &cortexa57_addrcost_table,
308 &cortexa57_regmove_cost,
309 &cortexa57_vector_cost,
310 NAMED_PARAM (memmov_cost, 4),
311 NAMED_PARAM (issue_rate, 3)
314 /* A processor implementing AArch64. */
315 struct processor
317 const char *const name;
318 enum aarch64_processor core;
319 const char *arch;
320 const unsigned long flags;
321 const struct tune_params *const tune;
324 /* Processor cores implementing AArch64. */
325 static const struct processor all_cores[] =
327 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
328 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
329 #include "aarch64-cores.def"
330 #undef AARCH64_CORE
331 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
332 {NULL, aarch64_none, NULL, 0, NULL}
335 /* Architectures implementing AArch64. */
336 static const struct processor all_architectures[] =
338 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
339 {NAME, CORE, #ARCH, FLAGS, NULL},
340 #include "aarch64-arches.def"
341 #undef AARCH64_ARCH
342 {NULL, aarch64_none, NULL, 0, NULL}
345 /* Target specification. These are populated as command-line arguments
346 are processed, or NULL if not specified. */
347 static const struct processor *selected_arch;
348 static const struct processor *selected_cpu;
349 static const struct processor *selected_tune;
351 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
353 /* An ISA extension in the co-processor and main instruction set space. */
354 struct aarch64_option_extension
356 const char *const name;
357 const unsigned long flags_on;
358 const unsigned long flags_off;
361 /* ISA extensions in AArch64. */
362 static const struct aarch64_option_extension all_extensions[] =
364 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
365 {NAME, FLAGS_ON, FLAGS_OFF},
366 #include "aarch64-option-extensions.def"
367 #undef AARCH64_OPT_EXTENSION
368 {NULL, 0, 0}
371 /* Used to track the size of an address when generating a pre/post
372 increment address. */
373 static enum machine_mode aarch64_memory_reference_mode;
375 /* Used to force GTY into this file. */
376 static GTY(()) int gty_dummy;
378 /* A table of valid AArch64 "bitmask immediate" values for
379 logical instructions. */
381 #define AARCH64_NUM_BITMASKS 5334
382 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
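/* For illustration: a bitmask immediate is a single contiguous run of set
   bits, rotated and replicated across the register in equal 2-, 4-, 8-,
   16-, 32- or 64-bit elements.  For example 0x00ff00ff00ff00ff (eight set
   bits replicated in 16-bit elements) is a valid bitmask immediate, while
   an arbitrary constant such as 0x1234 is not and has to be synthesized
   some other way.  */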
384 typedef enum aarch64_cond_code
386 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
387 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
388 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
390 aarch64_cc;
392 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
394 /* The condition codes of the processor, and the inverse function. */
395 static const char * const aarch64_condition_codes[] =
397 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
398 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
401 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
402 unsigned
403 aarch64_dbx_register_number (unsigned regno)
405 if (GP_REGNUM_P (regno))
406 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
407 else if (regno == SP_REGNUM)
408 return AARCH64_DWARF_SP;
409 else if (FP_REGNUM_P (regno))
410 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
412 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
413 equivalent DWARF register. */
414 return DWARF_FRAME_REGISTERS;
417 /* Return TRUE if MODE is any of the large INT modes. */
418 static bool
419 aarch64_vect_struct_mode_p (enum machine_mode mode)
421 return mode == OImode || mode == CImode || mode == XImode;
424 /* Return TRUE if MODE is any of the vector modes. */
425 static bool
426 aarch64_vector_mode_p (enum machine_mode mode)
428 return aarch64_vector_mode_supported_p (mode)
429 || aarch64_vect_struct_mode_p (mode);
432 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
433 static bool
434 aarch64_array_mode_supported_p (enum machine_mode mode,
435 unsigned HOST_WIDE_INT nelems)
437 if (TARGET_SIMD
438 && AARCH64_VALID_SIMD_QREG_MODE (mode)
439 && (nelems >= 2 && nelems <= 4))
440 return true;
442 return false;
445 /* Implement HARD_REGNO_NREGS. */
448 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
450 switch (aarch64_regno_regclass (regno))
452 case FP_REGS:
453 case FP_LO_REGS:
454 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
455 default:
456 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
458 gcc_unreachable ();
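/* For illustration, with UNITS_PER_WORD of 8 and UNITS_PER_VREG of 16:
   a 16-byte TImode value occupies two general registers but only one
   vector register, while a 32-byte OImode structure of vectors occupies
   two vector registers.  */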
461 /* Implement HARD_REGNO_MODE_OK. */
464 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
466 if (GET_MODE_CLASS (mode) == MODE_CC)
467 return regno == CC_REGNUM;
469 if (regno == SP_REGNUM)
470 /* The purpose of comparing with ptr_mode is to support the
471 global register variable associated with the stack pointer
472 register via the syntax of asm ("wsp") in ILP32. */
473 return mode == Pmode || mode == ptr_mode;
475 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
476 return mode == Pmode;
478 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
479 return 1;
481 if (FP_REGNUM_P (regno))
483 if (aarch64_vect_struct_mode_p (mode))
484 return
485 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
486 else
487 return 1;
490 return 0;
493 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
494 enum machine_mode
495 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
496 enum machine_mode mode)
498 /* Handle modes that fit within single registers. */
499 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
501 if (GET_MODE_SIZE (mode) >= 4)
502 return mode;
503 else
504 return SImode;
506 /* Fall back to generic for multi-reg and very large modes. */
507 else
508 return choose_hard_reg_mode (regno, nregs, false);
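/* For illustration: a DImode or SImode value that fits in one register is
   saved around a call in its own mode, a QImode or HImode value is widened
   to SImode, and anything larger or spanning several registers falls back
   to choose_hard_reg_mode.  */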
511 /* Return true if calls to DECL should be treated as
512 long-calls (i.e. called via a register). */
513 static bool
514 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
516 return false;
519 /* Return true if calls to symbol-ref SYM should be treated as
520 long-calls (i.e. called via a register). */
521 bool
522 aarch64_is_long_call_p (rtx sym)
524 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
527 /* Return true if the offsets to a zero/sign-extract operation
528 represent an expression that matches an extend operation. The
529 operands represent the parameters from
531 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
532 bool
533 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
534 rtx extract_imm)
536 HOST_WIDE_INT mult_val, extract_val;
538 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
539 return false;
541 mult_val = INTVAL (mult_imm);
542 extract_val = INTVAL (extract_imm);
544 if (extract_val > 8
545 && extract_val < GET_MODE_BITSIZE (mode)
546 && exact_log2 (extract_val & ~7) > 0
547 && (extract_val & 7) <= 4
548 && mult_val == (1 << (extract_val & 7)))
549 return true;
551 return false;
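/* Worked example: in DImode, EXTRACT_IMM == 34 and MULT_IMM == 4 pass the
   checks above: 34 & ~7 == 32 is a power of two (the width of the
   underlying extend), 34 & 7 == 2 is the shift amount, and the multiplier
   4 == 1 << 2 matches it.  The extract therefore describes a 32-bit value
   extended and shifted left by two.  */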
554 /* Emit an insn that's a simple single-set. Both the operands must be
555 known to be valid. */
556 inline static rtx
557 emit_set_insn (rtx x, rtx y)
559 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
562 /* X and Y are two things to compare using CODE. Emit the compare insn and
563 return the rtx for the CC register in the proper mode. */
565 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
567 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
568 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
570 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
571 return cc_reg;
574 /* Build the SYMBOL_REF for __tls_get_addr. */
576 static GTY(()) rtx tls_get_addr_libfunc;
579 aarch64_tls_get_addr (void)
581 if (!tls_get_addr_libfunc)
582 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
583 return tls_get_addr_libfunc;
586 /* Return the TLS model to use for ADDR. */
588 static enum tls_model
589 tls_symbolic_operand_type (rtx addr)
591 enum tls_model tls_kind = TLS_MODEL_NONE;
592 rtx sym, addend;
594 if (GET_CODE (addr) == CONST)
596 split_const (addr, &sym, &addend);
597 if (GET_CODE (sym) == SYMBOL_REF)
598 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
600 else if (GET_CODE (addr) == SYMBOL_REF)
601 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
603 return tls_kind;
606 /* We allow lo_sum's in our legitimate addresses so that combine
607 can take care of combining addresses where necessary, but for
608 generation purposes we generate the address
609 as:
610 RTL Absolute
611 tmp = hi (symbol_ref); adrp x1, foo
612 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
615 PIC TLS
616 adrp x1, :got:foo adrp tmp, :tlsgd:foo
617 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
618 bl __tls_get_addr
621 Load TLS symbol, depending on TLS mechanism and TLS access model.
623 Global Dynamic - Traditional TLS:
624 adrp tmp, :tlsgd:imm
625 add dest, tmp, #:tlsgd_lo12:imm
626 bl __tls_get_addr
628 Global Dynamic - TLS Descriptors:
629 adrp dest, :tlsdesc:imm
630 ldr tmp, [dest, #:tlsdesc_lo12:imm]
631 add dest, dest, #:tlsdesc_lo12:imm
632 blr tmp
633 mrs tp, tpidr_el0
634 add dest, dest, tp
636 Initial Exec:
637 mrs tp, tpidr_el0
638 adrp tmp, :gottprel:imm
639 ldr dest, [tmp, #:gottprel_lo12:imm]
640 add dest, dest, tp
642 Local Exec:
643 mrs tp, tpidr_el0
644 add t0, tp, #:tprel_hi12:imm
645 add t0, #:tprel_lo12_nc:imm
648 static void
649 aarch64_load_symref_appropriately (rtx dest, rtx imm,
650 enum aarch64_symbol_type type)
652 switch (type)
654 case SYMBOL_SMALL_ABSOLUTE:
656 /* In ILP32, the mode of dest can be either SImode or DImode. */
657 rtx tmp_reg = dest;
658 enum machine_mode mode = GET_MODE (dest);
660 gcc_assert (mode == Pmode || mode == ptr_mode);
662 if (can_create_pseudo_p ())
663 tmp_reg = gen_reg_rtx (mode);
665 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
666 emit_insn (gen_add_losym (dest, tmp_reg, imm));
667 return;
670 case SYMBOL_TINY_ABSOLUTE:
671 emit_insn (gen_rtx_SET (Pmode, dest, imm));
672 return;
674 case SYMBOL_SMALL_GOT:
676 /* In ILP32, the mode of dest can be either SImode or DImode,
677 while the got entry is always of SImode size. The mode of
678 dest depends on how dest is used: if dest is assigned to a
679 pointer (e.g. stored in memory), it has SImode; it may have
680 DImode if dest is dereferenced to access the memory.
681 This is why we have to handle three different ldr_got_small
682 patterns here (two patterns for ILP32). */
683 rtx tmp_reg = dest;
684 enum machine_mode mode = GET_MODE (dest);
686 if (can_create_pseudo_p ())
687 tmp_reg = gen_reg_rtx (mode);
689 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
690 if (mode == ptr_mode)
692 if (mode == DImode)
693 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
694 else
695 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
697 else
699 gcc_assert (mode == Pmode);
700 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
703 return;
706 case SYMBOL_SMALL_TLSGD:
708 rtx_insn *insns;
709 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
711 start_sequence ();
712 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
713 insns = get_insns ();
714 end_sequence ();
716 RTL_CONST_CALL_P (insns) = 1;
717 emit_libcall_block (insns, dest, result, imm);
718 return;
721 case SYMBOL_SMALL_TLSDESC:
723 enum machine_mode mode = GET_MODE (dest);
724 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
725 rtx tp;
727 gcc_assert (mode == Pmode || mode == ptr_mode);
729 /* In ILP32, the got entry is always of SImode size. Unlike
730 small GOT, the dest is fixed at reg 0. */
731 if (TARGET_ILP32)
732 emit_insn (gen_tlsdesc_small_si (imm));
733 else
734 emit_insn (gen_tlsdesc_small_di (imm));
735 tp = aarch64_load_tp (NULL);
737 if (mode != Pmode)
738 tp = gen_lowpart (mode, tp);
740 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
741 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
742 return;
745 case SYMBOL_SMALL_GOTTPREL:
747 /* In ILP32, the mode of dest can be either SImode or DImode,
748 while the got entry is always of SImode size. The mode of
749 dest depends on how dest is used: if dest is assigned to a
750 pointer (e.g. stored in memory), it has SImode; it may have
751 DImode if dest is dereferenced to access the memory.
752 This is why we have to handle three different tlsie_small
753 patterns here (two patterns for ILP32). */
754 enum machine_mode mode = GET_MODE (dest);
755 rtx tmp_reg = gen_reg_rtx (mode);
756 rtx tp = aarch64_load_tp (NULL);
758 if (mode == ptr_mode)
760 if (mode == DImode)
761 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
762 else
764 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
765 tp = gen_lowpart (mode, tp);
768 else
770 gcc_assert (mode == Pmode);
771 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
774 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
775 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
776 return;
779 case SYMBOL_SMALL_TPREL:
781 rtx tp = aarch64_load_tp (NULL);
782 emit_insn (gen_tlsle_small (dest, tp, imm));
783 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
784 return;
787 case SYMBOL_TINY_GOT:
788 emit_insn (gen_ldr_got_tiny (dest, imm));
789 return;
791 default:
792 gcc_unreachable ();
796 /* Emit a move from SRC to DEST. Assume that the move expanders can
797 handle all moves if !can_create_pseudo_p (). The distinction is
798 important because, unlike emit_move_insn, the move expanders know
799 how to force Pmode objects into the constant pool even when the
800 constant pool address is not itself legitimate. */
801 static rtx
802 aarch64_emit_move (rtx dest, rtx src)
804 return (can_create_pseudo_p ()
805 ? emit_move_insn (dest, src)
806 : emit_move_insn_1 (dest, src));
809 /* Split a 128-bit move operation into two 64-bit move operations,
810 taking care to handle partial overlap of register to register
811 copies. Special cases are needed when moving between GP regs and
812 FP regs. SRC can be a register, constant or memory; DST a register
813 or memory. If either operand is memory it must not have any side
814 effects. */
815 void
816 aarch64_split_128bit_move (rtx dst, rtx src)
818 rtx dst_lo, dst_hi;
819 rtx src_lo, src_hi;
821 enum machine_mode mode = GET_MODE (dst);
823 gcc_assert (mode == TImode || mode == TFmode);
824 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
825 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
827 if (REG_P (dst) && REG_P (src))
829 int src_regno = REGNO (src);
830 int dst_regno = REGNO (dst);
832 /* Handle FP <-> GP regs. */
833 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
835 src_lo = gen_lowpart (word_mode, src);
836 src_hi = gen_highpart (word_mode, src);
838 if (mode == TImode)
840 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
841 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
843 else
845 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
846 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
848 return;
850 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
852 dst_lo = gen_lowpart (word_mode, dst);
853 dst_hi = gen_highpart (word_mode, dst);
855 if (mode == TImode)
857 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
858 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
860 else
862 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
863 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
865 return;
869 dst_lo = gen_lowpart (word_mode, dst);
870 dst_hi = gen_highpart (word_mode, dst);
871 src_lo = gen_lowpart (word_mode, src);
872 src_hi = gen_highpart_mode (word_mode, mode, src);
874 /* At most one pairing may overlap. */
875 if (reg_overlap_mentioned_p (dst_lo, src_hi))
877 aarch64_emit_move (dst_hi, src_hi);
878 aarch64_emit_move (dst_lo, src_lo);
880 else
882 aarch64_emit_move (dst_lo, src_lo);
883 aarch64_emit_move (dst_hi, src_hi);
887 bool
888 aarch64_split_128bit_move_p (rtx dst, rtx src)
890 return (! REG_P (src)
891 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
894 /* Split a complex SIMD combine. */
896 void
897 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
899 enum machine_mode src_mode = GET_MODE (src1);
900 enum machine_mode dst_mode = GET_MODE (dst);
902 gcc_assert (VECTOR_MODE_P (dst_mode));
904 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
906 rtx (*gen) (rtx, rtx, rtx);
908 switch (src_mode)
910 case V8QImode:
911 gen = gen_aarch64_simd_combinev8qi;
912 break;
913 case V4HImode:
914 gen = gen_aarch64_simd_combinev4hi;
915 break;
916 case V2SImode:
917 gen = gen_aarch64_simd_combinev2si;
918 break;
919 case V2SFmode:
920 gen = gen_aarch64_simd_combinev2sf;
921 break;
922 case DImode:
923 gen = gen_aarch64_simd_combinedi;
924 break;
925 case DFmode:
926 gen = gen_aarch64_simd_combinedf;
927 break;
928 default:
929 gcc_unreachable ();
932 emit_insn (gen (dst, src1, src2));
933 return;
937 /* Split a complex SIMD move. */
939 void
940 aarch64_split_simd_move (rtx dst, rtx src)
942 enum machine_mode src_mode = GET_MODE (src);
943 enum machine_mode dst_mode = GET_MODE (dst);
945 gcc_assert (VECTOR_MODE_P (dst_mode));
947 if (REG_P (dst) && REG_P (src))
949 rtx (*gen) (rtx, rtx);
951 gcc_assert (VECTOR_MODE_P (src_mode));
953 switch (src_mode)
955 case V16QImode:
956 gen = gen_aarch64_split_simd_movv16qi;
957 break;
958 case V8HImode:
959 gen = gen_aarch64_split_simd_movv8hi;
960 break;
961 case V4SImode:
962 gen = gen_aarch64_split_simd_movv4si;
963 break;
964 case V2DImode:
965 gen = gen_aarch64_split_simd_movv2di;
966 break;
967 case V4SFmode:
968 gen = gen_aarch64_split_simd_movv4sf;
969 break;
970 case V2DFmode:
971 gen = gen_aarch64_split_simd_movv2df;
972 break;
973 default:
974 gcc_unreachable ();
977 emit_insn (gen (dst, src));
978 return;
982 static rtx
983 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
985 if (can_create_pseudo_p ())
986 return force_reg (mode, value);
987 else
989 x = aarch64_emit_move (x, value);
990 return x;
995 static rtx
996 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
998 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1000 rtx high;
1001 /* Load the full offset into a register. This
1002 might be improvable in the future. */
1003 high = GEN_INT (offset);
1004 offset = 0;
1005 high = aarch64_force_temporary (mode, temp, high);
1006 reg = aarch64_force_temporary (mode, temp,
1007 gen_rtx_PLUS (mode, high, reg));
1009 return plus_constant (mode, reg, offset);
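/* For illustration: an offset such as 0x123000 fits an ADD immediate
   (a 12-bit value optionally shifted left by 12 bits), so plus_constant is
   enough; an offset such as 0x123456 does not, so it is first loaded into
   TEMP and added to REG with a register-register ADD.  */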
1012 void
1013 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1015 enum machine_mode mode = GET_MODE (dest);
1016 unsigned HOST_WIDE_INT mask;
1017 int i;
1018 bool first;
1019 unsigned HOST_WIDE_INT val;
1020 bool subtargets;
1021 rtx subtarget;
1022 int one_match, zero_match, first_not_ffff_match;
1024 gcc_assert (mode == SImode || mode == DImode);
1026 /* Check on what type of symbol it is. */
1027 if (GET_CODE (imm) == SYMBOL_REF
1028 || GET_CODE (imm) == LABEL_REF
1029 || GET_CODE (imm) == CONST)
1031 rtx mem, base, offset;
1032 enum aarch64_symbol_type sty;
1034 /* If we have (const (plus symbol offset)), separate out the offset
1035 before we start classifying the symbol. */
1036 split_const (imm, &base, &offset);
1038 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1039 switch (sty)
1041 case SYMBOL_FORCE_TO_MEM:
1042 if (offset != const0_rtx
1043 && targetm.cannot_force_const_mem (mode, imm))
1045 gcc_assert (can_create_pseudo_p ());
1046 base = aarch64_force_temporary (mode, dest, base);
1047 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1048 aarch64_emit_move (dest, base);
1049 return;
1051 mem = force_const_mem (ptr_mode, imm);
1052 gcc_assert (mem);
1053 if (mode != ptr_mode)
1054 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1055 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1056 return;
1058 case SYMBOL_SMALL_TLSGD:
1059 case SYMBOL_SMALL_TLSDESC:
1060 case SYMBOL_SMALL_GOTTPREL:
1061 case SYMBOL_SMALL_GOT:
1062 case SYMBOL_TINY_GOT:
1063 if (offset != const0_rtx)
1065 gcc_assert(can_create_pseudo_p ());
1066 base = aarch64_force_temporary (mode, dest, base);
1067 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1068 aarch64_emit_move (dest, base);
1069 return;
1071 /* FALLTHRU */
1073 case SYMBOL_SMALL_TPREL:
1074 case SYMBOL_SMALL_ABSOLUTE:
1075 case SYMBOL_TINY_ABSOLUTE:
1076 aarch64_load_symref_appropriately (dest, imm, sty);
1077 return;
1079 default:
1080 gcc_unreachable ();
1084 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1086 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1087 return;
1090 if (!CONST_INT_P (imm))
1092 if (GET_CODE (imm) == HIGH)
1093 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1094 else
1096 rtx mem = force_const_mem (mode, imm);
1097 gcc_assert (mem);
1098 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1101 return;
1104 if (mode == SImode)
1106 /* We know we can't do this in 1 insn, and we must be able to do it
1107 in two; so don't mess around looking for sequences that don't buy
1108 us anything. */
1109 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1110 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1111 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1112 return;
1115 /* Remaining cases are all for DImode. */
1117 val = INTVAL (imm);
1118 subtargets = optimize && can_create_pseudo_p ();
1120 one_match = 0;
1121 zero_match = 0;
1122 mask = 0xffff;
1123 first_not_ffff_match = -1;
1125 for (i = 0; i < 64; i += 16, mask <<= 16)
1127 if ((val & mask) == mask)
1128 one_match++;
1129 else
1131 if (first_not_ffff_match < 0)
1132 first_not_ffff_match = i;
1133 if ((val & mask) == 0)
1134 zero_match++;
1138 if (one_match == 2)
1140 /* Set one of the quarters and then insert back into result. */
1141 mask = 0xffffll << first_not_ffff_match;
1142 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1143 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1144 GEN_INT ((val >> first_not_ffff_match)
1145 & 0xffff)));
1146 return;
1149 if (zero_match == 2)
1150 goto simple_sequence;
1152 mask = 0x0ffff0000UL;
1153 for (i = 16; i < 64; i += 16, mask <<= 16)
1155 HOST_WIDE_INT comp = mask & ~(mask - 1);
1157 if (aarch64_uimm12_shift (val - (val & mask)))
1159 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1161 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1162 emit_insn (gen_adddi3 (dest, subtarget,
1163 GEN_INT (val - (val & mask))));
1164 return;
1166 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1168 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1170 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1171 GEN_INT ((val + comp) & mask)));
1172 emit_insn (gen_adddi3 (dest, subtarget,
1173 GEN_INT (val - ((val + comp) & mask))));
1174 return;
1176 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1178 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1180 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1181 GEN_INT ((val - comp) | ~mask)));
1182 emit_insn (gen_adddi3 (dest, subtarget,
1183 GEN_INT (val - ((val - comp) | ~mask))));
1184 return;
1186 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1188 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1190 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1191 GEN_INT (val | ~mask)));
1192 emit_insn (gen_adddi3 (dest, subtarget,
1193 GEN_INT (val - (val | ~mask))));
1194 return;
1198 /* See if we can do it by arithmetically combining two
1199 immediates. */
1200 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1202 int j;
1203 mask = 0xffff;
1205 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1206 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1208 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1209 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1210 GEN_INT (aarch64_bitmasks[i])));
1211 emit_insn (gen_adddi3 (dest, subtarget,
1212 GEN_INT (val - aarch64_bitmasks[i])));
1213 return;
1216 for (j = 0; j < 64; j += 16, mask <<= 16)
1218 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1220 emit_insn (gen_rtx_SET (VOIDmode, dest,
1221 GEN_INT (aarch64_bitmasks[i])));
1222 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1223 GEN_INT ((val >> j) & 0xffff)));
1224 return;
1229 /* See if we can do it by logically combining two immediates. */
1230 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1232 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1234 int j;
1236 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1237 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1239 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1240 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1241 GEN_INT (aarch64_bitmasks[i])));
1242 emit_insn (gen_iordi3 (dest, subtarget,
1243 GEN_INT (aarch64_bitmasks[j])));
1244 return;
1247 else if ((val & aarch64_bitmasks[i]) == val)
1249 int j;
1251 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1252 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1255 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1256 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1257 GEN_INT (aarch64_bitmasks[j])));
1258 emit_insn (gen_anddi3 (dest, subtarget,
1259 GEN_INT (aarch64_bitmasks[i])));
1260 return;
1265 if (one_match > zero_match)
1267 /* Set either first three quarters or all but the third. */
1268 mask = 0xffffll << (16 - first_not_ffff_match);
1269 emit_insn (gen_rtx_SET (VOIDmode, dest,
1270 GEN_INT (val | mask | 0xffffffff00000000ull)));
1272 /* Now insert the other two quarters. */
1273 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1274 i < 64; i += 16, mask <<= 16)
1276 if ((val & mask) != mask)
1277 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1278 GEN_INT ((val >> i) & 0xffff)));
1280 return;
1283 simple_sequence:
1284 first = true;
1285 mask = 0xffff;
1286 for (i = 0; i < 64; i += 16, mask <<= 16)
1288 if ((val & mask) != 0)
1290 if (first)
1292 emit_insn (gen_rtx_SET (VOIDmode, dest,
1293 GEN_INT (val & mask)));
1294 first = false;
1296 else
1297 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1298 GEN_INT ((val >> i) & 0xffff)));
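/* The following stand-alone sketch (not GCC code; print_movz_movk_sequence
   and the use of x0 are invented for the example) illustrates the
   simple_sequence fallback above: the first non-zero 16-bit chunk of the
   constant is set with MOVZ and each remaining non-zero chunk is inserted
   with MOVK.  */

#include <stdint.h>
#include <stdio.h>

static void
print_movz_movk_sequence (uint64_t val)
{
  int emitted = 0;

  for (int i = 0; i < 64; i += 16)
    {
      uint64_t chunk = (val >> i) & 0xffff;

      /* Skip zero chunks, except when the whole value is zero.  */
      if (chunk == 0 && val != 0)
        continue;

      printf ("%s\tx0, #0x%llx, lsl #%d\n",
              emitted ? "movk" : "movz",
              (unsigned long long) chunk, i);
      emitted = 1;

      if (val == 0)
        break;
    }
}

int
main (void)
{
  /* 0x0000123400005678 needs one MOVZ and one MOVK.  */
  print_movz_movk_sequence (0x0000123400005678ULL);
  return 0;
}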
1303 static bool
1304 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1305 tree exp ATTRIBUTE_UNUSED)
1307 /* Currently, always true. */
1308 return true;
1311 /* Implement TARGET_PASS_BY_REFERENCE. */
1313 static bool
1314 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1315 enum machine_mode mode,
1316 const_tree type,
1317 bool named ATTRIBUTE_UNUSED)
1319 HOST_WIDE_INT size;
1320 enum machine_mode dummymode;
1321 int nregs;
1323 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1324 size = (mode == BLKmode && type)
1325 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1327 /* Aggregates are passed by reference based on their size. */
1328 if (type && AGGREGATE_TYPE_P (type))
1330 size = int_size_in_bytes (type);
1333 /* Variable sized arguments are always passed by reference. */
1334 if (size < 0)
1335 return true;
1337 /* Can this be a candidate to be passed in fp/simd register(s)? */
1338 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1339 &dummymode, &nregs,
1340 NULL))
1341 return false;
1343 /* Arguments which are variable sized or larger than 2 registers are
1344 passed by reference unless they are a homogeneous floating point
1345 aggregate. */
1346 return size > 2 * UNITS_PER_WORD;
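/* For illustration, under the rules above: a plain 24-byte struct (three
   64-bit members) is passed by reference; a 16-byte struct (two 64-bit
   members) is not, since it fits in two registers; and a struct of four
   doubles, although 32 bytes, is also not passed by reference because the
   HFA check above returns early.  */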
1349 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1350 static bool
1351 aarch64_return_in_msb (const_tree valtype)
1353 enum machine_mode dummy_mode;
1354 int dummy_int;
1356 /* Never happens in little-endian mode. */
1357 if (!BYTES_BIG_ENDIAN)
1358 return false;
1360 /* Only composite types smaller than or equal to 16 bytes can
1361 be potentially returned in registers. */
1362 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1363 || int_size_in_bytes (valtype) <= 0
1364 || int_size_in_bytes (valtype) > 16)
1365 return false;
1367 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1368 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1369 is always passed/returned in the least significant bits of fp/simd
1370 register(s). */
1371 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1372 &dummy_mode, &dummy_int, NULL))
1373 return false;
1375 return true;
1378 /* Implement TARGET_FUNCTION_VALUE.
1379 Define how to find the value returned by a function. */
1381 static rtx
1382 aarch64_function_value (const_tree type, const_tree func,
1383 bool outgoing ATTRIBUTE_UNUSED)
1385 enum machine_mode mode;
1386 int unsignedp;
1387 int count;
1388 enum machine_mode ag_mode;
1390 mode = TYPE_MODE (type);
1391 if (INTEGRAL_TYPE_P (type))
1392 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1394 if (aarch64_return_in_msb (type))
1396 HOST_WIDE_INT size = int_size_in_bytes (type);
1398 if (size % UNITS_PER_WORD != 0)
1400 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1401 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1405 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1406 &ag_mode, &count, NULL))
1408 if (!aarch64_composite_type_p (type, mode))
1410 gcc_assert (count == 1 && mode == ag_mode);
1411 return gen_rtx_REG (mode, V0_REGNUM);
1413 else
1415 int i;
1416 rtx par;
1418 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1419 for (i = 0; i < count; i++)
1421 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1422 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1423 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1424 XVECEXP (par, 0, i) = tmp;
1426 return par;
1429 else
1430 return gen_rtx_REG (mode, R0_REGNUM);
1433 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1434 Return true if REGNO is the number of a hard register in which the values
1435 of called function may come back. */
1437 static bool
1438 aarch64_function_value_regno_p (const unsigned int regno)
1440 /* Maximum of 16 bytes can be returned in the general registers. Examples
1441 of 16-byte return values are: 128-bit integers and 16-byte small
1442 structures (excluding homogeneous floating-point aggregates). */
1443 if (regno == R0_REGNUM || regno == R1_REGNUM)
1444 return true;
1446 /* Up to four fp/simd registers can return a function value, e.g. a
1447 homogeneous floating-point aggregate having four members. */
1448 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1449 return !TARGET_GENERAL_REGS_ONLY;
1451 return false;
1454 /* Implement TARGET_RETURN_IN_MEMORY.
1456 If the type T of the result of a function is such that
1457 void func (T arg)
1458 would require that arg be passed as a value in a register (or set of
1459 registers) according to the parameter passing rules, then the result
1460 is returned in the same registers as would be used for such an
1461 argument. */
1463 static bool
1464 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1466 HOST_WIDE_INT size;
1467 enum machine_mode ag_mode;
1468 int count;
1470 if (!AGGREGATE_TYPE_P (type)
1471 && TREE_CODE (type) != COMPLEX_TYPE
1472 && TREE_CODE (type) != VECTOR_TYPE)
1473 /* Simple scalar types always returned in registers. */
1474 return false;
1476 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1477 type,
1478 &ag_mode,
1479 &count,
1480 NULL))
1481 return false;
1483 /* Types larger than 2 registers returned in memory. */
1484 size = int_size_in_bytes (type);
1485 return (size < 0 || size > 2 * UNITS_PER_WORD);
1488 static bool
1489 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1490 const_tree type, int *nregs)
1492 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1493 return aarch64_vfp_is_call_or_return_candidate (mode,
1494 type,
1495 &pcum->aapcs_vfp_rmode,
1496 nregs,
1497 NULL);
1500 /* Given MODE and TYPE of a function argument, return the alignment in
1501 bits. The idea is to suppress any stronger alignment requested by
1502 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1503 This is a helper function for local use only. */
1505 static unsigned int
1506 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1508 unsigned int alignment;
1510 if (type)
1512 if (!integer_zerop (TYPE_SIZE (type)))
1514 if (TYPE_MODE (type) == mode)
1515 alignment = TYPE_ALIGN (type);
1516 else
1517 alignment = GET_MODE_ALIGNMENT (mode);
1519 else
1520 alignment = 0;
1522 else
1523 alignment = GET_MODE_ALIGNMENT (mode);
1525 return alignment;
1528 /* Layout a function argument according to the AAPCS64 rules. The rule
1529 numbers refer to the rule numbers in the AAPCS64. */
1531 static void
1532 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1533 const_tree type,
1534 bool named ATTRIBUTE_UNUSED)
1536 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1537 int ncrn, nvrn, nregs;
1538 bool allocate_ncrn, allocate_nvrn;
1539 HOST_WIDE_INT size;
1541 /* We need to do this once per argument. */
1542 if (pcum->aapcs_arg_processed)
1543 return;
1545 pcum->aapcs_arg_processed = true;
1547 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1548 size
1549 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1550 UNITS_PER_WORD);
1552 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1553 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1554 mode,
1555 type,
1556 &nregs);
1558 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1559 The following code thus handles passing by SIMD/FP registers first. */
1561 nvrn = pcum->aapcs_nvrn;
1563 /* C.1 - C.5 for floating point, homogeneous floating point aggregates (HFA)
1564 and homogeneous short-vector aggregates (HVA). */
1565 if (allocate_nvrn)
1567 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1569 pcum->aapcs_nextnvrn = nvrn + nregs;
1570 if (!aarch64_composite_type_p (type, mode))
1572 gcc_assert (nregs == 1);
1573 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1575 else
1577 rtx par;
1578 int i;
1579 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1580 for (i = 0; i < nregs; i++)
1582 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1583 V0_REGNUM + nvrn + i);
1584 tmp = gen_rtx_EXPR_LIST
1585 (VOIDmode, tmp,
1586 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1587 XVECEXP (par, 0, i) = tmp;
1589 pcum->aapcs_reg = par;
1591 return;
1593 else
1595 /* C.3 NSRN is set to 8. */
1596 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1597 goto on_stack;
1601 ncrn = pcum->aapcs_ncrn;
1602 nregs = size / UNITS_PER_WORD;
1604 /* C.6 - C.9, though the sign and zero extension semantics are
1605 handled elsewhere. This is the case where the argument fits
1606 entirely in general registers. */
1607 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1609 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1611 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1613 /* C.8 if the argument has an alignment of 16 then the NGRN is
1614 rounded up to the next even number. */
1615 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1617 ++ncrn;
1618 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1620 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1621 A reg is still generated for it, but the caller should be smart
1622 enough not to use it. */
1623 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1625 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1627 else
1629 rtx par;
1630 int i;
1632 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1633 for (i = 0; i < nregs; i++)
1635 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1636 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1637 GEN_INT (i * UNITS_PER_WORD));
1638 XVECEXP (par, 0, i) = tmp;
1640 pcum->aapcs_reg = par;
1643 pcum->aapcs_nextncrn = ncrn + nregs;
1644 return;
1647 /* C.11 */
1648 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1650 /* The argument is passed on stack; record the needed number of words for
1651 this argument and align the total size if necessary. */
1652 on_stack:
1653 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1654 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1655 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1656 16 / UNITS_PER_WORD);
1657 return;
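/* Worked example (hypothetical prototype): for
     void f (int a, double b, __int128 c)
   A is allocated W0 (NGRN becomes 1), B is allocated V0 by the C.1 - C.5
   path (NSRN becomes 1), and C, whose alignment is 16, has NGRN rounded
   up to 2 by C.8 and is passed in the register pair X2/X3.  */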
1660 /* Implement TARGET_FUNCTION_ARG. */
1662 static rtx
1663 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1664 const_tree type, bool named)
1666 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1667 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1669 if (mode == VOIDmode)
1670 return NULL_RTX;
1672 aarch64_layout_arg (pcum_v, mode, type, named);
1673 return pcum->aapcs_reg;
1676 void
1677 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1678 const_tree fntype ATTRIBUTE_UNUSED,
1679 rtx libname ATTRIBUTE_UNUSED,
1680 const_tree fndecl ATTRIBUTE_UNUSED,
1681 unsigned n_named ATTRIBUTE_UNUSED)
1683 pcum->aapcs_ncrn = 0;
1684 pcum->aapcs_nvrn = 0;
1685 pcum->aapcs_nextncrn = 0;
1686 pcum->aapcs_nextnvrn = 0;
1687 pcum->pcs_variant = ARM_PCS_AAPCS64;
1688 pcum->aapcs_reg = NULL_RTX;
1689 pcum->aapcs_arg_processed = false;
1690 pcum->aapcs_stack_words = 0;
1691 pcum->aapcs_stack_size = 0;
1693 return;
1696 static void
1697 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1698 enum machine_mode mode,
1699 const_tree type,
1700 bool named)
1702 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1703 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1705 aarch64_layout_arg (pcum_v, mode, type, named);
1706 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1707 != (pcum->aapcs_stack_words != 0));
1708 pcum->aapcs_arg_processed = false;
1709 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1710 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1711 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1712 pcum->aapcs_stack_words = 0;
1713 pcum->aapcs_reg = NULL_RTX;
1717 bool
1718 aarch64_function_arg_regno_p (unsigned regno)
1720 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1721 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1724 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1725 PARM_BOUNDARY bits of alignment, but will be given anything up
1726 to STACK_BOUNDARY bits if the type requires it. This makes sure
1727 that both before and after the layout of each argument, the Next
1728 Stacked Argument Address (NSAA) will have a minimum alignment of
1729 8 bytes. */
1731 static unsigned int
1732 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1734 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1736 if (alignment < PARM_BOUNDARY)
1737 alignment = PARM_BOUNDARY;
1738 if (alignment > STACK_BOUNDARY)
1739 alignment = STACK_BOUNDARY;
1740 return alignment;
1743 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1745 Return true if an argument passed on the stack should be padded upwards,
1746 i.e. if the least-significant byte of the stack slot has useful data.
1748 Small aggregate types are placed in the lowest memory address.
1750 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1752 bool
1753 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1755 /* On little-endian targets, the least significant byte of every stack
1756 argument is passed at the lowest byte address of the stack slot. */
1757 if (!BYTES_BIG_ENDIAN)
1758 return true;
1760 /* Otherwise, integral, floating-point and pointer types are padded downward:
1761 the least significant byte of a stack argument is passed at the highest
1762 byte address of the stack slot. */
1763 if (type
1764 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1765 || POINTER_TYPE_P (type))
1766 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1767 return false;
1769 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1770 return true;
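/* For illustration, on a big-endian target: a char argument passed on the
   stack is padded downward, so its single byte lives at the highest
   address of its stack slot, whereas a 3-byte struct is padded upward and
   occupies the lowest three bytes of its slot.  */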
1773 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1775 It specifies padding for the last (may also be the only)
1776 element of a block move between registers and memory. Assuming
1777 the block is in memory, padding upward means that the last
1778 element is padded after its most significant byte, while with
1779 downward padding the last element is padded
1780 on its least significant byte side.
1782 Small aggregates and small complex types are always padded
1783 upwards.
1785 We don't need to worry about homogeneous floating-point or
1786 short-vector aggregates; their move is not affected by the
1787 padding direction determined here. Regardless of endianness,
1788 each element of such an aggregate is put in the least
1789 significant bits of a fp/simd register.
1791 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1792 register has useful data, and return the opposite if the most
1793 significant byte does. */
1795 bool
1796 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1797 bool first ATTRIBUTE_UNUSED)
1800 /* Small composite types are always padded upward. */
1801 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1803 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1804 : GET_MODE_SIZE (mode));
1805 if (size < 2 * UNITS_PER_WORD)
1806 return true;
1809 /* Otherwise, use the default padding. */
1810 return !BYTES_BIG_ENDIAN;
1813 static enum machine_mode
1814 aarch64_libgcc_cmp_return_mode (void)
1816 return SImode;
1819 static bool
1820 aarch64_frame_pointer_required (void)
1822 /* In aarch64_override_options_after_change
1823 flag_omit_leaf_frame_pointer turns off the frame pointer by
1824 default. Turn it back on now if we've not got a leaf
1825 function. */
1826 if (flag_omit_leaf_frame_pointer
1827 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1828 return true;
1830 return false;
1833 /* Mark the registers that need to be saved by the callee and calculate
1834 the size of the callee-saved registers area and frame record (both FP
1835 and LR may be omitted). */
1836 static void
1837 aarch64_layout_frame (void)
1839 HOST_WIDE_INT offset = 0;
1840 int regno;
1842 if (reload_completed && cfun->machine->frame.laid_out)
1843 return;
1845 #define SLOT_NOT_REQUIRED (-2)
1846 #define SLOT_REQUIRED (-1)
1848 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
1849 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
1851 /* First mark all the registers that really need to be saved... */
1852 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1853 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1855 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1856 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1858 /* ... that includes the eh data registers (if needed)... */
1859 if (crtl->calls_eh_return)
1860 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1861 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1862 = SLOT_REQUIRED;
1864 /* ... and any callee saved register that dataflow says is live. */
1865 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1866 if (df_regs_ever_live_p (regno)
1867 && !call_used_regs[regno])
1868 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1870 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1871 if (df_regs_ever_live_p (regno)
1872 && !call_used_regs[regno])
1873 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1875 if (frame_pointer_needed)
1877 /* FP and LR are placed in the linkage record. */
1878 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1879 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
1880 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1881 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
1882 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1883 offset += 2 * UNITS_PER_WORD;
1886 /* Now assign stack slots for them. */
1887 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1888 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1890 cfun->machine->frame.reg_offset[regno] = offset;
1891 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1892 cfun->machine->frame.wb_candidate1 = regno;
1893 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
1894 cfun->machine->frame.wb_candidate2 = regno;
1895 offset += UNITS_PER_WORD;
1898 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1899 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1901 cfun->machine->frame.reg_offset[regno] = offset;
1902 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1903 cfun->machine->frame.wb_candidate1 = regno;
1904 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
1905 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
1906 cfun->machine->frame.wb_candidate2 = regno;
1907 offset += UNITS_PER_WORD;
1910 cfun->machine->frame.padding0 =
1911 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1912 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1914 cfun->machine->frame.saved_regs_size = offset;
1916 cfun->machine->frame.hard_fp_offset
1917 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1918 + get_frame_size ()
1919 + cfun->machine->frame.saved_regs_size,
1920 STACK_BOUNDARY / BITS_PER_UNIT);
1922 cfun->machine->frame.frame_size
1923 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1924 + crtl->outgoing_args_size,
1925 STACK_BOUNDARY / BITS_PER_UNIT);
1927 cfun->machine->frame.laid_out = true;
1930 static bool
1931 aarch64_register_saved_on_entry (int regno)
1933 return cfun->machine->frame.reg_offset[regno] >= 0;
1936 static unsigned
1937 aarch64_next_callee_save (unsigned regno, unsigned limit)
1939 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1940 regno ++;
1941 return regno;
1944 static void
1945 aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
1946 HOST_WIDE_INT adjustment)
1948 rtx base_rtx = stack_pointer_rtx;
1949 rtx insn, reg, mem;
1951 reg = gen_rtx_REG (mode, regno);
1952 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
1953 plus_constant (Pmode, base_rtx, -adjustment));
1954 mem = gen_rtx_MEM (mode, mem);
1956 insn = emit_move_insn (mem, reg);
1957 RTX_FRAME_RELATED_P (insn) = 1;
1960 static rtx
1961 aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1962 HOST_WIDE_INT adjustment)
1964 switch (mode)
1966 case DImode:
1967 return gen_storewb_pairdi_di (base, base, reg, reg2,
1968 GEN_INT (-adjustment),
1969 GEN_INT (UNITS_PER_WORD - adjustment));
1970 case DFmode:
1971 return gen_storewb_pairdf_di (base, base, reg, reg2,
1972 GEN_INT (-adjustment),
1973 GEN_INT (UNITS_PER_WORD - adjustment));
1974 default:
1975 gcc_unreachable ();
1979 static void
1980 aarch64_pushwb_pair_reg (enum machine_mode mode, unsigned regno1,
1981 unsigned regno2, HOST_WIDE_INT adjustment)
1983 rtx_insn *insn;
1984 rtx reg1 = gen_rtx_REG (mode, regno1);
1985 rtx reg2 = gen_rtx_REG (mode, regno2);
1987 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
1988 reg2, adjustment));
1989 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
1990 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1991 RTX_FRAME_RELATED_P (insn) = 1;
1994 static rtx
1995 aarch64_gen_loadwb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1996 HOST_WIDE_INT adjustment)
1998 switch (mode)
2000 case DImode:
2001 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2002 GEN_INT (UNITS_PER_WORD));
2003 case DFmode:
2004 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2005 GEN_INT (UNITS_PER_WORD));
2006 default:
2007 gcc_unreachable ();
2011 static rtx
2012 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2013 rtx reg2)
2015 switch (mode)
2017 case DImode:
2018 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2020 case DFmode:
2021 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2023 default:
2024 gcc_unreachable ();
2028 static rtx
2029 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2030 rtx mem2)
2032 switch (mode)
2034 case DImode:
2035 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2037 case DFmode:
2038 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2040 default:
2041 gcc_unreachable ();
2046 static void
2047 aarch64_save_callee_saves (enum machine_mode mode, HOST_WIDE_INT start_offset,
2048 unsigned start, unsigned limit, bool skip_wb)
2050 rtx_insn *insn;
2051 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2052 ? gen_frame_mem : gen_rtx_MEM);
2053 unsigned regno;
2054 unsigned regno2;
2056 for (regno = aarch64_next_callee_save (start, limit);
2057 regno <= limit;
2058 regno = aarch64_next_callee_save (regno + 1, limit))
2060 rtx reg, mem;
2061 HOST_WIDE_INT offset;
2063 if (skip_wb
2064 && (regno == cfun->machine->frame.wb_candidate1
2065 || regno == cfun->machine->frame.wb_candidate2))
2066 continue;
2068 reg = gen_rtx_REG (mode, regno);
2069 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2070 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2071 offset));
2073 regno2 = aarch64_next_callee_save (regno + 1, limit);
2075 if (regno2 <= limit
2076 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2077 == cfun->machine->frame.reg_offset[regno2]))
2080 rtx reg2 = gen_rtx_REG (mode, regno2);
2081 rtx mem2;
2083 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2084 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2085 offset));
2086 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2087 reg2));
2089 /* The first part of a frame-related parallel insn is
2090 always assumed to be relevant to the frame
2091 calculations; subsequent parts are only
2092 frame-related if explicitly marked. */
2093 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2094 regno = regno2;
2096 else
2097 insn = emit_move_insn (mem, reg);
2099 RTX_FRAME_RELATED_P (insn) = 1;
2103 static void
2104 aarch64_restore_callee_saves (enum machine_mode mode,
2105 HOST_WIDE_INT start_offset, unsigned start,
2106 unsigned limit, bool skip_wb, rtx *cfi_ops)
2108 rtx base_rtx = stack_pointer_rtx;
2109 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2110 ? gen_frame_mem : gen_rtx_MEM);
2111 unsigned regno;
2112 unsigned regno2;
2113 HOST_WIDE_INT offset;
2115 for (regno = aarch64_next_callee_save (start, limit);
2116 regno <= limit;
2117 regno = aarch64_next_callee_save (regno + 1, limit))
2119 rtx reg, mem;
2121 if (skip_wb
2122 && (regno == cfun->machine->frame.wb_candidate1
2123 || regno == cfun->machine->frame.wb_candidate2))
2124 continue;
2126 reg = gen_rtx_REG (mode, regno);
2127 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2128 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2130 regno2 = aarch64_next_callee_save (regno + 1, limit);
2132 if (regno2 <= limit
2133 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2134 == cfun->machine->frame.reg_offset[regno2]))
2136 rtx reg2 = gen_rtx_REG (mode, regno2);
2137 rtx mem2;
2139 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2140 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2141 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2143 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2144 regno = regno2;
2146 else
2147 emit_move_insn (reg, mem);
2148 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2152 /* AArch64 stack frames generated by this compiler look like:
2154 +-------------------------------+
2156 | incoming stack arguments |
2158 +-------------------------------+
2159 | | <-- incoming stack pointer (aligned)
2160 | callee-allocated save area |
2161 | for register varargs |
2163 +-------------------------------+
2164 | local variables | <-- frame_pointer_rtx
2166 +-------------------------------+
2167 | padding0 | \
2168 +-------------------------------+ |
2169 | callee-saved registers | | frame.saved_regs_size
2170 +-------------------------------+ |
2171 | LR' | |
2172 +-------------------------------+ |
2173 | FP' | / <- hard_frame_pointer_rtx (aligned)
2174 +-------------------------------+
2175 | dynamic allocation |
2176 +-------------------------------+
2177 | padding |
2178 +-------------------------------+
2179 | outgoing stack arguments | <-- arg_pointer
2181 +-------------------------------+
2182 | | <-- stack_pointer_rtx (aligned)
2184 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2185 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2186 unchanged. */
2188 /* Generate the prologue instructions for entry into a function.
2189 Establish the stack frame by decreasing the stack pointer with a
2190 properly calculated size and, if necessary, create a frame record
2191 filled with the values of LR and previous frame pointer. The
2192 current FP is also set up if it is in use. */
2194 void
2195 aarch64_expand_prologue (void)
2197 /* sub sp, sp, #<frame_size>
2198 stp {fp, lr}, [sp, #<frame_size> - 16]
2199 add fp, sp, #<frame_size> - hardfp_offset
2200 stp {cs_reg}, [fp, #-16] etc.
2202 sub sp, sp, <final_adjustment_if_any> */
2204 HOST_WIDE_INT frame_size, offset;
2205 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2206 HOST_WIDE_INT hard_fp_offset;
2207 rtx_insn *insn;
2209 aarch64_layout_frame ();
2211 offset = frame_size = cfun->machine->frame.frame_size;
2212 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2213 fp_offset = frame_size - hard_fp_offset;
2215 if (flag_stack_usage_info)
2216 current_function_static_stack_size = frame_size;
2218 /* Store pairs and load pairs have a range of only -512 to 504. */
2219 if (offset >= 512)
2221 /* When the frame is large, the stack pointer is first decreased to
2222 skip over the callee-allocated save area for register varargs,
2223 the local variable area and/or the callee-saved register area.
2224 This allows the pre-indexed write-back store pair
2225 instructions to be used to set up the stack frame
2226 efficiently. */
2227 offset = hard_fp_offset;
2228 if (offset >= 512)
2229 offset = cfun->machine->frame.saved_regs_size;
2231 frame_size -= (offset + crtl->outgoing_args_size);
2232 fp_offset = 0;
2234 if (frame_size >= 0x1000000)
2236 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2237 emit_move_insn (op0, GEN_INT (-frame_size));
2238 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2240 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2241 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2242 plus_constant (Pmode, stack_pointer_rtx,
2243 -frame_size)));
2244 RTX_FRAME_RELATED_P (insn) = 1;
2246 else if (frame_size > 0)
2248 int hi_ofs = frame_size & 0xfff000;
2249 int lo_ofs = frame_size & 0x000fff;
2251 if (hi_ofs)
2253 insn = emit_insn (gen_add2_insn
2254 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2255 RTX_FRAME_RELATED_P (insn) = 1;
2257 if (lo_ofs)
2259 insn = emit_insn (gen_add2_insn
2260 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2261 RTX_FRAME_RELATED_P (insn) = 1;
2265 else
2266 frame_size = -1;
2268 if (offset > 0)
2270 bool skip_wb = false;
2272 if (frame_pointer_needed)
2274 skip_wb = true;
2276 if (fp_offset)
2278 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2279 GEN_INT (-offset)));
2280 RTX_FRAME_RELATED_P (insn) = 1;
2282 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2283 R30_REGNUM, false);
2285 else
2286 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2288 /* Set up frame pointer to point to the location of the
2289 previous frame pointer on the stack. */
2290 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2291 stack_pointer_rtx,
2292 GEN_INT (fp_offset)));
2293 RTX_FRAME_RELATED_P (insn) = 1;
2294 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2296 else
2298 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2299 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2301 if (fp_offset
2302 || reg1 == FIRST_PSEUDO_REGISTER
2303 || (reg2 == FIRST_PSEUDO_REGISTER
2304 && offset >= 256))
2306 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2307 GEN_INT (-offset)));
2308 RTX_FRAME_RELATED_P (insn) = 1;
2310 else
2312 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2314 skip_wb = true;
2316 if (reg2 == FIRST_PSEUDO_REGISTER)
2317 aarch64_pushwb_single_reg (mode1, reg1, offset);
2318 else
2319 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2323 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2324 skip_wb);
2325 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2326 skip_wb);
2329 /* When offset >= 512,
2330 sub sp, sp, #<outgoing_args_size> */
2331 if (frame_size > -1)
2333 if (crtl->outgoing_args_size > 0)
2335 insn = emit_insn (gen_add2_insn
2336 (stack_pointer_rtx,
2337 GEN_INT (- crtl->outgoing_args_size)));
2338 RTX_FRAME_RELATED_P (insn) = 1;
2343 /* Return TRUE if we can use a simple_return insn.
2345 This function checks whether the callee-saved stack is empty, which
2346 means no restore actions are needed. The pro_and_epilogue pass will use
2347 this to check whether the shrink-wrapping optimization is feasible. */
2349 bool
2350 aarch64_use_return_insn_p (void)
2352 if (!reload_completed)
2353 return false;
2355 if (crtl->profile)
2356 return false;
2358 aarch64_layout_frame ();
2360 return cfun->machine->frame.frame_size == 0;
2363 /* Generate the epilogue instructions for returning from a function. */
2364 void
2365 aarch64_expand_epilogue (bool for_sibcall)
2367 HOST_WIDE_INT frame_size, offset;
2368 HOST_WIDE_INT fp_offset;
2369 HOST_WIDE_INT hard_fp_offset;
2370 rtx_insn *insn;
2372 aarch64_layout_frame ();
2374 offset = frame_size = cfun->machine->frame.frame_size;
2375 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2376 fp_offset = frame_size - hard_fp_offset;
2378 /* Store pairs and load pairs have a range of only -512 to 504. */
2379 if (offset >= 512)
2381 offset = hard_fp_offset;
2382 if (offset >= 512)
2383 offset = cfun->machine->frame.saved_regs_size;
2385 frame_size -= (offset + crtl->outgoing_args_size);
2386 fp_offset = 0;
2387 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2389 insn = emit_insn (gen_add2_insn
2390 (stack_pointer_rtx,
2391 GEN_INT (crtl->outgoing_args_size)));
2392 RTX_FRAME_RELATED_P (insn) = 1;
2395 else
2396 frame_size = -1;
2398 /* If there were outgoing arguments or we've done dynamic stack
2399 allocation, then restore the stack pointer from the frame
2400 pointer. This is at most one insn and more efficient than using
2401 GCC's internal mechanism. */
2402 if (frame_pointer_needed
2403 && (crtl->outgoing_args_size || cfun->calls_alloca))
2405 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2406 hard_frame_pointer_rtx,
2407 GEN_INT (0)));
2408 offset = offset - fp_offset;
2411 if (offset > 0)
2413 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2414 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2415 bool skip_wb = true;
2416 rtx cfi_ops = NULL;
2418 if (frame_pointer_needed)
2419 fp_offset = 0;
2420 else if (fp_offset
2421 || reg1 == FIRST_PSEUDO_REGISTER
2422 || (reg2 == FIRST_PSEUDO_REGISTER
2423 && offset >= 256))
2424 skip_wb = false;
2426 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2427 skip_wb, &cfi_ops);
2428 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2429 skip_wb, &cfi_ops);
2431 if (skip_wb)
2433 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2434 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2436 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2437 if (reg2 == FIRST_PSEUDO_REGISTER)
2439 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2440 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2441 mem = gen_rtx_MEM (mode1, mem);
2442 insn = emit_move_insn (rreg1, mem);
2444 else
2446 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2448 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2449 insn = emit_insn (aarch64_gen_loadwb_pair
2450 (mode1, stack_pointer_rtx, rreg1,
2451 rreg2, offset));
2454 else
2456 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2457 GEN_INT (offset)));
2460 /* Reset the CFA to be SP + FRAME_SIZE. */
2461 rtx new_cfa = stack_pointer_rtx;
2462 if (frame_size > 0)
2463 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2464 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2465 REG_NOTES (insn) = cfi_ops;
2466 RTX_FRAME_RELATED_P (insn) = 1;
2469 if (frame_size > 0)
2471 if (frame_size >= 0x1000000)
2473 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2474 emit_move_insn (op0, GEN_INT (frame_size));
2475 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2477 else
2479 int hi_ofs = frame_size & 0xfff000;
2480 int lo_ofs = frame_size & 0x000fff;
2482 if (hi_ofs && lo_ofs)
2484 insn = emit_insn (gen_add2_insn
2485 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2486 RTX_FRAME_RELATED_P (insn) = 1;
2487 frame_size = lo_ofs;
2489 insn = emit_insn (gen_add2_insn
2490 (stack_pointer_rtx, GEN_INT (frame_size)));
2493 /* Reset the CFA to be SP + 0. */
2494 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2495 RTX_FRAME_RELATED_P (insn) = 1;
2498 /* Stack adjustment for exception handler. */
2499 if (crtl->calls_eh_return)
2501 /* We need to unwind the stack by the offset computed by
2502 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2503 to be SP; letting the CFA move during this adjustment
2504 is just as correct as retaining the CFA from the body
2505 of the function. Therefore, do nothing special. */
2506 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2509 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2510 if (!for_sibcall)
2511 emit_jump_insn (ret_rtx);
2514 /* Return the place to copy the exception unwinding return address to.
2515 This will probably be a stack slot, but could (in theory) be the
2516 return register. */
2517 rtx
2518 aarch64_final_eh_return_addr (void)
2520 HOST_WIDE_INT fp_offset;
2522 aarch64_layout_frame ();
2524 fp_offset = cfun->machine->frame.frame_size
2525 - cfun->machine->frame.hard_fp_offset;
2527 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2528 return gen_rtx_REG (DImode, LR_REGNUM);
2530 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2531 result in a store to save LR introduced by builtin_eh_return () being
2532 incorrectly deleted because the alias is not detected.
2533 So in the calculation of the address to copy the exception unwinding
2534 return address to, we note 2 cases.
2535 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2536 we return a SP-relative location since all the addresses are SP-relative
2537 in this case. This prevents the store from being optimized away.
2538 If the fp_offset is not 0, then the addresses will be FP-relative and
2539 therefore we return a FP-relative location. */
2541 if (frame_pointer_needed)
2543 if (fp_offset)
2544 return gen_frame_mem (DImode,
2545 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2546 else
2547 return gen_frame_mem (DImode,
2548 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2551 /* If FP is not needed, we calculate the location of LR, which would be
2552 at the top of the saved registers block. */
2554 return gen_frame_mem (DImode,
2555 plus_constant (Pmode,
2556 stack_pointer_rtx,
2557 fp_offset
2558 + cfun->machine->frame.saved_regs_size
2559 - 2 * UNITS_PER_WORD));
2562 /* Possibly output code to build up a constant in a register. For
2563 the benefit of the costs infrastructure, returns the number of
2564 instructions which would be emitted. GENERATE inhibits or
2565 enables code generation. */
2567 static int
2568 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2570 int insns = 0;
2572 if (aarch64_bitmask_imm (val, DImode))
2574 if (generate)
2575 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2576 insns = 1;
2578 else
2580 int i;
2581 int ncount = 0;
2582 int zcount = 0;
2583 HOST_WIDE_INT valp = val >> 16;
2584 HOST_WIDE_INT valm;
2585 HOST_WIDE_INT tval;
2587 for (i = 16; i < 64; i += 16)
2589 valm = (valp & 0xffff);
2591 if (valm != 0)
2592 ++ zcount;
2594 if (valm != 0xffff)
2595 ++ ncount;
2597 valp >>= 16;
2600 /* zcount contains the number of additional MOVK instructions
2601 required if the constant is built up with an initial MOVZ instruction,
2602 while ncount is the number of MOVK instructions required if starting
2603 with a MOVN instruction. Choose the sequence that yields the fewest
2604 instructions, preferring MOVZ instructions when the counts are
2605 the same. */
2606 if (ncount < zcount)
2608 if (generate)
2609 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2610 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2611 tval = 0xffff;
2612 insns++;
2614 else
2616 if (generate)
2617 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2618 GEN_INT (val & 0xffff));
2619 tval = 0;
2620 insns++;
2623 val >>= 16;
2625 for (i = 16; i < 64; i += 16)
2627 if ((val & 0xffff) != tval)
2629 if (generate)
2630 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2631 GEN_INT (i),
2632 GEN_INT (val & 0xffff)));
2633 insns++;
2635 val >>= 16;
2638 return insns;
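/* Illustrative worked example, not part of the original source: for
   VAL == 0x0000123400005678 the scan above finds zcount == 1 and
   ncount == 3, so the MOVZ sequence is chosen:

     movz  xN, #0x5678
     movk  xN, #0x1234, lsl #32          (insns == 2)

   whereas VAL == 0xffffffffffff1234 gives ncount == 0 < zcount == 3, so
   the MOVN-style path is taken and the initial move already matches the
   whole constant, leaving no MOVKs to emit (insns == 1).  */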
2641 static void
2642 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2644 HOST_WIDE_INT mdelta = delta;
2645 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2646 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2648 if (mdelta < 0)
2649 mdelta = -mdelta;
2651 if (mdelta >= 4096 * 4096)
2653 (void) aarch64_build_constant (scratchreg, delta, true);
2654 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2656 else if (mdelta > 0)
2658 if (mdelta >= 4096)
2660 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2661 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2662 if (delta < 0)
2663 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2664 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2665 else
2666 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2667 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2669 if (mdelta % 4096 != 0)
2671 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2672 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2673 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
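/* Illustrative worked example, not part of the original source: for
   DELTA == 20000 (0x4e20) the code above loads 20000 / 4096 == 4 into the
   scratch register, adds it shifted left by 12 (16384) to the destination,
   and then adds the remainder 20000 % 4096 == 3616; a small DELTA such as
   80 is instead added with a single PLUS.  */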
2678 /* Output code to add DELTA to the first argument, and then jump
2679 to FUNCTION. Used for C++ multiple inheritance. */
2680 static void
2681 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2682 HOST_WIDE_INT delta,
2683 HOST_WIDE_INT vcall_offset,
2684 tree function)
2686 /* The this pointer is always in x0. Note that this differs from
2687 Arm where the this pointer may be bumped to r1 if r0 is required
2688 to return a pointer to an aggregate. On AArch64 a result value
2689 pointer will be in x8. */
2690 int this_regno = R0_REGNUM;
2691 rtx this_rtx, temp0, temp1, addr, funexp;
2692 rtx_insn *insn;
2694 reload_completed = 1;
2695 emit_note (NOTE_INSN_PROLOGUE_END);
2697 if (vcall_offset == 0)
2698 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2699 else
2701 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2703 this_rtx = gen_rtx_REG (Pmode, this_regno);
2704 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2705 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2707 addr = this_rtx;
2708 if (delta != 0)
2710 if (delta >= -256 && delta < 256)
2711 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2712 plus_constant (Pmode, this_rtx, delta));
2713 else
2714 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2717 if (Pmode == ptr_mode)
2718 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2719 else
2720 aarch64_emit_move (temp0,
2721 gen_rtx_ZERO_EXTEND (Pmode,
2722 gen_rtx_MEM (ptr_mode, addr)));
2724 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2725 addr = plus_constant (Pmode, temp0, vcall_offset);
2726 else
2728 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2729 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2732 if (Pmode == ptr_mode)
2733 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2734 else
2735 aarch64_emit_move (temp1,
2736 gen_rtx_SIGN_EXTEND (Pmode,
2737 gen_rtx_MEM (ptr_mode, addr)));
2739 emit_insn (gen_add2_insn (this_rtx, temp1));
2742 /* Generate a tail call to the target function. */
2743 if (!TREE_USED (function))
2745 assemble_external (function);
2746 TREE_USED (function) = 1;
2748 funexp = XEXP (DECL_RTL (function), 0);
2749 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2750 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2751 SIBLING_CALL_P (insn) = 1;
2753 insn = get_insns ();
2754 shorten_branches (insn);
2755 final_start_function (insn, file, 1);
2756 final (insn, file, 1);
2757 final_end_function ();
2759 /* Stop pretending to be a post-reload pass. */
2760 reload_completed = 0;
2763 static int
2764 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2766 if (GET_CODE (*x) == SYMBOL_REF)
2767 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2769 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2770 TLS offsets, not real symbol references. */
2771 if (GET_CODE (*x) == UNSPEC
2772 && XINT (*x, 1) == UNSPEC_TLS)
2773 return -1;
2775 return 0;
2778 static bool
2779 aarch64_tls_referenced_p (rtx x)
2781 if (!TARGET_HAVE_TLS)
2782 return false;
2784 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2788 static int
2789 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2791 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2792 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2794 if (*imm1 < *imm2)
2795 return -1;
2796 if (*imm1 > *imm2)
2797 return +1;
2798 return 0;
2802 static void
2803 aarch64_build_bitmask_table (void)
2805 unsigned HOST_WIDE_INT mask, imm;
2806 unsigned int log_e, e, s, r;
2807 unsigned int nimms = 0;
2809 for (log_e = 1; log_e <= 6; log_e++)
2811 e = 1 << log_e;
2812 if (e == 64)
2813 mask = ~(HOST_WIDE_INT) 0;
2814 else
2815 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2816 for (s = 1; s < e; s++)
2818 for (r = 0; r < e; r++)
2820 /* Set S consecutive bits to 1 (S < 64). */
2821 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2822 /* Rotate right by R. */
2823 if (r != 0)
2824 imm = ((imm >> r) | (imm << (e - r))) & mask;
2825 /* Replicate the constant depending on the element size. */
2826 switch (log_e) {
2827 case 1: imm |= (imm << 2);
2828 case 2: imm |= (imm << 4);
2829 case 3: imm |= (imm << 8);
2830 case 4: imm |= (imm << 16);
2831 case 5: imm |= (imm << 32);
2832 case 6:
2833 break;
2834 default:
2835 gcc_unreachable ();
2837 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2838 aarch64_bitmasks[nimms++] = imm;
2843 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2844 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2845 aarch64_bitmasks_cmp);
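/* Illustrative worked example, not part of the original source: with
   element size E == 8 (LOG_E == 3), S == 3 and R == 1 the loops above
   produce imm == 0x07, rotate it to 0x83, and the deliberate switch
   fall-through replicates it 8 -> 16 -> 32 -> 64 bits, giving
   0x8383838383838383 as one entry of the sorted aarch64_bitmasks[]
   table.  */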
2849 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2850 a left shift of 0 or 12 bits. */
2851 bool
2852 aarch64_uimm12_shift (HOST_WIDE_INT val)
2854 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2855 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
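/* Illustrative examples, not part of the original source:
   aarch64_uimm12_shift accepts 0xabc (shift of 0) and 0xabc000 (shift of
   12) but rejects 0xabc00, whose set bits straddle the two fields.  */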
2860 /* Return true if val is an immediate that can be loaded into a
2861 register by a MOVZ instruction. */
2862 static bool
2863 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2865 if (GET_MODE_SIZE (mode) > 4)
2867 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2868 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2869 return 1;
2871 else
2873 /* Ignore sign extension. */
2874 val &= (HOST_WIDE_INT) 0xffffffff;
2876 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2877 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2881 /* Return true if val is a valid bitmask immediate. */
2882 bool
2883 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2885 if (GET_MODE_SIZE (mode) < 8)
2887 /* Replicate bit pattern. */
2888 val &= (HOST_WIDE_INT) 0xffffffff;
2889 val |= val << 32;
2891 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2892 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2896 /* Return true if val is an immediate that can be loaded into a
2897 register in a single instruction. */
2898 bool
2899 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2901 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2902 return 1;
2903 return aarch64_bitmask_imm (val, mode);
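/* Illustrative examples, not part of the original source, for DImode:
   0x0000ffff00000000 is accepted via aarch64_movw_imm (one 16-bit chunk
   at bit 32), 0x5555555555555555 via aarch64_bitmask_imm (a replicated
   bitmask pattern), while 0x0000000012345678 fails all three tests and
   needs a multi-instruction sequence such as the one built by
   aarch64_build_constant above.  */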
2906 static bool
2907 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2909 rtx base, offset;
2911 if (GET_CODE (x) == HIGH)
2912 return true;
2914 split_const (x, &base, &offset);
2915 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2917 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2918 != SYMBOL_FORCE_TO_MEM)
2919 return true;
2920 else
2921 /* Avoid generating a 64-bit relocation in ILP32; leave it
2922 to aarch64_expand_mov_immediate to handle it properly. */
2923 return mode != ptr_mode;
2926 return aarch64_tls_referenced_p (x);
2929 /* Return true if register REGNO is a valid index register.
2930 STRICT_P is true if REG_OK_STRICT is in effect. */
2932 bool
2933 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2935 if (!HARD_REGISTER_NUM_P (regno))
2937 if (!strict_p)
2938 return true;
2940 if (!reg_renumber)
2941 return false;
2943 regno = reg_renumber[regno];
2945 return GP_REGNUM_P (regno);
2948 /* Return true if register REGNO is a valid base register.
2949 STRICT_P is true if REG_OK_STRICT is in effect. */
2951 bool
2952 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2954 if (!HARD_REGISTER_NUM_P (regno))
2956 if (!strict_p)
2957 return true;
2959 if (!reg_renumber)
2960 return false;
2962 regno = reg_renumber[regno];
2965 /* The fake registers will be eliminated to either the stack or
2966 hard frame pointer, both of which are usually valid base registers.
2967 Reload deals with the cases where the eliminated form isn't valid. */
2968 return (GP_REGNUM_P (regno)
2969 || regno == SP_REGNUM
2970 || regno == FRAME_POINTER_REGNUM
2971 || regno == ARG_POINTER_REGNUM);
2974 /* Return true if X is a valid base register.
2975 STRICT_P is true if REG_OK_STRICT is in effect. */
2977 static bool
2978 aarch64_base_register_rtx_p (rtx x, bool strict_p)
2980 if (!strict_p && GET_CODE (x) == SUBREG)
2981 x = SUBREG_REG (x);
2983 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
2986 /* Return true if address offset is a valid index. If it is, fill in INFO
2987 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
2989 static bool
2990 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
2991 enum machine_mode mode, bool strict_p)
2993 enum aarch64_address_type type;
2994 rtx index;
2995 int shift;
2997 /* (reg:P) */
2998 if ((REG_P (x) || GET_CODE (x) == SUBREG)
2999 && GET_MODE (x) == Pmode)
3001 type = ADDRESS_REG_REG;
3002 index = x;
3003 shift = 0;
3005 /* (sign_extend:DI (reg:SI)) */
3006 else if ((GET_CODE (x) == SIGN_EXTEND
3007 || GET_CODE (x) == ZERO_EXTEND)
3008 && GET_MODE (x) == DImode
3009 && GET_MODE (XEXP (x, 0)) == SImode)
3011 type = (GET_CODE (x) == SIGN_EXTEND)
3012 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3013 index = XEXP (x, 0);
3014 shift = 0;
3016 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3017 else if (GET_CODE (x) == MULT
3018 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3019 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3020 && GET_MODE (XEXP (x, 0)) == DImode
3021 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3022 && CONST_INT_P (XEXP (x, 1)))
3024 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3025 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3026 index = XEXP (XEXP (x, 0), 0);
3027 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3029 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3030 else if (GET_CODE (x) == ASHIFT
3031 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3032 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3033 && GET_MODE (XEXP (x, 0)) == DImode
3034 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3035 && CONST_INT_P (XEXP (x, 1)))
3037 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3038 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3039 index = XEXP (XEXP (x, 0), 0);
3040 shift = INTVAL (XEXP (x, 1));
3042 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3043 else if ((GET_CODE (x) == SIGN_EXTRACT
3044 || GET_CODE (x) == ZERO_EXTRACT)
3045 && GET_MODE (x) == DImode
3046 && GET_CODE (XEXP (x, 0)) == MULT
3047 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3048 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3050 type = (GET_CODE (x) == SIGN_EXTRACT)
3051 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3052 index = XEXP (XEXP (x, 0), 0);
3053 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3054 if (INTVAL (XEXP (x, 1)) != 32 + shift
3055 || INTVAL (XEXP (x, 2)) != 0)
3056 shift = -1;
3058 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3059 (const_int 0xffffffff<<shift)) */
3060 else if (GET_CODE (x) == AND
3061 && GET_MODE (x) == DImode
3062 && GET_CODE (XEXP (x, 0)) == MULT
3063 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3064 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3065 && CONST_INT_P (XEXP (x, 1)))
3067 type = ADDRESS_REG_UXTW;
3068 index = XEXP (XEXP (x, 0), 0);
3069 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3070 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3071 shift = -1;
3073 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3074 else if ((GET_CODE (x) == SIGN_EXTRACT
3075 || GET_CODE (x) == ZERO_EXTRACT)
3076 && GET_MODE (x) == DImode
3077 && GET_CODE (XEXP (x, 0)) == ASHIFT
3078 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3079 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3081 type = (GET_CODE (x) == SIGN_EXTRACT)
3082 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3083 index = XEXP (XEXP (x, 0), 0);
3084 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3085 if (INTVAL (XEXP (x, 1)) != 32 + shift
3086 || INTVAL (XEXP (x, 2)) != 0)
3087 shift = -1;
3089 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3090 (const_int 0xffffffff<<shift)) */
3091 else if (GET_CODE (x) == AND
3092 && GET_MODE (x) == DImode
3093 && GET_CODE (XEXP (x, 0)) == ASHIFT
3094 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3095 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3096 && CONST_INT_P (XEXP (x, 1)))
3098 type = ADDRESS_REG_UXTW;
3099 index = XEXP (XEXP (x, 0), 0);
3100 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3101 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3102 shift = -1;
3104 /* (mult:P (reg:P) (const_int scale)) */
3105 else if (GET_CODE (x) == MULT
3106 && GET_MODE (x) == Pmode
3107 && GET_MODE (XEXP (x, 0)) == Pmode
3108 && CONST_INT_P (XEXP (x, 1)))
3110 type = ADDRESS_REG_REG;
3111 index = XEXP (x, 0);
3112 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3114 /* (ashift:P (reg:P) (const_int shift)) */
3115 else if (GET_CODE (x) == ASHIFT
3116 && GET_MODE (x) == Pmode
3117 && GET_MODE (XEXP (x, 0)) == Pmode
3118 && CONST_INT_P (XEXP (x, 1)))
3120 type = ADDRESS_REG_REG;
3121 index = XEXP (x, 0);
3122 shift = INTVAL (XEXP (x, 1));
3124 else
3125 return false;
3127 if (GET_CODE (index) == SUBREG)
3128 index = SUBREG_REG (index);
3130 if ((shift == 0 ||
3131 (shift > 0 && shift <= 3
3132 && (1 << shift) == GET_MODE_SIZE (mode)))
3133 && REG_P (index)
3134 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3136 info->type = type;
3137 info->offset = index;
3138 info->shift = shift;
3139 return true;
3142 return false;
3145 bool
3146 aarch64_offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3148 return (offset >= -64 * GET_MODE_SIZE (mode)
3149 && offset < 64 * GET_MODE_SIZE (mode)
3150 && offset % GET_MODE_SIZE (mode) == 0);
3153 static inline bool
3154 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3155 HOST_WIDE_INT offset)
3157 return offset >= -256 && offset < 256;
3160 static inline bool
3161 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3163 return (offset >= 0
3164 && offset < 4096 * GET_MODE_SIZE (mode)
3165 && offset % GET_MODE_SIZE (mode) == 0);
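/* Illustrative example, not part of the original source: for DImode
   (GET_MODE_SIZE == 8) the three predicates above accept
     7-bit signed scaled     -512 ... 504, multiples of 8
     9-bit signed unscaled   -256 ... 255, any alignment
     12-bit unsigned scaled   0 ... 32760, multiples of 8.  */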
3168 /* Return true if X is a valid address for machine mode MODE. If it is,
3169 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3170 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3172 static bool
3173 aarch64_classify_address (struct aarch64_address_info *info,
3174 rtx x, enum machine_mode mode,
3175 RTX_CODE outer_code, bool strict_p)
3177 enum rtx_code code = GET_CODE (x);
3178 rtx op0, op1;
3179 bool allow_reg_index_p =
3180 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3181 || aarch64_vector_mode_supported_p (mode));
3182 /* Don't support anything other than POST_INC or REG addressing for
3183 AdvSIMD. */
3184 if (aarch64_vect_struct_mode_p (mode)
3185 && (code != POST_INC && code != REG))
3186 return false;
3188 switch (code)
3190 case REG:
3191 case SUBREG:
3192 info->type = ADDRESS_REG_IMM;
3193 info->base = x;
3194 info->offset = const0_rtx;
3195 return aarch64_base_register_rtx_p (x, strict_p);
3197 case PLUS:
3198 op0 = XEXP (x, 0);
3199 op1 = XEXP (x, 1);
3201 if (! strict_p
3202 && REG_P (op0)
3203 && (op0 == virtual_stack_vars_rtx
3204 || op0 == frame_pointer_rtx
3205 || op0 == arg_pointer_rtx)
3206 && CONST_INT_P (op1))
3208 info->type = ADDRESS_REG_IMM;
3209 info->base = op0;
3210 info->offset = op1;
3212 return true;
3215 if (GET_MODE_SIZE (mode) != 0
3216 && CONST_INT_P (op1)
3217 && aarch64_base_register_rtx_p (op0, strict_p))
3219 HOST_WIDE_INT offset = INTVAL (op1);
3221 info->type = ADDRESS_REG_IMM;
3222 info->base = op0;
3223 info->offset = op1;
3225 /* TImode and TFmode values are allowed in both pairs of X
3226 registers and individual Q registers. The available
3227 address modes are:
3228 X,X: 7-bit signed scaled offset
3229 Q: 9-bit signed offset
3230 We conservatively require an offset representable in either mode. */
3232 if (mode == TImode || mode == TFmode)
3233 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3234 && offset_9bit_signed_unscaled_p (mode, offset));
3236 if (outer_code == PARALLEL)
3237 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3238 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3239 else
3240 return (offset_9bit_signed_unscaled_p (mode, offset)
3241 || offset_12bit_unsigned_scaled_p (mode, offset));
3244 if (allow_reg_index_p)
3246 /* Look for base + (scaled/extended) index register. */
3247 if (aarch64_base_register_rtx_p (op0, strict_p)
3248 && aarch64_classify_index (info, op1, mode, strict_p))
3250 info->base = op0;
3251 return true;
3253 if (aarch64_base_register_rtx_p (op1, strict_p)
3254 && aarch64_classify_index (info, op0, mode, strict_p))
3256 info->base = op1;
3257 return true;
3261 return false;
3263 case POST_INC:
3264 case POST_DEC:
3265 case PRE_INC:
3266 case PRE_DEC:
3267 info->type = ADDRESS_REG_WB;
3268 info->base = XEXP (x, 0);
3269 info->offset = NULL_RTX;
3270 return aarch64_base_register_rtx_p (info->base, strict_p);
3272 case POST_MODIFY:
3273 case PRE_MODIFY:
3274 info->type = ADDRESS_REG_WB;
3275 info->base = XEXP (x, 0);
3276 if (GET_CODE (XEXP (x, 1)) == PLUS
3277 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3278 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3279 && aarch64_base_register_rtx_p (info->base, strict_p))
3281 HOST_WIDE_INT offset;
3282 info->offset = XEXP (XEXP (x, 1), 1);
3283 offset = INTVAL (info->offset);
3285 /* TImode and TFmode values are allowed in both pairs of X
3286 registers and individual Q registers. The available
3287 address modes are:
3288 X,X: 7-bit signed scaled offset
3289 Q: 9-bit signed offset
3290 We conservatively require an offset representable in either mode. */
3292 if (mode == TImode || mode == TFmode)
3293 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3294 && offset_9bit_signed_unscaled_p (mode, offset));
3296 if (outer_code == PARALLEL)
3297 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3298 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3299 else
3300 return offset_9bit_signed_unscaled_p (mode, offset);
3302 return false;
3304 case CONST:
3305 case SYMBOL_REF:
3306 case LABEL_REF:
3307 /* load literal: pc-relative constant pool entry. Only supported
3308 for SI mode or larger. */
3309 info->type = ADDRESS_SYMBOLIC;
3310 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3312 rtx sym, addend;
3314 split_const (x, &sym, &addend);
3315 return (GET_CODE (sym) == LABEL_REF
3316 || (GET_CODE (sym) == SYMBOL_REF
3317 && CONSTANT_POOL_ADDRESS_P (sym)));
3319 return false;
3321 case LO_SUM:
3322 info->type = ADDRESS_LO_SUM;
3323 info->base = XEXP (x, 0);
3324 info->offset = XEXP (x, 1);
3325 if (allow_reg_index_p
3326 && aarch64_base_register_rtx_p (info->base, strict_p))
3328 rtx sym, offs;
3329 split_const (info->offset, &sym, &offs);
3330 if (GET_CODE (sym) == SYMBOL_REF
3331 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3332 == SYMBOL_SMALL_ABSOLUTE))
3334 /* The symbol and offset must be aligned to the access size. */
3335 unsigned int align;
3336 unsigned int ref_size;
3338 if (CONSTANT_POOL_ADDRESS_P (sym))
3339 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3340 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3342 tree exp = SYMBOL_REF_DECL (sym);
3343 align = TYPE_ALIGN (TREE_TYPE (exp));
3344 align = CONSTANT_ALIGNMENT (exp, align);
3346 else if (SYMBOL_REF_DECL (sym))
3347 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3348 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3349 && SYMBOL_REF_BLOCK (sym) != NULL)
3350 align = SYMBOL_REF_BLOCK (sym)->alignment;
3351 else
3352 align = BITS_PER_UNIT;
3354 ref_size = GET_MODE_SIZE (mode);
3355 if (ref_size == 0)
3356 ref_size = GET_MODE_SIZE (DImode);
3358 return ((INTVAL (offs) & (ref_size - 1)) == 0
3359 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3362 return false;
3364 default:
3365 return false;
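/* Illustrative examples, not part of the original source, of how the
   function above classifies common DImode addresses (assuming LP64, so
   Pmode == DImode; "foo" is a placeholder symbol):
     (reg x0)                                        -> ADDRESS_REG_IMM, offset 0
     (plus (reg x0) (const_int 16))                  -> ADDRESS_REG_IMM
     (plus (reg x0) (ashift (reg x1) (const_int 3))) -> ADDRESS_REG_REG, shift 3
     (post_inc (reg x0))                             -> ADDRESS_REG_WB
     (lo_sum (reg x0) (symbol_ref foo))              -> ADDRESS_LO_SUM, for a
                                                        suitably aligned
                                                        small-absolute symbol.  */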
3369 bool
3370 aarch64_symbolic_address_p (rtx x)
3372 rtx offset;
3374 split_const (x, &x, &offset);
3375 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3378 /* Classify the base of symbolic expression X, given that X appears in
3379 context CONTEXT. */
3381 enum aarch64_symbol_type
3382 aarch64_classify_symbolic_expression (rtx x,
3383 enum aarch64_symbol_context context)
3385 rtx offset;
3387 split_const (x, &x, &offset);
3388 return aarch64_classify_symbol (x, context);
3392 /* Return TRUE if X is a legitimate address for accessing memory in
3393 mode MODE. */
3394 static bool
3395 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3397 struct aarch64_address_info addr;
3399 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3402 /* Return TRUE if X is a legitimate address for accessing memory in
3403 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3404 pair operation. */
3405 bool
3406 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3407 RTX_CODE outer_code, bool strict_p)
3409 struct aarch64_address_info addr;
3411 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3414 /* Return TRUE if rtx X is immediate constant 0.0 */
3415 bool
3416 aarch64_float_const_zero_rtx_p (rtx x)
3418 REAL_VALUE_TYPE r;
3420 if (GET_MODE (x) == VOIDmode)
3421 return false;
3423 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3424 if (REAL_VALUE_MINUS_ZERO (r))
3425 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3426 return REAL_VALUES_EQUAL (r, dconst0);
3429 /* Return the fixed registers used for condition codes. */
3431 static bool
3432 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3434 *p1 = CC_REGNUM;
3435 *p2 = INVALID_REGNUM;
3436 return true;
3439 /* Emit call insn with PAT and do aarch64-specific handling. */
3441 void
3442 aarch64_emit_call_insn (rtx pat)
3444 rtx insn = emit_call_insn (pat);
3446 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3447 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3448 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3451 enum machine_mode
3452 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3454 /* All floating point compares return CCFP if it is an equality
3455 comparison, and CCFPE otherwise. */
3456 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3458 switch (code)
3460 case EQ:
3461 case NE:
3462 case UNORDERED:
3463 case ORDERED:
3464 case UNLT:
3465 case UNLE:
3466 case UNGT:
3467 case UNGE:
3468 case UNEQ:
3469 case LTGT:
3470 return CCFPmode;
3472 case LT:
3473 case LE:
3474 case GT:
3475 case GE:
3476 return CCFPEmode;
3478 default:
3479 gcc_unreachable ();
3483 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3484 && y == const0_rtx
3485 && (code == EQ || code == NE || code == LT || code == GE)
3486 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3487 || GET_CODE (x) == NEG))
3488 return CC_NZmode;
3490 /* A compare with a shifted operand. Because of canonicalization,
3491 the comparison will have to be swapped when we emit the assembly
3492 code. */
3493 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3494 && (REG_P (y) || GET_CODE (y) == SUBREG)
3495 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3496 || GET_CODE (x) == LSHIFTRT
3497 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3498 return CC_SWPmode;
3500 /* Similarly for a negated operand, but we can only do this for
3501 equalities. */
3502 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3503 && (REG_P (y) || GET_CODE (y) == SUBREG)
3504 && (code == EQ || code == NE)
3505 && GET_CODE (x) == NEG)
3506 return CC_Zmode;
3508 /* A compare of a mode narrower than SI mode against zero can be done
3509 by extending the value in the comparison. */
3510 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3511 && y == const0_rtx)
3512 /* Only use sign-extension if we really need it. */
3513 return ((code == GT || code == GE || code == LE || code == LT)
3514 ? CC_SESWPmode : CC_ZESWPmode);
3516 /* For everything else, return CCmode. */
3517 return CCmode;
3520 int
3521 aarch64_get_condition_code (rtx x)
3523 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3524 enum rtx_code comp_code = GET_CODE (x);
3526 if (GET_MODE_CLASS (mode) != MODE_CC)
3527 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3529 switch (mode)
3531 case CCFPmode:
3532 case CCFPEmode:
3533 switch (comp_code)
3535 case GE: return AARCH64_GE;
3536 case GT: return AARCH64_GT;
3537 case LE: return AARCH64_LS;
3538 case LT: return AARCH64_MI;
3539 case NE: return AARCH64_NE;
3540 case EQ: return AARCH64_EQ;
3541 case ORDERED: return AARCH64_VC;
3542 case UNORDERED: return AARCH64_VS;
3543 case UNLT: return AARCH64_LT;
3544 case UNLE: return AARCH64_LE;
3545 case UNGT: return AARCH64_HI;
3546 case UNGE: return AARCH64_PL;
3547 default: return -1;
3549 break;
3551 case CCmode:
3552 switch (comp_code)
3554 case NE: return AARCH64_NE;
3555 case EQ: return AARCH64_EQ;
3556 case GE: return AARCH64_GE;
3557 case GT: return AARCH64_GT;
3558 case LE: return AARCH64_LE;
3559 case LT: return AARCH64_LT;
3560 case GEU: return AARCH64_CS;
3561 case GTU: return AARCH64_HI;
3562 case LEU: return AARCH64_LS;
3563 case LTU: return AARCH64_CC;
3564 default: return -1;
3566 break;
3568 case CC_SWPmode:
3569 case CC_ZESWPmode:
3570 case CC_SESWPmode:
3571 switch (comp_code)
3573 case NE: return AARCH64_NE;
3574 case EQ: return AARCH64_EQ;
3575 case GE: return AARCH64_LE;
3576 case GT: return AARCH64_LT;
3577 case LE: return AARCH64_GE;
3578 case LT: return AARCH64_GT;
3579 case GEU: return AARCH64_LS;
3580 case GTU: return AARCH64_CC;
3581 case LEU: return AARCH64_CS;
3582 case LTU: return AARCH64_HI;
3583 default: return -1;
3585 break;
3587 case CC_NZmode:
3588 switch (comp_code)
3590 case NE: return AARCH64_NE;
3591 case EQ: return AARCH64_EQ;
3592 case GE: return AARCH64_PL;
3593 case LT: return AARCH64_MI;
3594 default: return -1;
3596 break;
3598 case CC_Zmode:
3599 switch (comp_code)
3601 case NE: return AARCH64_NE;
3602 case EQ: return AARCH64_EQ;
3603 default: return -1;
3605 break;
3607 default:
3608 return -1;
3609 break;
3613 bool
3614 aarch64_const_vec_all_same_in_range_p (rtx x,
3615 HOST_WIDE_INT minval,
3616 HOST_WIDE_INT maxval)
3618 HOST_WIDE_INT firstval;
3619 int count, i;
3621 if (GET_CODE (x) != CONST_VECTOR
3622 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3623 return false;
3625 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3626 if (firstval < minval || firstval > maxval)
3627 return false;
3629 count = CONST_VECTOR_NUNITS (x);
3630 for (i = 1; i < count; i++)
3631 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3632 return false;
3634 return true;
3637 bool
3638 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3640 return aarch64_const_vec_all_same_in_range_p (x, val, val);
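/* Return the number of bits set in VALUE.  Each iteration of the loop
   below clears the lowest set bit (value &= value - 1), so the loop
   executes once per set bit.  */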
3643 static unsigned
3644 bit_count (unsigned HOST_WIDE_INT value)
3646 unsigned count = 0;
3648 while (value)
3650 count++;
3651 value &= value - 1;
3654 return count;
3657 void
3658 aarch64_print_operand (FILE *f, rtx x, char code)
3660 switch (code)
3662 /* An integer or symbol address without a preceding # sign. */
3663 case 'c':
3664 switch (GET_CODE (x))
3666 case CONST_INT:
3667 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3668 break;
3670 case SYMBOL_REF:
3671 output_addr_const (f, x);
3672 break;
3674 case CONST:
3675 if (GET_CODE (XEXP (x, 0)) == PLUS
3676 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3678 output_addr_const (f, x);
3679 break;
3681 /* Fall through. */
3683 default:
3684 output_operand_lossage ("Unsupported operand for code '%c'", code);
3686 break;
3688 case 'e':
3689 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3691 int n;
3693 if (!CONST_INT_P (x)
3694 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3696 output_operand_lossage ("invalid operand for '%%%c'", code);
3697 return;
3700 switch (n)
3702 case 3:
3703 fputc ('b', f);
3704 break;
3705 case 4:
3706 fputc ('h', f);
3707 break;
3708 case 5:
3709 fputc ('w', f);
3710 break;
3711 default:
3712 output_operand_lossage ("invalid operand for '%%%c'", code);
3713 return;
3716 break;
3718 case 'p':
3720 int n;
3722 /* Print N such that 2^N == X. */
3723 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
3725 output_operand_lossage ("invalid operand for '%%%c'", code);
3726 return;
3729 asm_fprintf (f, "%d", n);
3731 break;
3733 case 'P':
3734 /* Print the number of non-zero bits in X (a const_int). */
3735 if (!CONST_INT_P (x))
3737 output_operand_lossage ("invalid operand for '%%%c'", code);
3738 return;
3741 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3742 break;
3744 case 'H':
3745 /* Print the higher numbered register of a pair (TImode) of regs. */
3746 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
3748 output_operand_lossage ("invalid operand for '%%%c'", code);
3749 return;
3752 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3753 break;
3755 case 'm':
3757 int cond_code;
3758 /* Print a condition (eq, ne, etc). */
3760 /* CONST_TRUE_RTX means always -- that's the default. */
3761 if (x == const_true_rtx)
3762 return;
3764 if (!COMPARISON_P (x))
3766 output_operand_lossage ("invalid operand for '%%%c'", code);
3767 return;
3770 cond_code = aarch64_get_condition_code (x);
3771 gcc_assert (cond_code >= 0);
3772 fputs (aarch64_condition_codes[cond_code], f);
3774 break;
3776 case 'M':
3778 int cond_code;
3779 /* Print the inverse of a condition (eq <-> ne, etc). */
3781 /* CONST_TRUE_RTX means never -- that's the default. */
3782 if (x == const_true_rtx)
3784 fputs ("nv", f);
3785 return;
3788 if (!COMPARISON_P (x))
3790 output_operand_lossage ("invalid operand for '%%%c'", code);
3791 return;
3793 cond_code = aarch64_get_condition_code (x);
3794 gcc_assert (cond_code >= 0);
3795 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3796 (cond_code)], f);
3798 break;
3800 case 'b':
3801 case 'h':
3802 case 's':
3803 case 'd':
3804 case 'q':
3805 /* Print a scalar FP/SIMD register name. */
3806 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3808 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3809 return;
3811 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3812 break;
3814 case 'S':
3815 case 'T':
3816 case 'U':
3817 case 'V':
3818 /* Print the first FP/SIMD register name in a list. */
3819 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3821 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3822 return;
3824 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3825 break;
3827 case 'X':
3828 /* Print bottom 16 bits of integer constant in hex. */
3829 if (!CONST_INT_P (x))
3831 output_operand_lossage ("invalid operand for '%%%c'", code);
3832 return;
3834 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3835 break;
3837 case 'w':
3838 case 'x':
3839 /* Print a general register name or the zero register (32-bit or
3840 64-bit). */
3841 if (x == const0_rtx
3842 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3844 asm_fprintf (f, "%czr", code);
3845 break;
3848 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3850 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3851 break;
3854 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3856 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3857 break;
3860 /* Fall through */
3862 case 0:
3863 /* Print a normal operand; if it's a general register, then we
3864 assume DImode. */
3865 if (x == NULL)
3867 output_operand_lossage ("missing operand");
3868 return;
3871 switch (GET_CODE (x))
3873 case REG:
3874 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3875 break;
3877 case MEM:
3878 aarch64_memory_reference_mode = GET_MODE (x);
3879 output_address (XEXP (x, 0));
3880 break;
3882 case LABEL_REF:
3883 case SYMBOL_REF:
3884 output_addr_const (asm_out_file, x);
3885 break;
3887 case CONST_INT:
3888 asm_fprintf (f, "%wd", INTVAL (x));
3889 break;
3891 case CONST_VECTOR:
3892 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3894 gcc_assert (
3895 aarch64_const_vec_all_same_in_range_p (x,
3896 HOST_WIDE_INT_MIN,
3897 HOST_WIDE_INT_MAX));
3898 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3900 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3902 fputc ('0', f);
3904 else
3905 gcc_unreachable ();
3906 break;
3908 case CONST_DOUBLE:
3909 /* CONST_DOUBLE can represent a double-width integer.
3910 In this case, the mode of x is VOIDmode. */
3911 if (GET_MODE (x) == VOIDmode)
3912 ; /* Do Nothing. */
3913 else if (aarch64_float_const_zero_rtx_p (x))
3915 fputc ('0', f);
3916 break;
3918 else if (aarch64_float_const_representable_p (x))
3920 #define buf_size 20
3921 char float_buf[buf_size] = {'\0'};
3922 REAL_VALUE_TYPE r;
3923 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3924 real_to_decimal_for_mode (float_buf, &r,
3925 buf_size, buf_size,
3926 1, GET_MODE (x));
3927 asm_fprintf (asm_out_file, "%s", float_buf);
3928 break;
3929 #undef buf_size
3931 output_operand_lossage ("invalid constant");
3932 return;
3933 default:
3934 output_operand_lossage ("invalid operand");
3935 return;
3937 break;
3939 case 'A':
3940 if (GET_CODE (x) == HIGH)
3941 x = XEXP (x, 0);
3943 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3945 case SYMBOL_SMALL_GOT:
3946 asm_fprintf (asm_out_file, ":got:");
3947 break;
3949 case SYMBOL_SMALL_TLSGD:
3950 asm_fprintf (asm_out_file, ":tlsgd:");
3951 break;
3953 case SYMBOL_SMALL_TLSDESC:
3954 asm_fprintf (asm_out_file, ":tlsdesc:");
3955 break;
3957 case SYMBOL_SMALL_GOTTPREL:
3958 asm_fprintf (asm_out_file, ":gottprel:");
3959 break;
3961 case SYMBOL_SMALL_TPREL:
3962 asm_fprintf (asm_out_file, ":tprel:");
3963 break;
3965 case SYMBOL_TINY_GOT:
3966 gcc_unreachable ();
3967 break;
3969 default:
3970 break;
3972 output_addr_const (asm_out_file, x);
3973 break;
3975 case 'L':
3976 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3978 case SYMBOL_SMALL_GOT:
3979 asm_fprintf (asm_out_file, ":lo12:");
3980 break;
3982 case SYMBOL_SMALL_TLSGD:
3983 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3984 break;
3986 case SYMBOL_SMALL_TLSDESC:
3987 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3988 break;
3990 case SYMBOL_SMALL_GOTTPREL:
3991 asm_fprintf (asm_out_file, ":gottprel_lo12:");
3992 break;
3994 case SYMBOL_SMALL_TPREL:
3995 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3996 break;
3998 case SYMBOL_TINY_GOT:
3999 asm_fprintf (asm_out_file, ":got:");
4000 break;
4002 default:
4003 break;
4005 output_addr_const (asm_out_file, x);
4006 break;
4008 case 'G':
4010 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4012 case SYMBOL_SMALL_TPREL:
4013 asm_fprintf (asm_out_file, ":tprel_hi12:");
4014 break;
4015 default:
4016 break;
4018 output_addr_const (asm_out_file, x);
4019 break;
4021 default:
4022 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4023 return;
4027 void
4028 aarch64_print_operand_address (FILE *f, rtx x)
4030 struct aarch64_address_info addr;
4032 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4033 MEM, true))
4034 switch (addr.type)
4036 case ADDRESS_REG_IMM:
4037 if (addr.offset == const0_rtx)
4038 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4039 else
4040 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4041 INTVAL (addr.offset));
4042 return;
4044 case ADDRESS_REG_REG:
4045 if (addr.shift == 0)
4046 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4047 reg_names [REGNO (addr.offset)]);
4048 else
4049 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4050 reg_names [REGNO (addr.offset)], addr.shift);
4051 return;
4053 case ADDRESS_REG_UXTW:
4054 if (addr.shift == 0)
4055 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4056 REGNO (addr.offset) - R0_REGNUM);
4057 else
4058 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4059 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4060 return;
4062 case ADDRESS_REG_SXTW:
4063 if (addr.shift == 0)
4064 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4065 REGNO (addr.offset) - R0_REGNUM);
4066 else
4067 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4068 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4069 return;
4071 case ADDRESS_REG_WB:
4072 switch (GET_CODE (x))
4074 case PRE_INC:
4075 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4076 GET_MODE_SIZE (aarch64_memory_reference_mode));
4077 return;
4078 case POST_INC:
4079 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4080 GET_MODE_SIZE (aarch64_memory_reference_mode));
4081 return;
4082 case PRE_DEC:
4083 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4084 GET_MODE_SIZE (aarch64_memory_reference_mode));
4085 return;
4086 case POST_DEC:
4087 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4088 GET_MODE_SIZE (aarch64_memory_reference_mode));
4089 return;
4090 case PRE_MODIFY:
4091 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4092 INTVAL (addr.offset));
4093 return;
4094 case POST_MODIFY:
4095 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4096 INTVAL (addr.offset));
4097 return;
4098 default:
4099 break;
4101 break;
4103 case ADDRESS_LO_SUM:
4104 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4105 output_addr_const (f, addr.offset);
4106 asm_fprintf (f, "]");
4107 return;
4109 case ADDRESS_SYMBOLIC:
4110 break;
4113 output_addr_const (f, x);
4116 bool
4117 aarch64_label_mentioned_p (rtx x)
4119 const char *fmt;
4120 int i;
4122 if (GET_CODE (x) == LABEL_REF)
4123 return true;
4125 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4126 referencing instruction, but they are constant offsets, not
4127 symbols. */
4128 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4129 return false;
4131 fmt = GET_RTX_FORMAT (GET_CODE (x));
4132 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4134 if (fmt[i] == 'E')
4136 int j;
4138 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4139 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4140 return 1;
4142 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4143 return 1;
4146 return 0;
4149 /* Implement REGNO_REG_CLASS. */
4151 enum reg_class
4152 aarch64_regno_regclass (unsigned regno)
4154 if (GP_REGNUM_P (regno))
4155 return GENERAL_REGS;
4157 if (regno == SP_REGNUM)
4158 return STACK_REG;
4160 if (regno == FRAME_POINTER_REGNUM
4161 || regno == ARG_POINTER_REGNUM)
4162 return POINTER_REGS;
4164 if (FP_REGNUM_P (regno))
4165 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4167 return NO_REGS;
4170 static rtx
4171 aarch64_legitimize_address (rtx x, rtx /* orig_x */, enum machine_mode mode)
4173 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4174 where mask is selected by alignment and size of the offset.
4175 We try to pick as large a range for the offset as possible to
4176 maximize the chance of a CSE. However, for aligned addresses
4177 we limit the range to 4k so that structures with different-sized
4178 elements are likely to use the same base. */
4180 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4182 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4183 HOST_WIDE_INT base_offset;
4185 /* Does it look like we'll need a load/store-pair operation? */
4186 if (GET_MODE_SIZE (mode) > 16
4187 || mode == TImode)
4188 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4189 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4190 /* For offsets that aren't a multiple of the access size, the limit is
4191 -256...255. */
4192 else if (offset & (GET_MODE_SIZE (mode) - 1))
4193 base_offset = (offset + 0x100) & ~0x1ff;
4194 else
4195 base_offset = offset & ~0xfff;
4197 if (base_offset == 0)
4198 return x;
4200 offset -= base_offset;
4201 rtx base_reg = gen_reg_rtx (Pmode);
4202 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4203 NULL_RTX);
4204 emit_move_insn (base_reg, val);
4205 x = plus_constant (Pmode, base_reg, offset);
4208 return x;
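/* As a worked example (a sketch; the register numbers are arbitrary): for
   an SImode access to (plus (reg x1) (const_int 0x12344)) the offset is a
   multiple of the access size, so base_offset = 0x12344 & ~0xfff = 0x12000
   and we emit roughly
     add  tmp, x1, #0x12000
     ldr  w0, [tmp, #0x344]
   leaving a small residual offset that nearby accesses can share.  Had the
   offset been misaligned (say 0x12345), the -256..255 path would pick
   base_offset = (0x12345 + 0x100) & ~0x1ff = 0x12400 instead.  */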
4211 /* Try a machine-dependent way of reloading an illegitimate address
4212 operand. If we find one, push the reload and return the new rtx. */
4215 aarch64_legitimize_reload_address (rtx *x_p,
4216 enum machine_mode mode,
4217 int opnum, int type,
4218 int ind_levels ATTRIBUTE_UNUSED)
4220 rtx x = *x_p;
4222 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4223 if (aarch64_vect_struct_mode_p (mode)
4224 && GET_CODE (x) == PLUS
4225 && REG_P (XEXP (x, 0))
4226 && CONST_INT_P (XEXP (x, 1)))
4228 rtx orig_rtx = x;
4229 x = copy_rtx (x);
4230 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4231 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4232 opnum, (enum reload_type) type);
4233 return x;
4236 /* We must recognize output that we have already generated ourselves. */
4237 if (GET_CODE (x) == PLUS
4238 && GET_CODE (XEXP (x, 0)) == PLUS
4239 && REG_P (XEXP (XEXP (x, 0), 0))
4240 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4241 && CONST_INT_P (XEXP (x, 1)))
4243 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4244 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4245 opnum, (enum reload_type) type);
4246 return x;
4249 /* We wish to handle large displacements off a base register by splitting
4250 the addend across an add and the mem insn. This can cut the number of
4251 extra insns needed from 3 to 1. It is only useful for load/store of a
4252 single register with a 12-bit offset field. */
4253 if (GET_CODE (x) == PLUS
4254 && REG_P (XEXP (x, 0))
4255 && CONST_INT_P (XEXP (x, 1))
4256 && HARD_REGISTER_P (XEXP (x, 0))
4257 && mode != TImode
4258 && mode != TFmode
4259 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4261 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4262 HOST_WIDE_INT low = val & 0xfff;
4263 HOST_WIDE_INT high = val - low;
4264 HOST_WIDE_INT offs;
4265 rtx cst;
4266 enum machine_mode xmode = GET_MODE (x);
4268 /* In ILP32, xmode can be either DImode or SImode. */
4269 gcc_assert (xmode == DImode || xmode == SImode);
4271 /* Let the generic reload code handle non-zero BLKmode offsets, because
4272 we cannot ascertain BLKmode alignment. */
4273 if (GET_MODE_SIZE (mode) == 0)
4274 return NULL_RTX;
4276 offs = low % GET_MODE_SIZE (mode);
4278 /* Align misaligned offset by adjusting high part to compensate. */
4279 if (offs != 0)
4281 if (aarch64_uimm12_shift (high + offs))
4283 /* Align down. */
4284 low = low - offs;
4285 high = high + offs;
4287 else
4289 /* Align up. */
4290 offs = GET_MODE_SIZE (mode) - offs;
4291 low = low + offs;
4292 high = high + (low & 0x1000) - offs;
4293 low &= 0xfff;
4297 /* Check for overflow. */
4298 if (high + low != val)
4299 return NULL_RTX;
4301 cst = GEN_INT (high);
4302 if (!aarch64_uimm12_shift (high))
4303 cst = force_const_mem (xmode, cst);
4305 /* Reload high part into base reg, leaving the low part
4306 in the mem instruction.
4307 Note that replacing this gen_rtx_PLUS with plus_constant is
4308 wrong in this case because we rely on the
4309 (plus (plus reg c1) c2) structure being preserved so that
4310 XEXP (*p, 0) in push_reload below uses the correct term. */
4311 x = gen_rtx_PLUS (xmode,
4312 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4313 GEN_INT (low));
4315 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4316 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4317 opnum, (enum reload_type) type);
4318 return x;
4321 return NULL_RTX;
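/* As a sketch of the splitting above (the values are made up): reloading
   (mem:SI (plus (reg x1) (const_int 0x13004))) gives
     low  = 0x004    -- fits the scaled 12-bit LDR/STR offset field
     high = 0x13000  -- a 12-bit immediate shifted by 12
   so the inner (plus x1 0x13000) is pushed as a reload (a single ADD into
   a base register) and the memory access keeps the #4 offset.  */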
4325 static reg_class_t
4326 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4327 reg_class_t rclass,
4328 enum machine_mode mode,
4329 secondary_reload_info *sri)
4331 /* Without the TARGET_SIMD instructions we cannot move a Q register
4332 to a Q register directly. We need a scratch. */
4333 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4334 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4335 && reg_class_subset_p (rclass, FP_REGS))
4337 if (mode == TFmode)
4338 sri->icode = CODE_FOR_aarch64_reload_movtf;
4339 else if (mode == TImode)
4340 sri->icode = CODE_FOR_aarch64_reload_movti;
4341 return NO_REGS;
4344 /* A TFmode or TImode memory access should be handled via FP_REGS
4345 because AArch64 has richer addressing modes for LDR/STR instructions
4346 than LDP/STP instructions. */
4347 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4348 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4349 return FP_REGS;
4351 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4352 return GENERAL_REGS;
4354 return NO_REGS;
4357 static bool
4358 aarch64_can_eliminate (const int from, const int to)
4360 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4361 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4363 if (frame_pointer_needed)
4365 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4366 return true;
4367 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4368 return false;
4369 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4370 && !cfun->calls_alloca)
4371 return true;
4372 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4373 return true;
4375 return false;
4378 return true;
4381 HOST_WIDE_INT
4382 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4384 aarch64_layout_frame ();
4386 if (to == HARD_FRAME_POINTER_REGNUM)
4388 if (from == ARG_POINTER_REGNUM)
4389 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4391 if (from == FRAME_POINTER_REGNUM)
4392 return (cfun->machine->frame.hard_fp_offset
4393 - cfun->machine->frame.saved_varargs_size);
4396 if (to == STACK_POINTER_REGNUM)
4398 if (from == FRAME_POINTER_REGNUM)
4399 return (cfun->machine->frame.frame_size
4400 - cfun->machine->frame.saved_varargs_size);
4403 return cfun->machine->frame.frame_size;
4406 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4407 previous frame. */
4410 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4412 if (count != 0)
4413 return const0_rtx;
4414 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4418 static void
4419 aarch64_asm_trampoline_template (FILE *f)
4421 if (TARGET_ILP32)
4423 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4424 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4426 else
4428 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4429 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4431 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4432 assemble_aligned_integer (4, const0_rtx);
4433 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4434 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
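/* In the LP64 case the template above assembles to roughly
       ldr   x17, .+16     // function address, patched in below
       ldr   x18, .+20     // static chain value, patched in below
       br    x17
       .word 0             // pad so the data slots are 8-byte aligned
       .xword 0            // slot for the function address
       .xword 0            // slot for the static chain
   and aarch64_trampoline_init below fills in the two data slots.  */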
4437 static void
4438 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4440 rtx fnaddr, mem, a_tramp;
4441 const int tramp_code_sz = 16;
4443 /* Don't need to copy the trailing D-words, we fill those in below. */
4444 emit_block_move (m_tramp, assemble_trampoline_template (),
4445 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4446 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4447 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4448 if (GET_MODE (fnaddr) != ptr_mode)
4449 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4450 emit_move_insn (mem, fnaddr);
4452 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4453 emit_move_insn (mem, chain_value);
4455 /* XXX We should really define a "clear_cache" pattern and use
4456 gen_clear_cache(). */
4457 a_tramp = XEXP (m_tramp, 0);
4458 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4459 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4460 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4461 ptr_mode);
4464 static unsigned char
4465 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4467 switch (regclass)
4469 case CALLER_SAVE_REGS:
4470 case POINTER_REGS:
4471 case GENERAL_REGS:
4472 case ALL_REGS:
4473 case FP_REGS:
4474 case FP_LO_REGS:
4475 return
4476 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4477 (GET_MODE_SIZE (mode) + 7) / 8;
4478 case STACK_REG:
4479 return 1;
4481 case NO_REGS:
4482 return 0;
4484 default:
4485 break;
4487 gcc_unreachable ();
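/* For example, aarch64_class_max_nregs returns 1 for a 16-byte vector mode
   such as V4SImode ((16 + 15) / 16), since it fits in a single 128-bit
   FP/SIMD register, but 2 for the same 16 bytes as TImode in the general
   registers ((16 + 7) / 8).  */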
4490 static reg_class_t
4491 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4493 if (regclass == POINTER_REGS)
4494 return GENERAL_REGS;
4496 if (regclass == STACK_REG)
4498 if (REG_P(x)
4499 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4500 return regclass;
4502 return NO_REGS;
4505 /* If it's an integer immediate that MOVI can't handle, then
4506 FP_REGS is not an option, so we return NO_REGS instead. */
4507 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4508 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4509 return NO_REGS;
4511 /* Register elimination can result in a request for
4512 SP+constant->FP_REGS. We cannot support such operations, which
4513 use SP as source and an FP_REG as destination, so reject them
4514 right now. */
4515 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4517 rtx lhs = XEXP (x, 0);
4519 /* Look through a possible SUBREG introduced by ILP32. */
4520 if (GET_CODE (lhs) == SUBREG)
4521 lhs = SUBREG_REG (lhs);
4523 gcc_assert (REG_P (lhs));
4524 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4525 POINTER_REGS));
4526 return NO_REGS;
4529 return regclass;
4532 void
4533 aarch64_asm_output_labelref (FILE* f, const char *name)
4535 asm_fprintf (f, "%U%s", name);
4538 static void
4539 aarch64_elf_asm_constructor (rtx symbol, int priority)
4541 if (priority == DEFAULT_INIT_PRIORITY)
4542 default_ctor_section_asm_out_constructor (symbol, priority);
4543 else
4545 section *s;
4546 char buf[18];
4547 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4548 s = get_section (buf, SECTION_WRITE, NULL);
4549 switch_to_section (s);
4550 assemble_align (POINTER_SIZE);
4551 assemble_aligned_integer (POINTER_BYTES, symbol);
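/* E.g. a constructor with priority 101 is emitted into a writable
   ".init_array.00101" section (the destructor counterpart below uses
   ".fini_array.00101"), which the linker sorts by priority.  */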
4555 static void
4556 aarch64_elf_asm_destructor (rtx symbol, int priority)
4558 if (priority == DEFAULT_INIT_PRIORITY)
4559 default_dtor_section_asm_out_destructor (symbol, priority);
4560 else
4562 section *s;
4563 char buf[18];
4564 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4565 s = get_section (buf, SECTION_WRITE, NULL);
4566 switch_to_section (s);
4567 assemble_align (POINTER_SIZE);
4568 assemble_aligned_integer (POINTER_BYTES, symbol);
4572 const char*
4573 aarch64_output_casesi (rtx *operands)
4575 char buf[100];
4576 char label[100];
4577 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
4578 int index;
4579 static const char *const patterns[4][2] =
4582 "ldrb\t%w3, [%0,%w1,uxtw]",
4583 "add\t%3, %4, %w3, sxtb #2"
4586 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4587 "add\t%3, %4, %w3, sxth #2"
4590 "ldr\t%w3, [%0,%w1,uxtw #2]",
4591 "add\t%3, %4, %w3, sxtw #2"
4593 /* We assume that DImode is only generated when not optimizing and
4594 that we don't really need 64-bit address offsets. That would
4595 imply an object file with 8GB of code in a single function! */
4597 "ldr\t%w3, [%0,%w1,uxtw #2]",
4598 "add\t%3, %4, %w3, sxtw #2"
4602 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4604 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4606 gcc_assert (index >= 0 && index <= 3);
4608 /* Need to implement table size reduction, by changing the code below. */
4609 output_asm_insn (patterns[index][0], operands);
4610 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4611 snprintf (buf, sizeof (buf),
4612 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4613 output_asm_insn (buf, operands);
4614 output_asm_insn (patterns[index][1], operands);
4615 output_asm_insn ("br\t%3", operands);
4616 assemble_label (asm_out_file, label);
4617 return "";
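/* With illustrative register choices, the HImode variant of the table
   above expands to something like
       ldrh  w3, [x0, w1, uxtw #1]    // scaled load of the table entry
       adr   x4, .Lrtx<N>             // anchor label emitted just below
       add   x3, x4, w3, sxth #2      // entry * 4 added to the anchor
       br    x3
   .Lrtx<N>:                                                            */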
4621 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4622 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4623 operator. */
4626 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4628 if (shift >= 0 && shift <= 3)
4630 int size;
4631 for (size = 8; size <= 32; size *= 2)
4633 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4634 if (mask == bits << shift)
4635 return size;
4638 return 0;
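/* For example, aarch64_uxt_size with a shift of 1 and mask 0x1fe matches
   0xff << 1 and returns 8 (the operand can be written as "wN, uxtb #1" in
   an extended add/sub); any mask that is not a shifted 0xff/0xffff/
   0xffffffff pattern returns 0.  */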
4641 static bool
4642 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4643 const_rtx x ATTRIBUTE_UNUSED)
4645 /* We can't use blocks for constants when we're using a per-function
4646 constant pool. */
4647 return false;
4650 static section *
4651 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4652 rtx x ATTRIBUTE_UNUSED,
4653 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4655 /* Force all constant pool entries into the current function section. */
4656 return function_section (current_function_decl);
4660 /* Costs. */
4662 /* Helper function for rtx cost calculation. Strip a shift expression
4663 from X. Returns the inner operand if successful, or the original
4664 expression on failure. */
4665 static rtx
4666 aarch64_strip_shift (rtx x)
4668 rtx op = x;
4670 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4671 we can convert both to ROR during final output. */
4672 if ((GET_CODE (op) == ASHIFT
4673 || GET_CODE (op) == ASHIFTRT
4674 || GET_CODE (op) == LSHIFTRT
4675 || GET_CODE (op) == ROTATERT
4676 || GET_CODE (op) == ROTATE)
4677 && CONST_INT_P (XEXP (op, 1)))
4678 return XEXP (op, 0);
4680 if (GET_CODE (op) == MULT
4681 && CONST_INT_P (XEXP (op, 1))
4682 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4683 return XEXP (op, 0);
4685 return x;
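/* E.g. aarch64_strip_shift turns both (ashift (reg x0) (const_int 3)) and
   (mult (reg x0) (const_int 8)) into (reg x0); a shift by a register
   amount is returned unchanged.  */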
4688 /* Helper function for rtx cost calculation. Strip an extend
4689 expression from X. Returns the inner operand if successful, or the
4690 original expression on failure. We deal with a number of possible
4691 canonicalization variations here. */
4692 static rtx
4693 aarch64_strip_extend (rtx x)
4695 rtx op = x;
4697 /* Zero and sign extraction of a widened value. */
4698 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4699 && XEXP (op, 2) == const0_rtx
4700 && GET_CODE (XEXP (op, 0)) == MULT
4701 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4702 XEXP (op, 1)))
4703 return XEXP (XEXP (op, 0), 0);
4705 /* It can also be represented (for zero-extend) as an AND with an
4706 immediate. */
4707 if (GET_CODE (op) == AND
4708 && GET_CODE (XEXP (op, 0)) == MULT
4709 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4710 && CONST_INT_P (XEXP (op, 1))
4711 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4712 INTVAL (XEXP (op, 1))) != 0)
4713 return XEXP (XEXP (op, 0), 0);
4715 /* Now handle extended register, as this may also have an optional
4716 left shift by 1..4. */
4717 if (GET_CODE (op) == ASHIFT
4718 && CONST_INT_P (XEXP (op, 1))
4719 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4720 op = XEXP (op, 0);
4722 if (GET_CODE (op) == ZERO_EXTEND
4723 || GET_CODE (op) == SIGN_EXTEND)
4724 op = XEXP (op, 0);
4726 if (op != x)
4727 return op;
4729 return x;
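/* Likewise, aarch64_strip_extend reduces (zero_extend:DI (reg:SI w0)) to
   (reg:SI w0), as it does (ashift (sign_extend (reg)) (const_int 2)),
   since the optional left shift by 1..4 of an extended register is looked
   through as well.  */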
4732 /* Helper function for rtx cost calculation. Calculate the cost of
4733 a MULT, which may be part of a multiply-accumulate rtx. Return
4734 the calculated cost of the expression, recursing manually in to
4735 operands where needed. */
4737 static int
4738 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4740 rtx op0, op1;
4741 const struct cpu_cost_table *extra_cost
4742 = aarch64_tune_params->insn_extra_cost;
4743 int cost = 0;
4744 bool maybe_fma = (outer == PLUS || outer == MINUS);
4745 enum machine_mode mode = GET_MODE (x);
4747 gcc_checking_assert (code == MULT);
4749 op0 = XEXP (x, 0);
4750 op1 = XEXP (x, 1);
4752 if (VECTOR_MODE_P (mode))
4753 mode = GET_MODE_INNER (mode);
4755 /* Integer multiply/fma. */
4756 if (GET_MODE_CLASS (mode) == MODE_INT)
4758 /* The multiply will be canonicalized as a shift, so cost it as such. */
4759 if (CONST_INT_P (op1)
4760 && exact_log2 (INTVAL (op1)) > 0)
4762 if (speed)
4764 if (maybe_fma)
4765 /* ADD (shifted register). */
4766 cost += extra_cost->alu.arith_shift;
4767 else
4768 /* LSL (immediate). */
4769 cost += extra_cost->alu.shift;
4772 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4774 return cost;
4777 /* Integer multiplies or FMAs have zero/sign extending variants. */
4778 if ((GET_CODE (op0) == ZERO_EXTEND
4779 && GET_CODE (op1) == ZERO_EXTEND)
4780 || (GET_CODE (op0) == SIGN_EXTEND
4781 && GET_CODE (op1) == SIGN_EXTEND))
4783 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4784 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4786 if (speed)
4788 if (maybe_fma)
4789 /* MADD/SMADDL/UMADDL. */
4790 cost += extra_cost->mult[0].extend_add;
4791 else
4792 /* MUL/SMULL/UMULL. */
4793 cost += extra_cost->mult[0].extend;
4796 return cost;
4799 /* This is either an integer multiply or an FMA. In both cases
4800 we want to recurse and cost the operands. */
4801 cost += rtx_cost (op0, MULT, 0, speed)
4802 + rtx_cost (op1, MULT, 1, speed);
4804 if (speed)
4806 if (maybe_fma)
4807 /* MADD. */
4808 cost += extra_cost->mult[mode == DImode].add;
4809 else
4810 /* MUL. */
4811 cost += extra_cost->mult[mode == DImode].simple;
4814 return cost;
4816 else
4818 if (speed)
4820 /* Floating-point FMA/FMUL can also support negations of the
4821 operands. */
4822 if (GET_CODE (op0) == NEG)
4823 op0 = XEXP (op0, 0);
4824 if (GET_CODE (op1) == NEG)
4825 op1 = XEXP (op1, 0);
4827 if (maybe_fma)
4828 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4829 cost += extra_cost->fp[mode == DFmode].fma;
4830 else
4831 /* FMUL/FNMUL. */
4832 cost += extra_cost->fp[mode == DFmode].mult;
4835 cost += rtx_cost (op0, MULT, 0, speed)
4836 + rtx_cost (op1, MULT, 1, speed);
4837 return cost;
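/* For instance, (plus (mult (reg) (const_int 4)) (reg)) reaches
   aarch64_rtx_mult_cost with OUTER == PLUS; the power-of-two multiply is
   costed as an ADD (shifted register) rather than as a real multiply,
   matching the add-with-LSL form that will actually be emitted.  */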
4841 static int
4842 aarch64_address_cost (rtx x,
4843 enum machine_mode mode,
4844 addr_space_t as ATTRIBUTE_UNUSED,
4845 bool speed)
4847 enum rtx_code c = GET_CODE (x);
4848 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4849 struct aarch64_address_info info;
4850 int cost = 0;
4851 info.shift = 0;
4853 if (!aarch64_classify_address (&info, x, mode, c, false))
4855 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4857 /* This is a CONST or SYMBOL ref which will be split
4858 in a different way depending on the code model in use.
4859 Cost it through the generic infrastructure. */
4860 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4861 /* Divide through by the cost of one instruction to
4862 bring it to the same units as the address costs. */
4863 cost_symbol_ref /= COSTS_N_INSNS (1);
4864 /* The cost is then the cost of preparing the address,
4865 followed by an immediate (possibly 0) offset. */
4866 return cost_symbol_ref + addr_cost->imm_offset;
4868 else
4870 /* This is most likely a jump table from a case
4871 statement. */
4872 return addr_cost->register_offset;
4876 switch (info.type)
4878 case ADDRESS_LO_SUM:
4879 case ADDRESS_SYMBOLIC:
4880 case ADDRESS_REG_IMM:
4881 cost += addr_cost->imm_offset;
4882 break;
4884 case ADDRESS_REG_WB:
4885 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4886 cost += addr_cost->pre_modify;
4887 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4888 cost += addr_cost->post_modify;
4889 else
4890 gcc_unreachable ();
4892 break;
4894 case ADDRESS_REG_REG:
4895 cost += addr_cost->register_offset;
4896 break;
4898 case ADDRESS_REG_UXTW:
4899 case ADDRESS_REG_SXTW:
4900 cost += addr_cost->register_extend;
4901 break;
4903 default:
4904 gcc_unreachable ();
4908 if (info.shift > 0)
4910 /* For the sake of calculating the cost of the shifted register
4911 component, we can treat same sized modes in the same way. */
4912 switch (GET_MODE_BITSIZE (mode))
4914 case 16:
4915 cost += addr_cost->addr_scale_costs.hi;
4916 break;
4918 case 32:
4919 cost += addr_cost->addr_scale_costs.si;
4920 break;
4922 case 64:
4923 cost += addr_cost->addr_scale_costs.di;
4924 break;
4926 /* We can't tell, or this is a 128-bit vector. */
4927 default:
4928 cost += addr_cost->addr_scale_costs.ti;
4929 break;
4933 return cost;
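/* As a sketch: an SImode access through (plus (reg) (ashift (reg)
   (const_int 2))) classifies as ADDRESS_REG_REG with a shift of 2, so its
   cost is register_offset plus the SImode (.si) entry of addr_scale_costs
   from the tuning target.  */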
4936 /* Return true if the RTX X in mode MODE is a zero or sign extract
4937 usable in an ADD or SUB (extended register) instruction. */
4938 static bool
4939 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4941 /* Catch add with a sign extract.
4942 This is add_<optab><mode>_multp2. */
4943 if (GET_CODE (x) == SIGN_EXTRACT
4944 || GET_CODE (x) == ZERO_EXTRACT)
4946 rtx op0 = XEXP (x, 0);
4947 rtx op1 = XEXP (x, 1);
4948 rtx op2 = XEXP (x, 2);
4950 if (GET_CODE (op0) == MULT
4951 && CONST_INT_P (op1)
4952 && op2 == const0_rtx
4953 && CONST_INT_P (XEXP (op0, 1))
4954 && aarch64_is_extend_from_extract (mode,
4955 XEXP (op0, 1),
4956 op1))
4958 return true;
4962 return false;
4965 static bool
4966 aarch64_frint_unspec_p (unsigned int u)
4968 switch (u)
4970 case UNSPEC_FRINTZ:
4971 case UNSPEC_FRINTP:
4972 case UNSPEC_FRINTM:
4973 case UNSPEC_FRINTA:
4974 case UNSPEC_FRINTN:
4975 case UNSPEC_FRINTX:
4976 case UNSPEC_FRINTI:
4977 return true;
4979 default:
4980 return false;
4984 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4985 storing it in *COST. Result is true if the total cost of the operation
4986 has now been calculated. */
4987 static bool
4988 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4990 rtx inner;
4991 rtx comparator;
4992 enum rtx_code cmpcode;
4994 if (COMPARISON_P (op0))
4996 inner = XEXP (op0, 0);
4997 comparator = XEXP (op0, 1);
4998 cmpcode = GET_CODE (op0);
5000 else
5002 inner = op0;
5003 comparator = const0_rtx;
5004 cmpcode = NE;
5007 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5009 /* Conditional branch. */
5010 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5011 return true;
5012 else
5014 if (cmpcode == NE || cmpcode == EQ)
5016 if (comparator == const0_rtx)
5018 /* TBZ/TBNZ/CBZ/CBNZ. */
5019 if (GET_CODE (inner) == ZERO_EXTRACT)
5020 /* TBZ/TBNZ. */
5021 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5022 0, speed);
5023 else
5024 /* CBZ/CBNZ. */
5025 *cost += rtx_cost (inner, cmpcode, 0, speed);
5027 return true;
5030 else if (cmpcode == LT || cmpcode == GE)
5032 /* TBZ/TBNZ. */
5033 if (comparator == const0_rtx)
5034 return true;
5038 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5040 /* It's a conditional operation based on the status flags,
5041 so it must be some flavor of CSEL. */
5043 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5044 if (GET_CODE (op1) == NEG
5045 || GET_CODE (op1) == NOT
5046 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5047 op1 = XEXP (op1, 0);
5049 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5050 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5051 return true;
5054 /* We don't know what this is; cost all the operands. */
5055 return false;
5058 /* Calculate the cost of calculating X, storing it in *COST. Result
5059 is true if the total cost of the operation has now been calculated. */
5060 static bool
5061 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5062 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5064 rtx op0, op1, op2;
5065 const struct cpu_cost_table *extra_cost
5066 = aarch64_tune_params->insn_extra_cost;
5067 enum machine_mode mode = GET_MODE (x);
5069 /* By default, assume that everything has equivalent cost to the
5070 cheapest instruction. Any additional costs are applied as a delta
5071 above this default. */
5072 *cost = COSTS_N_INSNS (1);
5074 /* TODO: The cost infrastructure currently does not handle
5075 vector operations. Assume that all vector operations
5076 are equally expensive. */
5077 if (VECTOR_MODE_P (mode))
5079 if (speed)
5080 *cost += extra_cost->vect.alu;
5081 return true;
5084 switch (code)
5086 case SET:
5087 /* The cost depends entirely on the operands to SET. */
5088 *cost = 0;
5089 op0 = SET_DEST (x);
5090 op1 = SET_SRC (x);
5092 switch (GET_CODE (op0))
5094 case MEM:
5095 if (speed)
5097 rtx address = XEXP (op0, 0);
5098 if (GET_MODE_CLASS (mode) == MODE_INT)
5099 *cost += extra_cost->ldst.store;
5100 else if (mode == SFmode)
5101 *cost += extra_cost->ldst.storef;
5102 else if (mode == DFmode)
5103 *cost += extra_cost->ldst.stored;
5105 *cost +=
5106 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5107 0, speed));
5110 *cost += rtx_cost (op1, SET, 1, speed);
5111 return true;
5113 case SUBREG:
5114 if (! REG_P (SUBREG_REG (op0)))
5115 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5117 /* Fall through. */
5118 case REG:
5119 /* const0_rtx is in general free, but we will use an
5120 instruction to set a register to 0. */
5121 if (REG_P (op1) || op1 == const0_rtx)
5123 /* The cost is 1 per register copied. */
5124 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5125 / UNITS_PER_WORD;
5126 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5128 else
5129 /* Cost is just the cost of the RHS of the set. */
5130 *cost += rtx_cost (op1, SET, 1, speed);
5131 return true;
5133 case ZERO_EXTRACT:
5134 case SIGN_EXTRACT:
5135 /* Bit-field insertion. Strip any redundant widening of
5136 the RHS to meet the width of the target. */
5137 if (GET_CODE (op1) == SUBREG)
5138 op1 = SUBREG_REG (op1);
5139 if ((GET_CODE (op1) == ZERO_EXTEND
5140 || GET_CODE (op1) == SIGN_EXTEND)
5141 && CONST_INT_P (XEXP (op0, 1))
5142 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5143 >= INTVAL (XEXP (op0, 1))))
5144 op1 = XEXP (op1, 0);
5146 if (CONST_INT_P (op1))
5148 /* MOV immediate is assumed to always be cheap. */
5149 *cost = COSTS_N_INSNS (1);
5151 else
5153 /* BFM. */
5154 if (speed)
5155 *cost += extra_cost->alu.bfi;
5156 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5159 return true;
5161 default:
5162 /* We can't make sense of this; assume the default cost. */
5163 *cost = COSTS_N_INSNS (1);
5164 return false;
5166 return false;
5168 case CONST_INT:
5169 /* If an instruction can incorporate a constant within the
5170 instruction, the instruction's expression avoids calling
5171 rtx_cost() on the constant. If rtx_cost() is called on a
5172 constant, then it is usually because the constant must be
5173 moved into a register by one or more instructions.
5175 The exception is constant 0, which can be expressed
5176 as XZR/WZR and is therefore free. The exception to this is
5177 if we have (set (reg) (const0_rtx)) in which case we must cost
5178 the move. However, we can catch that when we cost the SET, so
5179 we don't need to consider that here. */
5180 if (x == const0_rtx)
5181 *cost = 0;
5182 else
5184 /* To an approximation, building any other constant is
5185 proportionally expensive to the number of instructions
5186 required to build that constant. This is true whether we
5187 are compiling for SPEED or otherwise. */
5188 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5189 INTVAL (x),
5190 false));
5192 return true;
5194 case CONST_DOUBLE:
5195 if (speed)
5197 /* mov[df,sf]_aarch64. */
5198 if (aarch64_float_const_representable_p (x))
5199 /* FMOV (scalar immediate). */
5200 *cost += extra_cost->fp[mode == DFmode].fpconst;
5201 else if (!aarch64_float_const_zero_rtx_p (x))
5203 /* This will be a load from memory. */
5204 if (mode == DFmode)
5205 *cost += extra_cost->ldst.loadd;
5206 else
5207 *cost += extra_cost->ldst.loadf;
5209 else
5210 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5211 or MOV v0.s[0], wzr - neither of which is modeled by the
5212 cost tables. Just use the default cost. */
5217 return true;
5219 case MEM:
5220 if (speed)
5222 /* For loads we want the base cost of a load, plus an
5223 approximation for the additional cost of the addressing
5224 mode. */
5225 rtx address = XEXP (x, 0);
5226 if (GET_MODE_CLASS (mode) == MODE_INT)
5227 *cost += extra_cost->ldst.load;
5228 else if (mode == SFmode)
5229 *cost += extra_cost->ldst.loadf;
5230 else if (mode == DFmode)
5231 *cost += extra_cost->ldst.loadd;
5233 *cost +=
5234 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5235 0, speed));
5238 return true;
5240 case NEG:
5241 op0 = XEXP (x, 0);
5243 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5245 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5246 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5248 /* CSETM. */
5249 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5250 return true;
5253 /* Cost this as SUB wzr, X. */
5254 op0 = CONST0_RTX (GET_MODE (x));
5255 op1 = XEXP (x, 0);
5256 goto cost_minus;
5259 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5261 /* Support (neg(fma...)) as a single instruction only if
5262 sign of zeros is unimportant. This matches the decision
5263 making in aarch64.md. */
5264 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5266 /* FNMADD. */
5267 *cost = rtx_cost (op0, NEG, 0, speed);
5268 return true;
5270 if (speed)
5271 /* FNEG. */
5272 *cost += extra_cost->fp[mode == DFmode].neg;
5273 return false;
5276 return false;
5278 case CLRSB:
5279 case CLZ:
5280 if (speed)
5281 *cost += extra_cost->alu.clz;
5283 return false;
5285 case COMPARE:
5286 op0 = XEXP (x, 0);
5287 op1 = XEXP (x, 1);
5289 if (op1 == const0_rtx
5290 && GET_CODE (op0) == AND)
5292 x = op0;
5293 goto cost_logic;
5296 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5298 /* TODO: A write to the CC flags possibly costs extra, this
5299 needs encoding in the cost tables. */
5301 /* CC_ZESWPmode supports zero extend for free. */
5302 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5303 op0 = XEXP (op0, 0);
5305 /* ANDS. */
5306 if (GET_CODE (op0) == AND)
5308 x = op0;
5309 goto cost_logic;
5312 if (GET_CODE (op0) == PLUS)
5314 /* ADDS (and CMN alias). */
5315 x = op0;
5316 goto cost_plus;
5319 if (GET_CODE (op0) == MINUS)
5321 /* SUBS. */
5322 x = op0;
5323 goto cost_minus;
5326 if (GET_CODE (op1) == NEG)
5328 /* CMN. */
5329 if (speed)
5330 *cost += extra_cost->alu.arith;
5332 *cost += rtx_cost (op0, COMPARE, 0, speed);
5333 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5334 return true;
5337 /* CMP.
5339 Compare can freely swap the order of operands, and
5340 canonicalization puts the more complex operation first.
5341 But the integer MINUS logic expects the shift/extend
5342 operation in op1. */
5343 if (! (REG_P (op0)
5344 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5346 op0 = XEXP (x, 1);
5347 op1 = XEXP (x, 0);
5349 goto cost_minus;
5352 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5354 /* FCMP. */
5355 if (speed)
5356 *cost += extra_cost->fp[mode == DFmode].compare;
5358 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5360 /* FCMP supports constant 0.0 for no extra cost. */
5361 return true;
5363 return false;
5366 return false;
5368 case MINUS:
5370 op0 = XEXP (x, 0);
5371 op1 = XEXP (x, 1);
5373 cost_minus:
5374 /* Detect valid immediates. */
5375 if ((GET_MODE_CLASS (mode) == MODE_INT
5376 || (GET_MODE_CLASS (mode) == MODE_CC
5377 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5378 && CONST_INT_P (op1)
5379 && aarch64_uimm12_shift (INTVAL (op1)))
5381 *cost += rtx_cost (op0, MINUS, 0, speed);
5383 if (speed)
5384 /* SUB(S) (immediate). */
5385 *cost += extra_cost->alu.arith;
5386 return true;
5390 /* Look for SUB (extended register). */
5391 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5393 if (speed)
5394 *cost += extra_cost->alu.arith_shift;
5396 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5397 (enum rtx_code) GET_CODE (op1),
5398 0, speed);
5399 return true;
5402 rtx new_op1 = aarch64_strip_extend (op1);
5404 /* Cost this as an FMA-alike operation. */
5405 if ((GET_CODE (new_op1) == MULT
5406 || GET_CODE (new_op1) == ASHIFT)
5407 && code != COMPARE)
5409 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5410 (enum rtx_code) code,
5411 speed);
5412 *cost += rtx_cost (op0, MINUS, 0, speed);
5413 return true;
5416 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5418 if (speed)
5420 if (GET_MODE_CLASS (mode) == MODE_INT)
5421 /* SUB(S). */
5422 *cost += extra_cost->alu.arith;
5423 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5424 /* FSUB. */
5425 *cost += extra_cost->fp[mode == DFmode].addsub;
5427 return true;
5430 case PLUS:
5432 rtx new_op0;
5434 op0 = XEXP (x, 0);
5435 op1 = XEXP (x, 1);
5437 cost_plus:
5438 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5439 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5441 /* CSINC. */
5442 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5443 *cost += rtx_cost (op1, PLUS, 1, speed);
5444 return true;
5447 if (GET_MODE_CLASS (mode) == MODE_INT
5448 && CONST_INT_P (op1)
5449 && aarch64_uimm12_shift (INTVAL (op1)))
5451 *cost += rtx_cost (op0, PLUS, 0, speed);
5453 if (speed)
5454 /* ADD (immediate). */
5455 *cost += extra_cost->alu.arith;
5456 return true;
5459 /* Look for ADD (extended register). */
5460 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5462 if (speed)
5463 *cost += extra_cost->alu.arith_shift;
5465 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5466 (enum rtx_code) GET_CODE (op0),
5467 0, speed);
5468 return true;
5471 /* Strip any extend, leave shifts behind as we will
5472 cost them through mult_cost. */
5473 new_op0 = aarch64_strip_extend (op0);
5475 if (GET_CODE (new_op0) == MULT
5476 || GET_CODE (new_op0) == ASHIFT)
5478 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5479 speed);
5480 *cost += rtx_cost (op1, PLUS, 1, speed);
5481 return true;
5484 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5485 + rtx_cost (op1, PLUS, 1, speed));
5487 if (speed)
5489 if (GET_MODE_CLASS (mode) == MODE_INT)
5490 /* ADD. */
5491 *cost += extra_cost->alu.arith;
5492 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5493 /* FADD. */
5494 *cost += extra_cost->fp[mode == DFmode].addsub;
5496 return true;
5499 case BSWAP:
5500 *cost = COSTS_N_INSNS (1);
5502 if (speed)
5503 *cost += extra_cost->alu.rev;
5505 return false;
5507 case IOR:
5508 if (aarch_rev16_p (x))
5510 *cost = COSTS_N_INSNS (1);
5512 if (speed)
5513 *cost += extra_cost->alu.rev;
5515 return true;
5517 /* Fall through. */
5518 case XOR:
5519 case AND:
5520 cost_logic:
5521 op0 = XEXP (x, 0);
5522 op1 = XEXP (x, 1);
5524 if (code == AND
5525 && GET_CODE (op0) == MULT
5526 && CONST_INT_P (XEXP (op0, 1))
5527 && CONST_INT_P (op1)
5528 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5529 INTVAL (op1)) != 0)
5531 /* This is a UBFM/SBFM. */
5532 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5533 if (speed)
5534 *cost += extra_cost->alu.bfx;
5535 return true;
5538 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5540 /* We possibly get the immediate for free; this is not
5541 modelled. */
5542 if (CONST_INT_P (op1)
5543 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5545 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5547 if (speed)
5548 *cost += extra_cost->alu.logical;
5550 return true;
5552 else
5554 rtx new_op0 = op0;
5556 /* Handle ORN, EON, or BIC. */
5557 if (GET_CODE (op0) == NOT)
5558 op0 = XEXP (op0, 0);
5560 new_op0 = aarch64_strip_shift (op0);
5562 /* If we had a shift on op0 then this is a logical-shift-
5563 by-register/immediate operation. Otherwise, this is just
5564 a logical operation. */
5565 if (speed)
5567 if (new_op0 != op0)
5569 /* Shift by immediate. */
5570 if (CONST_INT_P (XEXP (op0, 1)))
5571 *cost += extra_cost->alu.log_shift;
5572 else
5573 *cost += extra_cost->alu.log_shift_reg;
5575 else
5576 *cost += extra_cost->alu.logical;
5579 /* In both cases we want to cost both operands. */
5580 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5581 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5583 return true;
5586 return false;
5588 case NOT:
5589 /* MVN. */
5590 if (speed)
5591 *cost += extra_cost->alu.logical;
5593 /* The logical instruction could have the shifted register form,
5594 but the cost is the same if the shift is processed as a separate
5595 instruction, so we don't bother with it here. */
5596 return false;
5598 case ZERO_EXTEND:
5600 op0 = XEXP (x, 0);
5601 /* If a value is written in SI mode, then zero extended to DI
5602 mode, the operation will in general be free as a write to
5603 a 'w' register implicitly zeroes the upper bits of an 'x'
5604 register. However, if this is
5606 (set (reg) (zero_extend (reg)))
5608 we must cost the explicit register move. */
5609 if (mode == DImode
5610 && GET_MODE (op0) == SImode
5611 && outer == SET)
5613 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5615 if (!op_cost && speed)
5616 /* MOV. */
5617 *cost += extra_cost->alu.extend;
5618 else
5619 /* Free, the cost is that of the SI mode operation. */
5620 *cost = op_cost;
5622 return true;
5624 else if (MEM_P (XEXP (x, 0)))
5626 /* All loads can zero extend to any size for free. */
5627 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5628 return true;
5631 /* UXTB/UXTH. */
5632 if (speed)
5633 *cost += extra_cost->alu.extend;
5635 return false;
5637 case SIGN_EXTEND:
5638 if (MEM_P (XEXP (x, 0)))
5640 /* LDRSH. */
5641 if (speed)
5643 rtx address = XEXP (XEXP (x, 0), 0);
5644 *cost += extra_cost->ldst.load_sign_extend;
5646 *cost +=
5647 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5648 0, speed));
5650 return true;
5653 if (speed)
5654 *cost += extra_cost->alu.extend;
5655 return false;
5657 case ASHIFT:
5658 op0 = XEXP (x, 0);
5659 op1 = XEXP (x, 1);
5661 if (CONST_INT_P (op1))
5663 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5664 aliases. */
5665 if (speed)
5666 *cost += extra_cost->alu.shift;
5668 /* We can incorporate zero/sign extend for free. */
5669 if (GET_CODE (op0) == ZERO_EXTEND
5670 || GET_CODE (op0) == SIGN_EXTEND)
5671 op0 = XEXP (op0, 0);
5673 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5674 return true;
5676 else
5678 /* LSLV. */
5679 if (speed)
5680 *cost += extra_cost->alu.shift_reg;
5682 return false; /* All arguments need to be in registers. */
5685 case ROTATE:
5686 case ROTATERT:
5687 case LSHIFTRT:
5688 case ASHIFTRT:
5689 op0 = XEXP (x, 0);
5690 op1 = XEXP (x, 1);
5692 if (CONST_INT_P (op1))
5694 /* ASR (immediate) and friends. */
5695 if (speed)
5696 *cost += extra_cost->alu.shift;
5698 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5699 return true;
5701 else
5704 /* ASR (register) and friends. */
5705 if (speed)
5706 *cost += extra_cost->alu.shift_reg;
5708 return false; /* All arguments need to be in registers. */
5711 case SYMBOL_REF:
5713 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5715 /* LDR. */
5716 if (speed)
5717 *cost += extra_cost->ldst.load;
5719 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5720 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5722 /* ADRP, followed by ADD. */
5723 *cost += COSTS_N_INSNS (1);
5724 if (speed)
5725 *cost += 2 * extra_cost->alu.arith;
5727 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5728 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5730 /* ADR. */
5731 if (speed)
5732 *cost += extra_cost->alu.arith;
5735 if (flag_pic)
5737 /* One extra load instruction, after accessing the GOT. */
5738 *cost += COSTS_N_INSNS (1);
5739 if (speed)
5740 *cost += extra_cost->ldst.load;
5742 return true;
5744 case HIGH:
5745 case LO_SUM:
5746 /* ADRP/ADD (immediate). */
5747 if (speed)
5748 *cost += extra_cost->alu.arith;
5749 return true;
5751 case ZERO_EXTRACT:
5752 case SIGN_EXTRACT:
5753 /* UBFX/SBFX. */
5754 if (speed)
5755 *cost += extra_cost->alu.bfx;
5757 /* We can trust that the immediates used will be correct (there
5758 are no by-register forms), so we need only cost op0. */
5759 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5760 return true;
5762 case MULT:
5763 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5764 /* aarch64_rtx_mult_cost always handles recursion to its
5765 operands. */
5766 return true;
5768 case MOD:
5769 case UMOD:
5770 if (speed)
5772 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5773 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5774 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5775 else if (GET_MODE (x) == DFmode)
5776 *cost += (extra_cost->fp[1].mult
5777 + extra_cost->fp[1].div);
5778 else if (GET_MODE (x) == SFmode)
5779 *cost += (extra_cost->fp[0].mult
5780 + extra_cost->fp[0].div);
5782 return false; /* All arguments need to be in registers. */
5784 case DIV:
5785 case UDIV:
5786 case SQRT:
5787 if (speed)
5789 if (GET_MODE_CLASS (mode) == MODE_INT)
5790 /* There is no integer SQRT, so only DIV and UDIV can get
5791 here. */
5792 *cost += extra_cost->mult[mode == DImode].idiv;
5793 else
5794 *cost += extra_cost->fp[mode == DFmode].div;
5796 return false; /* All arguments need to be in registers. */
5798 case IF_THEN_ELSE:
5799 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5800 XEXP (x, 2), cost, speed);
5802 case EQ:
5803 case NE:
5804 case GT:
5805 case GTU:
5806 case LT:
5807 case LTU:
5808 case GE:
5809 case GEU:
5810 case LE:
5811 case LEU:
5813 return false; /* All arguments must be in registers. */
5815 case FMA:
5816 op0 = XEXP (x, 0);
5817 op1 = XEXP (x, 1);
5818 op2 = XEXP (x, 2);
5820 if (speed)
5821 *cost += extra_cost->fp[mode == DFmode].fma;
5823 /* FMSUB, FNMADD, and FNMSUB are free. */
5824 if (GET_CODE (op0) == NEG)
5825 op0 = XEXP (op0, 0);
5827 if (GET_CODE (op2) == NEG)
5828 op2 = XEXP (op2, 0);
5830 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5831 and the by-element operand as operand 0. */
5832 if (GET_CODE (op1) == NEG)
5833 op1 = XEXP (op1, 0);
5835 /* Catch vector-by-element operations. The by-element operand can
5836 either be (vec_duplicate (vec_select (x))) or just
5837 (vec_select (x)), depending on whether we are multiplying by
5838 a vector or a scalar.
5840 Canonicalization is not very good in these cases: FMA4 will put the
5841 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
5842 if (GET_CODE (op0) == VEC_DUPLICATE)
5843 op0 = XEXP (op0, 0);
5844 else if (GET_CODE (op1) == VEC_DUPLICATE)
5845 op1 = XEXP (op1, 0);
5847 if (GET_CODE (op0) == VEC_SELECT)
5848 op0 = XEXP (op0, 0);
5849 else if (GET_CODE (op1) == VEC_SELECT)
5850 op1 = XEXP (op1, 0);
5852 /* If the remaining parameters are not registers,
5853 get the cost to put them into registers. */
5854 *cost += rtx_cost (op0, FMA, 0, speed);
5855 *cost += rtx_cost (op1, FMA, 1, speed);
5856 *cost += rtx_cost (op2, FMA, 2, speed);
5857 return true;
5859 case FLOAT_EXTEND:
5860 if (speed)
5861 *cost += extra_cost->fp[mode == DFmode].widen;
5862 return false;
5864 case FLOAT_TRUNCATE:
5865 if (speed)
5866 *cost += extra_cost->fp[mode == DFmode].narrow;
5867 return false;
5869 case FIX:
5870 case UNSIGNED_FIX:
5871 x = XEXP (x, 0);
5872 /* Strip the rounding part. They will all be implemented
5873 by the fcvt* family of instructions anyway. */
5874 if (GET_CODE (x) == UNSPEC)
5876 unsigned int uns_code = XINT (x, 1);
5878 if (uns_code == UNSPEC_FRINTA
5879 || uns_code == UNSPEC_FRINTM
5880 || uns_code == UNSPEC_FRINTN
5881 || uns_code == UNSPEC_FRINTP
5882 || uns_code == UNSPEC_FRINTZ)
5883 x = XVECEXP (x, 0, 0);
5886 if (speed)
5887 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5889 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5890 return true;
5892 case ABS:
5893 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5895 /* FABS and FNEG are analogous. */
5896 if (speed)
5897 *cost += extra_cost->fp[mode == DFmode].neg;
5899 else
5901 /* Integer ABS will either be split into
5902 two arithmetic instructions, or will be an ABS
5903 (scalar), which we don't model. */
5904 *cost = COSTS_N_INSNS (2);
5905 if (speed)
5906 *cost += 2 * extra_cost->alu.arith;
5908 return false;
5910 case SMAX:
5911 case SMIN:
5912 if (speed)
5914 /* FMAXNM/FMINNM/FMAX/FMIN.
5915 TODO: This may not be accurate for all implementations, but
5916 we do not model this in the cost tables. */
5917 *cost += extra_cost->fp[mode == DFmode].addsub;
5919 return false;
5921 case UNSPEC:
5922 /* The floating point round to integer frint* instructions. */
5923 if (aarch64_frint_unspec_p (XINT (x, 1)))
5925 if (speed)
5926 *cost += extra_cost->fp[mode == DFmode].roundint;
5928 return false;
5931 if (XINT (x, 1) == UNSPEC_RBIT)
5933 if (speed)
5934 *cost += extra_cost->alu.rev;
5936 return false;
5938 break;
5940 case TRUNCATE:
5942 /* Decompose <su>muldi3_highpart. */
5943 if (/* (truncate:DI */
5944 mode == DImode
5945 /* (lshiftrt:TI */
5946 && GET_MODE (XEXP (x, 0)) == TImode
5947 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5948 /* (mult:TI */
5949 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5950 /* (ANY_EXTEND:TI (reg:DI))
5951 (ANY_EXTEND:TI (reg:DI))) */
5952 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5953 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5954 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5955 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5956 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5957 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5958 /* (const_int 64) */
5959 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5960 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5962 /* UMULH/SMULH. */
5963 if (speed)
5964 *cost += extra_cost->mult[mode == DImode].extend;
5965 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5966 MULT, 0, speed);
5967 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5968 MULT, 1, speed);
5969 return true;
5972 /* Fall through. */
5973 default:
5974 break;
5977 if (dump_file && (dump_flags & TDF_DETAILS))
5978 fprintf (dump_file,
5979 "\nFailed to cost RTX. Assuming default cost.\n");
5981 return true;
5984 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
5985 calculated for X. This cost is stored in *COST. Returns true
5986 if the total cost of X was calculated. */
5987 static bool
5988 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5989 int param, int *cost, bool speed)
5991 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5993 if (dump_file && (dump_flags & TDF_DETAILS))
5995 print_rtl_single (dump_file, x);
5996 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5997 speed ? "Hot" : "Cold",
5998 *cost, result ? "final" : "partial");
6001 return result;
6004 static int
6005 aarch64_register_move_cost (enum machine_mode mode,
6006 reg_class_t from_i, reg_class_t to_i)
6008 enum reg_class from = (enum reg_class) from_i;
6009 enum reg_class to = (enum reg_class) to_i;
6010 const struct cpu_regmove_cost *regmove_cost
6011 = aarch64_tune_params->regmove_cost;
6013 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6014 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6015 to = GENERAL_REGS;
6017 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6018 from = GENERAL_REGS;
6020 /* Moving between a GPR and the stack register costs the same as GP2GP. */
6021 if ((from == GENERAL_REGS && to == STACK_REG)
6022 || (to == GENERAL_REGS && from == STACK_REG))
6023 return regmove_cost->GP2GP;
6025 /* To/From the stack register, we move via the gprs. */
6026 if (to == STACK_REG || from == STACK_REG)
6027 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6028 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6030 if (GET_MODE_SIZE (mode) == 16)
6032 /* 128-bit operations on general registers require 2 instructions. */
6033 if (from == GENERAL_REGS && to == GENERAL_REGS)
6034 return regmove_cost->GP2GP * 2;
6035 else if (from == GENERAL_REGS)
6036 return regmove_cost->GP2FP * 2;
6037 else if (to == GENERAL_REGS)
6038 return regmove_cost->FP2GP * 2;
6040 /* When AdvSIMD instructions are disabled it is not possible to move
6041 a 128-bit value directly between Q registers. This is handled in
6042 secondary reload. A general register is used as a scratch to move
6043 the upper DI value and the lower DI value is moved directly,
6044 hence the cost is the sum of three moves. */
6045 if (! TARGET_SIMD)
6046 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6048 return regmove_cost->FP2FP;
6051 if (from == GENERAL_REGS && to == GENERAL_REGS)
6052 return regmove_cost->GP2GP;
6053 else if (from == GENERAL_REGS)
6054 return regmove_cost->GP2FP;
6055 else if (to == GENERAL_REGS)
6056 return regmove_cost->FP2GP;
6058 return regmove_cost->FP2FP;
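/* For example, moving a TImode value from GENERAL_REGS to FP_REGS is
   costed as two GP->FP transfers (one per 64-bit half), and a Q-register
   to Q-register copy without TARGET_SIMD is costed as FP->GP + GP->FP +
   FP->FP, mirroring the secondary-reload sequence described above.  */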
6061 static int
6062 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
6063 reg_class_t rclass ATTRIBUTE_UNUSED,
6064 bool in ATTRIBUTE_UNUSED)
6066 return aarch64_tune_params->memmov_cost;
6069 /* Return the number of instructions that can be issued per cycle. */
6070 static int
6071 aarch64_sched_issue_rate (void)
6073 return aarch64_tune_params->issue_rate;
6076 /* Vectorizer cost model target hooks. */
6078 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6079 static int
6080 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6081 tree vectype,
6082 int misalign ATTRIBUTE_UNUSED)
6084 unsigned elements;
6086 switch (type_of_cost)
6088 case scalar_stmt:
6089 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6091 case scalar_load:
6092 return aarch64_tune_params->vec_costs->scalar_load_cost;
6094 case scalar_store:
6095 return aarch64_tune_params->vec_costs->scalar_store_cost;
6097 case vector_stmt:
6098 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6100 case vector_load:
6101 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6103 case vector_store:
6104 return aarch64_tune_params->vec_costs->vec_store_cost;
6106 case vec_to_scalar:
6107 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6109 case scalar_to_vec:
6110 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6112 case unaligned_load:
6113 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6115 case unaligned_store:
6116 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6118 case cond_branch_taken:
6119 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6121 case cond_branch_not_taken:
6122 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6124 case vec_perm:
6125 case vec_promote_demote:
6126 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6128 case vec_construct:
6129 elements = TYPE_VECTOR_SUBPARTS (vectype);
6130 return elements / 2 + 1;
6132 default:
6133 gcc_unreachable ();
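/* For example, a vec_construct of a V4SF vector (four subparts) is costed
   as 4 / 2 + 1 == 3 regardless of the tuning target; every other entry
   comes straight from the per-CPU vector cost table.  */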
6137 /* Implement targetm.vectorize.add_stmt_cost. */
6138 static unsigned
6139 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6140 struct _stmt_vec_info *stmt_info, int misalign,
6141 enum vect_cost_model_location where)
6143 unsigned *cost = (unsigned *) data;
6144 unsigned retval = 0;
6146 if (flag_vect_cost_model)
6148 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6149 int stmt_cost =
6150 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6152 /* Statements in an inner loop relative to the loop being
6153 vectorized are weighted more heavily. The value here is
6154 a function (linear for now) of the loop nest level. */
6155 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6157 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6158 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6159 unsigned nest_level = loop_depth (loop);
6161 count *= nest_level;
6164 retval = (unsigned) (count * stmt_cost);
6165 cost[where] += retval;
6168 return retval;
6171 static void initialize_aarch64_code_model (void);
6173 /* Parse the architecture extension string. */
6175 static void
6176 aarch64_parse_extension (char *str)
6178 /* The extension string is parsed left to right. */
6179 const struct aarch64_option_extension *opt = NULL;
6181 /* Flag to say whether we are adding or removing an extension. */
6182 int adding_ext = -1;
6184 while (str != NULL && *str != 0)
6186 char *ext;
6187 size_t len;
6189 str++;
6190 ext = strchr (str, '+');
6192 if (ext != NULL)
6193 len = ext - str;
6194 else
6195 len = strlen (str);
6197 if (len >= 2 && strncmp (str, "no", 2) == 0)
6199 adding_ext = 0;
6200 len -= 2;
6201 str += 2;
6203 else if (len > 0)
6204 adding_ext = 1;
6206 if (len == 0)
6208 error ("missing feature modifier after %qs", "+no");
6209 return;
6212 /* Scan over the extensions table trying to find an exact match. */
6213 for (opt = all_extensions; opt->name != NULL; opt++)
6215 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6217 /* Add or remove the extension. */
6218 if (adding_ext)
6219 aarch64_isa_flags |= opt->flags_on;
6220 else
6221 aarch64_isa_flags &= ~(opt->flags_off);
6222 break;
6226 if (opt->name == NULL)
6228 /* Extension not found in list. */
6229 error ("unknown feature modifier %qs", str);
6230 return;
6233 str = ext;
6236 return;
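/* For example, "-march=armv8-a+crypto" hands "+crypto" to
   aarch64_parse_extension, which ORs that extension's flags_on bits into
   aarch64_isa_flags; a "+nocrypto" modifier would instead clear the
   extension's flags_off bits.  */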
6239 /* Parse the ARCH string. */
6241 static void
6242 aarch64_parse_arch (void)
6244 char *ext;
6245 const struct processor *arch;
6246 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6247 size_t len;
6249 strcpy (str, aarch64_arch_string);
6251 ext = strchr (str, '+');
6253 if (ext != NULL)
6254 len = ext - str;
6255 else
6256 len = strlen (str);
6258 if (len == 0)
6260 error ("missing arch name in -march=%qs", str);
6261 return;
6264 /* Loop through the list of supported ARCHs to find a match. */
6265 for (arch = all_architectures; arch->name != NULL; arch++)
6267 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6269 selected_arch = arch;
6270 aarch64_isa_flags = selected_arch->flags;
6272 if (!selected_cpu)
6273 selected_cpu = &all_cores[selected_arch->core];
6275 if (ext != NULL)
6277 /* ARCH string contains at least one extension. */
6278 aarch64_parse_extension (ext);
6281 if (strcmp (selected_arch->arch, selected_cpu->arch))
6283 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6284 selected_cpu->name, selected_arch->name);
6287 return;
6291 /* ARCH name not found in list. */
6292 error ("unknown value %qs for -march", str);
6293 return;
6296 /* Parse the CPU string. */
6298 static void
6299 aarch64_parse_cpu (void)
6301 char *ext;
6302 const struct processor *cpu;
6303 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6304 size_t len;
6306 strcpy (str, aarch64_cpu_string);
6308 ext = strchr (str, '+');
6310 if (ext != NULL)
6311 len = ext - str;
6312 else
6313 len = strlen (str);
6315 if (len == 0)
6317 error ("missing cpu name in -mcpu=%qs", str);
6318 return;
6321 /* Loop through the list of supported CPUs to find a match. */
6322 for (cpu = all_cores; cpu->name != NULL; cpu++)
6324 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6326 selected_cpu = cpu;
6327 selected_tune = cpu;
6328 aarch64_isa_flags = selected_cpu->flags;
6330 if (ext != NULL)
6332 /* CPU string contains at least one extension. */
6333 aarch64_parse_extension (ext);
6336 return;
6340 /* CPU name not found in list. */
6341 error ("unknown value %qs for -mcpu", str);
6342 return;
6345 /* Parse the TUNE string. */
6347 static void
6348 aarch64_parse_tune (void)
6350 const struct processor *cpu;
6351 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6352 strcpy (str, aarch64_tune_string);
6354 /* Loop through the list of supported CPUs to find a match. */
6355 for (cpu = all_cores; cpu->name != NULL; cpu++)
6357 if (strcmp (cpu->name, str) == 0)
6359 selected_tune = cpu;
6360 return;
6364 /* CPU name not found in list. */
6365 error ("unknown value %qs for -mtune", str);
6366 return;
6370 /* Implement TARGET_OPTION_OVERRIDE. */
6372 static void
6373 aarch64_override_options (void)
6375 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6376 If either of -march or -mtune is given, they override their
6377 respective component of -mcpu.
6379 So, first parse AARCH64_CPU_STRING, then the others; be careful
6380 with -march, because if -mcpu is not present on the command line,
6381 -march must also set a sensible default CPU. */
6382 if (aarch64_cpu_string)
6384 aarch64_parse_cpu ();
6387 if (aarch64_arch_string)
6389 aarch64_parse_arch ();
6392 if (aarch64_tune_string)
6394 aarch64_parse_tune ();
6397 #ifndef HAVE_AS_MABI_OPTION
6398 /* The compiler may have been configured with 2.23.* binutils, which does
6399 not have support for ILP32. */
6400 if (TARGET_ILP32)
6401 error ("Assembler does not support -mabi=ilp32");
6402 #endif
6404 initialize_aarch64_code_model ();
6406 aarch64_build_bitmask_table ();
6408 /* This target defaults to strict volatile bitfields. */
6409 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6410 flag_strict_volatile_bitfields = 1;
6412 /* If the user did not specify a processor, choose the default
6413 one for them. This will be the CPU set during configuration using
6414 --with-cpu, otherwise it is "generic". */
6415 if (!selected_cpu)
6417 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6418 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6421 gcc_assert (selected_cpu);
6423 /* The selected cpu may be an architecture, so look up tuning by core ID. */
6424 if (!selected_tune)
6425 selected_tune = &all_cores[selected_cpu->core];
6427 aarch64_tune_flags = selected_tune->flags;
6428 aarch64_tune = selected_tune->core;
6429 aarch64_tune_params = selected_tune->tune;
6431 if (aarch64_fix_a53_err835769 == 2)
6433 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6434 aarch64_fix_a53_err835769 = 1;
6435 #else
6436 aarch64_fix_a53_err835769 = 0;
6437 #endif
6440 aarch64_override_options_after_change ();
6443 /* Implement targetm.override_options_after_change. */
6445 static void
6446 aarch64_override_options_after_change (void)
6448 if (flag_omit_frame_pointer)
6449 flag_omit_leaf_frame_pointer = false;
6450 else if (flag_omit_leaf_frame_pointer)
6451 flag_omit_frame_pointer = true;
6454 static struct machine_function *
6455 aarch64_init_machine_status (void)
6457 struct machine_function *machine;
6458 machine = ggc_cleared_alloc<machine_function> ();
6459 return machine;
6462 void
6463 aarch64_init_expanders (void)
6465 init_machine_status = aarch64_init_machine_status;
6468 /* Select the code model to use, checking that the requested combination of code model and PIC options is supported. */
6469 static void
6470 initialize_aarch64_code_model (void)
6472 if (flag_pic)
6474 switch (aarch64_cmodel_var)
6476 case AARCH64_CMODEL_TINY:
6477 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6478 break;
6479 case AARCH64_CMODEL_SMALL:
6480 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6481 break;
6482 case AARCH64_CMODEL_LARGE:
6483 sorry ("code model %qs with -f%s", "large",
6484 flag_pic > 1 ? "PIC" : "pic");
6485 default:
6486 gcc_unreachable ();
6489 else
6490 aarch64_cmodel = aarch64_cmodel_var;
6493 /* Return true if SYMBOL_REF X binds locally. */
6495 static bool
6496 aarch64_symbol_binds_local_p (const_rtx x)
6498 return (SYMBOL_REF_DECL (x)
6499 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6500 : SYMBOL_REF_LOCAL_P (x));
6503 /* Return true if SYMBOL_REF X is thread-local. */
6504 static bool
6505 aarch64_tls_symbol_p (rtx x)
6507 if (! TARGET_HAVE_TLS)
6508 return false;
6510 if (GET_CODE (x) != SYMBOL_REF)
6511 return false;
6513 return SYMBOL_REF_TLS_MODEL (x) != 0;
6516 /* Classify a TLS symbol into one of the TLS kinds. */
6517 enum aarch64_symbol_type
6518 aarch64_classify_tls_symbol (rtx x)
6520 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6522 switch (tls_kind)
6524 case TLS_MODEL_GLOBAL_DYNAMIC:
6525 case TLS_MODEL_LOCAL_DYNAMIC:
6526 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6528 case TLS_MODEL_INITIAL_EXEC:
6529 return SYMBOL_SMALL_GOTTPREL;
6531 case TLS_MODEL_LOCAL_EXEC:
6532 return SYMBOL_SMALL_TPREL;
6534 case TLS_MODEL_EMULATED:
6535 case TLS_MODEL_NONE:
6536 return SYMBOL_FORCE_TO_MEM;
6538 default:
6539 gcc_unreachable ();
6543 /* Return the method that should be used to access SYMBOL_REF or
6544 LABEL_REF X in context CONTEXT. */
6546 enum aarch64_symbol_type
6547 aarch64_classify_symbol (rtx x,
6548 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6550 if (GET_CODE (x) == LABEL_REF)
6552 switch (aarch64_cmodel)
6554 case AARCH64_CMODEL_LARGE:
6555 return SYMBOL_FORCE_TO_MEM;
6557 case AARCH64_CMODEL_TINY_PIC:
6558 case AARCH64_CMODEL_TINY:
6559 return SYMBOL_TINY_ABSOLUTE;
6561 case AARCH64_CMODEL_SMALL_PIC:
6562 case AARCH64_CMODEL_SMALL:
6563 return SYMBOL_SMALL_ABSOLUTE;
6565 default:
6566 gcc_unreachable ();
6570 if (GET_CODE (x) == SYMBOL_REF)
6572 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6573 return SYMBOL_FORCE_TO_MEM;
6575 if (aarch64_tls_symbol_p (x))
6576 return aarch64_classify_tls_symbol (x);
6578 switch (aarch64_cmodel)
6580 case AARCH64_CMODEL_TINY:
6581 if (SYMBOL_REF_WEAK (x))
6582 return SYMBOL_FORCE_TO_MEM;
6583 return SYMBOL_TINY_ABSOLUTE;
6585 case AARCH64_CMODEL_SMALL:
6586 if (SYMBOL_REF_WEAK (x))
6587 return SYMBOL_FORCE_TO_MEM;
6588 return SYMBOL_SMALL_ABSOLUTE;
6590 case AARCH64_CMODEL_TINY_PIC:
6591 if (!aarch64_symbol_binds_local_p (x))
6592 return SYMBOL_TINY_GOT;
6593 return SYMBOL_TINY_ABSOLUTE;
6595 case AARCH64_CMODEL_SMALL_PIC:
6596 if (!aarch64_symbol_binds_local_p (x))
6597 return SYMBOL_SMALL_GOT;
6598 return SYMBOL_SMALL_ABSOLUTE;
6600 default:
6601 gcc_unreachable ();
6605 /* By default push everything into the constant pool. */
6606 return SYMBOL_FORCE_TO_MEM;
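/* Informal sketch of what the classifications above usually expand to
   (illustrative, not a definitive mapping): SYMBOL_TINY_ABSOLUTE uses a
   single ADR, SYMBOL_SMALL_ABSOLUTE an ADRP/ADD pair, SYMBOL_SMALL_GOT an
   ADRP plus a load through the GOT, and SYMBOL_FORCE_TO_MEM places the
   address in the literal pool and loads it pc-relatively.  */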
6609 bool
6610 aarch64_constant_address_p (rtx x)
6612 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6615 bool
6616 aarch64_legitimate_pic_operand_p (rtx x)
6618 if (GET_CODE (x) == SYMBOL_REF
6619 || (GET_CODE (x) == CONST
6620 && GET_CODE (XEXP (x, 0)) == PLUS
6621 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6622 return false;
6624 return true;
6627 /* Return true if X holds a floating-point constant that is either
6628 +0.0 or representable as a quarter-precision immediate. */
6629 static bool
6630 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6632 if (!CONST_DOUBLE_P (x))
6633 return false;
6635 /* TODO: We could handle moving 0.0 to a TFmode register,
6636 but first we would like to refactor the movtf_aarch64
6637 pattern to be more amenable to splitting moves properly
6638 and to gate correctly on TARGET_SIMD.  For now, reject all
6639 constants that are not destined for SFmode or DFmode registers. */
6640 if (!(mode == SFmode || mode == DFmode))
6641 return false;
6643 if (aarch64_float_const_zero_rtx_p (x))
6644 return true;
6645 return aarch64_float_const_representable_p (x);
6648 static bool
6649 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6651 /* Do not allow vector struct mode constants. We could support
6652 0 and -1 easily, but they need support in aarch64-simd.md. */
6653 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6654 return false;
6656 /* This could probably go away because
6657 we now decompose CONST_INTs according to expand_mov_immediate. */
6658 if ((GET_CODE (x) == CONST_VECTOR
6659 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6660 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6661 return !targetm.cannot_force_const_mem (mode, x);
6663 if (GET_CODE (x) == HIGH
6664 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6665 return true;
6667 return aarch64_constant_address_p (x);
6671 aarch64_load_tp (rtx target)
6673 if (!target
6674 || GET_MODE (target) != Pmode
6675 || !register_operand (target, Pmode))
6676 target = gen_reg_rtx (Pmode);
6678 /* Can return in any reg. */
6679 emit_insn (gen_aarch64_load_tp_hard (target));
6680 return target;
6683 /* On AAPCS systems, this is the "struct __va_list". */
6684 static GTY(()) tree va_list_type;
6686 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6687 Return the type to use as __builtin_va_list.
6689 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6691 struct __va_list
6693 void *__stack;
6694 void *__gr_top;
6695 void *__vr_top;
6696 int __gr_offs;
6697 int __vr_offs;
6698 }; */
6700 static tree
6701 aarch64_build_builtin_va_list (void)
6703 tree va_list_name;
6704 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6706 /* Create the type. */
6707 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6708 /* Give it the required name. */
6709 va_list_name = build_decl (BUILTINS_LOCATION,
6710 TYPE_DECL,
6711 get_identifier ("__va_list"),
6712 va_list_type);
6713 DECL_ARTIFICIAL (va_list_name) = 1;
6714 TYPE_NAME (va_list_type) = va_list_name;
6715 TYPE_STUB_DECL (va_list_type) = va_list_name;
6717 /* Create the fields. */
6718 f_stack = build_decl (BUILTINS_LOCATION,
6719 FIELD_DECL, get_identifier ("__stack"),
6720 ptr_type_node);
6721 f_grtop = build_decl (BUILTINS_LOCATION,
6722 FIELD_DECL, get_identifier ("__gr_top"),
6723 ptr_type_node);
6724 f_vrtop = build_decl (BUILTINS_LOCATION,
6725 FIELD_DECL, get_identifier ("__vr_top"),
6726 ptr_type_node);
6727 f_groff = build_decl (BUILTINS_LOCATION,
6728 FIELD_DECL, get_identifier ("__gr_offs"),
6729 integer_type_node);
6730 f_vroff = build_decl (BUILTINS_LOCATION,
6731 FIELD_DECL, get_identifier ("__vr_offs"),
6732 integer_type_node);
6734 DECL_ARTIFICIAL (f_stack) = 1;
6735 DECL_ARTIFICIAL (f_grtop) = 1;
6736 DECL_ARTIFICIAL (f_vrtop) = 1;
6737 DECL_ARTIFICIAL (f_groff) = 1;
6738 DECL_ARTIFICIAL (f_vroff) = 1;
6740 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6741 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6742 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6743 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6744 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6746 TYPE_FIELDS (va_list_type) = f_stack;
6747 DECL_CHAIN (f_stack) = f_grtop;
6748 DECL_CHAIN (f_grtop) = f_vrtop;
6749 DECL_CHAIN (f_vrtop) = f_groff;
6750 DECL_CHAIN (f_groff) = f_vroff;
6752 /* Compute its layout. */
6753 layout_type (va_list_type);
6755 return va_list_type;
6758 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6759 static void
6760 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6762 const CUMULATIVE_ARGS *cum;
6763 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6764 tree stack, grtop, vrtop, groff, vroff;
6765 tree t;
6766 int gr_save_area_size;
6767 int vr_save_area_size;
6768 int vr_offset;
6770 cum = &crtl->args.info;
6771 gr_save_area_size
6772 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6773 vr_save_area_size
6774 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6776 if (TARGET_GENERAL_REGS_ONLY)
6778 if (cum->aapcs_nvrn > 0)
6779 sorry ("%qs and floating point or vector arguments",
6780 "-mgeneral-regs-only");
6781 vr_save_area_size = 0;
6784 f_stack = TYPE_FIELDS (va_list_type_node);
6785 f_grtop = DECL_CHAIN (f_stack);
6786 f_vrtop = DECL_CHAIN (f_grtop);
6787 f_groff = DECL_CHAIN (f_vrtop);
6788 f_vroff = DECL_CHAIN (f_groff);
6790 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6791 NULL_TREE);
6792 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6793 NULL_TREE);
6794 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6795 NULL_TREE);
6796 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6797 NULL_TREE);
6798 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6799 NULL_TREE);
6801 /* Emit code to initialize STACK, which points to the next varargs stack
6802 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6803 by named arguments. STACK is 8-byte aligned. */
6804 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6805 if (cum->aapcs_stack_size > 0)
6806 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6807 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6808 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6810 /* Emit code to initialize GRTOP, the top of the GR save area.
6811 virtual_incoming_args_rtx should have been 16 byte aligned. */
6812 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6813 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6814 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6816 /* Emit code to initialize VRTOP, the top of the VR save area.
6817 This address is gr_save_area_bytes below GRTOP, rounded
6818 down to the next 16-byte boundary. */
6819 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6820 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6821 STACK_BOUNDARY / BITS_PER_UNIT);
6823 if (vr_offset)
6824 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6825 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6826 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6828 /* Emit code to initialize GROFF, the offset from GRTOP of the
6829 next GPR argument. */
6830 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6831 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6832 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6834 /* Likewise emit code to initialize VROFF, the offset from VRTOP
6835 of the next VR argument. */
6836 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6837 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6838 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
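/* Worked example (numbers are illustrative only): if a varargs callee
   received 5 named integer arguments and 1 named FP argument, then
   ncrn == 5 and nvrn == 1, so gr_save_area_size == 3 * 8 == 24 bytes and
   vr_save_area_size == 7 * 16 == 112 bytes.  GRTOP is set to
   virtual_incoming_args_rtx, VRTOP to GRTOP - 32 (24 rounded up to the
   16-byte stack boundary), __gr_offs to -24 and __vr_offs to -112.  */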
6841 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6843 static tree
6844 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6845 gimple_seq *post_p ATTRIBUTE_UNUSED)
6847 tree addr;
6848 bool indirect_p;
6849 bool is_ha; /* is HFA or HVA. */
6850 bool dw_align; /* double-word align. */
6851 enum machine_mode ag_mode = VOIDmode;
6852 int nregs;
6853 enum machine_mode mode;
6855 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6856 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6857 HOST_WIDE_INT size, rsize, adjust, align;
6858 tree t, u, cond1, cond2;
6860 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6861 if (indirect_p)
6862 type = build_pointer_type (type);
6864 mode = TYPE_MODE (type);
6866 f_stack = TYPE_FIELDS (va_list_type_node);
6867 f_grtop = DECL_CHAIN (f_stack);
6868 f_vrtop = DECL_CHAIN (f_grtop);
6869 f_groff = DECL_CHAIN (f_vrtop);
6870 f_vroff = DECL_CHAIN (f_groff);
6872 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6873 f_stack, NULL_TREE);
6874 size = int_size_in_bytes (type);
6875 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6877 dw_align = false;
6878 adjust = 0;
6879 if (aarch64_vfp_is_call_or_return_candidate (mode,
6880 type,
6881 &ag_mode,
6882 &nregs,
6883 &is_ha))
6885 /* TYPE passed in fp/simd registers. */
6886 if (TARGET_GENERAL_REGS_ONLY)
6887 sorry ("%qs and floating point or vector arguments",
6888 "-mgeneral-regs-only");
6890 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6891 unshare_expr (valist), f_vrtop, NULL_TREE);
6892 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6893 unshare_expr (valist), f_vroff, NULL_TREE);
6895 rsize = nregs * UNITS_PER_VREG;
6897 if (is_ha)
6899 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6900 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6902 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6903 && size < UNITS_PER_VREG)
6905 adjust = UNITS_PER_VREG - size;
6908 else
6910 /* TYPE passed in general registers. */
6911 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6912 unshare_expr (valist), f_grtop, NULL_TREE);
6913 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6914 unshare_expr (valist), f_groff, NULL_TREE);
6915 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6916 nregs = rsize / UNITS_PER_WORD;
6918 if (align > 8)
6919 dw_align = true;
6921 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6922 && size < UNITS_PER_WORD)
6924 adjust = UNITS_PER_WORD - size;
6928 /* Get a local temporary for the field value. */
6929 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6931 /* Emit code to branch if off >= 0. */
6932 t = build2 (GE_EXPR, boolean_type_node, off,
6933 build_int_cst (TREE_TYPE (off), 0));
6934 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6936 if (dw_align)
6938 /* Emit: offs = (offs + 15) & -16. */
6939 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6940 build_int_cst (TREE_TYPE (off), 15));
6941 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6942 build_int_cst (TREE_TYPE (off), -16));
6943 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6945 else
6946 roundup = NULL;
6948 /* Update ap.__[g|v]r_offs */
6949 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6950 build_int_cst (TREE_TYPE (off), rsize));
6951 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6953 /* String up. */
6954 if (roundup)
6955 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6957 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6958 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6959 build_int_cst (TREE_TYPE (f_off), 0));
6960 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6962 /* String up: make sure the assignment happens before the use. */
6963 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6964 COND_EXPR_ELSE (cond1) = t;
6966 /* Prepare the trees handling the argument that is passed on the stack;
6967 the top level node will store in ON_STACK. */
6968 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6969 if (align > 8)
6971 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
6972 t = fold_convert (intDI_type_node, arg);
6973 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6974 build_int_cst (TREE_TYPE (t), 15));
6975 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6976 build_int_cst (TREE_TYPE (t), -16));
6977 t = fold_convert (TREE_TYPE (arg), t);
6978 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6980 else
6981 roundup = NULL;
6982 /* Advance ap.__stack */
6983 t = fold_convert (intDI_type_node, arg);
6984 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6985 build_int_cst (TREE_TYPE (t), size + 7));
6986 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6987 build_int_cst (TREE_TYPE (t), -8));
6988 t = fold_convert (TREE_TYPE (arg), t);
6989 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6990 /* String up roundup and advance. */
6991 if (roundup)
6992 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6993 /* String up with arg */
6994 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6995 /* Big-endianness related address adjustment. */
6996 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6997 && size < UNITS_PER_WORD)
6999 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7000 size_int (UNITS_PER_WORD - size));
7001 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7004 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7005 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7007 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7008 t = off;
7009 if (adjust)
7010 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7011 build_int_cst (TREE_TYPE (off), adjust));
7013 t = fold_convert (sizetype, t);
7014 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7016 if (is_ha)
7018 /* type ha; // treat as "struct {ftype field[n];}"
7019 ... [computing offs]
7020 for (i = 0; i <nregs; ++i, offs += 16)
7021 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7022 return ha; */
7023 int i;
7024 tree tmp_ha, field_t, field_ptr_t;
7026 /* Declare a local variable. */
7027 tmp_ha = create_tmp_var_raw (type, "ha");
7028 gimple_add_tmp_var (tmp_ha);
7030 /* Establish the base type. */
7031 switch (ag_mode)
7033 case SFmode:
7034 field_t = float_type_node;
7035 field_ptr_t = float_ptr_type_node;
7036 break;
7037 case DFmode:
7038 field_t = double_type_node;
7039 field_ptr_t = double_ptr_type_node;
7040 break;
7041 case TFmode:
7042 field_t = long_double_type_node;
7043 field_ptr_t = long_double_ptr_type_node;
7044 break;
7045 /* Half-precision and quad-precision floating point are not fully supported
7046 yet.  Enable the following code once that support is complete; the correct
7047 type node for __fp16 * still needs to be found. */
7048 #if 0
7049 case HFmode:
7050 field_t = float_type_node;
7051 field_ptr_t = float_ptr_type_node;
7052 break;
7053 #endif
7054 case V2SImode:
7055 case V4SImode:
7057 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7058 field_t = build_vector_type_for_mode (innertype, ag_mode);
7059 field_ptr_t = build_pointer_type (field_t);
7061 break;
7062 default:
7063 gcc_assert (0);
7066 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7067 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7068 addr = t;
7069 t = fold_convert (field_ptr_t, addr);
7070 t = build2 (MODIFY_EXPR, field_t,
7071 build1 (INDIRECT_REF, field_t, tmp_ha),
7072 build1 (INDIRECT_REF, field_t, t));
7074 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7075 for (i = 1; i < nregs; ++i)
7077 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7078 u = fold_convert (field_ptr_t, addr);
7079 u = build2 (MODIFY_EXPR, field_t,
7080 build2 (MEM_REF, field_t, tmp_ha,
7081 build_int_cst (field_ptr_t,
7082 (i *
7083 int_size_in_bytes (field_t)))),
7084 build1 (INDIRECT_REF, field_t, u));
7085 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7088 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7089 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7092 COND_EXPR_ELSE (cond2) = t;
7093 addr = fold_convert (build_pointer_type (type), cond1);
7094 addr = build_va_arg_indirect_ref (addr);
7096 if (indirect_p)
7097 addr = build_va_arg_indirect_ref (addr);
7099 return addr;
7102 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7104 static void
7105 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7106 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7107 int no_rtl)
7109 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7110 CUMULATIVE_ARGS local_cum;
7111 int gr_saved, vr_saved;
7113 /* The caller has advanced CUM up to, but not beyond, the last named
7114 argument. Advance a local copy of CUM past the last "real" named
7115 argument, to find out how many registers are left over. */
7116 local_cum = *cum;
7117 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7119 /* Find out how many registers we need to save. */
7120 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7121 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7123 if (TARGET_GENERAL_REGS_ONLY)
7125 if (local_cum.aapcs_nvrn > 0)
7126 sorry ("%qs and floating point or vector arguments",
7127 "-mgeneral-regs-only");
7128 vr_saved = 0;
7131 if (!no_rtl)
7133 if (gr_saved > 0)
7135 rtx ptr, mem;
7137 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7138 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7139 - gr_saved * UNITS_PER_WORD);
7140 mem = gen_frame_mem (BLKmode, ptr);
7141 set_mem_alias_set (mem, get_varargs_alias_set ());
7143 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7144 mem, gr_saved);
7146 if (vr_saved > 0)
7148 /* We can't use move_block_from_reg, because it will use
7149 the wrong mode, storing D regs only. */
7150 enum machine_mode mode = TImode;
7151 int off, i;
7153 /* Set OFF to the offset from virtual_incoming_args_rtx of
7154 the first vector register. The VR save area lies below
7155 the GR one, and is aligned to 16 bytes. */
7156 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7157 STACK_BOUNDARY / BITS_PER_UNIT);
7158 off -= vr_saved * UNITS_PER_VREG;
7160 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7162 rtx ptr, mem;
7164 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7165 mem = gen_frame_mem (mode, ptr);
7166 set_mem_alias_set (mem, get_varargs_alias_set ());
7167 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7168 off += UNITS_PER_VREG;
7173 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7174 any complication of having crtl->args.pretend_args_size changed. */
7175 cfun->machine->frame.saved_varargs_size
7176 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7177 STACK_BOUNDARY / BITS_PER_UNIT)
7178 + vr_saved * UNITS_PER_VREG);
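/* Rough picture of the resulting save areas (illustrative; actual sizes
   depend on how many registers the named arguments consumed):

       virtual_incoming_args_rtx ->  [ stack-passed varargs ... ]
                                     [ GR save area: gr_saved * 8 bytes  ]
                                     [ pad GR area to a 16-byte multiple ]
                                     [ VR save area: vr_saved * 16 bytes ]
       (lower addresses towards the bottom)  */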
7181 static void
7182 aarch64_conditional_register_usage (void)
7184 int i;
7185 if (!TARGET_FLOAT)
7187 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7189 fixed_regs[i] = 1;
7190 call_used_regs[i] = 1;
7195 /* Walk down the type tree of TYPE counting consecutive base elements.
7196 If *MODEP is VOIDmode, then set it to the first valid floating point
7197 type. If a non-floating point type is found, or if a floating point
7198 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7199 otherwise return the count in the sub-tree. */
7200 static int
7201 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
7203 enum machine_mode mode;
7204 HOST_WIDE_INT size;
7206 switch (TREE_CODE (type))
7208 case REAL_TYPE:
7209 mode = TYPE_MODE (type);
7210 if (mode != DFmode && mode != SFmode && mode != TFmode)
7211 return -1;
7213 if (*modep == VOIDmode)
7214 *modep = mode;
7216 if (*modep == mode)
7217 return 1;
7219 break;
7221 case COMPLEX_TYPE:
7222 mode = TYPE_MODE (TREE_TYPE (type));
7223 if (mode != DFmode && mode != SFmode && mode != TFmode)
7224 return -1;
7226 if (*modep == VOIDmode)
7227 *modep = mode;
7229 if (*modep == mode)
7230 return 2;
7232 break;
7234 case VECTOR_TYPE:
7235 /* Use V2SImode and V4SImode as representatives of all 64-bit
7236 and 128-bit vector types. */
7237 size = int_size_in_bytes (type);
7238 switch (size)
7240 case 8:
7241 mode = V2SImode;
7242 break;
7243 case 16:
7244 mode = V4SImode;
7245 break;
7246 default:
7247 return -1;
7250 if (*modep == VOIDmode)
7251 *modep = mode;
7253 /* Vector modes are considered to be opaque: two vectors are
7254 equivalent for the purposes of being homogeneous aggregates
7255 if they are the same size. */
7256 if (*modep == mode)
7257 return 1;
7259 break;
7261 case ARRAY_TYPE:
7263 int count;
7264 tree index = TYPE_DOMAIN (type);
7266 /* Can't handle incomplete types nor sizes that are not
7267 fixed. */
7268 if (!COMPLETE_TYPE_P (type)
7269 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7270 return -1;
7272 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7273 if (count == -1
7274 || !index
7275 || !TYPE_MAX_VALUE (index)
7276 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7277 || !TYPE_MIN_VALUE (index)
7278 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7279 || count < 0)
7280 return -1;
7282 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7283 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7285 /* There must be no padding. */
7286 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7287 return -1;
7289 return count;
7292 case RECORD_TYPE:
7294 int count = 0;
7295 int sub_count;
7296 tree field;
7298 /* Can't handle incomplete types nor sizes that are not
7299 fixed. */
7300 if (!COMPLETE_TYPE_P (type)
7301 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7302 return -1;
7304 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7306 if (TREE_CODE (field) != FIELD_DECL)
7307 continue;
7309 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7310 if (sub_count < 0)
7311 return -1;
7312 count += sub_count;
7315 /* There must be no padding. */
7316 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7317 return -1;
7319 return count;
7322 case UNION_TYPE:
7323 case QUAL_UNION_TYPE:
7325 /* These aren't very interesting except in a degenerate case. */
7326 int count = 0;
7327 int sub_count;
7328 tree field;
7330 /* Can't handle incomplete types nor sizes that are not
7331 fixed. */
7332 if (!COMPLETE_TYPE_P (type)
7333 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7334 return -1;
7336 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7338 if (TREE_CODE (field) != FIELD_DECL)
7339 continue;
7341 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7342 if (sub_count < 0)
7343 return -1;
7344 count = count > sub_count ? count : sub_count;
7347 /* There must be no padding. */
7348 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7349 return -1;
7351 return count;
7354 default:
7355 break;
7358 return -1;
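/* A few illustrative results (sketch only): "struct { double x, y; }"
   returns 2 with *MODEP == DFmode; "float v[3]" returns 3 with SFmode;
   "struct { float f; double d; }" returns -1 because the element modes
   differ; and a union of a float and a float[2] returns 2, the maximum
   over its members.  */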
7361 /* Return true if we use LRA instead of reload pass. */
7362 static bool
7363 aarch64_lra_p (void)
7365 return aarch64_lra_flag;
7368 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7369 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7370 array types. The C99 floating-point complex types are also considered
7371 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7372 types, which are GCC extensions and out of the scope of AAPCS64, are
7373 treated as composite types here as well.
7375 Note that MODE itself is not sufficient in determining whether a type
7376 is such a composite type or not. This is because
7377 stor-layout.c:compute_record_mode may have already changed the MODE
7378 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7379 structure with only one field may have its MODE set to the mode of the
7380 field. Also an integer mode whose size matches the size of the
7381 RECORD_TYPE type may be used to substitute the original mode
7382 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7383 solely relied on. */
7385 static bool
7386 aarch64_composite_type_p (const_tree type,
7387 enum machine_mode mode)
7389 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7390 return true;
7392 if (mode == BLKmode
7393 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7394 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7395 return true;
7397 return false;
7400 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7401 type as described in AAPCS64 \S 4.1.2.
7403 See the comment above aarch64_composite_type_p for the notes on MODE. */
7405 static bool
7406 aarch64_short_vector_p (const_tree type,
7407 enum machine_mode mode)
7409 HOST_WIDE_INT size = -1;
7411 if (type && TREE_CODE (type) == VECTOR_TYPE)
7412 size = int_size_in_bytes (type);
7413 else if (!aarch64_composite_type_p (type, mode)
7414 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7415 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7416 size = GET_MODE_SIZE (mode);
7418 return (size == 8 || size == 16) ? true : false;
7421 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7422 shall be passed or returned in simd/fp register(s) (providing these
7423 parameter passing registers are available).
7425 Upon successful return, *COUNT returns the number of needed registers,
7426 *BASE_MODE returns the mode of the individual register and, when IS_HA
7427 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7428 floating-point aggregate or a homogeneous short-vector aggregate. */
7430 static bool
7431 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7432 const_tree type,
7433 enum machine_mode *base_mode,
7434 int *count,
7435 bool *is_ha)
7437 enum machine_mode new_mode = VOIDmode;
7438 bool composite_p = aarch64_composite_type_p (type, mode);
7440 if (is_ha != NULL) *is_ha = false;
7442 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7443 || aarch64_short_vector_p (type, mode))
7445 *count = 1;
7446 new_mode = mode;
7448 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7450 if (is_ha != NULL) *is_ha = true;
7451 *count = 2;
7452 new_mode = GET_MODE_INNER (mode);
7454 else if (type && composite_p)
7456 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7458 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7460 if (is_ha != NULL) *is_ha = true;
7461 *count = ag_count;
7463 else
7464 return false;
7466 else
7467 return false;
7469 *base_mode = new_mode;
7470 return true;
7473 /* Implement TARGET_STRUCT_VALUE_RTX. */
7475 static rtx
7476 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7477 int incoming ATTRIBUTE_UNUSED)
7479 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7482 /* Implements target hook vector_mode_supported_p. */
7483 static bool
7484 aarch64_vector_mode_supported_p (enum machine_mode mode)
7486 if (TARGET_SIMD
7487 && (mode == V4SImode || mode == V8HImode
7488 || mode == V16QImode || mode == V2DImode
7489 || mode == V2SImode || mode == V4HImode
7490 || mode == V8QImode || mode == V2SFmode
7491 || mode == V4SFmode || mode == V2DFmode
7492 || mode == V1DFmode))
7493 return true;
7495 return false;
7498 /* Return appropriate SIMD container
7499 for MODE within a vector of WIDTH bits. */
7500 static enum machine_mode
7501 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7503 gcc_assert (width == 64 || width == 128);
7504 if (TARGET_SIMD)
7506 if (width == 128)
7507 switch (mode)
7509 case DFmode:
7510 return V2DFmode;
7511 case SFmode:
7512 return V4SFmode;
7513 case SImode:
7514 return V4SImode;
7515 case HImode:
7516 return V8HImode;
7517 case QImode:
7518 return V16QImode;
7519 case DImode:
7520 return V2DImode;
7521 default:
7522 break;
7524 else
7525 switch (mode)
7527 case SFmode:
7528 return V2SFmode;
7529 case SImode:
7530 return V2SImode;
7531 case HImode:
7532 return V4HImode;
7533 case QImode:
7534 return V8QImode;
7535 default:
7536 break;
7539 return word_mode;
7542 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7543 static enum machine_mode
7544 aarch64_preferred_simd_mode (enum machine_mode mode)
7546 return aarch64_simd_container_mode (mode, 128);
7549 /* Return the bitmask of possible vector sizes for the vectorizer
7550 to iterate over. */
7551 static unsigned int
7552 aarch64_autovectorize_vector_sizes (void)
7554 return (16 | 8);
7557 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7558 vector types in order to conform to the AAPCS64 (see "Procedure
7559 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7560 qualify for emission with the mangled names defined in that document,
7561 a vector type must not only be of the correct mode but also be
7562 composed of AdvSIMD vector element types (e.g.
7563 __builtin_aarch64_simd_qi); these types are registered by
7564 aarch64_init_simd_builtins (). In other words, vector types defined
7565 in other ways e.g. via vector_size attribute will get default
7566 mangled names. */
7567 typedef struct
7569 enum machine_mode mode;
7570 const char *element_type_name;
7571 const char *mangled_name;
7572 } aarch64_simd_mangle_map_entry;
7574 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7575 /* 64-bit containerized types. */
7576 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7577 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7578 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7579 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7580 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7581 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7582 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7583 { DImode, "__builtin_aarch64_simd_di", "11__Int64x1_t" },
7584 { DImode, "__builtin_aarch64_simd_udi", "12__Uint64x1_t" },
7585 { V1DFmode, "__builtin_aarch64_simd_df", "13__Float64x1_t" },
7586 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7587 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7588 /* 128-bit containerized types. */
7589 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7590 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7591 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7592 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7593 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7594 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7595 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7596 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7597 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7598 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7599 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7600 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7601 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7602 { VOIDmode, NULL, NULL }
7605 /* Implement TARGET_MANGLE_TYPE. */
7607 static const char *
7608 aarch64_mangle_type (const_tree type)
7610 /* The AArch64 ABI documents say that "__va_list" has to be
7611 mangled as if it is in the "std" namespace. */
7612 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7613 return "St9__va_list";
7615 /* Check the mode of the vector type, and the name of the vector
7616 element type, against the table. */
7617 if (TREE_CODE (type) == VECTOR_TYPE)
7619 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7621 while (pos->mode != VOIDmode)
7623 tree elt_type = TREE_TYPE (type);
7625 if (pos->mode == TYPE_MODE (type)
7626 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7627 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7628 pos->element_type_name))
7629 return pos->mangled_name;
7631 pos++;
7635 /* Use the default mangling. */
7636 return NULL;
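/* Mangling example (a hedged sketch using a hypothetical function "f",
   assuming the usual Itanium C++ ABI rules): for the AdvSIMD type whose
   element type is __builtin_aarch64_simd_qi in V8QImode, the table above
   supplies "10__Int8x8_t", so "void f (int8x8_t)" would be emitted as
   "_Z1f10__Int8x8_t" rather than with the default vector mangling.  */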
7639 static int
7640 is_mem_p (rtx *x, void *data ATTRIBUTE_UNUSED)
7642 return MEM_P (*x);
7645 static bool
7646 is_memory_op (rtx_insn *mem_insn)
7648 rtx pattern = PATTERN (mem_insn);
7649 return for_each_rtx (&pattern, is_mem_p, NULL);
7652 /* Find the first rtx_insn before insn that will generate an assembly
7653 instruction. */
7655 static rtx_insn *
7656 aarch64_prev_real_insn (rtx_insn *insn)
7658 if (!insn)
7659 return NULL;
7663 insn = prev_real_insn (insn);
7665 while (insn && recog_memoized (insn) < 0);
7667 return insn;
7670 static bool
7671 is_madd_op (enum attr_type t1)
7673 unsigned int i;
7674 /* A number of these may be AArch32 only. */
7675 enum attr_type mlatypes[] = {
7676 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
7677 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
7678 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
7681 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
7683 if (t1 == mlatypes[i])
7684 return true;
7687 return false;
7690 /* Check if there is a register dependency between a load and the insn
7691 for which we hold recog_data. */
7693 static bool
7694 dep_between_memop_and_curr (rtx memop)
7696 rtx load_reg;
7697 int opno;
7699 if (!memop)
7700 return false;
7702 if (!REG_P (SET_DEST (memop)))
7703 return false;
7705 load_reg = SET_DEST (memop);
7706 for (opno = 0; opno < recog_data.n_operands; opno++)
7708 rtx operand = recog_data.operand[opno];
7709 if (REG_P (operand)
7710 && reg_overlap_mentioned_p (load_reg, operand))
7711 return true;
7714 return false;
7717 bool
7718 aarch64_madd_needs_nop (rtx_insn* insn)
7720 enum attr_type attr_type;
7721 rtx_insn *prev;
7722 rtx body;
7724 if (!aarch64_fix_a53_err835769)
7725 return false;
7727 if (recog_memoized (insn) < 0)
7728 return false;
7730 attr_type = get_attr_type (insn);
7731 if (!is_madd_op (attr_type))
7732 return false;
7734 prev = aarch64_prev_real_insn (insn);
7735 if (!prev)
7736 return false;
7738 body = single_set (prev);
7740 /* If the previous insn is a memory op and there is no dependency between
7741 it and the madd, emit a nop between them. If we know the previous insn is
7742 a memory op but BODY is NULL, emit the nop to be safe; it is probably a
7743 load/store pair insn. */
7744 if (is_memory_op (prev)
7745 && GET_MODE (recog_data.operand[0]) == DImode
7746 && (!dep_between_memop_and_curr (body)))
7747 return true;
7749 return false;
7753 void
7754 aarch64_final_prescan_insn (rtx_insn *insn)
7756 if (aarch64_madd_needs_nop (insn))
7757 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
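/* Illustration (hedged; see the erratum documentation for the precise
   conditions): the workaround turns a sequence like

       ldr   x1, [x2]
       madd  x0, x3, x4, x5

   into

       ldr   x1, [x2]
       nop                // between mem op and mult-accumulate
       madd  x0, x3, x4, x5

   so that a 64-bit multiply-accumulate never immediately follows a memory
   operation on affected Cortex-A53 parts.  */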
7761 /* Return the equivalent letter for size. */
7762 static char
7763 sizetochar (int size)
7765 switch (size)
7767 case 64: return 'd';
7768 case 32: return 's';
7769 case 16: return 'h';
7770 case 8 : return 'b';
7771 default: gcc_unreachable ();
7775 /* Return true iff x is a uniform vector of floating-point
7776 constants, and the constant can be represented in
7777 quarter-precision form.  Note that, as aarch64_float_const_representable_p
7778 rejects both +0.0 and -0.0, those values are rejected here as well. */
7779 static bool
7780 aarch64_vect_float_const_representable_p (rtx x)
7782 int i = 0;
7783 REAL_VALUE_TYPE r0, ri;
7784 rtx x0, xi;
7786 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7787 return false;
7789 x0 = CONST_VECTOR_ELT (x, 0);
7790 if (!CONST_DOUBLE_P (x0))
7791 return false;
7793 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7795 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7797 xi = CONST_VECTOR_ELT (x, i);
7798 if (!CONST_DOUBLE_P (xi))
7799 return false;
7801 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7802 if (!REAL_VALUES_EQUAL (r0, ri))
7803 return false;
7806 return aarch64_float_const_representable_p (x0);
7809 /* Return true for valid and false for invalid. */
7810 bool
7811 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7812 struct simd_immediate_info *info)
7814 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7815 matches = 1; \
7816 for (i = 0; i < idx; i += (STRIDE)) \
7817 if (!(TEST)) \
7818 matches = 0; \
7819 if (matches) \
7821 immtype = (CLASS); \
7822 elsize = (ELSIZE); \
7823 eshift = (SHIFT); \
7824 emvn = (NEG); \
7825 break; \
7828 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7829 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7830 unsigned char bytes[16];
7831 int immtype = -1, matches;
7832 unsigned int invmask = inverse ? 0xff : 0;
7833 int eshift, emvn;
7835 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7837 if (! (aarch64_simd_imm_zero_p (op, mode)
7838 || aarch64_vect_float_const_representable_p (op)))
7839 return false;
7841 if (info)
7843 info->value = CONST_VECTOR_ELT (op, 0);
7844 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7845 info->mvn = false;
7846 info->shift = 0;
7849 return true;
7852 /* Splat vector constant out into a byte vector. */
7853 for (i = 0; i < n_elts; i++)
7855 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7856 it must be laid out in the vector register in reverse order. */
7857 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7858 unsigned HOST_WIDE_INT elpart;
7859 unsigned int part, parts;
7861 if (CONST_INT_P (el))
7863 elpart = INTVAL (el);
7864 parts = 1;
7866 else if (GET_CODE (el) == CONST_DOUBLE)
7868 elpart = CONST_DOUBLE_LOW (el);
7869 parts = 2;
7871 else
7872 gcc_unreachable ();
7874 for (part = 0; part < parts; part++)
7876 unsigned int byte;
7877 for (byte = 0; byte < innersize; byte++)
7879 bytes[idx++] = (elpart & 0xff) ^ invmask;
7880 elpart >>= BITS_PER_UNIT;
7882 if (GET_CODE (el) == CONST_DOUBLE)
7883 elpart = CONST_DOUBLE_HIGH (el);
7887 /* Sanity check. */
7888 gcc_assert (idx == GET_MODE_SIZE (mode));
7892 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7893 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7895 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7896 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7898 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7899 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7901 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7902 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7904 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7906 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7908 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7909 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7911 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7912 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7914 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7915 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7917 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7918 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7920 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7922 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7924 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7925 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7927 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7928 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7930 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7931 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7933 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7934 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7936 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7938 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7939 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7941 while (0);
7943 if (immtype == -1)
7944 return false;
7946 if (info)
7948 info->element_width = elsize;
7949 info->mvn = emvn != 0;
7950 info->shift = eshift;
7952 unsigned HOST_WIDE_INT imm = 0;
7954 if (immtype >= 12 && immtype <= 15)
7955 info->msl = true;
7957 /* Un-invert bytes of recognized vector, if necessary. */
7958 if (invmask != 0)
7959 for (i = 0; i < idx; i++)
7960 bytes[i] ^= invmask;
7962 if (immtype == 17)
7964 /* FIXME: Broken on 32-bit H_W_I hosts. */
7965 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7967 for (i = 0; i < 8; i++)
7968 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7969 << (i * BITS_PER_UNIT);
7972 info->value = GEN_INT (imm);
7974 else
7976 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7977 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7979 /* Construct 'abcdefgh' because the assembler cannot handle
7980 generic constants. */
7981 if (info->mvn)
7982 imm = ~imm;
7983 imm = (imm >> info->shift) & 0xff;
7984 info->value = GEN_INT (imm);
7988 return true;
7989 #undef CHECK
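/* Example (informal): a V4SImode vector with every element equal to
   0x00ab0000 matches the "byte 2 set, other bytes zero" pattern above, so
   INFO ends up with element_width 32, shift 16, mvn false and value 0xab --
   roughly what the assembler accepts as "movi v0.4s, #0xab, lsl #16".  */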
7992 /* Check whether immediate shift constants are within range. */
7993 bool
7994 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7996 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7997 if (left)
7998 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
7999 else
8000 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
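/* For example, for V4SImode the element width is 32, so valid left-shift
   immediates are 0..31 and valid right-shift immediates are 1..32.  */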
8003 /* Return true if X is a uniform vector where all elements
8004 are either the floating-point constant 0.0 or the
8005 integer constant 0. */
8006 bool
8007 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
8009 return x == CONST0_RTX (mode);
8012 bool
8013 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
8015 HOST_WIDE_INT imm = INTVAL (x);
8016 int i;
8018 for (i = 0; i < 8; i++)
8020 unsigned int byte = imm & 0xff;
8021 if (byte != 0xff && byte != 0)
8022 return false;
8023 imm >>= 8;
8026 return true;
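/* For instance, 0x00ff00ff00ff00ff and 0xffffffff00000000 are accepted
   (every byte is 0x00 or 0xff), whereas 0x0000000000000001 is rejected.  */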
8029 bool
8030 aarch64_mov_operand_p (rtx x,
8031 enum aarch64_symbol_context context,
8032 enum machine_mode mode)
8034 if (GET_CODE (x) == HIGH
8035 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8036 return true;
8038 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
8039 return true;
8041 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8042 return true;
8044 return aarch64_classify_symbolic_expression (x, context)
8045 == SYMBOL_TINY_ABSOLUTE;
8048 /* Return a const_int vector of VAL. */
8050 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
8052 int nunits = GET_MODE_NUNITS (mode);
8053 rtvec v = rtvec_alloc (nunits);
8054 int i;
8056 for (i=0; i < nunits; i++)
8057 RTVEC_ELT (v, i) = GEN_INT (val);
8059 return gen_rtx_CONST_VECTOR (mode, v);
8062 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8064 bool
8065 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
8067 enum machine_mode vmode;
8069 gcc_assert (!VECTOR_MODE_P (mode));
8070 vmode = aarch64_preferred_simd_mode (mode);
8071 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8072 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8075 /* Construct and return a PARALLEL RTX vector with elements numbering the
8076 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8077 the vector - from the perspective of the architecture. This does not
8078 line up with GCC's perspective on lane numbers, so we end up with
8079 different masks depending on our target endian-ness. The diagram
8080 below may help. We must draw the distinction when building masks
8081 which select one half of the vector. An instruction selecting
8082 architectural low-lanes for a big-endian target, must be described using
8083 a mask selecting GCC high-lanes.
8085 Big-Endian Little-Endian
8087 GCC 0 1 2 3 3 2 1 0
8088 | x | x | x | x | | x | x | x | x |
8089 Architecture 3 2 1 0 3 2 1 0
8091 Low Mask: { 2, 3 } { 0, 1 }
8092 High Mask: { 0, 1 } { 2, 3 }
8096 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
8098 int nunits = GET_MODE_NUNITS (mode);
8099 rtvec v = rtvec_alloc (nunits / 2);
8100 int high_base = nunits / 2;
8101 int low_base = 0;
8102 int base;
8103 rtx t1;
8104 int i;
8106 if (BYTES_BIG_ENDIAN)
8107 base = high ? low_base : high_base;
8108 else
8109 base = high ? high_base : low_base;
8111 for (i = 0; i < nunits / 2; i++)
8112 RTVEC_ELT (v, i) = GEN_INT (base + i);
8114 t1 = gen_rtx_PARALLEL (mode, v);
8115 return t1;
8118 /* Check OP for validity as a PARALLEL RTX vector with elements
8119 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8120 from the perspective of the architecture. See the diagram above
8121 aarch64_simd_vect_par_cnst_half for more details. */
8123 bool
8124 aarch64_simd_check_vect_par_cnst_half (rtx op, enum machine_mode mode,
8125 bool high)
8127 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8128 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8129 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8130 int i = 0;
8132 if (!VECTOR_MODE_P (mode))
8133 return false;
8135 if (count_op != count_ideal)
8136 return false;
8138 for (i = 0; i < count_ideal; i++)
8140 rtx elt_op = XVECEXP (op, 0, i);
8141 rtx elt_ideal = XVECEXP (ideal, 0, i);
8143 if (!CONST_INT_P (elt_op)
8144 || INTVAL (elt_ideal) != INTVAL (elt_op))
8145 return false;
8147 return true;
8150 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8151 HIGH (exclusive). */
8152 void
8153 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
8155 HOST_WIDE_INT lane;
8156 gcc_assert (CONST_INT_P (operand));
8157 lane = INTVAL (operand);
8159 if (lane < low || lane >= high)
8160 error ("lane out of range");
8163 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
8164 registers). */
8165 void
8166 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
8167 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8168 rtx op1)
8170 rtx mem = gen_rtx_MEM (mode, destaddr);
8171 rtx tmp1 = gen_reg_rtx (mode);
8172 rtx tmp2 = gen_reg_rtx (mode);
8174 emit_insn (intfn (tmp1, op1, tmp2));
8176 emit_move_insn (mem, tmp1);
8177 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8178 emit_move_insn (mem, tmp2);
8181 /* Return TRUE if OP is a valid vector addressing mode. */
8182 bool
8183 aarch64_simd_mem_operand_p (rtx op)
8185 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8186 || REG_P (XEXP (op, 0)));
8189 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8190 not to early-clobber SRC registers in the process.
8192 We assume that the operands described by SRC and DEST represent a
8193 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
8194 number of components into which the copy has been decomposed. */
8195 void
8196 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8197 rtx *src, unsigned int count)
8199 unsigned int i;
8201 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8202 || REGNO (operands[0]) < REGNO (operands[1]))
8204 for (i = 0; i < count; i++)
8206 operands[2 * i] = dest[i];
8207 operands[2 * i + 1] = src[i];
8210 else
8212 for (i = 0; i < count; i++)
8214 operands[2 * i] = dest[count - i - 1];
8215 operands[2 * i + 1] = src[count - i - 1];
8220 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8221 one of VSTRUCT modes: OI, CI or XI. */
8223 aarch64_simd_attr_length_move (rtx_insn *insn)
8225 enum machine_mode mode;
8227 extract_insn_cached (insn);
8229 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8231 mode = GET_MODE (recog_data.operand[0]);
8232 switch (mode)
8234 case OImode:
8235 return 8;
8236 case CImode:
8237 return 12;
8238 case XImode:
8239 return 16;
8240 default:
8241 gcc_unreachable ();
8244 return 4;
8247 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8248 alignment of a vector to 128 bits. */
8249 static HOST_WIDE_INT
8250 aarch64_simd_vector_alignment (const_tree type)
8252 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8253 return MIN (align, 128);
8256 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8257 static bool
8258 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8260 if (is_packed)
8261 return false;
8263 /* We guarantee alignment for vectors up to 128-bits. */
8264 if (tree_int_cst_compare (TYPE_SIZE (type),
8265 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8266 return false;
8268 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8269 return true;
8272 /* If VALS is a vector constant that can be loaded into a register
8273 using DUP, generate instructions to do so and return an RTX to
8274 assign to the register. Otherwise return NULL_RTX. */
8275 static rtx
8276 aarch64_simd_dup_constant (rtx vals)
8278 enum machine_mode mode = GET_MODE (vals);
8279 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8280 int n_elts = GET_MODE_NUNITS (mode);
8281 bool all_same = true;
8282 rtx x;
8283 int i;
8285 if (GET_CODE (vals) != CONST_VECTOR)
8286 return NULL_RTX;
8288 for (i = 1; i < n_elts; ++i)
8290 x = CONST_VECTOR_ELT (vals, i);
8291 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8292 all_same = false;
8295 if (!all_same)
8296 return NULL_RTX;
8298 /* We can load this constant by using DUP and a constant in a
8299 single general-purpose register. This will be cheaper than a vector
8300 load. */
8301 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8302 return gen_rtx_VEC_DUPLICATE (mode, x);
8306 /* Generate code to load VALS, which is a PARALLEL containing only
8307 constants (for vec_init) or CONST_VECTOR, efficiently into a
8308 register. Returns an RTX to copy into the register, or NULL_RTX
8309 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8310 static rtx
8311 aarch64_simd_make_constant (rtx vals)
8313 enum machine_mode mode = GET_MODE (vals);
8314 rtx const_dup;
8315 rtx const_vec = NULL_RTX;
8316 int n_elts = GET_MODE_NUNITS (mode);
8317 int n_const = 0;
8318 int i;
8320 if (GET_CODE (vals) == CONST_VECTOR)
8321 const_vec = vals;
8322 else if (GET_CODE (vals) == PARALLEL)
8324 /* A CONST_VECTOR must contain only CONST_INTs and
8325 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8326 Only store valid constants in a CONST_VECTOR. */
8327 for (i = 0; i < n_elts; ++i)
8329 rtx x = XVECEXP (vals, 0, i);
8330 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8331 n_const++;
8333 if (n_const == n_elts)
8334 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8336 else
8337 gcc_unreachable ();
8339 if (const_vec != NULL_RTX
8340 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8341 /* Load using MOVI/MVNI. */
8342 return const_vec;
8343 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8344 /* Loaded using DUP. */
8345 return const_dup;
8346 else if (const_vec != NULL_RTX)
8347 /* Load from constant pool. We can not take advantage of single-cycle
8348 LD1 because we need a PC-relative addressing mode. */
8349 return const_vec;
8350 else
8351 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8352 We can not construct an initializer. */
8353 return NULL_RTX;
8356 void
8357 aarch64_expand_vector_init (rtx target, rtx vals)
8359 enum machine_mode mode = GET_MODE (target);
8360 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8361 int n_elts = GET_MODE_NUNITS (mode);
8362 int n_var = 0, one_var = -1;
8363 bool all_same = true;
8364 rtx x, mem;
8365 int i;
8367 x = XVECEXP (vals, 0, 0);
8368 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8369 n_var = 1, one_var = 0;
8371 for (i = 1; i < n_elts; ++i)
8373 x = XVECEXP (vals, 0, i);
8374 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8375 ++n_var, one_var = i;
8377 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8378 all_same = false;
8381 if (n_var == 0)
8383 rtx constant = aarch64_simd_make_constant (vals);
8384 if (constant != NULL_RTX)
8386 emit_move_insn (target, constant);
8387 return;
8391 /* Splat a single non-constant element if we can. */
8392 if (all_same)
8394 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8395 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8396 return;
8399 /* One field is non-constant. Load constant then overwrite varying
8400 field. This is more efficient than using the stack. */
8401 if (n_var == 1)
8403 rtx copy = copy_rtx (vals);
8404 rtx index = GEN_INT (one_var);
8405 enum insn_code icode;
8407 /* Load constant part of vector, substitute neighboring value for
8408 varying element. */
8409 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8410 aarch64_expand_vector_init (target, copy);
8412 /* Insert variable. */
8413 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8414 icode = optab_handler (vec_set_optab, mode);
8415 gcc_assert (icode != CODE_FOR_nothing);
8416 emit_insn (GEN_FCN (icode) (target, x, index));
8417 return;
8420 /* Construct the vector in memory one field at a time
8421 and load the whole vector. */
8422 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8423 for (i = 0; i < n_elts; i++)
8424 emit_move_insn (adjust_address_nv (mem, inner_mode,
8425 i * GET_MODE_SIZE (inner_mode)),
8426 XVECEXP (vals, 0, i));
8427 emit_move_insn (target, mem);
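/* Worked example of the three variable cases above, for V4SImode: an
   initializer { x, x, x, x } with non-constant x takes the DUP path;
   { 1, 2, 3, x } is first built as the constant { 1, 2, 3, 3 } and then x
   is inserted into lane 3 with vec_set; a fully variable { a, b, c, d } is
   stored element by element into a stack temporary and loaded back as a
   whole vector.  */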
8431 static unsigned HOST_WIDE_INT
8432 aarch64_shift_truncation_mask (enum machine_mode mode)
8434 return
8435 (aarch64_vector_mode_supported_p (mode)
8436 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
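/* For example, for DImode this returns 63, so a variable scalar shift
   amount may be assumed to be truncated to its low 6 bits, whereas for the
   AdvSIMD vector modes it returns 0 and no such truncation is assumed.  */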
8439 #ifndef TLS_SECTION_ASM_FLAG
8440 #define TLS_SECTION_ASM_FLAG 'T'
8441 #endif
8443 void
8444 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8445 tree decl ATTRIBUTE_UNUSED)
8447 char flagchars[10], *f = flagchars;
8449 /* If we have already declared this section, we can use an
8450 abbreviated form to switch back to it -- unless this section is
8451 part of a COMDAT group, in which case GAS requires the full
8452 declaration every time. */
8453 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8454 && (flags & SECTION_DECLARED))
8456 fprintf (asm_out_file, "\t.section\t%s\n", name);
8457 return;
8460 if (!(flags & SECTION_DEBUG))
8461 *f++ = 'a';
8462 if (flags & SECTION_WRITE)
8463 *f++ = 'w';
8464 if (flags & SECTION_CODE)
8465 *f++ = 'x';
8466 if (flags & SECTION_SMALL)
8467 *f++ = 's';
8468 if (flags & SECTION_MERGE)
8469 *f++ = 'M';
8470 if (flags & SECTION_STRINGS)
8471 *f++ = 'S';
8472 if (flags & SECTION_TLS)
8473 *f++ = TLS_SECTION_ASM_FLAG;
8474 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8475 *f++ = 'G';
8476 *f = '\0';
8478 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8480 if (!(flags & SECTION_NOTYPE))
8482 const char *type;
8483 const char *format;
8485 if (flags & SECTION_BSS)
8486 type = "nobits";
8487 else
8488 type = "progbits";
8490 #ifdef TYPE_OPERAND_FMT
8491 format = "," TYPE_OPERAND_FMT;
8492 #else
8493 format = ",@%s";
8494 #endif
8496 fprintf (asm_out_file, format, type);
8498 if (flags & SECTION_ENTSIZE)
8499 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8500 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8502 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8503 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8504 else
8505 fprintf (asm_out_file, ",%s,comdat",
8506 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8510 putc ('\n', asm_out_file);
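/* For example, assuming the default ",@%s" type format above, a writable
   TLS BSS section declared through this hook comes out as:

       .section .tbss,"awT",@nobits

   with ",<entsize>" and ",<group>,comdat" appended when the corresponding
   SECTION_* flags are set.  */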
8513 /* Select a format to encode pointers in exception handling data. */
8514 int
8515 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8517 int type;
8518 switch (aarch64_cmodel)
8520 case AARCH64_CMODEL_TINY:
8521 case AARCH64_CMODEL_TINY_PIC:
8522 case AARCH64_CMODEL_SMALL:
8523 case AARCH64_CMODEL_SMALL_PIC:
8524 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8525 for everything. */
8526 type = DW_EH_PE_sdata4;
8527 break;
8528 default:
8529 /* No assumptions here. 8-byte relocs required. */
8530 type = DW_EH_PE_sdata8;
8531 break;
8533 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8536 /* Emit load exclusive. */
8538 static void
8539 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8540 rtx mem, rtx model_rtx)
8542 rtx (*gen) (rtx, rtx, rtx);
8544 switch (mode)
8546 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8547 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8548 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8549 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8550 default:
8551 gcc_unreachable ();
8554 emit_insn (gen (rval, mem, model_rtx));
8557 /* Emit store exclusive. */
8559 static void
8560 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8561 rtx rval, rtx mem, rtx model_rtx)
8563 rtx (*gen) (rtx, rtx, rtx, rtx);
8565 switch (mode)
8567 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8568 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8569 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8570 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8571 default:
8572 gcc_unreachable ();
8575 emit_insn (gen (bval, rval, mem, model_rtx));
8578 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
8580 static void
8581 aarch64_emit_unlikely_jump (rtx insn)
8583 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8585 insn = emit_jump_insn (insn);
8586 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8589 /* Expand a compare and swap pattern. */
8591 void
8592 aarch64_expand_compare_and_swap (rtx operands[])
8594 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8595 enum machine_mode mode, cmp_mode;
8596 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8598 bval = operands[0];
8599 rval = operands[1];
8600 mem = operands[2];
8601 oldval = operands[3];
8602 newval = operands[4];
8603 is_weak = operands[5];
8604 mod_s = operands[6];
8605 mod_f = operands[7];
8606 mode = GET_MODE (mem);
8607 cmp_mode = mode;
8609 /* Normally the succ memory model must be stronger than fail, but in the
8610 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8611 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8613 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8614 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8615 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8617 switch (mode)
8619 case QImode:
8620 case HImode:
8621 /* For short modes, we're going to perform the comparison in SImode,
8622 so do the zero-extension now. */
8623 cmp_mode = SImode;
8624 rval = gen_reg_rtx (SImode);
8625 oldval = convert_modes (SImode, mode, oldval, true);
8626 /* Fall through. */
8628 case SImode:
8629 case DImode:
8630 /* Force the value into a register if needed. */
8631 if (!aarch64_plus_operand (oldval, mode))
8632 oldval = force_reg (cmp_mode, oldval);
8633 break;
8635 default:
8636 gcc_unreachable ();
8639 switch (mode)
8641 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8642 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8643 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8644 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8645 default:
8646 gcc_unreachable ();
8649 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8651 if (mode == QImode || mode == HImode)
8652 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8654 x = gen_rtx_REG (CCmode, CC_REGNUM);
8655 x = gen_rtx_EQ (SImode, x, const0_rtx);
8656 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8659 /* Split a compare and swap pattern. */
8661 void
8662 aarch64_split_compare_and_swap (rtx operands[])
8664 rtx rval, mem, oldval, newval, scratch;
8665 enum machine_mode mode;
8666 bool is_weak;
8667 rtx_code_label *label1, *label2;
8668 rtx x, cond;
8670 rval = operands[0];
8671 mem = operands[1];
8672 oldval = operands[2];
8673 newval = operands[3];
8674 is_weak = (operands[4] != const0_rtx);
8675 scratch = operands[7];
8676 mode = GET_MODE (mem);
8678 label1 = NULL;
8679 if (!is_weak)
8681 label1 = gen_label_rtx ();
8682 emit_label (label1);
8684 label2 = gen_label_rtx ();
8686 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8688 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8689 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8690 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8691 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8692 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8694 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8696 if (!is_weak)
8698 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8699 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8700 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8701 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8703 else
8705 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8706 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8707 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8710 emit_label (label2);
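/* Roughly, the strong variant of this split therefore expands to an LL/SC
   loop of the following shape (sketch only; register names are arbitrary
   and the exact load/store-exclusive mnemonics depend on the memory model):

       .L1:  ldxr   rval, [mem]
             cmp    rval, oldval
             b.ne   .L2
             stxr   scratch, newval, [mem]
             cbnz   scratch, .L1
       .L2:

   The weak variant omits the backward branch and instead compares SCRATCH
   with zero so the caller can observe whether the store-exclusive
   succeeded.  */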
8713 /* Split an atomic operation. */
8715 void
8716 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8717 rtx value, rtx model_rtx, rtx cond)
8719 enum machine_mode mode = GET_MODE (mem);
8720 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8721 rtx_code_label *label;
8722 rtx x;
8724 label = gen_label_rtx ();
8725 emit_label (label);
8727 if (new_out)
8728 new_out = gen_lowpart (wmode, new_out);
8729 if (old_out)
8730 old_out = gen_lowpart (wmode, old_out);
8731 else
8732 old_out = new_out;
8733 value = simplify_gen_subreg (wmode, value, mode, 0);
8735 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8737 switch (code)
8739 case SET:
8740 new_out = value;
8741 break;
8743 case NOT:
8744 x = gen_rtx_AND (wmode, old_out, value);
8745 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8746 x = gen_rtx_NOT (wmode, new_out);
8747 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8748 break;
8750 case MINUS:
8751 if (CONST_INT_P (value))
8753 value = GEN_INT (-INTVAL (value));
8754 code = PLUS;
8756 /* Fall through. */
8758 default:
8759 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8760 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8761 break;
8764 aarch64_emit_store_exclusive (mode, cond, mem,
8765 gen_lowpart (mode, new_out), model_rtx);
8767 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8768 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8769 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8770 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8773 static void
8774 aarch64_print_extension (void)
8776 const struct aarch64_option_extension *opt = NULL;
8778 for (opt = all_extensions; opt->name != NULL; opt++)
8779 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8780 asm_fprintf (asm_out_file, "+%s", opt->name);
8782 asm_fprintf (asm_out_file, "\n");
8785 static void
8786 aarch64_start_file (void)
8788 if (selected_arch)
8790 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8791 aarch64_print_extension ();
8793 else if (selected_cpu)
8795 const char *truncated_name
8796 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8797 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8798 aarch64_print_extension ();
8800 default_file_start ();
8803 /* Target hook for c_mode_for_suffix. */
8804 static enum machine_mode
8805 aarch64_c_mode_for_suffix (char suffix)
8807 if (suffix == 'q')
8808 return TFmode;
8810 return VOIDmode;
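/* For example, with this hook a floating constant written with the
   non-standard 'q' suffix, such as 1.0q, is given TFmode by the C family
   front ends.  */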
8813 /* We can only represent floating point constants which will fit in
8814 "quarter-precision" values. These values are characterised by
8815 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
8818 (-1)^s * (n/16) * 2^r
8820 Where:
8821 's' is the sign bit.
8822 'n' is an integer in the range 16 <= n <= 31.
8823 'r' is an integer in the range -3 <= r <= 4. */
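/* Purely illustrative sketch (hypothetical helper, not used by the port):
   decode an (s, n, r) triple from the description above into a host double,
   to make the encoding concrete.  Only the formula comes from the comment;
   the function name and the host arithmetic are assumptions for the
   example.  */
#if 0
static double
aarch64_quarter_precision_example (int s, int n, int r)
{
  /* value = (-1)^s * (n/16) * 2^r, with 16 <= n <= 31 and -3 <= r <= 4.
     For instance n = 16, r = 0 gives 1.0 and n = 31, r = 4 gives 31.0.  */
  double value = (double) n / 16.0;
  for (; r > 0; r--)
    value *= 2.0;
  for (; r < 0; r++)
    value /= 2.0;
  return s ? -value : value;
}
#endif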
8825 /* Return true iff X can be represented by a quarter-precision
8826 floating point immediate operand. Note, we cannot represent 0.0. */
8827 bool
8828 aarch64_float_const_representable_p (rtx x)
8830 /* This represents our current view of how many bits
8831 make up the mantissa. */
8832 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8833 int exponent;
8834 unsigned HOST_WIDE_INT mantissa, mask;
8835 REAL_VALUE_TYPE r, m;
8836 bool fail;
8838 if (!CONST_DOUBLE_P (x))
8839 return false;
8841 if (GET_MODE (x) == VOIDmode)
8842 return false;
8844 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8846 /* We cannot represent infinities, NaNs or +/-zero. We won't
8847 know if we have +zero until we analyse the mantissa, but we
8848 can reject the other invalid values. */
8849 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8850 || REAL_VALUE_MINUS_ZERO (r))
8851 return false;
8853 /* Extract exponent. */
8854 r = real_value_abs (&r);
8855 exponent = REAL_EXP (&r);
8857 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8858 highest (sign) bit, with a fixed binary point at bit point_pos.
8859 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
8860 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8861 bits for the mantissa, this can fail (low bits will be lost). */
8862 real_ldexp (&m, &r, point_pos - exponent);
8863 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8865 /* If the low part of the mantissa has bits set we cannot represent
8866 the value. */
8867 if (w.elt (0) != 0)
8868 return false;
8869 /* We have rejected the lower HOST_WIDE_INT, so update our
8870 understanding of how many bits lie in the mantissa and
8871 look only at the high HOST_WIDE_INT. */
8872 mantissa = w.elt (1);
8873 point_pos -= HOST_BITS_PER_WIDE_INT;
8875 /* We can only represent values with a mantissa of the form 1.xxxx. */
8876 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8877 if ((mantissa & mask) != 0)
8878 return false;
8880 /* Having filtered unrepresentable values, we may now remove all
8881 but the highest 5 bits. */
8882 mantissa >>= point_pos - 5;
8884 /* We cannot represent the value 0.0, so reject it. This is handled
8885 elsewhere. */
8886 if (mantissa == 0)
8887 return false;
8889 /* Then, as bit 4 is always set, we can mask it off, leaving
8890 the mantissa in the range [0, 15]. */
8891 mantissa &= ~(1 << 4);
8892 gcc_assert (mantissa <= 15);
8894 /* GCC internally does not use IEEE754-like encoding (where normalized
8895 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
8896 Our mantissa values are shifted 4 places to the left relative to
8897 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8898 by 5 places to correct for GCC's representation. */
8899 exponent = 5 - exponent;
8901 return (exponent >= 0 && exponent <= 7);
8904 char*
8905 aarch64_output_simd_mov_immediate (rtx const_vector,
8906 enum machine_mode mode,
8907 unsigned width)
8909 bool is_valid;
8910 static char templ[40];
8911 const char *mnemonic;
8912 const char *shift_op;
8913 unsigned int lane_count = 0;
8914 char element_char;
8916 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8918 /* This will return true if const_vector is legal for use as an AdvSIMD
8919 MOVI instruction (or, implicitly, MVNI) immediate. It will
8920 also update INFO to show how the immediate should be generated. */
8921 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8922 gcc_assert (is_valid);
8924 element_char = sizetochar (info.element_width);
8925 lane_count = width / info.element_width;
8927 mode = GET_MODE_INNER (mode);
8928 if (mode == SFmode || mode == DFmode)
8930 gcc_assert (info.shift == 0 && ! info.mvn);
8931 if (aarch64_float_const_zero_rtx_p (info.value))
8932 info.value = GEN_INT (0);
8933 else
8935 #define buf_size 20
8936 REAL_VALUE_TYPE r;
8937 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8938 char float_buf[buf_size] = {'\0'};
8939 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8940 #undef buf_size
8942 if (lane_count == 1)
8943 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8944 else
8945 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8946 lane_count, element_char, float_buf);
8947 return templ;
8951 mnemonic = info.mvn ? "mvni" : "movi";
8952 shift_op = info.msl ? "msl" : "lsl";
8954 if (lane_count == 1)
8955 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8956 mnemonic, UINTVAL (info.value));
8957 else if (info.shift)
8958 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8959 ", %s %d", mnemonic, lane_count, element_char,
8960 UINTVAL (info.value), shift_op, info.shift);
8961 else
8962 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8963 mnemonic, lane_count, element_char, UINTVAL (info.value));
8964 return templ;
8967 char*
8968 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8969 enum machine_mode mode)
8971 enum machine_mode vmode;
8973 gcc_assert (!VECTOR_MODE_P (mode));
8974 vmode = aarch64_simd_container_mode (mode, 64);
8975 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8976 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8979 /* Split operands into moves from op[1] + op[2] into op[0]. */
8981 void
8982 aarch64_split_combinev16qi (rtx operands[3])
8984 unsigned int dest = REGNO (operands[0]);
8985 unsigned int src1 = REGNO (operands[1]);
8986 unsigned int src2 = REGNO (operands[2]);
8987 enum machine_mode halfmode = GET_MODE (operands[1]);
8988 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8989 rtx destlo, desthi;
8991 gcc_assert (halfmode == V16QImode);
8993 if (src1 == dest && src2 == dest + halfregs)
8995 /* No-op move. Can't split to nothing; emit something. */
8996 emit_note (NOTE_INSN_DELETED);
8997 return;
9000 /* Preserve register attributes for variable tracking. */
9001 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9002 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9003 GET_MODE_SIZE (halfmode));
9005 /* Special case of reversed high/low parts. */
9006 if (reg_overlap_mentioned_p (operands[2], destlo)
9007 && reg_overlap_mentioned_p (operands[1], desthi))
9009 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9010 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9011 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9013 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9015 /* Try to avoid unnecessary moves if part of the result
9016 is in the right place already. */
9017 if (src1 != dest)
9018 emit_move_insn (destlo, operands[1]);
9019 if (src2 != dest + halfregs)
9020 emit_move_insn (desthi, operands[2]);
9022 else
9024 if (src2 != dest + halfregs)
9025 emit_move_insn (desthi, operands[2]);
9026 if (src1 != dest)
9027 emit_move_insn (destlo, operands[1]);
9031 /* vec_perm support. */
9033 #define MAX_VECT_LEN 16
9035 struct expand_vec_perm_d
9037 rtx target, op0, op1;
9038 unsigned char perm[MAX_VECT_LEN];
9039 enum machine_mode vmode;
9040 unsigned char nelt;
9041 bool one_vector_p;
9042 bool testing_p;
9045 /* Generate a variable permutation. */
9047 static void
9048 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9050 enum machine_mode vmode = GET_MODE (target);
9051 bool one_vector_p = rtx_equal_p (op0, op1);
9053 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9054 gcc_checking_assert (GET_MODE (op0) == vmode);
9055 gcc_checking_assert (GET_MODE (op1) == vmode);
9056 gcc_checking_assert (GET_MODE (sel) == vmode);
9057 gcc_checking_assert (TARGET_SIMD);
9059 if (one_vector_p)
9061 if (vmode == V8QImode)
9063 /* Expand the argument to a V16QI mode by duplicating it. */
9064 rtx pair = gen_reg_rtx (V16QImode);
9065 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9066 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9068 else
9070 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9073 else
9075 rtx pair;
9077 if (vmode == V8QImode)
9079 pair = gen_reg_rtx (V16QImode);
9080 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9081 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9083 else
9085 pair = gen_reg_rtx (OImode);
9086 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9087 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9092 void
9093 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9095 enum machine_mode vmode = GET_MODE (target);
9096 unsigned int nelt = GET_MODE_NUNITS (vmode);
9097 bool one_vector_p = rtx_equal_p (op0, op1);
9098 rtx mask;
9100 /* The TBL instruction does not use a modulo index, so we must take care
9101 of that ourselves. */
9102 mask = aarch64_simd_gen_const_vector_dup (vmode,
9103 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9104 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9106 /* For big-endian, we also need to reverse the index within the vector
9107 (but not which vector). */
9108 if (BYTES_BIG_ENDIAN)
9110 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9111 if (!one_vector_p)
9112 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9113 sel = expand_simple_binop (vmode, XOR, sel, mask,
9114 NULL, 0, OPTAB_LIB_WIDEN);
9116 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9119 /* Recognize patterns suitable for the TRN instructions. */
9120 static bool
9121 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9123 unsigned int i, odd, mask, nelt = d->nelt;
9124 rtx out, in0, in1, x;
9125 rtx (*gen) (rtx, rtx, rtx);
9126 enum machine_mode vmode = d->vmode;
9128 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9129 return false;
9131 /* Note that these are little-endian tests.
9132 We correct for big-endian later. */
9133 if (d->perm[0] == 0)
9134 odd = 0;
9135 else if (d->perm[0] == 1)
9136 odd = 1;
9137 else
9138 return false;
9139 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9141 for (i = 0; i < nelt; i += 2)
9143 if (d->perm[i] != i + odd)
9144 return false;
9145 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9146 return false;
9149 /* Success! */
9150 if (d->testing_p)
9151 return true;
9153 in0 = d->op0;
9154 in1 = d->op1;
9155 if (BYTES_BIG_ENDIAN)
9157 x = in0, in0 = in1, in1 = x;
9158 odd = !odd;
9160 out = d->target;
9162 if (odd)
9164 switch (vmode)
9166 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9167 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9168 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9169 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9170 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9171 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9172 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9173 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9174 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9175 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9176 default:
9177 return false;
9180 else
9182 switch (vmode)
9184 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9185 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9186 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9187 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9188 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9189 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9190 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9191 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9192 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9193 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9194 default:
9195 return false;
9199 emit_insn (gen (out, in0, in1));
9200 return true;
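/* Worked example: on V4SImode with two input vectors, the permutation
   { 0, 4, 2, 6 } is matched as TRN1 and { 1, 5, 3, 7 } as TRN2 (using the
   little-endian numbering checked above, before the big-endian operand
   swap).  */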
9203 /* Recognize patterns suitable for the UZP instructions. */
9204 static bool
9205 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9207 unsigned int i, odd, mask, nelt = d->nelt;
9208 rtx out, in0, in1, x;
9209 rtx (*gen) (rtx, rtx, rtx);
9210 enum machine_mode vmode = d->vmode;
9212 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9213 return false;
9215 /* Note that these are little-endian tests.
9216 We correct for big-endian later. */
9217 if (d->perm[0] == 0)
9218 odd = 0;
9219 else if (d->perm[0] == 1)
9220 odd = 1;
9221 else
9222 return false;
9223 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9225 for (i = 0; i < nelt; i++)
9227 unsigned elt = (i * 2 + odd) & mask;
9228 if (d->perm[i] != elt)
9229 return false;
9232 /* Success! */
9233 if (d->testing_p)
9234 return true;
9236 in0 = d->op0;
9237 in1 = d->op1;
9238 if (BYTES_BIG_ENDIAN)
9240 x = in0, in0 = in1, in1 = x;
9241 odd = !odd;
9243 out = d->target;
9245 if (odd)
9247 switch (vmode)
9249 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9250 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9251 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9252 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9253 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9254 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9255 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9256 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9257 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9258 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9259 default:
9260 return false;
9263 else
9265 switch (vmode)
9267 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9268 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9269 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9270 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9271 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9272 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9273 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9274 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9275 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9276 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9277 default:
9278 return false;
9282 emit_insn (gen (out, in0, in1));
9283 return true;
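/* Worked example: on V4SImode with two input vectors, { 0, 2, 4, 6 } is
   matched as UZP1 and { 1, 3, 5, 7 } as UZP2.  */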
9286 /* Recognize patterns suitable for the ZIP instructions. */
9287 static bool
9288 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9290 unsigned int i, high, mask, nelt = d->nelt;
9291 rtx out, in0, in1, x;
9292 rtx (*gen) (rtx, rtx, rtx);
9293 enum machine_mode vmode = d->vmode;
9295 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9296 return false;
9298 /* Note that these are little-endian tests.
9299 We correct for big-endian later. */
9300 high = nelt / 2;
9301 if (d->perm[0] == high)
9302 /* Do Nothing. */
9304 else if (d->perm[0] == 0)
9305 high = 0;
9306 else
9307 return false;
9308 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9310 for (i = 0; i < nelt / 2; i++)
9312 unsigned elt = (i + high) & mask;
9313 if (d->perm[i * 2] != elt)
9314 return false;
9315 elt = (elt + nelt) & mask;
9316 if (d->perm[i * 2 + 1] != elt)
9317 return false;
9320 /* Success! */
9321 if (d->testing_p)
9322 return true;
9324 in0 = d->op0;
9325 in1 = d->op1;
9326 if (BYTES_BIG_ENDIAN)
9328 x = in0, in0 = in1, in1 = x;
9329 high = !high;
9331 out = d->target;
9333 if (high)
9335 switch (vmode)
9337 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9338 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9339 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9340 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9341 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9342 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9343 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9344 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9345 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9346 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9347 default:
9348 return false;
9351 else
9353 switch (vmode)
9355 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9356 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9357 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9358 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9359 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9360 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9361 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9362 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9363 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9364 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9365 default:
9366 return false;
9370 emit_insn (gen (out, in0, in1));
9371 return true;
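/* Worked example: on V4SImode with two input vectors, { 0, 4, 1, 5 } is
   matched as ZIP1 and { 2, 6, 3, 7 } as ZIP2.  */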
9374 /* Recognize patterns for the EXT insn. */
9376 static bool
9377 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9379 unsigned int i, nelt = d->nelt;
9380 rtx (*gen) (rtx, rtx, rtx, rtx);
9381 rtx offset;
9383 unsigned int location = d->perm[0]; /* Always < nelt. */
9385 /* Check if the extracted indices are increasing by one. */
9386 for (i = 1; i < nelt; i++)
9388 unsigned int required = location + i;
9389 if (d->one_vector_p)
9391 /* We'll pass the same vector in twice, so allow indices to wrap. */
9392 required &= (nelt - 1);
9394 if (d->perm[i] != required)
9395 return false;
9398 switch (d->vmode)
9400 case V16QImode: gen = gen_aarch64_extv16qi; break;
9401 case V8QImode: gen = gen_aarch64_extv8qi; break;
9402 case V4HImode: gen = gen_aarch64_extv4hi; break;
9403 case V8HImode: gen = gen_aarch64_extv8hi; break;
9404 case V2SImode: gen = gen_aarch64_extv2si; break;
9405 case V4SImode: gen = gen_aarch64_extv4si; break;
9406 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9407 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9408 case V2DImode: gen = gen_aarch64_extv2di; break;
9409 case V2DFmode: gen = gen_aarch64_extv2df; break;
9410 default:
9411 return false;
9414 /* Success! */
9415 if (d->testing_p)
9416 return true;
9418 /* The case where (location == 0) is a no-op for both big- and little-endian,
9419 and is removed by the mid-end at optimization levels -O1 and higher. */
9421 if (BYTES_BIG_ENDIAN && (location != 0))
9423 /* After setup, we want the high elements of the first vector (stored
9424 at the LSB end of the register), and the low elements of the second
9425 vector (stored at the MSB end of the register). So swap. */
9426 rtx temp = d->op0;
9427 d->op0 = d->op1;
9428 d->op1 = temp;
9429 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9430 location = nelt - location;
9433 offset = GEN_INT (location);
9434 emit_insn (gen (d->target, d->op0, d->op1, offset));
9435 return true;
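/* Worked example: on V4SImode, the two-operand permutation { 1, 2, 3, 4 }
   is matched with location 1, and the one-operand permutation { 3, 0, 1, 2 }
   is matched with location 3 thanks to the index wrap-around above.  */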
9438 /* Recognize patterns for the REV insns. */
9440 static bool
9441 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9443 unsigned int i, j, diff, nelt = d->nelt;
9444 rtx (*gen) (rtx, rtx);
9446 if (!d->one_vector_p)
9447 return false;
9449 diff = d->perm[0];
9450 switch (diff)
9452 case 7:
9453 switch (d->vmode)
9455 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9456 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9457 default:
9458 return false;
9460 break;
9461 case 3:
9462 switch (d->vmode)
9464 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9465 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9466 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9467 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9468 default:
9469 return false;
9471 break;
9472 case 1:
9473 switch (d->vmode)
9475 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9476 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9477 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9478 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9479 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9480 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9481 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9482 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9483 default:
9484 return false;
9486 break;
9487 default:
9488 return false;
9491 for (i = 0; i < nelt ; i += diff + 1)
9492 for (j = 0; j <= diff; j += 1)
9494 /* This is guaranteed to be true as the value of diff
9495 is 7, 3 or 1, and we should have enough elements in the
9496 queue to generate this. Getting a vector mask with a
9497 value of diff other than these values implies that
9498 something is wrong by the time we get here. */
9499 gcc_assert (i + j < nelt);
9500 if (d->perm[i + j] != i + diff - j)
9501 return false;
9504 /* Success! */
9505 if (d->testing_p)
9506 return true;
9508 emit_insn (gen (d->target, d->op0));
9509 return true;
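/* Worked example: on V8QImode, { 7, 6, 5, 4, 3, 2, 1, 0 } has diff 7 and
   maps to REV64, while { 1, 0, 3, 2, 5, 4, 7, 6 } has diff 1 and maps to
   REV16.  */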
9512 static bool
9513 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9515 rtx (*gen) (rtx, rtx, rtx);
9516 rtx out = d->target;
9517 rtx in0;
9518 enum machine_mode vmode = d->vmode;
9519 unsigned int i, elt, nelt = d->nelt;
9520 rtx lane;
9522 elt = d->perm[0];
9523 for (i = 1; i < nelt; i++)
9525 if (elt != d->perm[i])
9526 return false;
9529 /* The generic preparation in aarch64_expand_vec_perm_const_1
9530 swaps the operand order and the permute indices if it finds
9531 d->perm[0] to be in the second operand. Thus, we can always
9532 use d->op0 and need not do any extra arithmetic to get the
9533 correct lane number. */
9534 in0 = d->op0;
9535 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9537 switch (vmode)
9539 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9540 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9541 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9542 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9543 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9544 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9545 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9546 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9547 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9548 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9549 default:
9550 return false;
9553 emit_insn (gen (out, in0, lane));
9554 return true;
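/* Worked example: on V4SImode, { 2, 2, 2, 2 } is matched as a DUP of
   lane 2 of the first operand.  */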
9557 static bool
9558 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9560 rtx rperm[MAX_VECT_LEN], sel;
9561 enum machine_mode vmode = d->vmode;
9562 unsigned int i, nelt = d->nelt;
9564 if (d->testing_p)
9565 return true;
9567 /* Generic code will try constant permutation twice. Once with the
9568 original mode and again with the elements lowered to QImode.
9569 So wait and don't do the selector expansion ourselves. */
9570 if (vmode != V8QImode && vmode != V16QImode)
9571 return false;
9573 for (i = 0; i < nelt; ++i)
9575 int nunits = GET_MODE_NUNITS (vmode);
9577 /* If big-endian and two vectors, we end up with a weird mixed-endian
9578 mode on NEON. Reverse the index within each word but not the word
9579 itself. */
9580 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9581 : d->perm[i]);
9583 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9584 sel = force_reg (vmode, sel);
9586 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9587 return true;
9590 static bool
9591 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9593 /* The pattern matching functions above are written to look for a small
9594 number to begin the sequence (0, 1, N/2). If we begin with an index
9595 from the second operand, we can swap the operands. */
9596 if (d->perm[0] >= d->nelt)
9598 unsigned i, nelt = d->nelt;
9599 rtx x;
9601 gcc_assert (nelt == (nelt & -nelt));
9602 for (i = 0; i < nelt; ++i)
9603 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9605 x = d->op0;
9606 d->op0 = d->op1;
9607 d->op1 = x;
9610 if (TARGET_SIMD)
9612 if (aarch64_evpc_rev (d))
9613 return true;
9614 else if (aarch64_evpc_ext (d))
9615 return true;
9616 else if (aarch64_evpc_dup (d))
9617 return true;
9618 else if (aarch64_evpc_zip (d))
9619 return true;
9620 else if (aarch64_evpc_uzp (d))
9621 return true;
9622 else if (aarch64_evpc_trn (d))
9623 return true;
9624 return aarch64_evpc_tbl (d);
9626 return false;
9629 /* Expand a vec_perm_const pattern. */
9631 bool
9632 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9634 struct expand_vec_perm_d d;
9635 int i, nelt, which;
9637 d.target = target;
9638 d.op0 = op0;
9639 d.op1 = op1;
9641 d.vmode = GET_MODE (target);
9642 gcc_assert (VECTOR_MODE_P (d.vmode));
9643 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9644 d.testing_p = false;
9646 for (i = which = 0; i < nelt; ++i)
9648 rtx e = XVECEXP (sel, 0, i);
9649 int ei = INTVAL (e) & (2 * nelt - 1);
9650 which |= (ei < nelt ? 1 : 2);
9651 d.perm[i] = ei;
9654 switch (which)
9656 default:
9657 gcc_unreachable ();
9659 case 3:
9660 d.one_vector_p = false;
9661 if (!rtx_equal_p (op0, op1))
9662 break;
9664 /* The elements of PERM do not suggest that only the first operand
9665 is used, but both operands are identical. Allow easier matching
9666 of the permutation by folding the permutation into the single
9667 input vector. */
9668 /* Fall through. */
9669 case 2:
9670 for (i = 0; i < nelt; ++i)
9671 d.perm[i] &= nelt - 1;
9672 d.op0 = op1;
9673 d.one_vector_p = true;
9674 break;
9676 case 1:
9677 d.op1 = op0;
9678 d.one_vector_p = true;
9679 break;
9682 return aarch64_expand_vec_perm_const_1 (&d);
9685 static bool
9686 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9687 const unsigned char *sel)
9689 struct expand_vec_perm_d d;
9690 unsigned int i, nelt, which;
9691 bool ret;
9693 d.vmode = vmode;
9694 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9695 d.testing_p = true;
9696 memcpy (d.perm, sel, nelt);
9698 /* Calculate whether all elements are in one vector. */
9699 for (i = which = 0; i < nelt; ++i)
9701 unsigned char e = d.perm[i];
9702 gcc_assert (e < 2 * nelt);
9703 which |= (e < nelt ? 1 : 2);
9706 /* If all elements are from the second vector, reindex as if from the
9707 first vector. */
9708 if (which == 2)
9709 for (i = 0; i < nelt; ++i)
9710 d.perm[i] -= nelt;
9712 /* Check whether the mask can be applied to a single vector. */
9713 d.one_vector_p = (which != 3);
9715 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9716 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9717 if (!d.one_vector_p)
9718 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9720 start_sequence ();
9721 ret = aarch64_expand_vec_perm_const_1 (&d);
9722 end_sequence ();
9724 return ret;
9727 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9728 bool
9729 aarch64_cannot_change_mode_class (enum machine_mode from,
9730 enum machine_mode to,
9731 enum reg_class rclass)
9733 /* Full-reg subregs are allowed on general regs or any class if they are
9734 the same size. */
9735 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9736 || !reg_classes_intersect_p (FP_REGS, rclass))
9737 return false;
9739 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9740 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9741 2. Scalar to Scalar for integer modes or same size float modes.
9742 3. Vector to Vector modes.
9743 4. On little-endian only, Vector-Structure to Vector modes. */
9744 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9746 if (aarch64_vector_mode_supported_p (from)
9747 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9748 return false;
9750 if (GET_MODE_NUNITS (from) == 1
9751 && GET_MODE_NUNITS (to) == 1
9752 && (GET_MODE_CLASS (from) == MODE_INT
9753 || from == to))
9754 return false;
9756 if (aarch64_vector_mode_supported_p (from)
9757 && aarch64_vector_mode_supported_p (to))
9758 return false;
9760 /* Within a vector structure straddling multiple vector registers
9761 we are in a mixed-endian representation. As such, we can't
9762 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9763 switch between vectors and vector structures cheaply. */
9764 if (!BYTES_BIG_ENDIAN)
9765 if ((aarch64_vector_mode_supported_p (from)
9766 && aarch64_vect_struct_mode_p (to))
9767 || (aarch64_vector_mode_supported_p (to)
9768 && aarch64_vect_struct_mode_p (from)))
9769 return false;
9772 return true;
9775 /* Implement MODES_TIEABLE_P. */
9777 bool
9778 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9780 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9781 return true;
9783 /* We specifically want to allow elements of "structure" modes to
9784 be tieable to the structure. This more general condition allows
9785 other rarer situations too. */
9786 if (TARGET_SIMD
9787 && aarch64_vector_mode_p (mode1)
9788 && aarch64_vector_mode_p (mode2))
9789 return true;
9791 return false;
9794 /* Return a new RTX holding the result of moving POINTER forward by
9795 AMOUNT bytes. */
9797 static rtx
9798 aarch64_move_pointer (rtx pointer, int amount)
9800 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9802 return adjust_automodify_address (pointer, GET_MODE (pointer),
9803 next, amount);
9806 /* Return a new RTX holding the result of moving POINTER forward by the
9807 size of the mode it points to. */
9809 static rtx
9810 aarch64_progress_pointer (rtx pointer)
9812 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9814 return aarch64_move_pointer (pointer, amount);
9817 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
9818 the size of MODE. */
9820 static void
9821 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9822 enum machine_mode mode)
9824 rtx reg = gen_reg_rtx (mode);
9826 /* "Cast" the pointers to the correct mode. */
9827 *src = adjust_address (*src, mode, 0);
9828 *dst = adjust_address (*dst, mode, 0);
9829 /* Emit the memcpy. */
9830 emit_move_insn (reg, *src);
9831 emit_move_insn (*dst, reg);
9832 /* Move the pointers forward. */
9833 *src = aarch64_progress_pointer (*src);
9834 *dst = aarch64_progress_pointer (*dst);
9837 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9838 we succeed, otherwise return false. */
9840 bool
9841 aarch64_expand_movmem (rtx *operands)
9843 unsigned int n;
9844 rtx dst = operands[0];
9845 rtx src = operands[1];
9846 rtx base;
9847 bool speed_p = !optimize_function_for_size_p (cfun);
9849 /* When optimizing for size, give a better estimate of the length of a
9850 memcpy call, but use the default otherwise. */
9851 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9853 /* We can't do anything smart if the amount to copy is not constant. */
9854 if (!CONST_INT_P (operands[2]))
9855 return false;
9857 n = UINTVAL (operands[2]);
9859 /* Try to keep the number of instructions low. For cases below 16 bytes we
9860 need to make at most two moves. For cases above 16 bytes it will be one
9861 move for each 16 byte chunk, then at most two additional moves. */
9862 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9863 return false;
9865 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9866 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9868 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9869 src = adjust_automodify_address (src, VOIDmode, base, 0);
9871 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
9872 1-byte chunk. */
9873 if (n < 4)
9875 if (n >= 2)
9877 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9878 n -= 2;
9881 if (n == 1)
9882 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9884 return true;
9887 /* Copy 4-7 bytes. First a 4-byte chunk, then (if applicable) a second
9888 4-byte chunk, partially overlapping with the previously copied chunk. */
9889 if (n < 8)
9891 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9892 n -= 4;
9893 if (n > 0)
9895 int move = n - 4;
9897 src = aarch64_move_pointer (src, move);
9898 dst = aarch64_move_pointer (dst, move);
9899 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9901 return true;
9904 /* Copy 8 bytes or more. Copy chunks of 16 bytes until we run out of
9905 them, then (if applicable) an 8-byte chunk. */
9906 while (n >= 8)
9908 if (n / 16)
9910 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9911 n -= 16;
9913 else
9915 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9916 n -= 8;
9920 /* Finish the final bytes of the copy. We can always do this in one
9921 instruction. We either copy the exact amount we need, or partially
9922 overlap with the previous chunk we copied and copy 4 or 8 bytes. */
9923 if (n == 0)
9924 return true;
9925 else if (n == 1)
9926 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9927 else if (n == 2)
9928 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9929 else if (n == 4)
9930 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9931 else
9933 if (n == 3)
9935 src = aarch64_move_pointer (src, -1);
9936 dst = aarch64_move_pointer (dst, -1);
9937 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9939 else
9941 int move = n - 8;
9943 src = aarch64_move_pointer (src, move);
9944 dst = aarch64_move_pointer (dst, move);
9945 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9949 return true;
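/* Illustrative sketch (hypothetical helper, not used by the compiler):
   mirror the chunking decisions made by aarch64_expand_movmem above for a
   constant byte count N, printing the block sizes that would be copied.
   The helper name and the use of printf are assumptions for the example;
   the control flow restates the expansion above.  */
#if 0
static void
aarch64_movmem_chunks_example (unsigned int n)
{
  if (n < 4)
    {
      /* 0-3 bytes: an optional 2-byte chunk, then an optional 1-byte one.  */
      if (n >= 2)
        printf ("2 ");
      if (n & 1)
        printf ("1 ");
      return;
    }
  if (n < 8)
    {
      /* 4-7 bytes: one 4-byte chunk, plus an overlapping one if needed.  */
      printf (n > 4 ? "4 4(overlapping) " : "4 ");
      return;
    }
  /* 8 bytes or more: 16-byte chunks while possible, otherwise 8 bytes.  */
  while (n >= 8)
    {
      if (n >= 16)
        {
          printf ("16 ");
          n -= 16;
        }
      else
        {
          printf ("8 ");
          n -= 8;
        }
    }
  /* The tail is copied exactly, or with a single overlapping chunk.  */
  if (n == 1 || n == 2 || n == 4)
    printf ("%u ", n);
  else if (n == 3)
    printf ("4(overlapping) ");
  else if (n != 0)
    printf ("8(overlapping) ");
}
#endif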
9952 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
9954 static unsigned HOST_WIDE_INT
9955 aarch64_asan_shadow_offset (void)
9957 return (HOST_WIDE_INT_1 << 36);
9960 #undef TARGET_ADDRESS_COST
9961 #define TARGET_ADDRESS_COST aarch64_address_cost
9963 /* This hook determines whether unnamed bitfields affect the alignment
9964 of the containing structure. The hook returns true if the structure
9965 should inherit the alignment requirements of an unnamed bitfield's
9966 type. */
9967 #undef TARGET_ALIGN_ANON_BITFIELD
9968 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9970 #undef TARGET_ASM_ALIGNED_DI_OP
9971 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9973 #undef TARGET_ASM_ALIGNED_HI_OP
9974 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9976 #undef TARGET_ASM_ALIGNED_SI_OP
9977 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9979 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9980 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9981 hook_bool_const_tree_hwi_hwi_const_tree_true
9983 #undef TARGET_ASM_FILE_START
9984 #define TARGET_ASM_FILE_START aarch64_start_file
9986 #undef TARGET_ASM_OUTPUT_MI_THUNK
9987 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9989 #undef TARGET_ASM_SELECT_RTX_SECTION
9990 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9992 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9993 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9995 #undef TARGET_BUILD_BUILTIN_VA_LIST
9996 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9998 #undef TARGET_CALLEE_COPIES
9999 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
10001 #undef TARGET_CAN_ELIMINATE
10002 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
10004 #undef TARGET_CANNOT_FORCE_CONST_MEM
10005 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
10007 #undef TARGET_CONDITIONAL_REGISTER_USAGE
10008 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
10010 /* Only the least significant bit is used for initialization guard
10011 variables. */
10012 #undef TARGET_CXX_GUARD_MASK_BIT
10013 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
10015 #undef TARGET_C_MODE_FOR_SUFFIX
10016 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
10018 #ifdef TARGET_BIG_ENDIAN_DEFAULT
10019 #undef TARGET_DEFAULT_TARGET_FLAGS
10020 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
10021 #endif
10023 #undef TARGET_CLASS_MAX_NREGS
10024 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
10026 #undef TARGET_BUILTIN_DECL
10027 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
10029 #undef TARGET_EXPAND_BUILTIN
10030 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
10032 #undef TARGET_EXPAND_BUILTIN_VA_START
10033 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
10035 #undef TARGET_FOLD_BUILTIN
10036 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
10038 #undef TARGET_FUNCTION_ARG
10039 #define TARGET_FUNCTION_ARG aarch64_function_arg
10041 #undef TARGET_FUNCTION_ARG_ADVANCE
10042 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
10044 #undef TARGET_FUNCTION_ARG_BOUNDARY
10045 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
10047 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
10048 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
10050 #undef TARGET_FUNCTION_VALUE
10051 #define TARGET_FUNCTION_VALUE aarch64_function_value
10053 #undef TARGET_FUNCTION_VALUE_REGNO_P
10054 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
10056 #undef TARGET_FRAME_POINTER_REQUIRED
10057 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
10059 #undef TARGET_GIMPLE_FOLD_BUILTIN
10060 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
10062 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
10063 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
10065 #undef TARGET_INIT_BUILTINS
10066 #define TARGET_INIT_BUILTINS aarch64_init_builtins
10068 #undef TARGET_LEGITIMATE_ADDRESS_P
10069 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
10071 #undef TARGET_LEGITIMATE_CONSTANT_P
10072 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
10074 #undef TARGET_LIBGCC_CMP_RETURN_MODE
10075 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
10077 #undef TARGET_LRA_P
10078 #define TARGET_LRA_P aarch64_lra_p
10080 #undef TARGET_MANGLE_TYPE
10081 #define TARGET_MANGLE_TYPE aarch64_mangle_type
10083 #undef TARGET_MEMORY_MOVE_COST
10084 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
10086 #undef TARGET_MUST_PASS_IN_STACK
10087 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
10089 /* This target hook should return true if accesses to volatile bitfields
10090 should use the narrowest mode possible. It should return false if these
10091 accesses should use the bitfield container type. */
10092 #undef TARGET_NARROW_VOLATILE_BITFIELD
10093 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
10095 #undef TARGET_OPTION_OVERRIDE
10096 #define TARGET_OPTION_OVERRIDE aarch64_override_options
10098 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
10099 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
10100 aarch64_override_options_after_change
10102 #undef TARGET_PASS_BY_REFERENCE
10103 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
10105 #undef TARGET_PREFERRED_RELOAD_CLASS
10106 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
10108 #undef TARGET_SECONDARY_RELOAD
10109 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
10111 #undef TARGET_SHIFT_TRUNCATION_MASK
10112 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
10114 #undef TARGET_SETUP_INCOMING_VARARGS
10115 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
10117 #undef TARGET_STRUCT_VALUE_RTX
10118 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
10120 #undef TARGET_REGISTER_MOVE_COST
10121 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
10123 #undef TARGET_RETURN_IN_MEMORY
10124 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
10126 #undef TARGET_RETURN_IN_MSB
10127 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
10129 #undef TARGET_RTX_COSTS
10130 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
10132 #undef TARGET_SCHED_ISSUE_RATE
10133 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
10135 #undef TARGET_TRAMPOLINE_INIT
10136 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
10138 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
10139 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
10141 #undef TARGET_VECTOR_MODE_SUPPORTED_P
10142 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
10144 #undef TARGET_ARRAY_MODE_SUPPORTED_P
10145 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
10147 #undef TARGET_VECTORIZE_ADD_STMT_COST
10148 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
10150 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
10151 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
10152 aarch64_builtin_vectorization_cost
10154 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
10155 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
10157 #undef TARGET_VECTORIZE_BUILTINS
10158 #define TARGET_VECTORIZE_BUILTINS
10160 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
10161 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
10162 aarch64_builtin_vectorized_function
10164 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
10165 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
10166 aarch64_autovectorize_vector_sizes
10168 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
10169 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
10170 aarch64_atomic_assign_expand_fenv
10172 /* Section anchor support. */
10174 #undef TARGET_MIN_ANCHOR_OFFSET
10175 #define TARGET_MIN_ANCHOR_OFFSET -256
10177 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
10178 byte offset; we can do much more for larger data types, but have no way
10179 to determine the size of the access. We assume accesses are aligned. */
10180 #undef TARGET_MAX_ANCHOR_OFFSET
10181 #define TARGET_MAX_ANCHOR_OFFSET 4095
10183 #undef TARGET_VECTOR_ALIGNMENT
10184 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
10186 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
10187 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
10188 aarch64_simd_vector_alignment_reachable
10190 /* vec_perm support. */
10192 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
10193 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
10194 aarch64_vectorize_vec_perm_const_ok
10197 #undef TARGET_FIXED_CONDITION_CODE_REGS
10198 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
10200 #undef TARGET_FLAGS_REGNUM
10201 #define TARGET_FLAGS_REGNUM CC_REGNUM
10203 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
10204 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
10206 #undef TARGET_ASAN_SHADOW_OFFSET
10207 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
10209 #undef TARGET_LEGITIMIZE_ADDRESS
10210 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
10212 struct gcc_target targetm = TARGET_INITIALIZER;
10214 #include "gt-aarch64.h"