[Patch 6/7 AArch64] Deprecate *_BY_PIECES_P, move to hookized version
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 9aeac7c2f052a78a05d9cc2882b45a7809673eac
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "dominance.h"
35 #include "cfg.h"
36 #include "cfgrtl.h"
37 #include "cfganal.h"
38 #include "lcm.h"
39 #include "cfgbuild.h"
40 #include "cfgcleanup.h"
41 #include "predict.h"
42 #include "basic-block.h"
43 #include "df.h"
44 #include "hard-reg-set.h"
45 #include "output.h"
46 #include "expr.h"
47 #include "reload.h"
48 #include "toplev.h"
49 #include "target.h"
50 #include "target-def.h"
51 #include "targhooks.h"
52 #include "ggc.h"
53 #include "hashtab.h"
54 #include "hash-set.h"
55 #include "vec.h"
56 #include "machmode.h"
57 #include "input.h"
58 #include "function.h"
59 #include "tm_p.h"
60 #include "recog.h"
61 #include "langhooks.h"
62 #include "diagnostic-core.h"
63 #include "hash-table.h"
64 #include "tree-ssa-alias.h"
65 #include "internal-fn.h"
66 #include "gimple-fold.h"
67 #include "tree-eh.h"
68 #include "gimple-expr.h"
69 #include "is-a.h"
70 #include "gimple.h"
71 #include "gimplify.h"
72 #include "optabs.h"
73 #include "dwarf2.h"
74 #include "cfgloop.h"
75 #include "tree-vectorizer.h"
76 #include "aarch64-cost-tables.h"
77 #include "dumpfile.h"
78 #include "builtins.h"
79 #include "rtl-iter.h"
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84 /* Classifies an address.
86 ADDRESS_REG_IMM
87 A simple base register plus immediate offset.
89 ADDRESS_REG_WB
90 A base register indexed by immediate offset with writeback.
92 ADDRESS_REG_REG
93 A base register indexed by (optionally scaled) register.
95 ADDRESS_REG_UXTW
96 A base register indexed by (optionally scaled) zero-extended register.
98 ADDRESS_REG_SXTW
99 A base register indexed by (optionally scaled) sign-extended register.
101 ADDRESS_LO_SUM
102 A LO_SUM rtx with a base register and "LO12" symbol relocation.
104 ADDRESS_SYMBOLIC:
105 A constant symbolic address, in pc-relative literal pool. */
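/* Editorial illustration (not part of the original comment): the
   classifications above roughly correspond to operand forms such as

     [x1, #16]          ADDRESS_REG_IMM
     [x1, #16]!         ADDRESS_REG_WB   (pre/post-indexed writeback)
     [x1, x2, lsl #3]   ADDRESS_REG_REG
     [x1, w2, uxtw #2]  ADDRESS_REG_UXTW
     [x1, w2, sxtw #2]  ADDRESS_REG_SXTW
     [x1, #:lo12:foo]   ADDRESS_LO_SUM
     a literal-pool ref ADDRESS_SYMBOLIC  */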
107 enum aarch64_address_type {
108 ADDRESS_REG_IMM,
109 ADDRESS_REG_WB,
110 ADDRESS_REG_REG,
111 ADDRESS_REG_UXTW,
112 ADDRESS_REG_SXTW,
113 ADDRESS_LO_SUM,
114 ADDRESS_SYMBOLIC
117 struct aarch64_address_info {
118 enum aarch64_address_type type;
119 rtx base;
120 rtx offset;
121 int shift;
122 enum aarch64_symbol_type symbol_type;
125 struct simd_immediate_info
127 rtx value;
128 int shift;
129 int element_width;
130 bool mvn;
131 bool msl;
134 /* The current code model. */
135 enum aarch64_code_model aarch64_cmodel;
137 #ifdef HAVE_AS_TLS
138 #undef TARGET_HAVE_TLS
139 #define TARGET_HAVE_TLS 1
140 #endif
142 static bool aarch64_lra_p (void);
143 static bool aarch64_composite_type_p (const_tree, machine_mode);
144 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
145 const_tree,
146 machine_mode *, int *,
147 bool *);
148 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
149 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
150 static void aarch64_override_options_after_change (void);
151 static bool aarch64_vector_mode_supported_p (machine_mode);
152 static unsigned bit_count (unsigned HOST_WIDE_INT);
153 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
154 const unsigned char *sel);
155 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* The current tuning set. */
161 const struct tune_params *aarch64_tune_params;
163 /* Mask to specify which instructions we are allowed to generate. */
164 unsigned long aarch64_isa_flags = 0;
166 /* Mask to specify which instruction scheduling options should be used. */
167 unsigned long aarch64_tune_flags = 0;
169 /* Tuning parameters. */
171 #if HAVE_DESIGNATED_INITIALIZERS
172 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
173 #else
174 #define NAMED_PARAM(NAME, VAL) (VAL)
175 #endif
177 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
178 __extension__
179 #endif
181 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
182 __extension__
183 #endif
184 static const struct cpu_addrcost_table generic_addrcost_table =
186 #if HAVE_DESIGNATED_INITIALIZERS
187 .addr_scale_costs =
188 #endif
190 NAMED_PARAM (hi, 0),
191 NAMED_PARAM (si, 0),
192 NAMED_PARAM (di, 0),
193 NAMED_PARAM (ti, 0),
195 NAMED_PARAM (pre_modify, 0),
196 NAMED_PARAM (post_modify, 0),
197 NAMED_PARAM (register_offset, 0),
198 NAMED_PARAM (register_extend, 0),
199 NAMED_PARAM (imm_offset, 0)
202 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
203 __extension__
204 #endif
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
207 #if HAVE_DESIGNATED_INITIALIZERS
208 .addr_scale_costs =
209 #endif
211 NAMED_PARAM (hi, 1),
212 NAMED_PARAM (si, 0),
213 NAMED_PARAM (di, 0),
214 NAMED_PARAM (ti, 1),
216 NAMED_PARAM (pre_modify, 0),
217 NAMED_PARAM (post_modify, 0),
218 NAMED_PARAM (register_offset, 0),
219 NAMED_PARAM (register_extend, 0),
220 NAMED_PARAM (imm_offset, 0),
223 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
224 __extension__
225 #endif
226 static const struct cpu_regmove_cost generic_regmove_cost =
228 NAMED_PARAM (GP2GP, 1),
229 NAMED_PARAM (GP2FP, 2),
230 NAMED_PARAM (FP2GP, 2),
231 NAMED_PARAM (FP2FP, 2)
234 static const struct cpu_regmove_cost cortexa57_regmove_cost =
236 NAMED_PARAM (GP2GP, 1),
237 /* Avoid the use of slow int<->fp moves for spilling by setting
238 their cost higher than memmov_cost. */
239 NAMED_PARAM (GP2FP, 5),
240 NAMED_PARAM (FP2GP, 5),
241 NAMED_PARAM (FP2FP, 2)
244 static const struct cpu_regmove_cost cortexa53_regmove_cost =
246 NAMED_PARAM (GP2GP, 1),
247 /* Avoid the use of slow int<->fp moves for spilling by setting
248 their cost higher than memmov_cost. */
249 NAMED_PARAM (GP2FP, 5),
250 NAMED_PARAM (FP2GP, 5),
251 NAMED_PARAM (FP2FP, 2)
254 static const struct cpu_regmove_cost thunderx_regmove_cost =
256 NAMED_PARAM (GP2GP, 2),
257 NAMED_PARAM (GP2FP, 2),
258 NAMED_PARAM (FP2GP, 6),
259 NAMED_PARAM (FP2FP, 4)
262 /* Generic costs for vector insn classes. */
263 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
264 __extension__
265 #endif
266 static const struct cpu_vector_cost generic_vector_cost =
268 NAMED_PARAM (scalar_stmt_cost, 1),
269 NAMED_PARAM (scalar_load_cost, 1),
270 NAMED_PARAM (scalar_store_cost, 1),
271 NAMED_PARAM (vec_stmt_cost, 1),
272 NAMED_PARAM (vec_to_scalar_cost, 1),
273 NAMED_PARAM (scalar_to_vec_cost, 1),
274 NAMED_PARAM (vec_align_load_cost, 1),
275 NAMED_PARAM (vec_unalign_load_cost, 1),
276 NAMED_PARAM (vec_unalign_store_cost, 1),
277 NAMED_PARAM (vec_store_cost, 1),
278 NAMED_PARAM (cond_taken_branch_cost, 3),
279 NAMED_PARAM (cond_not_taken_branch_cost, 1)
282 /* Generic costs for vector insn classes. */
283 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
284 __extension__
285 #endif
286 static const struct cpu_vector_cost cortexa57_vector_cost =
288 NAMED_PARAM (scalar_stmt_cost, 1),
289 NAMED_PARAM (scalar_load_cost, 4),
290 NAMED_PARAM (scalar_store_cost, 1),
291 NAMED_PARAM (vec_stmt_cost, 3),
292 NAMED_PARAM (vec_to_scalar_cost, 8),
293 NAMED_PARAM (scalar_to_vec_cost, 8),
294 NAMED_PARAM (vec_align_load_cost, 5),
295 NAMED_PARAM (vec_unalign_load_cost, 5),
296 NAMED_PARAM (vec_unalign_store_cost, 1),
297 NAMED_PARAM (vec_store_cost, 1),
298 NAMED_PARAM (cond_taken_branch_cost, 1),
299 NAMED_PARAM (cond_not_taken_branch_cost, 1)
302 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
303 __extension__
304 #endif
305 static const struct tune_params generic_tunings =
307 &cortexa57_extra_costs,
308 &generic_addrcost_table,
309 &generic_regmove_cost,
310 &generic_vector_cost,
311 NAMED_PARAM (memmov_cost, 4),
312 NAMED_PARAM (issue_rate, 2)
315 static const struct tune_params cortexa53_tunings =
317 &cortexa53_extra_costs,
318 &generic_addrcost_table,
319 &cortexa53_regmove_cost,
320 &generic_vector_cost,
321 NAMED_PARAM (memmov_cost, 4),
322 NAMED_PARAM (issue_rate, 2)
325 static const struct tune_params cortexa57_tunings =
327 &cortexa57_extra_costs,
328 &cortexa57_addrcost_table,
329 &cortexa57_regmove_cost,
330 &cortexa57_vector_cost,
331 NAMED_PARAM (memmov_cost, 4),
332 NAMED_PARAM (issue_rate, 3)
335 static const struct tune_params thunderx_tunings =
337 &thunderx_extra_costs,
338 &generic_addrcost_table,
339 &thunderx_regmove_cost,
340 &generic_vector_cost,
341 NAMED_PARAM (memmov_cost, 6),
342 NAMED_PARAM (issue_rate, 2)
345 /* A processor implementing AArch64. */
346 struct processor
348 const char *const name;
349 enum aarch64_processor core;
350 const char *arch;
351 const unsigned long flags;
352 const struct tune_params *const tune;
355 /* Processor cores implementing AArch64. */
356 static const struct processor all_cores[] =
358 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
359 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
360 #include "aarch64-cores.def"
361 #undef AARCH64_CORE
362 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
363 {NULL, aarch64_none, NULL, 0, NULL}
366 /* Architectures implementing AArch64. */
367 static const struct processor all_architectures[] =
369 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
370 {NAME, CORE, #ARCH, FLAGS, NULL},
371 #include "aarch64-arches.def"
372 #undef AARCH64_ARCH
373 {NULL, aarch64_none, NULL, 0, NULL}
376 /* Target specification. These are populated as command-line arguments
377 are processed, or NULL if not specified. */
378 static const struct processor *selected_arch;
379 static const struct processor *selected_cpu;
380 static const struct processor *selected_tune;
382 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
384 /* An ISA extension in the co-processor and main instruction set space. */
385 struct aarch64_option_extension
387 const char *const name;
388 const unsigned long flags_on;
389 const unsigned long flags_off;
392 /* ISA extensions in AArch64. */
393 static const struct aarch64_option_extension all_extensions[] =
395 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
396 {NAME, FLAGS_ON, FLAGS_OFF},
397 #include "aarch64-option-extensions.def"
398 #undef AARCH64_OPT_EXTENSION
399 {NULL, 0, 0}
402 /* Used to track the size of an address when generating a pre/post
403 increment address. */
404 static machine_mode aarch64_memory_reference_mode;
406 /* Used to force GTY into this file. */
407 static GTY(()) int gty_dummy;
409 /* A table of valid AArch64 "bitmask immediate" values for
410 logical instructions. */
412 #define AARCH64_NUM_BITMASKS 5334
413 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
415 typedef enum aarch64_cond_code
417 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
418 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
419 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
421 aarch64_cc;
423 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
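/* Editorial note: this works because the architectural encoding pairs each
   condition with its inverse in adjacent slots, so flipping bit 0 maps
   EQ (0) <-> NE (1), CS (2) <-> CC (3), GE (10) <-> LT (11), and so on.
   A minimal sketch of its use:

     aarch64_cc inv = AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT);
     gcc_assert (inv == AARCH64_LE);  */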
425 /* The condition codes of the processor, and the inverse function. */
426 static const char * const aarch64_condition_codes[] =
428 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
429 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
432 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
433 unsigned
434 aarch64_dbx_register_number (unsigned regno)
436 if (GP_REGNUM_P (regno))
437 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
438 else if (regno == SP_REGNUM)
439 return AARCH64_DWARF_SP;
440 else if (FP_REGNUM_P (regno))
441 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
443 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
444 equivalent DWARF register. */
445 return DWARF_FRAME_REGISTERS;
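/* Illustrative mapping (assuming the usual AArch64 DWARF numbering used
   here): x0-x30 map to DWARF 0-30, the stack pointer to 31, and the vector
   registers v0-v31 to 64-95; e.g. regno x19 -> 19, v1 -> 65.  */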
448 /* Return TRUE if MODE is any of the large INT modes. */
449 static bool
450 aarch64_vect_struct_mode_p (machine_mode mode)
452 return mode == OImode || mode == CImode || mode == XImode;
455 /* Return TRUE if MODE is any of the vector modes. */
456 static bool
457 aarch64_vector_mode_p (machine_mode mode)
459 return aarch64_vector_mode_supported_p (mode)
460 || aarch64_vect_struct_mode_p (mode);
463 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
464 static bool
465 aarch64_array_mode_supported_p (machine_mode mode,
466 unsigned HOST_WIDE_INT nelems)
468 if (TARGET_SIMD
469 && AARCH64_VALID_SIMD_QREG_MODE (mode)
470 && (nelems >= 2 && nelems <= 4))
471 return true;
473 return false;
476 /* Implement HARD_REGNO_NREGS. */
479 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
481 switch (aarch64_regno_regclass (regno))
483 case FP_REGS:
484 case FP_LO_REGS:
485 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
486 default:
487 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
489 gcc_unreachable ();
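/* Worked example (editorial): with 64-bit GP registers and 128-bit vector
   registers, TImode (16 bytes) needs two GP registers but only one FP/SIMD
   register, while OImode (32 bytes) needs two FP/SIMD registers.  */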
492 /* Implement HARD_REGNO_MODE_OK. */
495 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
497 if (GET_MODE_CLASS (mode) == MODE_CC)
498 return regno == CC_REGNUM;
500 if (regno == SP_REGNUM)
501 /* The purpose of comparing with ptr_mode is to support the
502 global register variable associated with the stack pointer
503 register via the syntax of asm ("wsp") in ILP32. */
504 return mode == Pmode || mode == ptr_mode;
506 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
507 return mode == Pmode;
509 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
510 return 1;
512 if (FP_REGNUM_P (regno))
514 if (aarch64_vect_struct_mode_p (mode))
515 return
516 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
517 else
518 return 1;
521 return 0;
524 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
525 machine_mode
526 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
527 machine_mode mode)
529 /* Handle modes that fit within single registers. */
530 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
532 if (GET_MODE_SIZE (mode) >= 4)
533 return mode;
534 else
535 return SImode;
537 /* Fall back to generic for multi-reg and very large modes. */
538 else
539 return choose_hard_reg_mode (regno, nregs, false);
542 /* Return true if calls to DECL should be treated as
543 long-calls (i.e. called via a register). */
544 static bool
545 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
547 return false;
550 /* Return true if calls to symbol-ref SYM should be treated as
551 long-calls (i.e. called via a register). */
552 bool
553 aarch64_is_long_call_p (rtx sym)
555 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
558 /* Return true if the offsets to a zero/sign-extract operation
559 represent an expression that matches an extend operation. The
560 operands represent the parameters from
562 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
563 bool
564 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
565 rtx extract_imm)
567 HOST_WIDE_INT mult_val, extract_val;
569 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
570 return false;
572 mult_val = INTVAL (mult_imm);
573 extract_val = INTVAL (extract_imm);
575 if (extract_val > 8
576 && extract_val < GET_MODE_BITSIZE (mode)
577 && exact_log2 (extract_val & ~7) > 0
578 && (extract_val & 7) <= 4
579 && mult_val == (1 << (extract_val & 7)))
580 return true;
582 return false;
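/* A hedged example of what the check above accepts: for DImode, an
   extract with EXTRACT_IMM == 34 (i.e. 32 + 2) and MULT_IMM == 4 (1 << 2)
   describes a 32-bit value extended and then shifted left by 2, which
   matches an extend-with-shift operand form.  */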
585 /* Emit an insn that's a simple single-set. Both the operands must be
586 known to be valid. */
587 inline static rtx
588 emit_set_insn (rtx x, rtx y)
590 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
593 /* X and Y are two things to compare using CODE. Emit the compare insn and
594 return the rtx for register 0 in the proper mode. */
596 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
598 machine_mode mode = SELECT_CC_MODE (code, x, y);
599 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
601 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
602 return cc_reg;
605 /* Build the SYMBOL_REF for __tls_get_addr. */
607 static GTY(()) rtx tls_get_addr_libfunc;
610 aarch64_tls_get_addr (void)
612 if (!tls_get_addr_libfunc)
613 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
614 return tls_get_addr_libfunc;
617 /* Return the TLS model to use for ADDR. */
619 static enum tls_model
620 tls_symbolic_operand_type (rtx addr)
622 enum tls_model tls_kind = TLS_MODEL_NONE;
623 rtx sym, addend;
625 if (GET_CODE (addr) == CONST)
627 split_const (addr, &sym, &addend);
628 if (GET_CODE (sym) == SYMBOL_REF)
629 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
631 else if (GET_CODE (addr) == SYMBOL_REF)
632 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
634 return tls_kind;
637 /* We'll allow lo_sums in our legitimate addresses
638 so that combine can take care of combining addresses where
639 necessary, but for generation purposes we'll generate the address
640 as:
641 RTL Absolute
642 tmp = hi (symbol_ref); adrp x1, foo
643 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
646 PIC TLS
647 adrp x1, :got:foo adrp tmp, :tlsgd:foo
648 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
649 bl __tls_get_addr
652 Load TLS symbol, depending on TLS mechanism and TLS access model.
654 Global Dynamic - Traditional TLS:
655 adrp tmp, :tlsgd:imm
656 add dest, tmp, #:tlsgd_lo12:imm
657 bl __tls_get_addr
659 Global Dynamic - TLS Descriptors:
660 adrp dest, :tlsdesc:imm
661 ldr tmp, [dest, #:tlsdesc_lo12:imm]
662 add dest, dest, #:tlsdesc_lo12:imm
663 blr tmp
664 mrs tp, tpidr_el0
665 add dest, dest, tp
667 Initial Exec:
668 mrs tp, tpidr_el0
669 adrp tmp, :gottprel:imm
670 ldr dest, [tmp, #:gottprel_lo12:imm]
671 add dest, dest, tp
673 Local Exec:
674 mrs tp, tpidr_el0
675 add t0, tp, #:tprel_hi12:imm
676 add t0, #:tprel_lo12_nc:imm
679 static void
680 aarch64_load_symref_appropriately (rtx dest, rtx imm,
681 enum aarch64_symbol_type type)
683 switch (type)
685 case SYMBOL_SMALL_ABSOLUTE:
687 /* In ILP32, the mode of dest can be either SImode or DImode. */
688 rtx tmp_reg = dest;
689 machine_mode mode = GET_MODE (dest);
691 gcc_assert (mode == Pmode || mode == ptr_mode);
693 if (can_create_pseudo_p ())
694 tmp_reg = gen_reg_rtx (mode);
696 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
697 emit_insn (gen_add_losym (dest, tmp_reg, imm));
698 return;
701 case SYMBOL_TINY_ABSOLUTE:
702 emit_insn (gen_rtx_SET (Pmode, dest, imm));
703 return;
705 case SYMBOL_SMALL_GOT:
707 /* In ILP32, the mode of dest can be either SImode or DImode,
708 while the got entry is always of SImode size. The mode of
709 dest depends on how dest is used: if dest is assigned to a
710 pointer (e.g. in the memory), it has SImode; it may have
711 DImode if dest is dereferenced to access the memory.
712 This is why we have to handle three different ldr_got_small
713 patterns here (two patterns for ILP32). */
714 rtx tmp_reg = dest;
715 machine_mode mode = GET_MODE (dest);
717 if (can_create_pseudo_p ())
718 tmp_reg = gen_reg_rtx (mode);
720 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
721 if (mode == ptr_mode)
723 if (mode == DImode)
724 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
725 else
726 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
728 else
730 gcc_assert (mode == Pmode);
731 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
734 return;
737 case SYMBOL_SMALL_TLSGD:
739 rtx_insn *insns;
740 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
742 start_sequence ();
743 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
744 insns = get_insns ();
745 end_sequence ();
747 RTL_CONST_CALL_P (insns) = 1;
748 emit_libcall_block (insns, dest, result, imm);
749 return;
752 case SYMBOL_SMALL_TLSDESC:
754 machine_mode mode = GET_MODE (dest);
755 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
756 rtx tp;
758 gcc_assert (mode == Pmode || mode == ptr_mode);
760 /* In ILP32, the got entry is always of SImode size. Unlike
761 small GOT, the dest is fixed at reg 0. */
762 if (TARGET_ILP32)
763 emit_insn (gen_tlsdesc_small_si (imm));
764 else
765 emit_insn (gen_tlsdesc_small_di (imm));
766 tp = aarch64_load_tp (NULL);
768 if (mode != Pmode)
769 tp = gen_lowpart (mode, tp);
771 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
772 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
773 return;
776 case SYMBOL_SMALL_GOTTPREL:
778 /* In ILP32, the mode of dest can be either SImode or DImode,
779 while the got entry is always of SImode size. The mode of
780 dest depends on how dest is used: if dest is assigned to a
781 pointer (e.g. in the memory), it has SImode; it may have
782 DImode if dest is dereferenced to access the memory.
783 This is why we have to handle three different tlsie_small
784 patterns here (two patterns for ILP32). */
785 machine_mode mode = GET_MODE (dest);
786 rtx tmp_reg = gen_reg_rtx (mode);
787 rtx tp = aarch64_load_tp (NULL);
789 if (mode == ptr_mode)
791 if (mode == DImode)
792 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
793 else
795 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
796 tp = gen_lowpart (mode, tp);
799 else
801 gcc_assert (mode == Pmode);
802 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
805 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
806 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
807 return;
810 case SYMBOL_SMALL_TPREL:
812 rtx tp = aarch64_load_tp (NULL);
813 emit_insn (gen_tlsle_small (dest, tp, imm));
814 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
815 return;
818 case SYMBOL_TINY_GOT:
819 emit_insn (gen_ldr_got_tiny (dest, imm));
820 return;
822 default:
823 gcc_unreachable ();
827 /* Emit a move from SRC to DEST. Assume that the move expanders can
828 handle all moves if !can_create_pseudo_p (). The distinction is
829 important because, unlike emit_move_insn, the move expanders know
830 how to force Pmode objects into the constant pool even when the
831 constant pool address is not itself legitimate. */
832 static rtx
833 aarch64_emit_move (rtx dest, rtx src)
835 return (can_create_pseudo_p ()
836 ? emit_move_insn (dest, src)
837 : emit_move_insn_1 (dest, src));
840 /* Split a 128-bit move operation into two 64-bit move operations,
841 taking care to handle partial overlap of register to register
842 copies. Special cases are needed when moving between GP regs and
843 FP regs. SRC can be a register, constant or memory; DST a register
844 or memory. If either operand is memory it must not have any side
845 effects. */
846 void
847 aarch64_split_128bit_move (rtx dst, rtx src)
849 rtx dst_lo, dst_hi;
850 rtx src_lo, src_hi;
852 machine_mode mode = GET_MODE (dst);
854 gcc_assert (mode == TImode || mode == TFmode);
855 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
856 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
858 if (REG_P (dst) && REG_P (src))
860 int src_regno = REGNO (src);
861 int dst_regno = REGNO (dst);
863 /* Handle FP <-> GP regs. */
864 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
866 src_lo = gen_lowpart (word_mode, src);
867 src_hi = gen_highpart (word_mode, src);
869 if (mode == TImode)
871 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
872 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
874 else
876 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
877 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
879 return;
881 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
883 dst_lo = gen_lowpart (word_mode, dst);
884 dst_hi = gen_highpart (word_mode, dst);
886 if (mode == TImode)
888 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
889 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
891 else
893 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
894 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
896 return;
900 dst_lo = gen_lowpart (word_mode, dst);
901 dst_hi = gen_highpart (word_mode, dst);
902 src_lo = gen_lowpart (word_mode, src);
903 src_hi = gen_highpart_mode (word_mode, mode, src);
905 /* At most one pairing may overlap. */
906 if (reg_overlap_mentioned_p (dst_lo, src_hi))
908 aarch64_emit_move (dst_hi, src_hi);
909 aarch64_emit_move (dst_lo, src_lo);
911 else
913 aarch64_emit_move (dst_lo, src_lo);
914 aarch64_emit_move (dst_hi, src_hi);
918 bool
919 aarch64_split_128bit_move_p (rtx dst, rtx src)
921 return (! REG_P (src)
922 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
925 /* Split a complex SIMD combine. */
927 void
928 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
930 machine_mode src_mode = GET_MODE (src1);
931 machine_mode dst_mode = GET_MODE (dst);
933 gcc_assert (VECTOR_MODE_P (dst_mode));
935 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
937 rtx (*gen) (rtx, rtx, rtx);
939 switch (src_mode)
941 case V8QImode:
942 gen = gen_aarch64_simd_combinev8qi;
943 break;
944 case V4HImode:
945 gen = gen_aarch64_simd_combinev4hi;
946 break;
947 case V2SImode:
948 gen = gen_aarch64_simd_combinev2si;
949 break;
950 case V2SFmode:
951 gen = gen_aarch64_simd_combinev2sf;
952 break;
953 case DImode:
954 gen = gen_aarch64_simd_combinedi;
955 break;
956 case DFmode:
957 gen = gen_aarch64_simd_combinedf;
958 break;
959 default:
960 gcc_unreachable ();
963 emit_insn (gen (dst, src1, src2));
964 return;
968 /* Split a complex SIMD move. */
970 void
971 aarch64_split_simd_move (rtx dst, rtx src)
973 machine_mode src_mode = GET_MODE (src);
974 machine_mode dst_mode = GET_MODE (dst);
976 gcc_assert (VECTOR_MODE_P (dst_mode));
978 if (REG_P (dst) && REG_P (src))
980 rtx (*gen) (rtx, rtx);
982 gcc_assert (VECTOR_MODE_P (src_mode));
984 switch (src_mode)
986 case V16QImode:
987 gen = gen_aarch64_split_simd_movv16qi;
988 break;
989 case V8HImode:
990 gen = gen_aarch64_split_simd_movv8hi;
991 break;
992 case V4SImode:
993 gen = gen_aarch64_split_simd_movv4si;
994 break;
995 case V2DImode:
996 gen = gen_aarch64_split_simd_movv2di;
997 break;
998 case V4SFmode:
999 gen = gen_aarch64_split_simd_movv4sf;
1000 break;
1001 case V2DFmode:
1002 gen = gen_aarch64_split_simd_movv2df;
1003 break;
1004 default:
1005 gcc_unreachable ();
1008 emit_insn (gen (dst, src));
1009 return;
1013 static rtx
1014 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1016 if (can_create_pseudo_p ())
1017 return force_reg (mode, value);
1018 else
1020 x = aarch64_emit_move (x, value);
1021 return x;
1026 static rtx
1027 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1029 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1031 rtx high;
1032 /* Load the full offset into a register. This
1033 might be improvable in the future. */
1034 high = GEN_INT (offset);
1035 offset = 0;
1036 high = aarch64_force_temporary (mode, temp, high);
1037 reg = aarch64_force_temporary (mode, temp,
1038 gen_rtx_PLUS (mode, high, reg));
1040 return plus_constant (mode, reg, offset);
1043 void
1044 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1046 machine_mode mode = GET_MODE (dest);
1047 unsigned HOST_WIDE_INT mask;
1048 int i;
1049 bool first;
1050 unsigned HOST_WIDE_INT val;
1051 bool subtargets;
1052 rtx subtarget;
1053 int one_match, zero_match, first_not_ffff_match;
1055 gcc_assert (mode == SImode || mode == DImode);
1057 /* Check on what type of symbol it is. */
1058 if (GET_CODE (imm) == SYMBOL_REF
1059 || GET_CODE (imm) == LABEL_REF
1060 || GET_CODE (imm) == CONST)
1062 rtx mem, base, offset;
1063 enum aarch64_symbol_type sty;
1065 /* If we have (const (plus symbol offset)), separate out the offset
1066 before we start classifying the symbol. */
1067 split_const (imm, &base, &offset);
1069 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1070 switch (sty)
1072 case SYMBOL_FORCE_TO_MEM:
1073 if (offset != const0_rtx
1074 && targetm.cannot_force_const_mem (mode, imm))
1076 gcc_assert (can_create_pseudo_p ());
1077 base = aarch64_force_temporary (mode, dest, base);
1078 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1079 aarch64_emit_move (dest, base);
1080 return;
1082 mem = force_const_mem (ptr_mode, imm);
1083 gcc_assert (mem);
1084 if (mode != ptr_mode)
1085 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1086 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1087 return;
1089 case SYMBOL_SMALL_TLSGD:
1090 case SYMBOL_SMALL_TLSDESC:
1091 case SYMBOL_SMALL_GOTTPREL:
1092 case SYMBOL_SMALL_GOT:
1093 case SYMBOL_TINY_GOT:
1094 if (offset != const0_rtx)
1096 gcc_assert(can_create_pseudo_p ());
1097 base = aarch64_force_temporary (mode, dest, base);
1098 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1099 aarch64_emit_move (dest, base);
1100 return;
1102 /* FALLTHRU */
1104 case SYMBOL_SMALL_TPREL:
1105 case SYMBOL_SMALL_ABSOLUTE:
1106 case SYMBOL_TINY_ABSOLUTE:
1107 aarch64_load_symref_appropriately (dest, imm, sty);
1108 return;
1110 default:
1111 gcc_unreachable ();
1115 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1117 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1118 return;
1121 if (!CONST_INT_P (imm))
1123 if (GET_CODE (imm) == HIGH)
1124 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1125 else
1127 rtx mem = force_const_mem (mode, imm);
1128 gcc_assert (mem);
1129 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1132 return;
1135 if (mode == SImode)
1137 /* We know we can't do this in 1 insn, and we must be able to do it
1138 in two; so don't mess around looking for sequences that don't buy
1139 us anything. */
1140 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1141 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1142 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1143 return;
1146 /* Remaining cases are all for DImode. */
1148 val = INTVAL (imm);
1149 subtargets = optimize && can_create_pseudo_p ();
1151 one_match = 0;
1152 zero_match = 0;
1153 mask = 0xffff;
1154 first_not_ffff_match = -1;
1156 for (i = 0; i < 64; i += 16, mask <<= 16)
1158 if ((val & mask) == mask)
1159 one_match++;
1160 else
1162 if (first_not_ffff_match < 0)
1163 first_not_ffff_match = i;
1164 if ((val & mask) == 0)
1165 zero_match++;
1169 if (one_match == 2)
1171 /* Set one of the quarters and then insert back into result. */
1172 mask = 0xffffll << first_not_ffff_match;
1173 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1174 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1175 GEN_INT ((val >> first_not_ffff_match)
1176 & 0xffff)));
1177 return;
1180 if (zero_match == 2)
1181 goto simple_sequence;
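/* Editorial sketch of the simple cases: a value such as 0x12345678 has two
   zero 16-bit quarters, so it takes the simple_sequence path below and is
   built with a MOVZ of the low 16 bits followed by a MOVK of bits 16-31
   (gen_insv_immdi).  Values with two 0xffff quarters are instead handled
   above by first forcing one quarter to 0xffff and then inserting its real
   value back.  */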
1183 mask = 0x0ffff0000UL;
1184 for (i = 16; i < 64; i += 16, mask <<= 16)
1186 HOST_WIDE_INT comp = mask & ~(mask - 1);
1188 if (aarch64_uimm12_shift (val - (val & mask)))
1190 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1192 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1193 emit_insn (gen_adddi3 (dest, subtarget,
1194 GEN_INT (val - (val & mask))));
1195 return;
1197 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1199 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1201 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1202 GEN_INT ((val + comp) & mask)));
1203 emit_insn (gen_adddi3 (dest, subtarget,
1204 GEN_INT (val - ((val + comp) & mask))));
1205 return;
1207 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1209 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1211 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1212 GEN_INT ((val - comp) | ~mask)));
1213 emit_insn (gen_adddi3 (dest, subtarget,
1214 GEN_INT (val - ((val - comp) | ~mask))));
1215 return;
1217 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1219 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1221 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1222 GEN_INT (val | ~mask)));
1223 emit_insn (gen_adddi3 (dest, subtarget,
1224 GEN_INT (val - (val | ~mask))));
1225 return;
1229 /* See if we can do it by arithmetically combining two
1230 immediates. */
1231 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1233 int j;
1234 mask = 0xffff;
1236 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1237 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1239 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1240 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1241 GEN_INT (aarch64_bitmasks[i])));
1242 emit_insn (gen_adddi3 (dest, subtarget,
1243 GEN_INT (val - aarch64_bitmasks[i])));
1244 return;
1247 for (j = 0; j < 64; j += 16, mask <<= 16)
1249 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1251 emit_insn (gen_rtx_SET (VOIDmode, dest,
1252 GEN_INT (aarch64_bitmasks[i])));
1253 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1254 GEN_INT ((val >> j) & 0xffff)));
1255 return;
1260 /* See if we can do it by logically combining two immediates. */
1261 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1263 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1265 int j;
1267 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1268 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1270 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1271 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1272 GEN_INT (aarch64_bitmasks[i])));
1273 emit_insn (gen_iordi3 (dest, subtarget,
1274 GEN_INT (aarch64_bitmasks[j])));
1275 return;
1278 else if ((val & aarch64_bitmasks[i]) == val)
1280 int j;
1282 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1283 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1286 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1287 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1288 GEN_INT (aarch64_bitmasks[j])));
1289 emit_insn (gen_anddi3 (dest, subtarget,
1290 GEN_INT (aarch64_bitmasks[i])));
1291 return;
1296 if (one_match > zero_match)
1298 /* Set either first three quarters or all but the third. */
1299 mask = 0xffffll << (16 - first_not_ffff_match);
1300 emit_insn (gen_rtx_SET (VOIDmode, dest,
1301 GEN_INT (val | mask | 0xffffffff00000000ull)));
1303 /* Now insert other two quarters. */
1304 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1305 i < 64; i += 16, mask <<= 16)
1307 if ((val & mask) != mask)
1308 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1309 GEN_INT ((val >> i) & 0xffff)));
1311 return;
1314 simple_sequence:
1315 first = true;
1316 mask = 0xffff;
1317 for (i = 0; i < 64; i += 16, mask <<= 16)
1319 if ((val & mask) != 0)
1321 if (first)
1323 emit_insn (gen_rtx_SET (VOIDmode, dest,
1324 GEN_INT (val & mask)));
1325 first = false;
1327 else
1328 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1329 GEN_INT ((val >> i) & 0xffff)));
1334 static bool
1335 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1336 tree exp ATTRIBUTE_UNUSED)
1338 /* Currently, always true. */
1339 return true;
1342 /* Implement TARGET_PASS_BY_REFERENCE. */
1344 static bool
1345 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1346 machine_mode mode,
1347 const_tree type,
1348 bool named ATTRIBUTE_UNUSED)
1350 HOST_WIDE_INT size;
1351 machine_mode dummymode;
1352 int nregs;
1354 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1355 size = (mode == BLKmode && type)
1356 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1358 /* Aggregates are passed by reference based on their size. */
1359 if (type && AGGREGATE_TYPE_P (type))
1361 size = int_size_in_bytes (type);
1364 /* Variable sized arguments are always passed by reference. */
1365 if (size < 0)
1366 return true;
1368 /* Can this be a candidate to be passed in fp/simd register(s)? */
1369 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1370 &dummymode, &nregs,
1371 NULL))
1372 return false;
1374 /* Arguments which are variable sized or larger than 2 registers are
1375 passed by reference unless they are a homogeneous floating-point
1376 aggregate. */
1377 return size > 2 * UNITS_PER_WORD;
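/* Editorial examples under the AAPCS64 rules implemented above: a 24-byte
   plain struct is passed by reference, a 16-byte struct is passed by value
   in two registers, and a homogeneous aggregate of four floats is a
   candidate for the SIMD/FP registers and so is also passed by value.  */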
1380 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1381 static bool
1382 aarch64_return_in_msb (const_tree valtype)
1384 machine_mode dummy_mode;
1385 int dummy_int;
1387 /* Never happens in little-endian mode. */
1388 if (!BYTES_BIG_ENDIAN)
1389 return false;
1391 /* Only composite types smaller than or equal to 16 bytes can
1392 be potentially returned in registers. */
1393 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1394 || int_size_in_bytes (valtype) <= 0
1395 || int_size_in_bytes (valtype) > 16)
1396 return false;
1398 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1399 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1400 is always passed/returned in the least significant bits of fp/simd
1401 register(s). */
1402 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1403 &dummy_mode, &dummy_int, NULL))
1404 return false;
1406 return true;
1409 /* Implement TARGET_FUNCTION_VALUE.
1410 Define how to find the value returned by a function. */
1412 static rtx
1413 aarch64_function_value (const_tree type, const_tree func,
1414 bool outgoing ATTRIBUTE_UNUSED)
1416 machine_mode mode;
1417 int unsignedp;
1418 int count;
1419 machine_mode ag_mode;
1421 mode = TYPE_MODE (type);
1422 if (INTEGRAL_TYPE_P (type))
1423 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1425 if (aarch64_return_in_msb (type))
1427 HOST_WIDE_INT size = int_size_in_bytes (type);
1429 if (size % UNITS_PER_WORD != 0)
1431 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1432 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1436 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1437 &ag_mode, &count, NULL))
1439 if (!aarch64_composite_type_p (type, mode))
1441 gcc_assert (count == 1 && mode == ag_mode);
1442 return gen_rtx_REG (mode, V0_REGNUM);
1444 else
1446 int i;
1447 rtx par;
1449 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1450 for (i = 0; i < count; i++)
1452 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1453 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1454 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1455 XVECEXP (par, 0, i) = tmp;
1457 return par;
1460 else
1461 return gen_rtx_REG (mode, R0_REGNUM);
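/* Editorial examples: a 128-bit integer result comes back in x0/x1, a
   homogeneous aggregate of two doubles comes back as a PARALLEL over
   v0/v1 (d0/d1), and a plain int comes back in w0 after promotion.  */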
1464 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1465 Return true if REGNO is the number of a hard register in which the values
1466 of called function may come back. */
1468 static bool
1469 aarch64_function_value_regno_p (const unsigned int regno)
1471 /* Maximum of 16 bytes can be returned in the general registers. Examples
1472 of 16-byte return values are: 128-bit integers and 16-byte small
1473 structures (excluding homogeneous floating-point aggregates). */
1474 if (regno == R0_REGNUM || regno == R1_REGNUM)
1475 return true;
1477 /* Up to four fp/simd registers can return a function value, e.g. a
1478 homogeneous floating-point aggregate having four members. */
1479 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1480 return !TARGET_GENERAL_REGS_ONLY;
1482 return false;
1485 /* Implement TARGET_RETURN_IN_MEMORY.
1487 If the type T of the result of a function is such that
1488 void func (T arg)
1489 would require that arg be passed as a value in a register (or set of
1490 registers) according to the parameter passing rules, then the result
1491 is returned in the same registers as would be used for such an
1492 argument. */
1494 static bool
1495 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1497 HOST_WIDE_INT size;
1498 machine_mode ag_mode;
1499 int count;
1501 if (!AGGREGATE_TYPE_P (type)
1502 && TREE_CODE (type) != COMPLEX_TYPE
1503 && TREE_CODE (type) != VECTOR_TYPE)
1504 /* Simple scalar types always returned in registers. */
1505 return false;
1507 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1508 type,
1509 &ag_mode,
1510 &count,
1511 NULL))
1512 return false;
1514 /* Types larger than 2 registers are returned in memory. */
1515 size = int_size_in_bytes (type);
1516 return (size < 0 || size > 2 * UNITS_PER_WORD);
1519 static bool
1520 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1521 const_tree type, int *nregs)
1523 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1524 return aarch64_vfp_is_call_or_return_candidate (mode,
1525 type,
1526 &pcum->aapcs_vfp_rmode,
1527 nregs,
1528 NULL);
1531 /* Given MODE and TYPE of a function argument, return the alignment in
1532 bits. The idea is to suppress any stronger alignment requested by
1533 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1534 This is a helper function for local use only. */
1536 static unsigned int
1537 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1539 unsigned int alignment;
1541 if (type)
1543 if (!integer_zerop (TYPE_SIZE (type)))
1545 if (TYPE_MODE (type) == mode)
1546 alignment = TYPE_ALIGN (type);
1547 else
1548 alignment = GET_MODE_ALIGNMENT (mode);
1550 else
1551 alignment = 0;
1553 else
1554 alignment = GET_MODE_ALIGNMENT (mode);
1556 return alignment;
1559 /* Layout a function argument according to the AAPCS64 rules. The rule
1560 numbers refer to the rule numbers in the AAPCS64. */
1562 static void
1563 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1564 const_tree type,
1565 bool named ATTRIBUTE_UNUSED)
1567 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1568 int ncrn, nvrn, nregs;
1569 bool allocate_ncrn, allocate_nvrn;
1570 HOST_WIDE_INT size;
1572 /* We need to do this once per argument. */
1573 if (pcum->aapcs_arg_processed)
1574 return;
1576 pcum->aapcs_arg_processed = true;
1578 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1579 size
1580 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1581 UNITS_PER_WORD);
1583 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1584 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1585 mode,
1586 type,
1587 &nregs);
1589 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1590 The following code thus handles passing by SIMD/FP registers first. */
1592 nvrn = pcum->aapcs_nvrn;
1594 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1595 and homogeneous short-vector aggregates (HVA). */
1596 if (allocate_nvrn)
1598 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1600 pcum->aapcs_nextnvrn = nvrn + nregs;
1601 if (!aarch64_composite_type_p (type, mode))
1603 gcc_assert (nregs == 1);
1604 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1606 else
1608 rtx par;
1609 int i;
1610 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1611 for (i = 0; i < nregs; i++)
1613 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1614 V0_REGNUM + nvrn + i);
1615 tmp = gen_rtx_EXPR_LIST
1616 (VOIDmode, tmp,
1617 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1618 XVECEXP (par, 0, i) = tmp;
1620 pcum->aapcs_reg = par;
1622 return;
1624 else
1626 /* C.3 NSRN is set to 8. */
1627 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1628 goto on_stack;
1632 ncrn = pcum->aapcs_ncrn;
1633 nregs = size / UNITS_PER_WORD;
1635 /* C6 - C9, though the sign and zero extension semantics are
1636 handled elsewhere. This is the case where the argument fits
1637 entirely in general registers. */
1638 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1640 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1642 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1644 /* C.8 if the argument has an alignment of 16 then the NGRN is
1645 rounded up to the next even number. */
1646 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1648 ++ncrn;
1649 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1651 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1652 A reg is still generated for it, but the caller should be smart
1653 enough not to use it. */
1654 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1656 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1658 else
1660 rtx par;
1661 int i;
1663 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1664 for (i = 0; i < nregs; i++)
1666 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1667 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1668 GEN_INT (i * UNITS_PER_WORD));
1669 XVECEXP (par, 0, i) = tmp;
1671 pcum->aapcs_reg = par;
1674 pcum->aapcs_nextncrn = ncrn + nregs;
1675 return;
1678 /* C.11 */
1679 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1681 /* The argument is passed on the stack; record the needed number of words for
1682 this argument and align the total size if necessary. */
1683 on_stack:
1684 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1685 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1686 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1687 16 / UNITS_PER_WORD);
1688 return;
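/* Editorial example of rule C.8 above: a 16-byte-aligned argument such as
   __int128 that would otherwise start in an odd-numbered core register
   (say x3) is bumped to the next even pair (x4/x5); the skipped register
   is not used for later arguments.  */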
1691 /* Implement TARGET_FUNCTION_ARG. */
1693 static rtx
1694 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1695 const_tree type, bool named)
1697 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1698 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1700 if (mode == VOIDmode)
1701 return NULL_RTX;
1703 aarch64_layout_arg (pcum_v, mode, type, named);
1704 return pcum->aapcs_reg;
1707 void
1708 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1709 const_tree fntype ATTRIBUTE_UNUSED,
1710 rtx libname ATTRIBUTE_UNUSED,
1711 const_tree fndecl ATTRIBUTE_UNUSED,
1712 unsigned n_named ATTRIBUTE_UNUSED)
1714 pcum->aapcs_ncrn = 0;
1715 pcum->aapcs_nvrn = 0;
1716 pcum->aapcs_nextncrn = 0;
1717 pcum->aapcs_nextnvrn = 0;
1718 pcum->pcs_variant = ARM_PCS_AAPCS64;
1719 pcum->aapcs_reg = NULL_RTX;
1720 pcum->aapcs_arg_processed = false;
1721 pcum->aapcs_stack_words = 0;
1722 pcum->aapcs_stack_size = 0;
1724 return;
1727 static void
1728 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1729 machine_mode mode,
1730 const_tree type,
1731 bool named)
1733 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1734 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1736 aarch64_layout_arg (pcum_v, mode, type, named);
1737 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1738 != (pcum->aapcs_stack_words != 0));
1739 pcum->aapcs_arg_processed = false;
1740 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1741 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1742 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1743 pcum->aapcs_stack_words = 0;
1744 pcum->aapcs_reg = NULL_RTX;
1748 bool
1749 aarch64_function_arg_regno_p (unsigned regno)
1751 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1752 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1755 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1756 PARM_BOUNDARY bits of alignment, but will be given anything up
1757 to STACK_BOUNDARY bits if the type requires it. This makes sure
1758 that both before and after the layout of each argument, the Next
1759 Stacked Argument Address (NSAA) will have a minimum alignment of
1760 8 bytes. */
1762 static unsigned int
1763 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1765 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1767 if (alignment < PARM_BOUNDARY)
1768 alignment = PARM_BOUNDARY;
1769 if (alignment > STACK_BOUNDARY)
1770 alignment = STACK_BOUNDARY;
1771 return alignment;
1774 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1776 Return true if an argument passed on the stack should be padded upwards,
1777 i.e. if the least-significant byte of the stack slot has useful data.
1779 Small aggregate types are placed in the lowest memory address.
1781 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1783 bool
1784 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1786 /* On little-endian targets, the least significant byte of every stack
1787 argument is passed at the lowest byte address of the stack slot. */
1788 if (!BYTES_BIG_ENDIAN)
1789 return true;
1791 /* Otherwise, integral, floating-point and pointer types are padded downward:
1792 the least significant byte of a stack argument is passed at the highest
1793 byte address of the stack slot. */
1794 if (type
1795 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1796 || POINTER_TYPE_P (type))
1797 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1798 return false;
1800 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1801 return true;
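/* Editorial example: on a big-endian target, a short passed on the stack is
   padded downward, i.e. its two significant bytes occupy the highest
   addresses of the 8-byte slot; a small struct, by contrast, is padded
   upward and starts at the slot's lowest address.  */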
1804 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1806 It specifies padding for the last (may also be the only)
1807 element of a block move between registers and memory. If
1808 the block is assumed to be in memory, padding upward means that
1809 the last element is padded after its most significant byte, while
1810 with downward padding the last element is padded on its least
1811 significant byte side.
1813 Small aggregates and small complex types are always padded
1814 upwards.
1816 We don't need to worry about homogeneous floating-point or
1817 short-vector aggregates; their move is not affected by the
1818 padding direction determined here. Regardless of endianness,
1819 each element of such an aggregate is put in the least
1820 significant bits of a fp/simd register.
1822 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1823 register has useful data, and return the opposite if the most
1824 significant byte does. */
1826 bool
1827 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1828 bool first ATTRIBUTE_UNUSED)
1831 /* Small composite types are always padded upward. */
1832 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1834 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1835 : GET_MODE_SIZE (mode));
1836 if (size < 2 * UNITS_PER_WORD)
1837 return true;
1840 /* Otherwise, use the default padding. */
1841 return !BYTES_BIG_ENDIAN;
1844 static machine_mode
1845 aarch64_libgcc_cmp_return_mode (void)
1847 return SImode;
1850 static bool
1851 aarch64_frame_pointer_required (void)
1853 /* In aarch64_override_options_after_change
1854 flag_omit_leaf_frame_pointer turns off the frame pointer by
1855 default. Turn it back on now if we've not got a leaf
1856 function. */
1857 if (flag_omit_leaf_frame_pointer
1858 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1859 return true;
1861 return false;
1864 /* Mark the registers that need to be saved by the callee and calculate
1865 the size of the callee-saved registers area and frame record (both FP
1866 and LR may be omitted). */
1867 static void
1868 aarch64_layout_frame (void)
1870 HOST_WIDE_INT offset = 0;
1871 int regno;
1873 if (reload_completed && cfun->machine->frame.laid_out)
1874 return;
1876 #define SLOT_NOT_REQUIRED (-2)
1877 #define SLOT_REQUIRED (-1)
1879 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
1880 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
1882 /* First mark all the registers that really need to be saved... */
1883 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1884 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1886 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1887 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1889 /* ... that includes the eh data registers (if needed)... */
1890 if (crtl->calls_eh_return)
1891 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1892 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1893 = SLOT_REQUIRED;
1895 /* ... and any callee saved register that dataflow says is live. */
1896 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1897 if (df_regs_ever_live_p (regno)
1898 && !call_used_regs[regno])
1899 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1901 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1902 if (df_regs_ever_live_p (regno)
1903 && !call_used_regs[regno])
1904 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1906 if (frame_pointer_needed)
1908 /* FP and LR are placed in the linkage record. */
1909 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1910 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
1911 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1912 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
1913 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1914 offset += 2 * UNITS_PER_WORD;
1917 /* Now assign stack slots for them. */
1918 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1919 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1921 cfun->machine->frame.reg_offset[regno] = offset;
1922 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1923 cfun->machine->frame.wb_candidate1 = regno;
1924 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
1925 cfun->machine->frame.wb_candidate2 = regno;
1926 offset += UNITS_PER_WORD;
1929 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1930 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1932 cfun->machine->frame.reg_offset[regno] = offset;
1933 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1934 cfun->machine->frame.wb_candidate1 = regno;
1935 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
1936 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
1937 cfun->machine->frame.wb_candidate2 = regno;
1938 offset += UNITS_PER_WORD;
1941 cfun->machine->frame.padding0 =
1942 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1943 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1945 cfun->machine->frame.saved_regs_size = offset;
1947 cfun->machine->frame.hard_fp_offset
1948 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1949 + get_frame_size ()
1950 + cfun->machine->frame.saved_regs_size,
1951 STACK_BOUNDARY / BITS_PER_UNIT);
1953 cfun->machine->frame.frame_size
1954 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1955 + crtl->outgoing_args_size,
1956 STACK_BOUNDARY / BITS_PER_UNIT);
1958 cfun->machine->frame.laid_out = true;
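/* Editorial sketch of the resulting layout when the frame pointer is needed:
   the FP/LR pair sits at offset 0 of the callee-save area (and becomes the
   preferred write-back candidate pair), remaining live callee-saved GP and
   FP registers follow at successive 8-byte offsets, and the saved-register
   area is rounded up to STACK_BOUNDARY before locals and outgoing argument
   space are added to form the full frame size.  */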
1961 static bool
1962 aarch64_register_saved_on_entry (int regno)
1964 return cfun->machine->frame.reg_offset[regno] >= 0;
1967 static unsigned
1968 aarch64_next_callee_save (unsigned regno, unsigned limit)
1970 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1971 regno ++;
1972 return regno;
1975 static void
1976 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
1977 HOST_WIDE_INT adjustment)
1979 rtx base_rtx = stack_pointer_rtx;
1980 rtx insn, reg, mem;
1982 reg = gen_rtx_REG (mode, regno);
1983 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
1984 plus_constant (Pmode, base_rtx, -adjustment));
1985 mem = gen_rtx_MEM (mode, mem);
1987 insn = emit_move_insn (mem, reg);
1988 RTX_FRAME_RELATED_P (insn) = 1;
1991 static rtx
1992 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
1993 HOST_WIDE_INT adjustment)
1995 switch (mode)
1997 case DImode:
1998 return gen_storewb_pairdi_di (base, base, reg, reg2,
1999 GEN_INT (-adjustment),
2000 GEN_INT (UNITS_PER_WORD - adjustment));
2001 case DFmode:
2002 return gen_storewb_pairdf_di (base, base, reg, reg2,
2003 GEN_INT (-adjustment),
2004 GEN_INT (UNITS_PER_WORD - adjustment));
2005 default:
2006 gcc_unreachable ();
2010 static void
2011 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2012 unsigned regno2, HOST_WIDE_INT adjustment)
2014 rtx_insn *insn;
2015 rtx reg1 = gen_rtx_REG (mode, regno1);
2016 rtx reg2 = gen_rtx_REG (mode, regno2);
2018 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2019 reg2, adjustment));
2020 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2021 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2022 RTX_FRAME_RELATED_P (insn) = 1;
2025 static rtx
2026 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2027 HOST_WIDE_INT adjustment)
2029 switch (mode)
2031 case DImode:
2032 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2033 GEN_INT (UNITS_PER_WORD));
2034 case DFmode:
2035 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2036 GEN_INT (UNITS_PER_WORD));
2037 default:
2038 gcc_unreachable ();
2042 static rtx
2043 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2044 rtx reg2)
2046 switch (mode)
2048 case DImode:
2049 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2051 case DFmode:
2052 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2054 default:
2055 gcc_unreachable ();
2059 static rtx
2060 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2061 rtx mem2)
2063 switch (mode)
2065 case DImode:
2066 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2068 case DFmode:
2069 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2071 default:
2072 gcc_unreachable ();
2077 static void
2078 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2079 unsigned start, unsigned limit, bool skip_wb)
2081 rtx_insn *insn;
2082 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2083 ? gen_frame_mem : gen_rtx_MEM);
2084 unsigned regno;
2085 unsigned regno2;
2087 for (regno = aarch64_next_callee_save (start, limit);
2088 regno <= limit;
2089 regno = aarch64_next_callee_save (regno + 1, limit))
2091 rtx reg, mem;
2092 HOST_WIDE_INT offset;
2094 if (skip_wb
2095 && (regno == cfun->machine->frame.wb_candidate1
2096 || regno == cfun->machine->frame.wb_candidate2))
2097 continue;
2099 reg = gen_rtx_REG (mode, regno);
2100 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2101 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2102 offset));
2104 regno2 = aarch64_next_callee_save (regno + 1, limit);
2106 if (regno2 <= limit
2107 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2108 == cfun->machine->frame.reg_offset[regno2]))
2111 rtx reg2 = gen_rtx_REG (mode, regno2);
2112 rtx mem2;
2114 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2115 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2116 offset));
2117 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2118 reg2));
2120 /* The first part of a frame-related parallel insn is
2121 always assumed to be relevant to the frame
2122 calculations; subsequent parts are only
2123 frame-related if explicitly marked. */
2124 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2125 regno = regno2;
2127 else
2128 insn = emit_move_insn (mem, reg);
2130 RTX_FRAME_RELATED_P (insn) = 1;
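/* For example, when two saved registers occupy adjacent slots (say x19 at
   offset 0 and x20 at offset 8 from START_OFFSET), the pairing check above
   emits a single store-pair (stp) rather than two separate stores.  */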
2134 static void
2135 aarch64_restore_callee_saves (machine_mode mode,
2136 HOST_WIDE_INT start_offset, unsigned start,
2137 unsigned limit, bool skip_wb, rtx *cfi_ops)
2139 rtx base_rtx = stack_pointer_rtx;
2140 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2141 ? gen_frame_mem : gen_rtx_MEM);
2142 unsigned regno;
2143 unsigned regno2;
2144 HOST_WIDE_INT offset;
2146 for (regno = aarch64_next_callee_save (start, limit);
2147 regno <= limit;
2148 regno = aarch64_next_callee_save (regno + 1, limit))
2150 rtx reg, mem;
2152 if (skip_wb
2153 && (regno == cfun->machine->frame.wb_candidate1
2154 || regno == cfun->machine->frame.wb_candidate2))
2155 continue;
2157 reg = gen_rtx_REG (mode, regno);
2158 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2159 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2161 regno2 = aarch64_next_callee_save (regno + 1, limit);
2163 if (regno2 <= limit
2164 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2165 == cfun->machine->frame.reg_offset[regno2]))
2167 rtx reg2 = gen_rtx_REG (mode, regno2);
2168 rtx mem2;
2170 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2171 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2172 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2174 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2175 regno = regno2;
2177 else
2178 emit_move_insn (reg, mem);
2179 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2183 /* AArch64 stack frames generated by this compiler look like:
2185 +-------------------------------+
2187 | incoming stack arguments |
2189 +-------------------------------+
2190 | | <-- incoming stack pointer (aligned)
2191 | callee-allocated save area |
2192 | for register varargs |
2194 +-------------------------------+
2195 | local variables | <-- frame_pointer_rtx
2197 +-------------------------------+
2198 | padding0 | \
2199 +-------------------------------+ |
2200 | callee-saved registers | | frame.saved_regs_size
2201 +-------------------------------+ |
2202 | LR' | |
2203 +-------------------------------+ |
2204 | FP' | / <- hard_frame_pointer_rtx (aligned)
2205 +-------------------------------+
2206 | dynamic allocation |
2207 +-------------------------------+
2208 | padding |
2209 +-------------------------------+
2210 | outgoing stack arguments | <-- arg_pointer
2212 +-------------------------------+
2213 | | <-- stack_pointer_rtx (aligned)
2215 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2216 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2217 unchanged. */
2219 /* Generate the prologue instructions for entry into a function.
2220 Establish the stack frame by decreasing the stack pointer by a
2221 properly calculated amount and, if necessary, create a frame record
2222 filled with the values of LR and the previous frame pointer. The
2223 current FP is also set up if it is in use. */
2225 void
2226 aarch64_expand_prologue (void)
2228 /* sub sp, sp, #<frame_size>
2229 stp {fp, lr}, [sp, #<frame_size> - 16]
2230 add fp, sp, #<frame_size> - hardfp_offset
2231 stp {cs_reg}, [fp, #-16] etc.
2233 sub sp, sp, <final_adjustment_if_any>
2235 HOST_WIDE_INT frame_size, offset;
2236 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2237 HOST_WIDE_INT hard_fp_offset;
2238 rtx_insn *insn;
2240 aarch64_layout_frame ();
2242 offset = frame_size = cfun->machine->frame.frame_size;
2243 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2244 fp_offset = frame_size - hard_fp_offset;
2246 if (flag_stack_usage_info)
2247 current_function_static_stack_size = frame_size;
2249 /* Store pairs and load pairs have a range of only -512 to 504. */
2250 if (offset >= 512)
2252 /* When the frame is large, an initial adjustment is made to the
2253 stack pointer to skip over the callee-allocated save area for
2254 register varargs, the local variable area and/or the callee-saved
2255 register area. This allows the pre-index writeback store-pair
2256 instructions to be used to set up the rest of the stack frame
2257 efficiently. */
2258 offset = hard_fp_offset;
2259 if (offset >= 512)
2260 offset = cfun->machine->frame.saved_regs_size;
2262 frame_size -= (offset + crtl->outgoing_args_size);
2263 fp_offset = 0;
2265 if (frame_size >= 0x1000000)
2267 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2268 emit_move_insn (op0, GEN_INT (-frame_size));
2269 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2271 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2272 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2273 plus_constant (Pmode, stack_pointer_rtx,
2274 -frame_size)));
2275 RTX_FRAME_RELATED_P (insn) = 1;
2277 else if (frame_size > 0)
2279 int hi_ofs = frame_size & 0xfff000;
2280 int lo_ofs = frame_size & 0x000fff;
2282 if (hi_ofs)
2284 insn = emit_insn (gen_add2_insn
2285 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2286 RTX_FRAME_RELATED_P (insn) = 1;
2288 if (lo_ofs)
2290 insn = emit_insn (gen_add2_insn
2291 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2292 RTX_FRAME_RELATED_P (insn) = 1;
2296 else
2297 frame_size = -1;
2299 if (offset > 0)
2301 bool skip_wb = false;
2303 if (frame_pointer_needed)
2305 skip_wb = true;
2307 if (fp_offset)
2309 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2310 GEN_INT (-offset)));
2311 RTX_FRAME_RELATED_P (insn) = 1;
2313 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2314 R30_REGNUM, false);
2316 else
2317 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2319 /* Set up frame pointer to point to the location of the
2320 previous frame pointer on the stack. */
2321 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2322 stack_pointer_rtx,
2323 GEN_INT (fp_offset)));
2324 RTX_FRAME_RELATED_P (insn) = 1;
2325 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2327 else
2329 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2330 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2332 if (fp_offset
2333 || reg1 == FIRST_PSEUDO_REGISTER
2334 || (reg2 == FIRST_PSEUDO_REGISTER
2335 && offset >= 256))
2337 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2338 GEN_INT (-offset)));
2339 RTX_FRAME_RELATED_P (insn) = 1;
2341 else
2343 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2345 skip_wb = true;
2347 if (reg2 == FIRST_PSEUDO_REGISTER)
2348 aarch64_pushwb_single_reg (mode1, reg1, offset);
2349 else
2350 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2354 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2355 skip_wb);
2356 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2357 skip_wb);
2360 /* When offset >= 512,
2361 sub sp, sp, #<outgoing_args_size> */
2362 if (frame_size > -1)
2364 if (crtl->outgoing_args_size > 0)
2366 insn = emit_insn (gen_add2_insn
2367 (stack_pointer_rtx,
2368 GEN_INT (- crtl->outgoing_args_size)));
2369 RTX_FRAME_RELATED_P (insn) = 1;
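/* Illustration of the hi_ofs/lo_ofs split used above: an adjustment of
   0x12345 bytes is emitted as
	sub	sp, sp, #0x12000
	sub	sp, sp, #0x345
   since an add/sub immediate only encodes 12 bits, optionally shifted
   left by 12.  */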
2374 /* Return TRUE if we can use a simple_return insn.
2376 This function checks whether the callee-saved stack is empty, which
2377 means no restore actions are needed. The pro_and_epilogue pass uses
2378 this to check whether shrink-wrapping is feasible. */
2380 bool
2381 aarch64_use_return_insn_p (void)
2383 if (!reload_completed)
2384 return false;
2386 if (crtl->profile)
2387 return false;
2389 aarch64_layout_frame ();
2391 return cfun->machine->frame.frame_size == 0;
2394 /* Generate the epilogue instructions for returning from a function. */
2395 void
2396 aarch64_expand_epilogue (bool for_sibcall)
2398 HOST_WIDE_INT frame_size, offset;
2399 HOST_WIDE_INT fp_offset;
2400 HOST_WIDE_INT hard_fp_offset;
2401 rtx_insn *insn;
2403 aarch64_layout_frame ();
2405 offset = frame_size = cfun->machine->frame.frame_size;
2406 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2407 fp_offset = frame_size - hard_fp_offset;
2409 /* Store pairs and load pairs have a range of only -512 to 504. */
2410 if (offset >= 512)
2412 offset = hard_fp_offset;
2413 if (offset >= 512)
2414 offset = cfun->machine->frame.saved_regs_size;
2416 frame_size -= (offset + crtl->outgoing_args_size);
2417 fp_offset = 0;
2418 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2420 insn = emit_insn (gen_add2_insn
2421 (stack_pointer_rtx,
2422 GEN_INT (crtl->outgoing_args_size)));
2423 RTX_FRAME_RELATED_P (insn) = 1;
2426 else
2427 frame_size = -1;
2429 /* If there were outgoing arguments or we've done dynamic stack
2430 allocation, then restore the stack pointer from the frame
2431 pointer. This is at most one insn and more efficient than using
2432 GCC's internal mechanism. */
2433 if (frame_pointer_needed
2434 && (crtl->outgoing_args_size || cfun->calls_alloca))
2436 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2437 hard_frame_pointer_rtx,
2438 GEN_INT (0)));
2439 offset = offset - fp_offset;
2442 if (offset > 0)
2444 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2445 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2446 bool skip_wb = true;
2447 rtx cfi_ops = NULL;
2449 if (frame_pointer_needed)
2450 fp_offset = 0;
2451 else if (fp_offset
2452 || reg1 == FIRST_PSEUDO_REGISTER
2453 || (reg2 == FIRST_PSEUDO_REGISTER
2454 && offset >= 256))
2455 skip_wb = false;
2457 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2458 skip_wb, &cfi_ops);
2459 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2460 skip_wb, &cfi_ops);
2462 if (skip_wb)
2464 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2465 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2467 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2468 if (reg2 == FIRST_PSEUDO_REGISTER)
2470 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2471 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2472 mem = gen_rtx_MEM (mode1, mem);
2473 insn = emit_move_insn (rreg1, mem);
2475 else
2477 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2479 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2480 insn = emit_insn (aarch64_gen_loadwb_pair
2481 (mode1, stack_pointer_rtx, rreg1,
2482 rreg2, offset));
2485 else
2487 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2488 GEN_INT (offset)));
2491 /* Reset the CFA to be SP + FRAME_SIZE. */
2492 rtx new_cfa = stack_pointer_rtx;
2493 if (frame_size > 0)
2494 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2495 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2496 REG_NOTES (insn) = cfi_ops;
2497 RTX_FRAME_RELATED_P (insn) = 1;
2500 if (frame_size > 0)
2502 if (frame_size >= 0x1000000)
2504 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2505 emit_move_insn (op0, GEN_INT (frame_size));
2506 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2508 else
2510 int hi_ofs = frame_size & 0xfff000;
2511 int lo_ofs = frame_size & 0x000fff;
2513 if (hi_ofs && lo_ofs)
2515 insn = emit_insn (gen_add2_insn
2516 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2517 RTX_FRAME_RELATED_P (insn) = 1;
2518 frame_size = lo_ofs;
2520 insn = emit_insn (gen_add2_insn
2521 (stack_pointer_rtx, GEN_INT (frame_size)));
2524 /* Reset the CFA to be SP + 0. */
2525 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2526 RTX_FRAME_RELATED_P (insn) = 1;
2529 /* Stack adjustment for exception handler. */
2530 if (crtl->calls_eh_return)
2532 /* We need to unwind the stack by the offset computed by
2533 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2534 to be SP; letting the CFA move during this adjustment
2535 is just as correct as retaining the CFA from the body
2536 of the function. Therefore, do nothing special. */
2537 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2540 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2541 if (!for_sibcall)
2542 emit_jump_insn (ret_rtx);
2545 /* Return the place to copy the exception unwinding return address to.
2546 This will probably be a stack slot, but could (in theory) be the
2547 return register. */
2549 aarch64_final_eh_return_addr (void)
2551 HOST_WIDE_INT fp_offset;
2553 aarch64_layout_frame ();
2555 fp_offset = cfun->machine->frame.frame_size
2556 - cfun->machine->frame.hard_fp_offset;
2558 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2559 return gen_rtx_REG (DImode, LR_REGNUM);
2561 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2562 result in a store to save LR introduced by builtin_eh_return () being
2563 incorrectly deleted because the alias is not detected.
2564 So in the calculation of the address to copy the exception unwinding
2565 return address to, we distinguish two cases.
2566 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2567 we return a SP-relative location since all the addresses are SP-relative
2568 in this case. This prevents the store from being optimized away.
2569 If the fp_offset is not 0, then the addresses will be FP-relative and
2570 therefore we return a FP-relative location. */
2572 if (frame_pointer_needed)
2574 if (fp_offset)
2575 return gen_frame_mem (DImode,
2576 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2577 else
2578 return gen_frame_mem (DImode,
2579 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2582 /* If FP is not needed, we calculate the location of LR, which would be
2583 at the top of the saved registers block. */
2585 return gen_frame_mem (DImode,
2586 plus_constant (Pmode,
2587 stack_pointer_rtx,
2588 fp_offset
2589 + cfun->machine->frame.saved_regs_size
2590 - 2 * UNITS_PER_WORD));
2593 /* Possibly output code to build up a constant in a register. For
2594 the benefit of the costs infrastructure, returns the number of
2595 instructions which would be emitted. GENERATE inhibits or
2596 enables code generation. */
2598 static int
2599 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2601 int insns = 0;
2603 if (aarch64_bitmask_imm (val, DImode))
2605 if (generate)
2606 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2607 insns = 1;
2609 else
2611 int i;
2612 int ncount = 0;
2613 int zcount = 0;
2614 HOST_WIDE_INT valp = val >> 16;
2615 HOST_WIDE_INT valm;
2616 HOST_WIDE_INT tval;
2618 for (i = 16; i < 64; i += 16)
2620 valm = (valp & 0xffff);
2622 if (valm != 0)
2623 ++ zcount;
2625 if (valm != 0xffff)
2626 ++ ncount;
2628 valp >>= 16;
2631 /* zcount contains the number of additional MOVK instructions
2632 required if the constant is built up with an initial MOVZ instruction,
2633 while ncount is the number of MOVK instructions required if starting
2634 with a MOVN instruction. Choose the sequence that yields the fewest
2635 instructions, preferring MOVZ instructions when the two counts are
2636 the same. */
2637 if (ncount < zcount)
2639 if (generate)
2640 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2641 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2642 tval = 0xffff;
2643 insns++;
2645 else
2647 if (generate)
2648 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2649 GEN_INT (val & 0xffff));
2650 tval = 0;
2651 insns++;
2654 val >>= 16;
2656 for (i = 16; i < 64; i += 16)
2658 if ((val & 0xffff) != tval)
2660 if (generate)
2661 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2662 GEN_INT (i),
2663 GEN_INT (val & 0xffff)));
2664 insns++;
2666 val >>= 16;
2669 return insns;
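/* For example, the constant 0xffffffffffff1234 has three all-ones upper
   halfwords, so ncount = 0 < zcount = 3 and the MOVN path is chosen: a
   single move (e.g. "movn <reg>, #0xedcb") materializes the value,
   whereas starting from MOVZ would need one MOVZ plus three MOVKs.  */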
2672 static void
2673 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2675 HOST_WIDE_INT mdelta = delta;
2676 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2677 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2679 if (mdelta < 0)
2680 mdelta = -mdelta;
2682 if (mdelta >= 4096 * 4096)
2684 (void) aarch64_build_constant (scratchreg, delta, true);
2685 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2687 else if (mdelta > 0)
2689 if (mdelta >= 4096)
2691 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2692 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2693 if (delta < 0)
2694 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2695 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2696 else
2697 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2698 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2700 if (mdelta % 4096 != 0)
2702 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2703 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2704 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
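/* For example, DELTA = 5000 (which is below 4096 * 4096) is split above as
   4096 + 904: the scratch register is loaded with 5000 / 4096 = 1, shifted
   left by 12 and added to the destination, and the remainder
   5000 % 4096 = 904 is added with a plain immediate add.  */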
2709 /* Output code to add DELTA to the first argument, and then jump
2710 to FUNCTION. Used for C++ multiple inheritance. */
2711 static void
2712 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2713 HOST_WIDE_INT delta,
2714 HOST_WIDE_INT vcall_offset,
2715 tree function)
2717 /* The this pointer is always in x0. Note that this differs from
2718 Arm where the this pointer may be bumped to r1 if r0 is required
2719 to return a pointer to an aggregate. On AArch64 a result value
2720 pointer will be in x8. */
2721 int this_regno = R0_REGNUM;
2722 rtx this_rtx, temp0, temp1, addr, funexp;
2723 rtx_insn *insn;
2725 reload_completed = 1;
2726 emit_note (NOTE_INSN_PROLOGUE_END);
2728 if (vcall_offset == 0)
2729 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2730 else
2732 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2734 this_rtx = gen_rtx_REG (Pmode, this_regno);
2735 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2736 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2738 addr = this_rtx;
2739 if (delta != 0)
2741 if (delta >= -256 && delta < 256)
2742 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2743 plus_constant (Pmode, this_rtx, delta));
2744 else
2745 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2748 if (Pmode == ptr_mode)
2749 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2750 else
2751 aarch64_emit_move (temp0,
2752 gen_rtx_ZERO_EXTEND (Pmode,
2753 gen_rtx_MEM (ptr_mode, addr)));
2755 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2756 addr = plus_constant (Pmode, temp0, vcall_offset);
2757 else
2759 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2760 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2763 if (Pmode == ptr_mode)
2764 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2765 else
2766 aarch64_emit_move (temp1,
2767 gen_rtx_SIGN_EXTEND (Pmode,
2768 gen_rtx_MEM (ptr_mode, addr)));
2770 emit_insn (gen_add2_insn (this_rtx, temp1));
2773 /* Generate a tail call to the target function. */
2774 if (!TREE_USED (function))
2776 assemble_external (function);
2777 TREE_USED (function) = 1;
2779 funexp = XEXP (DECL_RTL (function), 0);
2780 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2781 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2782 SIBLING_CALL_P (insn) = 1;
2784 insn = get_insns ();
2785 shorten_branches (insn);
2786 final_start_function (insn, file, 1);
2787 final (insn, file, 1);
2788 final_end_function ();
2790 /* Stop pretending to be a post-reload pass. */
2791 reload_completed = 0;
2794 static int
2795 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2797 if (GET_CODE (*x) == SYMBOL_REF)
2798 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2800 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2801 TLS offsets, not real symbol references. */
2802 if (GET_CODE (*x) == UNSPEC
2803 && XINT (*x, 1) == UNSPEC_TLS)
2804 return -1;
2806 return 0;
2809 static bool
2810 aarch64_tls_referenced_p (rtx x)
2812 if (!TARGET_HAVE_TLS)
2813 return false;
2815 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2819 static int
2820 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2822 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2823 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2825 if (*imm1 < *imm2)
2826 return -1;
2827 if (*imm1 > *imm2)
2828 return +1;
2829 return 0;
2833 static void
2834 aarch64_build_bitmask_table (void)
2836 unsigned HOST_WIDE_INT mask, imm;
2837 unsigned int log_e, e, s, r;
2838 unsigned int nimms = 0;
2840 for (log_e = 1; log_e <= 6; log_e++)
2842 e = 1 << log_e;
2843 if (e == 64)
2844 mask = ~(HOST_WIDE_INT) 0;
2845 else
2846 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2847 for (s = 1; s < e; s++)
2849 for (r = 0; r < e; r++)
2851 /* set s consecutive bits to 1 (s < 64) */
2852 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2853 /* rotate right by r */
2854 if (r != 0)
2855 imm = ((imm >> r) | (imm << (e - r))) & mask;
2856 /* replicate the constant depending on SIMD size */
2857 switch (log_e) {
2858 case 1: imm |= (imm << 2);
2859 case 2: imm |= (imm << 4);
2860 case 3: imm |= (imm << 8);
2861 case 4: imm |= (imm << 16);
2862 case 5: imm |= (imm << 32);
2863 case 6:
2864 break;
2865 default:
2866 gcc_unreachable ();
2868 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2869 aarch64_bitmasks[nimms++] = imm;
2874 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2875 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2876 aarch64_bitmasks_cmp);
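/* For example, with element size e = 8, s = 3 and r = 1 the loop above
   produces imm = 0x83 (three consecutive ones, 0x07, rotated right by one
   within an 8-bit element), which is then replicated across the 64-bit
   value to give the bitmask immediate 0x8383838383838383.  */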
2880 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2881 a left shift of 0 or 12 bits. */
2882 bool
2883 aarch64_uimm12_shift (HOST_WIDE_INT val)
2885 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2886 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
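/* For example, 0xabc and 0xabc000 (0xabc << 12) are accepted above, while
   0xabc001 is rejected because its set bits do not fit entirely within
   either 12-bit field.  */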
2891 /* Return true if val is an immediate that can be loaded into a
2892 register by a MOVZ instruction. */
2893 static bool
2894 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
2896 if (GET_MODE_SIZE (mode) > 4)
2898 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2899 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2900 return 1;
2902 else
2904 /* Ignore sign extension. */
2905 val &= (HOST_WIDE_INT) 0xffffffff;
2907 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2908 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
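/* For example, 0x12340000 is accepted for DImode above (all of its set bits
   lie in halfword 16-31, so a single "movz <reg>, #0x1234, lsl 16" loads
   it), whereas 0x12340001 is rejected because its set bits span two
   halfwords.  */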
2912 /* Return true if val is a valid bitmask immediate. */
2913 bool
2914 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
2916 if (GET_MODE_SIZE (mode) < 8)
2918 /* Replicate bit pattern. */
2919 val &= (HOST_WIDE_INT) 0xffffffff;
2920 val |= val << 32;
2922 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2923 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
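/* For example, 0x00ff00ff00ff00ff (a run of eight ones replicated in every
   16-bit element) is present in the table and is therefore a valid bitmask
   immediate, whereas 0x1234 is not, since its set bits do not form a single
   rotated run of ones.  */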
2927 /* Return true if val is an immediate that can be loaded into a
2928 register in a single instruction. */
2929 bool
2930 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
2932 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2933 return 1;
2934 return aarch64_bitmask_imm (val, mode);
2937 static bool
2938 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2940 rtx base, offset;
2942 if (GET_CODE (x) == HIGH)
2943 return true;
2945 split_const (x, &base, &offset);
2946 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2948 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2949 != SYMBOL_FORCE_TO_MEM)
2950 return true;
2951 else
2952 /* Avoid generating a 64-bit relocation in ILP32; leave it
2953 to aarch64_expand_mov_immediate to handle properly. */
2954 return mode != ptr_mode;
2957 return aarch64_tls_referenced_p (x);
2960 /* Return true if register REGNO is a valid index register.
2961 STRICT_P is true if REG_OK_STRICT is in effect. */
2963 bool
2964 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2966 if (!HARD_REGISTER_NUM_P (regno))
2968 if (!strict_p)
2969 return true;
2971 if (!reg_renumber)
2972 return false;
2974 regno = reg_renumber[regno];
2976 return GP_REGNUM_P (regno);
2979 /* Return true if register REGNO is a valid base register for mode MODE.
2980 STRICT_P is true if REG_OK_STRICT is in effect. */
2982 bool
2983 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2985 if (!HARD_REGISTER_NUM_P (regno))
2987 if (!strict_p)
2988 return true;
2990 if (!reg_renumber)
2991 return false;
2993 regno = reg_renumber[regno];
2996 /* The fake registers will be eliminated to either the stack or
2997 hard frame pointer, both of which are usually valid base registers.
2998 Reload deals with the cases where the eliminated form isn't valid. */
2999 return (GP_REGNUM_P (regno)
3000 || regno == SP_REGNUM
3001 || regno == FRAME_POINTER_REGNUM
3002 || regno == ARG_POINTER_REGNUM);
3005 /* Return true if X is a valid base register for mode MODE.
3006 STRICT_P is true if REG_OK_STRICT is in effect. */
3008 static bool
3009 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3011 if (!strict_p && GET_CODE (x) == SUBREG)
3012 x = SUBREG_REG (x);
3014 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3017 /* Return true if address offset is a valid index. If it is, fill in INFO
3018 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3020 static bool
3021 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3022 machine_mode mode, bool strict_p)
3024 enum aarch64_address_type type;
3025 rtx index;
3026 int shift;
3028 /* (reg:P) */
3029 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3030 && GET_MODE (x) == Pmode)
3032 type = ADDRESS_REG_REG;
3033 index = x;
3034 shift = 0;
3036 /* (sign_extend:DI (reg:SI)) */
3037 else if ((GET_CODE (x) == SIGN_EXTEND
3038 || GET_CODE (x) == ZERO_EXTEND)
3039 && GET_MODE (x) == DImode
3040 && GET_MODE (XEXP (x, 0)) == SImode)
3042 type = (GET_CODE (x) == SIGN_EXTEND)
3043 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3044 index = XEXP (x, 0);
3045 shift = 0;
3047 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3048 else if (GET_CODE (x) == MULT
3049 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3050 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3051 && GET_MODE (XEXP (x, 0)) == DImode
3052 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3053 && CONST_INT_P (XEXP (x, 1)))
3055 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3056 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3057 index = XEXP (XEXP (x, 0), 0);
3058 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3060 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3061 else if (GET_CODE (x) == ASHIFT
3062 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3063 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3064 && GET_MODE (XEXP (x, 0)) == DImode
3065 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3066 && CONST_INT_P (XEXP (x, 1)))
3068 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3069 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3070 index = XEXP (XEXP (x, 0), 0);
3071 shift = INTVAL (XEXP (x, 1));
3073 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3074 else if ((GET_CODE (x) == SIGN_EXTRACT
3075 || GET_CODE (x) == ZERO_EXTRACT)
3076 && GET_MODE (x) == DImode
3077 && GET_CODE (XEXP (x, 0)) == MULT
3078 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3079 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3081 type = (GET_CODE (x) == SIGN_EXTRACT)
3082 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3083 index = XEXP (XEXP (x, 0), 0);
3084 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3085 if (INTVAL (XEXP (x, 1)) != 32 + shift
3086 || INTVAL (XEXP (x, 2)) != 0)
3087 shift = -1;
3089 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3090 (const_int 0xffffffff<<shift)) */
3091 else if (GET_CODE (x) == AND
3092 && GET_MODE (x) == DImode
3093 && GET_CODE (XEXP (x, 0)) == MULT
3094 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3095 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3096 && CONST_INT_P (XEXP (x, 1)))
3098 type = ADDRESS_REG_UXTW;
3099 index = XEXP (XEXP (x, 0), 0);
3100 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3101 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3102 shift = -1;
3104 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3105 else if ((GET_CODE (x) == SIGN_EXTRACT
3106 || GET_CODE (x) == ZERO_EXTRACT)
3107 && GET_MODE (x) == DImode
3108 && GET_CODE (XEXP (x, 0)) == ASHIFT
3109 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3110 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3112 type = (GET_CODE (x) == SIGN_EXTRACT)
3113 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3114 index = XEXP (XEXP (x, 0), 0);
3115 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3116 if (INTVAL (XEXP (x, 1)) != 32 + shift
3117 || INTVAL (XEXP (x, 2)) != 0)
3118 shift = -1;
3120 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3121 (const_int 0xffffffff<<shift)) */
3122 else if (GET_CODE (x) == AND
3123 && GET_MODE (x) == DImode
3124 && GET_CODE (XEXP (x, 0)) == ASHIFT
3125 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3126 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3127 && CONST_INT_P (XEXP (x, 1)))
3129 type = ADDRESS_REG_UXTW;
3130 index = XEXP (XEXP (x, 0), 0);
3131 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3132 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3133 shift = -1;
3135 /* (mult:P (reg:P) (const_int scale)) */
3136 else if (GET_CODE (x) == MULT
3137 && GET_MODE (x) == Pmode
3138 && GET_MODE (XEXP (x, 0)) == Pmode
3139 && CONST_INT_P (XEXP (x, 1)))
3141 type = ADDRESS_REG_REG;
3142 index = XEXP (x, 0);
3143 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3145 /* (ashift:P (reg:P) (const_int shift)) */
3146 else if (GET_CODE (x) == ASHIFT
3147 && GET_MODE (x) == Pmode
3148 && GET_MODE (XEXP (x, 0)) == Pmode
3149 && CONST_INT_P (XEXP (x, 1)))
3151 type = ADDRESS_REG_REG;
3152 index = XEXP (x, 0);
3153 shift = INTVAL (XEXP (x, 1));
3155 else
3156 return false;
3158 if (GET_CODE (index) == SUBREG)
3159 index = SUBREG_REG (index);
3161 if ((shift == 0 ||
3162 (shift > 0 && shift <= 3
3163 && (1 << shift) == GET_MODE_SIZE (mode)))
3164 && REG_P (index)
3165 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3167 info->type = type;
3168 info->offset = index;
3169 info->shift = shift;
3170 return true;
3173 return false;
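/* For example, an index of the form
   (mult:DI (sign_extend:DI (reg:SI)) (const_int 8))
   is classified above as ADDRESS_REG_SXTW with shift 3 for an 8-byte
   access, corresponding to the [base, wN, sxtw 3] addressing form.  */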
3176 bool
3177 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3179 return (offset >= -64 * GET_MODE_SIZE (mode)
3180 && offset < 64 * GET_MODE_SIZE (mode)
3181 && offset % GET_MODE_SIZE (mode) == 0);
3184 static inline bool
3185 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3186 HOST_WIDE_INT offset)
3188 return offset >= -256 && offset < 256;
3191 static inline bool
3192 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3194 return (offset >= 0
3195 && offset < 4096 * GET_MODE_SIZE (mode)
3196 && offset % GET_MODE_SIZE (mode) == 0);
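/* For an 8-byte (DImode) access the three predicates above accept,
   respectively: multiples of 8 in [-512, 504], any offset in [-256, 255],
   and multiples of 8 in [0, 32760].  */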
3199 /* Return true if X is a valid address for machine mode MODE. If it is,
3200 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3201 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3203 static bool
3204 aarch64_classify_address (struct aarch64_address_info *info,
3205 rtx x, machine_mode mode,
3206 RTX_CODE outer_code, bool strict_p)
3208 enum rtx_code code = GET_CODE (x);
3209 rtx op0, op1;
3210 bool allow_reg_index_p =
3211 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3212 || aarch64_vector_mode_supported_p (mode));
3213 /* Don't support anything other than POST_INC or REG addressing for
3214 AdvSIMD. */
3215 if (aarch64_vect_struct_mode_p (mode)
3216 && (code != POST_INC && code != REG))
3217 return false;
3219 switch (code)
3221 case REG:
3222 case SUBREG:
3223 info->type = ADDRESS_REG_IMM;
3224 info->base = x;
3225 info->offset = const0_rtx;
3226 return aarch64_base_register_rtx_p (x, strict_p);
3228 case PLUS:
3229 op0 = XEXP (x, 0);
3230 op1 = XEXP (x, 1);
3232 if (! strict_p
3233 && REG_P (op0)
3234 && (op0 == virtual_stack_vars_rtx
3235 || op0 == frame_pointer_rtx
3236 || op0 == arg_pointer_rtx)
3237 && CONST_INT_P (op1))
3239 info->type = ADDRESS_REG_IMM;
3240 info->base = op0;
3241 info->offset = op1;
3243 return true;
3246 if (GET_MODE_SIZE (mode) != 0
3247 && CONST_INT_P (op1)
3248 && aarch64_base_register_rtx_p (op0, strict_p))
3250 HOST_WIDE_INT offset = INTVAL (op1);
3252 info->type = ADDRESS_REG_IMM;
3253 info->base = op0;
3254 info->offset = op1;
3256 /* TImode and TFmode values are allowed in both pairs of X
3257 registers and individual Q registers. The available
3258 address modes are:
3259 X,X: 7-bit signed scaled offset
3260 Q: 9-bit signed offset
3261 We conservatively require an offset representable in both modes.
3263 if (mode == TImode || mode == TFmode)
3264 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3265 && offset_9bit_signed_unscaled_p (mode, offset));
3267 if (outer_code == PARALLEL)
3268 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3269 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3270 else
3271 return (offset_9bit_signed_unscaled_p (mode, offset)
3272 || offset_12bit_unsigned_scaled_p (mode, offset));
3275 if (allow_reg_index_p)
3277 /* Look for base + (scaled/extended) index register. */
3278 if (aarch64_base_register_rtx_p (op0, strict_p)
3279 && aarch64_classify_index (info, op1, mode, strict_p))
3281 info->base = op0;
3282 return true;
3284 if (aarch64_base_register_rtx_p (op1, strict_p)
3285 && aarch64_classify_index (info, op0, mode, strict_p))
3287 info->base = op1;
3288 return true;
3292 return false;
3294 case POST_INC:
3295 case POST_DEC:
3296 case PRE_INC:
3297 case PRE_DEC:
3298 info->type = ADDRESS_REG_WB;
3299 info->base = XEXP (x, 0);
3300 info->offset = NULL_RTX;
3301 return aarch64_base_register_rtx_p (info->base, strict_p);
3303 case POST_MODIFY:
3304 case PRE_MODIFY:
3305 info->type = ADDRESS_REG_WB;
3306 info->base = XEXP (x, 0);
3307 if (GET_CODE (XEXP (x, 1)) == PLUS
3308 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3309 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3310 && aarch64_base_register_rtx_p (info->base, strict_p))
3312 HOST_WIDE_INT offset;
3313 info->offset = XEXP (XEXP (x, 1), 1);
3314 offset = INTVAL (info->offset);
3316 /* TImode and TFmode values are allowed in both pairs of X
3317 registers and individual Q registers. The available
3318 address modes are:
3319 X,X: 7-bit signed scaled offset
3320 Q: 9-bit signed offset
3321 We conservatively require an offset representable in both modes.
3323 if (mode == TImode || mode == TFmode)
3324 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3325 && offset_9bit_signed_unscaled_p (mode, offset));
3327 if (outer_code == PARALLEL)
3328 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3329 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3330 else
3331 return offset_9bit_signed_unscaled_p (mode, offset);
3333 return false;
3335 case CONST:
3336 case SYMBOL_REF:
3337 case LABEL_REF:
3338 /* load literal: pc-relative constant pool entry. Only supported
3339 for SI mode or larger. */
3340 info->type = ADDRESS_SYMBOLIC;
3341 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3343 rtx sym, addend;
3345 split_const (x, &sym, &addend);
3346 return (GET_CODE (sym) == LABEL_REF
3347 || (GET_CODE (sym) == SYMBOL_REF
3348 && CONSTANT_POOL_ADDRESS_P (sym)));
3350 return false;
3352 case LO_SUM:
3353 info->type = ADDRESS_LO_SUM;
3354 info->base = XEXP (x, 0);
3355 info->offset = XEXP (x, 1);
3356 if (allow_reg_index_p
3357 && aarch64_base_register_rtx_p (info->base, strict_p))
3359 rtx sym, offs;
3360 split_const (info->offset, &sym, &offs);
3361 if (GET_CODE (sym) == SYMBOL_REF
3362 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3363 == SYMBOL_SMALL_ABSOLUTE))
3365 /* The symbol and offset must be aligned to the access size. */
3366 unsigned int align;
3367 unsigned int ref_size;
3369 if (CONSTANT_POOL_ADDRESS_P (sym))
3370 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3371 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3373 tree exp = SYMBOL_REF_DECL (sym);
3374 align = TYPE_ALIGN (TREE_TYPE (exp));
3375 align = CONSTANT_ALIGNMENT (exp, align);
3377 else if (SYMBOL_REF_DECL (sym))
3378 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3379 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3380 && SYMBOL_REF_BLOCK (sym) != NULL)
3381 align = SYMBOL_REF_BLOCK (sym)->alignment;
3382 else
3383 align = BITS_PER_UNIT;
3385 ref_size = GET_MODE_SIZE (mode);
3386 if (ref_size == 0)
3387 ref_size = GET_MODE_SIZE (DImode);
3389 return ((INTVAL (offs) & (ref_size - 1)) == 0
3390 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3393 return false;
3395 default:
3396 return false;
3400 bool
3401 aarch64_symbolic_address_p (rtx x)
3403 rtx offset;
3405 split_const (x, &x, &offset);
3406 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3409 /* Classify the base of symbolic expression X, given that X appears in
3410 context CONTEXT. */
3412 enum aarch64_symbol_type
3413 aarch64_classify_symbolic_expression (rtx x,
3414 enum aarch64_symbol_context context)
3416 rtx offset;
3418 split_const (x, &x, &offset);
3419 return aarch64_classify_symbol (x, context);
3423 /* Return TRUE if X is a legitimate address for accessing memory in
3424 mode MODE. */
3425 static bool
3426 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3428 struct aarch64_address_info addr;
3430 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3433 /* Return TRUE if X is a legitimate address for accessing memory in
3434 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3435 pair operation. */
3436 bool
3437 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3438 RTX_CODE outer_code, bool strict_p)
3440 struct aarch64_address_info addr;
3442 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3445 /* Return TRUE if rtx X is the immediate constant 0.0. */
3446 bool
3447 aarch64_float_const_zero_rtx_p (rtx x)
3449 REAL_VALUE_TYPE r;
3451 if (GET_MODE (x) == VOIDmode)
3452 return false;
3454 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3455 if (REAL_VALUE_MINUS_ZERO (r))
3456 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3457 return REAL_VALUES_EQUAL (r, dconst0);
3460 /* Return the fixed registers used for condition codes. */
3462 static bool
3463 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3465 *p1 = CC_REGNUM;
3466 *p2 = INVALID_REGNUM;
3467 return true;
3470 /* Emit call insn with PAT and do aarch64-specific handling. */
3472 void
3473 aarch64_emit_call_insn (rtx pat)
3475 rtx insn = emit_call_insn (pat);
3477 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3478 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3479 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3482 machine_mode
3483 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3485 /* All floating point compares return CCFP if it is an equality
3486 comparison, and CCFPE otherwise. */
3487 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3489 switch (code)
3491 case EQ:
3492 case NE:
3493 case UNORDERED:
3494 case ORDERED:
3495 case UNLT:
3496 case UNLE:
3497 case UNGT:
3498 case UNGE:
3499 case UNEQ:
3500 case LTGT:
3501 return CCFPmode;
3503 case LT:
3504 case LE:
3505 case GT:
3506 case GE:
3507 return CCFPEmode;
3509 default:
3510 gcc_unreachable ();
3514 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3515 && y == const0_rtx
3516 && (code == EQ || code == NE || code == LT || code == GE)
3517 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3518 || GET_CODE (x) == NEG))
3519 return CC_NZmode;
3521 /* A compare with a shifted operand. Because of canonicalization,
3522 the comparison will have to be swapped when we emit the assembly
3523 code. */
3524 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3525 && (REG_P (y) || GET_CODE (y) == SUBREG)
3526 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3527 || GET_CODE (x) == LSHIFTRT
3528 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3529 return CC_SWPmode;
3531 /* Similarly for a negated operand, but we can only do this for
3532 equalities. */
3533 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3534 && (REG_P (y) || GET_CODE (y) == SUBREG)
3535 && (code == EQ || code == NE)
3536 && GET_CODE (x) == NEG)
3537 return CC_Zmode;
3539 /* A compare of a mode narrower than SI mode against zero can be done
3540 by extending the value in the comparison. */
3541 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3542 && y == const0_rtx)
3543 /* Only use sign-extension if we really need it. */
3544 return ((code == GT || code == GE || code == LE || code == LT)
3545 ? CC_SESWPmode : CC_ZESWPmode);
3547 /* For everything else, return CCmode. */
3548 return CCmode;
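/* For example, comparing (plus:DI x y) against zero for EQ, NE, LT or GE
   yields CC_NZmode above, which lets the comparison be folded into a
   flag-setting ADDS instead of a separate compare instruction.  */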
3552 aarch64_get_condition_code (rtx x)
3554 machine_mode mode = GET_MODE (XEXP (x, 0));
3555 enum rtx_code comp_code = GET_CODE (x);
3557 if (GET_MODE_CLASS (mode) != MODE_CC)
3558 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3560 switch (mode)
3562 case CCFPmode:
3563 case CCFPEmode:
3564 switch (comp_code)
3566 case GE: return AARCH64_GE;
3567 case GT: return AARCH64_GT;
3568 case LE: return AARCH64_LS;
3569 case LT: return AARCH64_MI;
3570 case NE: return AARCH64_NE;
3571 case EQ: return AARCH64_EQ;
3572 case ORDERED: return AARCH64_VC;
3573 case UNORDERED: return AARCH64_VS;
3574 case UNLT: return AARCH64_LT;
3575 case UNLE: return AARCH64_LE;
3576 case UNGT: return AARCH64_HI;
3577 case UNGE: return AARCH64_PL;
3578 default: return -1;
3580 break;
3582 case CCmode:
3583 switch (comp_code)
3585 case NE: return AARCH64_NE;
3586 case EQ: return AARCH64_EQ;
3587 case GE: return AARCH64_GE;
3588 case GT: return AARCH64_GT;
3589 case LE: return AARCH64_LE;
3590 case LT: return AARCH64_LT;
3591 case GEU: return AARCH64_CS;
3592 case GTU: return AARCH64_HI;
3593 case LEU: return AARCH64_LS;
3594 case LTU: return AARCH64_CC;
3595 default: return -1;
3597 break;
3599 case CC_SWPmode:
3600 case CC_ZESWPmode:
3601 case CC_SESWPmode:
3602 switch (comp_code)
3604 case NE: return AARCH64_NE;
3605 case EQ: return AARCH64_EQ;
3606 case GE: return AARCH64_LE;
3607 case GT: return AARCH64_LT;
3608 case LE: return AARCH64_GE;
3609 case LT: return AARCH64_GT;
3610 case GEU: return AARCH64_LS;
3611 case GTU: return AARCH64_CC;
3612 case LEU: return AARCH64_CS;
3613 case LTU: return AARCH64_HI;
3614 default: return -1;
3616 break;
3618 case CC_NZmode:
3619 switch (comp_code)
3621 case NE: return AARCH64_NE;
3622 case EQ: return AARCH64_EQ;
3623 case GE: return AARCH64_PL;
3624 case LT: return AARCH64_MI;
3625 default: return -1;
3627 break;
3629 case CC_Zmode:
3630 switch (comp_code)
3632 case NE: return AARCH64_NE;
3633 case EQ: return AARCH64_EQ;
3634 default: return -1;
3636 break;
3638 default:
3639 return -1;
3640 break;
3644 bool
3645 aarch64_const_vec_all_same_in_range_p (rtx x,
3646 HOST_WIDE_INT minval,
3647 HOST_WIDE_INT maxval)
3649 HOST_WIDE_INT firstval;
3650 int count, i;
3652 if (GET_CODE (x) != CONST_VECTOR
3653 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3654 return false;
3656 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3657 if (firstval < minval || firstval > maxval)
3658 return false;
3660 count = CONST_VECTOR_NUNITS (x);
3661 for (i = 1; i < count; i++)
3662 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3663 return false;
3665 return true;
3668 bool
3669 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3671 return aarch64_const_vec_all_same_in_range_p (x, val, val);
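/* Return the number of bits set in VALUE.  Each iteration of the loop below
   clears the lowest set bit (value &= value - 1), so the loop runs once per
   set bit.  */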
3674 static unsigned
3675 bit_count (unsigned HOST_WIDE_INT value)
3677 unsigned count = 0;
3679 while (value)
3681 count++;
3682 value &= value - 1;
3685 return count;
3688 void
3689 aarch64_print_operand (FILE *f, rtx x, char code)
3691 switch (code)
3693 /* An integer or symbol address without a preceding # sign. */
3694 case 'c':
3695 switch (GET_CODE (x))
3697 case CONST_INT:
3698 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3699 break;
3701 case SYMBOL_REF:
3702 output_addr_const (f, x);
3703 break;
3705 case CONST:
3706 if (GET_CODE (XEXP (x, 0)) == PLUS
3707 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3709 output_addr_const (f, x);
3710 break;
3712 /* Fall through. */
3714 default:
3715 output_operand_lossage ("Unsupported operand for code '%c'", code);
3717 break;
3719 case 'e':
3720 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3722 int n;
3724 if (!CONST_INT_P (x)
3725 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3727 output_operand_lossage ("invalid operand for '%%%c'", code);
3728 return;
3731 switch (n)
3733 case 3:
3734 fputc ('b', f);
3735 break;
3736 case 4:
3737 fputc ('h', f);
3738 break;
3739 case 5:
3740 fputc ('w', f);
3741 break;
3742 default:
3743 output_operand_lossage ("invalid operand for '%%%c'", code);
3744 return;
3747 break;
3749 case 'p':
3751 int n;
3753 /* Print N such that 2^N == X. */
3754 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
3756 output_operand_lossage ("invalid operand for '%%%c'", code);
3757 return;
3760 asm_fprintf (f, "%d", n);
3762 break;
3764 case 'P':
3765 /* Print the number of non-zero bits in X (a const_int). */
3766 if (!CONST_INT_P (x))
3768 output_operand_lossage ("invalid operand for '%%%c'", code);
3769 return;
3772 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3773 break;
3775 case 'H':
3776 /* Print the higher numbered register of a pair (TImode) of regs. */
3777 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
3779 output_operand_lossage ("invalid operand for '%%%c'", code);
3780 return;
3783 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3784 break;
3786 case 'm':
3788 int cond_code;
3789 /* Print a condition (eq, ne, etc). */
3791 /* CONST_TRUE_RTX means always -- that's the default. */
3792 if (x == const_true_rtx)
3793 return;
3795 if (!COMPARISON_P (x))
3797 output_operand_lossage ("invalid operand for '%%%c'", code);
3798 return;
3801 cond_code = aarch64_get_condition_code (x);
3802 gcc_assert (cond_code >= 0);
3803 fputs (aarch64_condition_codes[cond_code], f);
3805 break;
3807 case 'M':
3809 int cond_code;
3810 /* Print the inverse of a condition (eq <-> ne, etc). */
3812 /* CONST_TRUE_RTX means never -- that's the default. */
3813 if (x == const_true_rtx)
3815 fputs ("nv", f);
3816 return;
3819 if (!COMPARISON_P (x))
3821 output_operand_lossage ("invalid operand for '%%%c'", code);
3822 return;
3824 cond_code = aarch64_get_condition_code (x);
3825 gcc_assert (cond_code >= 0);
3826 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3827 (cond_code)], f);
3829 break;
3831 case 'b':
3832 case 'h':
3833 case 's':
3834 case 'd':
3835 case 'q':
3836 /* Print a scalar FP/SIMD register name. */
3837 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3839 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3840 return;
3842 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3843 break;
3845 case 'S':
3846 case 'T':
3847 case 'U':
3848 case 'V':
3849 /* Print an FP/SIMD register name from a register list; 'S' prints the first register, 'T', 'U' and 'V' the following ones. */
3850 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3852 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3853 return;
3855 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3856 break;
3858 case 'X':
3859 /* Print bottom 16 bits of integer constant in hex. */
3860 if (!CONST_INT_P (x))
3862 output_operand_lossage ("invalid operand for '%%%c'", code);
3863 return;
3865 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3866 break;
3868 case 'w':
3869 case 'x':
3870 /* Print a general register name or the zero register (32-bit or
3871 64-bit). */
3872 if (x == const0_rtx
3873 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3875 asm_fprintf (f, "%czr", code);
3876 break;
3879 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3881 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3882 break;
3885 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3887 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3888 break;
3891 /* Fall through */
3893 case 0:
3894 /* Print a normal operand, if it's a general register, then we
3895 assume DImode. */
3896 if (x == NULL)
3898 output_operand_lossage ("missing operand");
3899 return;
3902 switch (GET_CODE (x))
3904 case REG:
3905 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3906 break;
3908 case MEM:
3909 aarch64_memory_reference_mode = GET_MODE (x);
3910 output_address (XEXP (x, 0));
3911 break;
3913 case LABEL_REF:
3914 case SYMBOL_REF:
3915 output_addr_const (asm_out_file, x);
3916 break;
3918 case CONST_INT:
3919 asm_fprintf (f, "%wd", INTVAL (x));
3920 break;
3922 case CONST_VECTOR:
3923 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3925 gcc_assert (
3926 aarch64_const_vec_all_same_in_range_p (x,
3927 HOST_WIDE_INT_MIN,
3928 HOST_WIDE_INT_MAX));
3929 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3931 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3933 fputc ('0', f);
3935 else
3936 gcc_unreachable ();
3937 break;
3939 case CONST_DOUBLE:
3940 /* CONST_DOUBLE can represent a double-width integer.
3941 In this case, the mode of x is VOIDmode. */
3942 if (GET_MODE (x) == VOIDmode)
3943 ; /* Do Nothing. */
3944 else if (aarch64_float_const_zero_rtx_p (x))
3946 fputc ('0', f);
3947 break;
3949 else if (aarch64_float_const_representable_p (x))
3951 #define buf_size 20
3952 char float_buf[buf_size] = {'\0'};
3953 REAL_VALUE_TYPE r;
3954 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3955 real_to_decimal_for_mode (float_buf, &r,
3956 buf_size, buf_size,
3957 1, GET_MODE (x));
3958 asm_fprintf (asm_out_file, "%s", float_buf);
3959 break;
3960 #undef buf_size
3962 output_operand_lossage ("invalid constant");
3963 return;
3964 default:
3965 output_operand_lossage ("invalid operand");
3966 return;
3968 break;
3970 case 'A':
3971 if (GET_CODE (x) == HIGH)
3972 x = XEXP (x, 0);
3974 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3976 case SYMBOL_SMALL_GOT:
3977 asm_fprintf (asm_out_file, ":got:");
3978 break;
3980 case SYMBOL_SMALL_TLSGD:
3981 asm_fprintf (asm_out_file, ":tlsgd:");
3982 break;
3984 case SYMBOL_SMALL_TLSDESC:
3985 asm_fprintf (asm_out_file, ":tlsdesc:");
3986 break;
3988 case SYMBOL_SMALL_GOTTPREL:
3989 asm_fprintf (asm_out_file, ":gottprel:");
3990 break;
3992 case SYMBOL_SMALL_TPREL:
3993 asm_fprintf (asm_out_file, ":tprel:");
3994 break;
3996 case SYMBOL_TINY_GOT:
3997 gcc_unreachable ();
3998 break;
4000 default:
4001 break;
4003 output_addr_const (asm_out_file, x);
4004 break;
4006 case 'L':
4007 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4009 case SYMBOL_SMALL_GOT:
4010 asm_fprintf (asm_out_file, ":lo12:");
4011 break;
4013 case SYMBOL_SMALL_TLSGD:
4014 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4015 break;
4017 case SYMBOL_SMALL_TLSDESC:
4018 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4019 break;
4021 case SYMBOL_SMALL_GOTTPREL:
4022 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4023 break;
4025 case SYMBOL_SMALL_TPREL:
4026 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4027 break;
4029 case SYMBOL_TINY_GOT:
4030 asm_fprintf (asm_out_file, ":got:");
4031 break;
4033 default:
4034 break;
4036 output_addr_const (asm_out_file, x);
4037 break;
4039 case 'G':
4041 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4043 case SYMBOL_SMALL_TPREL:
4044 asm_fprintf (asm_out_file, ":tprel_hi12:");
4045 break;
4046 default:
4047 break;
4049 output_addr_const (asm_out_file, x);
4050 break;
4052 default:
4053 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4054 return;
4058 void
4059 aarch64_print_operand_address (FILE *f, rtx x)
4061 struct aarch64_address_info addr;
4063 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4064 MEM, true))
4065 switch (addr.type)
4067 case ADDRESS_REG_IMM:
4068 if (addr.offset == const0_rtx)
4069 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4070 else
4071 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4072 INTVAL (addr.offset));
4073 return;
4075 case ADDRESS_REG_REG:
4076 if (addr.shift == 0)
4077 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4078 reg_names [REGNO (addr.offset)]);
4079 else
4080 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4081 reg_names [REGNO (addr.offset)], addr.shift);
4082 return;
4084 case ADDRESS_REG_UXTW:
4085 if (addr.shift == 0)
4086 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4087 REGNO (addr.offset) - R0_REGNUM);
4088 else
4089 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4090 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4091 return;
4093 case ADDRESS_REG_SXTW:
4094 if (addr.shift == 0)
4095 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4096 REGNO (addr.offset) - R0_REGNUM);
4097 else
4098 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4099 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4100 return;
4102 case ADDRESS_REG_WB:
4103 switch (GET_CODE (x))
4105 case PRE_INC:
4106 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4107 GET_MODE_SIZE (aarch64_memory_reference_mode));
4108 return;
4109 case POST_INC:
4110 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4111 GET_MODE_SIZE (aarch64_memory_reference_mode));
4112 return;
4113 case PRE_DEC:
4114 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4115 GET_MODE_SIZE (aarch64_memory_reference_mode));
4116 return;
4117 case POST_DEC:
4118 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4119 GET_MODE_SIZE (aarch64_memory_reference_mode));
4120 return;
4121 case PRE_MODIFY:
4122 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4123 INTVAL (addr.offset));
4124 return;
4125 case POST_MODIFY:
4126 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4127 INTVAL (addr.offset));
4128 return;
4129 default:
4130 break;
4132 break;
4134 case ADDRESS_LO_SUM:
4135 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4136 output_addr_const (f, addr.offset);
4137 asm_fprintf (f, "]");
4138 return;
4140 case ADDRESS_SYMBOLIC:
4141 break;
4144 output_addr_const (f, x);
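/* For example, an ADDRESS_REG_IMM address with base x0 and offset 16 is
   printed above as "[x0, 16]", while a PRE_MODIFY writeback address with
   the same base and offset is printed as "[x0, 16]!".  */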
4147 bool
4148 aarch64_label_mentioned_p (rtx x)
4150 const char *fmt;
4151 int i;
4153 if (GET_CODE (x) == LABEL_REF)
4154 return true;
4156 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4157 referencing instruction, but they are constant offsets, not
4158 symbols. */
4159 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4160 return false;
4162 fmt = GET_RTX_FORMAT (GET_CODE (x));
4163 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4165 if (fmt[i] == 'E')
4167 int j;
4169 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4170 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4171 return 1;
4173 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4174 return 1;
4177 return 0;
4180 /* Implement REGNO_REG_CLASS. */
4182 enum reg_class
4183 aarch64_regno_regclass (unsigned regno)
4185 if (GP_REGNUM_P (regno))
4186 return GENERAL_REGS;
4188 if (regno == SP_REGNUM)
4189 return STACK_REG;
4191 if (regno == FRAME_POINTER_REGNUM
4192 || regno == ARG_POINTER_REGNUM)
4193 return POINTER_REGS;
4195 if (FP_REGNUM_P (regno))
4196 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4198 return NO_REGS;
4201 static rtx
4202 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4204 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4205 where mask is selected by alignment and size of the offset.
4206 We try to pick as large a range for the offset as possible to
4207 maximize the chance of a CSE. However, for aligned addresses
4208 we limit the range to 4k so that structures with different sized
4209 elements are likely to use the same base. */
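/* Illustrative example, not from the original comment: for an SImode
   access at X + 0x4568 the offset is a multiple of 4, so base_offset
   becomes 0x4000 and the residual 0x568 fits the scaled 12-bit field;
   for the misaligned X + 0x4567, base_offset is rounded to 0x4600 and
   the residual -0x99 stays within the unscaled -256..255 range.  */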
4211 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4213 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4214 HOST_WIDE_INT base_offset;
4216 /* Does it look like we'll need a load/store-pair operation? */
4217 if (GET_MODE_SIZE (mode) > 16
4218 || mode == TImode)
4219 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4220 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4221 /* For offsets that aren't a multiple of the access size, the limit is
4222 -256...255. */
4223 else if (offset & (GET_MODE_SIZE (mode) - 1))
4224 base_offset = (offset + 0x100) & ~0x1ff;
4225 else
4226 base_offset = offset & ~0xfff;
4228 if (base_offset == 0)
4229 return x;
4231 offset -= base_offset;
4232 rtx base_reg = gen_reg_rtx (Pmode);
4233 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4234 NULL_RTX);
4235 emit_move_insn (base_reg, val);
4236 x = plus_constant (Pmode, base_reg, offset);
4239 return x;
4242 /* Try a machine-dependent way of reloading an illegitimate address
4243 operand. If we find one, push the reload and return the new rtx. */
4246 aarch64_legitimize_reload_address (rtx *x_p,
4247 machine_mode mode,
4248 int opnum, int type,
4249 int ind_levels ATTRIBUTE_UNUSED)
4251 rtx x = *x_p;
4253 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4254 if (aarch64_vect_struct_mode_p (mode)
4255 && GET_CODE (x) == PLUS
4256 && REG_P (XEXP (x, 0))
4257 && CONST_INT_P (XEXP (x, 1)))
4259 rtx orig_rtx = x;
4260 x = copy_rtx (x);
4261 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4262 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4263 opnum, (enum reload_type) type);
4264 return x;
4267 /* We must recognize output that we have already generated ourselves. */
4268 if (GET_CODE (x) == PLUS
4269 && GET_CODE (XEXP (x, 0)) == PLUS
4270 && REG_P (XEXP (XEXP (x, 0), 0))
4271 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4272 && CONST_INT_P (XEXP (x, 1)))
4274 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4275 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4276 opnum, (enum reload_type) type);
4277 return x;
4280 /* We wish to handle large displacements off a base register by splitting
4281 the addend across an add and the mem insn. This can cut the number of
4282 extra insns needed from 3 to 1. It is only useful for load/store of a
4283 single register with 12 bit offset field. */
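/* A rough worked example (illustrative, with a scratch register chosen by
   reload): reloading (plus (reg x0) (const_int 0x12344)) for an SImode
   access splits into high = 0x12000 and low = 0x344; the high part becomes
   a single "add xN, x0, #0x12, lsl #12" reload while the low part stays in
   the 12-bit offset field of the ldr/str, instead of materializing the
   whole constant separately.  */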
4284 if (GET_CODE (x) == PLUS
4285 && REG_P (XEXP (x, 0))
4286 && CONST_INT_P (XEXP (x, 1))
4287 && HARD_REGISTER_P (XEXP (x, 0))
4288 && mode != TImode
4289 && mode != TFmode
4290 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4292 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4293 HOST_WIDE_INT low = val & 0xfff;
4294 HOST_WIDE_INT high = val - low;
4295 HOST_WIDE_INT offs;
4296 rtx cst;
4297 machine_mode xmode = GET_MODE (x);
4299 /* In ILP32, xmode can be either DImode or SImode. */
4300 gcc_assert (xmode == DImode || xmode == SImode);
4302 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4303 BLKmode alignment. */
4304 if (GET_MODE_SIZE (mode) == 0)
4305 return NULL_RTX;
4307 offs = low % GET_MODE_SIZE (mode);
4309 /* Align misaligned offset by adjusting high part to compensate. */
4310 if (offs != 0)
4312 if (aarch64_uimm12_shift (high + offs))
4314 /* Align down. */
4315 low = low - offs;
4316 high = high + offs;
4318 else
4320 /* Align up. */
4321 offs = GET_MODE_SIZE (mode) - offs;
4322 low = low + offs;
4323 high = high + (low & 0x1000) - offs;
4324 low &= 0xfff;
4328 /* Check for overflow. */
4329 if (high + low != val)
4330 return NULL_RTX;
4332 cst = GEN_INT (high);
4333 if (!aarch64_uimm12_shift (high))
4334 cst = force_const_mem (xmode, cst);
4336 /* Reload high part into base reg, leaving the low part
4337 in the mem instruction.
4338 Note that replacing this gen_rtx_PLUS with plus_constant is
4339 wrong in this case because we rely on the
4340 (plus (plus reg c1) c2) structure being preserved so that
4341 XEXP (*p, 0) in push_reload below uses the correct term. */
4342 x = gen_rtx_PLUS (xmode,
4343 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4344 GEN_INT (low));
4346 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4347 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4348 opnum, (enum reload_type) type);
4349 return x;
4352 return NULL_RTX;
4356 static reg_class_t
4357 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4358 reg_class_t rclass,
4359 machine_mode mode,
4360 secondary_reload_info *sri)
4362 /* Without the TARGET_SIMD instructions we cannot move a Q register
4363 to a Q register directly. We need a scratch. */
4364 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4365 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4366 && reg_class_subset_p (rclass, FP_REGS))
4368 if (mode == TFmode)
4369 sri->icode = CODE_FOR_aarch64_reload_movtf;
4370 else if (mode == TImode)
4371 sri->icode = CODE_FOR_aarch64_reload_movti;
4372 return NO_REGS;
4375 /* A TFmode or TImode memory access should be handled via an FP_REG
4376 because AArch64 has richer addressing modes for LDR/STR instructions
4377 than LDP/STP instructions. */
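/* For instance (illustrative numbers): a TImode load at offset 1008 can be
   a single "ldr q0, [x0, 1008]", whereas an LDP of two X registers only
   reaches a signed, 8-byte-scaled offset of -512..504 and would need the
   address computed separately.  */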
4378 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4379 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4380 return FP_REGS;
4382 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4383 return GENERAL_REGS;
4385 return NO_REGS;
4388 static bool
4389 aarch64_can_eliminate (const int from, const int to)
4391 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4392 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4394 if (frame_pointer_needed)
4396 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4397 return true;
4398 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4399 return false;
4400 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4401 && !cfun->calls_alloca)
4402 return true;
4403 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4404 return true;
4406 return false;
4409 return true;
4412 HOST_WIDE_INT
4413 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4415 aarch64_layout_frame ();
4417 if (to == HARD_FRAME_POINTER_REGNUM)
4419 if (from == ARG_POINTER_REGNUM)
4420 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4422 if (from == FRAME_POINTER_REGNUM)
4423 return (cfun->machine->frame.hard_fp_offset
4424 - cfun->machine->frame.saved_varargs_size);
4427 if (to == STACK_POINTER_REGNUM)
4429 if (from == FRAME_POINTER_REGNUM)
4430 return (cfun->machine->frame.frame_size
4431 - cfun->machine->frame.saved_varargs_size);
4434 return cfun->machine->frame.frame_size;
4437 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4438 previous frame. */
4441 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4443 if (count != 0)
4444 return const0_rtx;
4445 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4449 static void
4450 aarch64_asm_trampoline_template (FILE *f)
4452 if (TARGET_ILP32)
4454 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4455 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4457 else
4459 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4460 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4462 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4463 assemble_aligned_integer (4, const0_rtx);
4464 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4465 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4468 static void
4469 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4471 rtx fnaddr, mem, a_tramp;
4472 const int tramp_code_sz = 16;
4474 /* Don't need to copy the trailing D-words, we fill those in below. */
4475 emit_block_move (m_tramp, assemble_trampoline_template (),
4476 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4477 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4478 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4479 if (GET_MODE (fnaddr) != ptr_mode)
4480 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4481 emit_move_insn (mem, fnaddr);
4483 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4484 emit_move_insn (mem, chain_value);
4486 /* XXX We should really define a "clear_cache" pattern and use
4487 gen_clear_cache(). */
4488 a_tramp = XEXP (m_tramp, 0);
4489 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4490 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4491 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4492 ptr_mode);
4495 static unsigned char
4496 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4498 switch (regclass)
4500 case CALLER_SAVE_REGS:
4501 case POINTER_REGS:
4502 case GENERAL_REGS:
4503 case ALL_REGS:
4504 case FP_REGS:
4505 case FP_LO_REGS:
4506 return
4507 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4508 (GET_MODE_SIZE (mode) + 7) / 8;
4509 case STACK_REG:
4510 return 1;
4512 case NO_REGS:
4513 return 0;
4515 default:
4516 break;
4518 gcc_unreachable ();
4521 static reg_class_t
4522 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4524 if (regclass == POINTER_REGS)
4525 return GENERAL_REGS;
4527 if (regclass == STACK_REG)
4529 if (REG_P(x)
4530 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4531 return regclass;
4533 return NO_REGS;
4536 /* If it's an integer immediate that MOVI can't handle, then
4537 FP_REGS is not an option, so we return NO_REGS instead. */
4538 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4539 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4540 return NO_REGS;
4542 /* Register elimination can result in a request for
4543 SP+constant->FP_REGS. We cannot support such operations, which
4544 use SP as source and an FP_REG as destination, so reject them
4545 outright. */
4546 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4548 rtx lhs = XEXP (x, 0);
4550 /* Look through a possible SUBREG introduced by ILP32. */
4551 if (GET_CODE (lhs) == SUBREG)
4552 lhs = SUBREG_REG (lhs);
4554 gcc_assert (REG_P (lhs));
4555 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4556 POINTER_REGS));
4557 return NO_REGS;
4560 return regclass;
4563 void
4564 aarch64_asm_output_labelref (FILE* f, const char *name)
4566 asm_fprintf (f, "%U%s", name);
4569 static void
4570 aarch64_elf_asm_constructor (rtx symbol, int priority)
4572 if (priority == DEFAULT_INIT_PRIORITY)
4573 default_ctor_section_asm_out_constructor (symbol, priority);
4574 else
4576 section *s;
4577 char buf[18];
4578 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4579 s = get_section (buf, SECTION_WRITE, NULL);
4580 switch_to_section (s);
4581 assemble_align (POINTER_SIZE);
4582 assemble_aligned_integer (POINTER_BYTES, symbol);
4586 static void
4587 aarch64_elf_asm_destructor (rtx symbol, int priority)
4589 if (priority == DEFAULT_INIT_PRIORITY)
4590 default_dtor_section_asm_out_destructor (symbol, priority);
4591 else
4593 section *s;
4594 char buf[18];
4595 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4596 s = get_section (buf, SECTION_WRITE, NULL);
4597 switch_to_section (s);
4598 assemble_align (POINTER_SIZE);
4599 assemble_aligned_integer (POINTER_BYTES, symbol);
4603 const char*
4604 aarch64_output_casesi (rtx *operands)
4606 char buf[100];
4607 char label[100];
4608 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
4609 int index;
4610 static const char *const patterns[4][2] =
4613 "ldrb\t%w3, [%0,%w1,uxtw]",
4614 "add\t%3, %4, %w3, sxtb #2"
4617 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4618 "add\t%3, %4, %w3, sxth #2"
4621 "ldr\t%w3, [%0,%w1,uxtw #2]",
4622 "add\t%3, %4, %w3, sxtw #2"
4624 /* We assume that DImode is only generated when not optimizing and
4625 that we don't really need 64-bit address offsets. That would
4626 imply an object file with 8GB of code in a single function! */
4628 "ldr\t%w3, [%0,%w1,uxtw #2]",
4629 "add\t%3, %4, %w3, sxtw #2"
4633 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4635 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4637 gcc_assert (index >= 0 && index <= 3);
4639 /* Need to implement table size reduction, by changing the code below. */
4640 output_asm_insn (patterns[index][0], operands);
4641 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4642 snprintf (buf, sizeof (buf),
4643 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4644 output_asm_insn (buf, operands);
4645 output_asm_insn (patterns[index][1], operands);
4646 output_asm_insn ("br\t%3", operands);
4647 assemble_label (asm_out_file, label);
4648 return "";
4652 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4653 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4654 operator. */
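/* For example (illustrative): aarch64_uxt_size (2, 0x3fc) returns 8 because
   0xff << 2 == 0x3fc, i.e. the operand behaves like a UXTB scaled by 4, and
   aarch64_uxt_size (0, 0xffffffff) returns 32 (a plain UXTW).  */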
4657 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4659 if (shift >= 0 && shift <= 3)
4661 int size;
4662 for (size = 8; size <= 32; size *= 2)
4664 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4665 if (mask == bits << shift)
4666 return size;
4669 return 0;
4672 static bool
4673 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
4674 const_rtx x ATTRIBUTE_UNUSED)
4676 /* We can't use blocks for constants when we're using a per-function
4677 constant pool. */
4678 return false;
4681 static section *
4682 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
4683 rtx x ATTRIBUTE_UNUSED,
4684 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4686 /* Force all constant pool entries into the current function section. */
4687 return function_section (current_function_decl);
4691 /* Costs. */
4693 /* Helper function for rtx cost calculation. Strip a shift expression
4694 from X. Returns the inner operand if successful, or the original
4695 expression on failure. */
4696 static rtx
4697 aarch64_strip_shift (rtx x)
4699 rtx op = x;
4701 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4702 we can convert both to ROR during final output. */
4703 if ((GET_CODE (op) == ASHIFT
4704 || GET_CODE (op) == ASHIFTRT
4705 || GET_CODE (op) == LSHIFTRT
4706 || GET_CODE (op) == ROTATERT
4707 || GET_CODE (op) == ROTATE)
4708 && CONST_INT_P (XEXP (op, 1)))
4709 return XEXP (op, 0);
4711 if (GET_CODE (op) == MULT
4712 && CONST_INT_P (XEXP (op, 1))
4713 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4714 return XEXP (op, 0);
4716 return x;
4719 /* Helper function for rtx cost calculation. Strip an extend
4720 expression from X. Returns the inner operand if successful, or the
4721 original expression on failure. We deal with a number of possible
4722 canonicalization variations here. */
4723 static rtx
4724 aarch64_strip_extend (rtx x)
4726 rtx op = x;
4728 /* Zero and sign extraction of a widened value. */
4729 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4730 && XEXP (op, 2) == const0_rtx
4731 && GET_CODE (XEXP (op, 0)) == MULT
4732 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4733 XEXP (op, 1)))
4734 return XEXP (XEXP (op, 0), 0);
4736 /* It can also be represented (for zero-extend) as an AND with an
4737 immediate. */
4738 if (GET_CODE (op) == AND
4739 && GET_CODE (XEXP (op, 0)) == MULT
4740 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4741 && CONST_INT_P (XEXP (op, 1))
4742 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4743 INTVAL (XEXP (op, 1))) != 0)
4744 return XEXP (XEXP (op, 0), 0);
4746 /* Now handle extended register, as this may also have an optional
4747 left shift by 1..4. */
4748 if (GET_CODE (op) == ASHIFT
4749 && CONST_INT_P (XEXP (op, 1))
4750 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4751 op = XEXP (op, 0);
4753 if (GET_CODE (op) == ZERO_EXTEND
4754 || GET_CODE (op) == SIGN_EXTEND)
4755 op = XEXP (op, 0);
4757 if (op != x)
4758 return op;
4760 return x;
4763 /* Helper function for rtx cost calculation. Calculate the cost of
4764 a MULT, which may be part of a multiply-accumulate rtx. Return
4765 the calculated cost of the expression, recursing manually in to
4766 operands where needed. */
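/* For example (a sketch, not exhaustive): (plus (mult (reg) (reg)) (reg))
   reaches here with OUTER == PLUS and is costed as a single MADD, while a
   bare (mult (reg) (const_int 4)) is costed as the LSL immediate it will be
   canonicalized into.  */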
4768 static int
4769 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4771 rtx op0, op1;
4772 const struct cpu_cost_table *extra_cost
4773 = aarch64_tune_params->insn_extra_cost;
4774 int cost = 0;
4775 bool maybe_fma = (outer == PLUS || outer == MINUS);
4776 machine_mode mode = GET_MODE (x);
4778 gcc_checking_assert (code == MULT);
4780 op0 = XEXP (x, 0);
4781 op1 = XEXP (x, 1);
4783 if (VECTOR_MODE_P (mode))
4784 mode = GET_MODE_INNER (mode);
4786 /* Integer multiply/fma. */
4787 if (GET_MODE_CLASS (mode) == MODE_INT)
4789 /* The multiply will be canonicalized as a shift, cost it as such. */
4790 if (CONST_INT_P (op1)
4791 && exact_log2 (INTVAL (op1)) > 0)
4793 if (speed)
4795 if (maybe_fma)
4796 /* ADD (shifted register). */
4797 cost += extra_cost->alu.arith_shift;
4798 else
4799 /* LSL (immediate). */
4800 cost += extra_cost->alu.shift;
4803 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4805 return cost;
4808 /* Integer multiplies or FMAs have zero/sign extending variants. */
4809 if ((GET_CODE (op0) == ZERO_EXTEND
4810 && GET_CODE (op1) == ZERO_EXTEND)
4811 || (GET_CODE (op0) == SIGN_EXTEND
4812 && GET_CODE (op1) == SIGN_EXTEND))
4814 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4815 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4817 if (speed)
4819 if (maybe_fma)
4820 /* MADD/SMADDL/UMADDL. */
4821 cost += extra_cost->mult[0].extend_add;
4822 else
4823 /* MUL/SMULL/UMULL. */
4824 cost += extra_cost->mult[0].extend;
4827 return cost;
4830 /* This is either an integer multiply or an FMA. In both cases
4831 we want to recurse and cost the operands. */
4832 cost += rtx_cost (op0, MULT, 0, speed)
4833 + rtx_cost (op1, MULT, 1, speed);
4835 if (speed)
4837 if (maybe_fma)
4838 /* MADD. */
4839 cost += extra_cost->mult[mode == DImode].add;
4840 else
4841 /* MUL. */
4842 cost += extra_cost->mult[mode == DImode].simple;
4845 return cost;
4847 else
4849 if (speed)
4851 /* Floating-point FMA/FMUL can also support negations of the
4852 operands. */
4853 if (GET_CODE (op0) == NEG)
4854 op0 = XEXP (op0, 0);
4855 if (GET_CODE (op1) == NEG)
4856 op1 = XEXP (op1, 0);
4858 if (maybe_fma)
4859 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4860 cost += extra_cost->fp[mode == DFmode].fma;
4861 else
4862 /* FMUL/FNMUL. */
4863 cost += extra_cost->fp[mode == DFmode].mult;
4866 cost += rtx_cost (op0, MULT, 0, speed)
4867 + rtx_cost (op1, MULT, 1, speed);
4868 return cost;
4872 static int
4873 aarch64_address_cost (rtx x,
4874 machine_mode mode,
4875 addr_space_t as ATTRIBUTE_UNUSED,
4876 bool speed)
4878 enum rtx_code c = GET_CODE (x);
4879 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4880 struct aarch64_address_info info;
4881 int cost = 0;
4882 info.shift = 0;
4884 if (!aarch64_classify_address (&info, x, mode, c, false))
4886 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4888 /* This is a CONST or SYMBOL ref which will be split
4889 in a different way depending on the code model in use.
4890 Cost it through the generic infrastructure. */
4891 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4892 /* Divide through by the cost of one instruction to
4893 bring it to the same units as the address costs. */
4894 cost_symbol_ref /= COSTS_N_INSNS (1);
4895 /* The cost is then the cost of preparing the address,
4896 followed by an immediate (possibly 0) offset. */
4897 return cost_symbol_ref + addr_cost->imm_offset;
4899 else
4901 /* This is most likely a jump table from a case
4902 statement. */
4903 return addr_cost->register_offset;
4907 switch (info.type)
4909 case ADDRESS_LO_SUM:
4910 case ADDRESS_SYMBOLIC:
4911 case ADDRESS_REG_IMM:
4912 cost += addr_cost->imm_offset;
4913 break;
4915 case ADDRESS_REG_WB:
4916 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4917 cost += addr_cost->pre_modify;
4918 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4919 cost += addr_cost->post_modify;
4920 else
4921 gcc_unreachable ();
4923 break;
4925 case ADDRESS_REG_REG:
4926 cost += addr_cost->register_offset;
4927 break;
4929 case ADDRESS_REG_UXTW:
4930 case ADDRESS_REG_SXTW:
4931 cost += addr_cost->register_extend;
4932 break;
4934 default:
4935 gcc_unreachable ();
4939 if (info.shift > 0)
4941 /* For the sake of calculating the cost of the shifted register
4942 component, we can treat same sized modes in the same way. */
4943 switch (GET_MODE_BITSIZE (mode))
4945 case 16:
4946 cost += addr_cost->addr_scale_costs.hi;
4947 break;
4949 case 32:
4950 cost += addr_cost->addr_scale_costs.si;
4951 break;
4953 case 64:
4954 cost += addr_cost->addr_scale_costs.di;
4955 break;
4957 /* We can't tell, or this is a 128-bit vector. */
4958 default:
4959 cost += addr_cost->addr_scale_costs.ti;
4960 break;
4964 return cost;
4967 /* Return true if the RTX X in mode MODE is a zero or sign extract
4968 usable in an ADD or SUB (extended register) instruction. */
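/* The shape being matched is, roughly, an operand such as
   (sign_extract:DI (mult:DI (reg:DI) (const_int 4))
                    (const_int 34) (const_int 0))
   which behaves like the "w-register, sxtw #2" form of an extended-register
   ADD/SUB operand; this example is illustrative rather than quoted from the
   machine description.  */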
4969 static bool
4970 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
4972 /* Catch add with a sign extract.
4973 This is add_<optab><mode>_multp2. */
4974 if (GET_CODE (x) == SIGN_EXTRACT
4975 || GET_CODE (x) == ZERO_EXTRACT)
4977 rtx op0 = XEXP (x, 0);
4978 rtx op1 = XEXP (x, 1);
4979 rtx op2 = XEXP (x, 2);
4981 if (GET_CODE (op0) == MULT
4982 && CONST_INT_P (op1)
4983 && op2 == const0_rtx
4984 && CONST_INT_P (XEXP (op0, 1))
4985 && aarch64_is_extend_from_extract (mode,
4986 XEXP (op0, 1),
4987 op1))
4989 return true;
4993 return false;
4996 static bool
4997 aarch64_frint_unspec_p (unsigned int u)
4999 switch (u)
5001 case UNSPEC_FRINTZ:
5002 case UNSPEC_FRINTP:
5003 case UNSPEC_FRINTM:
5004 case UNSPEC_FRINTA:
5005 case UNSPEC_FRINTN:
5006 case UNSPEC_FRINTX:
5007 case UNSPEC_FRINTI:
5008 return true;
5010 default:
5011 return false;
5015 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5016 storing it in *COST. Result is true if the total cost of the operation
5017 has now been calculated. */
5018 static bool
5019 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5021 rtx inner;
5022 rtx comparator;
5023 enum rtx_code cmpcode;
5025 if (COMPARISON_P (op0))
5027 inner = XEXP (op0, 0);
5028 comparator = XEXP (op0, 1);
5029 cmpcode = GET_CODE (op0);
5031 else
5033 inner = op0;
5034 comparator = const0_rtx;
5035 cmpcode = NE;
5038 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5040 /* Conditional branch. */
5041 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5042 return true;
5043 else
5045 if (cmpcode == NE || cmpcode == EQ)
5047 if (comparator == const0_rtx)
5049 /* TBZ/TBNZ/CBZ/CBNZ. */
5050 if (GET_CODE (inner) == ZERO_EXTRACT)
5051 /* TBZ/TBNZ. */
5052 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5053 0, speed);
5054 else
5055 /* CBZ/CBNZ. */
5056 *cost += rtx_cost (inner, cmpcode, 0, speed);
5058 return true;
5061 else if (cmpcode == LT || cmpcode == GE)
5063 /* TBZ/TBNZ. */
5064 if (comparator == const0_rtx)
5065 return true;
5069 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5071 /* It's a conditional operation based on the status flags,
5072 so it must be some flavor of CSEL. */
5074 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5075 if (GET_CODE (op1) == NEG
5076 || GET_CODE (op1) == NOT
5077 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5078 op1 = XEXP (op1, 0);
5080 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5081 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5082 return true;
5085 /* We don't know what this is, cost all operands. */
5086 return false;
5089 /* Calculate the cost of calculating X, storing it in *COST. Result
5090 is true if the total cost of the operation has now been calculated. */
5091 static bool
5092 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5093 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5095 rtx op0, op1, op2;
5096 const struct cpu_cost_table *extra_cost
5097 = aarch64_tune_params->insn_extra_cost;
5098 machine_mode mode = GET_MODE (x);
5100 /* By default, assume that everything has equivalent cost to the
5101 cheapest instruction. Any additional costs are applied as a delta
5102 above this default. */
5103 *cost = COSTS_N_INSNS (1);
5105 /* TODO: The cost infrastructure currently does not handle
5106 vector operations. Assume that all vector operations
5107 are equally expensive. */
5108 if (VECTOR_MODE_P (mode))
5110 if (speed)
5111 *cost += extra_cost->vect.alu;
5112 return true;
5115 switch (code)
5117 case SET:
5118 /* The cost depends entirely on the operands to SET. */
5119 *cost = 0;
5120 op0 = SET_DEST (x);
5121 op1 = SET_SRC (x);
5123 switch (GET_CODE (op0))
5125 case MEM:
5126 if (speed)
5128 rtx address = XEXP (op0, 0);
5129 if (GET_MODE_CLASS (mode) == MODE_INT)
5130 *cost += extra_cost->ldst.store;
5131 else if (mode == SFmode)
5132 *cost += extra_cost->ldst.storef;
5133 else if (mode == DFmode)
5134 *cost += extra_cost->ldst.stored;
5136 *cost +=
5137 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5138 0, speed));
5141 *cost += rtx_cost (op1, SET, 1, speed);
5142 return true;
5144 case SUBREG:
5145 if (! REG_P (SUBREG_REG (op0)))
5146 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5148 /* Fall through. */
5149 case REG:
5150 /* const0_rtx is in general free, but we will use an
5151 instruction to set a register to 0. */
5152 if (REG_P (op1) || op1 == const0_rtx)
5154 /* The cost is 1 per register copied. */
5155 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5156 / UNITS_PER_WORD;
5157 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5159 else
5160 /* Cost is just the cost of the RHS of the set. */
5161 *cost += rtx_cost (op1, SET, 1, speed);
5162 return true;
5164 case ZERO_EXTRACT:
5165 case SIGN_EXTRACT:
5166 /* Bit-field insertion. Strip any redundant widening of
5167 the RHS to meet the width of the target. */
5168 if (GET_CODE (op1) == SUBREG)
5169 op1 = SUBREG_REG (op1);
5170 if ((GET_CODE (op1) == ZERO_EXTEND
5171 || GET_CODE (op1) == SIGN_EXTEND)
5172 && CONST_INT_P (XEXP (op0, 1))
5173 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5174 >= INTVAL (XEXP (op0, 1))))
5175 op1 = XEXP (op1, 0);
5177 if (CONST_INT_P (op1))
5179 /* MOV immediate is assumed to always be cheap. */
5180 *cost = COSTS_N_INSNS (1);
5182 else
5184 /* BFM. */
5185 if (speed)
5186 *cost += extra_cost->alu.bfi;
5187 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5190 return true;
5192 default:
5193 /* We can't make sense of this, assume default cost. */
5194 *cost = COSTS_N_INSNS (1);
5195 return false;
5197 return false;
5199 case CONST_INT:
5200 /* If an instruction can incorporate a constant within the
5201 instruction, the instruction's expression avoids calling
5202 rtx_cost() on the constant. If rtx_cost() is called on a
5203 constant, then it is usually because the constant must be
5204 moved into a register by one or more instructions.
5206 The exception is constant 0, which can be expressed
5207 as XZR/WZR and is therefore free. The exception to this is
5208 if we have (set (reg) (const0_rtx)) in which case we must cost
5209 the move. However, we can catch that when we cost the SET, so
5210 we don't need to consider that here. */
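/* Illustrative example (exact counts come from aarch64_build_constant):
   a value such as 0x123456789ab needs a MOVZ plus two MOVKs and is
   costed as roughly three instructions, whereas 0xffff needs a single
   MOVZ and is costed as one.  */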
5211 if (x == const0_rtx)
5212 *cost = 0;
5213 else
5215 /* To an approximation, building any other constant is
5216 proportionally expensive to the number of instructions
5217 required to build that constant. This is true whether we
5218 are compiling for SPEED or otherwise. */
5219 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5220 INTVAL (x),
5221 false));
5223 return true;
5225 case CONST_DOUBLE:
5226 if (speed)
5228 /* mov[df,sf]_aarch64. */
5229 if (aarch64_float_const_representable_p (x))
5230 /* FMOV (scalar immediate). */
5231 *cost += extra_cost->fp[mode == DFmode].fpconst;
5232 else if (!aarch64_float_const_zero_rtx_p (x))
5234 /* This will be a load from memory. */
5235 if (mode == DFmode)
5236 *cost += extra_cost->ldst.loadd;
5237 else
5238 *cost += extra_cost->ldst.loadf;
5240 else
5241 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5242 or MOV v0.s[0], wzr - neither of which are modeled by the
5243 cost tables. Just use the default cost. */
5248 return true;
5250 case MEM:
5251 if (speed)
5253 /* For loads we want the base cost of a load, plus an
5254 approximation for the additional cost of the addressing
5255 mode. */
5256 rtx address = XEXP (x, 0);
5257 if (GET_MODE_CLASS (mode) == MODE_INT)
5258 *cost += extra_cost->ldst.load;
5259 else if (mode == SFmode)
5260 *cost += extra_cost->ldst.loadf;
5261 else if (mode == DFmode)
5262 *cost += extra_cost->ldst.loadd;
5264 *cost +=
5265 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5266 0, speed));
5269 return true;
5271 case NEG:
5272 op0 = XEXP (x, 0);
5274 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5276 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5277 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5279 /* CSETM. */
5280 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5281 return true;
5284 /* Cost this as SUB wzr, X. */
5285 op0 = CONST0_RTX (GET_MODE (x));
5286 op1 = XEXP (x, 0);
5287 goto cost_minus;
5290 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5292 /* Support (neg(fma...)) as a single instruction only if
5293 sign of zeros is unimportant. This matches the decision
5294 making in aarch64.md. */
5295 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5297 /* FNMADD. */
5298 *cost = rtx_cost (op0, NEG, 0, speed);
5299 return true;
5301 if (speed)
5302 /* FNEG. */
5303 *cost += extra_cost->fp[mode == DFmode].neg;
5304 return false;
5307 return false;
5309 case CLRSB:
5310 case CLZ:
5311 if (speed)
5312 *cost += extra_cost->alu.clz;
5314 return false;
5316 case COMPARE:
5317 op0 = XEXP (x, 0);
5318 op1 = XEXP (x, 1);
5320 if (op1 == const0_rtx
5321 && GET_CODE (op0) == AND)
5323 x = op0;
5324 goto cost_logic;
5327 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5329 /* TODO: A write to the CC flags possibly costs extra, this
5330 needs encoding in the cost tables. */
5332 /* CC_ZESWPmode supports zero extend for free. */
5333 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5334 op0 = XEXP (op0, 0);
5336 /* ANDS. */
5337 if (GET_CODE (op0) == AND)
5339 x = op0;
5340 goto cost_logic;
5343 if (GET_CODE (op0) == PLUS)
5345 /* ADDS (and CMN alias). */
5346 x = op0;
5347 goto cost_plus;
5350 if (GET_CODE (op0) == MINUS)
5352 /* SUBS. */
5353 x = op0;
5354 goto cost_minus;
5357 if (GET_CODE (op1) == NEG)
5359 /* CMN. */
5360 if (speed)
5361 *cost += extra_cost->alu.arith;
5363 *cost += rtx_cost (op0, COMPARE, 0, speed);
5364 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5365 return true;
5368 /* CMP.
5370 Compare can freely swap the order of operands, and
5371 canonicalization puts the more complex operation first.
5372 But the integer MINUS logic expects the shift/extend
5373 operation in op1. */
5374 if (! (REG_P (op0)
5375 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5377 op0 = XEXP (x, 1);
5378 op1 = XEXP (x, 0);
5380 goto cost_minus;
5383 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5385 /* FCMP. */
5386 if (speed)
5387 *cost += extra_cost->fp[mode == DFmode].compare;
5389 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5391 /* FCMP supports constant 0.0 for no extra cost. */
5392 return true;
5394 return false;
5397 return false;
5399 case MINUS:
5401 op0 = XEXP (x, 0);
5402 op1 = XEXP (x, 1);
5404 cost_minus:
5405 /* Detect valid immediates. */
5406 if ((GET_MODE_CLASS (mode) == MODE_INT
5407 || (GET_MODE_CLASS (mode) == MODE_CC
5408 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5409 && CONST_INT_P (op1)
5410 && aarch64_uimm12_shift (INTVAL (op1)))
5412 *cost += rtx_cost (op0, MINUS, 0, speed);
5414 if (speed)
5415 /* SUB(S) (immediate). */
5416 *cost += extra_cost->alu.arith;
5417 return true;
5421 /* Look for SUB (extended register). */
5422 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5424 if (speed)
5425 *cost += extra_cost->alu.arith_shift;
5427 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5428 (enum rtx_code) GET_CODE (op1),
5429 0, speed);
5430 return true;
5433 rtx new_op1 = aarch64_strip_extend (op1);
5435 /* Cost this as an FMA-alike operation. */
5436 if ((GET_CODE (new_op1) == MULT
5437 || GET_CODE (new_op1) == ASHIFT)
5438 && code != COMPARE)
5440 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5441 (enum rtx_code) code,
5442 speed);
5443 *cost += rtx_cost (op0, MINUS, 0, speed);
5444 return true;
5447 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5449 if (speed)
5451 if (GET_MODE_CLASS (mode) == MODE_INT)
5452 /* SUB(S). */
5453 *cost += extra_cost->alu.arith;
5454 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5455 /* FSUB. */
5456 *cost += extra_cost->fp[mode == DFmode].addsub;
5458 return true;
5461 case PLUS:
5463 rtx new_op0;
5465 op0 = XEXP (x, 0);
5466 op1 = XEXP (x, 1);
5468 cost_plus:
5469 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5470 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5472 /* CSINC. */
5473 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5474 *cost += rtx_cost (op1, PLUS, 1, speed);
5475 return true;
5478 if (GET_MODE_CLASS (mode) == MODE_INT
5479 && CONST_INT_P (op1)
5480 && aarch64_uimm12_shift (INTVAL (op1)))
5482 *cost += rtx_cost (op0, PLUS, 0, speed);
5484 if (speed)
5485 /* ADD (immediate). */
5486 *cost += extra_cost->alu.arith;
5487 return true;
5490 /* Look for ADD (extended register). */
5491 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5493 if (speed)
5494 *cost += extra_cost->alu.arith_shift;
5496 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5497 (enum rtx_code) GET_CODE (op0),
5498 0, speed);
5499 return true;
5502 /* Strip any extend, leave shifts behind as we will
5503 cost them through mult_cost. */
5504 new_op0 = aarch64_strip_extend (op0);
5506 if (GET_CODE (new_op0) == MULT
5507 || GET_CODE (new_op0) == ASHIFT)
5509 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5510 speed);
5511 *cost += rtx_cost (op1, PLUS, 1, speed);
5512 return true;
5515 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5516 + rtx_cost (op1, PLUS, 1, speed));
5518 if (speed)
5520 if (GET_MODE_CLASS (mode) == MODE_INT)
5521 /* ADD. */
5522 *cost += extra_cost->alu.arith;
5523 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5524 /* FADD. */
5525 *cost += extra_cost->fp[mode == DFmode].addsub;
5527 return true;
5530 case BSWAP:
5531 *cost = COSTS_N_INSNS (1);
5533 if (speed)
5534 *cost += extra_cost->alu.rev;
5536 return false;
5538 case IOR:
5539 if (aarch_rev16_p (x))
5541 *cost = COSTS_N_INSNS (1);
5543 if (speed)
5544 *cost += extra_cost->alu.rev;
5546 return true;
5548 /* Fall through. */
5549 case XOR:
5550 case AND:
5551 cost_logic:
5552 op0 = XEXP (x, 0);
5553 op1 = XEXP (x, 1);
5555 if (code == AND
5556 && GET_CODE (op0) == MULT
5557 && CONST_INT_P (XEXP (op0, 1))
5558 && CONST_INT_P (op1)
5559 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5560 INTVAL (op1)) != 0)
5562 /* This is a UBFM/SBFM. */
5563 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5564 if (speed)
5565 *cost += extra_cost->alu.bfx;
5566 return true;
5569 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5571 /* We possibly get the immediate for free, this is not
5572 modelled. */
5573 if (CONST_INT_P (op1)
5574 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5576 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5578 if (speed)
5579 *cost += extra_cost->alu.logical;
5581 return true;
5583 else
5585 rtx new_op0 = op0;
5587 /* Handle ORN, EON, or BIC. */
5588 if (GET_CODE (op0) == NOT)
5589 op0 = XEXP (op0, 0);
5591 new_op0 = aarch64_strip_shift (op0);
5593 /* If we had a shift on op0 then this is a logical-shift-
5594 by-register/immediate operation. Otherwise, this is just
5595 a logical operation. */
5596 if (speed)
5598 if (new_op0 != op0)
5600 /* Shift by immediate. */
5601 if (CONST_INT_P (XEXP (op0, 1)))
5602 *cost += extra_cost->alu.log_shift;
5603 else
5604 *cost += extra_cost->alu.log_shift_reg;
5606 else
5607 *cost += extra_cost->alu.logical;
5610 /* In both cases we want to cost both operands. */
5611 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5612 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5614 return true;
5617 return false;
5619 case NOT:
5620 /* MVN. */
5621 if (speed)
5622 *cost += extra_cost->alu.logical;
5624 /* The logical instruction could have the shifted register form,
5625 but the cost is the same if the shift is processed as a separate
5626 instruction, so we don't bother with it here. */
5627 return false;
5629 case ZERO_EXTEND:
5631 op0 = XEXP (x, 0);
5632 /* If a value is written in SI mode, then zero extended to DI
5633 mode, the operation will in general be free as a write to
5634 a 'w' register implicitly zeroes the upper bits of an 'x'
5635 register. However, if this is
5637 (set (reg) (zero_extend (reg)))
5639 we must cost the explicit register move. */
5640 if (mode == DImode
5641 && GET_MODE (op0) == SImode
5642 && outer == SET)
5644 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5646 if (!op_cost && speed)
5647 /* MOV. */
5648 *cost += extra_cost->alu.extend;
5649 else
5650 /* Free, the cost is that of the SI mode operation. */
5651 *cost = op_cost;
5653 return true;
5655 else if (MEM_P (XEXP (x, 0)))
5657 /* All loads can zero extend to any size for free. */
5658 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5659 return true;
5662 /* UXTB/UXTH. */
5663 if (speed)
5664 *cost += extra_cost->alu.extend;
5666 return false;
5668 case SIGN_EXTEND:
5669 if (MEM_P (XEXP (x, 0)))
5671 /* LDRSH. */
5672 if (speed)
5674 rtx address = XEXP (XEXP (x, 0), 0);
5675 *cost += extra_cost->ldst.load_sign_extend;
5677 *cost +=
5678 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5679 0, speed));
5681 return true;
5684 if (speed)
5685 *cost += extra_cost->alu.extend;
5686 return false;
5688 case ASHIFT:
5689 op0 = XEXP (x, 0);
5690 op1 = XEXP (x, 1);
5692 if (CONST_INT_P (op1))
5694 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5695 aliases. */
5696 if (speed)
5697 *cost += extra_cost->alu.shift;
5699 /* We can incorporate zero/sign extend for free. */
5700 if (GET_CODE (op0) == ZERO_EXTEND
5701 || GET_CODE (op0) == SIGN_EXTEND)
5702 op0 = XEXP (op0, 0);
5704 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5705 return true;
5707 else
5709 /* LSLV. */
5710 if (speed)
5711 *cost += extra_cost->alu.shift_reg;
5713 return false; /* All arguments need to be in registers. */
5716 case ROTATE:
5717 case ROTATERT:
5718 case LSHIFTRT:
5719 case ASHIFTRT:
5720 op0 = XEXP (x, 0);
5721 op1 = XEXP (x, 1);
5723 if (CONST_INT_P (op1))
5725 /* ASR (immediate) and friends. */
5726 if (speed)
5727 *cost += extra_cost->alu.shift;
5729 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5730 return true;
5732 else
5735 /* ASR (register) and friends. */
5736 if (speed)
5737 *cost += extra_cost->alu.shift_reg;
5739 return false; /* All arguments need to be in registers. */
5742 case SYMBOL_REF:
5744 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5746 /* LDR. */
5747 if (speed)
5748 *cost += extra_cost->ldst.load;
5750 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5751 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5753 /* ADRP, followed by ADD. */
5754 *cost += COSTS_N_INSNS (1);
5755 if (speed)
5756 *cost += 2 * extra_cost->alu.arith;
5758 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5759 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5761 /* ADR. */
5762 if (speed)
5763 *cost += extra_cost->alu.arith;
5766 if (flag_pic)
5768 /* One extra load instruction, after accessing the GOT. */
5769 *cost += COSTS_N_INSNS (1);
5770 if (speed)
5771 *cost += extra_cost->ldst.load;
5773 return true;
5775 case HIGH:
5776 case LO_SUM:
5777 /* ADRP/ADD (immediate). */
5778 if (speed)
5779 *cost += extra_cost->alu.arith;
5780 return true;
5782 case ZERO_EXTRACT:
5783 case SIGN_EXTRACT:
5784 /* UBFX/SBFX. */
5785 if (speed)
5786 *cost += extra_cost->alu.bfx;
5788 /* We can trust that the immediates used will be correct (there
5789 are no by-register forms), so we need only cost op0. */
5790 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5791 return true;
5793 case MULT:
5794 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5795 /* aarch64_rtx_mult_cost always handles recursion to its
5796 operands. */
5797 return true;
5799 case MOD:
5800 case UMOD:
5801 if (speed)
5803 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5804 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5805 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5806 else if (GET_MODE (x) == DFmode)
5807 *cost += (extra_cost->fp[1].mult
5808 + extra_cost->fp[1].div);
5809 else if (GET_MODE (x) == SFmode)
5810 *cost += (extra_cost->fp[0].mult
5811 + extra_cost->fp[0].div);
5813 return false; /* All arguments need to be in registers. */
5815 case DIV:
5816 case UDIV:
5817 case SQRT:
5818 if (speed)
5820 if (GET_MODE_CLASS (mode) == MODE_INT)
5821 /* There is no integer SQRT, so only DIV and UDIV can get
5822 here. */
5823 *cost += extra_cost->mult[mode == DImode].idiv;
5824 else
5825 *cost += extra_cost->fp[mode == DFmode].div;
5827 return false; /* All arguments need to be in registers. */
5829 case IF_THEN_ELSE:
5830 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5831 XEXP (x, 2), cost, speed);
5833 case EQ:
5834 case NE:
5835 case GT:
5836 case GTU:
5837 case LT:
5838 case LTU:
5839 case GE:
5840 case GEU:
5841 case LE:
5842 case LEU:
5844 return false; /* All arguments must be in registers. */
5846 case FMA:
5847 op0 = XEXP (x, 0);
5848 op1 = XEXP (x, 1);
5849 op2 = XEXP (x, 2);
5851 if (speed)
5852 *cost += extra_cost->fp[mode == DFmode].fma;
5854 /* FMSUB, FNMADD, and FNMSUB are free. */
5855 if (GET_CODE (op0) == NEG)
5856 op0 = XEXP (op0, 0);
5858 if (GET_CODE (op2) == NEG)
5859 op2 = XEXP (op2, 0);
5861 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5862 and the by-element operand as operand 0. */
5863 if (GET_CODE (op1) == NEG)
5864 op1 = XEXP (op1, 0);
5866 /* Catch vector-by-element operations. The by-element operand can
5867 either be (vec_duplicate (vec_select (x))) or just
5868 (vec_select (x)), depending on whether we are multiplying by
5869 a vector or a scalar.
5871 Canonicalization is not very good in these cases, FMA4 will put the
5872 by-element operand as operand 0, FNMA4 will have it as operand 1. */
5873 if (GET_CODE (op0) == VEC_DUPLICATE)
5874 op0 = XEXP (op0, 0);
5875 else if (GET_CODE (op1) == VEC_DUPLICATE)
5876 op1 = XEXP (op1, 0);
5878 if (GET_CODE (op0) == VEC_SELECT)
5879 op0 = XEXP (op0, 0);
5880 else if (GET_CODE (op1) == VEC_SELECT)
5881 op1 = XEXP (op1, 0);
5883 /* If the remaining parameters are not registers,
5884 get the cost to put them into registers. */
5885 *cost += rtx_cost (op0, FMA, 0, speed);
5886 *cost += rtx_cost (op1, FMA, 1, speed);
5887 *cost += rtx_cost (op2, FMA, 2, speed);
5888 return true;
5890 case FLOAT_EXTEND:
5891 if (speed)
5892 *cost += extra_cost->fp[mode == DFmode].widen;
5893 return false;
5895 case FLOAT_TRUNCATE:
5896 if (speed)
5897 *cost += extra_cost->fp[mode == DFmode].narrow;
5898 return false;
5900 case FIX:
5901 case UNSIGNED_FIX:
5902 x = XEXP (x, 0);
5903 /* Strip the rounding part. They will all be implemented
5904 by the fcvt* family of instructions anyway. */
5905 if (GET_CODE (x) == UNSPEC)
5907 unsigned int uns_code = XINT (x, 1);
5909 if (uns_code == UNSPEC_FRINTA
5910 || uns_code == UNSPEC_FRINTM
5911 || uns_code == UNSPEC_FRINTN
5912 || uns_code == UNSPEC_FRINTP
5913 || uns_code == UNSPEC_FRINTZ)
5914 x = XVECEXP (x, 0, 0);
5917 if (speed)
5918 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5920 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5921 return true;
5923 case ABS:
5924 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5926 /* FABS and FNEG are analogous. */
5927 if (speed)
5928 *cost += extra_cost->fp[mode == DFmode].neg;
5930 else
5932 /* Integer ABS will either be split to
5933 two arithmetic instructions, or will be an ABS
5934 (scalar), which we don't model. */
5935 *cost = COSTS_N_INSNS (2);
5936 if (speed)
5937 *cost += 2 * extra_cost->alu.arith;
5939 return false;
5941 case SMAX:
5942 case SMIN:
5943 if (speed)
5945 /* FMAXNM/FMINNM/FMAX/FMIN.
5946 TODO: This may not be accurate for all implementations, but
5947 we do not model this in the cost tables. */
5948 *cost += extra_cost->fp[mode == DFmode].addsub;
5950 return false;
5952 case UNSPEC:
5953 /* The floating point round to integer frint* instructions. */
5954 if (aarch64_frint_unspec_p (XINT (x, 1)))
5956 if (speed)
5957 *cost += extra_cost->fp[mode == DFmode].roundint;
5959 return false;
5962 if (XINT (x, 1) == UNSPEC_RBIT)
5964 if (speed)
5965 *cost += extra_cost->alu.rev;
5967 return false;
5969 break;
5971 case TRUNCATE:
5973 /* Decompose <su>muldi3_highpart. */
5974 if (/* (truncate:DI */
5975 mode == DImode
5976 /* (lshiftrt:TI */
5977 && GET_MODE (XEXP (x, 0)) == TImode
5978 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5979 /* (mult:TI */
5980 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5981 /* (ANY_EXTEND:TI (reg:DI))
5982 (ANY_EXTEND:TI (reg:DI))) */
5983 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5984 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5985 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5986 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5987 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5988 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5989 /* (const_int 64) */
5990 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5991 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5993 /* UMULH/SMULH. */
5994 if (speed)
5995 *cost += extra_cost->mult[mode == DImode].extend;
5996 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5997 MULT, 0, speed);
5998 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5999 MULT, 1, speed);
6000 return true;
6003 /* Fall through. */
6004 default:
6005 break;
6008 if (dump_file && (dump_flags & TDF_DETAILS))
6009 fprintf (dump_file,
6010 "\nFailed to cost RTX. Assuming default cost.\n");
6012 return true;
6015 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6016 calculated for X. This cost is stored in *COST. Returns true
6017 if the total cost of X was calculated. */
6018 static bool
6019 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6020 int param, int *cost, bool speed)
6022 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6024 if (dump_file && (dump_flags & TDF_DETAILS))
6026 print_rtl_single (dump_file, x);
6027 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6028 speed ? "Hot" : "Cold",
6029 *cost, result ? "final" : "partial");
6032 return result;
6035 static int
6036 aarch64_register_move_cost (machine_mode mode,
6037 reg_class_t from_i, reg_class_t to_i)
6039 enum reg_class from = (enum reg_class) from_i;
6040 enum reg_class to = (enum reg_class) to_i;
6041 const struct cpu_regmove_cost *regmove_cost
6042 = aarch64_tune_params->regmove_cost;
6044 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6045 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6046 to = GENERAL_REGS;
6048 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6049 from = GENERAL_REGS;
6051 /* Moving between a GPR and the stack register costs the same as GP2GP. */
6052 if ((from == GENERAL_REGS && to == STACK_REG)
6053 || (to == GENERAL_REGS && from == STACK_REG))
6054 return regmove_cost->GP2GP;
6056 /* To/From the stack register, we move via the gprs. */
6057 if (to == STACK_REG || from == STACK_REG)
6058 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6059 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6061 if (GET_MODE_SIZE (mode) == 16)
6063 /* 128-bit operations on general registers require 2 instructions. */
6064 if (from == GENERAL_REGS && to == GENERAL_REGS)
6065 return regmove_cost->GP2GP * 2;
6066 else if (from == GENERAL_REGS)
6067 return regmove_cost->GP2FP * 2;
6068 else if (to == GENERAL_REGS)
6069 return regmove_cost->FP2GP * 2;
6071 /* When AdvSIMD instructions are disabled it is not possible to move
6072 a 128-bit value directly between Q registers. This is handled in
6073 secondary reload. A general register is used as a scratch to move
6074 the upper DI value and the lower DI value is moved directly,
6075 hence the cost is the sum of three moves. */
6076 if (! TARGET_SIMD)
6077 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6079 return regmove_cost->FP2FP;
6082 if (from == GENERAL_REGS && to == GENERAL_REGS)
6083 return regmove_cost->GP2GP;
6084 else if (from == GENERAL_REGS)
6085 return regmove_cost->GP2FP;
6086 else if (to == GENERAL_REGS)
6087 return regmove_cost->FP2GP;
6089 return regmove_cost->FP2FP;
6092 static int
6093 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6094 reg_class_t rclass ATTRIBUTE_UNUSED,
6095 bool in ATTRIBUTE_UNUSED)
6097 return aarch64_tune_params->memmov_cost;
6100 /* Return the number of instructions that can be issued per cycle. */
6101 static int
6102 aarch64_sched_issue_rate (void)
6104 return aarch64_tune_params->issue_rate;
6107 /* Vectorizer cost model target hooks. */
6109 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6110 static int
6111 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6112 tree vectype,
6113 int misalign ATTRIBUTE_UNUSED)
6115 unsigned elements;
6117 switch (type_of_cost)
6119 case scalar_stmt:
6120 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6122 case scalar_load:
6123 return aarch64_tune_params->vec_costs->scalar_load_cost;
6125 case scalar_store:
6126 return aarch64_tune_params->vec_costs->scalar_store_cost;
6128 case vector_stmt:
6129 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6131 case vector_load:
6132 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6134 case vector_store:
6135 return aarch64_tune_params->vec_costs->vec_store_cost;
6137 case vec_to_scalar:
6138 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6140 case scalar_to_vec:
6141 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6143 case unaligned_load:
6144 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6146 case unaligned_store:
6147 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6149 case cond_branch_taken:
6150 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6152 case cond_branch_not_taken:
6153 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6155 case vec_perm:
6156 case vec_promote_demote:
6157 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6159 case vec_construct:
6160 elements = TYPE_VECTOR_SUBPARTS (vectype);
6161 return elements / 2 + 1;
6163 default:
6164 gcc_unreachable ();
6168 /* Implement targetm.vectorize.add_stmt_cost. */
6169 static unsigned
6170 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6171 struct _stmt_vec_info *stmt_info, int misalign,
6172 enum vect_cost_model_location where)
6174 unsigned *cost = (unsigned *) data;
6175 unsigned retval = 0;
6177 if (flag_vect_cost_model)
6179 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6180 int stmt_cost =
6181 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6183 /* Statements in an inner loop relative to the loop being
6184 vectorized are weighted more heavily. The value here is
6185 a function (linear for now) of the loop nest level. */
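/* For instance (illustrative): with the default linear weighting, a
   statement sitting in the innermost loop of a double nest
   (loop_depth == 2) contributes 2 * count * stmt_cost to the body cost
   rather than count * stmt_cost.  */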
6186 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6188 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6189 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6190 unsigned nest_level = loop_depth (loop);
6192 count *= nest_level;
6195 retval = (unsigned) (count * stmt_cost);
6196 cost[where] += retval;
6199 return retval;
6202 static void initialize_aarch64_code_model (void);
6204 /* Parse the architecture extension string. */
6206 static void
6207 aarch64_parse_extension (char *str)
6209 /* The extension string is parsed left to right. */
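/* For example (illustrative): given the tail "+crypto+nofp" of an
   -mcpu/-march string, the first iteration turns on the crypto feature
   flags and the second clears the fp flags (together with anything listed
   in its flags_off mask).  */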
6210 const struct aarch64_option_extension *opt = NULL;
6212 /* Flag to say whether we are adding or removing an extension. */
6213 int adding_ext = -1;
6215 while (str != NULL && *str != 0)
6217 char *ext;
6218 size_t len;
6220 str++;
6221 ext = strchr (str, '+');
6223 if (ext != NULL)
6224 len = ext - str;
6225 else
6226 len = strlen (str);
6228 if (len >= 2 && strncmp (str, "no", 2) == 0)
6230 adding_ext = 0;
6231 len -= 2;
6232 str += 2;
6234 else if (len > 0)
6235 adding_ext = 1;
6237 if (len == 0)
6239 error ("missing feature modifier after %qs", "+no");
6240 return;
6243 /* Scan over the extensions table trying to find an exact match. */
6244 for (opt = all_extensions; opt->name != NULL; opt++)
6246 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6248 /* Add or remove the extension. */
6249 if (adding_ext)
6250 aarch64_isa_flags |= opt->flags_on;
6251 else
6252 aarch64_isa_flags &= ~(opt->flags_off);
6253 break;
6257 if (opt->name == NULL)
6259 /* Extension not found in list. */
6260 error ("unknown feature modifier %qs", str);
6261 return;
6264 str = ext;
6267 return;
6270 /* Parse the ARCH string. */
6272 static void
6273 aarch64_parse_arch (void)
6275 char *ext;
6276 const struct processor *arch;
6277 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6278 size_t len;
6280 strcpy (str, aarch64_arch_string);
6282 ext = strchr (str, '+');
6284 if (ext != NULL)
6285 len = ext - str;
6286 else
6287 len = strlen (str);
6289 if (len == 0)
6291 error ("missing arch name in -march=%qs", str);
6292 return;
6295 /* Loop through the list of supported ARCHs to find a match. */
6296 for (arch = all_architectures; arch->name != NULL; arch++)
6298 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6300 selected_arch = arch;
6301 aarch64_isa_flags = selected_arch->flags;
6303 if (!selected_cpu)
6304 selected_cpu = &all_cores[selected_arch->core];
6306 if (ext != NULL)
6308 /* ARCH string contains at least one extension. */
6309 aarch64_parse_extension (ext);
6312 if (strcmp (selected_arch->arch, selected_cpu->arch))
6314 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6315 selected_cpu->name, selected_arch->name);
6318 return;
6322 /* ARCH name not found in list. */
6323 error ("unknown value %qs for -march", str);
6324 return;
6327 /* Parse the CPU string. */
6329 static void
6330 aarch64_parse_cpu (void)
6332 char *ext;
6333 const struct processor *cpu;
6334 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6335 size_t len;
6337 strcpy (str, aarch64_cpu_string);
6339 ext = strchr (str, '+');
6341 if (ext != NULL)
6342 len = ext - str;
6343 else
6344 len = strlen (str);
6346 if (len == 0)
6348 error ("missing cpu name in -mcpu=%qs", str);
6349 return;
6352 /* Loop through the list of supported CPUs to find a match. */
6353 for (cpu = all_cores; cpu->name != NULL; cpu++)
6355 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6357 selected_cpu = cpu;
6358 selected_tune = cpu;
6359 aarch64_isa_flags = selected_cpu->flags;
6361 if (ext != NULL)
6363 /* CPU string contains at least one extension. */
6364 aarch64_parse_extension (ext);
6367 return;
6371 /* CPU name not found in list. */
6372 error ("unknown value %qs for -mcpu", str);
6373 return;
6376 /* Parse the TUNE string. */
6378 static void
6379 aarch64_parse_tune (void)
6381 const struct processor *cpu;
6382 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6383 strcpy (str, aarch64_tune_string);
6385 /* Loop through the list of supported CPUs to find a match. */
6386 for (cpu = all_cores; cpu->name != NULL; cpu++)
6388 if (strcmp (cpu->name, str) == 0)
6390 selected_tune = cpu;
6391 return;
6395 /* CPU name not found in list. */
6396 error ("unknown value %qs for -mtune", str);
6397 return;
6401 /* Implement TARGET_OPTION_OVERRIDE. */
6403 static void
6404 aarch64_override_options (void)
6406 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6407 If either of -march or -mtune is given, they override their
6408 respective component of -mcpu.
6410 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6411 with -march: if -mcpu is not present on the command line, -march
6412 must set a sensible default CPU. */
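/* For instance (illustrative): "-mcpu=cortex-a57 -mtune=cortex-a53" takes
   the ISA flags from cortex-a57 but the tuning tables from cortex-a53,
   since the -mtune parse below overrides selected_tune.  */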
6413 if (aarch64_cpu_string)
6415 aarch64_parse_cpu ();
6418 if (aarch64_arch_string)
6420 aarch64_parse_arch ();
6423 if (aarch64_tune_string)
6425 aarch64_parse_tune ();
6428 #ifndef HAVE_AS_MABI_OPTION
6429 /* The compiler may have been configured with 2.23.* binutils, which does
6430 not have support for ILP32. */
6431 if (TARGET_ILP32)
6432 error ("Assembler does not support -mabi=ilp32");
6433 #endif
6435 initialize_aarch64_code_model ();
6437 aarch64_build_bitmask_table ();
6439 /* This target defaults to strict volatile bitfields. */
6440 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6441 flag_strict_volatile_bitfields = 1;
6443 /* If the user did not specify a processor, choose the default
6444 one for them. This will be the CPU set during configuration using
6445 --with-cpu, otherwise it is "generic". */
6446 if (!selected_cpu)
6448 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6449 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6452 gcc_assert (selected_cpu);
6454 /* The selected cpu may be an architecture, so lookup tuning by core ID. */
6455 if (!selected_tune)
6456 selected_tune = &all_cores[selected_cpu->core];
6458 aarch64_tune_flags = selected_tune->flags;
6459 aarch64_tune = selected_tune->core;
6460 aarch64_tune_params = selected_tune->tune;
6462 if (aarch64_fix_a53_err835769 == 2)
6464 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6465 aarch64_fix_a53_err835769 = 1;
6466 #else
6467 aarch64_fix_a53_err835769 = 0;
6468 #endif
6471 aarch64_override_options_after_change ();
6474 /* Implement targetm.override_options_after_change. */
6476 static void
6477 aarch64_override_options_after_change (void)
6479 if (flag_omit_frame_pointer)
6480 flag_omit_leaf_frame_pointer = false;
6481 else if (flag_omit_leaf_frame_pointer)
6482 flag_omit_frame_pointer = true;
6485 static struct machine_function *
6486 aarch64_init_machine_status (void)
6488 struct machine_function *machine;
6489 machine = ggc_cleared_alloc<machine_function> ();
6490 return machine;
6493 void
6494 aarch64_init_expanders (void)
6496 init_machine_status = aarch64_init_machine_status;
6499 /* A checking mechanism for the implementation of the various code models. */
6500 static void
6501 initialize_aarch64_code_model (void)
6503 if (flag_pic)
6505 switch (aarch64_cmodel_var)
6507 case AARCH64_CMODEL_TINY:
6508 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6509 break;
6510 case AARCH64_CMODEL_SMALL:
6511 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6512 break;
6513 case AARCH64_CMODEL_LARGE:
6514 sorry ("code model %qs with -f%s", "large",
6515 flag_pic > 1 ? "PIC" : "pic");
6516 default:
6517 gcc_unreachable ();
6520 else
6521 aarch64_cmodel = aarch64_cmodel_var;
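/* Illustrative example, added for exposition: compiling with
   -mcmodel=small -fpic leaves aarch64_cmodel_var as AARCH64_CMODEL_SMALL,
   which the switch above rewrites to AARCH64_CMODEL_SMALL_PIC; combining
   -mcmodel=large with -fPIC is rejected via sorry ().  */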
6524 /* Return true if SYMBOL_REF X binds locally. */
6526 static bool
6527 aarch64_symbol_binds_local_p (const_rtx x)
6529 return (SYMBOL_REF_DECL (x)
6530 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6531 : SYMBOL_REF_LOCAL_P (x));
6534 /* Return true if SYMBOL_REF X is thread local */
6535 static bool
6536 aarch64_tls_symbol_p (rtx x)
6538 if (! TARGET_HAVE_TLS)
6539 return false;
6541 if (GET_CODE (x) != SYMBOL_REF)
6542 return false;
6544 return SYMBOL_REF_TLS_MODEL (x) != 0;
6547 /* Classify a TLS symbol into one of the TLS kinds. */
6548 enum aarch64_symbol_type
6549 aarch64_classify_tls_symbol (rtx x)
6551 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6553 switch (tls_kind)
6555 case TLS_MODEL_GLOBAL_DYNAMIC:
6556 case TLS_MODEL_LOCAL_DYNAMIC:
6557 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6559 case TLS_MODEL_INITIAL_EXEC:
6560 return SYMBOL_SMALL_GOTTPREL;
6562 case TLS_MODEL_LOCAL_EXEC:
6563 return SYMBOL_SMALL_TPREL;
6565 case TLS_MODEL_EMULATED:
6566 case TLS_MODEL_NONE:
6567 return SYMBOL_FORCE_TO_MEM;
6569 default:
6570 gcc_unreachable ();
6574 /* Return the method that should be used to access SYMBOL_REF or
6575 LABEL_REF X in context CONTEXT. */
6577 enum aarch64_symbol_type
6578 aarch64_classify_symbol (rtx x,
6579 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6581 if (GET_CODE (x) == LABEL_REF)
6583 switch (aarch64_cmodel)
6585 case AARCH64_CMODEL_LARGE:
6586 return SYMBOL_FORCE_TO_MEM;
6588 case AARCH64_CMODEL_TINY_PIC:
6589 case AARCH64_CMODEL_TINY:
6590 return SYMBOL_TINY_ABSOLUTE;
6592 case AARCH64_CMODEL_SMALL_PIC:
6593 case AARCH64_CMODEL_SMALL:
6594 return SYMBOL_SMALL_ABSOLUTE;
6596 default:
6597 gcc_unreachable ();
6601 if (GET_CODE (x) == SYMBOL_REF)
6603 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6604 return SYMBOL_FORCE_TO_MEM;
6606 if (aarch64_tls_symbol_p (x))
6607 return aarch64_classify_tls_symbol (x);
6609 switch (aarch64_cmodel)
6611 case AARCH64_CMODEL_TINY:
6612 if (SYMBOL_REF_WEAK (x))
6613 return SYMBOL_FORCE_TO_MEM;
6614 return SYMBOL_TINY_ABSOLUTE;
6616 case AARCH64_CMODEL_SMALL:
6617 if (SYMBOL_REF_WEAK (x))
6618 return SYMBOL_FORCE_TO_MEM;
6619 return SYMBOL_SMALL_ABSOLUTE;
6621 case AARCH64_CMODEL_TINY_PIC:
6622 if (!aarch64_symbol_binds_local_p (x))
6623 return SYMBOL_TINY_GOT;
6624 return SYMBOL_TINY_ABSOLUTE;
6626 case AARCH64_CMODEL_SMALL_PIC:
6627 if (!aarch64_symbol_binds_local_p (x))
6628 return SYMBOL_SMALL_GOT;
6629 return SYMBOL_SMALL_ABSOLUTE;
6631 default:
6632 gcc_unreachable ();
6636 /* By default push everything into the constant pool. */
6637 return SYMBOL_FORCE_TO_MEM;
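/* Worked example, added for exposition: under -mcmodel=small -fPIC, a
   SYMBOL_REF for an external global that does not bind locally is
   classified as SYMBOL_SMALL_GOT (accessed through the GOT), while a
   file-local symbol yields SYMBOL_SMALL_ABSOLUTE; under -mcmodel=large
   every symbol and label is forced into the constant pool
   (SYMBOL_FORCE_TO_MEM).  */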
6640 bool
6641 aarch64_constant_address_p (rtx x)
6643 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6646 bool
6647 aarch64_legitimate_pic_operand_p (rtx x)
6649 if (GET_CODE (x) == SYMBOL_REF
6650 || (GET_CODE (x) == CONST
6651 && GET_CODE (XEXP (x, 0)) == PLUS
6652 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6653 return false;
6655 return true;
6658 /* Return true if X is a floating-point constant that is either
6659 +0.0 or representable in quarter-precision (FMOV immediate) form. */
6660 static bool
6661 aarch64_valid_floating_const (machine_mode mode, rtx x)
6663 if (!CONST_DOUBLE_P (x))
6664 return false;
6666 /* TODO: We could handle moving 0.0 to a TFmode register,
6667 but first we would like to refactor the movtf_aarch64
6668 to be more amenable to splitting moves properly and
6669 correctly gating on TARGET_SIMD. For now, reject all
6670 constants that are not destined for SFmode or DFmode registers. */
6671 if (!(mode == SFmode || mode == DFmode))
6672 return false;
6674 if (aarch64_float_const_zero_rtx_p (x))
6675 return true;
6676 return aarch64_float_const_representable_p (x);
6679 static bool
6680 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
6682 /* Do not allow vector struct mode constants. We could support
6683 0 and -1 easily, but they need support in aarch64-simd.md. */
6684 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6685 return false;
6687 /* This could probably go away because
6688 we now decompose CONST_INTs according to expand_mov_immediate. */
6689 if ((GET_CODE (x) == CONST_VECTOR
6690 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6691 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6692 return !targetm.cannot_force_const_mem (mode, x);
6694 if (GET_CODE (x) == HIGH
6695 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6696 return true;
6698 return aarch64_constant_address_p (x);
6702 aarch64_load_tp (rtx target)
6704 if (!target
6705 || GET_MODE (target) != Pmode
6706 || !register_operand (target, Pmode))
6707 target = gen_reg_rtx (Pmode);
6709 /* Can return in any reg. */
6710 emit_insn (gen_aarch64_load_tp_hard (target));
6711 return target;
6714 /* On AAPCS systems, this is the "struct __va_list". */
6715 static GTY(()) tree va_list_type;
6717 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6718 Return the type to use as __builtin_va_list.
6720 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6722 struct __va_list
6724 void *__stack;
6725 void *__gr_top;
6726 void *__vr_top;
6727 int __gr_offs;
6728 int __vr_offs;
6729 }; */
6731 static tree
6732 aarch64_build_builtin_va_list (void)
6734 tree va_list_name;
6735 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6737 /* Create the type. */
6738 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6739 /* Give it the required name. */
6740 va_list_name = build_decl (BUILTINS_LOCATION,
6741 TYPE_DECL,
6742 get_identifier ("__va_list"),
6743 va_list_type);
6744 DECL_ARTIFICIAL (va_list_name) = 1;
6745 TYPE_NAME (va_list_type) = va_list_name;
6746 TYPE_STUB_DECL (va_list_type) = va_list_name;
6748 /* Create the fields. */
6749 f_stack = build_decl (BUILTINS_LOCATION,
6750 FIELD_DECL, get_identifier ("__stack"),
6751 ptr_type_node);
6752 f_grtop = build_decl (BUILTINS_LOCATION,
6753 FIELD_DECL, get_identifier ("__gr_top"),
6754 ptr_type_node);
6755 f_vrtop = build_decl (BUILTINS_LOCATION,
6756 FIELD_DECL, get_identifier ("__vr_top"),
6757 ptr_type_node);
6758 f_groff = build_decl (BUILTINS_LOCATION,
6759 FIELD_DECL, get_identifier ("__gr_offs"),
6760 integer_type_node);
6761 f_vroff = build_decl (BUILTINS_LOCATION,
6762 FIELD_DECL, get_identifier ("__vr_offs"),
6763 integer_type_node);
6765 DECL_ARTIFICIAL (f_stack) = 1;
6766 DECL_ARTIFICIAL (f_grtop) = 1;
6767 DECL_ARTIFICIAL (f_vrtop) = 1;
6768 DECL_ARTIFICIAL (f_groff) = 1;
6769 DECL_ARTIFICIAL (f_vroff) = 1;
6771 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6772 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6773 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6774 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6775 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6777 TYPE_FIELDS (va_list_type) = f_stack;
6778 DECL_CHAIN (f_stack) = f_grtop;
6779 DECL_CHAIN (f_grtop) = f_vrtop;
6780 DECL_CHAIN (f_vrtop) = f_groff;
6781 DECL_CHAIN (f_groff) = f_vroff;
6783 /* Compute its layout. */
6784 layout_type (va_list_type);
6786 return va_list_type;
6789 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6790 static void
6791 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6793 const CUMULATIVE_ARGS *cum;
6794 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6795 tree stack, grtop, vrtop, groff, vroff;
6796 tree t;
6797 int gr_save_area_size;
6798 int vr_save_area_size;
6799 int vr_offset;
6801 cum = &crtl->args.info;
6802 gr_save_area_size
6803 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6804 vr_save_area_size
6805 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6807 if (TARGET_GENERAL_REGS_ONLY)
6809 if (cum->aapcs_nvrn > 0)
6810 sorry ("%qs and floating point or vector arguments",
6811 "-mgeneral-regs-only");
6812 vr_save_area_size = 0;
6815 f_stack = TYPE_FIELDS (va_list_type_node);
6816 f_grtop = DECL_CHAIN (f_stack);
6817 f_vrtop = DECL_CHAIN (f_grtop);
6818 f_groff = DECL_CHAIN (f_vrtop);
6819 f_vroff = DECL_CHAIN (f_groff);
6821 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6822 NULL_TREE);
6823 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6824 NULL_TREE);
6825 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6826 NULL_TREE);
6827 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6828 NULL_TREE);
6829 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6830 NULL_TREE);
6832 /* Emit code to initialize STACK, which points to the next varargs stack
6833 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6834 by named arguments. STACK is 8-byte aligned. */
6835 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6836 if (cum->aapcs_stack_size > 0)
6837 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6838 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6839 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6841 /* Emit code to initialize GRTOP, the top of the GR save area.
6842 virtual_incoming_args_rtx should have been 16 byte aligned. */
6843 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6844 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6845 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6847 /* Emit code to initialize VRTOP, the top of the VR save area.
6848 This address is gr_save_area_bytes below GRTOP, rounded
6849 down to the next 16-byte boundary. */
6850 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6851 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6852 STACK_BOUNDARY / BITS_PER_UNIT);
6854 if (vr_offset)
6855 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6856 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6857 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6859 /* Emit code to initialize GROFF, the offset from GRTOP of the
6860 next GPR argument. */
6861 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6862 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6863 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6865 /* Likewise emit code to initialize VROFF, the offset from FTOP
6866 of the next VR argument. */
6867 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6868 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6869 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
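/* Worked example, added for exposition (the prototype is purely
   illustrative):

     int sum (int n, ...);

   With one named integer argument, cum->aapcs_ncrn == 1 and
   cum->aapcs_nvrn == 0, so the code above sets
   __gr_offs = -(8 - 1) * 8 = -56 and __vr_offs = -(8 - 0) * 16 = -128,
   while __gr_top and __vr_top point just past their respective register
   save areas, which aarch64_setup_incoming_varargs below spills to the
   stack.  */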
6872 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6874 static tree
6875 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6876 gimple_seq *post_p ATTRIBUTE_UNUSED)
6878 tree addr;
6879 bool indirect_p;
6880 bool is_ha; /* is HFA or HVA. */
6881 bool dw_align; /* double-word align. */
6882 machine_mode ag_mode = VOIDmode;
6883 int nregs;
6884 machine_mode mode;
6886 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6887 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6888 HOST_WIDE_INT size, rsize, adjust, align;
6889 tree t, u, cond1, cond2;
6891 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6892 if (indirect_p)
6893 type = build_pointer_type (type);
6895 mode = TYPE_MODE (type);
6897 f_stack = TYPE_FIELDS (va_list_type_node);
6898 f_grtop = DECL_CHAIN (f_stack);
6899 f_vrtop = DECL_CHAIN (f_grtop);
6900 f_groff = DECL_CHAIN (f_vrtop);
6901 f_vroff = DECL_CHAIN (f_groff);
6903 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6904 f_stack, NULL_TREE);
6905 size = int_size_in_bytes (type);
6906 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6908 dw_align = false;
6909 adjust = 0;
6910 if (aarch64_vfp_is_call_or_return_candidate (mode,
6911 type,
6912 &ag_mode,
6913 &nregs,
6914 &is_ha))
6916 /* TYPE passed in fp/simd registers. */
6917 if (TARGET_GENERAL_REGS_ONLY)
6918 sorry ("%qs and floating point or vector arguments",
6919 "-mgeneral-regs-only");
6921 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6922 unshare_expr (valist), f_vrtop, NULL_TREE);
6923 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6924 unshare_expr (valist), f_vroff, NULL_TREE);
6926 rsize = nregs * UNITS_PER_VREG;
6928 if (is_ha)
6930 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6931 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6933 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6934 && size < UNITS_PER_VREG)
6936 adjust = UNITS_PER_VREG - size;
6939 else
6941 /* TYPE passed in general registers. */
6942 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6943 unshare_expr (valist), f_grtop, NULL_TREE);
6944 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6945 unshare_expr (valist), f_groff, NULL_TREE);
6946 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6947 nregs = rsize / UNITS_PER_WORD;
6949 if (align > 8)
6950 dw_align = true;
6952 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6953 && size < UNITS_PER_WORD)
6955 adjust = UNITS_PER_WORD - size;
6959 /* Get a local temporary for the field value. */
6960 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6962 /* Emit code to branch if off >= 0. */
6963 t = build2 (GE_EXPR, boolean_type_node, off,
6964 build_int_cst (TREE_TYPE (off), 0));
6965 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6967 if (dw_align)
6969 /* Emit: offs = (offs + 15) & -16. */
6970 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6971 build_int_cst (TREE_TYPE (off), 15));
6972 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6973 build_int_cst (TREE_TYPE (off), -16));
6974 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6976 else
6977 roundup = NULL;
6979 /* Update ap.__[g|v]r_offs */
6980 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6981 build_int_cst (TREE_TYPE (off), rsize));
6982 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6984 /* String up. */
6985 if (roundup)
6986 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6988 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6989 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6990 build_int_cst (TREE_TYPE (f_off), 0));
6991 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6993 /* String up: make sure the assignment happens before the use. */
6994 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6995 COND_EXPR_ELSE (cond1) = t;
6997 /* Prepare the trees handling the argument that is passed on the stack;
6998 the top-level node will be stored in ON_STACK. */
6999 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7000 if (align > 8)
7002 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7003 t = fold_convert (intDI_type_node, arg);
7004 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7005 build_int_cst (TREE_TYPE (t), 15));
7006 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7007 build_int_cst (TREE_TYPE (t), -16));
7008 t = fold_convert (TREE_TYPE (arg), t);
7009 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7011 else
7012 roundup = NULL;
7013 /* Advance ap.__stack */
7014 t = fold_convert (intDI_type_node, arg);
7015 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7016 build_int_cst (TREE_TYPE (t), size + 7));
7017 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7018 build_int_cst (TREE_TYPE (t), -8));
7019 t = fold_convert (TREE_TYPE (arg), t);
7020 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7021 /* String up roundup and advance. */
7022 if (roundup)
7023 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7024 /* String up with arg */
7025 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7026 /* Big-endianness related address adjustment. */
7027 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7028 && size < UNITS_PER_WORD)
7030 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7031 size_int (UNITS_PER_WORD - size));
7032 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7035 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7036 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7038 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7039 t = off;
7040 if (adjust)
7041 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7042 build_int_cst (TREE_TYPE (off), adjust));
7044 t = fold_convert (sizetype, t);
7045 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7047 if (is_ha)
7049 /* type ha; // treat as "struct {ftype field[n];}"
7050 ... [computing offs]
7051 for (i = 0; i <nregs; ++i, offs += 16)
7052 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7053 return ha; */
7054 int i;
7055 tree tmp_ha, field_t, field_ptr_t;
7057 /* Declare a local variable. */
7058 tmp_ha = create_tmp_var_raw (type, "ha");
7059 gimple_add_tmp_var (tmp_ha);
7061 /* Establish the base type. */
7062 switch (ag_mode)
7064 case SFmode:
7065 field_t = float_type_node;
7066 field_ptr_t = float_ptr_type_node;
7067 break;
7068 case DFmode:
7069 field_t = double_type_node;
7070 field_ptr_t = double_ptr_type_node;
7071 break;
7072 case TFmode:
7073 field_t = long_double_type_node;
7074 field_ptr_t = long_double_ptr_type_node;
7075 break;
7076 /* Half precision and quad precision are not fully supported yet. Enable
7077 the following code once that support is complete; the correct type
7078 node for __fp16 * still needs to be found. */
7079 #if 0
7080 case HFmode:
7081 field_t = float_type_node;
7082 field_ptr_t = float_ptr_type_node;
7083 break;
7084 #endif
7085 case V2SImode:
7086 case V4SImode:
7088 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7089 field_t = build_vector_type_for_mode (innertype, ag_mode);
7090 field_ptr_t = build_pointer_type (field_t);
7092 break;
7093 default:
7094 gcc_assert (0);
7097 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
7098 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7099 addr = t;
7100 t = fold_convert (field_ptr_t, addr);
7101 t = build2 (MODIFY_EXPR, field_t,
7102 build1 (INDIRECT_REF, field_t, tmp_ha),
7103 build1 (INDIRECT_REF, field_t, t));
7105 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7106 for (i = 1; i < nregs; ++i)
7108 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7109 u = fold_convert (field_ptr_t, addr);
7110 u = build2 (MODIFY_EXPR, field_t,
7111 build2 (MEM_REF, field_t, tmp_ha,
7112 build_int_cst (field_ptr_t,
7113 (i *
7114 int_size_in_bytes (field_t)))),
7115 build1 (INDIRECT_REF, field_t, u));
7116 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7119 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7120 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7123 COND_EXPR_ELSE (cond2) = t;
7124 addr = fold_convert (build_pointer_type (type), cond1);
7125 addr = build_va_arg_indirect_ref (addr);
7127 if (indirect_p)
7128 addr = build_va_arg_indirect_ref (addr);
7130 return addr;
7133 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7135 static void
7136 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7137 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7138 int no_rtl)
7140 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7141 CUMULATIVE_ARGS local_cum;
7142 int gr_saved, vr_saved;
7144 /* The caller has advanced CUM up to, but not beyond, the last named
7145 argument. Advance a local copy of CUM past the last "real" named
7146 argument, to find out how many registers are left over. */
7147 local_cum = *cum;
7148 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
7150 /* Find out how many registers we need to save. */
7151 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7152 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7154 if (TARGET_GENERAL_REGS_ONLY)
7156 if (local_cum.aapcs_nvrn > 0)
7157 sorry ("%qs and floating point or vector arguments",
7158 "-mgeneral-regs-only");
7159 vr_saved = 0;
7162 if (!no_rtl)
7164 if (gr_saved > 0)
7166 rtx ptr, mem;
7168 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7169 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7170 - gr_saved * UNITS_PER_WORD);
7171 mem = gen_frame_mem (BLKmode, ptr);
7172 set_mem_alias_set (mem, get_varargs_alias_set ());
7174 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7175 mem, gr_saved);
7177 if (vr_saved > 0)
7179 /* We can't use move_block_from_reg, because it will use
7180 the wrong mode, storing D regs only. */
7181 machine_mode mode = TImode;
7182 int off, i;
7184 /* Set OFF to the offset from virtual_incoming_args_rtx of
7185 the first vector register. The VR save area lies below
7186 the GR one, and is aligned to 16 bytes. */
7187 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7188 STACK_BOUNDARY / BITS_PER_UNIT);
7189 off -= vr_saved * UNITS_PER_VREG;
7191 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7193 rtx ptr, mem;
7195 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7196 mem = gen_frame_mem (mode, ptr);
7197 set_mem_alias_set (mem, get_varargs_alias_set ());
7198 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7199 off += UNITS_PER_VREG;
7204 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7205 any complication of having crtl->args.pretend_args_size changed. */
7206 cfun->machine->frame.saved_varargs_size
7207 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7208 STACK_BOUNDARY / BITS_PER_UNIT)
7209 + vr_saved * UNITS_PER_VREG);
7212 static void
7213 aarch64_conditional_register_usage (void)
7215 int i;
7216 if (!TARGET_FLOAT)
7218 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7220 fixed_regs[i] = 1;
7221 call_used_regs[i] = 1;
7226 /* Walk down the type tree of TYPE counting consecutive base elements.
7227 If *MODEP is VOIDmode, then set it to the first valid floating point
7228 type. If a non-floating point type is found, or if a floating point
7229 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7230 otherwise return the count in the sub-tree. */
7231 static int
7232 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7234 machine_mode mode;
7235 HOST_WIDE_INT size;
7237 switch (TREE_CODE (type))
7239 case REAL_TYPE:
7240 mode = TYPE_MODE (type);
7241 if (mode != DFmode && mode != SFmode && mode != TFmode)
7242 return -1;
7244 if (*modep == VOIDmode)
7245 *modep = mode;
7247 if (*modep == mode)
7248 return 1;
7250 break;
7252 case COMPLEX_TYPE:
7253 mode = TYPE_MODE (TREE_TYPE (type));
7254 if (mode != DFmode && mode != SFmode && mode != TFmode)
7255 return -1;
7257 if (*modep == VOIDmode)
7258 *modep = mode;
7260 if (*modep == mode)
7261 return 2;
7263 break;
7265 case VECTOR_TYPE:
7266 /* Use V2SImode and V4SImode as representatives of all 64-bit
7267 and 128-bit vector types. */
7268 size = int_size_in_bytes (type);
7269 switch (size)
7271 case 8:
7272 mode = V2SImode;
7273 break;
7274 case 16:
7275 mode = V4SImode;
7276 break;
7277 default:
7278 return -1;
7281 if (*modep == VOIDmode)
7282 *modep = mode;
7284 /* Vector modes are considered to be opaque: two vectors are
7285 equivalent for the purposes of being homogeneous aggregates
7286 if they are the same size. */
7287 if (*modep == mode)
7288 return 1;
7290 break;
7292 case ARRAY_TYPE:
7294 int count;
7295 tree index = TYPE_DOMAIN (type);
7297 /* Can't handle incomplete types nor sizes that are not
7298 fixed. */
7299 if (!COMPLETE_TYPE_P (type)
7300 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7301 return -1;
7303 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7304 if (count == -1
7305 || !index
7306 || !TYPE_MAX_VALUE (index)
7307 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7308 || !TYPE_MIN_VALUE (index)
7309 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7310 || count < 0)
7311 return -1;
7313 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7314 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7316 /* There must be no padding. */
7317 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7318 return -1;
7320 return count;
7323 case RECORD_TYPE:
7325 int count = 0;
7326 int sub_count;
7327 tree field;
7329 /* Can't handle incomplete types nor sizes that are not
7330 fixed. */
7331 if (!COMPLETE_TYPE_P (type)
7332 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7333 return -1;
7335 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7337 if (TREE_CODE (field) != FIELD_DECL)
7338 continue;
7340 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7341 if (sub_count < 0)
7342 return -1;
7343 count += sub_count;
7346 /* There must be no padding. */
7347 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7348 return -1;
7350 return count;
7353 case UNION_TYPE:
7354 case QUAL_UNION_TYPE:
7356 /* These aren't very interesting except in a degenerate case. */
7357 int count = 0;
7358 int sub_count;
7359 tree field;
7361 /* Can't handle incomplete types nor sizes that are not
7362 fixed. */
7363 if (!COMPLETE_TYPE_P (type)
7364 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7365 return -1;
7367 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7369 if (TREE_CODE (field) != FIELD_DECL)
7370 continue;
7372 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7373 if (sub_count < 0)
7374 return -1;
7375 count = count > sub_count ? count : sub_count;
7378 /* There must be no padding. */
7379 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7380 return -1;
7382 return count;
7385 default:
7386 break;
7389 return -1;
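/* Worked example, added for exposition (the struct names are purely
   illustrative):

     struct rgb { float r, g, b; };       // returns 3, *modep == SFmode
     struct mix { float f; double d; };   // returns -1

   The first is a candidate homogeneous floating-point aggregate; the
   second mixes SFmode and DFmode elements and is rejected.  */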
7392 /* Return true if we use LRA instead of the reload pass. */
7393 static bool
7394 aarch64_lra_p (void)
7396 return aarch64_lra_flag;
7399 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7400 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7401 array types. The C99 floating-point complex types are also considered
7402 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7403 types, which are GCC extensions and out of the scope of AAPCS64, are
7404 treated as composite types here as well.
7406 Note that MODE itself is not sufficient in determining whether a type
7407 is such a composite type or not. This is because
7408 stor-layout.c:compute_record_mode may have already changed the MODE
7409 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7410 structure with only one field may have its MODE set to the mode of the
7411 field. Also an integer mode whose size matches the size of the
7412 RECORD_TYPE type may be used to substitute the original mode
7413 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7414 solely relied on. */
7416 static bool
7417 aarch64_composite_type_p (const_tree type,
7418 machine_mode mode)
7420 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7421 return true;
7423 if (mode == BLKmode
7424 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7425 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7426 return true;
7428 return false;
7431 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7432 type as described in AAPCS64 \S 4.1.2.
7434 See the comment above aarch64_composite_type_p for the notes on MODE. */
7436 static bool
7437 aarch64_short_vector_p (const_tree type,
7438 machine_mode mode)
7440 HOST_WIDE_INT size = -1;
7442 if (type && TREE_CODE (type) == VECTOR_TYPE)
7443 size = int_size_in_bytes (type);
7444 else if (!aarch64_composite_type_p (type, mode)
7445 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7446 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7447 size = GET_MODE_SIZE (mode);
7449 return (size == 8 || size == 16) ? true : false;
7452 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7453 shall be passed or returned in simd/fp register(s) (providing these
7454 parameter passing registers are available).
7456 Upon successful return, *COUNT returns the number of needed registers,
7457 *BASE_MODE returns the mode of the individual register and when IS_HA
7458 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7459 floating-point aggregate or a homogeneous short-vector aggregate. */
7461 static bool
7462 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7463 const_tree type,
7464 machine_mode *base_mode,
7465 int *count,
7466 bool *is_ha)
7468 machine_mode new_mode = VOIDmode;
7469 bool composite_p = aarch64_composite_type_p (type, mode);
7471 if (is_ha != NULL) *is_ha = false;
7473 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7474 || aarch64_short_vector_p (type, mode))
7476 *count = 1;
7477 new_mode = mode;
7479 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7481 if (is_ha != NULL) *is_ha = true;
7482 *count = 2;
7483 new_mode = GET_MODE_INNER (mode);
7485 else if (type && composite_p)
7487 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7489 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7491 if (is_ha != NULL) *is_ha = true;
7492 *count = ag_count;
7494 else
7495 return false;
7497 else
7498 return false;
7500 *base_mode = new_mode;
7501 return true;
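/* Examples, added for exposition: _Complex double yields *count = 2,
   *base_mode = DFmode and *is_ha = true through the MODE_COMPLEX_FLOAT
   case; a 128-bit short vector such as a V4SFmode value yields *count = 1
   with *base_mode = V4SFmode; an aggregate whose element count exceeds
   HA_MAX_NUM_FLDS is rejected.  */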
7504 /* Implement TARGET_STRUCT_VALUE_RTX. */
7506 static rtx
7507 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7508 int incoming ATTRIBUTE_UNUSED)
7510 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7513 /* Implements target hook vector_mode_supported_p. */
7514 static bool
7515 aarch64_vector_mode_supported_p (machine_mode mode)
7517 if (TARGET_SIMD
7518 && (mode == V4SImode || mode == V8HImode
7519 || mode == V16QImode || mode == V2DImode
7520 || mode == V2SImode || mode == V4HImode
7521 || mode == V8QImode || mode == V2SFmode
7522 || mode == V4SFmode || mode == V2DFmode
7523 || mode == V1DFmode))
7524 return true;
7526 return false;
7529 /* Return appropriate SIMD container
7530 for MODE within a vector of WIDTH bits. */
7531 static machine_mode
7532 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7534 gcc_assert (width == 64 || width == 128);
7535 if (TARGET_SIMD)
7537 if (width == 128)
7538 switch (mode)
7540 case DFmode:
7541 return V2DFmode;
7542 case SFmode:
7543 return V4SFmode;
7544 case SImode:
7545 return V4SImode;
7546 case HImode:
7547 return V8HImode;
7548 case QImode:
7549 return V16QImode;
7550 case DImode:
7551 return V2DImode;
7552 default:
7553 break;
7555 else
7556 switch (mode)
7558 case SFmode:
7559 return V2SFmode;
7560 case SImode:
7561 return V2SImode;
7562 case HImode:
7563 return V4HImode;
7564 case QImode:
7565 return V8QImode;
7566 default:
7567 break;
7570 return word_mode;
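/* Examples, added for exposition: SImode in a 128-bit container maps to
   V4SImode and in a 64-bit container to V2SImode; when TARGET_SIMD is not
   enabled the function falls back to word_mode, i.e. no vector container
   is offered.  */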
7573 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7574 static machine_mode
7575 aarch64_preferred_simd_mode (machine_mode mode)
7577 return aarch64_simd_container_mode (mode, 128);
7580 /* Return the bitmask of possible vector sizes for the vectorizer
7581 to iterate over. */
7582 static unsigned int
7583 aarch64_autovectorize_vector_sizes (void)
7585 return (16 | 8);
7588 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7589 vector types in order to conform to the AAPCS64 (see "Procedure
7590 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7591 qualify for emission with the mangled names defined in that document,
7592 a vector type must not only be of the correct mode but also be
7593 composed of AdvSIMD vector element types (e.g.
7594 __builtin_aarch64_simd_qi); these types are registered by
7595 aarch64_init_simd_builtins (). In other words, vector types defined
7596 in other ways, e.g. via the vector_size attribute, will get default
7597 mangled names. */
7598 typedef struct
7600 machine_mode mode;
7601 const char *element_type_name;
7602 const char *mangled_name;
7603 } aarch64_simd_mangle_map_entry;
7605 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7606 /* 64-bit containerized types. */
7607 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7608 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7609 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7610 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7611 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7612 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7613 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7614 { DImode, "__builtin_aarch64_simd_di", "11__Int64x1_t" },
7615 { DImode, "__builtin_aarch64_simd_udi", "12__Uint64x1_t" },
7616 { V1DFmode, "__builtin_aarch64_simd_df", "13__Float64x1_t" },
7617 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7618 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7619 /* 128-bit containerized types. */
7620 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7621 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7622 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7623 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7624 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7625 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7626 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7627 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7628 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7629 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7630 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7631 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7632 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7633 { VOIDmode, NULL, NULL }
7636 /* Implement TARGET_MANGLE_TYPE. */
7638 static const char *
7639 aarch64_mangle_type (const_tree type)
7641 /* The AArch64 ABI documents say that "__va_list" has to be
7642 mangled as if it is in the "std" namespace. */
7643 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7644 return "St9__va_list";
7646 /* Check the mode of the vector type, and the name of the vector
7647 element type, against the table. */
7648 if (TREE_CODE (type) == VECTOR_TYPE)
7650 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7652 while (pos->mode != VOIDmode)
7654 tree elt_type = TREE_TYPE (type);
7656 if (pos->mode == TYPE_MODE (type)
7657 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7658 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7659 pos->element_type_name))
7660 return pos->mangled_name;
7662 pos++;
7666 /* Use the default mangling. */
7667 return NULL;
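/* Illustrative example, added for exposition: the arm_neon.h type
   int8x8_t has mode V8QImode and element type __builtin_aarch64_simd_qi,
   so the table maps it to "10__Int8x8_t"; a C++ declaration such as

     void f (int8x8_t);

   therefore mangles as _Z1f10__Int8x8_t instead of using the default
   vector_size-style mangling.  */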
7671 /* Return true if the rtx_insn contains a MEM RTX somewhere
7672 in it. */
7674 static bool
7675 has_memory_op (rtx_insn *mem_insn)
7677 subrtx_iterator::array_type array;
7678 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
7679 if (MEM_P (*iter))
7680 return true;
7682 return false;
7685 /* Find the first rtx_insn before insn that will generate an assembly
7686 instruction. */
7688 static rtx_insn *
7689 aarch64_prev_real_insn (rtx_insn *insn)
7691 if (!insn)
7692 return NULL;
7696 insn = prev_real_insn (insn);
7698 while (insn && recog_memoized (insn) < 0);
7700 return insn;
7703 static bool
7704 is_madd_op (enum attr_type t1)
7706 unsigned int i;
7707 /* A number of these may be AArch32 only. */
7708 enum attr_type mlatypes[] = {
7709 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
7710 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
7711 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
7714 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
7716 if (t1 == mlatypes[i])
7717 return true;
7720 return false;
7723 /* Check if there is a register dependency between a load and the insn
7724 for which we hold recog_data. */
7726 static bool
7727 dep_between_memop_and_curr (rtx memop)
7729 rtx load_reg;
7730 int opno;
7732 gcc_assert (GET_CODE (memop) == SET);
7734 if (!REG_P (SET_DEST (memop)))
7735 return false;
7737 load_reg = SET_DEST (memop);
7738 for (opno = 1; opno < recog_data.n_operands; opno++)
7740 rtx operand = recog_data.operand[opno];
7741 if (REG_P (operand)
7742 && reg_overlap_mentioned_p (load_reg, operand))
7743 return true;
7746 return false;
7750 /* When working around the Cortex-A53 erratum 835769,
7751 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
7752 instruction and has a preceding memory instruction such that a NOP
7753 should be inserted between them. */
7755 bool
7756 aarch64_madd_needs_nop (rtx_insn* insn)
7758 enum attr_type attr_type;
7759 rtx_insn *prev;
7760 rtx body;
7762 if (!aarch64_fix_a53_err835769)
7763 return false;
7765 if (recog_memoized (insn) < 0)
7766 return false;
7768 attr_type = get_attr_type (insn);
7769 if (!is_madd_op (attr_type))
7770 return false;
7772 prev = aarch64_prev_real_insn (insn);
7773 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
7774 Restore recog state to INSN to avoid state corruption. */
7775 extract_constrain_insn_cached (insn);
7777 if (!prev || !has_memory_op (prev))
7778 return false;
7780 body = single_set (prev);
7782 /* If the previous insn is a memory op and there is no dependency between
7783 it and the DImode madd, emit a NOP between them. If body is NULL then we
7784 have a complex memory operation, probably a load/store pair.
7785 Be conservative for now and emit a NOP. */
7786 if (GET_MODE (recog_data.operand[0]) == DImode
7787 && (!body || !dep_between_memop_and_curr (body)))
7788 return true;
7790 return false;
7795 /* Implement FINAL_PRESCAN_INSN. */
7797 void
7798 aarch64_final_prescan_insn (rtx_insn *insn)
7800 if (aarch64_madd_needs_nop (insn))
7801 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
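/* Illustrative assembly sketch, added for exposition: with the erratum
   workaround enabled, a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   is emitted as

     ldr  x1, [x2]
     nop // between mem op and mult-accumulate
     madd x0, x3, x4, x5

   because the DImode multiply-accumulate directly follows a memory
   operation with no register dependency between the two (see
   aarch64_madd_needs_nop above).  */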
7805 /* Return the equivalent letter for size. */
7806 static char
7807 sizetochar (int size)
7809 switch (size)
7811 case 64: return 'd';
7812 case 32: return 's';
7813 case 16: return 'h';
7814 case 8 : return 'b';
7815 default: gcc_unreachable ();
7819 /* Return true iff x is a uniform vector of floating-point
7820 constants, and the constant can be represented in
7821 quarter-precision form. Note that, as aarch64_float_const_representable_p
7822 rejects both +0.0 and -0.0, we reject them here as well. */
7823 static bool
7824 aarch64_vect_float_const_representable_p (rtx x)
7826 int i = 0;
7827 REAL_VALUE_TYPE r0, ri;
7828 rtx x0, xi;
7830 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7831 return false;
7833 x0 = CONST_VECTOR_ELT (x, 0);
7834 if (!CONST_DOUBLE_P (x0))
7835 return false;
7837 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7839 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7841 xi = CONST_VECTOR_ELT (x, i);
7842 if (!CONST_DOUBLE_P (xi))
7843 return false;
7845 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7846 if (!REAL_VALUES_EQUAL (r0, ri))
7847 return false;
7850 return aarch64_float_const_representable_p (x0);
7853 /* Return true for valid and false for invalid. */
7854 bool
7855 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
7856 struct simd_immediate_info *info)
7858 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7859 matches = 1; \
7860 for (i = 0; i < idx; i += (STRIDE)) \
7861 if (!(TEST)) \
7862 matches = 0; \
7863 if (matches) \
7865 immtype = (CLASS); \
7866 elsize = (ELSIZE); \
7867 eshift = (SHIFT); \
7868 emvn = (NEG); \
7869 break; \
7872 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7873 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7874 unsigned char bytes[16];
7875 int immtype = -1, matches;
7876 unsigned int invmask = inverse ? 0xff : 0;
7877 int eshift, emvn;
7879 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7881 if (! (aarch64_simd_imm_zero_p (op, mode)
7882 || aarch64_vect_float_const_representable_p (op)))
7883 return false;
7885 if (info)
7887 info->value = CONST_VECTOR_ELT (op, 0);
7888 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7889 info->mvn = false;
7890 info->shift = 0;
7893 return true;
7896 /* Splat vector constant out into a byte vector. */
7897 for (i = 0; i < n_elts; i++)
7899 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7900 it must be laid out in the vector register in reverse order. */
7901 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7902 unsigned HOST_WIDE_INT elpart;
7903 unsigned int part, parts;
7905 if (CONST_INT_P (el))
7907 elpart = INTVAL (el);
7908 parts = 1;
7910 else if (GET_CODE (el) == CONST_DOUBLE)
7912 elpart = CONST_DOUBLE_LOW (el);
7913 parts = 2;
7915 else
7916 gcc_unreachable ();
7918 for (part = 0; part < parts; part++)
7920 unsigned int byte;
7921 for (byte = 0; byte < innersize; byte++)
7923 bytes[idx++] = (elpart & 0xff) ^ invmask;
7924 elpart >>= BITS_PER_UNIT;
7926 if (GET_CODE (el) == CONST_DOUBLE)
7927 elpart = CONST_DOUBLE_HIGH (el);
7931 /* Sanity check. */
7932 gcc_assert (idx == GET_MODE_SIZE (mode));
7936 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7937 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7939 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7940 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7942 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7943 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7945 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7946 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7948 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7950 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7952 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7953 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7955 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7956 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7958 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7959 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7961 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7962 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7964 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7966 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7968 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7969 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7971 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7972 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7974 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7975 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7977 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7978 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7980 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7982 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7983 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7985 while (0);
7987 if (immtype == -1)
7988 return false;
7990 if (info)
7992 info->element_width = elsize;
7993 info->mvn = emvn != 0;
7994 info->shift = eshift;
7996 unsigned HOST_WIDE_INT imm = 0;
7998 if (immtype >= 12 && immtype <= 15)
7999 info->msl = true;
8001 /* Un-invert bytes of recognized vector, if necessary. */
8002 if (invmask != 0)
8003 for (i = 0; i < idx; i++)
8004 bytes[i] ^= invmask;
8006 if (immtype == 17)
8008 /* FIXME: Broken on 32-bit H_W_I hosts. */
8009 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8011 for (i = 0; i < 8; i++)
8012 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8013 << (i * BITS_PER_UNIT);
8016 info->value = GEN_INT (imm);
8018 else
8020 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8021 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8023 /* Construct 'abcdefgh' because the assembler cannot handle
8024 generic constants. */
8025 if (info->mvn)
8026 imm = ~imm;
8027 imm = (imm >> info->shift) & 0xff;
8028 info->value = GEN_INT (imm);
8032 return true;
8033 #undef CHECK
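/* Worked example, added for exposition: a V4SImode constant whose
   elements are all 0x00ff0000 splats to the repeating byte pattern
   00 00 ff 00 and matches the elsize-32, shift-16 CHECK above
   (immtype 2), so INFO describes it as the 8-bit value 0xff shifted left
   by 16, i.e. something the assembler can express as a single
   "movi v0.4s, #0xff, lsl #16".  */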
8036 /* Check if immediate shift constants are within range. */
8037 bool
8038 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8040 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8041 if (left)
8042 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8043 else
8044 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8047 /* Return true if X is a uniform vector where all elements
8048 are either the floating-point constant 0.0 or the
8049 integer constant 0. */
8050 bool
8051 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8053 return x == CONST0_RTX (mode);
8056 bool
8057 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8059 HOST_WIDE_INT imm = INTVAL (x);
8060 int i;
8062 for (i = 0; i < 8; i++)
8064 unsigned int byte = imm & 0xff;
8065 if (byte != 0xff && byte != 0)
8066 return false;
8067 imm >>= 8;
8070 return true;
8073 bool
8074 aarch64_mov_operand_p (rtx x,
8075 enum aarch64_symbol_context context,
8076 machine_mode mode)
8078 if (GET_CODE (x) == HIGH
8079 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8080 return true;
8082 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
8083 return true;
8085 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8086 return true;
8088 return aarch64_classify_symbolic_expression (x, context)
8089 == SYMBOL_TINY_ABSOLUTE;
8092 /* Return a const_int vector of VAL. */
8094 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8096 int nunits = GET_MODE_NUNITS (mode);
8097 rtvec v = rtvec_alloc (nunits);
8098 int i;
8100 for (i = 0; i < nunits; i++)
8101 RTVEC_ELT (v, i) = GEN_INT (val);
8103 return gen_rtx_CONST_VECTOR (mode, v);
8106 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8108 bool
8109 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8111 machine_mode vmode;
8113 gcc_assert (!VECTOR_MODE_P (mode));
8114 vmode = aarch64_preferred_simd_mode (mode);
8115 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8116 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8119 /* Construct and return a PARALLEL RTX vector with elements numbering the
8120 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8121 the vector - from the perspective of the architecture. This does not
8122 line up with GCC's perspective on lane numbers, so we end up with
8123 different masks depending on our target endian-ness. The diagram
8124 below may help. We must draw the distinction when building masks
8125 which select one half of the vector. An instruction selecting
8126 architectural low-lanes for a big-endian target, must be described using
8127 a mask selecting GCC high-lanes.
8129 Big-Endian Little-Endian
8131 GCC 0 1 2 3 3 2 1 0
8132 | x | x | x | x | | x | x | x | x |
8133 Architecture 3 2 1 0 3 2 1 0
8135 Low Mask: { 2, 3 } { 0, 1 }
8136 High Mask: { 0, 1 } { 2, 3 }
8140 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8142 int nunits = GET_MODE_NUNITS (mode);
8143 rtvec v = rtvec_alloc (nunits / 2);
8144 int high_base = nunits / 2;
8145 int low_base = 0;
8146 int base;
8147 rtx t1;
8148 int i;
8150 if (BYTES_BIG_ENDIAN)
8151 base = high ? low_base : high_base;
8152 else
8153 base = high ? high_base : low_base;
8155 for (i = 0; i < nunits / 2; i++)
8156 RTVEC_ELT (v, i) = GEN_INT (base + i);
8158 t1 = gen_rtx_PARALLEL (mode, v);
8159 return t1;
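/* Worked example, added for exposition: for V4SImode with HIGH == true
   this returns (parallel [2 3]) on little-endian but (parallel [0 1]) on
   big-endian, matching the "High Mask" row of the diagram above.  */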
8162 /* Check OP for validity as a PARALLEL RTX vector with elements
8163 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8164 from the perspective of the architecture. See the diagram above
8165 aarch64_simd_vect_par_cnst_half for more details. */
8167 bool
8168 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8169 bool high)
8171 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8172 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8173 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8174 int i = 0;
8176 if (!VECTOR_MODE_P (mode))
8177 return false;
8179 if (count_op != count_ideal)
8180 return false;
8182 for (i = 0; i < count_ideal; i++)
8184 rtx elt_op = XVECEXP (op, 0, i);
8185 rtx elt_ideal = XVECEXP (ideal, 0, i);
8187 if (!CONST_INT_P (elt_op)
8188 || INTVAL (elt_ideal) != INTVAL (elt_op))
8189 return false;
8191 return true;
8194 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8195 HIGH (exclusive). */
8196 void
8197 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
8199 HOST_WIDE_INT lane;
8200 gcc_assert (CONST_INT_P (operand));
8201 lane = INTVAL (operand);
8203 if (lane < low || lane >= high)
8204 error ("lane out of range");
8207 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
8208 registers). */
8209 void
8210 aarch64_simd_emit_pair_result_insn (machine_mode mode,
8211 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8212 rtx op1)
8214 rtx mem = gen_rtx_MEM (mode, destaddr);
8215 rtx tmp1 = gen_reg_rtx (mode);
8216 rtx tmp2 = gen_reg_rtx (mode);
8218 emit_insn (intfn (tmp1, op1, tmp2));
8220 emit_move_insn (mem, tmp1);
8221 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8222 emit_move_insn (mem, tmp2);
8225 /* Return TRUE if OP is a valid vector addressing mode. */
8226 bool
8227 aarch64_simd_mem_operand_p (rtx op)
8229 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8230 || REG_P (XEXP (op, 0)));
8233 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8234 not to early-clobber SRC registers in the process.
8236 We assume that the operands described by SRC and DEST represent a
8237 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
8238 number of components into which the copy has been decomposed. */
8239 void
8240 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8241 rtx *src, unsigned int count)
8243 unsigned int i;
8245 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8246 || REGNO (operands[0]) < REGNO (operands[1]))
8248 for (i = 0; i < count; i++)
8250 operands[2 * i] = dest[i];
8251 operands[2 * i + 1] = src[i];
8254 else
8256 for (i = 0; i < count; i++)
8258 operands[2 * i] = dest[count - i - 1];
8259 operands[2 * i + 1] = src[count - i - 1];
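/* Illustrative example, added for exposition: when splitting a copy of an
   OImode value held in v0-v1 into v1-v2, the destination overlaps the
   source and has the higher starting REGNO, so the else branch above
   orders the component moves in reverse (v2 <- v1 first, then v1 <- v0)
   to avoid clobbering v1 before it has been read.  */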
8264 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8265 one of VSTRUCT modes: OI, CI or XI. */
8267 aarch64_simd_attr_length_move (rtx_insn *insn)
8269 machine_mode mode;
8271 extract_insn_cached (insn);
8273 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8275 mode = GET_MODE (recog_data.operand[0]);
8276 switch (mode)
8278 case OImode:
8279 return 8;
8280 case CImode:
8281 return 12;
8282 case XImode:
8283 return 16;
8284 default:
8285 gcc_unreachable ();
8288 return 4;
8291 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8292 alignment of a vector to 128 bits. */
8293 static HOST_WIDE_INT
8294 aarch64_simd_vector_alignment (const_tree type)
8296 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8297 return MIN (align, 128);
8300 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8301 static bool
8302 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8304 if (is_packed)
8305 return false;
8307 /* We guarantee alignment for vectors up to 128-bits. */
8308 if (tree_int_cst_compare (TYPE_SIZE (type),
8309 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8310 return false;
8312 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8313 return true;
8316 /* If VALS is a vector constant that can be loaded into a register
8317 using DUP, generate instructions to do so and return an RTX to
8318 assign to the register. Otherwise return NULL_RTX. */
8319 static rtx
8320 aarch64_simd_dup_constant (rtx vals)
8322 machine_mode mode = GET_MODE (vals);
8323 machine_mode inner_mode = GET_MODE_INNER (mode);
8324 int n_elts = GET_MODE_NUNITS (mode);
8325 bool all_same = true;
8326 rtx x;
8327 int i;
8329 if (GET_CODE (vals) != CONST_VECTOR)
8330 return NULL_RTX;
8332 for (i = 1; i < n_elts; ++i)
8334 x = CONST_VECTOR_ELT (vals, i);
8335 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8336 all_same = false;
8339 if (!all_same)
8340 return NULL_RTX;
8342 /* We can load this constant by using DUP and a constant in a
8343 single ARM register. This will be cheaper than a vector
8344 load. */
8345 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8346 return gen_rtx_VEC_DUPLICATE (mode, x);
8350 /* Generate code to load VALS, which is a PARALLEL containing only
8351 constants (for vec_init) or CONST_VECTOR, efficiently into a
8352 register. Returns an RTX to copy into the register, or NULL_RTX
8353 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8354 static rtx
8355 aarch64_simd_make_constant (rtx vals)
8357 machine_mode mode = GET_MODE (vals);
8358 rtx const_dup;
8359 rtx const_vec = NULL_RTX;
8360 int n_elts = GET_MODE_NUNITS (mode);
8361 int n_const = 0;
8362 int i;
8364 if (GET_CODE (vals) == CONST_VECTOR)
8365 const_vec = vals;
8366 else if (GET_CODE (vals) == PARALLEL)
8368 /* A CONST_VECTOR must contain only CONST_INTs and
8369 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8370 Only store valid constants in a CONST_VECTOR. */
8371 for (i = 0; i < n_elts; ++i)
8373 rtx x = XVECEXP (vals, 0, i);
8374 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8375 n_const++;
8377 if (n_const == n_elts)
8378 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8380 else
8381 gcc_unreachable ();
8383 if (const_vec != NULL_RTX
8384 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8385 /* Load using MOVI/MVNI. */
8386 return const_vec;
8387 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8388 /* Loaded using DUP. */
8389 return const_dup;
8390 else if (const_vec != NULL_RTX)
8391 /* Load from constant pool. We can not take advantage of single-cycle
8392 LD1 because we need a PC-relative addressing mode. */
8393 return const_vec;
8394 else
8395 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8396 We can not construct an initializer. */
8397 return NULL_RTX;
8400 void
8401 aarch64_expand_vector_init (rtx target, rtx vals)
8403 machine_mode mode = GET_MODE (target);
8404 machine_mode inner_mode = GET_MODE_INNER (mode);
8405 int n_elts = GET_MODE_NUNITS (mode);
8406 int n_var = 0, one_var = -1;
8407 bool all_same = true;
8408 rtx x, mem;
8409 int i;
8411 x = XVECEXP (vals, 0, 0);
8412 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8413 n_var = 1, one_var = 0;
8415 for (i = 1; i < n_elts; ++i)
8417 x = XVECEXP (vals, 0, i);
8418 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8419 ++n_var, one_var = i;
8421 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8422 all_same = false;
8425 if (n_var == 0)
8427 rtx constant = aarch64_simd_make_constant (vals);
8428 if (constant != NULL_RTX)
8430 emit_move_insn (target, constant);
8431 return;
8435 /* Splat a single non-constant element if we can. */
8436 if (all_same)
8438 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8439 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8440 return;
8443 /* One field is non-constant. Load constant then overwrite varying
8444 field. This is more efficient than using the stack. */
8445 if (n_var == 1)
8447 rtx copy = copy_rtx (vals);
8448 rtx index = GEN_INT (one_var);
8449 enum insn_code icode;
8451 /* Load constant part of vector, substitute neighboring value for
8452 varying element. */
8453 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8454 aarch64_expand_vector_init (target, copy);
8456 /* Insert variable. */
8457 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8458 icode = optab_handler (vec_set_optab, mode);
8459 gcc_assert (icode != CODE_FOR_nothing);
8460 emit_insn (GEN_FCN (icode) (target, x, index));
8461 return;
8464 /* Construct the vector in memory one field at a time
8465 and load the whole vector. */
8466 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8467 for (i = 0; i < n_elts; i++)
8468 emit_move_insn (adjust_address_nv (mem, inner_mode,
8469 i * GET_MODE_SIZE (inner_mode)),
8470 XVECEXP (vals, 0, i));
8471 emit_move_insn (target, mem);
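/* For example, initializing a V4SI vector from { x, 1, 2, 3 } where only X
   lives in a register takes the n_var == 1 path above: the constant vector
   { 1, 1, 2, 3 } (the varying lane replaced by its neighbour) is
   materialized first, and the variable lane is then inserted through the
   vec_set pattern, which typically ends up as a single INS instruction.
   This avoids spilling the whole initializer to the stack.  */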
8475 static unsigned HOST_WIDE_INT
8476 aarch64_shift_truncation_mask (machine_mode mode)
8478 return
8479 (aarch64_vector_mode_supported_p (mode)
8480 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
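/* Returning 63 for DImode and 31 for SImode lets the middle end omit an
   explicit "& (width - 1)" on scalar shift counts, whereas vector (and
   vector-structure) modes return 0 because the AdvSIMD shift instructions
   do not simply truncate the shift amount.  */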
8483 #ifndef TLS_SECTION_ASM_FLAG
8484 #define TLS_SECTION_ASM_FLAG 'T'
8485 #endif
8487 void
8488 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8489 tree decl ATTRIBUTE_UNUSED)
8491 char flagchars[10], *f = flagchars;
8493 /* If we have already declared this section, we can use an
8494 abbreviated form to switch back to it -- unless this section is
8495 part of a COMDAT group, in which case GAS requires the full
8496 declaration every time. */
8497 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8498 && (flags & SECTION_DECLARED))
8500 fprintf (asm_out_file, "\t.section\t%s\n", name);
8501 return;
8504 if (!(flags & SECTION_DEBUG))
8505 *f++ = 'a';
8506 if (flags & SECTION_WRITE)
8507 *f++ = 'w';
8508 if (flags & SECTION_CODE)
8509 *f++ = 'x';
8510 if (flags & SECTION_SMALL)
8511 *f++ = 's';
8512 if (flags & SECTION_MERGE)
8513 *f++ = 'M';
8514 if (flags & SECTION_STRINGS)
8515 *f++ = 'S';
8516 if (flags & SECTION_TLS)
8517 *f++ = TLS_SECTION_ASM_FLAG;
8518 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8519 *f++ = 'G';
8520 *f = '\0';
8522 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8524 if (!(flags & SECTION_NOTYPE))
8526 const char *type;
8527 const char *format;
8529 if (flags & SECTION_BSS)
8530 type = "nobits";
8531 else
8532 type = "progbits";
8534 #ifdef TYPE_OPERAND_FMT
8535 format = "," TYPE_OPERAND_FMT;
8536 #else
8537 format = ",@%s";
8538 #endif
8540 fprintf (asm_out_file, format, type);
8542 if (flags & SECTION_ENTSIZE)
8543 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8544 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8546 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8547 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8548 else
8549 fprintf (asm_out_file, ",%s,comdat",
8550 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8554 putc ('\n', asm_out_file);
8557 /* Select a format to encode pointers in exception handling data. */
8559 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8561 int type;
8562 switch (aarch64_cmodel)
8564 case AARCH64_CMODEL_TINY:
8565 case AARCH64_CMODEL_TINY_PIC:
8566 case AARCH64_CMODEL_SMALL:
8567 case AARCH64_CMODEL_SMALL_PIC:
8568 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8569 for everything. */
8570 type = DW_EH_PE_sdata4;
8571 break;
8572 default:
8573 /* No assumptions here. 8-byte relocs required. */
8574 type = DW_EH_PE_sdata8;
8575 break;
8577 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
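/* For instance, under the small code model a global symbol is encoded as
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4, i.e. the familiar
   0x9b seen in .eh_frame/.gcc_except_table augmentation data, while the
   other code models fall back to 8-byte (sdata8) relocations.  */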
8580 /* Emit load exclusive. */
8582 static void
8583 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8584 rtx mem, rtx model_rtx)
8586 rtx (*gen) (rtx, rtx, rtx);
8588 switch (mode)
8590 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8591 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8592 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8593 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8594 default:
8595 gcc_unreachable ();
8598 emit_insn (gen (rval, mem, model_rtx));
8601 /* Emit store exclusive. */
8603 static void
8604 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
8605 rtx rval, rtx mem, rtx model_rtx)
8607 rtx (*gen) (rtx, rtx, rtx, rtx);
8609 switch (mode)
8611 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8612 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8613 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8614 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8615 default:
8616 gcc_unreachable ();
8619 emit_insn (gen (bval, rval, mem, model_rtx));
8622 /* Emit jump instruction INSN and mark it as very unlikely to be taken. */
8624 static void
8625 aarch64_emit_unlikely_jump (rtx insn)
8627 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8629 insn = emit_jump_insn (insn);
8630 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8633 /* Expand a compare and swap pattern. */
8635 void
8636 aarch64_expand_compare_and_swap (rtx operands[])
8638 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8639 machine_mode mode, cmp_mode;
8640 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8642 bval = operands[0];
8643 rval = operands[1];
8644 mem = operands[2];
8645 oldval = operands[3];
8646 newval = operands[4];
8647 is_weak = operands[5];
8648 mod_s = operands[6];
8649 mod_f = operands[7];
8650 mode = GET_MODE (mem);
8651 cmp_mode = mode;
8653 /* Normally the succ memory model must be stronger than fail, but in the
8654 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8655 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8657 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8658 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8659 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8661 switch (mode)
8663 case QImode:
8664 case HImode:
8665 /* For short modes, we're going to perform the comparison in SImode,
8666 so do the zero-extension now. */
8667 cmp_mode = SImode;
8668 rval = gen_reg_rtx (SImode);
8669 oldval = convert_modes (SImode, mode, oldval, true);
8670 /* Fall through. */
8672 case SImode:
8673 case DImode:
8674 /* Force the value into a register if needed. */
8675 if (!aarch64_plus_operand (oldval, mode))
8676 oldval = force_reg (cmp_mode, oldval);
8677 break;
8679 default:
8680 gcc_unreachable ();
8683 switch (mode)
8685 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8686 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8687 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8688 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8689 default:
8690 gcc_unreachable ();
8693 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8695 if (mode == QImode || mode == HImode)
8696 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8698 x = gen_rtx_REG (CCmode, CC_REGNUM);
8699 x = gen_rtx_EQ (SImode, x, const0_rtx);
8700 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8703 /* Split a compare and swap pattern. */
8705 void
8706 aarch64_split_compare_and_swap (rtx operands[])
8708 rtx rval, mem, oldval, newval, scratch;
8709 machine_mode mode;
8710 bool is_weak;
8711 rtx_code_label *label1, *label2;
8712 rtx x, cond;
8714 rval = operands[0];
8715 mem = operands[1];
8716 oldval = operands[2];
8717 newval = operands[3];
8718 is_weak = (operands[4] != const0_rtx);
8719 scratch = operands[7];
8720 mode = GET_MODE (mem);
8722 label1 = NULL;
8723 if (!is_weak)
8725 label1 = gen_label_rtx ();
8726 emit_label (label1);
8728 label2 = gen_label_rtx ();
8730 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8732 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8733 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8734 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8735 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8736 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8738 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8740 if (!is_weak)
8742 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8743 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8744 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8745 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8747 else
8749 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8750 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8751 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8754 emit_label (label2);
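/* Roughly, for a strong compare-and-swap on a DImode location with SEQ_CST
   ordering, the split above corresponds to a sequence of this shape
   (register numbers illustrative only):

       .Lretry:
         ldaxr   x0, [x1]          ; load-exclusive, acquire
         cmp     x0, x2            ; compare against the expected value
         b.ne    .Ldone            ; mismatch: give up (branch marked unlikely)
         stlxr   w3, x4, [x1]      ; store-exclusive, release
         cbnz    w3, .Lretry       ; store-exclusive failed: retry
       .Ldone:

   A weak compare-and-swap omits the retry loop and instead leaves the
   store-exclusive result in the condition flags for the caller.  */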
8757 /* Split an atomic operation. */
8759 void
8760 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8761 rtx value, rtx model_rtx, rtx cond)
8763 machine_mode mode = GET_MODE (mem);
8764 machine_mode wmode = (mode == DImode ? DImode : SImode);
8765 rtx_code_label *label;
8766 rtx x;
8768 label = gen_label_rtx ();
8769 emit_label (label);
8771 if (new_out)
8772 new_out = gen_lowpart (wmode, new_out);
8773 if (old_out)
8774 old_out = gen_lowpart (wmode, old_out);
8775 else
8776 old_out = new_out;
8777 value = simplify_gen_subreg (wmode, value, mode, 0);
8779 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8781 switch (code)
8783 case SET:
8784 new_out = value;
8785 break;
8787 case NOT:
8788 x = gen_rtx_AND (wmode, old_out, value);
8789 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8790 x = gen_rtx_NOT (wmode, new_out);
8791 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8792 break;
8794 case MINUS:
8795 if (CONST_INT_P (value))
8797 value = GEN_INT (-INTVAL (value));
8798 code = PLUS;
8800 /* Fall through. */
8802 default:
8803 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8804 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8805 break;
8808 aarch64_emit_store_exclusive (mode, cond, mem,
8809 gen_lowpart (mode, new_out), model_rtx);
8811 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8812 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8813 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8814 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8817 static void
8818 aarch64_print_extension (void)
8820 const struct aarch64_option_extension *opt = NULL;
8822 for (opt = all_extensions; opt->name != NULL; opt++)
8823 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8824 asm_fprintf (asm_out_file, "+%s", opt->name);
8826 asm_fprintf (asm_out_file, "\n");
8829 static void
8830 aarch64_start_file (void)
8832 if (selected_arch)
8834 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8835 aarch64_print_extension ();
8837 else if (selected_cpu)
8839 const char *truncated_name
8840 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8841 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8842 aarch64_print_extension ();
8844 default_file_start();
8847 /* Target hook for c_mode_for_suffix. */
8848 static machine_mode
8849 aarch64_c_mode_for_suffix (char suffix)
8851 if (suffix == 'q')
8852 return TFmode;
8854 return VOIDmode;
8857 /* We can only represent floating point constants which will fit in
8858 "quarter-precision" values. These values are characterised by
8859 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
8862 (-1)^s * (n/16) * 2^r
8864 Where:
8865 's' is the sign bit.
8866 'n' is an integer in the range 16 <= n <= 31.
8867 'r' is an integer in the range -3 <= r <= 4. */
8869 /* Return true iff X can be represented by a quarter-precision
8870 floating point immediate operand. Note, we cannot represent 0.0. */
8871 bool
8872 aarch64_float_const_representable_p (rtx x)
8874 /* This represents our current view of how many bits
8875 make up the mantissa. */
8876 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8877 int exponent;
8878 unsigned HOST_WIDE_INT mantissa, mask;
8879 REAL_VALUE_TYPE r, m;
8880 bool fail;
8882 if (!CONST_DOUBLE_P (x))
8883 return false;
8885 if (GET_MODE (x) == VOIDmode)
8886 return false;
8888 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8890 /* We cannot represent infinities, NaNs or +/-zero. We won't
8891 know if we have +zero until we analyse the mantissa, but we
8892 can reject the other invalid values. */
8893 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8894 || REAL_VALUE_MINUS_ZERO (r))
8895 return false;
8897 /* Extract exponent. */
8898 r = real_value_abs (&r);
8899 exponent = REAL_EXP (&r);
8901 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8902 highest (sign) bit, with a fixed binary point at bit point_pos.
8903 m1 holds the low part of the mantissa, m2 the high part.
8904 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8905 bits for the mantissa, this can fail (low bits will be lost). */
8906 real_ldexp (&m, &r, point_pos - exponent);
8907 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8909 /* If the low part of the mantissa has bits set we cannot represent
8910 the value. */
8911 if (w.elt (0) != 0)
8912 return false;
8913 /* We have rejected the lower HOST_WIDE_INT, so update our
8914 understanding of how many bits lie in the mantissa and
8915 look only at the high HOST_WIDE_INT. */
8916 mantissa = w.elt (1);
8917 point_pos -= HOST_BITS_PER_WIDE_INT;
8919 /* We can only represent values with a mantissa of the form 1.xxxx. */
8920 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8921 if ((mantissa & mask) != 0)
8922 return false;
8924 /* Having filtered unrepresentable values, we may now remove all
8925 but the highest 5 bits. */
8926 mantissa >>= point_pos - 5;
8928 /* We cannot represent the value 0.0, so reject it. This is handled
8929 elsewhere. */
8930 if (mantissa == 0)
8931 return false;
8933 /* Then, as bit 4 is always set, we can mask it off, leaving
8934 the mantissa in the range [0, 15]. */
8935 mantissa &= ~(1 << 4);
8936 gcc_assert (mantissa <= 15);
8938 /* GCC internally does not use IEEE754-like encoding (where normalized
8939 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
8940 Our mantissa values are shifted 4 places to the left relative to
8941 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8942 by 5 places to correct for GCC's representation. */
8943 exponent = 5 - exponent;
8945 return (exponent >= 0 && exponent <= 7);
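/* Worked example: 1.5 can be written as (-1)^0 * (24/16) * 2^0, so s = 0,
   n = 24 and r = 0 are all within range and the constant is representable
   (and hence usable as an FMOV-immediate operand).  0.0 and 1.0/3.0, by
   contrast, are rejected by the checks above.  */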
8948 char*
8949 aarch64_output_simd_mov_immediate (rtx const_vector,
8950 machine_mode mode,
8951 unsigned width)
8953 bool is_valid;
8954 static char templ[40];
8955 const char *mnemonic;
8956 const char *shift_op;
8957 unsigned int lane_count = 0;
8958 char element_char;
8960 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8962 /* This will return true to show const_vector is legal for use as
8963 an AdvSIMD MOVI (or, implicitly, MVNI) immediate. It will
8964 also update INFO to show how the immediate should be generated. */
8965 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8966 gcc_assert (is_valid);
8968 element_char = sizetochar (info.element_width);
8969 lane_count = width / info.element_width;
8971 mode = GET_MODE_INNER (mode);
8972 if (mode == SFmode || mode == DFmode)
8974 gcc_assert (info.shift == 0 && ! info.mvn);
8975 if (aarch64_float_const_zero_rtx_p (info.value))
8976 info.value = GEN_INT (0);
8977 else
8979 #define buf_size 20
8980 REAL_VALUE_TYPE r;
8981 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8982 char float_buf[buf_size] = {'\0'};
8983 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8984 #undef buf_size
8986 if (lane_count == 1)
8987 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8988 else
8989 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8990 lane_count, element_char, float_buf);
8991 return templ;
8995 mnemonic = info.mvn ? "mvni" : "movi";
8996 shift_op = info.msl ? "msl" : "lsl";
8998 if (lane_count == 1)
8999 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9000 mnemonic, UINTVAL (info.value));
9001 else if (info.shift)
9002 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9003 ", %s %d", mnemonic, lane_count, element_char,
9004 UINTVAL (info.value), shift_op, info.shift);
9005 else
9006 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9007 mnemonic, lane_count, element_char, UINTVAL (info.value));
9008 return templ;
9011 char*
9012 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9013 machine_mode mode)
9015 machine_mode vmode;
9017 gcc_assert (!VECTOR_MODE_P (mode));
9018 vmode = aarch64_simd_container_mode (mode, 64);
9019 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9020 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9023 /* Split operands into moves from op[1] + op[2] into op[0]. */
9025 void
9026 aarch64_split_combinev16qi (rtx operands[3])
9028 unsigned int dest = REGNO (operands[0]);
9029 unsigned int src1 = REGNO (operands[1]);
9030 unsigned int src2 = REGNO (operands[2]);
9031 machine_mode halfmode = GET_MODE (operands[1]);
9032 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9033 rtx destlo, desthi;
9035 gcc_assert (halfmode == V16QImode);
9037 if (src1 == dest && src2 == dest + halfregs)
9039 /* No-op move. Can't split to nothing; emit something. */
9040 emit_note (NOTE_INSN_DELETED);
9041 return;
9044 /* Preserve register attributes for variable tracking. */
9045 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9046 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9047 GET_MODE_SIZE (halfmode));
9049 /* Special case of reversed high/low parts. */
9050 if (reg_overlap_mentioned_p (operands[2], destlo)
9051 && reg_overlap_mentioned_p (operands[1], desthi))
9053 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9054 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9055 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9057 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9059 /* Try to avoid unnecessary moves if part of the result
9060 is in the right place already. */
9061 if (src1 != dest)
9062 emit_move_insn (destlo, operands[1]);
9063 if (src2 != dest + halfregs)
9064 emit_move_insn (desthi, operands[2]);
9066 else
9068 if (src2 != dest + halfregs)
9069 emit_move_insn (desthi, operands[2]);
9070 if (src1 != dest)
9071 emit_move_insn (destlo, operands[1]);
9075 /* vec_perm support. */
9077 #define MAX_VECT_LEN 16
9079 struct expand_vec_perm_d
9081 rtx target, op0, op1;
9082 unsigned char perm[MAX_VECT_LEN];
9083 machine_mode vmode;
9084 unsigned char nelt;
9085 bool one_vector_p;
9086 bool testing_p;
9089 /* Generate a variable permutation. */
9091 static void
9092 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9094 machine_mode vmode = GET_MODE (target);
9095 bool one_vector_p = rtx_equal_p (op0, op1);
9097 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9098 gcc_checking_assert (GET_MODE (op0) == vmode);
9099 gcc_checking_assert (GET_MODE (op1) == vmode);
9100 gcc_checking_assert (GET_MODE (sel) == vmode);
9101 gcc_checking_assert (TARGET_SIMD);
9103 if (one_vector_p)
9105 if (vmode == V8QImode)
9107 /* Expand the argument to a V16QI mode by duplicating it. */
9108 rtx pair = gen_reg_rtx (V16QImode);
9109 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9110 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9112 else
9114 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9117 else
9119 rtx pair;
9121 if (vmode == V8QImode)
9123 pair = gen_reg_rtx (V16QImode);
9124 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9125 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9127 else
9129 pair = gen_reg_rtx (OImode);
9130 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9131 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9136 void
9137 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9139 machine_mode vmode = GET_MODE (target);
9140 unsigned int nelt = GET_MODE_NUNITS (vmode);
9141 bool one_vector_p = rtx_equal_p (op0, op1);
9142 rtx mask;
9144 /* The TBL instruction does not use a modulo index, so we must take care
9145 of that ourselves. */
9146 mask = aarch64_simd_gen_const_vector_dup (vmode,
9147 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9148 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9150 /* For big-endian, we also need to reverse the index within the vector
9151 (but not which vector). */
9152 if (BYTES_BIG_ENDIAN)
9154 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9155 if (!one_vector_p)
9156 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9157 sel = expand_simple_binop (vmode, XOR, sel, mask,
9158 NULL, 0, OPTAB_LIB_WIDEN);
9160 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9163 /* Recognize patterns suitable for the TRN instructions. */
9164 static bool
9165 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9167 unsigned int i, odd, mask, nelt = d->nelt;
9168 rtx out, in0, in1, x;
9169 rtx (*gen) (rtx, rtx, rtx);
9170 machine_mode vmode = d->vmode;
9172 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9173 return false;
9175 /* Note that these are little-endian tests.
9176 We correct for big-endian later. */
9177 if (d->perm[0] == 0)
9178 odd = 0;
9179 else if (d->perm[0] == 1)
9180 odd = 1;
9181 else
9182 return false;
9183 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9185 for (i = 0; i < nelt; i += 2)
9187 if (d->perm[i] != i + odd)
9188 return false;
9189 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9190 return false;
9193 /* Success! */
9194 if (d->testing_p)
9195 return true;
9197 in0 = d->op0;
9198 in1 = d->op1;
9199 if (BYTES_BIG_ENDIAN)
9201 x = in0, in0 = in1, in1 = x;
9202 odd = !odd;
9204 out = d->target;
9206 if (odd)
9208 switch (vmode)
9210 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9211 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9212 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9213 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9214 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9215 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9216 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9217 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9218 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9219 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9220 default:
9221 return false;
9224 else
9226 switch (vmode)
9228 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9229 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9230 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9231 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9232 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9233 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9234 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9235 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9236 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9237 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9238 default:
9239 return false;
9243 emit_insn (gen (out, in0, in1));
9244 return true;
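/* For example, on V4SI a two-operand permutation selecting elements
   { 0, 4, 2, 6 } (the even lanes of each input, interleaved) passes the
   test above with odd == 0 and is emitted as trn1, while { 1, 5, 3, 7 }
   matches with odd == 1 and becomes trn2 (operands and ODD are swapped
   first on big-endian).  */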
9247 /* Recognize patterns suitable for the UZP instructions. */
9248 static bool
9249 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9251 unsigned int i, odd, mask, nelt = d->nelt;
9252 rtx out, in0, in1, x;
9253 rtx (*gen) (rtx, rtx, rtx);
9254 machine_mode vmode = d->vmode;
9256 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9257 return false;
9259 /* Note that these are little-endian tests.
9260 We correct for big-endian later. */
9261 if (d->perm[0] == 0)
9262 odd = 0;
9263 else if (d->perm[0] == 1)
9264 odd = 1;
9265 else
9266 return false;
9267 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9269 for (i = 0; i < nelt; i++)
9271 unsigned elt = (i * 2 + odd) & mask;
9272 if (d->perm[i] != elt)
9273 return false;
9276 /* Success! */
9277 if (d->testing_p)
9278 return true;
9280 in0 = d->op0;
9281 in1 = d->op1;
9282 if (BYTES_BIG_ENDIAN)
9284 x = in0, in0 = in1, in1 = x;
9285 odd = !odd;
9287 out = d->target;
9289 if (odd)
9291 switch (vmode)
9293 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9294 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9295 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9296 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9297 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9298 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9299 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9300 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9301 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9302 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9303 default:
9304 return false;
9307 else
9309 switch (vmode)
9311 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9312 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9313 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9314 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9315 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9316 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9317 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9318 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9319 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9320 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9321 default:
9322 return false;
9326 emit_insn (gen (out, in0, in1));
9327 return true;
9330 /* Recognize patterns suitable for the ZIP instructions. */
9331 static bool
9332 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9334 unsigned int i, high, mask, nelt = d->nelt;
9335 rtx out, in0, in1, x;
9336 rtx (*gen) (rtx, rtx, rtx);
9337 machine_mode vmode = d->vmode;
9339 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9340 return false;
9342 /* Note that these are little-endian tests.
9343 We correct for big-endian later. */
9344 high = nelt / 2;
9345 if (d->perm[0] == high)
9346 /* Do Nothing. */
9348 else if (d->perm[0] == 0)
9349 high = 0;
9350 else
9351 return false;
9352 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9354 for (i = 0; i < nelt / 2; i++)
9356 unsigned elt = (i + high) & mask;
9357 if (d->perm[i * 2] != elt)
9358 return false;
9359 elt = (elt + nelt) & mask;
9360 if (d->perm[i * 2 + 1] != elt)
9361 return false;
9364 /* Success! */
9365 if (d->testing_p)
9366 return true;
9368 in0 = d->op0;
9369 in1 = d->op1;
9370 if (BYTES_BIG_ENDIAN)
9372 x = in0, in0 = in1, in1 = x;
9373 high = !high;
9375 out = d->target;
9377 if (high)
9379 switch (vmode)
9381 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9382 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9383 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9384 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9385 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9386 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9387 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9388 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9389 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9390 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9391 default:
9392 return false;
9395 else
9397 switch (vmode)
9399 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9400 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9401 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9402 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9403 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9404 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9405 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9406 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9407 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9408 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9409 default:
9410 return false;
9414 emit_insn (gen (out, in0, in1));
9415 return true;
9418 /* Recognize patterns for the EXT insn. */
9420 static bool
9421 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9423 unsigned int i, nelt = d->nelt;
9424 rtx (*gen) (rtx, rtx, rtx, rtx);
9425 rtx offset;
9427 unsigned int location = d->perm[0]; /* Always < nelt. */
9429 /* Check if the extracted indices are increasing by one. */
9430 for (i = 1; i < nelt; i++)
9432 unsigned int required = location + i;
9433 if (d->one_vector_p)
9435 /* We'll pass the same vector in twice, so allow indices to wrap. */
9436 required &= (nelt - 1);
9438 if (d->perm[i] != required)
9439 return false;
9442 switch (d->vmode)
9444 case V16QImode: gen = gen_aarch64_extv16qi; break;
9445 case V8QImode: gen = gen_aarch64_extv8qi; break;
9446 case V4HImode: gen = gen_aarch64_extv4hi; break;
9447 case V8HImode: gen = gen_aarch64_extv8hi; break;
9448 case V2SImode: gen = gen_aarch64_extv2si; break;
9449 case V4SImode: gen = gen_aarch64_extv4si; break;
9450 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9451 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9452 case V2DImode: gen = gen_aarch64_extv2di; break;
9453 case V2DFmode: gen = gen_aarch64_extv2df; break;
9454 default:
9455 return false;
9458 /* Success! */
9459 if (d->testing_p)
9460 return true;
9462 /* The case where (location == 0) is a no-op for both big- and little-endian,
9463 and is removed by the mid-end at optimization levels -O1 and higher. */
9465 if (BYTES_BIG_ENDIAN && (location != 0))
9467 /* After setup, we want the high elements of the first vector (stored
9468 at the LSB end of the register), and the low elements of the second
9469 vector (stored at the MSB end of the register). So swap. */
9470 rtx temp = d->op0;
9471 d->op0 = d->op1;
9472 d->op1 = temp;
9473 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9474 location = nelt - location;
9477 offset = GEN_INT (location);
9478 emit_insn (gen (d->target, d->op0, d->op1, offset));
9479 return true;
9482 /* Recognize patterns for the REV insns. */
9484 static bool
9485 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9487 unsigned int i, j, diff, nelt = d->nelt;
9488 rtx (*gen) (rtx, rtx);
9490 if (!d->one_vector_p)
9491 return false;
9493 diff = d->perm[0];
9494 switch (diff)
9496 case 7:
9497 switch (d->vmode)
9499 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9500 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9501 default:
9502 return false;
9504 break;
9505 case 3:
9506 switch (d->vmode)
9508 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9509 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9510 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9511 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9512 default:
9513 return false;
9515 break;
9516 case 1:
9517 switch (d->vmode)
9519 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9520 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9521 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9522 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9523 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9524 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9525 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9526 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9527 default:
9528 return false;
9530 break;
9531 default:
9532 return false;
9535 for (i = 0; i < nelt ; i += diff + 1)
9536 for (j = 0; j <= diff; j += 1)
9538 /* This is guaranteed to be true as the value of diff
9539 is 7, 3 or 1 and we should have enough elements in the
9540 queue to generate this. Getting a vector mask with a
9541 value of diff other than these values implies that
9542 something is wrong by the time we get here. */
9543 gcc_assert (i + j < nelt);
9544 if (d->perm[i + j] != i + diff - j)
9545 return false;
9548 /* Success! */
9549 if (d->testing_p)
9550 return true;
9552 emit_insn (gen (d->target, d->op0));
9553 return true;
9556 static bool
9557 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9559 rtx (*gen) (rtx, rtx, rtx);
9560 rtx out = d->target;
9561 rtx in0;
9562 machine_mode vmode = d->vmode;
9563 unsigned int i, elt, nelt = d->nelt;
9564 rtx lane;
9566 elt = d->perm[0];
9567 for (i = 1; i < nelt; i++)
9569 if (elt != d->perm[i])
9570 return false;
9573 /* The generic preparation in aarch64_expand_vec_perm_const_1
9574 swaps the operand order and the permute indices if it finds
9575 d->perm[0] to be in the second operand. Thus, we can always
9576 use d->op0 and need not do any extra arithmetic to get the
9577 correct lane number. */
9578 in0 = d->op0;
9579 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9581 switch (vmode)
9583 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9584 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9585 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9586 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9587 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9588 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9589 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9590 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9591 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9592 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9593 default:
9594 return false;
9597 emit_insn (gen (out, in0, lane));
9598 return true;
9601 static bool
9602 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9604 rtx rperm[MAX_VECT_LEN], sel;
9605 machine_mode vmode = d->vmode;
9606 unsigned int i, nelt = d->nelt;
9608 if (d->testing_p)
9609 return true;
9611 /* Generic code will try constant permutation twice. Once with the
9612 original mode and again with the elements lowered to QImode.
9613 So wait and don't do the selector expansion ourselves. */
9614 if (vmode != V8QImode && vmode != V16QImode)
9615 return false;
9617 for (i = 0; i < nelt; ++i)
9619 int nunits = GET_MODE_NUNITS (vmode);
9621 /* If big-endian and two vectors we end up with a weird mixed-endian
9622 mode on NEON. Reverse the index within each word but not the word
9623 itself. */
9624 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9625 : d->perm[i]);
9627 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9628 sel = force_reg (vmode, sel);
9630 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9631 return true;
9634 static bool
9635 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9637 /* The pattern matching functions above are written to look for a small
9638 number to begin the sequence (0, 1, N/2). If we begin with an index
9639 from the second operand, we can swap the operands. */
9640 if (d->perm[0] >= d->nelt)
9642 unsigned i, nelt = d->nelt;
9643 rtx x;
9645 gcc_assert (nelt == (nelt & -nelt));
9646 for (i = 0; i < nelt; ++i)
9647 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9649 x = d->op0;
9650 d->op0 = d->op1;
9651 d->op1 = x;
9654 if (TARGET_SIMD)
9656 if (aarch64_evpc_rev (d))
9657 return true;
9658 else if (aarch64_evpc_ext (d))
9659 return true;
9660 else if (aarch64_evpc_dup (d))
9661 return true;
9662 else if (aarch64_evpc_zip (d))
9663 return true;
9664 else if (aarch64_evpc_uzp (d))
9665 return true;
9666 else if (aarch64_evpc_trn (d))
9667 return true;
9668 return aarch64_evpc_tbl (d);
9670 return false;
9673 /* Expand a vec_perm_const pattern. */
9675 bool
9676 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9678 struct expand_vec_perm_d d;
9679 int i, nelt, which;
9681 d.target = target;
9682 d.op0 = op0;
9683 d.op1 = op1;
9685 d.vmode = GET_MODE (target);
9686 gcc_assert (VECTOR_MODE_P (d.vmode));
9687 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9688 d.testing_p = false;
9690 for (i = which = 0; i < nelt; ++i)
9692 rtx e = XVECEXP (sel, 0, i);
9693 int ei = INTVAL (e) & (2 * nelt - 1);
9694 which |= (ei < nelt ? 1 : 2);
9695 d.perm[i] = ei;
9698 switch (which)
9700 default:
9701 gcc_unreachable ();
9703 case 3:
9704 d.one_vector_p = false;
9705 if (!rtx_equal_p (op0, op1))
9706 break;
9708 /* The elements of PERM do not suggest that only the first operand
9709 is used, but both operands are identical. Allow easier matching
9710 of the permutation by folding the permutation into the single
9711 input vector. */
9712 /* Fall Through. */
9713 case 2:
9714 for (i = 0; i < nelt; ++i)
9715 d.perm[i] &= nelt - 1;
9716 d.op0 = op1;
9717 d.one_vector_p = true;
9718 break;
9720 case 1:
9721 d.op1 = op0;
9722 d.one_vector_p = true;
9723 break;
9726 return aarch64_expand_vec_perm_const_1 (&d);
9729 static bool
9730 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
9731 const unsigned char *sel)
9733 struct expand_vec_perm_d d;
9734 unsigned int i, nelt, which;
9735 bool ret;
9737 d.vmode = vmode;
9738 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9739 d.testing_p = true;
9740 memcpy (d.perm, sel, nelt);
9742 /* Calculate whether all elements are in one vector. */
9743 for (i = which = 0; i < nelt; ++i)
9745 unsigned char e = d.perm[i];
9746 gcc_assert (e < 2 * nelt);
9747 which |= (e < nelt ? 1 : 2);
9750 /* If all elements are from the second vector, reindex as if from the
9751 first vector. */
9752 if (which == 2)
9753 for (i = 0; i < nelt; ++i)
9754 d.perm[i] -= nelt;
9756 /* Check whether the mask can be applied to a single vector. */
9757 d.one_vector_p = (which != 3);
9759 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9760 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9761 if (!d.one_vector_p)
9762 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9764 start_sequence ();
9765 ret = aarch64_expand_vec_perm_const_1 (&d);
9766 end_sequence ();
9768 return ret;
9771 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9772 bool
9773 aarch64_cannot_change_mode_class (machine_mode from,
9774 machine_mode to,
9775 enum reg_class rclass)
9777 /* Full-reg subregs are allowed on general regs or any class if they are
9778 the same size. */
9779 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9780 || !reg_classes_intersect_p (FP_REGS, rclass))
9781 return false;
9783 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9784 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9785 2. Scalar to Scalar for integer modes or same size float modes.
9786 3. Vector to Vector modes.
9787 4. On little-endian only, Vector-Structure to Vector modes. */
9788 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9790 if (aarch64_vector_mode_supported_p (from)
9791 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9792 return false;
9794 if (GET_MODE_NUNITS (from) == 1
9795 && GET_MODE_NUNITS (to) == 1
9796 && (GET_MODE_CLASS (from) == MODE_INT
9797 || from == to))
9798 return false;
9800 if (aarch64_vector_mode_supported_p (from)
9801 && aarch64_vector_mode_supported_p (to))
9802 return false;
9804 /* Within a vector structure straddling multiple vector registers
9805 we are in a mixed-endian representation. As such, we can't
9806 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9807 switch between vectors and vector structures cheaply. */
9808 if (!BYTES_BIG_ENDIAN)
9809 if ((aarch64_vector_mode_supported_p (from)
9810 && aarch64_vect_struct_mode_p (to))
9811 || (aarch64_vector_mode_supported_p (to)
9812 && aarch64_vect_struct_mode_p (from)))
9813 return false;
9816 return true;
9819 /* Implement MODES_TIEABLE_P. */
9821 bool
9822 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
9824 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9825 return true;
9827 /* We specifically want to allow elements of "structure" modes to
9828 be tieable to the structure. This more general condition allows
9829 other rarer situations too. */
9830 if (TARGET_SIMD
9831 && aarch64_vector_mode_p (mode1)
9832 && aarch64_vector_mode_p (mode2))
9833 return true;
9835 return false;
9838 /* Return a new RTX holding the result of moving POINTER forward by
9839 AMOUNT bytes. */
9841 static rtx
9842 aarch64_move_pointer (rtx pointer, int amount)
9844 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9846 return adjust_automodify_address (pointer, GET_MODE (pointer),
9847 next, amount);
9850 /* Return a new RTX holding the result of moving POINTER forward by the
9851 size of the mode it points to. */
9853 static rtx
9854 aarch64_progress_pointer (rtx pointer)
9856 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9858 return aarch64_move_pointer (pointer, amount);
9861 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
9862 MODE bytes. */
9864 static void
9865 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9866 machine_mode mode)
9868 rtx reg = gen_reg_rtx (mode);
9870 /* "Cast" the pointers to the correct mode. */
9871 *src = adjust_address (*src, mode, 0);
9872 *dst = adjust_address (*dst, mode, 0);
9873 /* Emit the memcpy. */
9874 emit_move_insn (reg, *src);
9875 emit_move_insn (*dst, reg);
9876 /* Move the pointers forward. */
9877 *src = aarch64_progress_pointer (*src);
9878 *dst = aarch64_progress_pointer (*dst);
9881 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9882 we succeed, otherwise return false. */
9884 bool
9885 aarch64_expand_movmem (rtx *operands)
9887 unsigned int n;
9888 rtx dst = operands[0];
9889 rtx src = operands[1];
9890 rtx base;
9891 bool speed_p = !optimize_function_for_size_p (cfun);
9893 /* When optimizing for size, give a better estimate of the length of a
9894 memcpy call, but use the default otherwise. */
9895 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9897 /* We can't do anything smart if the amount to copy is not constant. */
9898 if (!CONST_INT_P (operands[2]))
9899 return false;
9901 n = UINTVAL (operands[2]);
9903 /* Try to keep the number of instructions low. For cases below 16 bytes we
9904 need to make at most two moves. For cases above 16 bytes it will be one
9905 move for each 16 byte chunk, then at most two additional moves. */
9906 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9907 return false;
9909 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9910 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9912 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9913 src = adjust_automodify_address (src, VOIDmode, base, 0);
9915 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
9916 1-byte chunk. */
9917 if (n < 4)
9919 if (n >= 2)
9921 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9922 n -= 2;
9925 if (n == 1)
9926 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9928 return true;
9931 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
9932 4-byte chunk, partially overlapping with the previously copied chunk. */
9933 if (n < 8)
9935 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9936 n -= 4;
9937 if (n > 0)
9939 int move = n - 4;
9941 src = aarch64_move_pointer (src, move);
9942 dst = aarch64_move_pointer (dst, move);
9943 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9945 return true;
9948 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
9949 them, then (if applicable) an 8-byte chunk. */
9950 while (n >= 8)
9952 if (n / 16)
9954 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9955 n -= 16;
9957 else
9959 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9960 n -= 8;
9964 /* Finish the final bytes of the copy. We can always do this in one
9965 instruction. We either copy the exact amount we need, or partially
9966 overlap with the previous chunk we copied and copy 8 bytes. */
9967 if (n == 0)
9968 return true;
9969 else if (n == 1)
9970 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9971 else if (n == 2)
9972 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9973 else if (n == 4)
9974 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9975 else
9977 if (n == 3)
9979 src = aarch64_move_pointer (src, -1);
9980 dst = aarch64_move_pointer (dst, -1);
9981 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9983 else
9985 int move = n - 8;
9987 src = aarch64_move_pointer (src, move);
9988 dst = aarch64_move_pointer (dst, move);
9989 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9993 return true;
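/* Worked example: a constant 13-byte copy first issues one 8-byte (DImode)
   load/store pair for bytes 0-7, leaving n == 5; since 5 is not 0, 1, 2 or 4
   and not the special 3-byte case, both pointers are then moved back by 3
   bytes and a second 8-byte copy covers bytes 5-12, overlapping the first.
   The whole memcpy is thus two LDR/STR pairs with no residual byte loop.  */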
9996 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
9998 static unsigned HOST_WIDE_INT
9999 aarch64_asan_shadow_offset (void)
10001 return (HOST_WIDE_INT_1 << 36);
10004 static bool
10005 aarch64_use_by_pieces_infrastructure_p (unsigned int size,
10006 unsigned int align,
10007 enum by_pieces_operation op,
10008 bool speed_p)
10010 /* STORE_BY_PIECES can be used when copying a constant string, but
10011 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10012 For now we always reject STORE_BY_PIECES here and let the move_by_pieces code copy
10013 the string from read-only memory. */
10014 if (op == STORE_BY_PIECES)
10015 return false;
10017 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
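/* Concretely: for something like memcpy (dst, "0123456789abcdef", 16),
   rejecting STORE_BY_PIECES means each 64-bit chunk is copied as an
   LDR/STR pair from the string in .rodata (2 insns) rather than being
   rebuilt in a register with a MOV plus three MOVKs before the store
   (roughly 5 insns), matching the reasoning in the comment above.  */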
10020 #undef TARGET_ADDRESS_COST
10021 #define TARGET_ADDRESS_COST aarch64_address_cost
10023 /* This hook determines whether unnamed bitfields affect the alignment
10024 of the containing structure. The hook returns true if the structure
10025 should inherit the alignment requirements of an unnamed bitfield's
10026 type. */
10027 #undef TARGET_ALIGN_ANON_BITFIELD
10028 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
10030 #undef TARGET_ASM_ALIGNED_DI_OP
10031 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
10033 #undef TARGET_ASM_ALIGNED_HI_OP
10034 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
10036 #undef TARGET_ASM_ALIGNED_SI_OP
10037 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
10039 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
10040 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
10041 hook_bool_const_tree_hwi_hwi_const_tree_true
10043 #undef TARGET_ASM_FILE_START
10044 #define TARGET_ASM_FILE_START aarch64_start_file
10046 #undef TARGET_ASM_OUTPUT_MI_THUNK
10047 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
10049 #undef TARGET_ASM_SELECT_RTX_SECTION
10050 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
10052 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
10053 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
10055 #undef TARGET_BUILD_BUILTIN_VA_LIST
10056 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
10058 #undef TARGET_CALLEE_COPIES
10059 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
10061 #undef TARGET_CAN_ELIMINATE
10062 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
10064 #undef TARGET_CANNOT_FORCE_CONST_MEM
10065 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
10067 #undef TARGET_CONDITIONAL_REGISTER_USAGE
10068 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
10070 /* Only the least significant bit is used for initialization guard
10071 variables. */
10072 #undef TARGET_CXX_GUARD_MASK_BIT
10073 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
10075 #undef TARGET_C_MODE_FOR_SUFFIX
10076 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
10078 #ifdef TARGET_BIG_ENDIAN_DEFAULT
10079 #undef TARGET_DEFAULT_TARGET_FLAGS
10080 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
10081 #endif
10083 #undef TARGET_CLASS_MAX_NREGS
10084 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
10086 #undef TARGET_BUILTIN_DECL
10087 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
10089 #undef TARGET_EXPAND_BUILTIN
10090 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
10092 #undef TARGET_EXPAND_BUILTIN_VA_START
10093 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
10095 #undef TARGET_FOLD_BUILTIN
10096 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
10098 #undef TARGET_FUNCTION_ARG
10099 #define TARGET_FUNCTION_ARG aarch64_function_arg
10101 #undef TARGET_FUNCTION_ARG_ADVANCE
10102 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
10104 #undef TARGET_FUNCTION_ARG_BOUNDARY
10105 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
10107 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
10108 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
10110 #undef TARGET_FUNCTION_VALUE
10111 #define TARGET_FUNCTION_VALUE aarch64_function_value
10113 #undef TARGET_FUNCTION_VALUE_REGNO_P
10114 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
10116 #undef TARGET_FRAME_POINTER_REQUIRED
10117 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
10119 #undef TARGET_GIMPLE_FOLD_BUILTIN
10120 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
10122 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
10123 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
10125 #undef TARGET_INIT_BUILTINS
10126 #define TARGET_INIT_BUILTINS aarch64_init_builtins
10128 #undef TARGET_LEGITIMATE_ADDRESS_P
10129 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
10131 #undef TARGET_LEGITIMATE_CONSTANT_P
10132 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
10134 #undef TARGET_LIBGCC_CMP_RETURN_MODE
10135 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
10137 #undef TARGET_LRA_P
10138 #define TARGET_LRA_P aarch64_lra_p
10140 #undef TARGET_MANGLE_TYPE
10141 #define TARGET_MANGLE_TYPE aarch64_mangle_type
10143 #undef TARGET_MEMORY_MOVE_COST
10144 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
10146 #undef TARGET_MUST_PASS_IN_STACK
10147 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
10149 /* This target hook should return true if accesses to volatile bitfields
10150 should use the narrowest mode possible. It should return false if these
10151 accesses should use the bitfield container type. */
10152 #undef TARGET_NARROW_VOLATILE_BITFIELD
10153 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
10155 #undef TARGET_OPTION_OVERRIDE
10156 #define TARGET_OPTION_OVERRIDE aarch64_override_options
10158 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
10159 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
10160 aarch64_override_options_after_change
10162 #undef TARGET_PASS_BY_REFERENCE
10163 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
10165 #undef TARGET_PREFERRED_RELOAD_CLASS
10166 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
10168 #undef TARGET_SECONDARY_RELOAD
10169 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
10171 #undef TARGET_SHIFT_TRUNCATION_MASK
10172 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
10174 #undef TARGET_SETUP_INCOMING_VARARGS
10175 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
10177 #undef TARGET_STRUCT_VALUE_RTX
10178 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
10180 #undef TARGET_REGISTER_MOVE_COST
10181 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
10183 #undef TARGET_RETURN_IN_MEMORY
10184 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
10186 #undef TARGET_RETURN_IN_MSB
10187 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
10189 #undef TARGET_RTX_COSTS
10190 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
10192 #undef TARGET_SCHED_ISSUE_RATE
10193 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
10195 #undef TARGET_TRAMPOLINE_INIT
10196 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
10198 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
10199 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
10201 #undef TARGET_VECTOR_MODE_SUPPORTED_P
10202 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
10204 #undef TARGET_ARRAY_MODE_SUPPORTED_P
10205 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
10207 #undef TARGET_VECTORIZE_ADD_STMT_COST
10208 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
10210 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
10211 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
10212 aarch64_builtin_vectorization_cost
10214 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
10215 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
10217 #undef TARGET_VECTORIZE_BUILTINS
10218 #define TARGET_VECTORIZE_BUILTINS
10220 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
10221 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
10222 aarch64_builtin_vectorized_function
10224 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
10225 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
10226 aarch64_autovectorize_vector_sizes
10228 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
10229 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
10230 aarch64_atomic_assign_expand_fenv
10232 /* Section anchor support. */
10234 #undef TARGET_MIN_ANCHOR_OFFSET
10235 #define TARGET_MIN_ANCHOR_OFFSET -256
10237 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
10238 byte offset; we can do much more for larger data types, but have no way
10239 to determine the size of the access. We assume accesses are aligned. */
10240 #undef TARGET_MAX_ANCHOR_OFFSET
10241 #define TARGET_MAX_ANCHOR_OFFSET 4095
10243 #undef TARGET_VECTOR_ALIGNMENT
10244 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
10246 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
10247 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
10248 aarch64_simd_vector_alignment_reachable
10250 /* vec_perm support. */
10252 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
10253 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
10254 aarch64_vectorize_vec_perm_const_ok
10257 #undef TARGET_FIXED_CONDITION_CODE_REGS
10258 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
10260 #undef TARGET_FLAGS_REGNUM
10261 #define TARGET_FLAGS_REGNUM CC_REGNUM
10263 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
10264 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
10266 #undef TARGET_ASAN_SHADOW_OFFSET
10267 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
10269 #undef TARGET_LEGITIMIZE_ADDRESS
10270 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
10272 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
10273 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
10274 aarch64_use_by_pieces_infrastructure_p
10276 struct gcc_target targetm = TARGET_INITIALIZER;
10278 #include "gt-aarch64.h"