[AArch64] Properly handle mvn-register and add EON+shift pattern and cost appropriately
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as commandline arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
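/* For example, 0x00ff00ff00ff00ff (a run of eight set bits replicated in
   every 16-bit element) is a valid bitmask immediate and can be used
   directly by AND, ORR and EOR, whereas 0x0000000012345678 is not a
   rotated run of contiguous bits and has to be built with MOVZ/MOVK
   before it can take part in a logical operation.  */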
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
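/* Under the standard AArch64 DWARF numbering (r0-r30 = 0-30, sp = 31,
   v0-v31 = 64-95) this maps, for instance, x5 to 5, sp to 31 and v3 to
   AARCH64_DWARF_V0 + 3 = 67, while registers such as the condition flags,
   which have no DWARF equivalent, fall through to DWARF_FRAME_REGISTERS.  */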
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
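/* For instance, a TImode (16-byte) value needs two X registers but only
   one V register:
     aarch64_hard_regno_nregs (R0_REGNUM, TImode) == 2
     aarch64_hard_regno_nregs (V0_REGNUM, TImode) == 1
   since UNITS_PER_VREG is 16 while UNITS_PER_WORD is 8.  */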
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
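/* As a worked example, the RTL
     (zero_extract:DI (mult:DI (reg:DI) (const_int 4))
		      (const_int 18) (const_int 0))
   passes the checks above: 18 & ~7 == 16 is a power of two, 18 & 7 == 2,
   and the multiplier 4 == 1 << 2.  Extracting the low 18 bits of reg * 4
   is the same as zero-extending the low 16 bits of reg and shifting the
   result left by 2, i.e. the extended-register "uxth ... lsl #2" operand
   form, which is why such extracts are treated like extend operations.  */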
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We'll allow lo_sum's in addresses in our legitimate addresses
740 so that combine would take care of combining addresses where
741 necessary, but for generation purposes we'll generate the address
742 as follows:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. stored in memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. stored in memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
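/* For example, when copying a TImode value held in {x0, x1} into {x1, x2},
   dst_lo (x1) overlaps src_hi (x1), so the high halves are moved first
   (x2 <- x1) and only then the low halves (x1 <- x0); moving in the other
   order would clobber the high half of the source before it is read.  */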
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
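/* For example, an offset of 0x12345 is neither a 12-bit immediate nor a
   12-bit immediate shifted left by 12, so it is first loaded into TEMP
   and added with a register-register add, whereas an offset such as
   0x12000 is folded directly into the returned PLUS.  */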
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
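/* As an illustration, for DImode the constant 0x1234567800000000 has two
   zero 16-bit quarters, so it takes the simple_sequence path and is
   synthesized in two instructions, in effect:
     movz	x0, #0x5678, lsl #32
     movk	x0, #0x1234, lsl #48
   Calling this function with GENERATE false just returns that count (2),
   which is how callers such as the rtx cost code can size an immediate
   sequence without emitting it.  */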
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert(can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always returned by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating-point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
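/* Under these rules a 24-byte structure of integers is passed by
   reference (it is larger than two registers and is not an HFA), while
   struct { double x, y, z; }, although also 24 bytes, is an HFA and is
   therefore passed by value in SIMD/FP registers when enough of them
   are available.  */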
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
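/* For example, struct { float x, y; } is an HFA, so its value comes back
   as a PARALLEL of two SFmode pieces in s0 and s1 (byte offsets 0 and 4),
   whereas a 16-byte structure that is not an HFA is returned in the
   general registers x0 and x1.  */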
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C6 - C9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers. */
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
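/* As an example of rule C.8: for f (int a, __int128 b), a is assigned w0,
   leaving NGRN == 1; b needs two registers and has 16-byte alignment, so
   NGRN is first rounded up to 2 and b is passed in the even/odd pair
   x2/x3 rather than straddling x1/x2.  */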
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
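/* Concretely, on a big-endian target a char passed on the stack is padded
   downward and so occupies the highest-addressed byte of its 8-byte slot,
   whereas a 3-byte structure is padded upward and starts at the lowest
   byte address of the slot.  On little-endian targets everything is
   padded upward.  */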
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 the block is assumed to be in memory, padding upward means that
1978 the last element is padded after its most significant byte, while
1979 with downward padding the last element is padded at its least
1980 significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
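/* Worked example (illustrative only; the concrete numbers below are
   assumptions, not taken from any particular function).  For a function
   with no varargs save area, 24 bytes of locals, 16 bytes of outgoing
   arguments, a frame pointer, and x19/x20 live across calls, the code
   above lays the frame out as:

     reg_offset[x29] = 0,  reg_offset[x30] = 8      (frame record)
     reg_offset[x19] = 16, reg_offset[x20] = 24
     saved_regs_size = 32                (16-byte aligned, padding0 = 0)
     hard_fp_offset  = ROUND_UP (0 + 24 + 32, 16) = 64
     frame_size      = ROUND_UP (64 + 16, 16)      = 80.  */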
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have a range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* when offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
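/* Illustrative sketch of the initial stack adjustment for a large frame
   (exact register choice and scheduling may differ).  Once the
   callee-save area and outgoing-argument space have been peeled off, a
   remaining adjustment of, say, 0x12340 is split into hi_ofs = 0x12000
   and lo_ofs = 0x340:

     sub  sp, sp, #0x12000
     sub  sp, sp, #0x340

   while a remainder of 0x1000000 or more is first built (negated) in
   IP0 and added to SP:

     mov  x16, #-<adjustment>        // possibly a movn/movk sequence
     add  sp, sp, x16  */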
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee saved stack is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass uses
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have a range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
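/* Illustrative epilogue for a small frame with a frame record and
   x19/x20 saved (instruction selection may differ):

     ldp  x19, x20, [sp, #16]
     ldp  x29, x30, [sp], #<offset>   // writeback pops the frame
     ret

   The REG_CFA_RESTORE notes collected in cfi_ops are attached to the
   deallocating instruction so the unwind info tracks the CFA
   correctly.  */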
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2730 rtx
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the smaller
2817 number of instructions, preferring MOVZ instructions when the counts
2818 are equal. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
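/* Worked example (illustrative): for val = 0xffffffff12345678 the three
   16-bit chunks above bit 15 are 0x1234, 0xffff and 0xffff, so
   zcount == 3 (MOVKs needed after a MOVZ) and ncount == 1 (MOVKs needed
   after a MOVN).  Since ncount < zcount the MOVN route is chosen,
   giving something like:

     movn x0, #0xa987                 // x0 = 0xffffffffffff5678
     movk x0, #0x1234, lsl #16        // patch the one non-0xffff chunk

   i.e. two instructions instead of four.  */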
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
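/* Worked example (illustrative; the destination and scratch registers
   depend on the caller): for delta = 5000 the code above splits the
   addition into a shifted-register part and an immediate part, roughly:

     mov  <scratch>, #1               // 5000 / 4096
     add  x0, x0, <scratch>, lsl #12  // adds 4096
     add  x0, x0, #904                // adds 5000 % 4096

   while |delta| >= 4096 * 4096 falls back to building the full constant
   with aarch64_build_constant and a single register add.  */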
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* set s consecutive bits to 1 (s < 64) */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* rotate right by r */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* replicate the constant depending on SIMD size */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
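/* Worked example (illustrative) of one table entry: with element size
   e = 8, s = 2 set bits and rotation r = 1, the loop above forms
   0b00000011, rotates it to 0b10000001 (0x81) and then replicates it
   across all eight bytes, yielding the bitmask immediate
   0x8181818181818181.  Entries such as 0x0000ffff0000ffff (e = 32,
   s = 16, r = 0) are produced the same way.  */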
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
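/* For instance (illustrative), 0xabc and 0xabc000 both satisfy the test
   above and map onto "add x0, x1, #0xabc" and
   "add x0, x1, #0xabc, lsl #12" respectively, whereas 0xabc0 straddles
   the two 12-bit windows and is rejected.  */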
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
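/* E.g. (illustrative) 0x12340000 satisfies the SImode test above and is
   loaded with a single "movz w0, #0x1234, lsl #16", whereas 0x12340001
   fails both 16-bit window tests and has to be built with more than one
   instruction.  */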
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave it for
3130 aarch64_expand_mov_immediate to handle properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
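/* The patterns above correspond to assembler index forms such as
   (illustrative):

     ldr x0, [x1, x2]               // REG_REG, shift 0
     ldr x0, [x1, x2, lsl #3]       // REG_REG, shift matching DImode size
     ldr w0, [x1, w2, sxtw #2]      // REG_SXTW, sign-extended 32-bit index
     ldr w0, [x1, w2, uxtw #2]      // REG_UXTW, zero-extended 32-bit index

   where a non-zero shift is only accepted when 1 << shift equals the
   access size of MODE.  */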
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
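/* For a DImode (8-byte) access these three predicates accept, in order
   (illustrative summary):
     7-bit signed scaled:     multiples of 8 in [-512, 504]   (ldp/stp)
     9-bit signed unscaled:   any offset in [-256, 255]       (ldur/stur)
     12-bit unsigned scaled:  multiples of 8 in [0, 32760]    (ldr/str).  */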
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in both modes. */
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit an ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in both modes. */
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* load literal: pc-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
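/* Example (illustrative): for the RTL comparison
   (compare (ashift:SI x 2) y) the function returns CC_SWPmode, because
   AArch64 can only apply a shift to the second source operand of a
   compare.  The insn is emitted with the operands swapped, roughly
   "cmp w1, w0, lsl #2" with x in w0 and y in w1, and the condition that
   is finally tested is the swapped one selected for CC_SWPmode in
   aarch64_get_condition_code_1 below.  */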
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3762 int
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;
3960 return count;
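/* bit_count uses the classic "clear the lowest set bit" step
   (value &= value - 1), so e.g. 0b101100 -> 0b101000 -> 0b100000 -> 0
   takes three iterations and the function returns 3.  It backs the '%P'
   output modifier below.  */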
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first code is for AND op and the other
3970 is for IOR op. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
3991 int
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand. If it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return 1;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return 1;
4565 return 0;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256...255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
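/* Editor's illustrative sketch, not part of GCC (names are hypothetical):
   the anchor-splitting arithmetic used above for the common aligned case.
   The offset is split into a 4k-aligned base, which nearby accesses can
   share and CSE, and a small residual that fits the load/store immediate
   field.  */

static void
aarch64_example_split_aligned_offset (long long offset,
                                      long long *base_offset,
                                      long long *residual)
{
  /* Same mask as the aligned case in aarch64_legitimize_address.  */
  *base_offset = offset & ~0xfffLL;
  *residual = offset - *base_offset;
  /* For example, offset 0x12348 gives base_offset 0x12000 and residual
     0x348, so the address is formed as (base + 0x12000) + 0x348.  */
}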
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with a 12-bit offset field.  */
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4691 BLKmode alignment. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
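/* Editor's illustrative sketch, not part of GCC (names are hypothetical),
   of the high/low split performed above.  The low 12 bits stay in the
   memory access and the rest is reloaded into the base register, so a
   DImode access at base + 0x12348 can typically become an ADD of the
   shifted 12-bit immediate 0x12000 followed by an LDR with offset 0x348.  */

static void
aarch64_example_split_reload_offset (long long val,
                                     long long *high, long long *low)
{
  *low = val & 0xfff;
  *high = val - *low;
  /* e.g. val 0x12348 -> high 0x12000 (representable as a shifted 12-bit
     immediate), low 0x348 (a multiple of 8, so it fits the scaled DImode
     LDR/STR offset field).  Misaligned offsets are further adjusted by
     the align-up / align-down logic in the function above.  */
}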
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words; we fill those in below.  */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS.  We cannot support such operations, which
4942 use SP as source and an FP_REG as destination, so reject them
4943 right now.  */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below.  */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
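/* Editor's illustrative sketch, not part of GCC (names are hypothetical):
   the masks recognised above are simply the 8/16/32-bit all-ones patterns
   shifted left by the address-scaling amount.  */

static long long
aarch64_example_uxt_mask (int shift, int size)
{
  /* The SIZE-bit all-ones pattern, moved up by SHIFT.  */
  return (((long long) 1 << size) - 1) << shift;
}

/* For example, aarch64_example_uxt_mask (2, 8) == 0x3fc, which
   aarch64_uxt_size maps back to 8: a UXTB of an operand scaled by 4.
   A shift greater than 3 always yields 0 from aarch64_uxt_size, since
   the extended-register forms only allow scaling by up to 8.  */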
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Return true iff CODE is a shift supported in combination
5162 with arithmetic instructions. */
5164 static bool
5165 aarch64_shift_p (enum rtx_code code)
5167 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5170 /* Helper function for rtx cost calculation. Calculate the cost of
5171 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5172 Return the calculated cost of the expression, recursing manually into
5173 operands where needed. */
5175 static int
5176 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5178 rtx op0, op1;
5179 const struct cpu_cost_table *extra_cost
5180 = aarch64_tune_params->insn_extra_cost;
5181 int cost = 0;
5182 bool compound_p = (outer == PLUS || outer == MINUS);
5183 machine_mode mode = GET_MODE (x);
5185 gcc_checking_assert (code == MULT);
5187 op0 = XEXP (x, 0);
5188 op1 = XEXP (x, 1);
5190 if (VECTOR_MODE_P (mode))
5191 mode = GET_MODE_INNER (mode);
5193 /* Integer multiply/fma. */
5194 if (GET_MODE_CLASS (mode) == MODE_INT)
5196 /* The multiply will be canonicalized as a shift, cost it as such. */
5197 if (aarch64_shift_p (GET_CODE (x))
5198 || (CONST_INT_P (op1)
5199 && exact_log2 (INTVAL (op1)) > 0))
5201 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5202 || GET_CODE (op0) == SIGN_EXTEND;
5203 if (speed)
5205 if (compound_p)
5207 if (REG_P (op1))
5208 /* ARITH + shift-by-register. */
5209 cost += extra_cost->alu.arith_shift_reg;
5210 else if (is_extend)
5211 /* ARITH + extended register. We don't have a cost field
5212 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5213 cost += extra_cost->alu.extend_arith;
5214 else
5215 /* ARITH + shift-by-immediate. */
5216 cost += extra_cost->alu.arith_shift;
5218 else
5219 /* LSL (immediate). */
5220 cost += extra_cost->alu.shift;
5223 /* Strip extends as we will have costed them in the case above. */
5224 if (is_extend)
5225 op0 = aarch64_strip_extend (op0);
5227 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5229 return cost;
5232 /* MNEG or [US]MNEGL.  Extract the NEG operand, mark the operation as
5233 compound, and let the cases below handle it.  After all, MNEG is a
5234 special-case alias of MSUB. */
5235 if (GET_CODE (op0) == NEG)
5237 op0 = XEXP (op0, 0);
5238 compound_p = true;
5241 /* Integer multiplies or FMAs have zero/sign extending variants. */
5242 if ((GET_CODE (op0) == ZERO_EXTEND
5243 && GET_CODE (op1) == ZERO_EXTEND)
5244 || (GET_CODE (op0) == SIGN_EXTEND
5245 && GET_CODE (op1) == SIGN_EXTEND))
5247 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5248 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5250 if (speed)
5252 if (compound_p)
5253 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5254 cost += extra_cost->mult[0].extend_add;
5255 else
5256 /* MUL/SMULL/UMULL. */
5257 cost += extra_cost->mult[0].extend;
5260 return cost;
5263 /* This is either an integer multiply or a MADD. In both cases
5264 we want to recurse and cost the operands. */
5265 cost += rtx_cost (op0, MULT, 0, speed)
5266 + rtx_cost (op1, MULT, 1, speed);
5268 if (speed)
5270 if (compound_p)
5271 /* MADD/MSUB. */
5272 cost += extra_cost->mult[mode == DImode].add;
5273 else
5274 /* MUL. */
5275 cost += extra_cost->mult[mode == DImode].simple;
5278 return cost;
5280 else
5282 if (speed)
5284 /* Floating-point FMA/FMUL can also support negations of the
5285 operands. */
5286 if (GET_CODE (op0) == NEG)
5287 op0 = XEXP (op0, 0);
5288 if (GET_CODE (op1) == NEG)
5289 op1 = XEXP (op1, 0);
5291 if (compound_p)
5292 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5293 cost += extra_cost->fp[mode == DFmode].fma;
5294 else
5295 /* FMUL/FNMUL. */
5296 cost += extra_cost->fp[mode == DFmode].mult;
5299 cost += rtx_cost (op0, MULT, 0, speed)
5300 + rtx_cost (op1, MULT, 1, speed);
5301 return cost;
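/* Editor's illustrative sketch, not part of GCC (the name is hypothetical):
   why a multiply by a power of two is costed as a shift above.  The two
   expressions below are equivalent, the canonical RTL uses the shift form,
   and exact_log2 (8) == 3 is the test used to recognise it, so only a
   single LSL-class cost is charged.  */

static unsigned long long
aarch64_example_mult_as_shift (unsigned long long x)
{
  return x << 3;	/* The same value as x * 8.  */
}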
5305 static int
5306 aarch64_address_cost (rtx x,
5307 machine_mode mode,
5308 addr_space_t as ATTRIBUTE_UNUSED,
5309 bool speed)
5311 enum rtx_code c = GET_CODE (x);
5312 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5313 struct aarch64_address_info info;
5314 int cost = 0;
5315 info.shift = 0;
5317 if (!aarch64_classify_address (&info, x, mode, c, false))
5319 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5321 /* This is a CONST or SYMBOL ref which will be split
5322 in a different way depending on the code model in use.
5323 Cost it through the generic infrastructure. */
5324 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5325 /* Divide through by the cost of one instruction to
5326 bring it to the same units as the address costs. */
5327 cost_symbol_ref /= COSTS_N_INSNS (1);
5328 /* The cost is then the cost of preparing the address,
5329 followed by an immediate (possibly 0) offset. */
5330 return cost_symbol_ref + addr_cost->imm_offset;
5332 else
5334 /* This is most likely a jump table from a case
5335 statement. */
5336 return addr_cost->register_offset;
5340 switch (info.type)
5342 case ADDRESS_LO_SUM:
5343 case ADDRESS_SYMBOLIC:
5344 case ADDRESS_REG_IMM:
5345 cost += addr_cost->imm_offset;
5346 break;
5348 case ADDRESS_REG_WB:
5349 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5350 cost += addr_cost->pre_modify;
5351 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5352 cost += addr_cost->post_modify;
5353 else
5354 gcc_unreachable ();
5356 break;
5358 case ADDRESS_REG_REG:
5359 cost += addr_cost->register_offset;
5360 break;
5362 case ADDRESS_REG_UXTW:
5363 case ADDRESS_REG_SXTW:
5364 cost += addr_cost->register_extend;
5365 break;
5367 default:
5368 gcc_unreachable ();
5372 if (info.shift > 0)
5374 /* For the sake of calculating the cost of the shifted register
5375 component, we can treat same sized modes in the same way. */
5376 switch (GET_MODE_BITSIZE (mode))
5378 case 16:
5379 cost += addr_cost->addr_scale_costs.hi;
5380 break;
5382 case 32:
5383 cost += addr_cost->addr_scale_costs.si;
5384 break;
5386 case 64:
5387 cost += addr_cost->addr_scale_costs.di;
5388 break;
5390 /* We can't tell, or this is a 128-bit vector. */
5391 default:
5392 cost += addr_cost->addr_scale_costs.ti;
5393 break;
5397 return cost;
5400 /* Return true if the RTX X in mode MODE is a zero or sign extract
5401 usable in an ADD or SUB (extended register) instruction. */
5402 static bool
5403 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5405 /* Catch add with a sign extract.
5406 This is add_<optab><mode>_multp2. */
5407 if (GET_CODE (x) == SIGN_EXTRACT
5408 || GET_CODE (x) == ZERO_EXTRACT)
5410 rtx op0 = XEXP (x, 0);
5411 rtx op1 = XEXP (x, 1);
5412 rtx op2 = XEXP (x, 2);
5414 if (GET_CODE (op0) == MULT
5415 && CONST_INT_P (op1)
5416 && op2 == const0_rtx
5417 && CONST_INT_P (XEXP (op0, 1))
5418 && aarch64_is_extend_from_extract (mode,
5419 XEXP (op0, 1),
5420 op1))
5422 return true;
5426 return false;
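/* Editor's illustrative sketch, not part of GCC (names are hypothetical):
   the shape recognised above typically comes from scaled-index address
   arithmetic such as the following, which AArch64 compilers can usually
   fold into a single ADD (extended register), e.g.
   "add x0, x0, w1, sxtw #2", depending on optimization level.  */

static long
aarch64_example_add_extended (long base, int idx)
{
  /* Sign-extend IDX to 64 bits and scale it by 4 before the add.  */
  return base + (long) idx * 4;
}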
5429 static bool
5430 aarch64_frint_unspec_p (unsigned int u)
5432 switch (u)
5434 case UNSPEC_FRINTZ:
5435 case UNSPEC_FRINTP:
5436 case UNSPEC_FRINTM:
5437 case UNSPEC_FRINTA:
5438 case UNSPEC_FRINTN:
5439 case UNSPEC_FRINTX:
5440 case UNSPEC_FRINTI:
5441 return true;
5443 default:
5444 return false;
5448 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5449 storing it in *COST. Result is true if the total cost of the operation
5450 has now been calculated. */
5451 static bool
5452 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5454 rtx inner;
5455 rtx comparator;
5456 enum rtx_code cmpcode;
5458 if (COMPARISON_P (op0))
5460 inner = XEXP (op0, 0);
5461 comparator = XEXP (op0, 1);
5462 cmpcode = GET_CODE (op0);
5464 else
5466 inner = op0;
5467 comparator = const0_rtx;
5468 cmpcode = NE;
5471 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5473 /* Conditional branch. */
5474 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5475 return true;
5476 else
5478 if (cmpcode == NE || cmpcode == EQ)
5480 if (comparator == const0_rtx)
5482 /* TBZ/TBNZ/CBZ/CBNZ. */
5483 if (GET_CODE (inner) == ZERO_EXTRACT)
5484 /* TBZ/TBNZ. */
5485 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5486 0, speed);
5487 else
5488 /* CBZ/CBNZ. */
5489 *cost += rtx_cost (inner, cmpcode, 0, speed);
5491 return true;
5494 else if (cmpcode == LT || cmpcode == GE)
5496 /* TBZ/TBNZ. */
5497 if (comparator == const0_rtx)
5498 return true;
5502 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5504 /* It's a conditional operation based on the status flags,
5505 so it must be some flavor of CSEL. */
5507 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5508 if (GET_CODE (op1) == NEG
5509 || GET_CODE (op1) == NOT
5510 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5511 op1 = XEXP (op1, 0);
5513 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5514 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5515 return true;
5518 /* We don't know what this is, cost all operands. */
5519 return false;
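/* Editor's illustrative sketch, not part of GCC (the name is hypothetical):
   the CSINC case treated as free above typically arises from source like
   the following, where the "+ 1" arm is folded into the conditional
   select rather than needing a separate add.  */

static long
aarch64_example_csinc (long a, long b, int c)
{
  return c ? a : b + 1;	/* Usually a compare plus a single CSINC.  */
}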
5522 /* Calculate the cost of calculating X, storing it in *COST. Result
5523 is true if the total cost of the operation has now been calculated. */
5524 static bool
5525 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5526 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5528 rtx op0, op1, op2;
5529 const struct cpu_cost_table *extra_cost
5530 = aarch64_tune_params->insn_extra_cost;
5531 machine_mode mode = GET_MODE (x);
5533 /* By default, assume that everything has equivalent cost to the
5534 cheapest instruction. Any additional costs are applied as a delta
5535 above this default. */
5536 *cost = COSTS_N_INSNS (1);
5538 /* TODO: The cost infrastructure currently does not handle
5539 vector operations. Assume that all vector operations
5540 are equally expensive. */
5541 if (VECTOR_MODE_P (mode))
5543 if (speed)
5544 *cost += extra_cost->vect.alu;
5545 return true;
5548 switch (code)
5550 case SET:
5551 /* The cost depends entirely on the operands to SET. */
5552 *cost = 0;
5553 op0 = SET_DEST (x);
5554 op1 = SET_SRC (x);
5556 switch (GET_CODE (op0))
5558 case MEM:
5559 if (speed)
5561 rtx address = XEXP (op0, 0);
5562 if (GET_MODE_CLASS (mode) == MODE_INT)
5563 *cost += extra_cost->ldst.store;
5564 else if (mode == SFmode)
5565 *cost += extra_cost->ldst.storef;
5566 else if (mode == DFmode)
5567 *cost += extra_cost->ldst.stored;
5569 *cost +=
5570 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5571 0, speed));
5574 *cost += rtx_cost (op1, SET, 1, speed);
5575 return true;
5577 case SUBREG:
5578 if (! REG_P (SUBREG_REG (op0)))
5579 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5581 /* Fall through. */
5582 case REG:
5583 /* const0_rtx is in general free, but we will use an
5584 instruction to set a register to 0. */
5585 if (REG_P (op1) || op1 == const0_rtx)
5587 /* The cost is 1 per register copied. */
5588 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5589 / UNITS_PER_WORD;
5590 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5592 else
5593 /* Cost is just the cost of the RHS of the set. */
5594 *cost += rtx_cost (op1, SET, 1, speed);
5595 return true;
5597 case ZERO_EXTRACT:
5598 case SIGN_EXTRACT:
5599 /* Bit-field insertion. Strip any redundant widening of
5600 the RHS to meet the width of the target. */
5601 if (GET_CODE (op1) == SUBREG)
5602 op1 = SUBREG_REG (op1);
5603 if ((GET_CODE (op1) == ZERO_EXTEND
5604 || GET_CODE (op1) == SIGN_EXTEND)
5605 && CONST_INT_P (XEXP (op0, 1))
5606 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5607 >= INTVAL (XEXP (op0, 1))))
5608 op1 = XEXP (op1, 0);
5610 if (CONST_INT_P (op1))
5612 /* MOV immediate is assumed to always be cheap. */
5613 *cost = COSTS_N_INSNS (1);
5615 else
5617 /* BFM. */
5618 if (speed)
5619 *cost += extra_cost->alu.bfi;
5620 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5623 return true;
5625 default:
5626 /* We can't make sense of this, assume default cost. */
5627 *cost = COSTS_N_INSNS (1);
5628 return false;
5630 return false;
5632 case CONST_INT:
5633 /* If an instruction can incorporate a constant within the
5634 instruction, the instruction's expression avoids calling
5635 rtx_cost() on the constant. If rtx_cost() is called on a
5636 constant, then it is usually because the constant must be
5637 moved into a register by one or more instructions.
5639 The exception is constant 0, which can be expressed
5640 as XZR/WZR and is therefore free. The exception to this is
5641 if we have (set (reg) (const0_rtx)) in which case we must cost
5642 the move. However, we can catch that when we cost the SET, so
5643 we don't need to consider that here. */
5644 if (x == const0_rtx)
5645 *cost = 0;
5646 else
5648 /* To an approximation, building any other constant is
5649 proportionally expensive to the number of instructions
5650 required to build that constant. This is true whether we
5651 are compiling for SPEED or otherwise. */
5652 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5653 (NULL_RTX, x, false, mode));
5655 return true;
5657 case CONST_DOUBLE:
5658 if (speed)
5660 /* mov[df,sf]_aarch64. */
5661 if (aarch64_float_const_representable_p (x))
5662 /* FMOV (scalar immediate). */
5663 *cost += extra_cost->fp[mode == DFmode].fpconst;
5664 else if (!aarch64_float_const_zero_rtx_p (x))
5666 /* This will be a load from memory. */
5667 if (mode == DFmode)
5668 *cost += extra_cost->ldst.loadd;
5669 else
5670 *cost += extra_cost->ldst.loadf;
5672 else
5673 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5674 or MOV v0.s[0], wzr - neither of which is modeled by the
5675 cost tables. Just use the default cost. */
5680 return true;
5682 case MEM:
5683 if (speed)
5685 /* For loads we want the base cost of a load, plus an
5686 approximation for the additional cost of the addressing
5687 mode. */
5688 rtx address = XEXP (x, 0);
5689 if (GET_MODE_CLASS (mode) == MODE_INT)
5690 *cost += extra_cost->ldst.load;
5691 else if (mode == SFmode)
5692 *cost += extra_cost->ldst.loadf;
5693 else if (mode == DFmode)
5694 *cost += extra_cost->ldst.loadd;
5696 *cost +=
5697 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5698 0, speed));
5701 return true;
5703 case NEG:
5704 op0 = XEXP (x, 0);
5706 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5708 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5709 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5711 /* CSETM. */
5712 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5713 return true;
5716 /* Cost this as SUB wzr, X. */
5717 op0 = CONST0_RTX (GET_MODE (x));
5718 op1 = XEXP (x, 0);
5719 goto cost_minus;
5722 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5724 /* Support (neg(fma...)) as a single instruction only if
5725 sign of zeros is unimportant. This matches the decision
5726 making in aarch64.md. */
5727 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5729 /* FNMADD. */
5730 *cost = rtx_cost (op0, NEG, 0, speed);
5731 return true;
5733 if (speed)
5734 /* FNEG. */
5735 *cost += extra_cost->fp[mode == DFmode].neg;
5736 return false;
5739 return false;
5741 case CLRSB:
5742 case CLZ:
5743 if (speed)
5744 *cost += extra_cost->alu.clz;
5746 return false;
5748 case COMPARE:
5749 op0 = XEXP (x, 0);
5750 op1 = XEXP (x, 1);
5752 if (op1 == const0_rtx
5753 && GET_CODE (op0) == AND)
5755 x = op0;
5756 goto cost_logic;
5759 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5761 /* TODO: A write to the CC flags possibly costs extra, this
5762 needs encoding in the cost tables. */
5764 /* CC_ZESWPmode supports zero extend for free. */
5765 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5766 op0 = XEXP (op0, 0);
5768 /* ANDS. */
5769 if (GET_CODE (op0) == AND)
5771 x = op0;
5772 goto cost_logic;
5775 if (GET_CODE (op0) == PLUS)
5777 /* ADDS (and CMN alias). */
5778 x = op0;
5779 goto cost_plus;
5782 if (GET_CODE (op0) == MINUS)
5784 /* SUBS. */
5785 x = op0;
5786 goto cost_minus;
5789 if (GET_CODE (op1) == NEG)
5791 /* CMN. */
5792 if (speed)
5793 *cost += extra_cost->alu.arith;
5795 *cost += rtx_cost (op0, COMPARE, 0, speed);
5796 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5797 return true;
5800 /* CMP.
5802 Compare can freely swap the order of operands, and
5803 canonicalization puts the more complex operation first.
5804 But the integer MINUS logic expects the shift/extend
5805 operation in op1. */
5806 if (! (REG_P (op0)
5807 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5809 op0 = XEXP (x, 1);
5810 op1 = XEXP (x, 0);
5812 goto cost_minus;
5815 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5817 /* FCMP. */
5818 if (speed)
5819 *cost += extra_cost->fp[mode == DFmode].compare;
5821 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5823 /* FCMP supports constant 0.0 for no extra cost. */
5824 return true;
5826 return false;
5829 return false;
5831 case MINUS:
5833 op0 = XEXP (x, 0);
5834 op1 = XEXP (x, 1);
5836 cost_minus:
5837 /* Detect valid immediates. */
5838 if ((GET_MODE_CLASS (mode) == MODE_INT
5839 || (GET_MODE_CLASS (mode) == MODE_CC
5840 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5841 && CONST_INT_P (op1)
5842 && aarch64_uimm12_shift (INTVAL (op1)))
5844 *cost += rtx_cost (op0, MINUS, 0, speed);
5846 if (speed)
5847 /* SUB(S) (immediate). */
5848 *cost += extra_cost->alu.arith;
5849 return true;
5853 /* Look for SUB (extended register). */
5854 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5856 if (speed)
5857 *cost += extra_cost->alu.extend_arith;
5859 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5860 (enum rtx_code) GET_CODE (op1),
5861 0, speed);
5862 return true;
5865 rtx new_op1 = aarch64_strip_extend (op1);
5867 /* Cost this as an FMA-alike operation. */
5868 if ((GET_CODE (new_op1) == MULT
5869 || aarch64_shift_p (GET_CODE (new_op1)))
5870 && code != COMPARE)
5872 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5873 (enum rtx_code) code,
5874 speed);
5875 *cost += rtx_cost (op0, MINUS, 0, speed);
5876 return true;
5879 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5881 if (speed)
5883 if (GET_MODE_CLASS (mode) == MODE_INT)
5884 /* SUB(S). */
5885 *cost += extra_cost->alu.arith;
5886 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5887 /* FSUB. */
5888 *cost += extra_cost->fp[mode == DFmode].addsub;
5890 return true;
5893 case PLUS:
5895 rtx new_op0;
5897 op0 = XEXP (x, 0);
5898 op1 = XEXP (x, 1);
5900 cost_plus:
5901 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5902 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5904 /* CSINC. */
5905 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5906 *cost += rtx_cost (op1, PLUS, 1, speed);
5907 return true;
5910 if (GET_MODE_CLASS (mode) == MODE_INT
5911 && CONST_INT_P (op1)
5912 && aarch64_uimm12_shift (INTVAL (op1)))
5914 *cost += rtx_cost (op0, PLUS, 0, speed);
5916 if (speed)
5917 /* ADD (immediate). */
5918 *cost += extra_cost->alu.arith;
5919 return true;
5922 /* Look for ADD (extended register). */
5923 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5925 if (speed)
5926 *cost += extra_cost->alu.extend_arith;
5928 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5929 (enum rtx_code) GET_CODE (op0),
5930 0, speed);
5931 return true;
5934 /* Strip any extend; leave shifts behind, as we will
5935 cost them through mult_cost. */
5936 new_op0 = aarch64_strip_extend (op0);
5938 if (GET_CODE (new_op0) == MULT
5939 || aarch64_shift_p (GET_CODE (new_op0)))
5941 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5942 speed);
5943 *cost += rtx_cost (op1, PLUS, 1, speed);
5944 return true;
5947 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5948 + rtx_cost (op1, PLUS, 1, speed));
5950 if (speed)
5952 if (GET_MODE_CLASS (mode) == MODE_INT)
5953 /* ADD. */
5954 *cost += extra_cost->alu.arith;
5955 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5956 /* FADD. */
5957 *cost += extra_cost->fp[mode == DFmode].addsub;
5959 return true;
5962 case BSWAP:
5963 *cost = COSTS_N_INSNS (1);
5965 if (speed)
5966 *cost += extra_cost->alu.rev;
5968 return false;
5970 case IOR:
5971 if (aarch_rev16_p (x))
5973 *cost = COSTS_N_INSNS (1);
5975 if (speed)
5976 *cost += extra_cost->alu.rev;
5978 return true;
5980 /* Fall through. */
5981 case XOR:
5982 case AND:
5983 cost_logic:
5984 op0 = XEXP (x, 0);
5985 op1 = XEXP (x, 1);
5987 if (code == AND
5988 && GET_CODE (op0) == MULT
5989 && CONST_INT_P (XEXP (op0, 1))
5990 && CONST_INT_P (op1)
5991 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5992 INTVAL (op1)) != 0)
5994 /* This is a UBFM/SBFM. */
5995 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5996 if (speed)
5997 *cost += extra_cost->alu.bfx;
5998 return true;
6001 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6003 /* We possibly get the immediate for free; this is not
6004 modelled. */
6005 if (CONST_INT_P (op1)
6006 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6008 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6010 if (speed)
6011 *cost += extra_cost->alu.logical;
6013 return true;
6015 else
6017 rtx new_op0 = op0;
6019 /* Handle ORN, EON, or BIC. */
6020 if (GET_CODE (op0) == NOT)
6021 op0 = XEXP (op0, 0);
6023 new_op0 = aarch64_strip_shift (op0);
6025 /* If we had a shift on op0 then this is a logical-shift-
6026 by-register/immediate operation. Otherwise, this is just
6027 a logical operation. */
6028 if (speed)
6030 if (new_op0 != op0)
6032 /* Shift by immediate. */
6033 if (CONST_INT_P (XEXP (op0, 1)))
6034 *cost += extra_cost->alu.log_shift;
6035 else
6036 *cost += extra_cost->alu.log_shift_reg;
6038 else
6039 *cost += extra_cost->alu.logical;
6042 /* In both cases we want to cost both operands. */
6043 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6044 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6046 return true;
6049 return false;
6051 case NOT:
6052 x = XEXP (x, 0);
6053 op0 = aarch64_strip_shift (x);
6055 /* MVN-shifted-reg. */
6056 if (op0 != x)
6058 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6060 if (speed)
6061 *cost += extra_cost->alu.log_shift;
6063 return true;
6065 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6066 Handle the second form here, taking care that 'a' above can
6067 be a shift. */
6068 else if (GET_CODE (op0) == XOR)
6070 rtx newop0 = XEXP (op0, 0);
6071 rtx newop1 = XEXP (op0, 1);
6072 rtx op0_stripped = aarch64_strip_shift (newop0);
6074 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6075 + rtx_cost (op0_stripped, XOR, 0, speed);
6077 if (speed)
6079 if (op0_stripped != newop0)
6080 *cost += extra_cost->alu.log_shift;
6081 else
6082 *cost += extra_cost->alu.logical;
6085 return true;
6087 /* MVN. */
6088 if (speed)
6089 *cost += extra_cost->alu.logical;
6091 return false;
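/* Editor's illustrative note, not part of GCC: the two EON forms costed
   in the NOT and XOR cases correspond to source such as

     unsigned long eon1 (unsigned long a, unsigned long b)
     { return ~(a ^ b); }		-- (not (xor a b))

     unsigned long eon2 (unsigned long a, unsigned long b)
     { return ~(a ^ (b << 3)); }	-- (not (xor a (ashift b 3)))

   With the EON+shift pattern this change adds, both can typically be
   emitted as a single EON, the second with a shifted-register operand
   (e.g. "eon x0, x0, x1, lsl 3"), so only one logical/log_shift cost
   is charged above.  */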
6093 case ZERO_EXTEND:
6095 op0 = XEXP (x, 0);
6096 /* If a value is written in SI mode, then zero extended to DI
6097 mode, the operation will in general be free as a write to
6098 a 'w' register implicitly zeroes the upper bits of an 'x'
6099 register. However, if this is
6101 (set (reg) (zero_extend (reg)))
6103 we must cost the explicit register move. */
6104 if (mode == DImode
6105 && GET_MODE (op0) == SImode
6106 && outer == SET)
6108 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6110 if (!op_cost && speed)
6111 /* MOV. */
6112 *cost += extra_cost->alu.extend;
6113 else
6114 /* Free, the cost is that of the SI mode operation. */
6115 *cost = op_cost;
6117 return true;
6119 else if (MEM_P (XEXP (x, 0)))
6121 /* All loads can zero extend to any size for free. */
6122 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6123 return true;
6126 /* UXTB/UXTH. */
6127 if (speed)
6128 *cost += extra_cost->alu.extend;
6130 return false;
6132 case SIGN_EXTEND:
6133 if (MEM_P (XEXP (x, 0)))
6135 /* LDRSH. */
6136 if (speed)
6138 rtx address = XEXP (XEXP (x, 0), 0);
6139 *cost += extra_cost->ldst.load_sign_extend;
6141 *cost +=
6142 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6143 0, speed));
6145 return true;
6148 if (speed)
6149 *cost += extra_cost->alu.extend;
6150 return false;
6152 case ASHIFT:
6153 op0 = XEXP (x, 0);
6154 op1 = XEXP (x, 1);
6156 if (CONST_INT_P (op1))
6158 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
6159 aliases. */
6160 if (speed)
6161 *cost += extra_cost->alu.shift;
6163 /* We can incorporate zero/sign extend for free. */
6164 if (GET_CODE (op0) == ZERO_EXTEND
6165 || GET_CODE (op0) == SIGN_EXTEND)
6166 op0 = XEXP (op0, 0);
6168 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6169 return true;
6171 else
6173 /* LSLV. */
6174 if (speed)
6175 *cost += extra_cost->alu.shift_reg;
6177 return false; /* All arguments need to be in registers. */
6180 case ROTATE:
6181 case ROTATERT:
6182 case LSHIFTRT:
6183 case ASHIFTRT:
6184 op0 = XEXP (x, 0);
6185 op1 = XEXP (x, 1);
6187 if (CONST_INT_P (op1))
6189 /* ASR (immediate) and friends. */
6190 if (speed)
6191 *cost += extra_cost->alu.shift;
6193 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6194 return true;
6196 else
6199 /* ASR (register) and friends. */
6200 if (speed)
6201 *cost += extra_cost->alu.shift_reg;
6203 return false; /* All arguments need to be in registers. */
6206 case SYMBOL_REF:
6208 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6210 /* LDR. */
6211 if (speed)
6212 *cost += extra_cost->ldst.load;
6214 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6215 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6217 /* ADRP, followed by ADD. */
6218 *cost += COSTS_N_INSNS (1);
6219 if (speed)
6220 *cost += 2 * extra_cost->alu.arith;
6222 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6223 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6225 /* ADR. */
6226 if (speed)
6227 *cost += extra_cost->alu.arith;
6230 if (flag_pic)
6232 /* One extra load instruction, after accessing the GOT. */
6233 *cost += COSTS_N_INSNS (1);
6234 if (speed)
6235 *cost += extra_cost->ldst.load;
6237 return true;
6239 case HIGH:
6240 case LO_SUM:
6241 /* ADRP/ADD (immediate). */
6242 if (speed)
6243 *cost += extra_cost->alu.arith;
6244 return true;
6246 case ZERO_EXTRACT:
6247 case SIGN_EXTRACT:
6248 /* UBFX/SBFX. */
6249 if (speed)
6250 *cost += extra_cost->alu.bfx;
6252 /* We can trust that the immediates used will be correct (there
6253 are no by-register forms), so we need only cost op0. */
6254 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6255 return true;
6257 case MULT:
6258 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6259 /* aarch64_rtx_mult_cost always handles recursion into its
6260 operands. */
6261 return true;
6263 case MOD:
6264 case UMOD:
6265 if (speed)
6267 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6268 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6269 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6270 else if (GET_MODE (x) == DFmode)
6271 *cost += (extra_cost->fp[1].mult
6272 + extra_cost->fp[1].div);
6273 else if (GET_MODE (x) == SFmode)
6274 *cost += (extra_cost->fp[0].mult
6275 + extra_cost->fp[0].div);
6277 return false; /* All arguments need to be in registers. */
6279 case DIV:
6280 case UDIV:
6281 case SQRT:
6282 if (speed)
6284 if (GET_MODE_CLASS (mode) == MODE_INT)
6285 /* There is no integer SQRT, so only DIV and UDIV can get
6286 here. */
6287 *cost += extra_cost->mult[mode == DImode].idiv;
6288 else
6289 *cost += extra_cost->fp[mode == DFmode].div;
6291 return false; /* All arguments need to be in registers. */
6293 case IF_THEN_ELSE:
6294 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6295 XEXP (x, 2), cost, speed);
6297 case EQ:
6298 case NE:
6299 case GT:
6300 case GTU:
6301 case LT:
6302 case LTU:
6303 case GE:
6304 case GEU:
6305 case LE:
6306 case LEU:
6308 return false; /* All arguments must be in registers. */
6310 case FMA:
6311 op0 = XEXP (x, 0);
6312 op1 = XEXP (x, 1);
6313 op2 = XEXP (x, 2);
6315 if (speed)
6316 *cost += extra_cost->fp[mode == DFmode].fma;
6318 /* FMSUB, FNMADD, and FNMSUB are free. */
6319 if (GET_CODE (op0) == NEG)
6320 op0 = XEXP (op0, 0);
6322 if (GET_CODE (op2) == NEG)
6323 op2 = XEXP (op2, 0);
6325 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6326 and the by-element operand as operand 0. */
6327 if (GET_CODE (op1) == NEG)
6328 op1 = XEXP (op1, 0);
6330 /* Catch vector-by-element operations. The by-element operand can
6331 either be (vec_duplicate (vec_select (x))) or just
6332 (vec_select (x)), depending on whether we are multiplying by
6333 a vector or a scalar.
6335 Canonicalization is not very good in these cases: FMA4 will put the
6336 by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
6337 if (GET_CODE (op0) == VEC_DUPLICATE)
6338 op0 = XEXP (op0, 0);
6339 else if (GET_CODE (op1) == VEC_DUPLICATE)
6340 op1 = XEXP (op1, 0);
6342 if (GET_CODE (op0) == VEC_SELECT)
6343 op0 = XEXP (op0, 0);
6344 else if (GET_CODE (op1) == VEC_SELECT)
6345 op1 = XEXP (op1, 0);
6347 /* If the remaining parameters are not registers,
6348 get the cost to put them into registers. */
6349 *cost += rtx_cost (op0, FMA, 0, speed);
6350 *cost += rtx_cost (op1, FMA, 1, speed);
6351 *cost += rtx_cost (op2, FMA, 2, speed);
6352 return true;
6354 case FLOAT_EXTEND:
6355 if (speed)
6356 *cost += extra_cost->fp[mode == DFmode].widen;
6357 return false;
6359 case FLOAT_TRUNCATE:
6360 if (speed)
6361 *cost += extra_cost->fp[mode == DFmode].narrow;
6362 return false;
6364 case FIX:
6365 case UNSIGNED_FIX:
6366 x = XEXP (x, 0);
6367 /* Strip the rounding part. They will all be implemented
6368 by the fcvt* family of instructions anyway. */
6369 if (GET_CODE (x) == UNSPEC)
6371 unsigned int uns_code = XINT (x, 1);
6373 if (uns_code == UNSPEC_FRINTA
6374 || uns_code == UNSPEC_FRINTM
6375 || uns_code == UNSPEC_FRINTN
6376 || uns_code == UNSPEC_FRINTP
6377 || uns_code == UNSPEC_FRINTZ)
6378 x = XVECEXP (x, 0, 0);
6381 if (speed)
6382 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6384 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6385 return true;
6387 case ABS:
6388 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6390 /* FABS and FNEG are analogous. */
6391 if (speed)
6392 *cost += extra_cost->fp[mode == DFmode].neg;
6394 else
6396 /* Integer ABS will either be split into
6397 two arithmetic instructions, or will be an ABS
6398 (scalar), which we don't model. */
6399 *cost = COSTS_N_INSNS (2);
6400 if (speed)
6401 *cost += 2 * extra_cost->alu.arith;
6403 return false;
6405 case SMAX:
6406 case SMIN:
6407 if (speed)
6409 /* FMAXNM/FMINNM/FMAX/FMIN.
6410 TODO: This may not be accurate for all implementations, but
6411 we do not model this in the cost tables. */
6412 *cost += extra_cost->fp[mode == DFmode].addsub;
6414 return false;
6416 case UNSPEC:
6417 /* The floating point round to integer frint* instructions. */
6418 if (aarch64_frint_unspec_p (XINT (x, 1)))
6420 if (speed)
6421 *cost += extra_cost->fp[mode == DFmode].roundint;
6423 return false;
6426 if (XINT (x, 1) == UNSPEC_RBIT)
6428 if (speed)
6429 *cost += extra_cost->alu.rev;
6431 return false;
6433 break;
6435 case TRUNCATE:
6437 /* Decompose <su>muldi3_highpart. */
6438 if (/* (truncate:DI */
6439 mode == DImode
6440 /* (lshiftrt:TI */
6441 && GET_MODE (XEXP (x, 0)) == TImode
6442 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6443 /* (mult:TI */
6444 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6445 /* (ANY_EXTEND:TI (reg:DI))
6446 (ANY_EXTEND:TI (reg:DI))) */
6447 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6448 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6449 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6450 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6451 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6452 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6453 /* (const_int 64) */
6454 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6455 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6457 /* UMULH/SMULH. */
6458 if (speed)
6459 *cost += extra_cost->mult[mode == DImode].extend;
6460 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6461 MULT, 0, speed);
6462 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6463 MULT, 1, speed);
6464 return true;
6467 /* Fall through. */
6468 default:
6469 break;
6472 if (dump_file && (dump_flags & TDF_DETAILS))
6473 fprintf (dump_file,
6474 "\nFailed to cost RTX. Assuming default cost.\n");
6476 return true;
6479 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6480 calculated for X. This cost is stored in *COST. Returns true
6481 if the total cost of X was calculated. */
6482 static bool
6483 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6484 int param, int *cost, bool speed)
6486 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6488 if (dump_file && (dump_flags & TDF_DETAILS))
6490 print_rtl_single (dump_file, x);
6491 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6492 speed ? "Hot" : "Cold",
6493 *cost, result ? "final" : "partial");
6496 return result;
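/* A minimal sketch of the dump output, assuming a detailed RTL dump is
   enabled (for example -fdump-rtl-combine-details): each costed
   expression is printed followed by a line such as

     Hot cost: 8 (final)

   where "Hot"/"Cold" reflects SPEED and "final"/"partial" reflects the
   return value of aarch64_rtx_costs.  */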
6499 static int
6500 aarch64_register_move_cost (machine_mode mode,
6501 reg_class_t from_i, reg_class_t to_i)
6503 enum reg_class from = (enum reg_class) from_i;
6504 enum reg_class to = (enum reg_class) to_i;
6505 const struct cpu_regmove_cost *regmove_cost
6506 = aarch64_tune_params->regmove_cost;
6508 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6509 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6510 to = GENERAL_REGS;
6512 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6513 from = GENERAL_REGS;
6515 /* The cost of moving between GPRs and the stack register is the same as GP2GP. */
6516 if ((from == GENERAL_REGS && to == STACK_REG)
6517 || (to == GENERAL_REGS && from == STACK_REG))
6518 return regmove_cost->GP2GP;
6520 /* To/From the stack register, we move via the gprs. */
6521 if (to == STACK_REG || from == STACK_REG)
6522 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6523 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6525 if (GET_MODE_SIZE (mode) == 16)
6527 /* 128-bit operations on general registers require 2 instructions. */
6528 if (from == GENERAL_REGS && to == GENERAL_REGS)
6529 return regmove_cost->GP2GP * 2;
6530 else if (from == GENERAL_REGS)
6531 return regmove_cost->GP2FP * 2;
6532 else if (to == GENERAL_REGS)
6533 return regmove_cost->FP2GP * 2;
6535 /* When AdvSIMD instructions are disabled it is not possible to move
6536 a 128-bit value directly between Q registers. This is handled in
6537 secondary reload. A general register is used as a scratch to move
6538 the upper DI value and the lower DI value is moved directly,
6539 hence the cost is the sum of three moves. */
6540 if (! TARGET_SIMD)
6541 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6543 return regmove_cost->FP2FP;
6546 if (from == GENERAL_REGS && to == GENERAL_REGS)
6547 return regmove_cost->GP2GP;
6548 else if (from == GENERAL_REGS)
6549 return regmove_cost->GP2FP;
6550 else if (to == GENERAL_REGS)
6551 return regmove_cost->FP2GP;
6553 return regmove_cost->FP2FP;
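/* Worked example: with !TARGET_SIMD, moving a 16-byte value between two
   FP registers is costed as GP2FP + FP2GP + FP2FP, matching the
   secondary-reload sequence described above, whereas a 16-byte
   GENERAL_REGS to GENERAL_REGS move costs 2 * GP2GP.  */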
6556 static int
6557 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6558 reg_class_t rclass ATTRIBUTE_UNUSED,
6559 bool in ATTRIBUTE_UNUSED)
6561 return aarch64_tune_params->memmov_cost;
6564 /* Return the number of instructions that can be issued per cycle. */
6565 static int
6566 aarch64_sched_issue_rate (void)
6568 return aarch64_tune_params->issue_rate;
6571 static int
6572 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6574 int issue_rate = aarch64_sched_issue_rate ();
6576 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6579 /* Vectorizer cost model target hooks. */
6581 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6582 static int
6583 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6584 tree vectype,
6585 int misalign ATTRIBUTE_UNUSED)
6587 unsigned elements;
6589 switch (type_of_cost)
6591 case scalar_stmt:
6592 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6594 case scalar_load:
6595 return aarch64_tune_params->vec_costs->scalar_load_cost;
6597 case scalar_store:
6598 return aarch64_tune_params->vec_costs->scalar_store_cost;
6600 case vector_stmt:
6601 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6603 case vector_load:
6604 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6606 case vector_store:
6607 return aarch64_tune_params->vec_costs->vec_store_cost;
6609 case vec_to_scalar:
6610 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6612 case scalar_to_vec:
6613 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6615 case unaligned_load:
6616 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6618 case unaligned_store:
6619 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6621 case cond_branch_taken:
6622 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6624 case cond_branch_not_taken:
6625 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6627 case vec_perm:
6628 case vec_promote_demote:
6629 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6631 case vec_construct:
6632 elements = TYPE_VECTOR_SUBPARTS (vectype);
6633 return elements / 2 + 1;
6635 default:
6636 gcc_unreachable ();
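/* Worked example for the vec_construct case above: constructing a V4SI
   vector (TYPE_VECTOR_SUBPARTS == 4) is costed as 4 / 2 + 1 = 3.  */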
6640 /* Implement targetm.vectorize.add_stmt_cost. */
6641 static unsigned
6642 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6643 struct _stmt_vec_info *stmt_info, int misalign,
6644 enum vect_cost_model_location where)
6646 unsigned *cost = (unsigned *) data;
6647 unsigned retval = 0;
6649 if (flag_vect_cost_model)
6651 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6652 int stmt_cost =
6653 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6655 /* Statements in an inner loop relative to the loop being
6656 vectorized are weighted more heavily. The value here is
6657 a function (linear for now) of the loop nest level. */
6658 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6660 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6661 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6662 unsigned nest_level = loop_depth (loop);
6664 count *= nest_level;
6667 retval = (unsigned) (count * stmt_cost);
6668 cost[where] += retval;
6671 return retval;
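/* Worked example: a vector statement whose per-statement cost is, say, 1
   from the tuning tables, appearing in the vectorized loop body inside an
   inner loop at nest depth 2 with COUNT == 1, has COUNT scaled to 2, so
   retval = 2 * 1 = 2 is added to cost[vect_body].  */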
6674 static void initialize_aarch64_code_model (void);
6676 /* Parse the architecture extension string. */
6678 static void
6679 aarch64_parse_extension (char *str)
6681 /* The extension string is parsed left to right. */
6682 const struct aarch64_option_extension *opt = NULL;
6684 /* Flag to say whether we are adding or removing an extension. */
6685 int adding_ext = -1;
6687 while (str != NULL && *str != 0)
6689 char *ext;
6690 size_t len;
6692 str++;
6693 ext = strchr (str, '+');
6695 if (ext != NULL)
6696 len = ext - str;
6697 else
6698 len = strlen (str);
6700 if (len >= 2 && strncmp (str, "no", 2) == 0)
6702 adding_ext = 0;
6703 len -= 2;
6704 str += 2;
6706 else if (len > 0)
6707 adding_ext = 1;
6709 if (len == 0)
6711 error ("missing feature modifier after %qs", adding_ext ? "+"
6712 : "+no");
6713 return;
6716 /* Scan over the extensions table trying to find an exact match. */
6717 for (opt = all_extensions; opt->name != NULL; opt++)
6719 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6721 /* Add or remove the extension. */
6722 if (adding_ext)
6723 aarch64_isa_flags |= opt->flags_on;
6724 else
6725 aarch64_isa_flags &= ~(opt->flags_off);
6726 break;
6730 if (opt->name == NULL)
6732 /* Extension not found in list. */
6733 error ("unknown feature modifier %qs", str);
6734 return;
6737 str = ext;
6740 return;
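/* For example, given the extension suffix "+crypto+nofp", the loop above
   first enables the flags_on bits of the "crypto" extension and then
   clears the flags_off bits of the "fp" extension.  */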
6743 /* Parse the ARCH string. */
6745 static void
6746 aarch64_parse_arch (void)
6748 char *ext;
6749 const struct processor *arch;
6750 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6751 size_t len;
6753 strcpy (str, aarch64_arch_string);
6755 ext = strchr (str, '+');
6757 if (ext != NULL)
6758 len = ext - str;
6759 else
6760 len = strlen (str);
6762 if (len == 0)
6764 error ("missing arch name in -march=%qs", str);
6765 return;
6768 /* Loop through the list of supported ARCHs to find a match. */
6769 for (arch = all_architectures; arch->name != NULL; arch++)
6771 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6773 selected_arch = arch;
6774 aarch64_isa_flags = selected_arch->flags;
6776 if (!selected_cpu)
6777 selected_cpu = &all_cores[selected_arch->core];
6779 if (ext != NULL)
6781 /* ARCH string contains at least one extension. */
6782 aarch64_parse_extension (ext);
6785 if (strcmp (selected_arch->arch, selected_cpu->arch))
6787 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6788 selected_cpu->name, selected_arch->name);
6791 return;
6795 /* ARCH name not found in list. */
6796 error ("unknown value %qs for -march", str);
6797 return;
6800 /* Parse the CPU string. */
6802 static void
6803 aarch64_parse_cpu (void)
6805 char *ext;
6806 const struct processor *cpu;
6807 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6808 size_t len;
6810 strcpy (str, aarch64_cpu_string);
6812 ext = strchr (str, '+');
6814 if (ext != NULL)
6815 len = ext - str;
6816 else
6817 len = strlen (str);
6819 if (len == 0)
6821 error ("missing cpu name in -mcpu=%qs", str);
6822 return;
6825 /* Loop through the list of supported CPUs to find a match. */
6826 for (cpu = all_cores; cpu->name != NULL; cpu++)
6828 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6830 selected_cpu = cpu;
6831 aarch64_isa_flags = selected_cpu->flags;
6833 if (ext != NULL)
6835 /* CPU string contains at least one extension. */
6836 aarch64_parse_extension (ext);
6839 return;
6843 /* CPU name not found in list. */
6844 error ("unknown value %qs for -mcpu", str);
6845 return;
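/* For example, "-mcpu=cortex-a57+crypto" selects the cortex-a57 entry
   from all_cores, takes its ISA flags, and then hands the remaining
   "+crypto" suffix to aarch64_parse_extension above.  */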
6848 /* Parse the TUNE string. */
6850 static void
6851 aarch64_parse_tune (void)
6853 const struct processor *cpu;
6854 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6855 strcpy (str, aarch64_tune_string);
6857 /* Loop through the list of supported CPUs to find a match. */
6858 for (cpu = all_cores; cpu->name != NULL; cpu++)
6860 if (strcmp (cpu->name, str) == 0)
6862 selected_tune = cpu;
6863 return;
6867 /* CPU name not found in list. */
6868 error ("unknown value %qs for -mtune", str);
6869 return;
6873 /* Implement TARGET_OPTION_OVERRIDE. */
6875 static void
6876 aarch64_override_options (void)
6878 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6879 If either of -march or -mtune is given, they override their
6880 respective component of -mcpu.
6882 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6883 with -march: if -mcpu is not present on the command line, -march
6884 must set a sensible default CPU. */
6885 if (aarch64_cpu_string)
6887 aarch64_parse_cpu ();
6890 if (aarch64_arch_string)
6892 aarch64_parse_arch ();
6895 if (aarch64_tune_string)
6897 aarch64_parse_tune ();
6900 #ifndef HAVE_AS_MABI_OPTION
6901 /* The compiler may have been configured with 2.23.* binutils, which does
6902 not have support for ILP32. */
6903 if (TARGET_ILP32)
6904 error ("Assembler does not support -mabi=ilp32");
6905 #endif
6907 initialize_aarch64_code_model ();
6909 aarch64_build_bitmask_table ();
6911 /* This target defaults to strict volatile bitfields. */
6912 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6913 flag_strict_volatile_bitfields = 1;
6915 /* If the user did not specify a processor, choose the default
6916 one for them. This will be the CPU set during configuration using
6917 --with-cpu, otherwise it is "generic". */
6918 if (!selected_cpu)
6920 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6921 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6924 gcc_assert (selected_cpu);
6926 if (!selected_tune)
6927 selected_tune = selected_cpu;
6929 aarch64_tune_flags = selected_tune->flags;
6930 aarch64_tune = selected_tune->core;
6931 aarch64_tune_params = selected_tune->tune;
6932 aarch64_architecture_version = selected_cpu->architecture_version;
6934 if (aarch64_fix_a53_err835769 == 2)
6936 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6937 aarch64_fix_a53_err835769 = 1;
6938 #else
6939 aarch64_fix_a53_err835769 = 0;
6940 #endif
6943 /* If not optimizing for size, set the default
6944 alignment to what the target wants. */
6945 if (!optimize_size)
6947 if (align_loops <= 0)
6948 align_loops = aarch64_tune_params->loop_align;
6949 if (align_jumps <= 0)
6950 align_jumps = aarch64_tune_params->jump_align;
6951 if (align_functions <= 0)
6952 align_functions = aarch64_tune_params->function_align;
6955 if (AARCH64_TUNE_FMA_STEERING)
6956 aarch64_register_fma_steering ();
6958 aarch64_override_options_after_change ();
6961 /* Implement targetm.override_options_after_change. */
6963 static void
6964 aarch64_override_options_after_change (void)
6966 if (flag_omit_frame_pointer)
6967 flag_omit_leaf_frame_pointer = false;
6968 else if (flag_omit_leaf_frame_pointer)
6969 flag_omit_frame_pointer = true;
6972 static struct machine_function *
6973 aarch64_init_machine_status (void)
6975 struct machine_function *machine;
6976 machine = ggc_cleared_alloc<machine_function> ();
6977 return machine;
6980 void
6981 aarch64_init_expanders (void)
6983 init_machine_status = aarch64_init_machine_status;
6986 /* A checking mechanism for the implementation of the various code models. */
6987 static void
6988 initialize_aarch64_code_model (void)
6990 if (flag_pic)
6992 switch (aarch64_cmodel_var)
6994 case AARCH64_CMODEL_TINY:
6995 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6996 break;
6997 case AARCH64_CMODEL_SMALL:
6998 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6999 break;
7000 case AARCH64_CMODEL_LARGE:
7001 sorry ("code model %qs with -f%s", "large",
7002 flag_pic > 1 ? "PIC" : "pic");
7003 default:
7004 gcc_unreachable ();
7007 else
7008 aarch64_cmodel = aarch64_cmodel_var;
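/* For example, the default small code model combined with -fpic or -fPIC
   becomes AARCH64_CMODEL_SMALL_PIC, while the large code model is
   rejected with a "sorry" when any form of PIC is requested.  */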
7011 /* Return true if SYMBOL_REF X binds locally. */
7013 static bool
7014 aarch64_symbol_binds_local_p (const_rtx x)
7016 return (SYMBOL_REF_DECL (x)
7017 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7018 : SYMBOL_REF_LOCAL_P (x));
7021 /* Return true if SYMBOL_REF X is thread-local. */
7022 static bool
7023 aarch64_tls_symbol_p (rtx x)
7025 if (! TARGET_HAVE_TLS)
7026 return false;
7028 if (GET_CODE (x) != SYMBOL_REF)
7029 return false;
7031 return SYMBOL_REF_TLS_MODEL (x) != 0;
7034 /* Classify a TLS symbol into one of the TLS kinds. */
7035 enum aarch64_symbol_type
7036 aarch64_classify_tls_symbol (rtx x)
7038 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7040 switch (tls_kind)
7042 case TLS_MODEL_GLOBAL_DYNAMIC:
7043 case TLS_MODEL_LOCAL_DYNAMIC:
7044 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7046 case TLS_MODEL_INITIAL_EXEC:
7047 return SYMBOL_SMALL_GOTTPREL;
7049 case TLS_MODEL_LOCAL_EXEC:
7050 return SYMBOL_SMALL_TPREL;
7052 case TLS_MODEL_EMULATED:
7053 case TLS_MODEL_NONE:
7054 return SYMBOL_FORCE_TO_MEM;
7056 default:
7057 gcc_unreachable ();
7061 /* Return the method that should be used to access SYMBOL_REF or
7062 LABEL_REF X in context CONTEXT. */
7064 enum aarch64_symbol_type
7065 aarch64_classify_symbol (rtx x, rtx offset,
7066 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7068 if (GET_CODE (x) == LABEL_REF)
7070 switch (aarch64_cmodel)
7072 case AARCH64_CMODEL_LARGE:
7073 return SYMBOL_FORCE_TO_MEM;
7075 case AARCH64_CMODEL_TINY_PIC:
7076 case AARCH64_CMODEL_TINY:
7077 return SYMBOL_TINY_ABSOLUTE;
7079 case AARCH64_CMODEL_SMALL_PIC:
7080 case AARCH64_CMODEL_SMALL:
7081 return SYMBOL_SMALL_ABSOLUTE;
7083 default:
7084 gcc_unreachable ();
7088 if (GET_CODE (x) == SYMBOL_REF)
7090 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7091 return SYMBOL_FORCE_TO_MEM;
7093 if (aarch64_tls_symbol_p (x))
7094 return aarch64_classify_tls_symbol (x);
7096 switch (aarch64_cmodel)
7098 case AARCH64_CMODEL_TINY:
7099 /* When we retrieve a symbol + offset address, we have to make sure
7100 the offset does not cause overflow of the final address. But we
7101 have no way of knowing the address of the symbol at compile time,
7102 so we can't accurately say if the distance between the PC and
7103 symbol + offset is outside the addressable range of +/-1M in the
7104 TINY code model. So we rely on images not being greater than
7105 1M, cap the offset at 1M, and require anything beyond 1M to
7106 be loaded using an alternative mechanism. */
7107 if (SYMBOL_REF_WEAK (x)
7108 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7109 return SYMBOL_FORCE_TO_MEM;
7110 return SYMBOL_TINY_ABSOLUTE;
7112 case AARCH64_CMODEL_SMALL:
7113 /* Same reasoning as the tiny code model, but the offset cap here is
7114 4G. */
7115 if (SYMBOL_REF_WEAK (x)
7116 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7117 HOST_WIDE_INT_C (4294967264)))
7118 return SYMBOL_FORCE_TO_MEM;
7119 return SYMBOL_SMALL_ABSOLUTE;
7121 case AARCH64_CMODEL_TINY_PIC:
7122 if (!aarch64_symbol_binds_local_p (x))
7123 return SYMBOL_TINY_GOT;
7124 return SYMBOL_TINY_ABSOLUTE;
7126 case AARCH64_CMODEL_SMALL_PIC:
7127 if (!aarch64_symbol_binds_local_p (x))
7128 return SYMBOL_SMALL_GOT;
7129 return SYMBOL_SMALL_ABSOLUTE;
7131 default:
7132 gcc_unreachable ();
7136 /* By default push everything into the constant pool. */
7137 return SYMBOL_FORCE_TO_MEM;
7140 bool
7141 aarch64_constant_address_p (rtx x)
7143 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7146 bool
7147 aarch64_legitimate_pic_operand_p (rtx x)
7149 if (GET_CODE (x) == SYMBOL_REF
7150 || (GET_CODE (x) == CONST
7151 && GET_CODE (XEXP (x, 0)) == PLUS
7152 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7153 return false;
7155 return true;
7158 /* Return true if X holds either a quarter-precision floating-point
7159 constant or the floating-point constant +0.0. */
7160 static bool
7161 aarch64_valid_floating_const (machine_mode mode, rtx x)
7163 if (!CONST_DOUBLE_P (x))
7164 return false;
7166 /* TODO: We could handle moving 0.0 to a TFmode register,
7167 but first we would like to refactor the movtf_aarch64 pattern
7168 to be more amenable to splitting moves properly and to
7169 gating correctly on TARGET_SIMD. For now, reject all
7170 constants that are not destined for SFmode or DFmode registers. */
7171 if (!(mode == SFmode || mode == DFmode))
7172 return false;
7174 if (aarch64_float_const_zero_rtx_p (x))
7175 return true;
7176 return aarch64_float_const_representable_p (x);
7179 static bool
7180 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7182 /* Do not allow vector struct mode constants. We could support
7183 0 and -1 easily, but they need support in aarch64-simd.md. */
7184 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7185 return false;
7187 /* This could probably go away because
7188 we now decompose CONST_INTs according to expand_mov_immediate. */
7189 if ((GET_CODE (x) == CONST_VECTOR
7190 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7191 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7192 return !targetm.cannot_force_const_mem (mode, x);
7194 if (GET_CODE (x) == HIGH
7195 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7196 return true;
7198 return aarch64_constant_address_p (x);
7202 aarch64_load_tp (rtx target)
7204 if (!target
7205 || GET_MODE (target) != Pmode
7206 || !register_operand (target, Pmode))
7207 target = gen_reg_rtx (Pmode);
7209 /* Can return in any reg. */
7210 emit_insn (gen_aarch64_load_tp_hard (target));
7211 return target;
7214 /* On AAPCS systems, this is the "struct __va_list". */
7215 static GTY(()) tree va_list_type;
7217 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7218 Return the type to use as __builtin_va_list.
7220 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7222 struct __va_list
7224 void *__stack;
7225 void *__gr_top;
7226 void *__vr_top;
7227 int __gr_offs;
7228 int __vr_offs;
7229 }; */
7231 static tree
7232 aarch64_build_builtin_va_list (void)
7234 tree va_list_name;
7235 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7237 /* Create the type. */
7238 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7239 /* Give it the required name. */
7240 va_list_name = build_decl (BUILTINS_LOCATION,
7241 TYPE_DECL,
7242 get_identifier ("__va_list"),
7243 va_list_type);
7244 DECL_ARTIFICIAL (va_list_name) = 1;
7245 TYPE_NAME (va_list_type) = va_list_name;
7246 TYPE_STUB_DECL (va_list_type) = va_list_name;
7248 /* Create the fields. */
7249 f_stack = build_decl (BUILTINS_LOCATION,
7250 FIELD_DECL, get_identifier ("__stack"),
7251 ptr_type_node);
7252 f_grtop = build_decl (BUILTINS_LOCATION,
7253 FIELD_DECL, get_identifier ("__gr_top"),
7254 ptr_type_node);
7255 f_vrtop = build_decl (BUILTINS_LOCATION,
7256 FIELD_DECL, get_identifier ("__vr_top"),
7257 ptr_type_node);
7258 f_groff = build_decl (BUILTINS_LOCATION,
7259 FIELD_DECL, get_identifier ("__gr_offs"),
7260 integer_type_node);
7261 f_vroff = build_decl (BUILTINS_LOCATION,
7262 FIELD_DECL, get_identifier ("__vr_offs"),
7263 integer_type_node);
7265 DECL_ARTIFICIAL (f_stack) = 1;
7266 DECL_ARTIFICIAL (f_grtop) = 1;
7267 DECL_ARTIFICIAL (f_vrtop) = 1;
7268 DECL_ARTIFICIAL (f_groff) = 1;
7269 DECL_ARTIFICIAL (f_vroff) = 1;
7271 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7272 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7273 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7274 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7275 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7277 TYPE_FIELDS (va_list_type) = f_stack;
7278 DECL_CHAIN (f_stack) = f_grtop;
7279 DECL_CHAIN (f_grtop) = f_vrtop;
7280 DECL_CHAIN (f_vrtop) = f_groff;
7281 DECL_CHAIN (f_groff) = f_vroff;
7283 /* Compute its layout. */
7284 layout_type (va_list_type);
7286 return va_list_type;
7289 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7290 static void
7291 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7293 const CUMULATIVE_ARGS *cum;
7294 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7295 tree stack, grtop, vrtop, groff, vroff;
7296 tree t;
7297 int gr_save_area_size;
7298 int vr_save_area_size;
7299 int vr_offset;
7301 cum = &crtl->args.info;
7302 gr_save_area_size
7303 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7304 vr_save_area_size
7305 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7307 if (TARGET_GENERAL_REGS_ONLY)
7309 if (cum->aapcs_nvrn > 0)
7310 sorry ("%qs and floating point or vector arguments",
7311 "-mgeneral-regs-only");
7312 vr_save_area_size = 0;
7315 f_stack = TYPE_FIELDS (va_list_type_node);
7316 f_grtop = DECL_CHAIN (f_stack);
7317 f_vrtop = DECL_CHAIN (f_grtop);
7318 f_groff = DECL_CHAIN (f_vrtop);
7319 f_vroff = DECL_CHAIN (f_groff);
7321 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7322 NULL_TREE);
7323 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7324 NULL_TREE);
7325 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7326 NULL_TREE);
7327 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7328 NULL_TREE);
7329 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7330 NULL_TREE);
7332 /* Emit code to initialize STACK, which points to the next varargs stack
7333 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7334 by named arguments. STACK is 8-byte aligned. */
7335 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7336 if (cum->aapcs_stack_size > 0)
7337 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7338 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7339 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7341 /* Emit code to initialize GRTOP, the top of the GR save area.
7342 virtual_incoming_args_rtx should have been 16 byte aligned. */
7343 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7344 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7345 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7347 /* Emit code to initialize VRTOP, the top of the VR save area.
7348 This address is gr_save_area_bytes below GRTOP, rounded
7349 down to the next 16-byte boundary. */
7350 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7351 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7352 STACK_BOUNDARY / BITS_PER_UNIT);
7354 if (vr_offset)
7355 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7356 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7357 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7359 /* Emit code to initialize GROFF, the offset from GRTOP of the
7360 next GPR argument. */
7361 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7362 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7363 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7365 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7366 of the next VR argument. */
7367 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7368 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7369 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
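/* Illustrative summary of the initialization above, written as C-like
   pseudo-code using the AAPCS64 __va_list field names:

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   where incoming_args stands for virtual_incoming_args_rtx.  */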
7372 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7374 static tree
7375 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7376 gimple_seq *post_p ATTRIBUTE_UNUSED)
7378 tree addr;
7379 bool indirect_p;
7380 bool is_ha; /* is HFA or HVA. */
7381 bool dw_align; /* double-word align. */
7382 machine_mode ag_mode = VOIDmode;
7383 int nregs;
7384 machine_mode mode;
7386 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7387 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7388 HOST_WIDE_INT size, rsize, adjust, align;
7389 tree t, u, cond1, cond2;
7391 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7392 if (indirect_p)
7393 type = build_pointer_type (type);
7395 mode = TYPE_MODE (type);
7397 f_stack = TYPE_FIELDS (va_list_type_node);
7398 f_grtop = DECL_CHAIN (f_stack);
7399 f_vrtop = DECL_CHAIN (f_grtop);
7400 f_groff = DECL_CHAIN (f_vrtop);
7401 f_vroff = DECL_CHAIN (f_groff);
7403 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7404 f_stack, NULL_TREE);
7405 size = int_size_in_bytes (type);
7406 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7408 dw_align = false;
7409 adjust = 0;
7410 if (aarch64_vfp_is_call_or_return_candidate (mode,
7411 type,
7412 &ag_mode,
7413 &nregs,
7414 &is_ha))
7416 /* TYPE passed in fp/simd registers. */
7417 if (TARGET_GENERAL_REGS_ONLY)
7418 sorry ("%qs and floating point or vector arguments",
7419 "-mgeneral-regs-only");
7421 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7422 unshare_expr (valist), f_vrtop, NULL_TREE);
7423 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7424 unshare_expr (valist), f_vroff, NULL_TREE);
7426 rsize = nregs * UNITS_PER_VREG;
7428 if (is_ha)
7430 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7431 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7433 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7434 && size < UNITS_PER_VREG)
7436 adjust = UNITS_PER_VREG - size;
7439 else
7441 /* TYPE passed in general registers. */
7442 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7443 unshare_expr (valist), f_grtop, NULL_TREE);
7444 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7445 unshare_expr (valist), f_groff, NULL_TREE);
7446 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7447 nregs = rsize / UNITS_PER_WORD;
7449 if (align > 8)
7450 dw_align = true;
7452 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7453 && size < UNITS_PER_WORD)
7455 adjust = UNITS_PER_WORD - size;
7459 /* Get a local temporary for the field value. */
7460 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7462 /* Emit code to branch if off >= 0. */
7463 t = build2 (GE_EXPR, boolean_type_node, off,
7464 build_int_cst (TREE_TYPE (off), 0));
7465 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7467 if (dw_align)
7469 /* Emit: offs = (offs + 15) & -16. */
7470 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7471 build_int_cst (TREE_TYPE (off), 15));
7472 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7473 build_int_cst (TREE_TYPE (off), -16));
7474 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7476 else
7477 roundup = NULL;
7479 /* Update ap.__[g|v]r_offs */
7480 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7481 build_int_cst (TREE_TYPE (off), rsize));
7482 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7484 /* String up. */
7485 if (roundup)
7486 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7488 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7489 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7490 build_int_cst (TREE_TYPE (f_off), 0));
7491 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7493 /* String up: make sure the assignment happens before the use. */
7494 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7495 COND_EXPR_ELSE (cond1) = t;
7497 /* Prepare the trees handling the argument that is passed on the stack;
7498 the top level node will store in ON_STACK. */
7499 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7500 if (align > 8)
7502 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7503 t = fold_convert (intDI_type_node, arg);
7504 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7505 build_int_cst (TREE_TYPE (t), 15));
7506 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7507 build_int_cst (TREE_TYPE (t), -16));
7508 t = fold_convert (TREE_TYPE (arg), t);
7509 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7511 else
7512 roundup = NULL;
7513 /* Advance ap.__stack */
7514 t = fold_convert (intDI_type_node, arg);
7515 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7516 build_int_cst (TREE_TYPE (t), size + 7));
7517 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7518 build_int_cst (TREE_TYPE (t), -8));
7519 t = fold_convert (TREE_TYPE (arg), t);
7520 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7521 /* String up roundup and advance. */
7522 if (roundup)
7523 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7524 /* String up with arg */
7525 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7526 /* Big-endianness related address adjustment. */
7527 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7528 && size < UNITS_PER_WORD)
7530 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7531 size_int (UNITS_PER_WORD - size));
7532 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7535 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7536 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7538 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7539 t = off;
7540 if (adjust)
7541 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7542 build_int_cst (TREE_TYPE (off), adjust));
7544 t = fold_convert (sizetype, t);
7545 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7547 if (is_ha)
7549 /* type ha; // treat as "struct {ftype field[n];}"
7550 ... [computing offs]
7551 for (i = 0; i <nregs; ++i, offs += 16)
7552 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7553 return ha; */
7554 int i;
7555 tree tmp_ha, field_t, field_ptr_t;
7557 /* Declare a local variable. */
7558 tmp_ha = create_tmp_var_raw (type, "ha");
7559 gimple_add_tmp_var (tmp_ha);
7561 /* Establish the base type. */
7562 switch (ag_mode)
7564 case SFmode:
7565 field_t = float_type_node;
7566 field_ptr_t = float_ptr_type_node;
7567 break;
7568 case DFmode:
7569 field_t = double_type_node;
7570 field_ptr_t = double_ptr_type_node;
7571 break;
7572 case TFmode:
7573 field_t = long_double_type_node;
7574 field_ptr_t = long_double_ptr_type_node;
7575 break;
7576 /* Half-precision and quad-precision floats are not fully supported yet.
7577 Enable the following code once that support is complete; the correct
7578 type node for __fp16 * still needs to be found. */
7579 #if 0
7580 case HFmode:
7581 field_t = float_type_node;
7582 field_ptr_t = float_ptr_type_node;
7583 break;
7584 #endif
7585 case V2SImode:
7586 case V4SImode:
7588 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7589 field_t = build_vector_type_for_mode (innertype, ag_mode);
7590 field_ptr_t = build_pointer_type (field_t);
7592 break;
7593 default:
7594 gcc_assert (0);
7597 /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area). */
7598 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7599 addr = t;
7600 t = fold_convert (field_ptr_t, addr);
7601 t = build2 (MODIFY_EXPR, field_t,
7602 build1 (INDIRECT_REF, field_t, tmp_ha),
7603 build1 (INDIRECT_REF, field_t, t));
7605 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7606 for (i = 1; i < nregs; ++i)
7608 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7609 u = fold_convert (field_ptr_t, addr);
7610 u = build2 (MODIFY_EXPR, field_t,
7611 build2 (MEM_REF, field_t, tmp_ha,
7612 build_int_cst (field_ptr_t,
7613 (i *
7614 int_size_in_bytes (field_t)))),
7615 build1 (INDIRECT_REF, field_t, u));
7616 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7619 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7620 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7623 COND_EXPR_ELSE (cond2) = t;
7624 addr = fold_convert (build_pointer_type (type), cond1);
7625 addr = build_va_arg_indirect_ref (addr);
7627 if (indirect_p)
7628 addr = build_va_arg_indirect_ref (addr);
7630 return addr;
7633 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7635 static void
7636 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7637 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7638 int no_rtl)
7640 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7641 CUMULATIVE_ARGS local_cum;
7642 int gr_saved, vr_saved;
7644 /* The caller has advanced CUM up to, but not beyond, the last named
7645 argument. Advance a local copy of CUM past the last "real" named
7646 argument, to find out how many registers are left over. */
7647 local_cum = *cum;
7648 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7651 /* Find out how many registers we need to save. */
7651 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7652 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7654 if (TARGET_GENERAL_REGS_ONLY)
7656 if (local_cum.aapcs_nvrn > 0)
7657 sorry ("%qs and floating point or vector arguments",
7658 "-mgeneral-regs-only");
7659 vr_saved = 0;
7662 if (!no_rtl)
7664 if (gr_saved > 0)
7666 rtx ptr, mem;
7668 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7669 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7670 - gr_saved * UNITS_PER_WORD);
7671 mem = gen_frame_mem (BLKmode, ptr);
7672 set_mem_alias_set (mem, get_varargs_alias_set ());
7674 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7675 mem, gr_saved);
7677 if (vr_saved > 0)
7679 /* We can't use move_block_from_reg, because it will use
7680 the wrong mode, storing D regs only. */
7681 machine_mode mode = TImode;
7682 int off, i;
7684 /* Set OFF to the offset from virtual_incoming_args_rtx of
7685 the first vector register. The VR save area lies below
7686 the GR one, and is aligned to 16 bytes. */
7687 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7688 STACK_BOUNDARY / BITS_PER_UNIT);
7689 off -= vr_saved * UNITS_PER_VREG;
7691 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7693 rtx ptr, mem;
7695 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7696 mem = gen_frame_mem (mode, ptr);
7697 set_mem_alias_set (mem, get_varargs_alias_set ());
7698 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7699 off += UNITS_PER_VREG;
7704 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7705 any complication of having crtl->args.pretend_args_size changed. */
7706 cfun->machine->frame.saved_varargs_size
7707 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7708 STACK_BOUNDARY / BITS_PER_UNIT)
7709 + vr_saved * UNITS_PER_VREG);
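/* Worked example: a variadic function with two named integer arguments,
   no named FP/SIMD arguments and FP/SIMD enabled saves 6 general
   registers and 8 vector registers, so saved_varargs_size is
   ROUND_UP (6 * 8, 16) + 8 * 16 = 48 + 128 = 176 bytes.  */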
7712 static void
7713 aarch64_conditional_register_usage (void)
7715 int i;
7716 if (!TARGET_FLOAT)
7718 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7720 fixed_regs[i] = 1;
7721 call_used_regs[i] = 1;
7726 /* Walk down the type tree of TYPE counting consecutive base elements.
7727 If *MODEP is VOIDmode, then set it to the first valid floating point
7728 type. If a non-floating point type is found, or if a floating point
7729 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7730 otherwise return the count in the sub-tree. */
7731 static int
7732 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7734 machine_mode mode;
7735 HOST_WIDE_INT size;
7737 switch (TREE_CODE (type))
7739 case REAL_TYPE:
7740 mode = TYPE_MODE (type);
7741 if (mode != DFmode && mode != SFmode && mode != TFmode)
7742 return -1;
7744 if (*modep == VOIDmode)
7745 *modep = mode;
7747 if (*modep == mode)
7748 return 1;
7750 break;
7752 case COMPLEX_TYPE:
7753 mode = TYPE_MODE (TREE_TYPE (type));
7754 if (mode != DFmode && mode != SFmode && mode != TFmode)
7755 return -1;
7757 if (*modep == VOIDmode)
7758 *modep = mode;
7760 if (*modep == mode)
7761 return 2;
7763 break;
7765 case VECTOR_TYPE:
7766 /* Use V2SImode and V4SImode as representatives of all 64-bit
7767 and 128-bit vector types. */
7768 size = int_size_in_bytes (type);
7769 switch (size)
7771 case 8:
7772 mode = V2SImode;
7773 break;
7774 case 16:
7775 mode = V4SImode;
7776 break;
7777 default:
7778 return -1;
7781 if (*modep == VOIDmode)
7782 *modep = mode;
7784 /* Vector modes are considered to be opaque: two vectors are
7785 equivalent for the purposes of being homogeneous aggregates
7786 if they are the same size. */
7787 if (*modep == mode)
7788 return 1;
7790 break;
7792 case ARRAY_TYPE:
7794 int count;
7795 tree index = TYPE_DOMAIN (type);
7797 /* Can't handle incomplete types or sizes that are not
7798 fixed. */
7799 if (!COMPLETE_TYPE_P (type)
7800 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7801 return -1;
7803 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7804 if (count == -1
7805 || !index
7806 || !TYPE_MAX_VALUE (index)
7807 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7808 || !TYPE_MIN_VALUE (index)
7809 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7810 || count < 0)
7811 return -1;
7813 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7814 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7816 /* There must be no padding. */
7817 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7818 return -1;
7820 return count;
7823 case RECORD_TYPE:
7825 int count = 0;
7826 int sub_count;
7827 tree field;
7829 /* Can't handle incomplete types or sizes that are not
7830 fixed. */
7831 if (!COMPLETE_TYPE_P (type)
7832 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7833 return -1;
7835 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7837 if (TREE_CODE (field) != FIELD_DECL)
7838 continue;
7840 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7841 if (sub_count < 0)
7842 return -1;
7843 count += sub_count;
7846 /* There must be no padding. */
7847 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7848 return -1;
7850 return count;
7853 case UNION_TYPE:
7854 case QUAL_UNION_TYPE:
7856 /* These aren't very interesting except in a degenerate case. */
7857 int count = 0;
7858 int sub_count;
7859 tree field;
7861 /* Can't handle incomplete types or sizes that are not
7862 fixed. */
7863 if (!COMPLETE_TYPE_P (type)
7864 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7865 return -1;
7867 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7869 if (TREE_CODE (field) != FIELD_DECL)
7870 continue;
7872 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7873 if (sub_count < 0)
7874 return -1;
7875 count = count > sub_count ? count : sub_count;
7878 /* There must be no padding. */
7879 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7880 return -1;
7882 return count;
7885 default:
7886 break;
7889 return -1;
7892 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7893 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7894 array types. The C99 floating-point complex types are also considered
7895 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7896 types, which are GCC extensions and out of the scope of AAPCS64, are
7897 treated as composite types here as well.
7899 Note that MODE itself is not sufficient in determining whether a type
7900 is such a composite type or not. This is because
7901 stor-layout.c:compute_record_mode may have already changed the MODE
7902 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7903 structure with only one field may have its MODE set to the mode of the
7904 field. Also an integer mode whose size matches the size of the
7905 RECORD_TYPE type may be used to substitute the original mode
7906 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7907 solely relied on. */
7909 static bool
7910 aarch64_composite_type_p (const_tree type,
7911 machine_mode mode)
7913 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7914 return true;
7916 if (mode == BLKmode
7917 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7918 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7919 return true;
7921 return false;
7924 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7925 type as described in AAPCS64 \S 4.1.2.
7927 See the comment above aarch64_composite_type_p for the notes on MODE. */
7929 static bool
7930 aarch64_short_vector_p (const_tree type,
7931 machine_mode mode)
7933 HOST_WIDE_INT size = -1;
7935 if (type && TREE_CODE (type) == VECTOR_TYPE)
7936 size = int_size_in_bytes (type);
7937 else if (!aarch64_composite_type_p (type, mode)
7938 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7939 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7940 size = GET_MODE_SIZE (mode);
7942 return (size == 8 || size == 16);
7945 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7946 shall be passed or returned in simd/fp register(s) (providing these
7947 parameter passing registers are available).
7949 Upon successful return, *COUNT returns the number of needed registers,
7950 *BASE_MODE returns the mode of the individual register and when IS_HA
7951 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7952 floating-point aggregate or a homogeneous short-vector aggregate. */
7954 static bool
7955 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7956 const_tree type,
7957 machine_mode *base_mode,
7958 int *count,
7959 bool *is_ha)
7961 machine_mode new_mode = VOIDmode;
7962 bool composite_p = aarch64_composite_type_p (type, mode);
7964 if (is_ha != NULL) *is_ha = false;
7966 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7967 || aarch64_short_vector_p (type, mode))
7969 *count = 1;
7970 new_mode = mode;
7972 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7974 if (is_ha != NULL) *is_ha = true;
7975 *count = 2;
7976 new_mode = GET_MODE_INNER (mode);
7978 else if (type && composite_p)
7980 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7982 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7984 if (is_ha != NULL) *is_ha = true;
7985 *count = ag_count;
7987 else
7988 return false;
7990 else
7991 return false;
7993 *base_mode = new_mode;
7994 return true;
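/* For example, "struct { double x, y, z; }" is a homogeneous
   floating-point aggregate: aapcs_vfp_sub_candidate returns 3 with
   *MODEP == DFmode, so *COUNT is 3, *BASE_MODE is DFmode and *IS_HA is
   set, meaning the argument is passed in three consecutive D registers
   when enough of them are available.  */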
7997 /* Implement TARGET_STRUCT_VALUE_RTX. */
7999 static rtx
8000 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8001 int incoming ATTRIBUTE_UNUSED)
8003 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8006 /* Implements target hook vector_mode_supported_p. */
8007 static bool
8008 aarch64_vector_mode_supported_p (machine_mode mode)
8010 if (TARGET_SIMD
8011 && (mode == V4SImode || mode == V8HImode
8012 || mode == V16QImode || mode == V2DImode
8013 || mode == V2SImode || mode == V4HImode
8014 || mode == V8QImode || mode == V2SFmode
8015 || mode == V4SFmode || mode == V2DFmode
8016 || mode == V1DFmode))
8017 return true;
8019 return false;
8022 /* Return appropriate SIMD container
8023 for MODE within a vector of WIDTH bits. */
8024 static machine_mode
8025 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8027 gcc_assert (width == 64 || width == 128);
8028 if (TARGET_SIMD)
8030 if (width == 128)
8031 switch (mode)
8033 case DFmode:
8034 return V2DFmode;
8035 case SFmode:
8036 return V4SFmode;
8037 case SImode:
8038 return V4SImode;
8039 case HImode:
8040 return V8HImode;
8041 case QImode:
8042 return V16QImode;
8043 case DImode:
8044 return V2DImode;
8045 default:
8046 break;
8048 else
8049 switch (mode)
8051 case SFmode:
8052 return V2SFmode;
8053 case SImode:
8054 return V2SImode;
8055 case HImode:
8056 return V4HImode;
8057 case QImode:
8058 return V8QImode;
8059 default:
8060 break;
8063 return word_mode;
8066 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8067 static machine_mode
8068 aarch64_preferred_simd_mode (machine_mode mode)
8070 return aarch64_simd_container_mode (mode, 128);
8073 /* Return the bitmask of possible vector sizes for the vectorizer
8074 to iterate over. */
8075 static unsigned int
8076 aarch64_autovectorize_vector_sizes (void)
8078 return (16 | 8);
8081 /* Implement TARGET_MANGLE_TYPE. */
8083 static const char *
8084 aarch64_mangle_type (const_tree type)
8086 /* The AArch64 ABI documents say that "__va_list" has to be
8087 mangled as if it is in the "std" namespace. */
8088 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8089 return "St9__va_list";
8091 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8092 builtin types. */
8093 if (TYPE_NAME (type) != NULL)
8094 return aarch64_mangle_builtin_type (type);
8096 /* Use the default mangling. */
8097 return NULL;
8101 /* Return true if the rtx_insn contains a MEM RTX somewhere
8102 in it. */
8104 static bool
8105 has_memory_op (rtx_insn *mem_insn)
8107 subrtx_iterator::array_type array;
8108 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8109 if (MEM_P (*iter))
8110 return true;
8112 return false;
8115 /* Find the first rtx_insn before insn that will generate an assembly
8116 instruction. */
8118 static rtx_insn *
8119 aarch64_prev_real_insn (rtx_insn *insn)
8121 if (!insn)
8122 return NULL;
8126 insn = prev_real_insn (insn);
8128 while (insn && recog_memoized (insn) < 0);
8130 return insn;
8133 static bool
8134 is_madd_op (enum attr_type t1)
8136 unsigned int i;
8137 /* A number of these may be AArch32 only. */
8138 enum attr_type mlatypes[] = {
8139 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8140 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8141 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8144 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8146 if (t1 == mlatypes[i])
8147 return true;
8150 return false;
8153 /* Check if there is a register dependency between a load and the insn
8154 for which we hold recog_data. */
8156 static bool
8157 dep_between_memop_and_curr (rtx memop)
8159 rtx load_reg;
8160 int opno;
8162 gcc_assert (GET_CODE (memop) == SET);
8164 if (!REG_P (SET_DEST (memop)))
8165 return false;
8167 load_reg = SET_DEST (memop);
8168 for (opno = 1; opno < recog_data.n_operands; opno++)
8170 rtx operand = recog_data.operand[opno];
8171 if (REG_P (operand)
8172 && reg_overlap_mentioned_p (load_reg, operand))
8173 return true;
8176 return false;
8180 /* When working around the Cortex-A53 erratum 835769,
8181 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8182 instruction and has a preceding memory instruction such that a NOP
8183 should be inserted between them. */
8185 bool
8186 aarch64_madd_needs_nop (rtx_insn* insn)
8188 enum attr_type attr_type;
8189 rtx_insn *prev;
8190 rtx body;
8192 if (!aarch64_fix_a53_err835769)
8193 return false;
8195 if (recog_memoized (insn) < 0)
8196 return false;
8198 attr_type = get_attr_type (insn);
8199 if (!is_madd_op (attr_type))
8200 return false;
8202 prev = aarch64_prev_real_insn (insn);
8203 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8204 Restore recog state to INSN to avoid state corruption. */
8205 extract_constrain_insn_cached (insn);
8207 if (!prev || !has_memory_op (prev))
8208 return false;
8210 body = single_set (prev);
8212 /* If the previous insn is a memory op and there is no dependency between
8213 it and the DImode madd, emit a NOP between them. If body is NULL then we
8214 have a complex memory operation, probably a load/store pair.
8215 Be conservative for now and emit a NOP. */
8216 if (GET_MODE (recog_data.operand[0]) == DImode
8217 && (!body || !dep_between_memop_and_curr (body)))
8218 return true;
8220 return false;
8225 /* Implement FINAL_PRESCAN_INSN. */
8227 void
8228 aarch64_final_prescan_insn (rtx_insn *insn)
8230 if (aarch64_madd_needs_nop (insn))
8231 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
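/* Illustrative effect of the erratum workaround when
   -mfix-cortex-a53-835769 is in effect: for an output sequence such as

     ldr  x2, [x0]
     madd x3, x4, x5, x6

   (no register dependency between the load and the 64-bit
   multiply-accumulate), the prescan hook emits

     nop // between mem op and mult-accumulate

   immediately before the MADD.  */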
8235 /* Return the equivalent letter for size. */
8236 static char
8237 sizetochar (int size)
8239 switch (size)
8241 case 64: return 'd';
8242 case 32: return 's';
8243 case 16: return 'h';
8244 case 8 : return 'b';
8245 default: gcc_unreachable ();
8249 /* Return true iff X is a uniform vector of floating-point
8250 constants, and the constant can be represented in
8251 quarter-precision form. Note that, as aarch64_float_const_representable_p
8252 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8253 static bool
8254 aarch64_vect_float_const_representable_p (rtx x)
8256 int i = 0;
8257 REAL_VALUE_TYPE r0, ri;
8258 rtx x0, xi;
8260 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8261 return false;
8263 x0 = CONST_VECTOR_ELT (x, 0);
8264 if (!CONST_DOUBLE_P (x0))
8265 return false;
8267 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8269 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8271 xi = CONST_VECTOR_ELT (x, i);
8272 if (!CONST_DOUBLE_P (xi))
8273 return false;
8275 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8276 if (!REAL_VALUES_EQUAL (r0, ri))
8277 return false;
8280 return aarch64_float_const_representable_p (x0);
8283 /* Return true for valid and false for invalid. */
8284 bool
8285 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8286 struct simd_immediate_info *info)
8288 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8289 matches = 1; \
8290 for (i = 0; i < idx; i += (STRIDE)) \
8291 if (!(TEST)) \
8292 matches = 0; \
8293 if (matches) \
8295 immtype = (CLASS); \
8296 elsize = (ELSIZE); \
8297 eshift = (SHIFT); \
8298 emvn = (NEG); \
8299 break; \
8302 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8303 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8304 unsigned char bytes[16];
8305 int immtype = -1, matches;
8306 unsigned int invmask = inverse ? 0xff : 0;
8307 int eshift, emvn;
8309 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8311 if (! (aarch64_simd_imm_zero_p (op, mode)
8312 || aarch64_vect_float_const_representable_p (op)))
8313 return false;
8315 if (info)
8317 info->value = CONST_VECTOR_ELT (op, 0);
8318 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8319 info->mvn = false;
8320 info->shift = 0;
8323 return true;
8326 /* Splat vector constant out into a byte vector. */
8327 for (i = 0; i < n_elts; i++)
8329 /* The vector is provided in GCC's endian-neutral fashion. For aarch64_be,
8330 it must be laid out in the vector register in reverse order. */
8331 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8332 unsigned HOST_WIDE_INT elpart;
8333 unsigned int part, parts;
8335 if (CONST_INT_P (el))
8337 elpart = INTVAL (el);
8338 parts = 1;
8340 else if (GET_CODE (el) == CONST_DOUBLE)
8342 elpart = CONST_DOUBLE_LOW (el);
8343 parts = 2;
8345 else
8346 gcc_unreachable ();
8348 for (part = 0; part < parts; part++)
8350 unsigned int byte;
8351 for (byte = 0; byte < innersize; byte++)
8353 bytes[idx++] = (elpart & 0xff) ^ invmask;
8354 elpart >>= BITS_PER_UNIT;
8356 if (GET_CODE (el) == CONST_DOUBLE)
8357 elpart = CONST_DOUBLE_HIGH (el);
8361 /* Sanity check. */
8362 gcc_assert (idx == GET_MODE_SIZE (mode));
8366 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8367 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8369 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8370 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8372 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8373 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8375 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8376 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8378 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8380 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8382 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8383 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8385 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8386 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8388 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8389 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8391 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8392 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8394 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8396 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8398 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8399 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8401 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8402 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8404 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8405 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8407 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8408 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8410 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8412 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8413 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8415 while (0);
8417 if (immtype == -1)
8418 return false;
8420 if (info)
8422 info->element_width = elsize;
8423 info->mvn = emvn != 0;
8424 info->shift = eshift;
8426 unsigned HOST_WIDE_INT imm = 0;
8428 if (immtype >= 12 && immtype <= 15)
8429 info->msl = true;
8431 /* Un-invert bytes of recognized vector, if necessary. */
8432 if (invmask != 0)
8433 for (i = 0; i < idx; i++)
8434 bytes[i] ^= invmask;
8436 if (immtype == 17)
8438 /* FIXME: Broken on 32-bit H_W_I hosts. */
8439 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8441 for (i = 0; i < 8; i++)
8442 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8443 << (i * BITS_PER_UNIT);
8446 info->value = GEN_INT (imm);
8448 else
8450 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8451 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8453 /* Construct 'abcdefgh' because the assembler cannot handle
8454 generic constants. */
8455 if (info->mvn)
8456 imm = ~imm;
8457 imm = (imm >> info->shift) & 0xff;
8458 info->value = GEN_INT (imm);
8462 return true;
8463 #undef CHECK
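/* For example, a V4SImode constant with every element equal to
   0x0000ab00 matches the second CHECK above (32-bit elements, shift 8),
   so INFO ends up describing the value 0xab with a left shift of 8,
   i.e. the form that "movi vN.4s, #0xab, lsl #8" can materialize.  */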
8466 /* Check if immediate shift constants are within range. */
8467 bool
8468 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8470 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8471 if (left)
8472 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8473 else
8474 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8477 /* Return true if X is a uniform vector where all elements
8478 are either the floating-point constant 0.0 or the
8479 integer constant 0. */
8480 bool
8481 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8483 return x == CONST0_RTX (mode);
8486 bool
8487 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8489 HOST_WIDE_INT imm = INTVAL (x);
8490 int i;
8492 for (i = 0; i < 8; i++)
8494 unsigned int byte = imm & 0xff;
8495 if (byte != 0xff && byte != 0)
8496 return false;
8497 imm >>= 8;
8500 return true;
8503 bool
8504 aarch64_mov_operand_p (rtx x,
8505 enum aarch64_symbol_context context,
8506 machine_mode mode)
8508 if (GET_CODE (x) == HIGH
8509 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8510 return true;
8512 if (CONST_INT_P (x))
8513 return true;
8515 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8516 return true;
8518 return aarch64_classify_symbolic_expression (x, context)
8519 == SYMBOL_TINY_ABSOLUTE;
8522 /* Return a CONST_VECTOR in which every element is the const_int VAL. */
8524 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8526 int nunits = GET_MODE_NUNITS (mode);
8527 rtvec v = rtvec_alloc (nunits);
8528 int i;
8530 for (i=0; i < nunits; i++)
8531 RTVEC_ELT (v, i) = GEN_INT (val);
8533 return gen_rtx_CONST_VECTOR (mode, v);
8536 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8538 bool
8539 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8541 machine_mode vmode;
8543 gcc_assert (!VECTOR_MODE_P (mode));
8544 vmode = aarch64_preferred_simd_mode (mode);
8545 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8546 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8549 /* Construct and return a PARALLEL RTX vector with elements numbering the
8550 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8551 the vector - from the perspective of the architecture. This does not
8552 line up with GCC's perspective on lane numbers, so we end up with
8553 different masks depending on our target endian-ness. The diagram
8554 below may help. We must draw the distinction when building masks
8555 which select one half of the vector. An instruction selecting
8556 architectural low-lanes for a big-endian target must be described using
8557 a mask selecting GCC high-lanes.
8559 Big-Endian Little-Endian
8561 GCC 0 1 2 3 3 2 1 0
8562 | x | x | x | x | | x | x | x | x |
8563 Architecture 3 2 1 0 3 2 1 0
8565 Low Mask: { 2, 3 } { 0, 1 }
8566 High Mask: { 0, 1 } { 2, 3 }
8570 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8572 int nunits = GET_MODE_NUNITS (mode);
8573 rtvec v = rtvec_alloc (nunits / 2);
8574 int high_base = nunits / 2;
8575 int low_base = 0;
8576 int base;
8577 rtx t1;
8578 int i;
8580 if (BYTES_BIG_ENDIAN)
8581 base = high ? low_base : high_base;
8582 else
8583 base = high ? high_base : low_base;
8585 for (i = 0; i < nunits / 2; i++)
8586 RTVEC_ELT (v, i) = GEN_INT (base + i);
8588 t1 = gen_rtx_PARALLEL (mode, v);
8589 return t1;
8592 /* Check OP for validity as a PARALLEL RTX vector with elements
8593 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8594 from the perspective of the architecture. See the diagram above
8595 aarch64_simd_vect_par_cnst_half for more details. */
8597 bool
8598 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8599 bool high)
8601 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8602 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8603 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8604 int i = 0;
8606 if (!VECTOR_MODE_P (mode))
8607 return false;
8609 if (count_op != count_ideal)
8610 return false;
8612 for (i = 0; i < count_ideal; i++)
8614 rtx elt_op = XVECEXP (op, 0, i);
8615 rtx elt_ideal = XVECEXP (ideal, 0, i);
8617 if (!CONST_INT_P (elt_op)
8618 || INTVAL (elt_ideal) != INTVAL (elt_op))
8619 return false;
8621 return true;
8624 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8625 HIGH (exclusive). */
8626 void
8627 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8628 const_tree exp)
8630 HOST_WIDE_INT lane;
8631 gcc_assert (CONST_INT_P (operand));
8632 lane = INTVAL (operand);
8634 if (lane < low || lane >= high)
8636 if (exp)
8637 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8638 else
8639 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8643 /* Return TRUE if OP is a valid vector addressing mode. */
8644 bool
8645 aarch64_simd_mem_operand_p (rtx op)
8647 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8648 || REG_P (XEXP (op, 0)));
8651 /* Emit a register copy from operand to operand, taking care not to
8652 early-clobber source registers in the process.
8654 COUNT is the number of components into which the copy needs to be
8655 decomposed. */
8656 void
8657 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8658 unsigned int count)
8660 unsigned int i;
8661 int rdest = REGNO (operands[0]);
8662 int rsrc = REGNO (operands[1]);
8664 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8665 || rdest < rsrc)
8666 for (i = 0; i < count; i++)
8667 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8668 gen_rtx_REG (mode, rsrc + i));
8669 else
8670 for (i = 0; i < count; i++)
8671 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8672 gen_rtx_REG (mode, rsrc + count - i - 1));
8675 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8676 one of VSTRUCT modes: OI, CI or XI. */
8678 aarch64_simd_attr_length_move (rtx_insn *insn)
8680 machine_mode mode;
8682 extract_insn_cached (insn);
8684 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8686 mode = GET_MODE (recog_data.operand[0]);
8687 switch (mode)
8689 case OImode:
8690 return 8;
8691 case CImode:
8692 return 12;
8693 case XImode:
8694 return 16;
8695 default:
8696 gcc_unreachable ();
8699 return 4;
8702 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8703 one of VSTRUCT modes: OI, CI, EI, or XI. */
8705 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8707 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8710 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8711 alignment of a vector to 128 bits. */
8712 static HOST_WIDE_INT
8713 aarch64_simd_vector_alignment (const_tree type)
8715 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8716 return MIN (align, 128);
8719 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8720 static bool
8721 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8723 if (is_packed)
8724 return false;
8726 /* We guarantee alignment for vectors up to 128-bits. */
8727 if (tree_int_cst_compare (TYPE_SIZE (type),
8728 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8729 return false;
8731 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8732 return true;
8735 /* If VALS is a vector constant that can be loaded into a register
8736 using DUP, generate instructions to do so and return an RTX to
8737 assign to the register. Otherwise return NULL_RTX. */
8738 static rtx
8739 aarch64_simd_dup_constant (rtx vals)
8741 machine_mode mode = GET_MODE (vals);
8742 machine_mode inner_mode = GET_MODE_INNER (mode);
8743 int n_elts = GET_MODE_NUNITS (mode);
8744 bool all_same = true;
8745 rtx x;
8746 int i;
8748 if (GET_CODE (vals) != CONST_VECTOR)
8749 return NULL_RTX;
8751 for (i = 1; i < n_elts; ++i)
8753 x = CONST_VECTOR_ELT (vals, i);
8754 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8755 all_same = false;
8758 if (!all_same)
8759 return NULL_RTX;
8761 /* We can load this constant by using DUP and a constant in a
8762 single ARM register. This will be cheaper than a vector
8763 load. */
8764 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8765 return gen_rtx_VEC_DUPLICATE (mode, x);
8769 /* Generate code to load VALS, which is a PARALLEL containing only
8770 constants (for vec_init) or CONST_VECTOR, efficiently into a
8771 register. Returns an RTX to copy into the register, or NULL_RTX
8772 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8773 static rtx
8774 aarch64_simd_make_constant (rtx vals)
8776 machine_mode mode = GET_MODE (vals);
8777 rtx const_dup;
8778 rtx const_vec = NULL_RTX;
8779 int n_elts = GET_MODE_NUNITS (mode);
8780 int n_const = 0;
8781 int i;
8783 if (GET_CODE (vals) == CONST_VECTOR)
8784 const_vec = vals;
8785 else if (GET_CODE (vals) == PARALLEL)
8787 /* A CONST_VECTOR must contain only CONST_INTs and
8788 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8789 Only store valid constants in a CONST_VECTOR. */
8790 for (i = 0; i < n_elts; ++i)
8792 rtx x = XVECEXP (vals, 0, i);
8793 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8794 n_const++;
8796 if (n_const == n_elts)
8797 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8799 else
8800 gcc_unreachable ();
8802 if (const_vec != NULL_RTX
8803 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8804 /* Load using MOVI/MVNI. */
8805 return const_vec;
8806 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8807 /* Loaded using DUP. */
8808 return const_dup;
8809 else if (const_vec != NULL_RTX)
8810 /* Load from constant pool. We can not take advantage of single-cycle
8811 LD1 because we need a PC-relative addressing mode. */
8812 return const_vec;
8813 else
8814 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8815 We can not construct an initializer. */
8816 return NULL_RTX;
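/* Expand the vector initializer VALS into TARGET: use a constant move when
all elements are constant, a DUP when all elements are identical, otherwise
insert the variable lanes into a constant vector, or fall back to building
the vector in a stack temporary and loading it. */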
8819 void
8820 aarch64_expand_vector_init (rtx target, rtx vals)
8822 machine_mode mode = GET_MODE (target);
8823 machine_mode inner_mode = GET_MODE_INNER (mode);
8824 int n_elts = GET_MODE_NUNITS (mode);
8825 int n_var = 0;
8826 rtx any_const = NULL_RTX;
8827 bool all_same = true;
8829 for (int i = 0; i < n_elts; ++i)
8831 rtx x = XVECEXP (vals, 0, i);
8832 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8833 ++n_var;
8834 else
8835 any_const = x;
8837 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8838 all_same = false;
8841 if (n_var == 0)
8843 rtx constant = aarch64_simd_make_constant (vals);
8844 if (constant != NULL_RTX)
8846 emit_move_insn (target, constant);
8847 return;
8851 /* Splat a single non-constant element if we can. */
8852 if (all_same)
8854 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8855 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8856 return;
8859 /* Half the fields (or less) are non-constant. Load constant then overwrite
8860 varying fields. Hope that this is more efficient than using the stack. */
8861 if (n_var <= n_elts/2)
8863 rtx copy = copy_rtx (vals);
8865 /* Load constant part of vector. We really don't care what goes into the
8866 parts we will overwrite, but we're more likely to be able to load the
8867 constant efficiently if it has fewer, larger, repeating parts
8868 (see aarch64_simd_valid_immediate). */
8869 for (int i = 0; i < n_elts; i++)
8871 rtx x = XVECEXP (vals, 0, i);
8872 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8873 continue;
8874 rtx subst = any_const;
8875 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8877 /* Look in the copied vector, as more elements are const. */
8878 rtx test = XVECEXP (copy, 0, i ^ bit);
8879 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8881 subst = test;
8882 break;
8885 XVECEXP (copy, 0, i) = subst;
8887 aarch64_expand_vector_init (target, copy);
8889 /* Insert variables. */
8890 enum insn_code icode = optab_handler (vec_set_optab, mode);
8891 gcc_assert (icode != CODE_FOR_nothing);
8893 for (int i = 0; i < n_elts; i++)
8895 rtx x = XVECEXP (vals, 0, i);
8896 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8897 continue;
8898 x = copy_to_mode_reg (inner_mode, x);
8899 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8901 return;
8904 /* Construct the vector in memory one field at a time
8905 and load the whole vector. */
8906 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8907 for (int i = 0; i < n_elts; i++)
8908 emit_move_insn (adjust_address_nv (mem, inner_mode,
8909 i * GET_MODE_SIZE (inner_mode)),
8910 XVECEXP (vals, 0, i));
8911 emit_move_insn (target, mem);
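/* Implement TARGET_SHIFT_TRUNCATION_MASK: shift counts are not implicitly
truncated for vector (or vector structure) modes, so return 0 for those
and GET_MODE_BITSIZE (MODE) - 1 otherwise. */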
8915 static unsigned HOST_WIDE_INT
8916 aarch64_shift_truncation_mask (machine_mode mode)
8918 return
8919 (aarch64_vector_mode_supported_p (mode)
8920 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8923 #ifndef TLS_SECTION_ASM_FLAG
8924 #define TLS_SECTION_ASM_FLAG 'T'
8925 #endif
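/* Output the assembly directive that switches to section NAME with the given
FLAGS, emitting the abbreviated form when the section has already been
declared and the full form (including any COMDAT group) otherwise. */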
8927 void
8928 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8929 tree decl ATTRIBUTE_UNUSED)
8931 char flagchars[10], *f = flagchars;
8933 /* If we have already declared this section, we can use an
8934 abbreviated form to switch back to it -- unless this section is
8935 part of a COMDAT group, in which case GAS requires the full
8936 declaration every time. */
8937 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8938 && (flags & SECTION_DECLARED))
8940 fprintf (asm_out_file, "\t.section\t%s\n", name);
8941 return;
8944 if (!(flags & SECTION_DEBUG))
8945 *f++ = 'a';
8946 if (flags & SECTION_WRITE)
8947 *f++ = 'w';
8948 if (flags & SECTION_CODE)
8949 *f++ = 'x';
8950 if (flags & SECTION_SMALL)
8951 *f++ = 's';
8952 if (flags & SECTION_MERGE)
8953 *f++ = 'M';
8954 if (flags & SECTION_STRINGS)
8955 *f++ = 'S';
8956 if (flags & SECTION_TLS)
8957 *f++ = TLS_SECTION_ASM_FLAG;
8958 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8959 *f++ = 'G';
8960 *f = '\0';
8962 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8964 if (!(flags & SECTION_NOTYPE))
8966 const char *type;
8967 const char *format;
8969 if (flags & SECTION_BSS)
8970 type = "nobits";
8971 else
8972 type = "progbits";
8974 #ifdef TYPE_OPERAND_FMT
8975 format = "," TYPE_OPERAND_FMT;
8976 #else
8977 format = ",@%s";
8978 #endif
8980 fprintf (asm_out_file, format, type);
8982 if (flags & SECTION_ENTSIZE)
8983 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8984 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8986 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8987 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8988 else
8989 fprintf (asm_out_file, ",%s,comdat",
8990 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8994 putc ('\n', asm_out_file);
8997 /* Select a format to encode pointers in exception handling data. */
8999 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9001 int type;
9002 switch (aarch64_cmodel)
9004 case AARCH64_CMODEL_TINY:
9005 case AARCH64_CMODEL_TINY_PIC:
9006 case AARCH64_CMODEL_SMALL:
9007 case AARCH64_CMODEL_SMALL_PIC:
9008 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9009 for everything. */
9010 type = DW_EH_PE_sdata4;
9011 break;
9012 default:
9013 /* No assumptions here. 8-byte relocs required. */
9014 type = DW_EH_PE_sdata8;
9015 break;
9017 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9020 /* Emit load exclusive. */
9022 static void
9023 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9024 rtx mem, rtx model_rtx)
9026 rtx (*gen) (rtx, rtx, rtx);
9028 switch (mode)
9030 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9031 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9032 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9033 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9034 default:
9035 gcc_unreachable ();
9038 emit_insn (gen (rval, mem, model_rtx));
9041 /* Emit store exclusive. */
9043 static void
9044 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9045 rtx rval, rtx mem, rtx model_rtx)
9047 rtx (*gen) (rtx, rtx, rtx, rtx);
9049 switch (mode)
9051 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9052 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9053 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9054 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9055 default:
9056 gcc_unreachable ();
9059 emit_insn (gen (bval, rval, mem, model_rtx));
9062 /* Mark the previous jump instruction as unlikely. */
9064 static void
9065 aarch64_emit_unlikely_jump (rtx insn)
9067 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9069 insn = emit_jump_insn (insn);
9070 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9073 /* Expand a compare and swap pattern. */
9075 void
9076 aarch64_expand_compare_and_swap (rtx operands[])
9078 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9079 machine_mode mode, cmp_mode;
9080 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9082 bval = operands[0];
9083 rval = operands[1];
9084 mem = operands[2];
9085 oldval = operands[3];
9086 newval = operands[4];
9087 is_weak = operands[5];
9088 mod_s = operands[6];
9089 mod_f = operands[7];
9090 mode = GET_MODE (mem);
9091 cmp_mode = mode;
9093 /* Normally the succ memory model must be stronger than fail, but in the
9094 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9095 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9097 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9098 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9099 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9101 switch (mode)
9103 case QImode:
9104 case HImode:
9105 /* For short modes, we're going to perform the comparison in SImode,
9106 so do the zero-extension now. */
9107 cmp_mode = SImode;
9108 rval = gen_reg_rtx (SImode);
9109 oldval = convert_modes (SImode, mode, oldval, true);
9110 /* Fall through. */
9112 case SImode:
9113 case DImode:
9114 /* Force the value into a register if needed. */
9115 if (!aarch64_plus_operand (oldval, mode))
9116 oldval = force_reg (cmp_mode, oldval);
9117 break;
9119 default:
9120 gcc_unreachable ();
9123 switch (mode)
9125 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9126 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9127 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9128 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9129 default:
9130 gcc_unreachable ();
9133 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9135 if (mode == QImode || mode == HImode)
9136 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9138 x = gen_rtx_REG (CCmode, CC_REGNUM);
9139 x = gen_rtx_EQ (SImode, x, const0_rtx);
9140 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9143 /* Split a compare and swap pattern. */
9145 void
9146 aarch64_split_compare_and_swap (rtx operands[])
9148 rtx rval, mem, oldval, newval, scratch;
9149 machine_mode mode;
9150 bool is_weak;
9151 rtx_code_label *label1, *label2;
9152 rtx x, cond;
9154 rval = operands[0];
9155 mem = operands[1];
9156 oldval = operands[2];
9157 newval = operands[3];
9158 is_weak = (operands[4] != const0_rtx);
9159 scratch = operands[7];
9160 mode = GET_MODE (mem);
9162 label1 = NULL;
9163 if (!is_weak)
9165 label1 = gen_label_rtx ();
9166 emit_label (label1);
9168 label2 = gen_label_rtx ();
9170 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9172 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9173 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9174 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9175 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9176 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9178 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9180 if (!is_weak)
9182 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9183 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9184 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9185 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9187 else
9189 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9190 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9191 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9194 emit_label (label2);
9197 /* Split an atomic operation. */
9199 void
9200 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9201 rtx value, rtx model_rtx, rtx cond)
9203 machine_mode mode = GET_MODE (mem);
9204 machine_mode wmode = (mode == DImode ? DImode : SImode);
9205 rtx_code_label *label;
9206 rtx x;
9208 label = gen_label_rtx ();
9209 emit_label (label);
9211 if (new_out)
9212 new_out = gen_lowpart (wmode, new_out);
9213 if (old_out)
9214 old_out = gen_lowpart (wmode, old_out);
9215 else
9216 old_out = new_out;
9217 value = simplify_gen_subreg (wmode, value, mode, 0);
9219 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9221 switch (code)
9223 case SET:
9224 new_out = value;
9225 break;
9227 case NOT:
9228 x = gen_rtx_AND (wmode, old_out, value);
9229 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9230 x = gen_rtx_NOT (wmode, new_out);
9231 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9232 break;
9234 case MINUS:
9235 if (CONST_INT_P (value))
9237 value = GEN_INT (-INTVAL (value));
9238 code = PLUS;
9240 /* Fall through. */
9242 default:
9243 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9244 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9245 break;
9248 aarch64_emit_store_exclusive (mode, cond, mem,
9249 gen_lowpart (mode, new_out), model_rtx);
9251 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9252 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9253 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9254 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
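/* Print a "+<extension>" suffix for every architecture extension enabled in
aarch64_isa_flags, followed by a newline. */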
9257 static void
9258 aarch64_print_extension (void)
9260 const struct aarch64_option_extension *opt = NULL;
9262 for (opt = all_extensions; opt->name != NULL; opt++)
9263 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9264 asm_fprintf (asm_out_file, "+%s", opt->name);
9266 asm_fprintf (asm_out_file, "\n");
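/* Emit a .arch or .cpu directive, with any enabled extensions, at the start
of the assembly file, then do the default file-start processing. */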
9269 static void
9270 aarch64_start_file (void)
9272 if (selected_arch)
9274 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9275 aarch64_print_extension ();
9277 else if (selected_cpu)
9279 const char *truncated_name
9280 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9281 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9282 aarch64_print_extension ();
9284 default_file_start();
9287 /* Target hook for c_mode_for_suffix. */
9288 static machine_mode
9289 aarch64_c_mode_for_suffix (char suffix)
9291 if (suffix == 'q')
9292 return TFmode;
9294 return VOIDmode;
9297 /* We can only represent floating point constants which will fit in
9298 "quarter-precision" values. These values are characterised by
9299 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9302 (-1)^s * (n/16) * 2^r
9304 Where:
9305 's' is the sign bit.
9306 'n' is an integer in the range 16 <= n <= 31.
9307 'r' is an integer in the range -3 <= r <= 4. */
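/* For example, 1.5 = (-1)^0 * (24/16) * 2^0 is representable, with
s = 0, n = 24, r = 0. */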
9309 /* Return true iff X can be represented as a quarter-precision
9310 floating point immediate operand. Note, we cannot represent 0.0. */
9311 bool
9312 aarch64_float_const_representable_p (rtx x)
9314 /* This represents our current view of how many bits
9315 make up the mantissa. */
9316 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9317 int exponent;
9318 unsigned HOST_WIDE_INT mantissa, mask;
9319 REAL_VALUE_TYPE r, m;
9320 bool fail;
9322 if (!CONST_DOUBLE_P (x))
9323 return false;
9325 if (GET_MODE (x) == VOIDmode)
9326 return false;
9328 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9330 /* We cannot represent infinities, NaNs or +/-zero. We won't
9331 know if we have +zero until we analyse the mantissa, but we
9332 can reject the other invalid values. */
9333 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9334 || REAL_VALUE_MINUS_ZERO (r))
9335 return false;
9337 /* Extract exponent. */
9338 r = real_value_abs (&r);
9339 exponent = REAL_EXP (&r);
9341 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9342 highest (sign) bit, with a fixed binary point at bit point_pos.
9343 The low element of the wide_int W below holds the low part of the mantissa, the high element the high part.
9344 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9345 bits for the mantissa, this can fail (low bits will be lost). */
9346 real_ldexp (&m, &r, point_pos - exponent);
9347 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9349 /* If the low part of the mantissa has bits set we cannot represent
9350 the value. */
9351 if (w.elt (0) != 0)
9352 return false;
9353 /* We have rejected the lower HOST_WIDE_INT, so update our
9354 understanding of how many bits lie in the mantissa and
9355 look only at the high HOST_WIDE_INT. */
9356 mantissa = w.elt (1);
9357 point_pos -= HOST_BITS_PER_WIDE_INT;
9359 /* We can only represent values with a mantissa of the form 1.xxxx. */
9360 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9361 if ((mantissa & mask) != 0)
9362 return false;
9364 /* Having filtered unrepresentable values, we may now remove all
9365 but the highest 5 bits. */
9366 mantissa >>= point_pos - 5;
9368 /* We cannot represent the value 0.0, so reject it. This is handled
9369 elsewhere. */
9370 if (mantissa == 0)
9371 return false;
9373 /* Then, as bit 4 is always set, we can mask it off, leaving
9374 the mantissa in the range [0, 15]. */
9375 mantissa &= ~(1 << 4);
9376 gcc_assert (mantissa <= 15);
9378 /* GCC internally does not use IEEE754-like encoding (where normalized
9379 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
9380 Our mantissa values are shifted 4 places to the left relative to
9381 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9382 by 5 places to correct for GCC's representation. */
9383 exponent = 5 - exponent;
9385 return (exponent >= 0 && exponent <= 7);
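/* Return the asm template for moving the immediate CONST_VECTOR into an
AdvSIMD register of WIDTH bits: FMOV for non-zero floating-point constants,
otherwise MOVI or MVNI with an optional LSL or MSL shift. */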
9388 char*
9389 aarch64_output_simd_mov_immediate (rtx const_vector,
9390 machine_mode mode,
9391 unsigned width)
9393 bool is_valid;
9394 static char templ[40];
9395 const char *mnemonic;
9396 const char *shift_op;
9397 unsigned int lane_count = 0;
9398 char element_char;
9400 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9402 /* This will return true to show const_vector is legal for use as either
9403 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9404 also update INFO to show how the immediate should be generated. */
9405 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9406 gcc_assert (is_valid);
9408 element_char = sizetochar (info.element_width);
9409 lane_count = width / info.element_width;
9411 mode = GET_MODE_INNER (mode);
9412 if (mode == SFmode || mode == DFmode)
9414 gcc_assert (info.shift == 0 && ! info.mvn);
9415 if (aarch64_float_const_zero_rtx_p (info.value))
9416 info.value = GEN_INT (0);
9417 else
9419 #define buf_size 20
9420 REAL_VALUE_TYPE r;
9421 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9422 char float_buf[buf_size] = {'\0'};
9423 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9424 #undef buf_size
9426 if (lane_count == 1)
9427 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9428 else
9429 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9430 lane_count, element_char, float_buf);
9431 return templ;
9435 mnemonic = info.mvn ? "mvni" : "movi";
9436 shift_op = info.msl ? "msl" : "lsl";
9438 if (lane_count == 1)
9439 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9440 mnemonic, UINTVAL (info.value));
9441 else if (info.shift)
9442 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9443 ", %s %d", mnemonic, lane_count, element_char,
9444 UINTVAL (info.value), shift_op, info.shift);
9445 else
9446 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9447 mnemonic, lane_count, element_char, UINTVAL (info.value));
9448 return templ;
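/* As above, but for a scalar IMMEDIATE of MODE: duplicate it across a 64-bit
vector constant and output the vector form of the move. */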
9451 char*
9452 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9453 machine_mode mode)
9455 machine_mode vmode;
9457 gcc_assert (!VECTOR_MODE_P (mode));
9458 vmode = aarch64_simd_container_mode (mode, 64);
9459 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9460 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9463 /* Split operands into moves from op[1] + op[2] into op[0]. */
9465 void
9466 aarch64_split_combinev16qi (rtx operands[3])
9468 unsigned int dest = REGNO (operands[0]);
9469 unsigned int src1 = REGNO (operands[1]);
9470 unsigned int src2 = REGNO (operands[2]);
9471 machine_mode halfmode = GET_MODE (operands[1]);
9472 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9473 rtx destlo, desthi;
9475 gcc_assert (halfmode == V16QImode);
9477 if (src1 == dest && src2 == dest + halfregs)
9479 /* No-op move. Can't split to nothing; emit something. */
9480 emit_note (NOTE_INSN_DELETED);
9481 return;
9484 /* Preserve register attributes for variable tracking. */
9485 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9486 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9487 GET_MODE_SIZE (halfmode));
9489 /* Special case of reversed high/low parts. */
9490 if (reg_overlap_mentioned_p (operands[2], destlo)
9491 && reg_overlap_mentioned_p (operands[1], desthi))
9493 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9494 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9495 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9497 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9499 /* Try to avoid unnecessary moves if part of the result
9500 is in the right place already. */
9501 if (src1 != dest)
9502 emit_move_insn (destlo, operands[1]);
9503 if (src2 != dest + halfregs)
9504 emit_move_insn (desthi, operands[2]);
9506 else
9508 if (src2 != dest + halfregs)
9509 emit_move_insn (desthi, operands[2]);
9510 if (src1 != dest)
9511 emit_move_insn (destlo, operands[1]);
9515 /* vec_perm support. */
9517 #define MAX_VECT_LEN 16
9519 struct expand_vec_perm_d
9521 rtx target, op0, op1;
9522 unsigned char perm[MAX_VECT_LEN];
9523 machine_mode vmode;
9524 unsigned char nelt;
9525 bool one_vector_p;
9526 bool testing_p;
9529 /* Generate a variable permutation. */
9531 static void
9532 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9534 machine_mode vmode = GET_MODE (target);
9535 bool one_vector_p = rtx_equal_p (op0, op1);
9537 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9538 gcc_checking_assert (GET_MODE (op0) == vmode);
9539 gcc_checking_assert (GET_MODE (op1) == vmode);
9540 gcc_checking_assert (GET_MODE (sel) == vmode);
9541 gcc_checking_assert (TARGET_SIMD);
9543 if (one_vector_p)
9545 if (vmode == V8QImode)
9547 /* Expand the argument to a V16QI mode by duplicating it. */
9548 rtx pair = gen_reg_rtx (V16QImode);
9549 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9550 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9552 else
9554 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9557 else
9559 rtx pair;
9561 if (vmode == V8QImode)
9563 pair = gen_reg_rtx (V16QImode);
9564 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9565 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9567 else
9569 pair = gen_reg_rtx (OImode);
9570 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9571 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9576 void
9577 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9579 machine_mode vmode = GET_MODE (target);
9580 unsigned int nelt = GET_MODE_NUNITS (vmode);
9581 bool one_vector_p = rtx_equal_p (op0, op1);
9582 rtx mask;
9584 /* The TBL instruction does not use a modulo index, so we must take care
9585 of that ourselves. */
9586 mask = aarch64_simd_gen_const_vector_dup (vmode,
9587 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9588 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9590 /* For big-endian, we also need to reverse the index within the vector
9591 (but not which vector). */
9592 if (BYTES_BIG_ENDIAN)
9594 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9595 if (!one_vector_p)
9596 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9597 sel = expand_simple_binop (vmode, XOR, sel, mask,
9598 NULL, 0, OPTAB_LIB_WIDEN);
9600 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9603 /* Recognize patterns suitable for the TRN instructions. */
9604 static bool
9605 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9607 unsigned int i, odd, mask, nelt = d->nelt;
9608 rtx out, in0, in1, x;
9609 rtx (*gen) (rtx, rtx, rtx);
9610 machine_mode vmode = d->vmode;
9612 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9613 return false;
9615 /* Note that these are little-endian tests.
9616 We correct for big-endian later. */
9617 if (d->perm[0] == 0)
9618 odd = 0;
9619 else if (d->perm[0] == 1)
9620 odd = 1;
9621 else
9622 return false;
9623 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9625 for (i = 0; i < nelt; i += 2)
9627 if (d->perm[i] != i + odd)
9628 return false;
9629 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9630 return false;
9633 /* Success! */
9634 if (d->testing_p)
9635 return true;
9637 in0 = d->op0;
9638 in1 = d->op1;
9639 if (BYTES_BIG_ENDIAN)
9641 x = in0, in0 = in1, in1 = x;
9642 odd = !odd;
9644 out = d->target;
9646 if (odd)
9648 switch (vmode)
9650 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9651 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9652 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9653 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9654 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9655 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9656 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9657 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9658 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9659 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9660 default:
9661 return false;
9664 else
9666 switch (vmode)
9668 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9669 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9670 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9671 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9672 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9673 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9674 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9675 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9676 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9677 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9678 default:
9679 return false;
9683 emit_insn (gen (out, in0, in1));
9684 return true;
9687 /* Recognize patterns suitable for the UZP instructions. */
9688 static bool
9689 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9691 unsigned int i, odd, mask, nelt = d->nelt;
9692 rtx out, in0, in1, x;
9693 rtx (*gen) (rtx, rtx, rtx);
9694 machine_mode vmode = d->vmode;
9696 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9697 return false;
9699 /* Note that these are little-endian tests.
9700 We correct for big-endian later. */
9701 if (d->perm[0] == 0)
9702 odd = 0;
9703 else if (d->perm[0] == 1)
9704 odd = 1;
9705 else
9706 return false;
9707 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9709 for (i = 0; i < nelt; i++)
9711 unsigned elt = (i * 2 + odd) & mask;
9712 if (d->perm[i] != elt)
9713 return false;
9716 /* Success! */
9717 if (d->testing_p)
9718 return true;
9720 in0 = d->op0;
9721 in1 = d->op1;
9722 if (BYTES_BIG_ENDIAN)
9724 x = in0, in0 = in1, in1 = x;
9725 odd = !odd;
9727 out = d->target;
9729 if (odd)
9731 switch (vmode)
9733 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9734 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9735 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9736 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9737 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9738 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9739 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9740 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9741 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9742 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9743 default:
9744 return false;
9747 else
9749 switch (vmode)
9751 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9752 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9753 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9754 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9755 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9756 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9757 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9758 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9759 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9760 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9761 default:
9762 return false;
9766 emit_insn (gen (out, in0, in1));
9767 return true;
9770 /* Recognize patterns suitable for the ZIP instructions. */
9771 static bool
9772 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9774 unsigned int i, high, mask, nelt = d->nelt;
9775 rtx out, in0, in1, x;
9776 rtx (*gen) (rtx, rtx, rtx);
9777 machine_mode vmode = d->vmode;
9779 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9780 return false;
9782 /* Note that these are little-endian tests.
9783 We correct for big-endian later. */
9784 high = nelt / 2;
9785 if (d->perm[0] == high)
9786 /* Do Nothing. */
9788 else if (d->perm[0] == 0)
9789 high = 0;
9790 else
9791 return false;
9792 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9794 for (i = 0; i < nelt / 2; i++)
9796 unsigned elt = (i + high) & mask;
9797 if (d->perm[i * 2] != elt)
9798 return false;
9799 elt = (elt + nelt) & mask;
9800 if (d->perm[i * 2 + 1] != elt)
9801 return false;
9804 /* Success! */
9805 if (d->testing_p)
9806 return true;
9808 in0 = d->op0;
9809 in1 = d->op1;
9810 if (BYTES_BIG_ENDIAN)
9812 x = in0, in0 = in1, in1 = x;
9813 high = !high;
9815 out = d->target;
9817 if (high)
9819 switch (vmode)
9821 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9822 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9823 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9824 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9825 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9826 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9827 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9828 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9829 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9830 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9831 default:
9832 return false;
9835 else
9837 switch (vmode)
9839 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9840 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9841 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9842 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9843 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9844 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9845 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9846 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9847 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9848 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9849 default:
9850 return false;
9854 emit_insn (gen (out, in0, in1));
9855 return true;
9858 /* Recognize patterns for the EXT insn. */
9860 static bool
9861 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9863 unsigned int i, nelt = d->nelt;
9864 rtx (*gen) (rtx, rtx, rtx, rtx);
9865 rtx offset;
9867 unsigned int location = d->perm[0]; /* Always < nelt. */
9869 /* Check if the extracted indices are increasing by one. */
9870 for (i = 1; i < nelt; i++)
9872 unsigned int required = location + i;
9873 if (d->one_vector_p)
9875 /* We'll pass the same vector in twice, so allow indices to wrap. */
9876 required &= (nelt - 1);
9878 if (d->perm[i] != required)
9879 return false;
9882 switch (d->vmode)
9884 case V16QImode: gen = gen_aarch64_extv16qi; break;
9885 case V8QImode: gen = gen_aarch64_extv8qi; break;
9886 case V4HImode: gen = gen_aarch64_extv4hi; break;
9887 case V8HImode: gen = gen_aarch64_extv8hi; break;
9888 case V2SImode: gen = gen_aarch64_extv2si; break;
9889 case V4SImode: gen = gen_aarch64_extv4si; break;
9890 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9891 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9892 case V2DImode: gen = gen_aarch64_extv2di; break;
9893 case V2DFmode: gen = gen_aarch64_extv2df; break;
9894 default:
9895 return false;
9898 /* Success! */
9899 if (d->testing_p)
9900 return true;
9902 /* The case where (location == 0) is a no-op for both big- and little-endian,
9903 and is removed by the mid-end at optimization levels -O1 and higher. */
9905 if (BYTES_BIG_ENDIAN && (location != 0))
9907 /* After setup, we want the high elements of the first vector (stored
9908 at the LSB end of the register), and the low elements of the second
9909 vector (stored at the MSB end of the register). So swap. */
9910 std::swap (d->op0, d->op1);
9911 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9912 location = nelt - location;
9915 offset = GEN_INT (location);
9916 emit_insn (gen (d->target, d->op0, d->op1, offset));
9917 return true;
9920 /* Recognize patterns for the REV insns. */
9922 static bool
9923 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9925 unsigned int i, j, diff, nelt = d->nelt;
9926 rtx (*gen) (rtx, rtx);
9928 if (!d->one_vector_p)
9929 return false;
9931 diff = d->perm[0];
9932 switch (diff)
9934 case 7:
9935 switch (d->vmode)
9937 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9938 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9939 default:
9940 return false;
9942 break;
9943 case 3:
9944 switch (d->vmode)
9946 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9947 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9948 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9949 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9950 default:
9951 return false;
9953 break;
9954 case 1:
9955 switch (d->vmode)
9957 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9958 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9959 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9960 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9961 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9962 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9963 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9964 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9965 default:
9966 return false;
9968 break;
9969 default:
9970 return false;
9973 for (i = 0; i < nelt ; i += diff + 1)
9974 for (j = 0; j <= diff; j += 1)
9976 /* This is guaranteed to be true as the value of diff
9977 is 7, 3, 1 and we should have enough elements in the
9978 queue to generate this. Getting a vector mask with a
9979 value of diff other than these values implies that
9980 something is wrong by the time we get here. */
9981 gcc_assert (i + j < nelt);
9982 if (d->perm[i + j] != i + diff - j)
9983 return false;
9986 /* Success! */
9987 if (d->testing_p)
9988 return true;
9990 emit_insn (gen (d->target, d->op0));
9991 return true;
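/* Recognize patterns suitable for the DUP (element) instruction. */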
9994 static bool
9995 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9997 rtx (*gen) (rtx, rtx, rtx);
9998 rtx out = d->target;
9999 rtx in0;
10000 machine_mode vmode = d->vmode;
10001 unsigned int i, elt, nelt = d->nelt;
10002 rtx lane;
10004 elt = d->perm[0];
10005 for (i = 1; i < nelt; i++)
10007 if (elt != d->perm[i])
10008 return false;
10011 /* The generic preparation in aarch64_expand_vec_perm_const_1
10012 swaps the operand order and the permute indices if it finds
10013 d->perm[0] to be in the second operand. Thus, we can always
10014 use d->op0 and need not do any extra arithmetic to get the
10015 correct lane number. */
10016 in0 = d->op0;
10017 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10019 switch (vmode)
10021 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10022 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10023 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10024 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10025 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10026 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10027 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10028 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10029 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10030 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10031 default:
10032 return false;
10035 emit_insn (gen (out, in0, lane));
10036 return true;
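/* Fall back to a TBL-based permutation: materialize the selector as a
constant vector and let aarch64_expand_vec_perm_1 emit the TBL. */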
10039 static bool
10040 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10042 rtx rperm[MAX_VECT_LEN], sel;
10043 machine_mode vmode = d->vmode;
10044 unsigned int i, nelt = d->nelt;
10046 if (d->testing_p)
10047 return true;
10049 /* Generic code will try constant permutation twice. Once with the
10050 original mode and again with the elements lowered to QImode.
10051 So wait and don't do the selector expansion ourselves. */
10052 if (vmode != V8QImode && vmode != V16QImode)
10053 return false;
10055 for (i = 0; i < nelt; ++i)
10057 int nunits = GET_MODE_NUNITS (vmode);
10059 /* If big-endian and two vectors we end up with a weird mixed-endian
10060 mode on NEON. Reverse the index within each word but not the word
10061 itself. */
10062 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10063 : d->perm[i]);
10065 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10066 sel = force_reg (vmode, sel);
10068 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10069 return true;
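/* Canonicalize the permutation so that it selects from the first operand
where possible, then try each of the expanders above in turn, finishing
with the generic TBL fallback. */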
10072 static bool
10073 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10075 /* The pattern matching functions above are written to look for a small
10076 number to begin the sequence (0, 1, N/2). If we begin with an index
10077 from the second operand, we can swap the operands. */
10078 if (d->perm[0] >= d->nelt)
10080 unsigned i, nelt = d->nelt;
10082 gcc_assert (nelt == (nelt & -nelt));
10083 for (i = 0; i < nelt; ++i)
10084 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10086 std::swap (d->op0, d->op1);
10089 if (TARGET_SIMD)
10091 if (aarch64_evpc_rev (d))
10092 return true;
10093 else if (aarch64_evpc_ext (d))
10094 return true;
10095 else if (aarch64_evpc_dup (d))
10096 return true;
10097 else if (aarch64_evpc_zip (d))
10098 return true;
10099 else if (aarch64_evpc_uzp (d))
10100 return true;
10101 else if (aarch64_evpc_trn (d))
10102 return true;
10103 return aarch64_evpc_tbl (d);
10105 return false;
10108 /* Expand a vec_perm_const pattern. */
10110 bool
10111 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10113 struct expand_vec_perm_d d;
10114 int i, nelt, which;
10116 d.target = target;
10117 d.op0 = op0;
10118 d.op1 = op1;
10120 d.vmode = GET_MODE (target);
10121 gcc_assert (VECTOR_MODE_P (d.vmode));
10122 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10123 d.testing_p = false;
10125 for (i = which = 0; i < nelt; ++i)
10127 rtx e = XVECEXP (sel, 0, i);
10128 int ei = INTVAL (e) & (2 * nelt - 1);
10129 which |= (ei < nelt ? 1 : 2);
10130 d.perm[i] = ei;
10133 switch (which)
10135 default:
10136 gcc_unreachable ();
10138 case 3:
10139 d.one_vector_p = false;
10140 if (!rtx_equal_p (op0, op1))
10141 break;
10143 /* The elements of PERM do not suggest that only the first operand
10144 is used, but both operands are identical. Allow easier matching
10145 of the permutation by folding the permutation into the single
10146 input vector. */
10147 /* Fall Through. */
10148 case 2:
10149 for (i = 0; i < nelt; ++i)
10150 d.perm[i] &= nelt - 1;
10151 d.op0 = op1;
10152 d.one_vector_p = true;
10153 break;
10155 case 1:
10156 d.op1 = op0;
10157 d.one_vector_p = true;
10158 break;
10161 return aarch64_expand_vec_perm_const_1 (&d);
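/* Return true if the constant permutation SEL can be expanded for VMODE,
by running the expanders above on a throw-away instruction sequence. */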
10164 static bool
10165 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10166 const unsigned char *sel)
10168 struct expand_vec_perm_d d;
10169 unsigned int i, nelt, which;
10170 bool ret;
10172 d.vmode = vmode;
10173 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10174 d.testing_p = true;
10175 memcpy (d.perm, sel, nelt);
10177 /* Calculate whether all elements are in one vector. */
10178 for (i = which = 0; i < nelt; ++i)
10180 unsigned char e = d.perm[i];
10181 gcc_assert (e < 2 * nelt);
10182 which |= (e < nelt ? 1 : 2);
10185 /* If all elements are from the second vector, reindex as if from the
10186 first vector. */
10187 if (which == 2)
10188 for (i = 0; i < nelt; ++i)
10189 d.perm[i] -= nelt;
10191 /* Check whether the mask can be applied to a single vector. */
10192 d.one_vector_p = (which != 3);
10194 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10195 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10196 if (!d.one_vector_p)
10197 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10199 start_sequence ();
10200 ret = aarch64_expand_vec_perm_const_1 (&d);
10201 end_sequence ();
10203 return ret;
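/* Return a V16QImode constant selector that reverses the bytes within each
element of a 128-bit vector of MODE (big-endian only). */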
10207 aarch64_reverse_mask (enum machine_mode mode)
10209 /* We have to reverse each vector because we don't have
10210 a permuted load that can reverse-load according to ABI rules. */
10211 rtx mask;
10212 rtvec v = rtvec_alloc (16);
10213 int i, j;
10214 int nunits = GET_MODE_NUNITS (mode);
10215 int usize = GET_MODE_UNIT_SIZE (mode);
10217 gcc_assert (BYTES_BIG_ENDIAN);
10218 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10220 for (i = 0; i < nunits; i++)
10221 for (j = 0; j < usize; j++)
10222 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10223 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10224 return force_reg (V16QImode, mask);
10227 /* Implement MODES_TIEABLE_P. */
10229 bool
10230 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10232 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10233 return true;
10235 /* We specifically want to allow elements of "structure" modes to
10236 be tieable to the structure. This more general condition allows
10237 other rarer situations too. */
10238 if (TARGET_SIMD
10239 && aarch64_vector_mode_p (mode1)
10240 && aarch64_vector_mode_p (mode2))
10241 return true;
10243 return false;
10246 /* Return a new RTX holding the result of moving POINTER forward by
10247 AMOUNT bytes. */
10249 static rtx
10250 aarch64_move_pointer (rtx pointer, int amount)
10252 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10254 return adjust_automodify_address (pointer, GET_MODE (pointer),
10255 next, amount);
10258 /* Return a new RTX holding the result of moving POINTER forward by the
10259 size of the mode it points to. */
10261 static rtx
10262 aarch64_progress_pointer (rtx pointer)
10264 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10266 return aarch64_move_pointer (pointer, amount);
10269 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10270 MODE bytes. */
10272 static void
10273 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10274 machine_mode mode)
10276 rtx reg = gen_reg_rtx (mode);
10278 /* "Cast" the pointers to the correct mode. */
10279 *src = adjust_address (*src, mode, 0);
10280 *dst = adjust_address (*dst, mode, 0);
10281 /* Emit the memcpy. */
10282 emit_move_insn (reg, *src);
10283 emit_move_insn (*dst, reg);
10284 /* Move the pointers forward. */
10285 *src = aarch64_progress_pointer (*src);
10286 *dst = aarch64_progress_pointer (*dst);
10289 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10290 we succeed, otherwise return false. */
10292 bool
10293 aarch64_expand_movmem (rtx *operands)
10295 unsigned int n;
10296 rtx dst = operands[0];
10297 rtx src = operands[1];
10298 rtx base;
10299 bool speed_p = !optimize_function_for_size_p (cfun);
10301 /* When optimizing for size, give a better estimate of the length of a
10302 memcpy call, but use the default otherwise. */
10303 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10305 /* We can't do anything smart if the amount to copy is not constant. */
10306 if (!CONST_INT_P (operands[2]))
10307 return false;
10309 n = UINTVAL (operands[2]);
10311 /* Try to keep the number of instructions low. For cases below 16 bytes we
10312 need to make at most two moves. For cases above 16 bytes it will be one
10313 move for each 16 byte chunk, then at most two additional moves. */
10314 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10315 return false;
10317 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10318 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10320 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10321 src = adjust_automodify_address (src, VOIDmode, base, 0);
10323 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10324 1-byte chunk. */
10325 if (n < 4)
10327 if (n >= 2)
10329 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10330 n -= 2;
10333 if (n == 1)
10334 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10336 return true;
10339 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10340 4-byte chunk, partially overlapping with the previously copied chunk. */
10341 if (n < 8)
10343 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10344 n -= 4;
10345 if (n > 0)
10347 int move = n - 4;
10349 src = aarch64_move_pointer (src, move);
10350 dst = aarch64_move_pointer (dst, move);
10351 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10353 return true;
10356 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10357 them, then (if applicable) an 8-byte chunk. */
10358 while (n >= 8)
10360 if (n / 16)
10362 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10363 n -= 16;
10365 else
10367 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10368 n -= 8;
10372 /* Finish the final bytes of the copy. We can always do this in one
10373 instruction. We either copy the exact amount we need, or partially
10374 overlap with the previous chunk we copied and copy 8-bytes. */
10375 if (n == 0)
10376 return true;
10377 else if (n == 1)
10378 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10379 else if (n == 2)
10380 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10381 else if (n == 4)
10382 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10383 else
10385 if (n == 3)
10387 src = aarch64_move_pointer (src, -1);
10388 dst = aarch64_move_pointer (dst, -1);
10389 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10391 else
10393 int move = n - 8;
10395 src = aarch64_move_pointer (src, move);
10396 dst = aarch64_move_pointer (dst, move);
10397 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10401 return true;
10404 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10406 static unsigned HOST_WIDE_INT
10407 aarch64_asan_shadow_offset (void)
10409 return (HOST_WIDE_INT_1 << 36);
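/* Target hook for use_by_pieces_infrastructure_p. */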
10412 static bool
10413 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10414 unsigned int align,
10415 enum by_pieces_operation op,
10416 bool speed_p)
10418 /* STORE_BY_PIECES can be used when copying a constant string, but
10419 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10420 For now we always fail this and let the move_by_pieces code copy
10421 the string from read-only memory. */
10422 if (op == STORE_BY_PIECES)
10423 return false;
10425 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
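/* Map the rtx comparison CODE to the CC_D* mode used by the conditional
compare patterns, or return CCmode if CODE is not handled. */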
10428 static enum machine_mode
10429 aarch64_code_to_ccmode (enum rtx_code code)
10431 switch (code)
10433 case NE:
10434 return CC_DNEmode;
10436 case EQ:
10437 return CC_DEQmode;
10439 case LE:
10440 return CC_DLEmode;
10442 case LT:
10443 return CC_DLTmode;
10445 case GE:
10446 return CC_DGEmode;
10448 case GT:
10449 return CC_DGTmode;
10451 case LEU:
10452 return CC_DLEUmode;
10454 case LTU:
10455 return CC_DLTUmode;
10457 case GEU:
10458 return CC_DGEUmode;
10460 case GTU:
10461 return CC_DGTUmode;
10463 default:
10464 return CCmode;
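/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
conditional-compare chain; return the CC register holding the result,
or NULL_RTX if the comparison cannot be handled. */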
10468 static rtx
10469 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10470 int code, tree treeop0, tree treeop1)
10472 enum machine_mode op_mode, cmp_mode, cc_mode;
10473 rtx op0, op1, cmp, target;
10474 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10475 enum insn_code icode;
10476 struct expand_operand ops[4];
10478 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10479 if (cc_mode == CCmode)
10480 return NULL_RTX;
10482 start_sequence ();
10483 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10485 op_mode = GET_MODE (op0);
10486 if (op_mode == VOIDmode)
10487 op_mode = GET_MODE (op1);
10489 switch (op_mode)
10491 case QImode:
10492 case HImode:
10493 case SImode:
10494 cmp_mode = SImode;
10495 icode = CODE_FOR_cmpsi;
10496 break;
10498 case DImode:
10499 cmp_mode = DImode;
10500 icode = CODE_FOR_cmpdi;
10501 break;
10503 default:
10504 end_sequence ();
10505 return NULL_RTX;
10508 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10509 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10510 if (!op0 || !op1)
10512 end_sequence ();
10513 return NULL_RTX;
10515 *prep_seq = get_insns ();
10516 end_sequence ();
10518 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10519 target = gen_rtx_REG (CCmode, CC_REGNUM);
10521 create_output_operand (&ops[0], target, CCmode);
10522 create_fixed_operand (&ops[1], cmp);
10523 create_fixed_operand (&ops[2], op0);
10524 create_fixed_operand (&ops[3], op1);
10526 start_sequence ();
10527 if (!maybe_expand_insn (icode, 4, ops))
10529 end_sequence ();
10530 return NULL_RTX;
10532 *gen_seq = get_insns ();
10533 end_sequence ();
10535 return gen_rtx_REG (cc_mode, CC_REGNUM);
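/* Implement TARGET_GEN_CCMP_NEXT. Expand a subsequent conditional compare
that combines PREV (the result so far) with the comparison of TREEOP0 and
TREEOP1, joined by BIT_CODE (AND or IOR). */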
10538 static rtx
10539 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10540 tree treeop0, tree treeop1, int bit_code)
10542 rtx op0, op1, cmp0, cmp1, target;
10543 enum machine_mode op_mode, cmp_mode, cc_mode;
10544 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10545 enum insn_code icode = CODE_FOR_ccmp_andsi;
10546 struct expand_operand ops[6];
10548 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10549 if (cc_mode == CCmode)
10550 return NULL_RTX;
10552 push_to_sequence ((rtx_insn*) *prep_seq);
10553 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10555 op_mode = GET_MODE (op0);
10556 if (op_mode == VOIDmode)
10557 op_mode = GET_MODE (op1);
10559 switch (op_mode)
10561 case QImode:
10562 case HImode:
10563 case SImode:
10564 cmp_mode = SImode;
10565 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10566 : CODE_FOR_ccmp_iorsi;
10567 break;
10569 case DImode:
10570 cmp_mode = DImode;
10571 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10572 : CODE_FOR_ccmp_iordi;
10573 break;
10575 default:
10576 end_sequence ();
10577 return NULL_RTX;
10580 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10581 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10582 if (!op0 || !op1)
10584 end_sequence ();
10585 return NULL_RTX;
10587 *prep_seq = get_insns ();
10588 end_sequence ();
10590 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10591 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10592 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10594 create_fixed_operand (&ops[0], prev);
10595 create_fixed_operand (&ops[1], target);
10596 create_fixed_operand (&ops[2], op0);
10597 create_fixed_operand (&ops[3], op1);
10598 create_fixed_operand (&ops[4], cmp0);
10599 create_fixed_operand (&ops[5], cmp1);
10601 push_to_sequence ((rtx_insn*) *gen_seq);
10602 if (!maybe_expand_insn (icode, 6, ops))
10604 end_sequence ();
10605 return NULL_RTX;
10608 *gen_seq = get_insns ();
10609 end_sequence ();
10611 return target;
10614 #undef TARGET_GEN_CCMP_FIRST
10615 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10617 #undef TARGET_GEN_CCMP_NEXT
10618 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
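/* With the two hooks above, a chained condition such as
   "if (a == 0 && b > 0)" can, for example, be expanded as a CMP followed by
   a CCMP and a single conditional branch, rather than as two separate
   compare-and-branch sequences.  */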
10620 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
10621 instruction fusion of some sort. */
10623 static bool
10624 aarch64_macro_fusion_p (void)
10626 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10630 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10631 should be kept together during scheduling. */
10633 static bool
10634 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10636 rtx set_dest;
10637 rtx prev_set = single_set (prev);
10638 rtx curr_set = single_set (curr);
10639 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10640 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10642 if (!aarch64_macro_fusion_p ())
10643 return false;
10645 if (simple_sets_p
10646 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10648 /* We are trying to match:
10649 prev (mov) == (set (reg r0) (const_int imm16))
10650 curr (movk) == (set (zero_extract (reg r0)
10651 (const_int 16)
10652 (const_int 16))
10653 (const_int imm16_1)) */
10655 set_dest = SET_DEST (curr_set);
10657 if (GET_CODE (set_dest) == ZERO_EXTRACT
10658 && CONST_INT_P (SET_SRC (curr_set))
10659 && CONST_INT_P (SET_SRC (prev_set))
10660 && CONST_INT_P (XEXP (set_dest, 2))
10661 && INTVAL (XEXP (set_dest, 2)) == 16
10662 && REG_P (XEXP (set_dest, 0))
10663 && REG_P (SET_DEST (prev_set))
10664 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10666 return true;
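/* In assembly terms this is a pair along the lines of, e.g.,
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16
   which benefits from staying adjacent on cores that fuse MOV/MOVK.  */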
10670 if (simple_sets_p
10671 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10674 /* We're trying to match:
10675 prev (adrp) == (set (reg r1)
10676 (high (symbol_ref ("SYM"))))
10677 curr (add) == (set (reg r0)
10678 (lo_sum (reg r1)
10679 (symbol_ref ("SYM"))))
10680 Note that r0 need not necessarily be the same as r1, especially
10681 during pre-regalloc scheduling. */
10683 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10684 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10686 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10687 && REG_P (XEXP (SET_SRC (curr_set), 0))
10688 && REGNO (XEXP (SET_SRC (curr_set), 0))
10689 == REGNO (SET_DEST (prev_set))
10690 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10691 XEXP (SET_SRC (curr_set), 1)))
10692 return true;
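/* E.g. the classic address-formation pair:
     adrp x1, sym
     add  x0, x1, :lo12:sym  */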
10696 if (simple_sets_p
10697 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10700 /* We're trying to match:
10701 prev (movk) == (set (zero_extract (reg r0)
10702 (const_int 16)
10703 (const_int 32))
10704 (const_int imm16_1))
10705 curr (movk) == (set (zero_extract (reg r0)
10706 (const_int 16)
10707 (const_int 48))
10708 (const_int imm16_2)) */
10710 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10711 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10712 && REG_P (XEXP (SET_DEST (prev_set), 0))
10713 && REG_P (XEXP (SET_DEST (curr_set), 0))
10714 && REGNO (XEXP (SET_DEST (prev_set), 0))
10715 == REGNO (XEXP (SET_DEST (curr_set), 0))
10716 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10717 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10718 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10719 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10720 && CONST_INT_P (SET_SRC (prev_set))
10721 && CONST_INT_P (SET_SRC (curr_set)))
10722 return true;
10725 if (simple_sets_p
10726 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10728 /* We're trying to match:
10729 prev (adrp) == (set (reg r0)
10730 (high (symbol_ref ("SYM"))))
10731 curr (ldr) == (set (reg r1)
10732 (mem (lo_sum (reg r0)
10733 (symbol_ref ("SYM")))))
10735 curr (ldr) == (set (reg r1)
10736 (zero_extend (mem
10737 (lo_sum (reg r0)
10738 (symbol_ref ("SYM")))))) */
10739 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10740 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10742 rtx curr_src = SET_SRC (curr_set);
10744 if (GET_CODE (curr_src) == ZERO_EXTEND)
10745 curr_src = XEXP (curr_src, 0);
10747 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10748 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10749 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10750 == REGNO (SET_DEST (prev_set))
10751 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10752 XEXP (SET_SRC (prev_set), 0)))
10753 return true;
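/* E.g. a symbol-address load such as:
     adrp x0, sym
     ldr  w1, [x0, #:lo12:sym]  */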
10757 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10758 && any_condjump_p (curr))
10760 enum attr_type prev_type = get_attr_type (prev);
10762 /* FIXME: this misses some instructions that are considered simple
10763 arithmetic for ThunderX. Simple shifts are missed here. */
10764 if (prev_type == TYPE_ALUS_SREG
10765 || prev_type == TYPE_ALUS_IMM
10766 || prev_type == TYPE_LOGICS_REG
10767 || prev_type == TYPE_LOGICS_IMM)
10768 return true;
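/* I.e. a flag-setting ALU or logic instruction (e.g. SUBS, ADDS, ANDS)
   immediately followed by the conditional branch that consumes the
   flags.  */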
10771 return false;
10774 /* If MEM is in the form of [base+offset], extract the two parts
10775 of the address and store them in BASE and OFFSET; otherwise clear
10776 BASE and OFFSET and return false. */
10778 bool
10779 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10781 rtx addr;
10783 gcc_assert (MEM_P (mem));
10785 addr = XEXP (mem, 0);
10787 if (REG_P (addr))
10789 *base = addr;
10790 *offset = const0_rtx;
10791 return true;
10794 if (GET_CODE (addr) == PLUS
10795 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10797 *base = XEXP (addr, 0);
10798 *offset = XEXP (addr, 1);
10799 return true;
10802 *base = NULL_RTX;
10803 *offset = NULL_RTX;
10805 return false;
10808 /* Types for scheduling fusion. */
10809 enum sched_fusion_type
10811 SCHED_FUSION_NONE = 0,
10812 SCHED_FUSION_LD_SIGN_EXTEND,
10813 SCHED_FUSION_LD_ZERO_EXTEND,
10814 SCHED_FUSION_LD,
10815 SCHED_FUSION_ST,
10816 SCHED_FUSION_NUM
10819 /* If INSN is a load or store with an address in the form of [base+offset],
10820 extract the two parts and store them in BASE and OFFSET. Return the
10821 scheduling fusion type of this INSN. */
10823 static enum sched_fusion_type
10824 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10826 rtx x, dest, src;
10827 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10829 gcc_assert (INSN_P (insn));
10830 x = PATTERN (insn);
10831 if (GET_CODE (x) != SET)
10832 return SCHED_FUSION_NONE;
10834 src = SET_SRC (x);
10835 dest = SET_DEST (x);
10837 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10838 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10839 return SCHED_FUSION_NONE;
10841 if (GET_CODE (src) == SIGN_EXTEND)
10843 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10844 src = XEXP (src, 0);
10845 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10846 return SCHED_FUSION_NONE;
10848 else if (GET_CODE (src) == ZERO_EXTEND)
10850 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10851 src = XEXP (src, 0);
10852 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10853 return SCHED_FUSION_NONE;
10856 if (GET_CODE (src) == MEM && REG_P (dest))
10857 extract_base_offset_in_addr (src, base, offset);
10858 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10860 fusion = SCHED_FUSION_ST;
10861 extract_base_offset_in_addr (dest, base, offset);
10863 else
10864 return SCHED_FUSION_NONE;
10866 if (*base == NULL_RTX || *offset == NULL_RTX)
10867 fusion = SCHED_FUSION_NONE;
10869 return fusion;
10872 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10874 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10875 and PRI are only calculated for these instructions. For other instructions,
10876 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10877 types of instruction fusion can be added by returning different priorities.
10879 It's important that irrelevant instructions get the largest FUSION_PRI. */
10881 static void
10882 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10883 int *fusion_pri, int *pri)
10885 int tmp, off_val;
10886 rtx base, offset;
10887 enum sched_fusion_type fusion;
10889 gcc_assert (INSN_P (insn));
10891 tmp = max_pri - 1;
10892 fusion = fusion_load_store (insn, &base, &offset);
10893 if (fusion == SCHED_FUSION_NONE)
10895 *pri = tmp;
10896 *fusion_pri = tmp;
10897 return;
10900 /* Set FUSION_PRI according to fusion type and base register. */
10901 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
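/* Loads/stores of the same fusion type and with the same base register
   therefore share one FUSION_PRI value; PRI below then orders accesses
   within such a group by offset, so that candidates for ldp/stp end up
   adjacent in the ready list.  */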
10903 /* Calculate PRI. */
10904 tmp /= 2;
10906 /* INSN with smaller offset goes first. */
10907 off_val = (int)(INTVAL (offset));
10908 if (off_val >= 0)
10909 tmp -= (off_val & 0xfffff);
10910 else
10911 tmp += ((- off_val) & 0xfffff);
10913 *pri = tmp;
10914 return;
10917 /* Given OPERANDS of consecutive load/store, check if we can merge
10918 them into ldp/stp. LOAD is true if they are load instructions.
10919 MODE is the mode of memory operands. */
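/* For example, a pair of loads such as
     ldr w0, [x2]
     ldr w1, [x2, 4]
   can be merged into
     ldp w0, w1, [x2]
   provided the checks below succeed.  */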
10921 bool
10922 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10923 enum machine_mode mode)
10925 HOST_WIDE_INT offval_1, offval_2, msize;
10926 enum reg_class rclass_1, rclass_2;
10927 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10929 if (load)
10931 mem_1 = operands[1];
10932 mem_2 = operands[3];
10933 reg_1 = operands[0];
10934 reg_2 = operands[2];
10935 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10936 if (REGNO (reg_1) == REGNO (reg_2))
10937 return false;
10939 else
10941 mem_1 = operands[0];
10942 mem_2 = operands[2];
10943 reg_1 = operands[1];
10944 reg_2 = operands[3];
10947 /* The mems cannot be volatile. */
10948 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10949 return false;
10951 /* Check if the addresses are in the form of [base+offset]. */
10952 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10953 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10954 return false;
10955 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10956 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10957 return false;
10959 /* Check if the bases are the same. */
10960 if (!rtx_equal_p (base_1, base_2))
10961 return false;
10963 offval_1 = INTVAL (offset_1);
10964 offval_2 = INTVAL (offset_2);
10965 msize = GET_MODE_SIZE (mode);
10966 /* Check if the offsets are consecutive. */
10967 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10968 return false;
10970 /* Check if the addresses are clobbered by load. */
10971 if (load)
10973 if (reg_mentioned_p (reg_1, mem_1))
10974 return false;
10976 /* In increasing order, the last load can clobber the address. */
10977 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10978 return false;
10981 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10982 rclass_1 = FP_REGS;
10983 else
10984 rclass_1 = GENERAL_REGS;
10986 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10987 rclass_2 = FP_REGS;
10988 else
10989 rclass_2 = GENERAL_REGS;
10991 /* Check if the registers are of the same class. */
10992 if (rclass_1 != rclass_2)
10993 return false;
10995 return true;
10998 /* Given OPERANDS of consecutive load/store, check if we can merge
10999 them into ldp/stp by adjusting the offset. LOAD is true if they
11000 are load instructions. MODE is the mode of memory operands.
11002 Given the consecutive stores below:
11004 str w1, [xb, 0x100]
11005 str w1, [xb, 0x104]
11006 str w1, [xb, 0x108]
11007 str w1, [xb, 0x10c]
11009 Though the offsets are out of the range supported by stp, we can
11010 still pair them after adjusting the offset, like:
11012 add scratch, xb, 0x100
11013 stp w1, w1, [scratch]
11014 stp w1, w1, [scratch, 0x8]
11016 The peephole patterns detecting this opportunity should guarantee
11017 that the scratch register is available. */
11019 bool
11020 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11021 enum machine_mode mode)
11023 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11024 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11025 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11026 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11028 if (load)
11030 reg_1 = operands[0];
11031 mem_1 = operands[1];
11032 reg_2 = operands[2];
11033 mem_2 = operands[3];
11034 reg_3 = operands[4];
11035 mem_3 = operands[5];
11036 reg_4 = operands[6];
11037 mem_4 = operands[7];
11038 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11039 && REG_P (reg_3) && REG_P (reg_4));
11040 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11041 return false;
11043 else
11045 mem_1 = operands[0];
11046 reg_1 = operands[1];
11047 mem_2 = operands[2];
11048 reg_2 = operands[3];
11049 mem_3 = operands[4];
11050 reg_3 = operands[5];
11051 mem_4 = operands[6];
11052 reg_4 = operands[7];
11054 /* Skip if the memory operand is by itself valid for ldp/stp. */
11055 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11056 return false;
11058 /* The mems cannot be volatile. */
11059 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11060 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11061 return false;
11063 /* Check if the addresses are in the form of [base+offset]. */
11064 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11065 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11066 return false;
11067 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11068 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11069 return false;
11070 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11071 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11072 return false;
11073 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11074 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11075 return false;
11077 /* Check if the bases are the same. */
11078 if (!rtx_equal_p (base_1, base_2)
11079 || !rtx_equal_p (base_2, base_3)
11080 || !rtx_equal_p (base_3, base_4))
11081 return false;
11083 offval_1 = INTVAL (offset_1);
11084 offval_2 = INTVAL (offset_2);
11085 offval_3 = INTVAL (offset_3);
11086 offval_4 = INTVAL (offset_4);
11087 msize = GET_MODE_SIZE (mode);
11088 /* Check if the offsets are consecutive. */
11089 if ((offval_1 != (offval_2 + msize)
11090 || offval_1 != (offval_3 + msize * 2)
11091 || offval_1 != (offval_4 + msize * 3))
11092 && (offval_4 != (offval_3 + msize)
11093 || offval_4 != (offval_2 + msize * 2)
11094 || offval_4 != (offval_1 + msize * 3)))
11095 return false;
11097 /* Check if the addresses are clobbered by load. */
11098 if (load)
11100 if (reg_mentioned_p (reg_1, mem_1)
11101 || reg_mentioned_p (reg_2, mem_2)
11102 || reg_mentioned_p (reg_3, mem_3))
11103 return false;
11105 /* In increasing order, the last load can clobber the address. */
11106 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11107 return false;
11110 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11111 rclass_1 = FP_REGS;
11112 else
11113 rclass_1 = GENERAL_REGS;
11115 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11116 rclass_2 = FP_REGS;
11117 else
11118 rclass_2 = GENERAL_REGS;
11120 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11121 rclass_3 = FP_REGS;
11122 else
11123 rclass_3 = GENERAL_REGS;
11125 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11126 rclass_4 = FP_REGS;
11127 else
11128 rclass_4 = GENERAL_REGS;
11130 /* Check if the registers are of the same class. */
11131 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11132 return false;
11134 return true;
11137 /* Given OPERANDS of consecutive load/store, this function pairs them
11138 into ldp/stp after adjusting the offset. It depends on the fact
11139 that addresses of load/store instructions are in increasing order.
11140 MODE is the mode of memory operands. CODE is the rtl operator
11141 which should be applied to all memory operands; it is SIGN_EXTEND,
11142 ZERO_EXTEND or UNKNOWN. */
11144 bool
11145 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11146 enum machine_mode mode, RTX_CODE code)
11148 rtx base, offset, t1, t2;
11149 rtx mem_1, mem_2, mem_3, mem_4;
11150 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11152 if (load)
11154 mem_1 = operands[1];
11155 mem_2 = operands[3];
11156 mem_3 = operands[5];
11157 mem_4 = operands[7];
11159 else
11161 mem_1 = operands[0];
11162 mem_2 = operands[2];
11163 mem_3 = operands[4];
11164 mem_4 = operands[6];
11165 gcc_assert (code == UNKNOWN);
11168 extract_base_offset_in_addr (mem_1, &base, &offset);
11169 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11171 /* Adjust the offset so that it fits in an ldp/stp instruction. */
11172 msize = GET_MODE_SIZE (mode);
11173 stp_off_limit = msize * 0x40;
11174 off_val = INTVAL (offset);
11175 abs_off = (off_val < 0) ? -off_val : off_val;
11176 new_off = abs_off % stp_off_limit;
11177 adj_off = abs_off - new_off;
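/* For the SImode example in the comment above (msize == 4, so
   stp_off_limit == 0x100), a first offset of 0x100 gives new_off == 0 and
   adj_off == 0x100, i.e. the "add scratch, xb, 0x100" plus stp sequence
   shown there.  */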
11179 /* Further adjust to make sure all offsets are OK. */
11180 if ((new_off + msize * 2) >= stp_off_limit)
11182 adj_off += stp_off_limit;
11183 new_off -= stp_off_limit;
11186 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11187 if (adj_off >= 0x1000)
11188 return false;
11190 if (off_val < 0)
11192 adj_off = -adj_off;
11193 new_off = -new_off;
11196 /* Create new memory references. */
11197 mem_1 = change_address (mem_1, VOIDmode,
11198 plus_constant (DImode, operands[8], new_off));
11200 /* Check if the adjusted address is OK for ldp/stp. */
11201 if (!aarch64_mem_pair_operand (mem_1, mode))
11202 return false;
11204 msize = GET_MODE_SIZE (mode);
11205 mem_2 = change_address (mem_2, VOIDmode,
11206 plus_constant (DImode,
11207 operands[8],
11208 new_off + msize));
11209 mem_3 = change_address (mem_3, VOIDmode,
11210 plus_constant (DImode,
11211 operands[8],
11212 new_off + msize * 2));
11213 mem_4 = change_address (mem_4, VOIDmode,
11214 plus_constant (DImode,
11215 operands[8],
11216 new_off + msize * 3));
11218 if (code == ZERO_EXTEND)
11220 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11221 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11222 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11223 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11225 else if (code == SIGN_EXTEND)
11227 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11228 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11229 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11230 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11233 if (load)
11235 operands[1] = mem_1;
11236 operands[3] = mem_2;
11237 operands[5] = mem_3;
11238 operands[7] = mem_4;
11240 else
11242 operands[0] = mem_1;
11243 operands[2] = mem_2;
11244 operands[4] = mem_3;
11245 operands[6] = mem_4;
11248 /* Emit adjusting instruction. */
11249 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11250 plus_constant (DImode, base, adj_off)));
11251 /* Emit ldp/stp instructions. */
11252 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11253 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11254 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11255 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11256 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11257 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11258 return true;
11261 #undef TARGET_ADDRESS_COST
11262 #define TARGET_ADDRESS_COST aarch64_address_cost
11264 /* This hook determines whether unnamed bitfields affect the alignment
11265 of the containing structure. The hook returns true if the structure
11266 should inherit the alignment requirements of an unnamed bitfield's
11267 type. */
11268 #undef TARGET_ALIGN_ANON_BITFIELD
11269 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11271 #undef TARGET_ASM_ALIGNED_DI_OP
11272 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11274 #undef TARGET_ASM_ALIGNED_HI_OP
11275 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11277 #undef TARGET_ASM_ALIGNED_SI_OP
11278 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11280 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11281 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11282 hook_bool_const_tree_hwi_hwi_const_tree_true
11284 #undef TARGET_ASM_FILE_START
11285 #define TARGET_ASM_FILE_START aarch64_start_file
11287 #undef TARGET_ASM_OUTPUT_MI_THUNK
11288 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11290 #undef TARGET_ASM_SELECT_RTX_SECTION
11291 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11293 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11294 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11296 #undef TARGET_BUILD_BUILTIN_VA_LIST
11297 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11299 #undef TARGET_CALLEE_COPIES
11300 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11302 #undef TARGET_CAN_ELIMINATE
11303 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11305 #undef TARGET_CANNOT_FORCE_CONST_MEM
11306 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11308 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11309 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11311 /* Only the least significant bit is used for initialization guard
11312 variables. */
11313 #undef TARGET_CXX_GUARD_MASK_BIT
11314 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11316 #undef TARGET_C_MODE_FOR_SUFFIX
11317 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11319 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11320 #undef TARGET_DEFAULT_TARGET_FLAGS
11321 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11322 #endif
11324 #undef TARGET_CLASS_MAX_NREGS
11325 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11327 #undef TARGET_BUILTIN_DECL
11328 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11330 #undef TARGET_EXPAND_BUILTIN
11331 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11333 #undef TARGET_EXPAND_BUILTIN_VA_START
11334 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11336 #undef TARGET_FOLD_BUILTIN
11337 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11339 #undef TARGET_FUNCTION_ARG
11340 #define TARGET_FUNCTION_ARG aarch64_function_arg
11342 #undef TARGET_FUNCTION_ARG_ADVANCE
11343 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11345 #undef TARGET_FUNCTION_ARG_BOUNDARY
11346 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11348 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11349 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11351 #undef TARGET_FUNCTION_VALUE
11352 #define TARGET_FUNCTION_VALUE aarch64_function_value
11354 #undef TARGET_FUNCTION_VALUE_REGNO_P
11355 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11357 #undef TARGET_FRAME_POINTER_REQUIRED
11358 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11360 #undef TARGET_GIMPLE_FOLD_BUILTIN
11361 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11363 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11364 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11366 #undef TARGET_INIT_BUILTINS
11367 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11369 #undef TARGET_LEGITIMATE_ADDRESS_P
11370 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11372 #undef TARGET_LEGITIMATE_CONSTANT_P
11373 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11375 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11376 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11378 #undef TARGET_LRA_P
11379 #define TARGET_LRA_P hook_bool_void_true
11381 #undef TARGET_MANGLE_TYPE
11382 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11384 #undef TARGET_MEMORY_MOVE_COST
11385 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11387 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11388 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11390 #undef TARGET_MUST_PASS_IN_STACK
11391 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11393 /* This target hook should return true if accesses to volatile bitfields
11394 should use the narrowest mode possible. It should return false if these
11395 accesses should use the bitfield container type. */
11396 #undef TARGET_NARROW_VOLATILE_BITFIELD
11397 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11399 #undef TARGET_OPTION_OVERRIDE
11400 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11402 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11403 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11404 aarch64_override_options_after_change
11406 #undef TARGET_PASS_BY_REFERENCE
11407 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11409 #undef TARGET_PREFERRED_RELOAD_CLASS
11410 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11412 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11413 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11415 #undef TARGET_SECONDARY_RELOAD
11416 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11418 #undef TARGET_SHIFT_TRUNCATION_MASK
11419 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11421 #undef TARGET_SETUP_INCOMING_VARARGS
11422 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11424 #undef TARGET_STRUCT_VALUE_RTX
11425 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11427 #undef TARGET_REGISTER_MOVE_COST
11428 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11430 #undef TARGET_RETURN_IN_MEMORY
11431 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11433 #undef TARGET_RETURN_IN_MSB
11434 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11436 #undef TARGET_RTX_COSTS
11437 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11439 #undef TARGET_SCHED_ISSUE_RATE
11440 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11442 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11443 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11444 aarch64_sched_first_cycle_multipass_dfa_lookahead
11446 #undef TARGET_TRAMPOLINE_INIT
11447 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11449 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11450 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11452 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11453 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11455 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11456 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11458 #undef TARGET_VECTORIZE_ADD_STMT_COST
11459 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11461 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11462 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11463 aarch64_builtin_vectorization_cost
11465 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11466 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11468 #undef TARGET_VECTORIZE_BUILTINS
11469 #define TARGET_VECTORIZE_BUILTINS
11471 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11472 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11473 aarch64_builtin_vectorized_function
11475 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11476 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11477 aarch64_autovectorize_vector_sizes
11479 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11480 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11481 aarch64_atomic_assign_expand_fenv
11483 /* Section anchor support. */
11485 #undef TARGET_MIN_ANCHOR_OFFSET
11486 #define TARGET_MIN_ANCHOR_OFFSET -256
11488 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11489 byte offset; we can do much more for larger data types, but have no way
11490 to determine the size of the access. We assume accesses are aligned. */
11491 #undef TARGET_MAX_ANCHOR_OFFSET
11492 #define TARGET_MAX_ANCHOR_OFFSET 4095
11494 #undef TARGET_VECTOR_ALIGNMENT
11495 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11497 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11498 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11499 aarch64_simd_vector_alignment_reachable
11501 /* vec_perm support. */
11503 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11504 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11505 aarch64_vectorize_vec_perm_const_ok
11508 #undef TARGET_FIXED_CONDITION_CODE_REGS
11509 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11511 #undef TARGET_FLAGS_REGNUM
11512 #define TARGET_FLAGS_REGNUM CC_REGNUM
11514 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11515 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11517 #undef TARGET_ASAN_SHADOW_OFFSET
11518 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11520 #undef TARGET_LEGITIMIZE_ADDRESS
11521 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11523 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11524 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11525 aarch64_use_by_pieces_infrastructure_p
11527 #undef TARGET_CAN_USE_DOLOOP_P
11528 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11530 #undef TARGET_SCHED_MACRO_FUSION_P
11531 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11533 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11534 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11536 #undef TARGET_SCHED_FUSION_PRIORITY
11537 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11539 struct gcc_target targetm = TARGET_INITIALIZER;
11541 #include "gt-aarch64.h"