[AArch64] Use extend_arith rtx cost appropriately
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob df76267a32b0132989a88fb69208f81f59483eb6
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
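/* For illustration (an informal sketch, not an exhaustive list), the
   address classes above correspond roughly to assembly operands such as:

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC   a PC-relative literal load, e.g. ldr x0, .Lconst.  */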
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as commandline arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
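/* As a reminder for the reader: a "bitmask immediate" is a value made of a
   single contiguous run of ones, rotated by any amount, and replicated
   across the register in 2-, 4-, 8-, 16-, 32- or 64-bit chunks.  For
   example 0x00ff00ff00ff00ff and 0x0000fffffffc0000 are valid bitmask
   immediates, while 0x0000000012345678 is not.  The table below is the
   authoritative enumeration; this note is only a summary.  */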
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
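/* For example (assuming the standard AArch64 DWARF numbering used by the
   definitions referenced below): x0-x30 map to DWARF registers 0-30, sp
   maps to 31, and v0-v31 map to 64-95.  Anything else (e.g. the condition
   flags) has no DWARF equivalent and yields DWARF_FRAME_REGISTERS.  */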
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
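/* A worked example (for illustration only): the combination
   (zero_extend:DI (reg:SI)) shifted left by 2 can appear, e.g. after
   combine, as

     (zero_extract:DI (mult:DI (reg:DI) (const_int 4))
		      (const_int 34) (const_int 0))

   so MULT_IMM is 4 and EXTRACT_IMM is 34.  Here 34 & 7 == 2 is the shift
   amount, 34 & ~7 == 32 is the width being extended from, and 4 == 1 << 2,
   so the checks below accept it; such an rtx maps onto an extended-register
   operand like "add x0, x1, w2, uxtw #2".  */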
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We accept LO_SUMs in our legitimate addresses so that combine
740 can take care of merging addresses where necessary; for code
741 generation purposes, however, we generate the address
742 as:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. in the memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. in the memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
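/* A sketch of the overlap handling below: for a TImode copy from (x0,x1)
   to (x1,x2), dst_lo is x1 and src_hi is also x1, so the high half must be
   moved first (x2 <- x1, then x1 <- x0); moving the low half first would
   clobber the source of the high half.  At most one of the two pairings
   can overlap, so ordering the two word-sized moves is always enough.  */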
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
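/* Two illustrative decompositions for the immediate synthesis below (a
   sketch only; the exact sequence chosen can also depend on the bitmask
   table and on whether subtargets are available):

     0x0000abcd12340000 -- two of the four 16-bit quarters are zero, so it
	is built as roughly "mov x0, 0x12340000" followed by
	"movk x0, 0xabcd, lsl 32".

     0x1234ffffffff5678 -- two quarters are all-ones; the code first
	materializes the value with the non-ffff low quarter forced to
	0xffff (which a single MOVN-style move can load) and then patches
	that quarter back in with one MOVK.  */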
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert (can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always passed by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating-point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of a called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
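/* A few illustrative cases (informal, not normative; the AAPCS64 text is
   authoritative):

     - struct { double a, b; } is an HFA with two members, so it goes into
       two consecutive SIMD/FP registers under the C.1 - C.5 rules below.

     - __int128, having 16-byte alignment, starts at an even-numbered
       general register (rule C.8), possibly skipping one.

     - a 24-byte plain (non-HFA) struct does not fit in two general
       registers, so it is passed by reference per
       aarch64_pass_by_reference above.  */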
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C6 - C9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers. */
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
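/* For example, on a big-endian target a 4-byte int passed on the stack
   occupies the high-addressed half of its 8-byte slot (padded downward),
   whereas a 5-byte struct sits at the start of the slot with the padding
   after it (padded upward).  On little-endian targets everything is
   effectively padded upward.  */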
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 assuming the block is in the memory, padding upward means that
1978 the last element is padded after its highest significant byte,
1979 while in downward padding, the last element is padded at
1980 its least significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
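/* A rough sketch of the frame this code lays out, from high to low
   addresses (illustrative only; alignment padding and any dynamic
   allocation are omitted):

	incoming stack arguments
	callee-allocated register varargs save area
	local variables (get_frame_size ())
	other callee-saved registers
	LR (x30)
	FP (x29)                 <-- hard frame pointer, when needed
	outgoing stack arguments
				 <-- stack pointer  */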
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
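/* As an illustration (not taken from any particular test case): a
   function with 16 bytes of local variables, no varargs save area and
   no outgoing stack arguments, saving only FP and LR, ends up with
   saved_regs_size == 16, hard_fp_offset == 32 and frame_size == 32,
   with FP' placed 32 bytes below the incoming stack pointer.  */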
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, the stack pointer is decreased
2423 first to skip over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* When offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
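/* As a rough illustration, for a frame with no locals and no outgoing
   arguments where only x29, x30 and x19 need saving (offset == 32,
   fp_offset == 0), the code above would typically emit:

	stp	x29, x30, [sp, #-32]!
	add	x29, sp, #0
	str	x19, [sp, #16]

   The exact sequence depends on the write-back candidates chosen by
   aarch64_layout_frame and on fp_offset.  */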
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee-saved stack is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass will use
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
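/* Continuing the prologue illustration above, the matching epilogue for
   that frame would be roughly:

	ldr	x19, [sp, #16]
	ldp	x29, x30, [sp], #32
	ret

   (Illustrative only.)  */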
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the
2817 fewer instructions, preferring MOVZ instructions when the two
2818 counts are equal. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
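/* For example, for 0x0000123400005678 (which is not a bitmask immediate)
   the three upper 16-bit chunks are 0x0000, 0x1234 and 0x0000, giving
   zcount == 1 and ncount == 3, so the constant is built MOVZ-first: a
   MOVZ of the bottom chunk plus a single MOVK for the one non-zero upper
   chunk, rather than a MOVN-based sequence needing four instructions.  */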
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
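/* For instance, adding DELTA == 0x3008 (12296) splits into a high and a
   low part and, with a hypothetical destination x0 and scratch x16,
   would emit roughly:

	mov	x16, #3
	add	x0, x0, x16, lsl #12
	add	x0, x0, #8

   (Illustrative sketch.)  */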
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm, where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* Set s consecutive bits to 1 (s < 64). */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* Rotate right by r. */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* Replicate the constant depending on SIMD size. */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
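/* For instance, with element size e == 8, s == 3 and r == 1 the element
   is 0b00000111 rotated right by one, i.e. 0x83, and replication yields
   the table entry 0x8383838383838383.  */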
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
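/* e.g. 0xabc and 0xabc000 satisfy this, while 0xabc0 does not, since its
   set bits straddle the two 12-bit fields.  */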
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
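/* For example, 0x0000000000ab0000 is a MOVZ immediate, 0xffffffffffff1234
   is handled via MOVN (its complement is a MOVZ immediate), and
   0x00ff00ff00ff00ff is a bitmask (logical) immediate.  */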
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 to aarch64_expand_mov_immediate to handle it properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register for mode MODE.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register for mode MODE.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
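/* So, for a DImode access, index forms such as [x0, x1, lsl #3] and
   [x0, w1, sxtw #3] are accepted (shift 3 matches the 8-byte access
   size), whereas a mismatched scale such as lsl #2 is rejected.  */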
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
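/* For a DImode (8-byte) access these three predicates accept multiples
   of 8 in [-512, 504], any offset in [-256, 255], and multiples of 8 in
   [0, 32760] respectively.  */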
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit an ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* Load literal: PC-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
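/* For example, (compare (plus x y) (const_int 0)) under EQ selects
   CC_NZmode, while comparing a shifted operand such as
   (compare (ashift x n) y) selects CC_SWPmode so that the operands can
   be swapped when the instruction is output.  */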
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;
3960 return count;
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first value is for the AND op and the
3970 second is for the IOR op. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand; if it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
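/* Print address X of a memory operand to file F in AArch64 assembly
   syntax, using aarch64_memory_reference_mode to format writeback and
   scaled-index addressing forms.  */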
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
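/* Return true if X (or any sub-rtx of X) mentions a LABEL_REF, ignoring
   the LABEL_REFs that appear inside TLS UNSPECs, which are constant
   offsets rather than real label references.  */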
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return true;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return true;
4565 return false;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
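/* For example (illustrative values): a misaligned DFmode access at
   X + 0x13045 gets base_offset = (0x13045 + 0x100) & ~0x1ff = 0x13000,
   leaving an offset of 0x45 within the signed 9-bit range, while an
   aligned access at X + 0x13048 uses base_offset = 0x13000 with a 0x48
   immediate, so nearby aligned accesses share the same base register.  */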
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256..255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with a 12-bit offset field. */
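/* For example (illustrative values): an SImode access at reg + 0x12344
   splits into high = 0x12000 and low = 0x344; high is a valid shifted
   12-bit immediate, so (plus reg 0x12000) is reloaded into a base
   register with a single ADD and #0x344 remains as the load/store
   immediate offset.  */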
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* For BLKmode (zero-size) accesses we cannot ascertain the alignment,
4691 so do not attempt the split and leave the offset to generic reload. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
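/* Implement TARGET_SECONDARY_RELOAD: return the register class required
   for a secondary reload of X in MODE to or from class RCLASS, or NO_REGS
   if none is needed; for Q-register moves without SIMD, record a
   scratch-based move pattern in SRI instead.  */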
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via an FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
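/* Return true if register FROM may be eliminated in favour of register
   TO, taking frame_pointer_needed and flag_omit_leaf_frame_pointer into
   account.  */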
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
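/* Return the byte offset between eliminable registers FROM and TO, based
   on the frame layout computed by aarch64_layout_frame.  */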
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
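/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE: emit the fixed part of a
   trampoline, two PC-relative literal loads into IP1 and the static chain
   register followed by an indirect branch, with zero-filled slots for the
   target address and chain value to be patched by aarch64_trampoline_init.  */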
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
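/* Implement TARGET_TRAMPOLINE_INIT: copy the trampoline template into
   M_TRAMP, store the address of FNDECL and CHAIN_VALUE after the code,
   and flush the instruction cache over the trampoline.  */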
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words, we fill those in below. */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
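/* Implement TARGET_CLASS_MAX_NREGS: return the maximum number of
   registers of class REGCLASS required to hold a value of mode MODE.  */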
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
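/* Implement TARGET_PREFERRED_RELOAD_CLASS: narrow REGCLASS for a reload
   of X, e.g. preferring GENERAL_REGS over POINTER_REGS and rejecting
   FP_REGS for immediates that MOVI cannot encode.  */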
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations,
4942 which use SP as source and an FP_REG as destination, so
4943 reject them right now. */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
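/* Output the assembly for a casesi jump-table dispatch described by
   OPERANDS: load the table entry indexed by the case value, form the
   target address relative to the start of the table, and branch to it.  */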
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below. */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
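/* For example, a left shift of 2 with mask 0x3fc (0xff << 2) describes a
   byte zero-extended and scaled by 4, so the function returns 8; any
   combination that does not match returns 0.  */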
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
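/* For example, both (ashift (reg X) (const_int 2)) and
   (mult (reg X) (const_int 4)) are reduced to (reg X).  */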
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
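/* For example, (ashift (zero_extend (reg X)) (const_int 2)) reduces to
   (reg X), as does the zero-extend-as-AND form
   (and (mult (reg X) (const_int 4)) (const_int 1020)).  */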
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Helper function for rtx cost calculation. Calculate the cost of
5162 a MULT, which may be part of a multiply-accumulate rtx. Return
5163 the calculated cost of the expression, recursing manually in to
5164 operands where needed. */
5166 static int
5167 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5169 rtx op0, op1;
5170 const struct cpu_cost_table *extra_cost
5171 = aarch64_tune_params->insn_extra_cost;
5172 int cost = 0;
5173 bool maybe_fma = (outer == PLUS || outer == MINUS);
5174 machine_mode mode = GET_MODE (x);
5176 gcc_checking_assert (code == MULT);
5178 op0 = XEXP (x, 0);
5179 op1 = XEXP (x, 1);
5181 if (VECTOR_MODE_P (mode))
5182 mode = GET_MODE_INNER (mode);
5184 /* Integer multiply/fma. */
5185 if (GET_MODE_CLASS (mode) == MODE_INT)
5187 /* The multiply will be canonicalized as a shift, cost it as such. */
5188 if (CONST_INT_P (op1)
5189 && exact_log2 (INTVAL (op1)) > 0)
5191 if (speed)
5193 if (maybe_fma)
5194 /* ADD (shifted register). */
5195 cost += extra_cost->alu.arith_shift;
5196 else
5197 /* LSL (immediate). */
5198 cost += extra_cost->alu.shift;
5201 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5203 return cost;
5206 /* Integer multiplies or FMAs have zero/sign extending variants. */
5207 if ((GET_CODE (op0) == ZERO_EXTEND
5208 && GET_CODE (op1) == ZERO_EXTEND)
5209 || (GET_CODE (op0) == SIGN_EXTEND
5210 && GET_CODE (op1) == SIGN_EXTEND))
5212 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5213 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5215 if (speed)
5217 if (maybe_fma)
5218 /* MADD/SMADDL/UMADDL. */
5219 cost += extra_cost->mult[0].extend_add;
5220 else
5221 /* MUL/SMULL/UMULL. */
5222 cost += extra_cost->mult[0].extend;
5225 return cost;
5228 /* This is either an integer multiply or an FMA. In both cases
5229 we want to recurse and cost the operands. */
5230 cost += rtx_cost (op0, MULT, 0, speed)
5231 + rtx_cost (op1, MULT, 1, speed);
5233 if (speed)
5235 if (maybe_fma)
5236 /* MADD. */
5237 cost += extra_cost->mult[mode == DImode].add;
5238 else
5239 /* MUL. */
5240 cost += extra_cost->mult[mode == DImode].simple;
5243 return cost;
5245 else
5247 if (speed)
5249 /* Floating-point FMA/FMUL can also support negations of the
5250 operands. */
5251 if (GET_CODE (op0) == NEG)
5252 op0 = XEXP (op0, 0);
5253 if (GET_CODE (op1) == NEG)
5254 op1 = XEXP (op1, 0);
5256 if (maybe_fma)
5257 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5258 cost += extra_cost->fp[mode == DFmode].fma;
5259 else
5260 /* FMUL/FNMUL. */
5261 cost += extra_cost->fp[mode == DFmode].mult;
5264 cost += rtx_cost (op0, MULT, 0, speed)
5265 + rtx_cost (op1, MULT, 1, speed);
5266 return cost;
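/* Implement TARGET_ADDRESS_COST: estimate the cost of address expression
   X of mode MODE using the tuning target's address cost tables; the
   result is in the same sub-instruction units as those tables.  */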
5270 static int
5271 aarch64_address_cost (rtx x,
5272 machine_mode mode,
5273 addr_space_t as ATTRIBUTE_UNUSED,
5274 bool speed)
5276 enum rtx_code c = GET_CODE (x);
5277 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5278 struct aarch64_address_info info;
5279 int cost = 0;
5280 info.shift = 0;
5282 if (!aarch64_classify_address (&info, x, mode, c, false))
5284 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5286 /* This is a CONST or SYMBOL ref which will be split
5287 in a different way depending on the code model in use.
5288 Cost it through the generic infrastructure. */
5289 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5290 /* Divide through by the cost of one instruction to
5291 bring it to the same units as the address costs. */
5292 cost_symbol_ref /= COSTS_N_INSNS (1);
5293 /* The cost is then the cost of preparing the address,
5294 followed by an immediate (possibly 0) offset. */
5295 return cost_symbol_ref + addr_cost->imm_offset;
5297 else
5299 /* This is most likely a jump table from a case
5300 statement. */
5301 return addr_cost->register_offset;
5305 switch (info.type)
5307 case ADDRESS_LO_SUM:
5308 case ADDRESS_SYMBOLIC:
5309 case ADDRESS_REG_IMM:
5310 cost += addr_cost->imm_offset;
5311 break;
5313 case ADDRESS_REG_WB:
5314 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5315 cost += addr_cost->pre_modify;
5316 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5317 cost += addr_cost->post_modify;
5318 else
5319 gcc_unreachable ();
5321 break;
5323 case ADDRESS_REG_REG:
5324 cost += addr_cost->register_offset;
5325 break;
5327 case ADDRESS_REG_UXTW:
5328 case ADDRESS_REG_SXTW:
5329 cost += addr_cost->register_extend;
5330 break;
5332 default:
5333 gcc_unreachable ();
5337 if (info.shift > 0)
5339 /* For the sake of calculating the cost of the shifted register
5340 component, we can treat same sized modes in the same way. */
5341 switch (GET_MODE_BITSIZE (mode))
5343 case 16:
5344 cost += addr_cost->addr_scale_costs.hi;
5345 break;
5347 case 32:
5348 cost += addr_cost->addr_scale_costs.si;
5349 break;
5351 case 64:
5352 cost += addr_cost->addr_scale_costs.di;
5353 break;
5355 /* We can't tell, or this is a 128-bit vector. */
5356 default:
5357 cost += addr_cost->addr_scale_costs.ti;
5358 break;
5362 return cost;
5365 /* Return true if the RTX X in mode MODE is a zero or sign extract
5366 usable in an ADD or SUB (extended register) instruction. */
5367 static bool
5368 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5370 /* Catch add with a sign extract.
5371 This is add_<optab><mode>_multp2. */
5372 if (GET_CODE (x) == SIGN_EXTRACT
5373 || GET_CODE (x) == ZERO_EXTRACT)
5375 rtx op0 = XEXP (x, 0);
5376 rtx op1 = XEXP (x, 1);
5377 rtx op2 = XEXP (x, 2);
5379 if (GET_CODE (op0) == MULT
5380 && CONST_INT_P (op1)
5381 && op2 == const0_rtx
5382 && CONST_INT_P (XEXP (op0, 1))
5383 && aarch64_is_extend_from_extract (mode,
5384 XEXP (op0, 1),
5385 op1))
5387 return true;
5391 return false;
5394 static bool
5395 aarch64_frint_unspec_p (unsigned int u)
5397 switch (u)
5399 case UNSPEC_FRINTZ:
5400 case UNSPEC_FRINTP:
5401 case UNSPEC_FRINTM:
5402 case UNSPEC_FRINTA:
5403 case UNSPEC_FRINTN:
5404 case UNSPEC_FRINTX:
5405 case UNSPEC_FRINTI:
5406 return true;
5408 default:
5409 return false;
5413 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5414 storing it in *COST. Result is true if the total cost of the operation
5415 has now been calculated. */
5416 static bool
5417 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5419 rtx inner;
5420 rtx comparator;
5421 enum rtx_code cmpcode;
5423 if (COMPARISON_P (op0))
5425 inner = XEXP (op0, 0);
5426 comparator = XEXP (op0, 1);
5427 cmpcode = GET_CODE (op0);
5429 else
5431 inner = op0;
5432 comparator = const0_rtx;
5433 cmpcode = NE;
5436 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5438 /* Conditional branch. */
5439 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5440 return true;
5441 else
5443 if (cmpcode == NE || cmpcode == EQ)
5445 if (comparator == const0_rtx)
5447 /* TBZ/TBNZ/CBZ/CBNZ. */
5448 if (GET_CODE (inner) == ZERO_EXTRACT)
5449 /* TBZ/TBNZ. */
5450 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5451 0, speed);
5452 else
5453 /* CBZ/CBNZ. */
5454 *cost += rtx_cost (inner, cmpcode, 0, speed);
5456 return true;
5459 else if (cmpcode == LT || cmpcode == GE)
5461 /* TBZ/TBNZ. */
5462 if (comparator == const0_rtx)
5463 return true;
5467 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5469 /* It's a conditional operation based on the status flags,
5470 so it must be some flavor of CSEL. */
5472 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5473 if (GET_CODE (op1) == NEG
5474 || GET_CODE (op1) == NOT
5475 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5476 op1 = XEXP (op1, 0);
5478 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5479 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5480 return true;
5483 /* We don't know what this is; cost all operands. */
5484 return false;
5487 /* Calculate the cost of calculating X, storing it in *COST. Result
5488 is true if the total cost of the operation has now been calculated. */
5489 static bool
5490 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5491 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5493 rtx op0, op1, op2;
5494 const struct cpu_cost_table *extra_cost
5495 = aarch64_tune_params->insn_extra_cost;
5496 machine_mode mode = GET_MODE (x);
5498 /* By default, assume that everything has equivalent cost to the
5499 cheapest instruction. Any additional costs are applied as a delta
5500 above this default. */
5501 *cost = COSTS_N_INSNS (1);
5503 /* TODO: The cost infrastructure currently does not handle
5504 vector operations. Assume that all vector operations
5505 are equally expensive. */
5506 if (VECTOR_MODE_P (mode))
5508 if (speed)
5509 *cost += extra_cost->vect.alu;
5510 return true;
5513 switch (code)
5515 case SET:
5516 /* The cost depends entirely on the operands to SET. */
5517 *cost = 0;
5518 op0 = SET_DEST (x);
5519 op1 = SET_SRC (x);
5521 switch (GET_CODE (op0))
5523 case MEM:
5524 if (speed)
5526 rtx address = XEXP (op0, 0);
5527 if (GET_MODE_CLASS (mode) == MODE_INT)
5528 *cost += extra_cost->ldst.store;
5529 else if (mode == SFmode)
5530 *cost += extra_cost->ldst.storef;
5531 else if (mode == DFmode)
5532 *cost += extra_cost->ldst.stored;
5534 *cost +=
5535 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5536 0, speed));
5539 *cost += rtx_cost (op1, SET, 1, speed);
5540 return true;
5542 case SUBREG:
5543 if (! REG_P (SUBREG_REG (op0)))
5544 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5546 /* Fall through. */
5547 case REG:
5548 /* const0_rtx is in general free, but we will use an
5549 instruction to set a register to 0. */
5550 if (REG_P (op1) || op1 == const0_rtx)
5552 /* The cost is 1 per register copied. */
5553 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5554 / UNITS_PER_WORD;
5555 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5557 else
5558 /* Cost is just the cost of the RHS of the set. */
5559 *cost += rtx_cost (op1, SET, 1, speed);
5560 return true;
5562 case ZERO_EXTRACT:
5563 case SIGN_EXTRACT:
5564 /* Bit-field insertion. Strip any redundant widening of
5565 the RHS to meet the width of the target. */
5566 if (GET_CODE (op1) == SUBREG)
5567 op1 = SUBREG_REG (op1);
5568 if ((GET_CODE (op1) == ZERO_EXTEND
5569 || GET_CODE (op1) == SIGN_EXTEND)
5570 && CONST_INT_P (XEXP (op0, 1))
5571 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5572 >= INTVAL (XEXP (op0, 1))))
5573 op1 = XEXP (op1, 0);
5575 if (CONST_INT_P (op1))
5577 /* MOV immediate is assumed to always be cheap. */
5578 *cost = COSTS_N_INSNS (1);
5580 else
5582 /* BFM. */
5583 if (speed)
5584 *cost += extra_cost->alu.bfi;
5585 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5588 return true;
5590 default:
5591 /* We can't make sense of this, assume default cost. */
5592 *cost = COSTS_N_INSNS (1);
5593 return false;
5595 return false;
5597 case CONST_INT:
5598 /* If an instruction can incorporate a constant within the
5599 instruction, the instruction's expression avoids calling
5600 rtx_cost() on the constant. If rtx_cost() is called on a
5601 constant, then it is usually because the constant must be
5602 moved into a register by one or more instructions.
5604 The exception is constant 0, which can be expressed
5605 as XZR/WZR and is therefore free. The exception to this is
5606 if we have (set (reg) (const0_rtx)) in which case we must cost
5607 the move. However, we can catch that when we cost the SET, so
5608 we don't need to consider that here. */
5609 if (x == const0_rtx)
5610 *cost = 0;
5611 else
5613 /* To an approximation, building any other constant is
5614 proportionally expensive to the number of instructions
5615 required to build that constant. This is true whether we
5616 are compiling for SPEED or otherwise. */
5617 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5618 (NULL_RTX, x, false, mode));
5620 return true;
5622 case CONST_DOUBLE:
5623 if (speed)
5625 /* mov[df,sf]_aarch64. */
5626 if (aarch64_float_const_representable_p (x))
5627 /* FMOV (scalar immediate). */
5628 *cost += extra_cost->fp[mode == DFmode].fpconst;
5629 else if (!aarch64_float_const_zero_rtx_p (x))
5631 /* This will be a load from memory. */
5632 if (mode == DFmode)
5633 *cost += extra_cost->ldst.loadd;
5634 else
5635 *cost += extra_cost->ldst.loadf;
5637 else
5638 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5639 or MOV v0.s[0], wzr, neither of which is modeled by the
5640 cost tables. Just use the default cost. */
5645 return true;
5647 case MEM:
5648 if (speed)
5650 /* For loads we want the base cost of a load, plus an
5651 approximation for the additional cost of the addressing
5652 mode. */
5653 rtx address = XEXP (x, 0);
5654 if (GET_MODE_CLASS (mode) == MODE_INT)
5655 *cost += extra_cost->ldst.load;
5656 else if (mode == SFmode)
5657 *cost += extra_cost->ldst.loadf;
5658 else if (mode == DFmode)
5659 *cost += extra_cost->ldst.loadd;
5661 *cost +=
5662 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5663 0, speed));
5666 return true;
5668 case NEG:
5669 op0 = XEXP (x, 0);
5671 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5673 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5674 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5676 /* CSETM. */
5677 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5678 return true;
5681 /* Cost this as SUB wzr, X. */
5682 op0 = CONST0_RTX (GET_MODE (x));
5683 op1 = XEXP (x, 0);
5684 goto cost_minus;
5687 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5689 /* Support (neg(fma...)) as a single instruction only if
5690 sign of zeros is unimportant. This matches the decision
5691 making in aarch64.md. */
5692 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5694 /* FNMADD. */
5695 *cost = rtx_cost (op0, NEG, 0, speed);
5696 return true;
5698 if (speed)
5699 /* FNEG. */
5700 *cost += extra_cost->fp[mode == DFmode].neg;
5701 return false;
5704 return false;
5706 case CLRSB:
5707 case CLZ:
5708 if (speed)
5709 *cost += extra_cost->alu.clz;
5711 return false;
5713 case COMPARE:
5714 op0 = XEXP (x, 0);
5715 op1 = XEXP (x, 1);
5717 if (op1 == const0_rtx
5718 && GET_CODE (op0) == AND)
5720 x = op0;
5721 goto cost_logic;
5724 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5726 /* TODO: A write to the CC flags possibly costs extra; this
5727 needs encoding in the cost tables. */
5729 /* CC_ZESWPmode supports zero extend for free. */
5730 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5731 op0 = XEXP (op0, 0);
5733 /* ANDS. */
5734 if (GET_CODE (op0) == AND)
5736 x = op0;
5737 goto cost_logic;
5740 if (GET_CODE (op0) == PLUS)
5742 /* ADDS (and CMN alias). */
5743 x = op0;
5744 goto cost_plus;
5747 if (GET_CODE (op0) == MINUS)
5749 /* SUBS. */
5750 x = op0;
5751 goto cost_minus;
5754 if (GET_CODE (op1) == NEG)
5756 /* CMN. */
5757 if (speed)
5758 *cost += extra_cost->alu.arith;
5760 *cost += rtx_cost (op0, COMPARE, 0, speed);
5761 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5762 return true;
5765 /* CMP.
5767 Compare can freely swap the order of operands, and
5768 canonicalization puts the more complex operation first.
5769 But the integer MINUS logic expects the shift/extend
5770 operation in op1. */
5771 if (! (REG_P (op0)
5772 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5774 op0 = XEXP (x, 1);
5775 op1 = XEXP (x, 0);
5777 goto cost_minus;
5780 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5782 /* FCMP. */
5783 if (speed)
5784 *cost += extra_cost->fp[mode == DFmode].compare;
5786 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5788 /* FCMP supports constant 0.0 for no extra cost. */
5789 return true;
5791 return false;
5794 return false;
5796 case MINUS:
5798 op0 = XEXP (x, 0);
5799 op1 = XEXP (x, 1);
5801 cost_minus:
5802 /* Detect valid immediates. */
5803 if ((GET_MODE_CLASS (mode) == MODE_INT
5804 || (GET_MODE_CLASS (mode) == MODE_CC
5805 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5806 && CONST_INT_P (op1)
5807 && aarch64_uimm12_shift (INTVAL (op1)))
5809 *cost += rtx_cost (op0, MINUS, 0, speed);
5811 if (speed)
5812 /* SUB(S) (immediate). */
5813 *cost += extra_cost->alu.arith;
5814 return true;
5818 /* Look for SUB (extended register). */
5819 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5821 if (speed)
5822 *cost += extra_cost->alu.extend_arith;
5824 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5825 (enum rtx_code) GET_CODE (op1),
5826 0, speed);
5827 return true;
5830 rtx new_op1 = aarch64_strip_extend (op1);
5832 /* Cost this as an FMA-alike operation. */
5833 if ((GET_CODE (new_op1) == MULT
5834 || GET_CODE (new_op1) == ASHIFT)
5835 && code != COMPARE)
5837 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5838 (enum rtx_code) code,
5839 speed);
5840 *cost += rtx_cost (op0, MINUS, 0, speed);
5841 return true;
5844 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5846 if (speed)
5848 if (GET_MODE_CLASS (mode) == MODE_INT)
5849 /* SUB(S). */
5850 *cost += extra_cost->alu.arith;
5851 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5852 /* FSUB. */
5853 *cost += extra_cost->fp[mode == DFmode].addsub;
5855 return true;
5858 case PLUS:
5860 rtx new_op0;
5862 op0 = XEXP (x, 0);
5863 op1 = XEXP (x, 1);
5865 cost_plus:
5866 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5867 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5869 /* CSINC. */
5870 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5871 *cost += rtx_cost (op1, PLUS, 1, speed);
5872 return true;
5875 if (GET_MODE_CLASS (mode) == MODE_INT
5876 && CONST_INT_P (op1)
5877 && aarch64_uimm12_shift (INTVAL (op1)))
5879 *cost += rtx_cost (op0, PLUS, 0, speed);
5881 if (speed)
5882 /* ADD (immediate). */
5883 *cost += extra_cost->alu.arith;
5884 return true;
5887 /* Look for ADD (extended register). */
5888 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5890 if (speed)
5891 *cost += extra_cost->alu.extend_arith;
5893 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5894 (enum rtx_code) GET_CODE (op0),
5895 0, speed);
5896 return true;
5899 /* Strip any extend, leave shifts behind as we will
5900 cost them through mult_cost. */
5901 new_op0 = aarch64_strip_extend (op0);
5903 if (GET_CODE (new_op0) == MULT
5904 || GET_CODE (new_op0) == ASHIFT)
5906 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5907 speed);
5908 *cost += rtx_cost (op1, PLUS, 1, speed);
5909 return true;
5912 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5913 + rtx_cost (op1, PLUS, 1, speed));
5915 if (speed)
5917 if (GET_MODE_CLASS (mode) == MODE_INT)
5918 /* ADD. */
5919 *cost += extra_cost->alu.arith;
5920 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5921 /* FADD. */
5922 *cost += extra_cost->fp[mode == DFmode].addsub;
5924 return true;
5927 case BSWAP:
5928 *cost = COSTS_N_INSNS (1);
5930 if (speed)
5931 *cost += extra_cost->alu.rev;
5933 return false;
5935 case IOR:
5936 if (aarch_rev16_p (x))
5938 *cost = COSTS_N_INSNS (1);
5940 if (speed)
5941 *cost += extra_cost->alu.rev;
5943 return true;
5945 /* Fall through. */
5946 case XOR:
5947 case AND:
5948 cost_logic:
5949 op0 = XEXP (x, 0);
5950 op1 = XEXP (x, 1);
5952 if (code == AND
5953 && GET_CODE (op0) == MULT
5954 && CONST_INT_P (XEXP (op0, 1))
5955 && CONST_INT_P (op1)
5956 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5957 INTVAL (op1)) != 0)
5959 /* This is a UBFM/SBFM. */
5960 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5961 if (speed)
5962 *cost += extra_cost->alu.bfx;
5963 return true;
5966 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5968 /* We possibly get the immediate for free; this is not
5969 modelled. */
5970 if (CONST_INT_P (op1)
5971 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5973 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5975 if (speed)
5976 *cost += extra_cost->alu.logical;
5978 return true;
5980 else
5982 rtx new_op0 = op0;
5984 /* Handle ORN, EON, or BIC. */
5985 if (GET_CODE (op0) == NOT)
5986 op0 = XEXP (op0, 0);
5988 new_op0 = aarch64_strip_shift (op0);
5990 /* If we had a shift on op0 then this is a logical-shift-
5991 by-register/immediate operation. Otherwise, this is just
5992 a logical operation. */
5993 if (speed)
5995 if (new_op0 != op0)
5997 /* Shift by immediate. */
5998 if (CONST_INT_P (XEXP (op0, 1)))
5999 *cost += extra_cost->alu.log_shift;
6000 else
6001 *cost += extra_cost->alu.log_shift_reg;
6003 else
6004 *cost += extra_cost->alu.logical;
6007 /* In both cases we want to cost both operands. */
6008 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6009 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6011 return true;
6014 return false;
6016 case NOT:
6017 /* MVN. */
6018 if (speed)
6019 *cost += extra_cost->alu.logical;
6021 /* The logical instruction could have the shifted register form,
6022 but the cost is the same if the shift is processed as a separate
6023 instruction, so we don't bother with it here. */
6024 return false;
6026 case ZERO_EXTEND:
6028 op0 = XEXP (x, 0);
6029 /* If a value is written in SI mode, then zero extended to DI
6030 mode, the operation will in general be free as a write to
6031 a 'w' register implicitly zeroes the upper bits of an 'x'
6032 register. However, if this is
6034 (set (reg) (zero_extend (reg)))
6036 we must cost the explicit register move. */
6037 if (mode == DImode
6038 && GET_MODE (op0) == SImode
6039 && outer == SET)
6041 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6043 if (!op_cost && speed)
6044 /* MOV. */
6045 *cost += extra_cost->alu.extend;
6046 else
6047 /* Free, the cost is that of the SI mode operation. */
6048 *cost = op_cost;
6050 return true;
6052 else if (MEM_P (XEXP (x, 0)))
6054 /* All loads can zero extend to any size for free. */
6055 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6056 return true;
6059 /* UXTB/UXTH. */
6060 if (speed)
6061 *cost += extra_cost->alu.extend;
6063 return false;
6065 case SIGN_EXTEND:
6066 if (MEM_P (XEXP (x, 0)))
6068 /* LDRSH. */
6069 if (speed)
6071 rtx address = XEXP (XEXP (x, 0), 0);
6072 *cost += extra_cost->ldst.load_sign_extend;
6074 *cost +=
6075 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6076 0, speed));
6078 return true;
6081 if (speed)
6082 *cost += extra_cost->alu.extend;
6083 return false;
6085 case ASHIFT:
6086 op0 = XEXP (x, 0);
6087 op1 = XEXP (x, 1);
6089 if (CONST_INT_P (op1))
6091 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6092 aliases. */
6093 if (speed)
6094 *cost += extra_cost->alu.shift;
6096 /* We can incorporate zero/sign extend for free. */
6097 if (GET_CODE (op0) == ZERO_EXTEND
6098 || GET_CODE (op0) == SIGN_EXTEND)
6099 op0 = XEXP (op0, 0);
6101 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6102 return true;
6104 else
6106 /* LSLV. */
6107 if (speed)
6108 *cost += extra_cost->alu.shift_reg;
6110 return false; /* All arguments need to be in registers. */
6113 case ROTATE:
6114 case ROTATERT:
6115 case LSHIFTRT:
6116 case ASHIFTRT:
6117 op0 = XEXP (x, 0);
6118 op1 = XEXP (x, 1);
6120 if (CONST_INT_P (op1))
6122 /* ASR (immediate) and friends. */
6123 if (speed)
6124 *cost += extra_cost->alu.shift;
6126 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6127 return true;
6129 else
6132 /* ASR (register) and friends. */
6133 if (speed)
6134 *cost += extra_cost->alu.shift_reg;
6136 return false; /* All arguments need to be in registers. */
6139 case SYMBOL_REF:
6141 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6143 /* LDR. */
6144 if (speed)
6145 *cost += extra_cost->ldst.load;
6147 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6148 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6150 /* ADRP, followed by ADD. */
6151 *cost += COSTS_N_INSNS (1);
6152 if (speed)
6153 *cost += 2 * extra_cost->alu.arith;
6155 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6156 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6158 /* ADR. */
6159 if (speed)
6160 *cost += extra_cost->alu.arith;
6163 if (flag_pic)
6165 /* One extra load instruction, after accessing the GOT. */
6166 *cost += COSTS_N_INSNS (1);
6167 if (speed)
6168 *cost += extra_cost->ldst.load;
6170 return true;
6172 case HIGH:
6173 case LO_SUM:
6174 /* ADRP/ADD (immediate). */
6175 if (speed)
6176 *cost += extra_cost->alu.arith;
6177 return true;
6179 case ZERO_EXTRACT:
6180 case SIGN_EXTRACT:
6181 /* UBFX/SBFX. */
6182 if (speed)
6183 *cost += extra_cost->alu.bfx;
6185 /* We can trust that the immediates used will be correct (there
6186 are no by-register forms), so we need only cost op0. */
6187 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6188 return true;
6190 case MULT:
6191 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6192 /* aarch64_rtx_mult_cost always handles recursion to its
6193 operands. */
6194 return true;
6196 case MOD:
6197 case UMOD:
6198 if (speed)
6200 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6201 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6202 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6203 else if (GET_MODE (x) == DFmode)
6204 *cost += (extra_cost->fp[1].mult
6205 + extra_cost->fp[1].div);
6206 else if (GET_MODE (x) == SFmode)
6207 *cost += (extra_cost->fp[0].mult
6208 + extra_cost->fp[0].div);
6210 return false; /* All arguments need to be in registers. */
6212 case DIV:
6213 case UDIV:
6214 case SQRT:
6215 if (speed)
6217 if (GET_MODE_CLASS (mode) == MODE_INT)
6218 /* There is no integer SQRT, so only DIV and UDIV can get
6219 here. */
6220 *cost += extra_cost->mult[mode == DImode].idiv;
6221 else
6222 *cost += extra_cost->fp[mode == DFmode].div;
6224 return false; /* All arguments need to be in registers. */
6226 case IF_THEN_ELSE:
6227 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6228 XEXP (x, 2), cost, speed);
6230 case EQ:
6231 case NE:
6232 case GT:
6233 case GTU:
6234 case LT:
6235 case LTU:
6236 case GE:
6237 case GEU:
6238 case LE:
6239 case LEU:
6241 return false; /* All arguments must be in registers. */
6243 case FMA:
6244 op0 = XEXP (x, 0);
6245 op1 = XEXP (x, 1);
6246 op2 = XEXP (x, 2);
6248 if (speed)
6249 *cost += extra_cost->fp[mode == DFmode].fma;
6251 /* FMSUB, FNMADD, and FNMSUB are free. */
6252 if (GET_CODE (op0) == NEG)
6253 op0 = XEXP (op0, 0);
6255 if (GET_CODE (op2) == NEG)
6256 op2 = XEXP (op2, 0);
6258 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6259 and the by-element operand as operand 0. */
6260 if (GET_CODE (op1) == NEG)
6261 op1 = XEXP (op1, 0);
6263 /* Catch vector-by-element operations. The by-element operand can
6264 either be (vec_duplicate (vec_select (x))) or just
6265 (vec_select (x)), depending on whether we are multiplying by
6266 a vector or a scalar.
6268 Canonicalization is not very good in these cases: FMA4 will put the
6269 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6270 if (GET_CODE (op0) == VEC_DUPLICATE)
6271 op0 = XEXP (op0, 0);
6272 else if (GET_CODE (op1) == VEC_DUPLICATE)
6273 op1 = XEXP (op1, 0);
6275 if (GET_CODE (op0) == VEC_SELECT)
6276 op0 = XEXP (op0, 0);
6277 else if (GET_CODE (op1) == VEC_SELECT)
6278 op1 = XEXP (op1, 0);
6280 /* If the remaining parameters are not registers,
6281 get the cost to put them into registers. */
6282 *cost += rtx_cost (op0, FMA, 0, speed);
6283 *cost += rtx_cost (op1, FMA, 1, speed);
6284 *cost += rtx_cost (op2, FMA, 2, speed);
6285 return true;
6287 case FLOAT_EXTEND:
6288 if (speed)
6289 *cost += extra_cost->fp[mode == DFmode].widen;
6290 return false;
6292 case FLOAT_TRUNCATE:
6293 if (speed)
6294 *cost += extra_cost->fp[mode == DFmode].narrow;
6295 return false;
6297 case FIX:
6298 case UNSIGNED_FIX:
6299 x = XEXP (x, 0);
6300 /* Strip the rounding part. They will all be implemented
6301 by the fcvt* family of instructions anyway. */
6302 if (GET_CODE (x) == UNSPEC)
6304 unsigned int uns_code = XINT (x, 1);
6306 if (uns_code == UNSPEC_FRINTA
6307 || uns_code == UNSPEC_FRINTM
6308 || uns_code == UNSPEC_FRINTN
6309 || uns_code == UNSPEC_FRINTP
6310 || uns_code == UNSPEC_FRINTZ)
6311 x = XVECEXP (x, 0, 0);
6314 if (speed)
6315 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6317 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6318 return true;
6320 case ABS:
6321 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6323 /* FABS and FNEG are analogous. */
6324 if (speed)
6325 *cost += extra_cost->fp[mode == DFmode].neg;
6327 else
6329 /* Integer ABS will either be split into
6330 two arithmetic instructions, or will be an ABS
6331 (scalar), which we don't model. */
6332 *cost = COSTS_N_INSNS (2);
6333 if (speed)
6334 *cost += 2 * extra_cost->alu.arith;
6336 return false;
6338 case SMAX:
6339 case SMIN:
6340 if (speed)
6342 /* FMAXNM/FMINNM/FMAX/FMIN.
6343 TODO: This may not be accurate for all implementations, but
6344 we do not model this in the cost tables. */
6345 *cost += extra_cost->fp[mode == DFmode].addsub;
6347 return false;
6349 case UNSPEC:
6350 /* The floating point round to integer frint* instructions. */
6351 if (aarch64_frint_unspec_p (XINT (x, 1)))
6353 if (speed)
6354 *cost += extra_cost->fp[mode == DFmode].roundint;
6356 return false;
6359 if (XINT (x, 1) == UNSPEC_RBIT)
6361 if (speed)
6362 *cost += extra_cost->alu.rev;
6364 return false;
6366 break;
6368 case TRUNCATE:
6370 /* Decompose <su>muldi3_highpart. */
6371 if (/* (truncate:DI */
6372 mode == DImode
6373 /* (lshiftrt:TI */
6374 && GET_MODE (XEXP (x, 0)) == TImode
6375 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6376 /* (mult:TI */
6377 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6378 /* (ANY_EXTEND:TI (reg:DI))
6379 (ANY_EXTEND:TI (reg:DI))) */
6380 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6381 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6382 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6383 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6384 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6385 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6386 /* (const_int 64) */
6387 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6388 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6390 /* UMULH/SMULH. */
6391 if (speed)
6392 *cost += extra_cost->mult[mode == DImode].extend;
6393 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6394 MULT, 0, speed);
6395 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6396 MULT, 1, speed);
6397 return true;
6400 /* Fall through. */
6401 default:
6402 break;
6405 if (dump_file && (dump_flags & TDF_DETAILS))
6406 fprintf (dump_file,
6407 "\nFailed to cost RTX. Assuming default cost.\n");
6409 return true;
6412 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6413 calculated for X. This cost is stored in *COST. Returns true
6414 if the total cost of X was calculated. */
6415 static bool
6416 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6417 int param, int *cost, bool speed)
6419 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6421 if (dump_file && (dump_flags & TDF_DETAILS))
6423 print_rtl_single (dump_file, x);
6424 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6425 speed ? "Hot" : "Cold",
6426 *cost, result ? "final" : "partial");
6429 return result;
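/* Implement TARGET_REGISTER_MOVE_COST: return the cost of moving a value
   of mode MODE from register class FROM_I to register class TO_I, taken
   from the tuning target's regmove cost table.  */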
6432 static int
6433 aarch64_register_move_cost (machine_mode mode,
6434 reg_class_t from_i, reg_class_t to_i)
6436 enum reg_class from = (enum reg_class) from_i;
6437 enum reg_class to = (enum reg_class) to_i;
6438 const struct cpu_regmove_cost *regmove_cost
6439 = aarch64_tune_params->regmove_cost;
6441 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6442 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6443 to = GENERAL_REGS;
6445 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6446 from = GENERAL_REGS;
6448 /* Moving between a GPR and the stack costs the same as GP2GP. */
6449 if ((from == GENERAL_REGS && to == STACK_REG)
6450 || (to == GENERAL_REGS && from == STACK_REG))
6451 return regmove_cost->GP2GP;
6453 /* To/From the stack register, we move via the gprs. */
6454 if (to == STACK_REG || from == STACK_REG)
6455 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6456 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6458 if (GET_MODE_SIZE (mode) == 16)
6460 /* 128-bit operations on general registers require 2 instructions. */
6461 if (from == GENERAL_REGS && to == GENERAL_REGS)
6462 return regmove_cost->GP2GP * 2;
6463 else if (from == GENERAL_REGS)
6464 return regmove_cost->GP2FP * 2;
6465 else if (to == GENERAL_REGS)
6466 return regmove_cost->FP2GP * 2;
6468 /* When AdvSIMD instructions are disabled it is not possible to move
6469 a 128-bit value directly between Q registers. This is handled in
6470 secondary reload. A general register is used as a scratch to move
6471 the upper DI value and the lower DI value is moved directly,
6472 hence the cost is the sum of three moves. */
6473 if (! TARGET_SIMD)
6474 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6476 return regmove_cost->FP2FP;
6479 if (from == GENERAL_REGS && to == GENERAL_REGS)
6480 return regmove_cost->GP2GP;
6481 else if (from == GENERAL_REGS)
6482 return regmove_cost->GP2FP;
6483 else if (to == GENERAL_REGS)
6484 return regmove_cost->FP2GP;
6486 return regmove_cost->FP2FP;
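/* Illustrative arithmetic only, using hypothetical cost-table values
   (not taken from any real tuning): if GP2GP = 1, GP2FP = 5, FP2GP = 5
   and FP2FP = 2, then for a 16-byte mode the code above returns
   1 * 2 = 2 for GP<->GP, 5 * 2 = 10 for GP->FP, and, when !TARGET_SIMD,
   5 + 5 + 2 = 12 for FP->FP because the upper doubleword is bounced
   through a general register.  */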
6489 static int
6490 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6491 reg_class_t rclass ATTRIBUTE_UNUSED,
6492 bool in ATTRIBUTE_UNUSED)
6494 return aarch64_tune_params->memmov_cost;
6497 /* Return the number of instructions that can be issued per cycle. */
6498 static int
6499 aarch64_sched_issue_rate (void)
6501 return aarch64_tune_params->issue_rate;
6504 static int
6505 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6507 int issue_rate = aarch64_sched_issue_rate ();
6509 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6512 /* Vectorizer cost model target hooks. */
6514 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6515 static int
6516 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6517 tree vectype,
6518 int misalign ATTRIBUTE_UNUSED)
6520 unsigned elements;
6522 switch (type_of_cost)
6524 case scalar_stmt:
6525 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6527 case scalar_load:
6528 return aarch64_tune_params->vec_costs->scalar_load_cost;
6530 case scalar_store:
6531 return aarch64_tune_params->vec_costs->scalar_store_cost;
6533 case vector_stmt:
6534 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6536 case vector_load:
6537 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6539 case vector_store:
6540 return aarch64_tune_params->vec_costs->vec_store_cost;
6542 case vec_to_scalar:
6543 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6545 case scalar_to_vec:
6546 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6548 case unaligned_load:
6549 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6551 case unaligned_store:
6552 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6554 case cond_branch_taken:
6555 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6557 case cond_branch_not_taken:
6558 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6560 case vec_perm:
6561 case vec_promote_demote:
6562 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6564 case vec_construct:
6565 elements = TYPE_VECTOR_SUBPARTS (vectype);
6566 return elements / 2 + 1;
6568 default:
6569 gcc_unreachable ();
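/* For example, constructing a V4SImode vector element by element
   (vec_construct with TYPE_VECTOR_SUBPARTS == 4) is costed as
   4 / 2 + 1 = 3 units by the formula above.  */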
6573 /* Implement targetm.vectorize.add_stmt_cost. */
6574 static unsigned
6575 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6576 struct _stmt_vec_info *stmt_info, int misalign,
6577 enum vect_cost_model_location where)
6579 unsigned *cost = (unsigned *) data;
6580 unsigned retval = 0;
6582 if (flag_vect_cost_model)
6584 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6585 int stmt_cost =
6586 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6588 /* Statements in an inner loop relative to the loop being
6589 vectorized are weighted more heavily. The value here is
6590 a function (linear for now) of the loop nest level. */
6591 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6593 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6594 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6595 unsigned nest_level = loop_depth (loop);
6597 count *= nest_level;
6600 retval = (unsigned) (count * stmt_cost);
6601 cost[where] += retval;
6604 return retval;
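/* For example, a vector statement whose base cost is 1, counted once
   in the body of a loop at nest depth 2, contributes 1 * 2 * 1 = 2
   units to the vect_body bucket because of the inner-loop weighting
   above.  */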
6607 static void initialize_aarch64_code_model (void);
6609 /* Parse the architecture extension string. */
6611 static void
6612 aarch64_parse_extension (char *str)
6614 /* The extension string is parsed left to right. */
6615 const struct aarch64_option_extension *opt = NULL;
6617 /* Flag to say whether we are adding or removing an extension. */
6618 int adding_ext = -1;
6620 while (str != NULL && *str != 0)
6622 char *ext;
6623 size_t len;
6625 str++;
6626 ext = strchr (str, '+');
6628 if (ext != NULL)
6629 len = ext - str;
6630 else
6631 len = strlen (str);
6633 if (len >= 2 && strncmp (str, "no", 2) == 0)
6635 adding_ext = 0;
6636 len -= 2;
6637 str += 2;
6639 else if (len > 0)
6640 adding_ext = 1;
6642 if (len == 0)
6644 error ("missing feature modifier after %qs", adding_ext ? "+"
6645 : "+no");
6646 return;
6649 /* Scan over the extensions table trying to find an exact match. */
6650 for (opt = all_extensions; opt->name != NULL; opt++)
6652 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6654 /* Add or remove the extension. */
6655 if (adding_ext)
6656 aarch64_isa_flags |= opt->flags_on;
6657 else
6658 aarch64_isa_flags &= ~(opt->flags_off);
6659 break;
6663 if (opt->name == NULL)
6665 /* Extension not found in list. */
6666 error ("unknown feature modifier %qs", str);
6667 return;
6670 str = ext;
6673 return;
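/* An illustrative walk-through (the extension names here are just
   examples): given the suffix "+crc+nofp" of an -march or -mcpu value,
   the first iteration matches "crc" and ORs in its flags_on bits; the
   second sees the "no" prefix, switches to removing, matches "fp" and
   clears its flags_off bits.  A trailing "+" with nothing after it is
   reported as a missing feature modifier.  */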
6676 /* Parse the ARCH string. */
6678 static void
6679 aarch64_parse_arch (void)
6681 char *ext;
6682 const struct processor *arch;
6683 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6684 size_t len;
6686 strcpy (str, aarch64_arch_string);
6688 ext = strchr (str, '+');
6690 if (ext != NULL)
6691 len = ext - str;
6692 else
6693 len = strlen (str);
6695 if (len == 0)
6697 error ("missing arch name in -march=%qs", str);
6698 return;
6701 /* Loop through the list of supported ARCHs to find a match. */
6702 for (arch = all_architectures; arch->name != NULL; arch++)
6704 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6706 selected_arch = arch;
6707 aarch64_isa_flags = selected_arch->flags;
6709 if (!selected_cpu)
6710 selected_cpu = &all_cores[selected_arch->core];
6712 if (ext != NULL)
6714 /* ARCH string contains at least one extension. */
6715 aarch64_parse_extension (ext);
6718 if (strcmp (selected_arch->arch, selected_cpu->arch))
6720 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6721 selected_cpu->name, selected_arch->name);
6724 return;
6728 /* ARCH name not found in list. */
6729 error ("unknown value %qs for -march", str);
6730 return;
6733 /* Parse the CPU string. */
6735 static void
6736 aarch64_parse_cpu (void)
6738 char *ext;
6739 const struct processor *cpu;
6740 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6741 size_t len;
6743 strcpy (str, aarch64_cpu_string);
6745 ext = strchr (str, '+');
6747 if (ext != NULL)
6748 len = ext - str;
6749 else
6750 len = strlen (str);
6752 if (len == 0)
6754 error ("missing cpu name in -mcpu=%qs", str);
6755 return;
6758 /* Loop through the list of supported CPUs to find a match. */
6759 for (cpu = all_cores; cpu->name != NULL; cpu++)
6761 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6763 selected_cpu = cpu;
6764 aarch64_isa_flags = selected_cpu->flags;
6766 if (ext != NULL)
6768 /* CPU string contains at least one extension. */
6769 aarch64_parse_extension (ext);
6772 return;
6776 /* CPU name not found in list. */
6777 error ("unknown value %qs for -mcpu", str);
6778 return;
6781 /* Parse the TUNE string. */
6783 static void
6784 aarch64_parse_tune (void)
6786 const struct processor *cpu;
6787 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6788 strcpy (str, aarch64_tune_string);
6790 /* Loop through the list of supported CPUs to find a match. */
6791 for (cpu = all_cores; cpu->name != NULL; cpu++)
6793 if (strcmp (cpu->name, str) == 0)
6795 selected_tune = cpu;
6796 return;
6800 /* CPU name not found in list. */
6801 error ("unknown value %qs for -mtune", str);
6802 return;
6806 /* Implement TARGET_OPTION_OVERRIDE. */
6808 static void
6809 aarch64_override_options (void)
6811 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6812 If either of -march or -mtune is given, they override their
6813 respective component of -mcpu.
6815 So, first parse AARCH64_CPU_STRING, then the others; be careful
6816 with -march because, if -mcpu is not present on the command line,
6817 -march must set a sensible default CPU. */
6818 if (aarch64_cpu_string)
6820 aarch64_parse_cpu ();
6823 if (aarch64_arch_string)
6825 aarch64_parse_arch ();
6828 if (aarch64_tune_string)
6830 aarch64_parse_tune ();
6833 #ifndef HAVE_AS_MABI_OPTION
6834 /* The compiler may have been configured with 2.23.* binutils, which does
6835 not have support for ILP32. */
6836 if (TARGET_ILP32)
6837 error ("Assembler does not support -mabi=ilp32");
6838 #endif
6840 initialize_aarch64_code_model ();
6842 aarch64_build_bitmask_table ();
6844 /* This target defaults to strict volatile bitfields. */
6845 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6846 flag_strict_volatile_bitfields = 1;
6848 /* If the user did not specify a processor, choose the default
6849 one for them. This will be the CPU set during configuration using
6850 --with-cpu, otherwise it is "generic". */
6851 if (!selected_cpu)
6853 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6854 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6857 gcc_assert (selected_cpu);
6859 if (!selected_tune)
6860 selected_tune = selected_cpu;
6862 aarch64_tune_flags = selected_tune->flags;
6863 aarch64_tune = selected_tune->core;
6864 aarch64_tune_params = selected_tune->tune;
6865 aarch64_architecture_version = selected_cpu->architecture_version;
6867 if (aarch64_fix_a53_err835769 == 2)
6869 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6870 aarch64_fix_a53_err835769 = 1;
6871 #else
6872 aarch64_fix_a53_err835769 = 0;
6873 #endif
6876 /* If not optimizing for size, set the default
6877 alignment to what the target wants. */
6878 if (!optimize_size)
6880 if (align_loops <= 0)
6881 align_loops = aarch64_tune_params->loop_align;
6882 if (align_jumps <= 0)
6883 align_jumps = aarch64_tune_params->jump_align;
6884 if (align_functions <= 0)
6885 align_functions = aarch64_tune_params->function_align;
6888 if (AARCH64_TUNE_FMA_STEERING)
6889 aarch64_register_fma_steering ();
6891 aarch64_override_options_after_change ();
6894 /* Implement targetm.override_options_after_change. */
6896 static void
6897 aarch64_override_options_after_change (void)
6899 if (flag_omit_frame_pointer)
6900 flag_omit_leaf_frame_pointer = false;
6901 else if (flag_omit_leaf_frame_pointer)
6902 flag_omit_frame_pointer = true;
6905 static struct machine_function *
6906 aarch64_init_machine_status (void)
6908 struct machine_function *machine;
6909 machine = ggc_cleared_alloc<machine_function> ();
6910 return machine;
6913 void
6914 aarch64_init_expanders (void)
6916 init_machine_status = aarch64_init_machine_status;
6919 /* A checking mechanism for the implementation of the various code models. */
6920 static void
6921 initialize_aarch64_code_model (void)
6923 if (flag_pic)
6925 switch (aarch64_cmodel_var)
6927 case AARCH64_CMODEL_TINY:
6928 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6929 break;
6930 case AARCH64_CMODEL_SMALL:
6931 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6932 break;
6933 case AARCH64_CMODEL_LARGE:
6934 sorry ("code model %qs with -f%s", "large",
6935 flag_pic > 1 ? "PIC" : "pic");
6936 default:
6937 gcc_unreachable ();
6940 else
6941 aarch64_cmodel = aarch64_cmodel_var;
6944 /* Return true if SYMBOL_REF X binds locally. */
6946 static bool
6947 aarch64_symbol_binds_local_p (const_rtx x)
6949 return (SYMBOL_REF_DECL (x)
6950 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6951 : SYMBOL_REF_LOCAL_P (x));
6954 /* Return true if SYMBOL_REF X is thread-local. */
6955 static bool
6956 aarch64_tls_symbol_p (rtx x)
6958 if (! TARGET_HAVE_TLS)
6959 return false;
6961 if (GET_CODE (x) != SYMBOL_REF)
6962 return false;
6964 return SYMBOL_REF_TLS_MODEL (x) != 0;
6967 /* Classify a TLS symbol into one of the TLS kinds. */
6968 enum aarch64_symbol_type
6969 aarch64_classify_tls_symbol (rtx x)
6971 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6973 switch (tls_kind)
6975 case TLS_MODEL_GLOBAL_DYNAMIC:
6976 case TLS_MODEL_LOCAL_DYNAMIC:
6977 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6979 case TLS_MODEL_INITIAL_EXEC:
6980 return SYMBOL_SMALL_GOTTPREL;
6982 case TLS_MODEL_LOCAL_EXEC:
6983 return SYMBOL_SMALL_TPREL;
6985 case TLS_MODEL_EMULATED:
6986 case TLS_MODEL_NONE:
6987 return SYMBOL_FORCE_TO_MEM;
6989 default:
6990 gcc_unreachable ();
6994 /* Return the method that should be used to access SYMBOL_REF or
6995 LABEL_REF X in context CONTEXT. */
6997 enum aarch64_symbol_type
6998 aarch64_classify_symbol (rtx x, rtx offset,
6999 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7001 if (GET_CODE (x) == LABEL_REF)
7003 switch (aarch64_cmodel)
7005 case AARCH64_CMODEL_LARGE:
7006 return SYMBOL_FORCE_TO_MEM;
7008 case AARCH64_CMODEL_TINY_PIC:
7009 case AARCH64_CMODEL_TINY:
7010 return SYMBOL_TINY_ABSOLUTE;
7012 case AARCH64_CMODEL_SMALL_PIC:
7013 case AARCH64_CMODEL_SMALL:
7014 return SYMBOL_SMALL_ABSOLUTE;
7016 default:
7017 gcc_unreachable ();
7021 if (GET_CODE (x) == SYMBOL_REF)
7023 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7024 return SYMBOL_FORCE_TO_MEM;
7026 if (aarch64_tls_symbol_p (x))
7027 return aarch64_classify_tls_symbol (x);
7029 switch (aarch64_cmodel)
7031 case AARCH64_CMODEL_TINY:
7032 /* When we retrieve a symbol + offset address, we have to make sure
7033 the offset does not cause overflow of the final address. But
7034 we have no way of knowing the address of the symbol at compile time,
7035 so we can't accurately say whether the distance between the PC and
7036 symbol + offset is outside the addressable range of +/-1M in the
7037 TINY code model. So we rely on images not being greater than
7038 1M, cap the offset at 1M, and require anything beyond 1M to
7039 be loaded using an alternative mechanism. */
7040 if (SYMBOL_REF_WEAK (x)
7041 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7042 return SYMBOL_FORCE_TO_MEM;
7043 return SYMBOL_TINY_ABSOLUTE;
7045 case AARCH64_CMODEL_SMALL:
7046 /* Same reasoning as the tiny code model, but the offset cap here is
7047 4G. */
7048 if (SYMBOL_REF_WEAK (x)
7049 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7050 HOST_WIDE_INT_C (4294967264)))
7051 return SYMBOL_FORCE_TO_MEM;
7052 return SYMBOL_SMALL_ABSOLUTE;
7054 case AARCH64_CMODEL_TINY_PIC:
7055 if (!aarch64_symbol_binds_local_p (x))
7056 return SYMBOL_TINY_GOT;
7057 return SYMBOL_TINY_ABSOLUTE;
7059 case AARCH64_CMODEL_SMALL_PIC:
7060 if (!aarch64_symbol_binds_local_p (x))
7061 return SYMBOL_SMALL_GOT;
7062 return SYMBOL_SMALL_ABSOLUTE;
7064 default:
7065 gcc_unreachable ();
7069 /* By default push everything into the constant pool. */
7070 return SYMBOL_FORCE_TO_MEM;
7073 bool
7074 aarch64_constant_address_p (rtx x)
7076 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7079 bool
7080 aarch64_legitimate_pic_operand_p (rtx x)
7082 if (GET_CODE (x) == SYMBOL_REF
7083 || (GET_CODE (x) == CONST
7084 && GET_CODE (XEXP (x, 0)) == PLUS
7085 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7086 return false;
7088 return true;
7091 /* Return true if X holds either a quarter-precision or
7092 floating-point +0.0 constant. */
7093 static bool
7094 aarch64_valid_floating_const (machine_mode mode, rtx x)
7096 if (!CONST_DOUBLE_P (x))
7097 return false;
7099 /* TODO: We could handle moving 0.0 to a TFmode register,
7100 but first we would like to refactor the movtf_aarch64
7101 pattern to be more amenable to splitting moves properly and
7102 to gating correctly on TARGET_SIMD. For now, reject all
7103 constants that are not destined for SFmode or DFmode registers. */
7104 if (!(mode == SFmode || mode == DFmode))
7105 return false;
7107 if (aarch64_float_const_zero_rtx_p (x))
7108 return true;
7109 return aarch64_float_const_representable_p (x);
7112 static bool
7113 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7115 /* Do not allow vector struct mode constants. We could support
7116 0 and -1 easily, but they need support in aarch64-simd.md. */
7117 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7118 return false;
7120 /* This could probably go away because
7121 we now decompose CONST_INTs according to expand_mov_immediate. */
7122 if ((GET_CODE (x) == CONST_VECTOR
7123 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7124 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7125 return !targetm.cannot_force_const_mem (mode, x);
7127 if (GET_CODE (x) == HIGH
7128 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7129 return true;
7131 return aarch64_constant_address_p (x);
7135 aarch64_load_tp (rtx target)
7137 if (!target
7138 || GET_MODE (target) != Pmode
7139 || !register_operand (target, Pmode))
7140 target = gen_reg_rtx (Pmode);
7142 /* Can return in any reg. */
7143 emit_insn (gen_aarch64_load_tp_hard (target));
7144 return target;
7147 /* On AAPCS systems, this is the "struct __va_list". */
7148 static GTY(()) tree va_list_type;
7150 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7151 Return the type to use as __builtin_va_list.
7153 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7155 struct __va_list
7157 void *__stack;
7158 void *__gr_top;
7159 void *__vr_top;
7160 int __gr_offs;
7161 int __vr_offs;
7162 }; */
7164 static tree
7165 aarch64_build_builtin_va_list (void)
7167 tree va_list_name;
7168 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7170 /* Create the type. */
7171 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7172 /* Give it the required name. */
7173 va_list_name = build_decl (BUILTINS_LOCATION,
7174 TYPE_DECL,
7175 get_identifier ("__va_list"),
7176 va_list_type);
7177 DECL_ARTIFICIAL (va_list_name) = 1;
7178 TYPE_NAME (va_list_type) = va_list_name;
7179 TYPE_STUB_DECL (va_list_type) = va_list_name;
7181 /* Create the fields. */
7182 f_stack = build_decl (BUILTINS_LOCATION,
7183 FIELD_DECL, get_identifier ("__stack"),
7184 ptr_type_node);
7185 f_grtop = build_decl (BUILTINS_LOCATION,
7186 FIELD_DECL, get_identifier ("__gr_top"),
7187 ptr_type_node);
7188 f_vrtop = build_decl (BUILTINS_LOCATION,
7189 FIELD_DECL, get_identifier ("__vr_top"),
7190 ptr_type_node);
7191 f_groff = build_decl (BUILTINS_LOCATION,
7192 FIELD_DECL, get_identifier ("__gr_offs"),
7193 integer_type_node);
7194 f_vroff = build_decl (BUILTINS_LOCATION,
7195 FIELD_DECL, get_identifier ("__vr_offs"),
7196 integer_type_node);
7198 DECL_ARTIFICIAL (f_stack) = 1;
7199 DECL_ARTIFICIAL (f_grtop) = 1;
7200 DECL_ARTIFICIAL (f_vrtop) = 1;
7201 DECL_ARTIFICIAL (f_groff) = 1;
7202 DECL_ARTIFICIAL (f_vroff) = 1;
7204 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7205 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7206 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7207 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7208 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7210 TYPE_FIELDS (va_list_type) = f_stack;
7211 DECL_CHAIN (f_stack) = f_grtop;
7212 DECL_CHAIN (f_grtop) = f_vrtop;
7213 DECL_CHAIN (f_vrtop) = f_groff;
7214 DECL_CHAIN (f_groff) = f_vroff;
7216 /* Compute its layout. */
7217 layout_type (va_list_type);
7219 return va_list_type;
7222 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7223 static void
7224 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7226 const CUMULATIVE_ARGS *cum;
7227 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7228 tree stack, grtop, vrtop, groff, vroff;
7229 tree t;
7230 int gr_save_area_size;
7231 int vr_save_area_size;
7232 int vr_offset;
7234 cum = &crtl->args.info;
7235 gr_save_area_size
7236 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7237 vr_save_area_size
7238 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7240 if (TARGET_GENERAL_REGS_ONLY)
7242 if (cum->aapcs_nvrn > 0)
7243 sorry ("%qs and floating point or vector arguments",
7244 "-mgeneral-regs-only");
7245 vr_save_area_size = 0;
7248 f_stack = TYPE_FIELDS (va_list_type_node);
7249 f_grtop = DECL_CHAIN (f_stack);
7250 f_vrtop = DECL_CHAIN (f_grtop);
7251 f_groff = DECL_CHAIN (f_vrtop);
7252 f_vroff = DECL_CHAIN (f_groff);
7254 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7255 NULL_TREE);
7256 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7257 NULL_TREE);
7258 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7259 NULL_TREE);
7260 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7261 NULL_TREE);
7262 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7263 NULL_TREE);
7265 /* Emit code to initialize STACK, which points to the next varargs stack
7266 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7267 by named arguments. STACK is 8-byte aligned. */
7268 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7269 if (cum->aapcs_stack_size > 0)
7270 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7271 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7272 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7274 /* Emit code to initialize GRTOP, the top of the GR save area.
7275 virtual_incoming_args_rtx should have been 16 byte aligned. */
7276 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7277 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7278 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7280 /* Emit code to initialize VRTOP, the top of the VR save area.
7281 This address is gr_save_area_bytes below GRTOP, rounded
7282 down to the next 16-byte boundary. */
7283 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7284 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7285 STACK_BOUNDARY / BITS_PER_UNIT);
7287 if (vr_offset)
7288 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7289 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7290 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7292 /* Emit code to initialize GROFF, the offset from GRTOP of the
7293 next GPR argument. */
7294 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7295 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7296 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7298 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7299 of the next VR argument. */
7300 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7301 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7302 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
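/* A worked example, assuming the usual AAPCS64 parameters
   (NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8, UNITS_PER_WORD == 8,
   UNITS_PER_VREG == 16): if the named arguments consumed three X
   registers and one V register, then gr_save_area_size = (8 - 3) * 8
   = 40 and vr_save_area_size = (8 - 1) * 16 = 112, so __gr_offs is
   initialized to -40, __vr_offs to -112, and __vr_top sits 48 bytes
   (40 rounded up to 16) below __gr_top.  */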
7305 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7307 static tree
7308 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7309 gimple_seq *post_p ATTRIBUTE_UNUSED)
7311 tree addr;
7312 bool indirect_p;
7313 bool is_ha; /* is HFA or HVA. */
7314 bool dw_align; /* double-word align. */
7315 machine_mode ag_mode = VOIDmode;
7316 int nregs;
7317 machine_mode mode;
7319 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7320 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7321 HOST_WIDE_INT size, rsize, adjust, align;
7322 tree t, u, cond1, cond2;
7324 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7325 if (indirect_p)
7326 type = build_pointer_type (type);
7328 mode = TYPE_MODE (type);
7330 f_stack = TYPE_FIELDS (va_list_type_node);
7331 f_grtop = DECL_CHAIN (f_stack);
7332 f_vrtop = DECL_CHAIN (f_grtop);
7333 f_groff = DECL_CHAIN (f_vrtop);
7334 f_vroff = DECL_CHAIN (f_groff);
7336 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7337 f_stack, NULL_TREE);
7338 size = int_size_in_bytes (type);
7339 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7341 dw_align = false;
7342 adjust = 0;
7343 if (aarch64_vfp_is_call_or_return_candidate (mode,
7344 type,
7345 &ag_mode,
7346 &nregs,
7347 &is_ha))
7349 /* TYPE passed in fp/simd registers. */
7350 if (TARGET_GENERAL_REGS_ONLY)
7351 sorry ("%qs and floating point or vector arguments",
7352 "-mgeneral-regs-only");
7354 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7355 unshare_expr (valist), f_vrtop, NULL_TREE);
7356 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7357 unshare_expr (valist), f_vroff, NULL_TREE);
7359 rsize = nregs * UNITS_PER_VREG;
7361 if (is_ha)
7363 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7364 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7366 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7367 && size < UNITS_PER_VREG)
7369 adjust = UNITS_PER_VREG - size;
7372 else
7374 /* TYPE passed in general registers. */
7375 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7376 unshare_expr (valist), f_grtop, NULL_TREE);
7377 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7378 unshare_expr (valist), f_groff, NULL_TREE);
7379 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7380 nregs = rsize / UNITS_PER_WORD;
7382 if (align > 8)
7383 dw_align = true;
7385 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7386 && size < UNITS_PER_WORD)
7388 adjust = UNITS_PER_WORD - size;
7392 /* Get a local temporary for the field value. */
7393 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7395 /* Emit code to branch if off >= 0. */
7396 t = build2 (GE_EXPR, boolean_type_node, off,
7397 build_int_cst (TREE_TYPE (off), 0));
7398 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7400 if (dw_align)
7402 /* Emit: offs = (offs + 15) & -16. */
7403 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7404 build_int_cst (TREE_TYPE (off), 15));
7405 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7406 build_int_cst (TREE_TYPE (off), -16));
7407 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7409 else
7410 roundup = NULL;
7412 /* Update ap.__[g|v]r_offs */
7413 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7414 build_int_cst (TREE_TYPE (off), rsize));
7415 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7417 /* String up. */
7418 if (roundup)
7419 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7421 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7422 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7423 build_int_cst (TREE_TYPE (f_off), 0));
7424 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7426 /* String up: make sure the assignment happens before the use. */
7427 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7428 COND_EXPR_ELSE (cond1) = t;
7430 /* Prepare the trees handling the argument that is passed on the stack;
7431 the top-level node will be stored in ON_STACK. */
7432 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7433 if (align > 8)
7435 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7436 t = fold_convert (intDI_type_node, arg);
7437 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7438 build_int_cst (TREE_TYPE (t), 15));
7439 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7440 build_int_cst (TREE_TYPE (t), -16));
7441 t = fold_convert (TREE_TYPE (arg), t);
7442 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7444 else
7445 roundup = NULL;
7446 /* Advance ap.__stack */
7447 t = fold_convert (intDI_type_node, arg);
7448 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7449 build_int_cst (TREE_TYPE (t), size + 7));
7450 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7451 build_int_cst (TREE_TYPE (t), -8));
7452 t = fold_convert (TREE_TYPE (arg), t);
7453 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7454 /* String up roundup and advance. */
7455 if (roundup)
7456 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7457 /* String up with arg */
7458 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7459 /* Big-endianness related address adjustment. */
7460 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7461 && size < UNITS_PER_WORD)
7463 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7464 size_int (UNITS_PER_WORD - size));
7465 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7468 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7469 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7471 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7472 t = off;
7473 if (adjust)
7474 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7475 build_int_cst (TREE_TYPE (off), adjust));
7477 t = fold_convert (sizetype, t);
7478 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7480 if (is_ha)
7482 /* type ha; // treat as "struct {ftype field[n];}"
7483 ... [computing offs]
7484 for (i = 0; i <nregs; ++i, offs += 16)
7485 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7486 return ha; */
7487 int i;
7488 tree tmp_ha, field_t, field_ptr_t;
7490 /* Declare a local variable. */
7491 tmp_ha = create_tmp_var_raw (type, "ha");
7492 gimple_add_tmp_var (tmp_ha);
7494 /* Establish the base type. */
7495 switch (ag_mode)
7497 case SFmode:
7498 field_t = float_type_node;
7499 field_ptr_t = float_ptr_type_node;
7500 break;
7501 case DFmode:
7502 field_t = double_type_node;
7503 field_ptr_t = double_ptr_type_node;
7504 break;
7505 case TFmode:
7506 field_t = long_double_type_node;
7507 field_ptr_t = long_double_ptr_type_node;
7508 break;
7509 /* Half-precision and quad-precision types are not fully supported yet.
7510 Enable the following code once that support is complete; the correct
7511 type node for __fp16 * still needs to be found. */
7512 #if 0
7513 case HFmode:
7514 field_t = float_type_node;
7515 field_ptr_t = float_ptr_type_node;
7516 break;
7517 #endif
7518 case V2SImode:
7519 case V4SImode:
7521 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7522 field_t = build_vector_type_for_mode (innertype, ag_mode);
7523 field_ptr_t = build_pointer_type (field_t);
7525 break;
7526 default:
7527 gcc_assert (0);
7530 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7531 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7532 addr = t;
7533 t = fold_convert (field_ptr_t, addr);
7534 t = build2 (MODIFY_EXPR, field_t,
7535 build1 (INDIRECT_REF, field_t, tmp_ha),
7536 build1 (INDIRECT_REF, field_t, t));
7538 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7539 for (i = 1; i < nregs; ++i)
7541 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7542 u = fold_convert (field_ptr_t, addr);
7543 u = build2 (MODIFY_EXPR, field_t,
7544 build2 (MEM_REF, field_t, tmp_ha,
7545 build_int_cst (field_ptr_t,
7546 (i *
7547 int_size_in_bytes (field_t)))),
7548 build1 (INDIRECT_REF, field_t, u));
7549 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7552 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7553 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7556 COND_EXPR_ELSE (cond2) = t;
7557 addr = fold_convert (build_pointer_type (type), cond1);
7558 addr = build_va_arg_indirect_ref (addr);
7560 if (indirect_p)
7561 addr = build_va_arg_indirect_ref (addr);
7563 return addr;
7566 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7568 static void
7569 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7570 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7571 int no_rtl)
7573 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7574 CUMULATIVE_ARGS local_cum;
7575 int gr_saved, vr_saved;
7577 /* The caller has advanced CUM up to, but not beyond, the last named
7578 argument. Advance a local copy of CUM past the last "real" named
7579 argument, to find out how many registers are left over. */
7580 local_cum = *cum;
7581 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7583 /* Find out how many registers we need to save. */
7584 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7585 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7587 if (TARGET_GENERAL_REGS_ONLY)
7589 if (local_cum.aapcs_nvrn > 0)
7590 sorry ("%qs and floating point or vector arguments",
7591 "-mgeneral-regs-only");
7592 vr_saved = 0;
7595 if (!no_rtl)
7597 if (gr_saved > 0)
7599 rtx ptr, mem;
7601 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7602 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7603 - gr_saved * UNITS_PER_WORD);
7604 mem = gen_frame_mem (BLKmode, ptr);
7605 set_mem_alias_set (mem, get_varargs_alias_set ());
7607 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7608 mem, gr_saved);
7610 if (vr_saved > 0)
7612 /* We can't use move_block_from_reg, because it will use
7613 the wrong mode, storing D regs only. */
7614 machine_mode mode = TImode;
7615 int off, i;
7617 /* Set OFF to the offset from virtual_incoming_args_rtx of
7618 the first vector register. The VR save area lies below
7619 the GR one, and is aligned to 16 bytes. */
7620 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7621 STACK_BOUNDARY / BITS_PER_UNIT);
7622 off -= vr_saved * UNITS_PER_VREG;
7624 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7626 rtx ptr, mem;
7628 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7629 mem = gen_frame_mem (mode, ptr);
7630 set_mem_alias_set (mem, get_varargs_alias_set ());
7631 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7632 off += UNITS_PER_VREG;
7637 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7638 any complication of having crtl->args.pretend_args_size changed. */
7639 cfun->machine->frame.saved_varargs_size
7640 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7641 STACK_BOUNDARY / BITS_PER_UNIT)
7642 + vr_saved * UNITS_PER_VREG);
7645 static void
7646 aarch64_conditional_register_usage (void)
7648 int i;
7649 if (!TARGET_FLOAT)
7651 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7653 fixed_regs[i] = 1;
7654 call_used_regs[i] = 1;
7659 /* Walk down the type tree of TYPE counting consecutive base elements.
7660 If *MODEP is VOIDmode, then set it to the first valid floating point
7661 type. If a non-floating point type is found, or if a floating point
7662 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7663 otherwise return the count in the sub-tree. */
7664 static int
7665 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7667 machine_mode mode;
7668 HOST_WIDE_INT size;
7670 switch (TREE_CODE (type))
7672 case REAL_TYPE:
7673 mode = TYPE_MODE (type);
7674 if (mode != DFmode && mode != SFmode && mode != TFmode)
7675 return -1;
7677 if (*modep == VOIDmode)
7678 *modep = mode;
7680 if (*modep == mode)
7681 return 1;
7683 break;
7685 case COMPLEX_TYPE:
7686 mode = TYPE_MODE (TREE_TYPE (type));
7687 if (mode != DFmode && mode != SFmode && mode != TFmode)
7688 return -1;
7690 if (*modep == VOIDmode)
7691 *modep = mode;
7693 if (*modep == mode)
7694 return 2;
7696 break;
7698 case VECTOR_TYPE:
7699 /* Use V2SImode and V4SImode as representatives of all 64-bit
7700 and 128-bit vector types. */
7701 size = int_size_in_bytes (type);
7702 switch (size)
7704 case 8:
7705 mode = V2SImode;
7706 break;
7707 case 16:
7708 mode = V4SImode;
7709 break;
7710 default:
7711 return -1;
7714 if (*modep == VOIDmode)
7715 *modep = mode;
7717 /* Vector modes are considered to be opaque: two vectors are
7718 equivalent for the purposes of being homogeneous aggregates
7719 if they are the same size. */
7720 if (*modep == mode)
7721 return 1;
7723 break;
7725 case ARRAY_TYPE:
7727 int count;
7728 tree index = TYPE_DOMAIN (type);
7730 /* Can't handle incomplete types nor sizes that are not
7731 fixed. */
7732 if (!COMPLETE_TYPE_P (type)
7733 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7734 return -1;
7736 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7737 if (count == -1
7738 || !index
7739 || !TYPE_MAX_VALUE (index)
7740 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7741 || !TYPE_MIN_VALUE (index)
7742 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7743 || count < 0)
7744 return -1;
7746 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7747 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7749 /* There must be no padding. */
7750 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7751 return -1;
7753 return count;
7756 case RECORD_TYPE:
7758 int count = 0;
7759 int sub_count;
7760 tree field;
7762 /* Can't handle incomplete types nor sizes that are not
7763 fixed. */
7764 if (!COMPLETE_TYPE_P (type)
7765 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7766 return -1;
7768 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7770 if (TREE_CODE (field) != FIELD_DECL)
7771 continue;
7773 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7774 if (sub_count < 0)
7775 return -1;
7776 count += sub_count;
7779 /* There must be no padding. */
7780 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7781 return -1;
7783 return count;
7786 case UNION_TYPE:
7787 case QUAL_UNION_TYPE:
7789 /* These aren't very interesting except in a degenerate case. */
7790 int count = 0;
7791 int sub_count;
7792 tree field;
7794 /* Can't handle incomplete types nor sizes that are not
7795 fixed. */
7796 if (!COMPLETE_TYPE_P (type)
7797 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7798 return -1;
7800 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7802 if (TREE_CODE (field) != FIELD_DECL)
7803 continue;
7805 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7806 if (sub_count < 0)
7807 return -1;
7808 count = count > sub_count ? count : sub_count;
7811 /* There must be no padding. */
7812 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7813 return -1;
7815 return count;
7818 default:
7819 break;
7822 return -1;
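/* For example, a type such as

     struct hfa { double x, y, z; };

   yields three REAL_TYPE fields: the first sets *MODEP to DFmode and
   each contributes 1, so the function returns 3 and the struct is a
   homogeneous floating-point aggregate of three doubles.  Adding a
   float member to the same struct would make the field mode disagree
   with *MODEP and the function would return -1.  */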
7825 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7826 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7827 array types. The C99 floating-point complex types are also considered
7828 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7829 types, which are GCC extensions and out of the scope of AAPCS64, are
7830 treated as composite types here as well.
7832 Note that MODE itself is not sufficient in determining whether a type
7833 is such a composite type or not. This is because
7834 stor-layout.c:compute_record_mode may have already changed the MODE
7835 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7836 structure with only one field may have its MODE set to the mode of the
7837 field. Also an integer mode whose size matches the size of the
7838 RECORD_TYPE type may be used to substitute the original mode
7839 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7840 solely relied on. */
7842 static bool
7843 aarch64_composite_type_p (const_tree type,
7844 machine_mode mode)
7846 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7847 return true;
7849 if (mode == BLKmode
7850 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7851 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7852 return true;
7854 return false;
7857 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7858 type as described in AAPCS64 \S 4.1.2.
7860 See the comment above aarch64_composite_type_p for the notes on MODE. */
7862 static bool
7863 aarch64_short_vector_p (const_tree type,
7864 machine_mode mode)
7866 HOST_WIDE_INT size = -1;
7868 if (type && TREE_CODE (type) == VECTOR_TYPE)
7869 size = int_size_in_bytes (type);
7870 else if (!aarch64_composite_type_p (type, mode)
7871 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7872 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7873 size = GET_MODE_SIZE (mode);
7875 return (size == 8 || size == 16) ? true : false;
7878 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7879 shall be passed or returned in simd/fp register(s) (providing these
7880 parameter passing registers are available).
7882 Upon successful return, *COUNT returns the number of needed registers,
7883 *BASE_MODE returns the mode of the individual register and, when IS_HA
7884 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7885 floating-point aggregate or a homogeneous short-vector aggregate. */
7887 static bool
7888 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7889 const_tree type,
7890 machine_mode *base_mode,
7891 int *count,
7892 bool *is_ha)
7894 machine_mode new_mode = VOIDmode;
7895 bool composite_p = aarch64_composite_type_p (type, mode);
7897 if (is_ha != NULL) *is_ha = false;
7899 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7900 || aarch64_short_vector_p (type, mode))
7902 *count = 1;
7903 new_mode = mode;
7905 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7907 if (is_ha != NULL) *is_ha = true;
7908 *count = 2;
7909 new_mode = GET_MODE_INNER (mode);
7911 else if (type && composite_p)
7913 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7915 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7917 if (is_ha != NULL) *is_ha = true;
7918 *count = ag_count;
7920 else
7921 return false;
7923 else
7924 return false;
7926 *base_mode = new_mode;
7927 return true;
7930 /* Implement TARGET_STRUCT_VALUE_RTX. */
7932 static rtx
7933 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7934 int incoming ATTRIBUTE_UNUSED)
7936 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7939 /* Implements target hook vector_mode_supported_p. */
7940 static bool
7941 aarch64_vector_mode_supported_p (machine_mode mode)
7943 if (TARGET_SIMD
7944 && (mode == V4SImode || mode == V8HImode
7945 || mode == V16QImode || mode == V2DImode
7946 || mode == V2SImode || mode == V4HImode
7947 || mode == V8QImode || mode == V2SFmode
7948 || mode == V4SFmode || mode == V2DFmode
7949 || mode == V1DFmode))
7950 return true;
7952 return false;
7955 /* Return appropriate SIMD container
7956 for MODE within a vector of WIDTH bits. */
7957 static machine_mode
7958 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7960 gcc_assert (width == 64 || width == 128);
7961 if (TARGET_SIMD)
7963 if (width == 128)
7964 switch (mode)
7966 case DFmode:
7967 return V2DFmode;
7968 case SFmode:
7969 return V4SFmode;
7970 case SImode:
7971 return V4SImode;
7972 case HImode:
7973 return V8HImode;
7974 case QImode:
7975 return V16QImode;
7976 case DImode:
7977 return V2DImode;
7978 default:
7979 break;
7981 else
7982 switch (mode)
7984 case SFmode:
7985 return V2SFmode;
7986 case SImode:
7987 return V2SImode;
7988 case HImode:
7989 return V4HImode;
7990 case QImode:
7991 return V8QImode;
7992 default:
7993 break;
7996 return word_mode;
7999 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8000 static machine_mode
8001 aarch64_preferred_simd_mode (machine_mode mode)
8003 return aarch64_simd_container_mode (mode, 128);
8006 /* Return the bitmask of possible vector sizes for the vectorizer
8007 to iterate over. */
8008 static unsigned int
8009 aarch64_autovectorize_vector_sizes (void)
8011 return (16 | 8);
8014 /* Implement TARGET_MANGLE_TYPE. */
8016 static const char *
8017 aarch64_mangle_type (const_tree type)
8019 /* The AArch64 ABI documents say that "__va_list" has to be
8020 mangled as if it were in the "std" namespace. */
8021 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8022 return "St9__va_list";
8024 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8025 builtin types. */
8026 if (TYPE_NAME (type) != NULL)
8027 return aarch64_mangle_builtin_type (type);
8029 /* Use the default mangling. */
8030 return NULL;
8034 /* Return true if the rtx_insn contains a MEM RTX somewhere
8035 in it. */
8037 static bool
8038 has_memory_op (rtx_insn *mem_insn)
8040 subrtx_iterator::array_type array;
8041 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8042 if (MEM_P (*iter))
8043 return true;
8045 return false;
8048 /* Find the first rtx_insn before insn that will generate an assembly
8049 instruction. */
8051 static rtx_insn *
8052 aarch64_prev_real_insn (rtx_insn *insn)
8054 if (!insn)
8055 return NULL;
8059 insn = prev_real_insn (insn);
8061 while (insn && recog_memoized (insn) < 0);
8063 return insn;
8066 static bool
8067 is_madd_op (enum attr_type t1)
8069 unsigned int i;
8070 /* A number of these may be AArch32 only. */
8071 enum attr_type mlatypes[] = {
8072 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8073 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8074 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8077 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8079 if (t1 == mlatypes[i])
8080 return true;
8083 return false;
8086 /* Check if there is a register dependency between a load and the insn
8087 for which we hold recog_data. */
8089 static bool
8090 dep_between_memop_and_curr (rtx memop)
8092 rtx load_reg;
8093 int opno;
8095 gcc_assert (GET_CODE (memop) == SET);
8097 if (!REG_P (SET_DEST (memop)))
8098 return false;
8100 load_reg = SET_DEST (memop);
8101 for (opno = 1; opno < recog_data.n_operands; opno++)
8103 rtx operand = recog_data.operand[opno];
8104 if (REG_P (operand)
8105 && reg_overlap_mentioned_p (load_reg, operand))
8106 return true;
8109 return false;
8113 /* When working around the Cortex-A53 erratum 835769,
8114 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8115 instruction and has a preceding memory instruction such that a NOP
8116 should be inserted between them. */
8118 bool
8119 aarch64_madd_needs_nop (rtx_insn* insn)
8121 enum attr_type attr_type;
8122 rtx_insn *prev;
8123 rtx body;
8125 if (!aarch64_fix_a53_err835769)
8126 return false;
8128 if (recog_memoized (insn) < 0)
8129 return false;
8131 attr_type = get_attr_type (insn);
8132 if (!is_madd_op (attr_type))
8133 return false;
8135 prev = aarch64_prev_real_insn (insn);
8136 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8137 Restore recog state to INSN to avoid state corruption. */
8138 extract_constrain_insn_cached (insn);
8140 if (!prev || !has_memory_op (prev))
8141 return false;
8143 body = single_set (prev);
8145 /* If the previous insn is a memory op and there is no dependency between
8146 it and the DImode madd, emit a NOP between them. If body is NULL then we
8147 have a complex memory operation, probably a load/store pair.
8148 Be conservative for now and emit a NOP. */
8149 if (GET_MODE (recog_data.operand[0]) == DImode
8150 && (!body || !dep_between_memop_and_curr (body)))
8151 return true;
8153 return false;
8158 /* Implement FINAL_PRESCAN_INSN. */
8160 void
8161 aarch64_final_prescan_insn (rtx_insn *insn)
8163 if (aarch64_madd_needs_nop (insn))
8164 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
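/* Illustrative only: with the erratum workaround enabled
   (-mfix-cortex-a53-835769), a sequence like

       ldr   x0, [x1]
       madd  x2, x3, x4, x5

   is emitted with a NOP between the two instructions, whereas if the
   loaded register fed the multiply-accumulate the existing dependency
   already avoids the erratum condition and no NOP is added.  */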
8168 /* Return the equivalent letter for size. */
8169 static char
8170 sizetochar (int size)
8172 switch (size)
8174 case 64: return 'd';
8175 case 32: return 's';
8176 case 16: return 'h';
8177 case 8 : return 'b';
8178 default: gcc_unreachable ();
8182 /* Return true iff X is a uniform vector of floating-point
8183 constants, and the constant can be represented in
8184 quarter-precision form. Note that, as aarch64_float_const_representable_p
8185 rejects both +0.0 and -0.0, we reject them here as well. */
8186 static bool
8187 aarch64_vect_float_const_representable_p (rtx x)
8189 int i = 0;
8190 REAL_VALUE_TYPE r0, ri;
8191 rtx x0, xi;
8193 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8194 return false;
8196 x0 = CONST_VECTOR_ELT (x, 0);
8197 if (!CONST_DOUBLE_P (x0))
8198 return false;
8200 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8202 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8204 xi = CONST_VECTOR_ELT (x, i);
8205 if (!CONST_DOUBLE_P (xi))
8206 return false;
8208 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8209 if (!REAL_VALUES_EQUAL (r0, ri))
8210 return false;
8213 return aarch64_float_const_representable_p (x0);
8216 /* Return true for valid and false for invalid. */
8217 bool
8218 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8219 struct simd_immediate_info *info)
8221 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8222 matches = 1; \
8223 for (i = 0; i < idx; i += (STRIDE)) \
8224 if (!(TEST)) \
8225 matches = 0; \
8226 if (matches) \
8228 immtype = (CLASS); \
8229 elsize = (ELSIZE); \
8230 eshift = (SHIFT); \
8231 emvn = (NEG); \
8232 break; \
8235 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8236 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8237 unsigned char bytes[16];
8238 int immtype = -1, matches;
8239 unsigned int invmask = inverse ? 0xff : 0;
8240 int eshift, emvn;
8242 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8244 if (! (aarch64_simd_imm_zero_p (op, mode)
8245 || aarch64_vect_float_const_representable_p (op)))
8246 return false;
8248 if (info)
8250 info->value = CONST_VECTOR_ELT (op, 0);
8251 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8252 info->mvn = false;
8253 info->shift = 0;
8256 return true;
8259 /* Splat vector constant out into a byte vector. */
8260 for (i = 0; i < n_elts; i++)
8262 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8263 it must be laid out in the vector register in reverse order. */
8264 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8265 unsigned HOST_WIDE_INT elpart;
8266 unsigned int part, parts;
8268 if (CONST_INT_P (el))
8270 elpart = INTVAL (el);
8271 parts = 1;
8273 else if (GET_CODE (el) == CONST_DOUBLE)
8275 elpart = CONST_DOUBLE_LOW (el);
8276 parts = 2;
8278 else
8279 gcc_unreachable ();
8281 for (part = 0; part < parts; part++)
8283 unsigned int byte;
8284 for (byte = 0; byte < innersize; byte++)
8286 bytes[idx++] = (elpart & 0xff) ^ invmask;
8287 elpart >>= BITS_PER_UNIT;
8289 if (GET_CODE (el) == CONST_DOUBLE)
8290 elpart = CONST_DOUBLE_HIGH (el);
8294 /* Sanity check. */
8295 gcc_assert (idx == GET_MODE_SIZE (mode));
8299 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8300 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8302 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8303 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8305 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8306 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8308 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8309 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8311 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8313 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8315 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8316 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8318 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8319 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8321 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8322 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8324 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8325 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8327 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8329 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8331 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8332 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8334 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8335 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8337 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8338 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8340 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8341 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8343 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8345 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8346 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8348 while (0);
8350 if (immtype == -1)
8351 return false;
8353 if (info)
8355 info->element_width = elsize;
8356 info->mvn = emvn != 0;
8357 info->shift = eshift;
8359 unsigned HOST_WIDE_INT imm = 0;
8361 if (immtype >= 12 && immtype <= 15)
8362 info->msl = true;
8364 /* Un-invert bytes of recognized vector, if necessary. */
8365 if (invmask != 0)
8366 for (i = 0; i < idx; i++)
8367 bytes[i] ^= invmask;
8369 if (immtype == 17)
8371 /* FIXME: Broken on 32-bit H_W_I hosts. */
8372 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8374 for (i = 0; i < 8; i++)
8375 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8376 << (i * BITS_PER_UNIT);
8379 info->value = GEN_INT (imm);
8381 else
8383 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8384 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8386 /* Construct 'abcdefgh' because the assembler cannot handle
8387 generic constants. */
8388 if (info->mvn)
8389 imm = ~imm;
8390 imm = (imm >> info->shift) & 0xff;
8391 info->value = GEN_INT (imm);
8395 return true;
8396 #undef CHECK
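/* Some examples of what the checks above accept: a V4SImode splat of
   0x000000ab matches the first 32-bit case (shift 0), a splat of
   0x0000ab00 matches the shift-by-8 case, and a splat of 0xffffffab
   matches the corresponding inverted (MVN) case; the final 64-bit case
   accepts a V2DImode splat such as 0xff00ff0000ff00ff, whose bytes are
   all either 0x00 or 0xff.  */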
8399 /* Check that immediate shift constants are within range. */
8400 bool
8401 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8403 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8404 if (left)
8405 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8406 else
8407 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
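/* For example, with V4SImode (32-bit elements) a vector left-shift
   immediate must be a splat in the range 0..31, while a right-shift
   immediate must lie in the range 1..32.  */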
8410 /* Return true if X is a uniform vector where all elements
8411 are either the floating-point constant 0.0 or the
8412 integer constant 0. */
8413 bool
8414 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8416 return x == CONST0_RTX (mode);
8419 bool
8420 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8422 HOST_WIDE_INT imm = INTVAL (x);
8423 int i;
8425 for (i = 0; i < 8; i++)
8427 unsigned int byte = imm & 0xff;
8428 if (byte != 0xff && byte != 0)
8429 return false;
8430 imm >>= 8;
8433 return true;
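/* For example, 0xff00ffff00ff00ff is accepted (every byte is either
   0x00 or 0xff), while 0x123400ff00ff00ff is rejected because of the
   0x12 and 0x34 bytes.  */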
8436 bool
8437 aarch64_mov_operand_p (rtx x,
8438 enum aarch64_symbol_context context,
8439 machine_mode mode)
8441 if (GET_CODE (x) == HIGH
8442 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8443 return true;
8445 if (CONST_INT_P (x))
8446 return true;
8448 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8449 return true;
8451 return aarch64_classify_symbolic_expression (x, context)
8452 == SYMBOL_TINY_ABSOLUTE;
8455 /* Return a const_int vector of VAL. */
8457 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8459 int nunits = GET_MODE_NUNITS (mode);
8460 rtvec v = rtvec_alloc (nunits);
8461 int i;
8463 for (i=0; i < nunits; i++)
8464 RTVEC_ELT (v, i) = GEN_INT (val);
8466 return gen_rtx_CONST_VECTOR (mode, v);
8469 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8471 bool
8472 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8474 machine_mode vmode;
8476 gcc_assert (!VECTOR_MODE_P (mode));
8477 vmode = aarch64_preferred_simd_mode (mode);
8478 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8479 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8482 /* Construct and return a PARALLEL RTX vector with elements numbering the
8483 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8484 the vector - from the perspective of the architecture. This does not
8485 line up with GCC's perspective on lane numbers, so we end up with
8486 different masks depending on our target endian-ness. The diagram
8487 below may help. We must draw the distinction when building masks
8488 which select one half of the vector. An instruction selecting
8489 architectural low-lanes for a big-endian target must be described using
8490 a mask selecting GCC high-lanes.
8492 Big-Endian Little-Endian
8494 GCC 0 1 2 3 3 2 1 0
8495 | x | x | x | x | | x | x | x | x |
8496 Architecture 3 2 1 0 3 2 1 0
8498 Low Mask: { 2, 3 } { 0, 1 }
8499 High Mask: { 0, 1 } { 2, 3 }
8503 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8505 int nunits = GET_MODE_NUNITS (mode);
8506 rtvec v = rtvec_alloc (nunits / 2);
8507 int high_base = nunits / 2;
8508 int low_base = 0;
8509 int base;
8510 rtx t1;
8511 int i;
8513 if (BYTES_BIG_ENDIAN)
8514 base = high ? low_base : high_base;
8515 else
8516 base = high ? high_base : low_base;
8518 for (i = 0; i < nunits / 2; i++)
8519 RTVEC_ELT (v, i) = GEN_INT (base + i);
8521 t1 = gen_rtx_PARALLEL (mode, v);
8522 return t1;
8525 /* Check OP for validity as a PARALLEL RTX vector with elements
8526 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8527 from the perspective of the architecture. See the diagram above
8528 aarch64_simd_vect_par_cnst_half for more details. */
8530 bool
8531 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8532 bool high)
8534 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8535 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8536 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8537 int i = 0;
8539 if (!VECTOR_MODE_P (mode))
8540 return false;
8542 if (count_op != count_ideal)
8543 return false;
8545 for (i = 0; i < count_ideal; i++)
8547 rtx elt_op = XVECEXP (op, 0, i);
8548 rtx elt_ideal = XVECEXP (ideal, 0, i);
8550 if (!CONST_INT_P (elt_op)
8551 || INTVAL (elt_ideal) != INTVAL (elt_op))
8552 return false;
8554 return true;
8557 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8558 HIGH (exclusive). */
8559 void
8560 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8561 const_tree exp)
8563 HOST_WIDE_INT lane;
8564 gcc_assert (CONST_INT_P (operand));
8565 lane = INTVAL (operand);
8567 if (lane < low || lane >= high)
8569 if (exp)
8570 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8571 else
8572 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8576 /* Return TRUE if OP is a valid vector addressing mode. */
8577 bool
8578 aarch64_simd_mem_operand_p (rtx op)
8580 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8581 || REG_P (XEXP (op, 0)));
8584 /* Emit a register copy from operand to operand, taking care not to
8585 early-clobber source registers in the process.
8587 COUNT is the number of components into which the copy needs to be
8588 decomposed. */
8589 void
8590 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8591 unsigned int count)
8593 unsigned int i;
8594 int rdest = REGNO (operands[0]);
8595 int rsrc = REGNO (operands[1]);
8597 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8598 || rdest < rsrc)
8599 for (i = 0; i < count; i++)
8600 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8601 gen_rtx_REG (mode, rsrc + i));
8602 else
8603 for (i = 0; i < count; i++)
8604 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8605 gen_rtx_REG (mode, rsrc + count - i - 1));
8608 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8609 one of VSTRUCT modes: OI, CI or XI. */
8611 aarch64_simd_attr_length_move (rtx_insn *insn)
8613 machine_mode mode;
8615 extract_insn_cached (insn);
8617 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8619 mode = GET_MODE (recog_data.operand[0]);
8620 switch (mode)
8622 case OImode:
8623 return 8;
8624 case CImode:
8625 return 12;
8626 case XImode:
8627 return 16;
8628 default:
8629 gcc_unreachable ();
8632 return 4;
8635 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8636 one of VSTRUCT modes: OI, CI, EI, or XI. */
8638 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8640 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8643 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8644 alignment of a vector to 128 bits. */
8645 static HOST_WIDE_INT
8646 aarch64_simd_vector_alignment (const_tree type)
8648 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8649 return MIN (align, 128);
8652 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8653 static bool
8654 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8656 if (is_packed)
8657 return false;
8659 /* We guarantee alignment for vectors up to 128-bits. */
8660 if (tree_int_cst_compare (TYPE_SIZE (type),
8661 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8662 return false;
8664 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8665 return true;
8668 /* If VALS is a vector constant that can be loaded into a register
8669 using DUP, generate instructions to do so and return an RTX to
8670 assign to the register. Otherwise return NULL_RTX. */
8671 static rtx
8672 aarch64_simd_dup_constant (rtx vals)
8674 machine_mode mode = GET_MODE (vals);
8675 machine_mode inner_mode = GET_MODE_INNER (mode);
8676 int n_elts = GET_MODE_NUNITS (mode);
8677 bool all_same = true;
8678 rtx x;
8679 int i;
8681 if (GET_CODE (vals) != CONST_VECTOR)
8682 return NULL_RTX;
8684 for (i = 1; i < n_elts; ++i)
8686 x = CONST_VECTOR_ELT (vals, i);
8687 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8688 all_same = false;
8691 if (!all_same)
8692 return NULL_RTX;
8694 /* We can load this constant by using DUP and a constant in a
8695 single scalar register. This will be cheaper than a vector
8696 load. */
8697 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8698 return gen_rtx_VEC_DUPLICATE (mode, x);
8702 /* Generate code to load VALS, which is a PARALLEL containing only
8703 constants (for vec_init) or CONST_VECTOR, efficiently into a
8704 register. Returns an RTX to copy into the register, or NULL_RTX
8705 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8706 static rtx
8707 aarch64_simd_make_constant (rtx vals)
8709 machine_mode mode = GET_MODE (vals);
8710 rtx const_dup;
8711 rtx const_vec = NULL_RTX;
8712 int n_elts = GET_MODE_NUNITS (mode);
8713 int n_const = 0;
8714 int i;
8716 if (GET_CODE (vals) == CONST_VECTOR)
8717 const_vec = vals;
8718 else if (GET_CODE (vals) == PARALLEL)
8720 /* A CONST_VECTOR must contain only CONST_INTs and
8721 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8722 Only store valid constants in a CONST_VECTOR. */
8723 for (i = 0; i < n_elts; ++i)
8725 rtx x = XVECEXP (vals, 0, i);
8726 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8727 n_const++;
8729 if (n_const == n_elts)
8730 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8732 else
8733 gcc_unreachable ();
8735 if (const_vec != NULL_RTX
8736 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8737 /* Load using MOVI/MVNI. */
8738 return const_vec;
8739 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8740 /* Loaded using DUP. */
8741 return const_dup;
8742 else if (const_vec != NULL_RTX)
8743 /* Load from constant pool. We cannot take advantage of single-cycle
8744 LD1 because we need a PC-relative addressing mode. */
8745 return const_vec;
8746 else
8747 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8748 We cannot construct an initializer. */
8749 return NULL_RTX;
8752 void
8753 aarch64_expand_vector_init (rtx target, rtx vals)
8755 machine_mode mode = GET_MODE (target);
8756 machine_mode inner_mode = GET_MODE_INNER (mode);
8757 int n_elts = GET_MODE_NUNITS (mode);
8758 int n_var = 0;
8759 rtx any_const = NULL_RTX;
8760 bool all_same = true;
8762 for (int i = 0; i < n_elts; ++i)
8764 rtx x = XVECEXP (vals, 0, i);
8765 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8766 ++n_var;
8767 else
8768 any_const = x;
8770 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8771 all_same = false;
8774 if (n_var == 0)
8776 rtx constant = aarch64_simd_make_constant (vals);
8777 if (constant != NULL_RTX)
8779 emit_move_insn (target, constant);
8780 return;
8784 /* Splat a single non-constant element if we can. */
8785 if (all_same)
8787 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8788 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8789 return;
8792 /* Half the fields (or fewer) are non-constant. Load the constant part, then
8793 overwrite the varying fields. Hope that this is more efficient than using the stack. */
8794 if (n_var <= n_elts/2)
8796 rtx copy = copy_rtx (vals);
8798 /* Load constant part of vector. We really don't care what goes into the
8799 parts we will overwrite, but we're more likely to be able to load the
8800 constant efficiently if it has fewer, larger, repeating parts
8801 (see aarch64_simd_valid_immediate). */
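/* As an illustration, for a hypothetical V4SImode initializer { x, 1, 2, 3 }
   with a single variable element, the loop below substitutes the constant
   from lane 2, so the constant part is loaded as { 2, 1, 2, 3 } and x is
   then inserted into lane 0. */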
8802 for (int i = 0; i < n_elts; i++)
8804 rtx x = XVECEXP (vals, 0, i);
8805 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8806 continue;
8807 rtx subst = any_const;
8808 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8810 /* Look in the copied vector, as more elements are const. */
8811 rtx test = XVECEXP (copy, 0, i ^ bit);
8812 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8814 subst = test;
8815 break;
8818 XVECEXP (copy, 0, i) = subst;
8820 aarch64_expand_vector_init (target, copy);
8822 /* Insert variables. */
8823 enum insn_code icode = optab_handler (vec_set_optab, mode);
8824 gcc_assert (icode != CODE_FOR_nothing);
8826 for (int i = 0; i < n_elts; i++)
8828 rtx x = XVECEXP (vals, 0, i);
8829 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8830 continue;
8831 x = copy_to_mode_reg (inner_mode, x);
8832 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8834 return;
8837 /* Construct the vector in memory one field at a time
8838 and load the whole vector. */
8839 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8840 for (int i = 0; i < n_elts; i++)
8841 emit_move_insn (adjust_address_nv (mem, inner_mode,
8842 i * GET_MODE_SIZE (inner_mode)),
8843 XVECEXP (vals, 0, i));
8844 emit_move_insn (target, mem);
8848 static unsigned HOST_WIDE_INT
8849 aarch64_shift_truncation_mask (machine_mode mode)
8851 return
8852 (aarch64_vector_mode_supported_p (mode)
8853 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8856 #ifndef TLS_SECTION_ASM_FLAG
8857 #define TLS_SECTION_ASM_FLAG 'T'
8858 #endif
8860 void
8861 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8862 tree decl ATTRIBUTE_UNUSED)
8864 char flagchars[10], *f = flagchars;
8866 /* If we have already declared this section, we can use an
8867 abbreviated form to switch back to it -- unless this section is
8868 part of a COMDAT group, in which case GAS requires the full
8869 declaration every time. */
8870 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8871 && (flags & SECTION_DECLARED))
8873 fprintf (asm_out_file, "\t.section\t%s\n", name);
8874 return;
8877 if (!(flags & SECTION_DEBUG))
8878 *f++ = 'a';
8879 if (flags & SECTION_WRITE)
8880 *f++ = 'w';
8881 if (flags & SECTION_CODE)
8882 *f++ = 'x';
8883 if (flags & SECTION_SMALL)
8884 *f++ = 's';
8885 if (flags & SECTION_MERGE)
8886 *f++ = 'M';
8887 if (flags & SECTION_STRINGS)
8888 *f++ = 'S';
8889 if (flags & SECTION_TLS)
8890 *f++ = TLS_SECTION_ASM_FLAG;
8891 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8892 *f++ = 'G';
8893 *f = '\0';
8895 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8897 if (!(flags & SECTION_NOTYPE))
8899 const char *type;
8900 const char *format;
8902 if (flags & SECTION_BSS)
8903 type = "nobits";
8904 else
8905 type = "progbits";
8907 #ifdef TYPE_OPERAND_FMT
8908 format = "," TYPE_OPERAND_FMT;
8909 #else
8910 format = ",@%s";
8911 #endif
8913 fprintf (asm_out_file, format, type);
8915 if (flags & SECTION_ENTSIZE)
8916 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8917 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8919 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8920 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8921 else
8922 fprintf (asm_out_file, ",%s,comdat",
8923 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8927 putc ('\n', asm_out_file);
8930 /* Select a format to encode pointers in exception handling data. */
8932 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8934 int type;
8935 switch (aarch64_cmodel)
8937 case AARCH64_CMODEL_TINY:
8938 case AARCH64_CMODEL_TINY_PIC:
8939 case AARCH64_CMODEL_SMALL:
8940 case AARCH64_CMODEL_SMALL_PIC:
8941 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8942 for everything. */
8943 type = DW_EH_PE_sdata4;
8944 break;
8945 default:
8946 /* No assumptions here. 8-byte relocs required. */
8947 type = DW_EH_PE_sdata8;
8948 break;
8950 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8953 /* Emit load exclusive. */
8955 static void
8956 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8957 rtx mem, rtx model_rtx)
8959 rtx (*gen) (rtx, rtx, rtx);
8961 switch (mode)
8963 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8964 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8965 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8966 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8967 default:
8968 gcc_unreachable ();
8971 emit_insn (gen (rval, mem, model_rtx));
8974 /* Emit store exclusive. */
8976 static void
8977 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
8978 rtx rval, rtx mem, rtx model_rtx)
8980 rtx (*gen) (rtx, rtx, rtx, rtx);
8982 switch (mode)
8984 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8985 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8986 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8987 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8988 default:
8989 gcc_unreachable ();
8992 emit_insn (gen (bval, rval, mem, model_rtx));
8995 /* Emit the jump pattern INSN and mark it as unlikely to be taken. */
8997 static void
8998 aarch64_emit_unlikely_jump (rtx insn)
9000 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9002 insn = emit_jump_insn (insn);
9003 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9006 /* Expand a compare and swap pattern. */
9008 void
9009 aarch64_expand_compare_and_swap (rtx operands[])
9011 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9012 machine_mode mode, cmp_mode;
9013 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9015 bval = operands[0];
9016 rval = operands[1];
9017 mem = operands[2];
9018 oldval = operands[3];
9019 newval = operands[4];
9020 is_weak = operands[5];
9021 mod_s = operands[6];
9022 mod_f = operands[7];
9023 mode = GET_MODE (mem);
9024 cmp_mode = mode;
9026 /* Normally the succ memory model must be stronger than fail, but in the
9027 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9028 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
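/* For instance, a compare-and-swap whose success model is RELEASE and whose
   failure model is ACQUIRE (as a caller might request with
   __atomic_compare_exchange_n (p, &expected, desired, 0, __ATOMIC_RELEASE,
   __ATOMIC_ACQUIRE)) is promoted to ACQ_REL by the check below. */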
9030 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9031 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9032 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9034 switch (mode)
9036 case QImode:
9037 case HImode:
9038 /* For short modes, we're going to perform the comparison in SImode,
9039 so do the zero-extension now. */
9040 cmp_mode = SImode;
9041 rval = gen_reg_rtx (SImode);
9042 oldval = convert_modes (SImode, mode, oldval, true);
9043 /* Fall through. */
9045 case SImode:
9046 case DImode:
9047 /* Force the value into a register if needed. */
9048 if (!aarch64_plus_operand (oldval, mode))
9049 oldval = force_reg (cmp_mode, oldval);
9050 break;
9052 default:
9053 gcc_unreachable ();
9056 switch (mode)
9058 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9059 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9060 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9061 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9062 default:
9063 gcc_unreachable ();
9066 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9068 if (mode == QImode || mode == HImode)
9069 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9071 x = gen_rtx_REG (CCmode, CC_REGNUM);
9072 x = gen_rtx_EQ (SImode, x, const0_rtx);
9073 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9076 /* Split a compare and swap pattern. */
9078 void
9079 aarch64_split_compare_and_swap (rtx operands[])
9081 rtx rval, mem, oldval, newval, scratch;
9082 machine_mode mode;
9083 bool is_weak;
9084 rtx_code_label *label1, *label2;
9085 rtx x, cond;
9087 rval = operands[0];
9088 mem = operands[1];
9089 oldval = operands[2];
9090 newval = operands[3];
9091 is_weak = (operands[4] != const0_rtx);
9092 scratch = operands[7];
9093 mode = GET_MODE (mem);
9095 label1 = NULL;
9096 if (!is_weak)
9098 label1 = gen_label_rtx ();
9099 emit_label (label1);
9101 label2 = gen_label_rtx ();
9103 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9105 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9106 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9107 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9108 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9109 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9111 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9113 if (!is_weak)
9115 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9116 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9117 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9118 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9120 else
9122 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9123 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9124 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9127 emit_label (label2);
9130 /* Split an atomic operation. */
9132 void
9133 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9134 rtx value, rtx model_rtx, rtx cond)
9136 machine_mode mode = GET_MODE (mem);
9137 machine_mode wmode = (mode == DImode ? DImode : SImode);
9138 rtx_code_label *label;
9139 rtx x;
9141 label = gen_label_rtx ();
9142 emit_label (label);
9144 if (new_out)
9145 new_out = gen_lowpart (wmode, new_out);
9146 if (old_out)
9147 old_out = gen_lowpart (wmode, old_out);
9148 else
9149 old_out = new_out;
9150 value = simplify_gen_subreg (wmode, value, mode, 0);
9152 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9154 switch (code)
9156 case SET:
9157 new_out = value;
9158 break;
9160 case NOT:
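/* The NOT case implements an atomic NAND, i.e. new_out = ~(old_out & value),
   emitted as an AND followed by a NOT. */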
9161 x = gen_rtx_AND (wmode, old_out, value);
9162 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9163 x = gen_rtx_NOT (wmode, new_out);
9164 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9165 break;
9167 case MINUS:
9168 if (CONST_INT_P (value))
9170 value = GEN_INT (-INTVAL (value));
9171 code = PLUS;
9173 /* Fall through. */
9175 default:
9176 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9177 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9178 break;
9181 aarch64_emit_store_exclusive (mode, cond, mem,
9182 gen_lowpart (mode, new_out), model_rtx);
9184 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9185 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9186 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9187 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9190 static void
9191 aarch64_print_extension (void)
9193 const struct aarch64_option_extension *opt = NULL;
9195 for (opt = all_extensions; opt->name != NULL; opt++)
9196 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9197 asm_fprintf (asm_out_file, "+%s", opt->name);
9199 asm_fprintf (asm_out_file, "\n");
9202 static void
9203 aarch64_start_file (void)
9205 if (selected_arch)
9207 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9208 aarch64_print_extension ();
9210 else if (selected_cpu)
9212 const char *truncated_name
9213 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9214 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9215 aarch64_print_extension ();
9217 default_file_start ();
9220 /* Target hook for c_mode_for_suffix. */
9221 static machine_mode
9222 aarch64_c_mode_for_suffix (char suffix)
9224 if (suffix == 'q')
9225 return TFmode;
9227 return VOIDmode;
9230 /* We can only represent floating point constants which will fit in
9231 "quarter-precision" values. These values are characterised by
9232 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9235 (-1)^s * (n/16) * 2^r
9237 Where:
9238 's' is the sign bit.
9239 'n' is an integer in the range 16 <= n <= 31.
9240 'r' is an integer in the range -3 <= r <= 4. */
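/* As a worked example of the encoding above, 0.25 is (-1)^0 * (16/16) * 2^-2
   and 31.0 is (-1)^0 * (31/16) * 2^4, so the representable magnitudes range
   from 0.125 up to 31.0. */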
9242 /* Return true iff X can be represented as a quarter-precision
9243 floating point immediate operand. Note, we cannot represent 0.0. */
9244 bool
9245 aarch64_float_const_representable_p (rtx x)
9247 /* This represents our current view of how many bits
9248 make up the mantissa. */
9249 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9250 int exponent;
9251 unsigned HOST_WIDE_INT mantissa, mask;
9252 REAL_VALUE_TYPE r, m;
9253 bool fail;
9255 if (!CONST_DOUBLE_P (x))
9256 return false;
9258 if (GET_MODE (x) == VOIDmode)
9259 return false;
9261 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9263 /* We cannot represent infinities, NaNs or +/-zero. We won't
9264 know if we have +zero until we analyse the mantissa, but we
9265 can reject the other invalid values. */
9266 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9267 || REAL_VALUE_MINUS_ZERO (r))
9268 return false;
9270 /* Extract exponent. */
9271 r = real_value_abs (&r);
9272 exponent = REAL_EXP (&r);
9274 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9275 highest (sign) bit, with a fixed binary point at bit point_pos.
9276 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9277 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9278 bits for the mantissa, this can fail (low bits will be lost). */
9279 real_ldexp (&m, &r, point_pos - exponent);
9280 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9282 /* If the low part of the mantissa has bits set we cannot represent
9283 the value. */
9284 if (w.elt (0) != 0)
9285 return false;
9286 /* We have rejected the lower HOST_WIDE_INT, so update our
9287 understanding of how many bits lie in the mantissa and
9288 look only at the high HOST_WIDE_INT. */
9289 mantissa = w.elt (1);
9290 point_pos -= HOST_BITS_PER_WIDE_INT;
9292 /* We can only represent values with a mantissa of the form 1.xxxx. */
9293 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9294 if ((mantissa & mask) != 0)
9295 return false;
9297 /* Having filtered unrepresentable values, we may now remove all
9298 but the highest 5 bits. */
9299 mantissa >>= point_pos - 5;
9301 /* We cannot represent the value 0.0, so reject it. This is handled
9302 elsewhere. */
9303 if (mantissa == 0)
9304 return false;
9306 /* Then, as bit 4 is always set, we can mask it off, leaving
9307 the mantissa in the range [0, 15]. */
9308 mantissa &= ~(1 << 4);
9309 gcc_assert (mantissa <= 15);
9311 /* GCC internally does not use IEEE754-like encoding (where normalized
9312 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
9313 Our mantissa values are shifted 4 places to the left relative to
9314 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9315 by 5 places to correct for GCC's representation. */
9316 exponent = 5 - exponent;
9318 return (exponent >= 0 && exponent <= 7);
9321 char*
9322 aarch64_output_simd_mov_immediate (rtx const_vector,
9323 machine_mode mode,
9324 unsigned width)
9326 bool is_valid;
9327 static char templ[40];
9328 const char *mnemonic;
9329 const char *shift_op;
9330 unsigned int lane_count = 0;
9331 char element_char;
9333 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9335 /* This will return true to show const_vector is legal for use as either
9336 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9337 also update INFO to show how the immediate should be generated. */
9338 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9339 gcc_assert (is_valid);
9341 element_char = sizetochar (info.element_width);
9342 lane_count = width / info.element_width;
9344 mode = GET_MODE_INNER (mode);
9345 if (mode == SFmode || mode == DFmode)
9347 gcc_assert (info.shift == 0 && ! info.mvn);
9348 if (aarch64_float_const_zero_rtx_p (info.value))
9349 info.value = GEN_INT (0);
9350 else
9352 #define buf_size 20
9353 REAL_VALUE_TYPE r;
9354 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9355 char float_buf[buf_size] = {'\0'};
9356 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9357 #undef buf_size
9359 if (lane_count == 1)
9360 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9361 else
9362 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9363 lane_count, element_char, float_buf);
9364 return templ;
9368 mnemonic = info.mvn ? "mvni" : "movi";
9369 shift_op = info.msl ? "msl" : "lsl";
9371 if (lane_count == 1)
9372 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9373 mnemonic, UINTVAL (info.value));
9374 else if (info.shift)
9375 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9376 ", %s %d", mnemonic, lane_count, element_char,
9377 UINTVAL (info.value), shift_op, info.shift);
9378 else
9379 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9380 mnemonic, lane_count, element_char, UINTVAL (info.value));
9381 return templ;
9384 char*
9385 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9386 machine_mode mode)
9388 machine_mode vmode;
9390 gcc_assert (!VECTOR_MODE_P (mode));
9391 vmode = aarch64_simd_container_mode (mode, 64);
9392 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9393 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9396 /* Split operands into moves from op[1] + op[2] into op[0]. */
9398 void
9399 aarch64_split_combinev16qi (rtx operands[3])
9401 unsigned int dest = REGNO (operands[0]);
9402 unsigned int src1 = REGNO (operands[1]);
9403 unsigned int src2 = REGNO (operands[2]);
9404 machine_mode halfmode = GET_MODE (operands[1]);
9405 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9406 rtx destlo, desthi;
9408 gcc_assert (halfmode == V16QImode);
9410 if (src1 == dest && src2 == dest + halfregs)
9412 /* No-op move. Can't split to nothing; emit something. */
9413 emit_note (NOTE_INSN_DELETED);
9414 return;
9417 /* Preserve register attributes for variable tracking. */
9418 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9419 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9420 GET_MODE_SIZE (halfmode));
9422 /* Special case of reversed high/low parts. */
9423 if (reg_overlap_mentioned_p (operands[2], destlo)
9424 && reg_overlap_mentioned_p (operands[1], desthi))
9426 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9427 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9428 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9430 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9432 /* Try to avoid unnecessary moves if part of the result
9433 is in the right place already. */
9434 if (src1 != dest)
9435 emit_move_insn (destlo, operands[1]);
9436 if (src2 != dest + halfregs)
9437 emit_move_insn (desthi, operands[2]);
9439 else
9441 if (src2 != dest + halfregs)
9442 emit_move_insn (desthi, operands[2]);
9443 if (src1 != dest)
9444 emit_move_insn (destlo, operands[1]);
9448 /* vec_perm support. */
9450 #define MAX_VECT_LEN 16
9452 struct expand_vec_perm_d
9454 rtx target, op0, op1;
9455 unsigned char perm[MAX_VECT_LEN];
9456 machine_mode vmode;
9457 unsigned char nelt;
9458 bool one_vector_p;
9459 bool testing_p;
9462 /* Generate a variable permutation. */
9464 static void
9465 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9467 machine_mode vmode = GET_MODE (target);
9468 bool one_vector_p = rtx_equal_p (op0, op1);
9470 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9471 gcc_checking_assert (GET_MODE (op0) == vmode);
9472 gcc_checking_assert (GET_MODE (op1) == vmode);
9473 gcc_checking_assert (GET_MODE (sel) == vmode);
9474 gcc_checking_assert (TARGET_SIMD);
9476 if (one_vector_p)
9478 if (vmode == V8QImode)
9480 /* Expand the argument to a V16QI mode by duplicating it. */
9481 rtx pair = gen_reg_rtx (V16QImode);
9482 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9483 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9485 else
9487 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9490 else
9492 rtx pair;
9494 if (vmode == V8QImode)
9496 pair = gen_reg_rtx (V16QImode);
9497 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9498 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9500 else
9502 pair = gen_reg_rtx (OImode);
9503 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9504 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9509 void
9510 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9512 machine_mode vmode = GET_MODE (target);
9513 unsigned int nelt = GET_MODE_NUNITS (vmode);
9514 bool one_vector_p = rtx_equal_p (op0, op1);
9515 rtx mask;
9517 /* The TBL instruction does not use a modulo index, so we must take care
9518 of that ourselves. */
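/* For example, with a single V8QImode input vector the mask built below is
   { 7, ..., 7 }, so an out-of-range selector value such as 9 is reduced to
   9 & 7 == 1, giving the required modulo behaviour. */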
9519 mask = aarch64_simd_gen_const_vector_dup (vmode,
9520 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9521 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9523 /* For big-endian, we also need to reverse the index within the vector
9524 (but not which vector). */
9525 if (BYTES_BIG_ENDIAN)
9527 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9528 if (!one_vector_p)
9529 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9530 sel = expand_simple_binop (vmode, XOR, sel, mask,
9531 NULL, 0, OPTAB_LIB_WIDEN);
9533 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9536 /* Recognize patterns suitable for the TRN instructions. */
9537 static bool
9538 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9540 unsigned int i, odd, mask, nelt = d->nelt;
9541 rtx out, in0, in1, x;
9542 rtx (*gen) (rtx, rtx, rtx);
9543 machine_mode vmode = d->vmode;
9545 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9546 return false;
9548 /* Note that these are little-endian tests.
9549 We correct for big-endian later. */
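/* For example, on little-endian a two-operand V4SImode permutation of
   { 0, 4, 2, 6 } gives odd == 0 and is emitted as TRN1, while
   { 1, 5, 3, 7 } gives odd == 1 and is emitted as TRN2. */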
9550 if (d->perm[0] == 0)
9551 odd = 0;
9552 else if (d->perm[0] == 1)
9553 odd = 1;
9554 else
9555 return false;
9556 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9558 for (i = 0; i < nelt; i += 2)
9560 if (d->perm[i] != i + odd)
9561 return false;
9562 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9563 return false;
9566 /* Success! */
9567 if (d->testing_p)
9568 return true;
9570 in0 = d->op0;
9571 in1 = d->op1;
9572 if (BYTES_BIG_ENDIAN)
9574 x = in0, in0 = in1, in1 = x;
9575 odd = !odd;
9577 out = d->target;
9579 if (odd)
9581 switch (vmode)
9583 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9584 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9585 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9586 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9587 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9588 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9589 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9590 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9591 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9592 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9593 default:
9594 return false;
9597 else
9599 switch (vmode)
9601 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9602 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9603 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9604 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9605 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9606 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9607 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9608 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9609 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9610 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9611 default:
9612 return false;
9616 emit_insn (gen (out, in0, in1));
9617 return true;
9620 /* Recognize patterns suitable for the UZP instructions. */
9621 static bool
9622 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9624 unsigned int i, odd, mask, nelt = d->nelt;
9625 rtx out, in0, in1, x;
9626 rtx (*gen) (rtx, rtx, rtx);
9627 machine_mode vmode = d->vmode;
9629 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9630 return false;
9632 /* Note that these are little-endian tests.
9633 We correct for big-endian later. */
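/* For example, on little-endian a two-operand V4SImode permutation of
   { 0, 2, 4, 6 } gives odd == 0 and is emitted as UZP1, while
   { 1, 3, 5, 7 } gives odd == 1 and is emitted as UZP2. */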
9634 if (d->perm[0] == 0)
9635 odd = 0;
9636 else if (d->perm[0] == 1)
9637 odd = 1;
9638 else
9639 return false;
9640 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9642 for (i = 0; i < nelt; i++)
9644 unsigned elt = (i * 2 + odd) & mask;
9645 if (d->perm[i] != elt)
9646 return false;
9649 /* Success! */
9650 if (d->testing_p)
9651 return true;
9653 in0 = d->op0;
9654 in1 = d->op1;
9655 if (BYTES_BIG_ENDIAN)
9657 x = in0, in0 = in1, in1 = x;
9658 odd = !odd;
9660 out = d->target;
9662 if (odd)
9664 switch (vmode)
9666 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9667 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9668 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9669 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9670 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9671 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9672 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9673 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9674 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9675 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9676 default:
9677 return false;
9680 else
9682 switch (vmode)
9684 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9685 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9686 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9687 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9688 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9689 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9690 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9691 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9692 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9693 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9694 default:
9695 return false;
9699 emit_insn (gen (out, in0, in1));
9700 return true;
9703 /* Recognize patterns suitable for the ZIP instructions. */
9704 static bool
9705 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9707 unsigned int i, high, mask, nelt = d->nelt;
9708 rtx out, in0, in1, x;
9709 rtx (*gen) (rtx, rtx, rtx);
9710 machine_mode vmode = d->vmode;
9712 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9713 return false;
9715 /* Note that these are little-endian tests.
9716 We correct for big-endian later. */
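/* For example, on little-endian a two-operand V4SImode permutation of
   { 0, 4, 1, 5 } gives high == 0 and is emitted as ZIP1, while
   { 2, 6, 3, 7 } gives high == 2 and is emitted as ZIP2. */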
9717 high = nelt / 2;
9718 if (d->perm[0] == high)
9719 /* Do Nothing. */
9721 else if (d->perm[0] == 0)
9722 high = 0;
9723 else
9724 return false;
9725 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9727 for (i = 0; i < nelt / 2; i++)
9729 unsigned elt = (i + high) & mask;
9730 if (d->perm[i * 2] != elt)
9731 return false;
9732 elt = (elt + nelt) & mask;
9733 if (d->perm[i * 2 + 1] != elt)
9734 return false;
9737 /* Success! */
9738 if (d->testing_p)
9739 return true;
9741 in0 = d->op0;
9742 in1 = d->op1;
9743 if (BYTES_BIG_ENDIAN)
9745 x = in0, in0 = in1, in1 = x;
9746 high = !high;
9748 out = d->target;
9750 if (high)
9752 switch (vmode)
9754 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9755 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9756 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9757 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9758 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9759 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9760 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9761 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9762 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9763 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9764 default:
9765 return false;
9768 else
9770 switch (vmode)
9772 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9773 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9774 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9775 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9776 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9777 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9778 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9779 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9780 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9781 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9782 default:
9783 return false;
9787 emit_insn (gen (out, in0, in1));
9788 return true;
9791 /* Recognize patterns for the EXT insn. */
9793 static bool
9794 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9796 unsigned int i, nelt = d->nelt;
9797 rtx (*gen) (rtx, rtx, rtx, rtx);
9798 rtx offset;
9800 unsigned int location = d->perm[0]; /* Always < nelt. */
9802 /* Check if the extracted indices are increasing by one. */
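/* For example, on little-endian a two-operand V4SImode permutation of
   { 1, 2, 3, 4 } has location == 1 and is emitted as EXT with a one-element
   offset; in the single-operand case { 3, 0, 1, 2 } the indices wrap and
   location == 3. */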
9803 for (i = 1; i < nelt; i++)
9805 unsigned int required = location + i;
9806 if (d->one_vector_p)
9808 /* We'll pass the same vector in twice, so allow indices to wrap. */
9809 required &= (nelt - 1);
9811 if (d->perm[i] != required)
9812 return false;
9815 switch (d->vmode)
9817 case V16QImode: gen = gen_aarch64_extv16qi; break;
9818 case V8QImode: gen = gen_aarch64_extv8qi; break;
9819 case V4HImode: gen = gen_aarch64_extv4hi; break;
9820 case V8HImode: gen = gen_aarch64_extv8hi; break;
9821 case V2SImode: gen = gen_aarch64_extv2si; break;
9822 case V4SImode: gen = gen_aarch64_extv4si; break;
9823 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9824 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9825 case V2DImode: gen = gen_aarch64_extv2di; break;
9826 case V2DFmode: gen = gen_aarch64_extv2df; break;
9827 default:
9828 return false;
9831 /* Success! */
9832 if (d->testing_p)
9833 return true;
9835 /* The case where (location == 0) is a no-op for both big- and little-endian,
9836 and is removed by the mid-end at optimization levels -O1 and higher. */
9838 if (BYTES_BIG_ENDIAN && (location != 0))
9840 /* After setup, we want the high elements of the first vector (stored
9841 at the LSB end of the register), and the low elements of the second
9842 vector (stored at the MSB end of the register). So swap. */
9843 std::swap (d->op0, d->op1);
9844 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9845 location = nelt - location;
9848 offset = GEN_INT (location);
9849 emit_insn (gen (d->target, d->op0, d->op1, offset));
9850 return true;
9853 /* Recognize patterns for the REV insns. */
9855 static bool
9856 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9858 unsigned int i, j, diff, nelt = d->nelt;
9859 rtx (*gen) (rtx, rtx);
9861 if (!d->one_vector_p)
9862 return false;
9864 diff = d->perm[0];
9865 switch (diff)
9867 case 7:
9868 switch (d->vmode)
9870 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9871 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9872 default:
9873 return false;
9875 break;
9876 case 3:
9877 switch (d->vmode)
9879 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9880 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9881 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9882 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9883 default:
9884 return false;
9886 break;
9887 case 1:
9888 switch (d->vmode)
9890 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9891 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9892 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9893 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9894 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9895 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9896 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9897 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9898 default:
9899 return false;
9901 break;
9902 default:
9903 return false;
9906 for (i = 0; i < nelt ; i += diff + 1)
9907 for (j = 0; j <= diff; j += 1)
9909 /* This is guaranteed to be true, as diff here is one of
9910 7, 3 or 1, so there are always enough elements in the
9911 vector to satisfy the assertion. A permutation whose
9912 diff is anything other than these values implies that
9913 something has gone wrong by the time we get here. */
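/* For example, a V8HImode permutation of { 3, 2, 1, 0, 7, 6, 5, 4 } has
   diff == 3, reverses each group of four elements and is emitted as REV64. */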
9914 gcc_assert (i + j < nelt);
9915 if (d->perm[i + j] != i + diff - j)
9916 return false;
9919 /* Success! */
9920 if (d->testing_p)
9921 return true;
9923 emit_insn (gen (d->target, d->op0));
9924 return true;
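/* Recognize patterns suitable for the DUP instructions. */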
9927 static bool
9928 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9930 rtx (*gen) (rtx, rtx, rtx);
9931 rtx out = d->target;
9932 rtx in0;
9933 machine_mode vmode = d->vmode;
9934 unsigned int i, elt, nelt = d->nelt;
9935 rtx lane;
9937 elt = d->perm[0];
9938 for (i = 1; i < nelt; i++)
9940 if (elt != d->perm[i])
9941 return false;
9944 /* The generic preparation in aarch64_expand_vec_perm_const_1
9945 swaps the operand order and the permute indices if it finds
9946 d->perm[0] to be in the second operand. Thus, we can always
9947 use d->op0 and need not do any extra arithmetic to get the
9948 correct lane number. */
9949 in0 = d->op0;
9950 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9952 switch (vmode)
9954 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9955 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9956 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9957 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9958 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9959 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9960 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9961 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9962 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9963 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9964 default:
9965 return false;
9968 emit_insn (gen (out, in0, lane));
9969 return true;
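/* Fall back to a TBL-based permutation for the general case. */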
9972 static bool
9973 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9975 rtx rperm[MAX_VECT_LEN], sel;
9976 machine_mode vmode = d->vmode;
9977 unsigned int i, nelt = d->nelt;
9979 if (d->testing_p)
9980 return true;
9982 /* Generic code will try constant permutation twice: once with the
9983 original mode and again with the elements lowered to QImode.
9984 So wait and don't do the selector expansion ourselves. */
9985 if (vmode != V8QImode && vmode != V16QImode)
9986 return false;
9988 for (i = 0; i < nelt; ++i)
9990 int nunits = GET_MODE_NUNITS (vmode);
9992 /* If big-endian and two vectors we end up with a weird mixed-endian
9993 mode on NEON. Reverse the index within each word but not the word
9994 itself. */
9995 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9996 : d->perm[i]);
9998 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9999 sel = force_reg (vmode, sel);
10001 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10002 return true;
10005 static bool
10006 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10008 /* The pattern matching functions above are written to look for a small
10009 number to begin the sequence (0, 1, N/2). If we begin with an index
10010 from the second operand, we can swap the operands. */
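/* For example, a V4SImode selector of { 5, 1, 6, 2 } starts in the second
   operand, so it is rewritten as { 1, 5, 2, 6 } with the operands swapped
   before the matchers below run. */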
10011 if (d->perm[0] >= d->nelt)
10013 unsigned i, nelt = d->nelt;
10015 gcc_assert (nelt == (nelt & -nelt));
10016 for (i = 0; i < nelt; ++i)
10017 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10019 std::swap (d->op0, d->op1);
10022 if (TARGET_SIMD)
10024 if (aarch64_evpc_rev (d))
10025 return true;
10026 else if (aarch64_evpc_ext (d))
10027 return true;
10028 else if (aarch64_evpc_dup (d))
10029 return true;
10030 else if (aarch64_evpc_zip (d))
10031 return true;
10032 else if (aarch64_evpc_uzp (d))
10033 return true;
10034 else if (aarch64_evpc_trn (d))
10035 return true;
10036 return aarch64_evpc_tbl (d);
10038 return false;
10041 /* Expand a vec_perm_const pattern. */
10043 bool
10044 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10046 struct expand_vec_perm_d d;
10047 int i, nelt, which;
10049 d.target = target;
10050 d.op0 = op0;
10051 d.op1 = op1;
10053 d.vmode = GET_MODE (target);
10054 gcc_assert (VECTOR_MODE_P (d.vmode));
10055 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10056 d.testing_p = false;
10058 for (i = which = 0; i < nelt; ++i)
10060 rtx e = XVECEXP (sel, 0, i);
10061 int ei = INTVAL (e) & (2 * nelt - 1);
10062 which |= (ei < nelt ? 1 : 2);
10063 d.perm[i] = ei;
10066 switch (which)
10068 default:
10069 gcc_unreachable ();
10071 case 3:
10072 d.one_vector_p = false;
10073 if (!rtx_equal_p (op0, op1))
10074 break;
10076 /* The elements of PERM do not suggest that only the first operand
10077 is used, but both operands are identical. Allow easier matching
10078 of the permutation by folding the permutation into the single
10079 input vector. */
10080 /* Fall Through. */
10081 case 2:
10082 for (i = 0; i < nelt; ++i)
10083 d.perm[i] &= nelt - 1;
10084 d.op0 = op1;
10085 d.one_vector_p = true;
10086 break;
10088 case 1:
10089 d.op1 = op0;
10090 d.one_vector_p = true;
10091 break;
10094 return aarch64_expand_vec_perm_const_1 (&d);
10097 static bool
10098 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10099 const unsigned char *sel)
10101 struct expand_vec_perm_d d;
10102 unsigned int i, nelt, which;
10103 bool ret;
10105 d.vmode = vmode;
10106 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10107 d.testing_p = true;
10108 memcpy (d.perm, sel, nelt);
10110 /* Calculate whether all elements are in one vector. */
10111 for (i = which = 0; i < nelt; ++i)
10113 unsigned char e = d.perm[i];
10114 gcc_assert (e < 2 * nelt);
10115 which |= (e < nelt ? 1 : 2);
10118 /* If all elements are from the second vector, reindex as if from the
10119 first vector. */
10120 if (which == 2)
10121 for (i = 0; i < nelt; ++i)
10122 d.perm[i] -= nelt;
10124 /* Check whether the mask can be applied to a single vector. */
10125 d.one_vector_p = (which != 3);
10127 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10128 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10129 if (!d.one_vector_p)
10130 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10132 start_sequence ();
10133 ret = aarch64_expand_vec_perm_const_1 (&d);
10134 end_sequence ();
10136 return ret;
10140 aarch64_reverse_mask (enum machine_mode mode)
10142 /* We have to reverse each vector because we don't have
10143 a permuted load that can reverse-load according to ABI rules. */
10144 rtx mask;
10145 rtvec v = rtvec_alloc (16);
10146 int i, j;
10147 int nunits = GET_MODE_NUNITS (mode);
10148 int usize = GET_MODE_UNIT_SIZE (mode);
10150 gcc_assert (BYTES_BIG_ENDIAN);
10151 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10153 for (i = 0; i < nunits; i++)
10154 for (j = 0; j < usize; j++)
10155 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10156 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10157 return force_reg (V16QImode, mask);
10160 /* Implement MODES_TIEABLE_P. */
10162 bool
10163 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10165 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10166 return true;
10168 /* We specifically want to allow elements of "structure" modes to
10169 be tieable to the structure. This more general condition allows
10170 other rarer situations too. */
10171 if (TARGET_SIMD
10172 && aarch64_vector_mode_p (mode1)
10173 && aarch64_vector_mode_p (mode2))
10174 return true;
10176 return false;
10179 /* Return a new RTX holding the result of moving POINTER forward by
10180 AMOUNT bytes. */
10182 static rtx
10183 aarch64_move_pointer (rtx pointer, int amount)
10185 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10187 return adjust_automodify_address (pointer, GET_MODE (pointer),
10188 next, amount);
10191 /* Return a new RTX holding the result of moving POINTER forward by the
10192 size of the mode it points to. */
10194 static rtx
10195 aarch64_progress_pointer (rtx pointer)
10197 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10199 return aarch64_move_pointer (pointer, amount);
10202 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST by
10203 the size of MODE. */
10205 static void
10206 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10207 machine_mode mode)
10209 rtx reg = gen_reg_rtx (mode);
10211 /* "Cast" the pointers to the correct mode. */
10212 *src = adjust_address (*src, mode, 0);
10213 *dst = adjust_address (*dst, mode, 0);
10214 /* Emit the memcpy. */
10215 emit_move_insn (reg, *src);
10216 emit_move_insn (*dst, reg);
10217 /* Move the pointers forward. */
10218 *src = aarch64_progress_pointer (*src);
10219 *dst = aarch64_progress_pointer (*dst);
10222 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10223 we succeed, otherwise return false. */
10225 bool
10226 aarch64_expand_movmem (rtx *operands)
10228 unsigned int n;
10229 rtx dst = operands[0];
10230 rtx src = operands[1];
10231 rtx base;
10232 bool speed_p = !optimize_function_for_size_p (cfun);
10234 /* When optimizing for size, give a better estimate of the length of a
10235 memcpy call, but use the default otherwise. */
10236 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10238 /* We can't do anything smart if the amount to copy is not constant. */
10239 if (!CONST_INT_P (operands[2]))
10240 return false;
10242 n = UINTVAL (operands[2]);
10244 /* Try to keep the number of instructions low. For cases below 16 bytes we
10245 need to make at most two moves. For cases above 16 bytes it will be one
10246 move for each 16 byte chunk, then at most two additional moves. */
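/* For a hypothetical 35-byte copy, for instance, the estimate below is
   35 / 16 + 2 == 4 moves, which is within the speed-optimised limit of 7,
   so the expansion goes ahead. */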
10247 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10248 return false;
10250 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10251 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10253 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10254 src = adjust_automodify_address (src, VOIDmode, base, 0);
10256 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10257 1-byte chunk. */
10258 if (n < 4)
10260 if (n >= 2)
10262 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10263 n -= 2;
10266 if (n == 1)
10267 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10269 return true;
10272 /* Copy 4-7 bytes. First a 4-byte chunk, then (if applicable) a second
10273 4-byte chunk, partially overlapping with the previously copied chunk. */
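/* For a hypothetical 7-byte copy this emits an SImode copy of bytes 0-3
   followed by an overlapping SImode copy of bytes 3-6. */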
10274 if (n < 8)
10276 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10277 n -= 4;
10278 if (n > 0)
10280 int move = n - 4;
10282 src = aarch64_move_pointer (src, move);
10283 dst = aarch64_move_pointer (dst, move);
10284 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10286 return true;
10289 /* Copy 8 or more bytes. Copy chunks of 16 bytes until we run out of
10290 them, then (if applicable) an 8-byte chunk. */
10291 while (n >= 8)
10293 if (n / 16)
10295 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10296 n -= 16;
10298 else
10300 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10301 n -= 8;
10305 /* Finish the final bytes of the copy. We can always do this in one
10306 instruction. We either copy the exact amount we need, or partially
10307 overlap with the previous chunk we copied and copy 4 or 8 bytes. */
10308 if (n == 0)
10309 return true;
10310 else if (n == 1)
10311 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10312 else if (n == 2)
10313 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10314 else if (n == 4)
10315 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10316 else
10318 if (n == 3)
10320 src = aarch64_move_pointer (src, -1);
10321 dst = aarch64_move_pointer (dst, -1);
10322 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10324 else
10326 int move = n - 8;
10328 src = aarch64_move_pointer (src, move);
10329 dst = aarch64_move_pointer (dst, move);
10330 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10334 return true;
10337 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
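/* Under the usual AddressSanitizer mapping with a shadow scale of 3, a
   shadow address is computed as (addr >> 3) + (1 << 36); only that constant
   offset is supplied here. */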
10339 static unsigned HOST_WIDE_INT
10340 aarch64_asan_shadow_offset (void)
10342 return (HOST_WIDE_INT_1 << 36);
10345 static bool
10346 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10347 unsigned int align,
10348 enum by_pieces_operation op,
10349 bool speed_p)
10351 /* STORE_BY_PIECES can be used when copying a constant string, but
10352 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10353 For now we always fail this and let the move_by_pieces code copy
10354 the string from read-only memory. */
10355 if (op == STORE_BY_PIECES)
10356 return false;
10358 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
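/* Map comparison CODE to the CC mode used by the conditional-compare
   expansion below, or return CCmode if CODE is not handled. */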
10361 static enum machine_mode
10362 aarch64_code_to_ccmode (enum rtx_code code)
10364 switch (code)
10366 case NE:
10367 return CC_DNEmode;
10369 case EQ:
10370 return CC_DEQmode;
10372 case LE:
10373 return CC_DLEmode;
10375 case LT:
10376 return CC_DLTmode;
10378 case GE:
10379 return CC_DGEmode;
10381 case GT:
10382 return CC_DGTmode;
10384 case LEU:
10385 return CC_DLEUmode;
10387 case LTU:
10388 return CC_DLTUmode;
10390 case GEU:
10391 return CC_DGEUmode;
10393 case GTU:
10394 return CC_DGTUmode;
10396 default:
10397 return CCmode;
10401 static rtx
10402 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10403 int code, tree treeop0, tree treeop1)
10405 enum machine_mode op_mode, cmp_mode, cc_mode;
10406 rtx op0, op1, cmp, target;
10407 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10408 enum insn_code icode;
10409 struct expand_operand ops[4];
10411 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10412 if (cc_mode == CCmode)
10413 return NULL_RTX;
10415 start_sequence ();
10416 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10418 op_mode = GET_MODE (op0);
10419 if (op_mode == VOIDmode)
10420 op_mode = GET_MODE (op1);
10422 switch (op_mode)
10424 case QImode:
10425 case HImode:
10426 case SImode:
10427 cmp_mode = SImode;
10428 icode = CODE_FOR_cmpsi;
10429 break;
10431 case DImode:
10432 cmp_mode = DImode;
10433 icode = CODE_FOR_cmpdi;
10434 break;
10436 default:
10437 end_sequence ();
10438 return NULL_RTX;
10441 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10442 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10443 if (!op0 || !op1)
10445 end_sequence ();
10446 return NULL_RTX;
10448 *prep_seq = get_insns ();
10449 end_sequence ();
10451 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10452 target = gen_rtx_REG (CCmode, CC_REGNUM);
10454 create_output_operand (&ops[0], target, CCmode);
10455 create_fixed_operand (&ops[1], cmp);
10456 create_fixed_operand (&ops[2], op0);
10457 create_fixed_operand (&ops[3], op1);
10459 start_sequence ();
10460 if (!maybe_expand_insn (icode, 4, ops))
10462 end_sequence ();
10463 return NULL_RTX;
10465 *gen_seq = get_insns ();
10466 end_sequence ();
10468 return gen_rtx_REG (cc_mode, CC_REGNUM);
10471 static rtx
10472 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10473 tree treeop0, tree treeop1, int bit_code)
10475 rtx op0, op1, cmp0, cmp1, target;
10476 enum machine_mode op_mode, cmp_mode, cc_mode;
10477 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10478 enum insn_code icode = CODE_FOR_ccmp_andsi;
10479 struct expand_operand ops[6];
10481 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10482 if (cc_mode == CCmode)
10483 return NULL_RTX;
10485 push_to_sequence ((rtx_insn*) *prep_seq);
10486 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10488 op_mode = GET_MODE (op0);
10489 if (op_mode == VOIDmode)
10490 op_mode = GET_MODE (op1);
10492 switch (op_mode)
10494 case QImode:
10495 case HImode:
10496 case SImode:
10497 cmp_mode = SImode;
10498 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10499 : CODE_FOR_ccmp_iorsi;
10500 break;
10502 case DImode:
10503 cmp_mode = DImode;
10504 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10505 : CODE_FOR_ccmp_iordi;
10506 break;
10508 default:
10509 end_sequence ();
10510 return NULL_RTX;
10513 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10514 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10515 if (!op0 || !op1)
10517 end_sequence ();
10518 return NULL_RTX;
10520 *prep_seq = get_insns ();
10521 end_sequence ();
10523 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10524 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10525 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10527 create_fixed_operand (&ops[0], prev);
10528 create_fixed_operand (&ops[1], target);
10529 create_fixed_operand (&ops[2], op0);
10530 create_fixed_operand (&ops[3], op1);
10531 create_fixed_operand (&ops[4], cmp0);
10532 create_fixed_operand (&ops[5], cmp1);
10534 push_to_sequence ((rtx_insn*) *gen_seq);
10535 if (!maybe_expand_insn (icode, 6, ops))
10537 end_sequence ();
10538 return NULL_RTX;
10541 *gen_seq = get_insns ();
10542 end_sequence ();
10544 return target;
10547 #undef TARGET_GEN_CCMP_FIRST
10548 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10550 #undef TARGET_GEN_CCMP_NEXT
10551 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10553 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10554 instruction fusion of some sort. */
10556 static bool
10557 aarch64_macro_fusion_p (void)
10559 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10563 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10564 should be kept together during scheduling. */
10566 static bool
10567 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10569 rtx set_dest;
10570 rtx prev_set = single_set (prev);
10571 rtx curr_set = single_set (curr);
10572 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10573 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10575 if (!aarch64_macro_fusion_p ())
10576 return false;
10578 if (simple_sets_p
10579 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10581 /* We are trying to match:
10582 prev (mov) == (set (reg r0) (const_int imm16))
10583 curr (movk) == (set (zero_extract (reg r0)
10584 (const_int 16)
10585 (const_int 16))
10586 (const_int imm16_1)) */
10588 set_dest = SET_DEST (curr_set);
10590 if (GET_CODE (set_dest) == ZERO_EXTRACT
10591 && CONST_INT_P (SET_SRC (curr_set))
10592 && CONST_INT_P (SET_SRC (prev_set))
10593 && CONST_INT_P (XEXP (set_dest, 2))
10594 && INTVAL (XEXP (set_dest, 2)) == 16
10595 && REG_P (XEXP (set_dest, 0))
10596 && REG_P (SET_DEST (prev_set))
10597 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10599 return true;
10603 if (simple_sets_p
10604 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10607 /* We're trying to match:
10608 prev (adrp) == (set (reg r1)
10609 (high (symbol_ref ("SYM"))))
10610 curr (add) == (set (reg r0)
10611 (lo_sum (reg r1)
10612 (symbol_ref ("SYM"))))
10613 Note that r0 need not necessarily be the same as r1, especially
10614 during pre-regalloc scheduling. */
10616 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10617 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10619 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10620 && REG_P (XEXP (SET_SRC (curr_set), 0))
10621 && REGNO (XEXP (SET_SRC (curr_set), 0))
10622 == REGNO (SET_DEST (prev_set))
10623 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10624 XEXP (SET_SRC (curr_set), 1)))
10625 return true;
10629 if (simple_sets_p
10630 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10633 /* We're trying to match:
10634 prev (movk) == (set (zero_extract (reg r0)
10635 (const_int 16)
10636 (const_int 32))
10637 (const_int imm16_1))
10638 curr (movk) == (set (zero_extract (reg r0)
10639 (const_int 16)
10640 (const_int 48))
10641 (const_int imm16_2)) */
10643 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10644 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10645 && REG_P (XEXP (SET_DEST (prev_set), 0))
10646 && REG_P (XEXP (SET_DEST (curr_set), 0))
10647 && REGNO (XEXP (SET_DEST (prev_set), 0))
10648 == REGNO (XEXP (SET_DEST (curr_set), 0))
10649 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10650 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10651 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10652 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10653 && CONST_INT_P (SET_SRC (prev_set))
10654 && CONST_INT_P (SET_SRC (curr_set)))
10655 return true;
10658 if (simple_sets_p
10659 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10661 /* We're trying to match:
10662 prev (adrp) == (set (reg r0)
10663 (high (symbol_ref ("SYM"))))
10664 curr (ldr) == (set (reg r1)
10665 (mem (lo_sum (reg r0)
10666 (symbol_ref ("SYM")))))
10668 curr (ldr) == (set (reg r1)
10669 (zero_extend (mem
10670 (lo_sum (reg r0)
10671 (symbol_ref ("SYM")))))) */
10672 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10673 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10675 rtx curr_src = SET_SRC (curr_set);
10677 if (GET_CODE (curr_src) == ZERO_EXTEND)
10678 curr_src = XEXP (curr_src, 0);
10680 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10681 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10682 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10683 == REGNO (SET_DEST (prev_set))
10684 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10685 XEXP (SET_SRC (prev_set), 0)))
10686 return true;
10690 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10691 && any_condjump_p (curr))
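/* We are trying to fuse a flag-setting arithmetic or logical instruction
with the conditional branch that consumes the flags. */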
10693 enum attr_type prev_type = get_attr_type (prev);
10695 /* FIXME: this misses some instructions that ThunderX considers to be simple
10696 arithmetic instructions. Simple shifts are missed here. */
10697 if (prev_type == TYPE_ALUS_SREG
10698 || prev_type == TYPE_ALUS_IMM
10699 || prev_type == TYPE_LOGICS_REG
10700 || prev_type == TYPE_LOGICS_IMM)
10701 return true;
10704 return false;
10707 /* If MEM is in the form of [base+offset], extract the two parts of the
10708 address and store them in BASE and OFFSET; otherwise clear BASE and
10709 OFFSET and return false. */
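/* This helper is used both by the scheduling fusion code and by the ldp/stp
support below. */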
10711 bool
10712 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10714 rtx addr;
10716 gcc_assert (MEM_P (mem));
10718 addr = XEXP (mem, 0);
10720 if (REG_P (addr))
10722 *base = addr;
10723 *offset = const0_rtx;
10724 return true;
10727 if (GET_CODE (addr) == PLUS
10728 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10730 *base = XEXP (addr, 0);
10731 *offset = XEXP (addr, 1);
10732 return true;
10735 *base = NULL_RTX;
10736 *offset = NULL_RTX;
10738 return false;
10741 /* Types for scheduling fusion. */
10742 enum sched_fusion_type
10744 SCHED_FUSION_NONE = 0,
10745 SCHED_FUSION_LD_SIGN_EXTEND,
10746 SCHED_FUSION_LD_ZERO_EXTEND,
10747 SCHED_FUSION_LD,
10748 SCHED_FUSION_ST,
10749 SCHED_FUSION_NUM
10752 /* If INSN is a load or store whose address is in the form of [base+offset],
10753 extract the two parts into BASE and OFFSET. Return the scheduling fusion
10754 type of INSN. */
10756 static enum sched_fusion_type
10757 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10759 rtx x, dest, src;
10760 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10762 gcc_assert (INSN_P (insn));
10763 x = PATTERN (insn);
10764 if (GET_CODE (x) != SET)
10765 return SCHED_FUSION_NONE;
10767 src = SET_SRC (x);
10768 dest = SET_DEST (x);
10770 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10771 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10772 return SCHED_FUSION_NONE;
10774 if (GET_CODE (src) == SIGN_EXTEND)
10776 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10777 src = XEXP (src, 0);
10778 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10779 return SCHED_FUSION_NONE;
10781 else if (GET_CODE (src) == ZERO_EXTEND)
10783 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10784 src = XEXP (src, 0);
10785 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10786 return SCHED_FUSION_NONE;
10789 if (GET_CODE (src) == MEM && REG_P (dest))
10790 extract_base_offset_in_addr (src, base, offset);
10791 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10793 fusion = SCHED_FUSION_ST;
10794 extract_base_offset_in_addr (dest, base, offset);
10796 else
10797 return SCHED_FUSION_NONE;
10799 if (*base == NULL_RTX || *offset == NULL_RTX)
10800 fusion = SCHED_FUSION_NONE;
10802 return fusion;
10805 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10807 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10808 and PRI are only calculated for these instructions. For other instructions,
10809 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
10810 other instruction types can be added by returning different priorities.
10812 It's important that irrelevant instructions get the largest FUSION_PRI. */
10814 static void
10815 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10816 int *fusion_pri, int *pri)
10818 int tmp, off_val;
10819 rtx base, offset;
10820 enum sched_fusion_type fusion;
10822 gcc_assert (INSN_P (insn));
10824 tmp = max_pri - 1;
10825 fusion = fusion_load_store (insn, &base, &offset);
10826 if (fusion == SCHED_FUSION_NONE)
10828 *pri = tmp;
10829 *fusion_pri = tmp;
10830 return;
10833 /* Set FUSION_PRI according to fusion type and base register. */
10834 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
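/* Accesses with the same fusion type and the same base register get the same
FUSION_PRI, so the scheduler can group them together. */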
10836 /* Calculate PRI. */
10837 tmp /= 2;
10839 /* An INSN with a smaller offset goes first. */
10840 off_val = (int)(INTVAL (offset));
10841 if (off_val >= 0)
10842 tmp -= (off_val & 0xfffff);
10843 else
10844 tmp += ((- off_val) & 0xfffff);
10846 *pri = tmp;
10847 return;
10850 /* Given OPERANDS of consecutive load/store instructions, check if we can
10851 merge them into an ldp/stp. LOAD is true if they are load instructions.
10852 MODE is the mode of the memory operands. */
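/* This check is used by the ldp/stp peephole patterns; a variant that first
adjusts the offsets is provided further below. */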
10854 bool
10855 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10856 enum machine_mode mode)
10858 HOST_WIDE_INT offval_1, offval_2, msize;
10859 enum reg_class rclass_1, rclass_2;
10860 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10862 if (load)
10864 mem_1 = operands[1];
10865 mem_2 = operands[3];
10866 reg_1 = operands[0];
10867 reg_2 = operands[2];
10868 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10869 if (REGNO (reg_1) == REGNO (reg_2))
10870 return false;
10872 else
10874 mem_1 = operands[0];
10875 mem_2 = operands[2];
10876 reg_1 = operands[1];
10877 reg_2 = operands[3];
10880 /* The mems cannot be volatile. */
10881 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10882 return false;
10884 /* Check if the addresses are in the form of [base+offset]. */
10885 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10886 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10887 return false;
10888 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10889 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10890 return false;
10892 /* Check if the bases are the same. */
10893 if (!rtx_equal_p (base_1, base_2))
10894 return false;
10896 offval_1 = INTVAL (offset_1);
10897 offval_2 = INTVAL (offset_2);
10898 msize = GET_MODE_SIZE (mode);
10899 /* Check if the offsets are consecutive. */
10900 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10901 return false;
10903 /* Check if the addresses are clobbered by the loads. */
10904 if (load)
10906 if (reg_mentioned_p (reg_1, mem_1))
10907 return false;
10909 /* In increasing order, the last load can clobber the address. */
10910 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10911 return false;
10914 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10915 rclass_1 = FP_REGS;
10916 else
10917 rclass_1 = GENERAL_REGS;
10919 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10920 rclass_2 = FP_REGS;
10921 else
10922 rclass_2 = GENERAL_REGS;
10924 /* Check if the registers are of the same class. */
10925 if (rclass_1 != rclass_2)
10926 return false;
10928 return true;
10931 /* Given OPERANDS of consecutive load/store instructions, check if we can
10932 merge them into an ldp/stp by adjusting the offset. LOAD is true if they
10933 are load instructions. MODE is the mode of the memory operands.
10935 Given the following consecutive stores:
10937 str w1, [xb, 0x100]
10938 str w1, [xb, 0x104]
10939 str w1, [xb, 0x108]
10940 str w1, [xb, 0x10c]
10942 Though the offsets are out of the range supported by stp, we can
10943 still pair them after adjusting the offset, like:
10945 add scratch, xb, 0x100
10946 stp w1, w1, [scratch]
10947 stp w1, w1, [scratch, 0x8]
10949 The peephole patterns detecting this opportunity should guarantee that
10950 the scratch register is available. */
10952 bool
10953 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
10954 enum machine_mode mode)
10956 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
10957 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
10958 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
10959 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
10961 if (load)
10963 reg_1 = operands[0];
10964 mem_1 = operands[1];
10965 reg_2 = operands[2];
10966 mem_2 = operands[3];
10967 reg_3 = operands[4];
10968 mem_3 = operands[5];
10969 reg_4 = operands[6];
10970 mem_4 = operands[7];
10971 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
10972 && REG_P (reg_3) && REG_P (reg_4));
10973 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
10974 return false;
10976 else
10978 mem_1 = operands[0];
10979 reg_1 = operands[1];
10980 mem_2 = operands[2];
10981 reg_2 = operands[3];
10982 mem_3 = operands[4];
10983 reg_3 = operands[5];
10984 mem_4 = operands[6];
10985 reg_4 = operands[7];
10987 /* Skip if the memory operand is by itself valid for ldp/stp. */
10988 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
10989 return false;
10991 /* The mems cannot be volatile. */
10992 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
10993 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
10994 return false;
10996 /* Check if the addresses are in the form of [base+offset]. */
10997 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10998 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10999 return false;
11000 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11001 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11002 return false;
11003 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11004 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11005 return false;
11006 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11007 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11008 return false;
11011 /* Check if the bases are the same. */
11011 if (!rtx_equal_p (base_1, base_2)
11012 || !rtx_equal_p (base_2, base_3)
11013 || !rtx_equal_p (base_3, base_4))
11014 return false;
11016 offval_1 = INTVAL (offset_1);
11017 offval_2 = INTVAL (offset_2);
11018 offval_3 = INTVAL (offset_3);
11019 offval_4 = INTVAL (offset_4);
11020 msize = GET_MODE_SIZE (mode);
11021 /* Check if the offsets are consecutive. */
11022 if ((offval_1 != (offval_2 + msize)
11023 || offval_1 != (offval_3 + msize * 2)
11024 || offval_1 != (offval_4 + msize * 3))
11025 && (offval_4 != (offval_3 + msize)
11026 || offval_4 != (offval_2 + msize * 2)
11027 || offval_4 != (offval_1 + msize * 3)))
11028 return false;
11030 /* Check if the addresses are clobbered by the loads. */
11031 if (load)
11033 if (reg_mentioned_p (reg_1, mem_1)
11034 || reg_mentioned_p (reg_2, mem_2)
11035 || reg_mentioned_p (reg_3, mem_3))
11036 return false;
11038 /* In increasing order, the last load can clobber the address. */
11039 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11040 return false;
11043 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11044 rclass_1 = FP_REGS;
11045 else
11046 rclass_1 = GENERAL_REGS;
11048 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11049 rclass_2 = FP_REGS;
11050 else
11051 rclass_2 = GENERAL_REGS;
11053 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11054 rclass_3 = FP_REGS;
11055 else
11056 rclass_3 = GENERAL_REGS;
11058 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11059 rclass_4 = FP_REGS;
11060 else
11061 rclass_4 = GENERAL_REGS;
11063 /* Check if the registers are of the same class. */
11064 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11065 return false;
11067 return true;
11070 /* Given OPERANDS of consecutive load/store instructions, this function pairs
11071 them into ldp/stp after adjusting the offset. It depends on the fact
11072 that the addresses of the load/store instructions are in increasing order.
11073 MODE is the mode of the memory operands. CODE is the rtl operator
11074 which should be applied to all memory operands; it is SIGN_EXTEND,
11075 ZERO_EXTEND or UNKNOWN. */
11077 bool
11078 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11079 enum machine_mode mode, RTX_CODE code)
11081 rtx base, offset, t1, t2;
11082 rtx mem_1, mem_2, mem_3, mem_4;
11083 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11085 if (load)
11087 mem_1 = operands[1];
11088 mem_2 = operands[3];
11089 mem_3 = operands[5];
11090 mem_4 = operands[7];
11092 else
11094 mem_1 = operands[0];
11095 mem_2 = operands[2];
11096 mem_3 = operands[4];
11097 mem_4 = operands[6];
11098 gcc_assert (code == UNKNOWN);
11101 extract_base_offset_in_addr (mem_1, &base, &offset);
11102 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11104 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11105 msize = GET_MODE_SIZE (mode);
11106 stp_off_limit = msize * 0x40;
11107 off_val = INTVAL (offset);
11108 abs_off = (off_val < 0) ? -off_val : off_val;
11109 new_off = abs_off % stp_off_limit;
11110 adj_off = abs_off - new_off;
11112 /* Further adjust to make sure all offsets are OK. */
11113 if ((new_off + msize * 2) >= stp_off_limit)
11115 adj_off += stp_off_limit;
11116 new_off -= stp_off_limit;
11119 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11120 if (adj_off >= 0x1000)
11121 return false;
11123 if (off_val < 0)
11125 adj_off = -adj_off;
11126 new_off = -new_off;
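/* For example (illustrative values, matching the SImode stores shown in the
comment above aarch64_operands_adjust_ok_for_ldpstp): msize == 4, so
stp_off_limit == 0x100; with off_val == 0x100 we get adj_off == 0x100 and
new_off == 0, i.e. the base is advanced by 0x100 and the four accesses use
offsets 0, 4, 8 and 12 from the scratch register. */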
11129 /* Create new memory references. */
11130 mem_1 = change_address (mem_1, VOIDmode,
11131 plus_constant (DImode, operands[8], new_off));
11133 /* Check if the adjusted address is OK for ldp/stp. */
11134 if (!aarch64_mem_pair_operand (mem_1, mode))
11135 return false;
11137 msize = GET_MODE_SIZE (mode);
11138 mem_2 = change_address (mem_2, VOIDmode,
11139 plus_constant (DImode,
11140 operands[8],
11141 new_off + msize));
11142 mem_3 = change_address (mem_3, VOIDmode,
11143 plus_constant (DImode,
11144 operands[8],
11145 new_off + msize * 2));
11146 mem_4 = change_address (mem_4, VOIDmode,
11147 plus_constant (DImode,
11148 operands[8],
11149 new_off + msize * 3));
11151 if (code == ZERO_EXTEND)
11153 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11154 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11155 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11156 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11158 else if (code == SIGN_EXTEND)
11160 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11161 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11162 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11163 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11166 if (load)
11168 operands[1] = mem_1;
11169 operands[3] = mem_2;
11170 operands[5] = mem_3;
11171 operands[7] = mem_4;
11173 else
11175 operands[0] = mem_1;
11176 operands[2] = mem_2;
11177 operands[4] = mem_3;
11178 operands[6] = mem_4;
11181 /* Emit adjusting instruction. */
11182 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11183 plus_constant (DImode, base, adj_off)));
11184 /* Emit ldp/stp instructions. */
11185 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11186 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11187 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11188 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11189 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11190 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11191 return true;
11194 #undef TARGET_ADDRESS_COST
11195 #define TARGET_ADDRESS_COST aarch64_address_cost
11197 /* This hook determines whether unnamed bitfields affect the alignment
11198 of the containing structure. The hook returns true if the structure
11199 should inherit the alignment requirements of an unnamed bitfield's
11200 type. */
11201 #undef TARGET_ALIGN_ANON_BITFIELD
11202 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11204 #undef TARGET_ASM_ALIGNED_DI_OP
11205 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11207 #undef TARGET_ASM_ALIGNED_HI_OP
11208 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11210 #undef TARGET_ASM_ALIGNED_SI_OP
11211 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11213 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11214 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11215 hook_bool_const_tree_hwi_hwi_const_tree_true
11217 #undef TARGET_ASM_FILE_START
11218 #define TARGET_ASM_FILE_START aarch64_start_file
11220 #undef TARGET_ASM_OUTPUT_MI_THUNK
11221 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11223 #undef TARGET_ASM_SELECT_RTX_SECTION
11224 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11226 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11227 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11229 #undef TARGET_BUILD_BUILTIN_VA_LIST
11230 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11232 #undef TARGET_CALLEE_COPIES
11233 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11235 #undef TARGET_CAN_ELIMINATE
11236 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11238 #undef TARGET_CANNOT_FORCE_CONST_MEM
11239 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11241 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11242 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11244 /* Only the least significant bit is used for initialization guard
11245 variables. */
11246 #undef TARGET_CXX_GUARD_MASK_BIT
11247 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11249 #undef TARGET_C_MODE_FOR_SUFFIX
11250 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11252 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11253 #undef TARGET_DEFAULT_TARGET_FLAGS
11254 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11255 #endif
11257 #undef TARGET_CLASS_MAX_NREGS
11258 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11260 #undef TARGET_BUILTIN_DECL
11261 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11263 #undef TARGET_EXPAND_BUILTIN
11264 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11266 #undef TARGET_EXPAND_BUILTIN_VA_START
11267 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11269 #undef TARGET_FOLD_BUILTIN
11270 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11272 #undef TARGET_FUNCTION_ARG
11273 #define TARGET_FUNCTION_ARG aarch64_function_arg
11275 #undef TARGET_FUNCTION_ARG_ADVANCE
11276 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11278 #undef TARGET_FUNCTION_ARG_BOUNDARY
11279 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11281 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11282 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11284 #undef TARGET_FUNCTION_VALUE
11285 #define TARGET_FUNCTION_VALUE aarch64_function_value
11287 #undef TARGET_FUNCTION_VALUE_REGNO_P
11288 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11290 #undef TARGET_FRAME_POINTER_REQUIRED
11291 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11293 #undef TARGET_GIMPLE_FOLD_BUILTIN
11294 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11296 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11297 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11299 #undef TARGET_INIT_BUILTINS
11300 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11302 #undef TARGET_LEGITIMATE_ADDRESS_P
11303 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11305 #undef TARGET_LEGITIMATE_CONSTANT_P
11306 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11308 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11309 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11311 #undef TARGET_LRA_P
11312 #define TARGET_LRA_P hook_bool_void_true
11314 #undef TARGET_MANGLE_TYPE
11315 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11317 #undef TARGET_MEMORY_MOVE_COST
11318 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11320 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11321 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11323 #undef TARGET_MUST_PASS_IN_STACK
11324 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11326 /* This target hook should return true if accesses to volatile bitfields
11327 should use the narrowest mode possible. It should return false if these
11328 accesses should use the bitfield container type. */
11329 #undef TARGET_NARROW_VOLATILE_BITFIELD
11330 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11332 #undef TARGET_OPTION_OVERRIDE
11333 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11335 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11336 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11337 aarch64_override_options_after_change
11339 #undef TARGET_PASS_BY_REFERENCE
11340 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11342 #undef TARGET_PREFERRED_RELOAD_CLASS
11343 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11345 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11346 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11348 #undef TARGET_SECONDARY_RELOAD
11349 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11351 #undef TARGET_SHIFT_TRUNCATION_MASK
11352 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11354 #undef TARGET_SETUP_INCOMING_VARARGS
11355 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11357 #undef TARGET_STRUCT_VALUE_RTX
11358 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11360 #undef TARGET_REGISTER_MOVE_COST
11361 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11363 #undef TARGET_RETURN_IN_MEMORY
11364 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11366 #undef TARGET_RETURN_IN_MSB
11367 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11369 #undef TARGET_RTX_COSTS
11370 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11372 #undef TARGET_SCHED_ISSUE_RATE
11373 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11375 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11376 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11377 aarch64_sched_first_cycle_multipass_dfa_lookahead
11379 #undef TARGET_TRAMPOLINE_INIT
11380 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11382 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11383 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11385 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11386 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11388 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11389 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11391 #undef TARGET_VECTORIZE_ADD_STMT_COST
11392 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11394 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11395 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11396 aarch64_builtin_vectorization_cost
11398 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11399 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11401 #undef TARGET_VECTORIZE_BUILTINS
11402 #define TARGET_VECTORIZE_BUILTINS
11404 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11405 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11406 aarch64_builtin_vectorized_function
11408 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11409 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11410 aarch64_autovectorize_vector_sizes
11412 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11413 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11414 aarch64_atomic_assign_expand_fenv
11416 /* Section anchor support. */
11418 #undef TARGET_MIN_ANCHOR_OFFSET
11419 #define TARGET_MIN_ANCHOR_OFFSET -256
11421 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11422 byte offset; we can do much more for larger data types, but have no way
11423 to determine the size of the access. We assume accesses are aligned. */
11424 #undef TARGET_MAX_ANCHOR_OFFSET
11425 #define TARGET_MAX_ANCHOR_OFFSET 4095
11427 #undef TARGET_VECTOR_ALIGNMENT
11428 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11430 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11431 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11432 aarch64_simd_vector_alignment_reachable
11434 /* vec_perm support. */
11436 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11437 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11438 aarch64_vectorize_vec_perm_const_ok
11441 #undef TARGET_FIXED_CONDITION_CODE_REGS
11442 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11444 #undef TARGET_FLAGS_REGNUM
11445 #define TARGET_FLAGS_REGNUM CC_REGNUM
11447 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11448 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11450 #undef TARGET_ASAN_SHADOW_OFFSET
11451 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11453 #undef TARGET_LEGITIMIZE_ADDRESS
11454 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11456 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11457 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11458 aarch64_use_by_pieces_infrastructure_p
11460 #undef TARGET_CAN_USE_DOLOOP_P
11461 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11463 #undef TARGET_SCHED_MACRO_FUSION_P
11464 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11466 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11467 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11469 #undef TARGET_SCHED_FUSION_PRIORITY
11470 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11472 struct gcc_target targetm = TARGET_INITIALIZER;
11474 #include "gt-aarch64.h"