[AArch64] Fix aarch64_rtx_costs of PLUS/MINUS
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
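/* Illustrative assembly forms of the address kinds above (examples
   only, not an exhaustive list):
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr x0, .LC0  (PC-relative literal load).  */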
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Cortex-A57 costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* XGene-1 costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
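/* Each AARCH64_FUSE_* bit names a pair of adjacent instructions that a
   core may fuse into a single macro-op, for example:
     MOV_MOVK    mov  x0, #0x1234   +  movk x0, #0x5678, lsl #16
     ADRP_ADD    adrp x0, sym       +  add  x0, x0, #:lo12:sym
     ADRP_LDR    adrp x0, sym       +  ldr  x1, [x0, #:lo12:sym]
     CMP_BRANCH  cmp  x0, #1        +  b.ne target
   (illustrative sequences only).  */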
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as command-line arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
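/* These are the immediates accepted by the logical instructions
   (AND, ORR, EOR and their variants): a contiguous run of set bits,
   replicated across the register with a power-of-two period and then
   rotated, e.g. 0x00ff00ff00ff00ff or 0x3ffc000000000000.  Zero and
   all-ones are not encodable, which is why the count is 5334 rather
   than a round power of two.  */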
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
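/* The enumeration above pairs each condition with its inverse in
   adjacent even/odd slots, so flipping bit 0 inverts a condition, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */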
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
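/* For example, a 16-byte TImode value occupies two X registers but only
   one 128-bit V register, while a 32-byte OImode vector structure
   occupies two V registers.  */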
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
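/* For example, in DImode a MULT_IMM of 4 with an EXTRACT_IMM of 34
   selects the low 34 bits of (reg * 4), which is the same value as
   sign- or zero-extending the low 32 bits of REG and shifting the
   result left by 2, i.e. a UXTW #2 / SXTW #2 style operand.  */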
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We allow LO_SUMs in our legitimate addresses so that combine can
740 take care of combining addresses where necessary, but for generation
741 purposes we generate the address as:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. in the memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. in the memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
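/* Return an rtx for REG + OFFSET in MODE.  If OFFSET is not a valid
   ADD/SUB immediate (a 12-bit value, optionally shifted left by 12
   bits), it is first loaded into a register (TEMP, or a fresh pseudo
   when one can be created) and added from there; an offset such as
   0x12345, for instance, cannot be encoded directly.  */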
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
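/* As a worked example of the simple_sequence fallback above, the
   constant 0x123456789abcdef0 would be built 16 bits at a time:
     mov  dest, #0xdef0
     movk dest, #0x9abc, lsl #16
     movk dest, #0x5678, lsl #32
     movk dest, #0x1234, lsl #48
   Constants whose 16-bit quarters repeat 0x0000 or 0xffff are caught
   by the cheaper special cases earlier in the function.  */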
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert(can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always passed by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating-point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types are always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers are returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C6 - C9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers. */
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
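/* For example, a 16-byte-aligned __int128 argument that would
   otherwise start in an odd-numbered register (say x1) skips one
   register and is passed in the x2/x3 pair instead.  */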
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (and possibly the only)
1976 element of a block move between registers and memory. Viewing
1977 the block as it sits in memory, padding upward means that the
1978 last element is padded after its most significant byte, while
1979 with downward padding the last element is padded on its least
1980 significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
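/* In outline (see the calculations at the end of this function):
   hard_fp_offset is the rounded total of the va_args save area, the
   local frame and the callee-saved register area, and frame_size adds
   the outgoing argument area on top of that, again rounded up to the
   stack boundary.  */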
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
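/* A minimal standalone sketch of the rounding used above: the callee-save
   area, hard_fp_offset and frame_size are all rounded up to
   STACK_BOUNDARY / BITS_PER_UNIT, which is 16 bytes on AArch64.  The helper
   name and plain C types are illustrative only, not part of the backend.  */
static unsigned long long
example_round_up_to_stack_boundary (unsigned long long bytes)
{
  const unsigned long long align = 16;  /* STACK_BOUNDARY / BITS_PER_UNIT.  */
  /* E.g. 40 -> 48, 48 -> 48; padding0 is the difference between the
     rounded and unrounded sizes.  */
  return (bytes + align - 1) & ~(align - 1);
}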
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have a range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* When offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
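/* A standalone sketch (hypothetical helper, plain C types) of how the
   prologue and epilogue above split a stack adjustment below 0x1000000
   into at most two 12-bit add/sub immediates, one of them shifted by 12.  */
static void
example_split_sp_adjustment (unsigned long long frame_size,
                             unsigned long long *hi_ofs,
                             unsigned long long *lo_ofs)
{
  *hi_ofs = frame_size & 0xfff000;  /* Handled by one SUB/ADD #imm, LSL #12.  */
  *lo_ofs = frame_size & 0x000fff;  /* Handled by a second plain SUB/ADD #imm.  */
}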
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee saved stack is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass will use
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have a range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we distinguish two cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the fewest
2817 instructions, preferring MOVZ instructions when both counts are
2818 the same. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
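/* A standalone sketch (hypothetical name, plain C types) of the
   instruction-count logic above: one MOVZ or MOVN plus one MOVK for every
   upper 16-bit chunk that differs from the chosen background pattern.  */
static int
example_count_mov_insns (unsigned long long val)
{
  int nonzero = 0, not_all_ones = 0, i;

  for (i = 16; i < 64; i += 16)
    {
      unsigned int chunk = (unsigned int) (val >> i) & 0xffff;
      if (chunk != 0)
        nonzero++;        /* Needs a MOVK after an initial MOVZ.  */
      if (chunk != 0xffff)
        not_all_ones++;   /* Needs a MOVK after an initial MOVN.  */
    }

  /* The low 16 bits are set by the initial MOVZ/MOVN itself.  */
  return 1 + (not_all_ones < nonzero ? not_all_ones : nonzero);
}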
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
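/* A standalone sketch (hypothetical name, plain C types) of the split
   performed above for deltas below 4096 * 4096: one add/sub of
   (|delta| / 4096) shifted left by 12, plus one add/sub of the remaining
   low 12 bits, both carrying the sign of DELTA.  */
static void
example_split_add_delta (long long delta, long long *scaled, long long *low)
{
  long long mag = delta < 0 ? -delta : delta;
  long long sign = delta < 0 ? -1 : 1;

  *scaled = sign * (mag / 4096) * 4096;  /* ADD/SUB #(mag/4096), LSL #12.  */
  *low = sign * (mag % 4096);            /* ADD/SUB #(mag%4096).  */
}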
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm, where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* Set S consecutive bits to 1 (S < 64). */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* Rotate right by R. */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* Replicate the constant depending on the SIMD element size. */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
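/* A standalone sketch (hypothetical name, plain C types) of the replication
   step above: the intentional switch fall-through simply copies an E-bit
   element across the full 64-bit immediate.  */
static unsigned long long
example_replicate_element (unsigned long long elt, unsigned int e)
{
  unsigned long long imm = elt;
  unsigned int width;

  for (width = e; width < 64; width *= 2)
    imm |= imm << width;
  return imm;
}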
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
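/* Equivalently (a standalone sketch with a hypothetical name): VAL is
   encodable when all of its set bits fall within bits [11:0] or within
   bits [23:12].  */
static int
example_uimm12_shift_p (unsigned long long val)
{
  return (val & ~0xfffULL) == 0          /* ADD/SUB #imm12.           */
         || (val & ~0xfff000ULL) == 0;   /* ADD/SUB #imm12, LSL #12.  */
}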
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
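/* A standalone sketch (hypothetical name, plain C types) of the 64-bit
   MOVZ/MOVN part of the test above: VAL is loadable by a single MOVZ when
   at most one 16-bit chunk is non-zero, and by a single MOVN when ~VAL has
   that property.  */
static int
example_movz_or_movn_loadable_p (unsigned long long val)
{
  int i;

  for (i = 0; i < 64; i += 16)
    {
      unsigned long long mask = 0xffffULL << i;
      if ((val & ~mask) == 0 || (~val & ~mask) == 0)
        return 1;
    }
  return 0;
}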
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 it to aarch64_expand_mov_immediate to handle properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
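/* A standalone sketch (hypothetical name) of the final check above: a
   register index may be unscaled, or scaled by exactly the access size, so
   an 8-byte access accepts [base, Xm] and [base, Xm, LSL #3] but not,
   say, LSL #2.  */
static int
example_index_shift_ok_p (int shift, int access_size)
{
  return shift == 0
         || (shift > 0 && shift <= 3 && (1 << shift) == access_size);
}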
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
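/* A standalone sketch (hypothetical name) combining the two single-access
   forms above for an 8-byte (DImode) load or store, as
   aarch64_classify_address does below: either a 9-bit signed unscaled
   offset (LDUR/STUR) or a 12-bit unsigned scaled offset (LDR/STR).  */
static int
example_dimode_single_offset_ok_p (long long offset)
{
  return (offset >= -256 && offset < 256)
         || (offset >= 0 && offset < 4096 * 8 && offset % 8 == 0);
}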
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit an ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* Load literal: PC-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is the immediate constant 0.0. */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* Floating-point comparisons return CCFPmode, except for the ordered
3694 comparisons LT, LE, GT and GE, which return CCFPEmode. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
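/* Return the number of set bits in VALUE; each loop iteration below clears
   the lowest set bit (Kernighan's method).  */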
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;
3960 return count;
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first value is for the AND case and the
3970 second for the IOR case. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
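/* A standalone sketch (hypothetical name) of how the AARCH64_CC_* bits
   above compose into the 4-bit NZCV immediate of a CCMP instruction:
   N is bit 3, Z bit 2, C bit 1 and V bit 0.  */
static unsigned int
example_nzcv_immediate (int n, int z, int c, int v)
{
  return (n ? 8u : 0) | (z ? 4u : 0) | (c ? 2u : 0) | (v ? 1u : 0);
}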
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand. If it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return 1;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return 1;
4565 return 0;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
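/* Illustrative examples (hypothetical offsets, assuming an SImode access
   of size 4): for x + 0x12344 (aligned) the code picks
   base_offset = 0x12344 & ~0xfff = 0x12000, leaving 0x344 as the
   immediate offset of the access itself; for x + 0x12345 (misaligned)
   it picks base_offset = (0x12345 + 0x100) & ~0x1ff = 0x12400, leaving
   an offset of -0xbb, within the -256...255 unscaled range.  */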
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256...255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with 12 bit offset field. */
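/* Rough illustration (hypothetical SImode access, register names made
   up): for base + 0x4004 this splits into high = 0x4000, a valid
   "uimm12 << 12" add immediate, and low = 4, so reload emits something
   like "add tmp, base, 0x4000" and the access itself becomes
   "ldr w0, [tmp, 4]" rather than building the whole constant first.  */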
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4691 BLKmode alignment. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words, we fill those in below. */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations, which
4942 use SP as the source and an FP_REG as the destination, so reject
4943 them outright. */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below. */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
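/* Illustrative values: shift = 2 with mask = 0x3fc (0xff << 2)
   corresponds to a UXTB #2 operand and returns 8, while shift = 1 with
   mask = 0x1fffe (0xffff << 1) corresponds to UXTH #1 and returns 16.  */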
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
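/* For instance (hypothetical RTL), both (ashift:DI (reg:DI) (const_int 3))
   and (mult:DI (reg:DI) (const_int 8)) strip down to the inner register,
   since a multiply by a power of two is just a left shift here.  */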
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
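/* Illustrative forms (hypothetical RTL) that all strip down to the inner
   register: (zero_extend:DI (reg:SI)),
   (ashift:DI (sign_extend:DI (reg:SI)) (const_int 2)), and the
   AND-of-MULT representation of a shifted zero-extend such as
   (and:DI (mult:DI (reg:DI) (const_int 4)) (const_int 0x3fc)).  */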
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Return true iff CODE is a shift supported in combination
5162 with arithmetic instructions. */
5164 static bool
5165 aarch64_shift_p (enum rtx_code code)
5167 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5170 /* Helper function for rtx cost calculation. Calculate the cost of
5171 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5172 Return the calculated cost of the expression, recursing manually into
5173 operands where needed. */
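/* For example (hypothetical DImode RTL with OUTER == PLUS):
   (mult:DI (reg:DI) (reg:DI)) is costed as an MADD, i.e. the operand
   costs plus extra_cost->mult[mode == DImode].add, whereas
   (ashift:DI (reg:DI) (const_int 2)) is costed as an ADD with a
   shift-by-immediate operand, i.e. extra_cost->alu.arith_shift.  */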
5175 static int
5176 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5178 rtx op0, op1;
5179 const struct cpu_cost_table *extra_cost
5180 = aarch64_tune_params->insn_extra_cost;
5181 int cost = 0;
5182 bool compound_p = (outer == PLUS || outer == MINUS);
5183 machine_mode mode = GET_MODE (x);
5185 gcc_checking_assert (code == MULT);
5187 op0 = XEXP (x, 0);
5188 op1 = XEXP (x, 1);
5190 if (VECTOR_MODE_P (mode))
5191 mode = GET_MODE_INNER (mode);
5193 /* Integer multiply/fma. */
5194 if (GET_MODE_CLASS (mode) == MODE_INT)
5196 /* The multiply will be canonicalized as a shift, cost it as such. */
5197 if (aarch64_shift_p (GET_CODE (x))
5198 || (CONST_INT_P (op1)
5199 && exact_log2 (INTVAL (op1)) > 0))
5201 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5202 || GET_CODE (op0) == SIGN_EXTEND;
5203 if (speed)
5205 if (compound_p)
5207 if (REG_P (op1))
5208 /* ARITH + shift-by-register. */
5209 cost += extra_cost->alu.arith_shift_reg;
5210 else if (is_extend)
5211 /* ARITH + extended register. We don't have a cost field
5212 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5213 cost += extra_cost->alu.extend_arith;
5214 else
5215 /* ARITH + shift-by-immediate. */
5216 cost += extra_cost->alu.arith_shift;
5218 else
5219 /* LSL (immediate). */
5220 cost += extra_cost->alu.shift;
5223 /* Strip extends as we will have costed them in the case above. */
5224 if (is_extend)
5225 op0 = aarch64_strip_extend (op0);
5227 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5229 return cost;
5232 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5233 compound and let the below cases handle it. After all, MNEG is a
5234 special-case alias of MSUB. */
5235 if (GET_CODE (op0) == NEG)
5237 op0 = XEXP (op0, 0);
5238 compound_p = true;
5241 /* Integer multiplies or FMAs have zero/sign extending variants. */
5242 if ((GET_CODE (op0) == ZERO_EXTEND
5243 && GET_CODE (op1) == ZERO_EXTEND)
5244 || (GET_CODE (op0) == SIGN_EXTEND
5245 && GET_CODE (op1) == SIGN_EXTEND))
5247 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5248 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5250 if (speed)
5252 if (compound_p)
5253 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5254 cost += extra_cost->mult[0].extend_add;
5255 else
5256 /* MUL/SMULL/UMULL. */
5257 cost += extra_cost->mult[0].extend;
5260 return cost;
5263 /* This is either an integer multiply or a MADD. In both cases
5264 we want to recurse and cost the operands. */
5265 cost += rtx_cost (op0, MULT, 0, speed)
5266 + rtx_cost (op1, MULT, 1, speed);
5268 if (speed)
5270 if (compound_p)
5271 /* MADD/MSUB. */
5272 cost += extra_cost->mult[mode == DImode].add;
5273 else
5274 /* MUL. */
5275 cost += extra_cost->mult[mode == DImode].simple;
5278 return cost;
5280 else
5282 if (speed)
5284 /* Floating-point FMA/FMUL can also support negations of the
5285 operands. */
5286 if (GET_CODE (op0) == NEG)
5287 op0 = XEXP (op0, 0);
5288 if (GET_CODE (op1) == NEG)
5289 op1 = XEXP (op1, 0);
5291 if (compound_p)
5292 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5293 cost += extra_cost->fp[mode == DFmode].fma;
5294 else
5295 /* FMUL/FNMUL. */
5296 cost += extra_cost->fp[mode == DFmode].mult;
5299 cost += rtx_cost (op0, MULT, 0, speed)
5300 + rtx_cost (op1, MULT, 1, speed);
5301 return cost;
5305 static int
5306 aarch64_address_cost (rtx x,
5307 machine_mode mode,
5308 addr_space_t as ATTRIBUTE_UNUSED,
5309 bool speed)
5311 enum rtx_code c = GET_CODE (x);
5312 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5313 struct aarch64_address_info info;
5314 int cost = 0;
5315 info.shift = 0;
5317 if (!aarch64_classify_address (&info, x, mode, c, false))
5319 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5321 /* This is a CONST or SYMBOL ref which will be split
5322 in a different way depending on the code model in use.
5323 Cost it through the generic infrastructure. */
5324 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5325 /* Divide through by the cost of one instruction to
5326 bring it to the same units as the address costs. */
5327 cost_symbol_ref /= COSTS_N_INSNS (1);
5328 /* The cost is then the cost of preparing the address,
5329 followed by an immediate (possibly 0) offset. */
5330 return cost_symbol_ref + addr_cost->imm_offset;
5332 else
5334 /* This is most likely a jump table from a case
5335 statement. */
5336 return addr_cost->register_offset;
5340 switch (info.type)
5342 case ADDRESS_LO_SUM:
5343 case ADDRESS_SYMBOLIC:
5344 case ADDRESS_REG_IMM:
5345 cost += addr_cost->imm_offset;
5346 break;
5348 case ADDRESS_REG_WB:
5349 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5350 cost += addr_cost->pre_modify;
5351 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5352 cost += addr_cost->post_modify;
5353 else
5354 gcc_unreachable ();
5356 break;
5358 case ADDRESS_REG_REG:
5359 cost += addr_cost->register_offset;
5360 break;
5362 case ADDRESS_REG_UXTW:
5363 case ADDRESS_REG_SXTW:
5364 cost += addr_cost->register_extend;
5365 break;
5367 default:
5368 gcc_unreachable ();
5372 if (info.shift > 0)
5374 /* For the sake of calculating the cost of the shifted register
5375 component, we can treat same sized modes in the same way. */
5376 switch (GET_MODE_BITSIZE (mode))
5378 case 16:
5379 cost += addr_cost->addr_scale_costs.hi;
5380 break;
5382 case 32:
5383 cost += addr_cost->addr_scale_costs.si;
5384 break;
5386 case 64:
5387 cost += addr_cost->addr_scale_costs.di;
5388 break;
5390 /* We can't tell, or this is a 128-bit vector. */
5391 default:
5392 cost += addr_cost->addr_scale_costs.ti;
5393 break;
5397 return cost;
5400 /* Return true if the RTX X in mode MODE is a zero or sign extract
5401 usable in an ADD or SUB (extended register) instruction. */
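/* Schematically (hypothetical RTL, width operand left unspecified), this
   accepts extracts of a multiply by a power of two such as
   (sign_extract:DI (mult:DI (reg:DI) (const_int 4)) (...) (const_int 0)),
   which is how an "add x0, x1, w2, sxtw #2" style extended-register
   operand is represented.  */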
5402 static bool
5403 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5405 /* Catch add with a sign extract.
5406 This is add_<optab><mode>_multp2. */
5407 if (GET_CODE (x) == SIGN_EXTRACT
5408 || GET_CODE (x) == ZERO_EXTRACT)
5410 rtx op0 = XEXP (x, 0);
5411 rtx op1 = XEXP (x, 1);
5412 rtx op2 = XEXP (x, 2);
5414 if (GET_CODE (op0) == MULT
5415 && CONST_INT_P (op1)
5416 && op2 == const0_rtx
5417 && CONST_INT_P (XEXP (op0, 1))
5418 && aarch64_is_extend_from_extract (mode,
5419 XEXP (op0, 1),
5420 op1))
5422 return true;
5426 return false;
5429 static bool
5430 aarch64_frint_unspec_p (unsigned int u)
5432 switch (u)
5434 case UNSPEC_FRINTZ:
5435 case UNSPEC_FRINTP:
5436 case UNSPEC_FRINTM:
5437 case UNSPEC_FRINTA:
5438 case UNSPEC_FRINTN:
5439 case UNSPEC_FRINTX:
5440 case UNSPEC_FRINTI:
5441 return true;
5443 default:
5444 return false;
5448 /* Return true iff X is an rtx that will match an extr instruction
5449 i.e. as described in the *extr<mode>5_insn family of patterns.
5450 OP0 and OP1 will be set to the operands of the shifts involved
5451 on success and will be NULL_RTX otherwise. */
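/* For instance (hypothetical SImode RTL):
   (ior:SI (ashift:SI (reg:SI) (const_int 10))
           (lshiftrt:SI (reg:SI) (const_int 22)))
   matches because 10 + 22 == 32, and corresponds to an
   "extr w0, w1, w2, #22" style instruction.  */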
5453 static bool
5454 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5456 rtx op0, op1;
5457 machine_mode mode = GET_MODE (x);
5459 *res_op0 = NULL_RTX;
5460 *res_op1 = NULL_RTX;
5462 if (GET_CODE (x) != IOR)
5463 return false;
5465 op0 = XEXP (x, 0);
5466 op1 = XEXP (x, 1);
5468 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5469 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5471 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5472 if (GET_CODE (op1) == ASHIFT)
5473 std::swap (op0, op1);
5475 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5476 return false;
5478 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5479 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5481 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5482 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5484 *res_op0 = XEXP (op0, 0);
5485 *res_op1 = XEXP (op1, 0);
5486 return true;
5490 return false;
5493 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5494 storing it in *COST. Result is true if the total cost of the operation
5495 has now been calculated. */
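/* Rough illustration (hypothetical RTL): a branch such as
   (if_then_else (ne (reg) (const_int 0)) (label_ref ...) (pc)) is
   treated as a CBNZ and only the compared register is costed, while a
   CC-based (if_then_else (eq (reg:CC) (const_int 0)) (reg) (reg)) is
   treated as some flavour of CSEL and both value operands are costed.  */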
5496 static bool
5497 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5499 rtx inner;
5500 rtx comparator;
5501 enum rtx_code cmpcode;
5503 if (COMPARISON_P (op0))
5505 inner = XEXP (op0, 0);
5506 comparator = XEXP (op0, 1);
5507 cmpcode = GET_CODE (op0);
5509 else
5511 inner = op0;
5512 comparator = const0_rtx;
5513 cmpcode = NE;
5516 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5518 /* Conditional branch. */
5519 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5520 return true;
5521 else
5523 if (cmpcode == NE || cmpcode == EQ)
5525 if (comparator == const0_rtx)
5527 /* TBZ/TBNZ/CBZ/CBNZ. */
5528 if (GET_CODE (inner) == ZERO_EXTRACT)
5529 /* TBZ/TBNZ. */
5530 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5531 0, speed);
5532 else
5533 /* CBZ/CBNZ. */
5534 *cost += rtx_cost (inner, cmpcode, 0, speed);
5536 return true;
5539 else if (cmpcode == LT || cmpcode == GE)
5541 /* TBZ/TBNZ. */
5542 if (comparator == const0_rtx)
5543 return true;
5547 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5549 /* It's a conditional operation based on the status flags,
5550 so it must be some flavor of CSEL. */
5552 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5553 if (GET_CODE (op1) == NEG
5554 || GET_CODE (op1) == NOT
5555 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5556 op1 = XEXP (op1, 0);
5558 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5559 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5560 return true;
5563 /* We don't know what this is, cost all operands. */
5564 return false;
5567 /* Calculate the cost of calculating X, storing it in *COST. Result
5568 is true if the total cost of the operation has now been calculated. */
5569 static bool
5570 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5571 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5573 rtx op0, op1, op2;
5574 const struct cpu_cost_table *extra_cost
5575 = aarch64_tune_params->insn_extra_cost;
5576 machine_mode mode = GET_MODE (x);
5578 /* By default, assume that everything has equivalent cost to the
5579 cheapest instruction. Any additional costs are applied as a delta
5580 above this default. */
5581 *cost = COSTS_N_INSNS (1);
5583 /* TODO: The cost infrastructure currently does not handle
5584 vector operations. Assume that all vector operations
5585 are equally expensive. */
5586 if (VECTOR_MODE_P (mode))
5588 if (speed)
5589 *cost += extra_cost->vect.alu;
5590 return true;
5593 switch (code)
5595 case SET:
5596 /* The cost depends entirely on the operands to SET. */
5597 *cost = 0;
5598 op0 = SET_DEST (x);
5599 op1 = SET_SRC (x);
5601 switch (GET_CODE (op0))
5603 case MEM:
5604 if (speed)
5606 rtx address = XEXP (op0, 0);
5607 if (GET_MODE_CLASS (mode) == MODE_INT)
5608 *cost += extra_cost->ldst.store;
5609 else if (mode == SFmode)
5610 *cost += extra_cost->ldst.storef;
5611 else if (mode == DFmode)
5612 *cost += extra_cost->ldst.stored;
5614 *cost +=
5615 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5616 0, speed));
5619 *cost += rtx_cost (op1, SET, 1, speed);
5620 return true;
5622 case SUBREG:
5623 if (! REG_P (SUBREG_REG (op0)))
5624 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5626 /* Fall through. */
5627 case REG:
5628 /* const0_rtx is in general free, but we will use an
5629 instruction to set a register to 0. */
5630 if (REG_P (op1) || op1 == const0_rtx)
5632 /* The cost is 1 per register copied. */
5633 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5634 / UNITS_PER_WORD;
5635 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5637 else
5638 /* Cost is just the cost of the RHS of the set. */
5639 *cost += rtx_cost (op1, SET, 1, speed);
5640 return true;
5642 case ZERO_EXTRACT:
5643 case SIGN_EXTRACT:
5644 /* Bit-field insertion. Strip any redundant widening of
5645 the RHS to meet the width of the target. */
5646 if (GET_CODE (op1) == SUBREG)
5647 op1 = SUBREG_REG (op1);
5648 if ((GET_CODE (op1) == ZERO_EXTEND
5649 || GET_CODE (op1) == SIGN_EXTEND)
5650 && CONST_INT_P (XEXP (op0, 1))
5651 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5652 >= INTVAL (XEXP (op0, 1))))
5653 op1 = XEXP (op1, 0);
5655 if (CONST_INT_P (op1))
5657 /* MOV immediate is assumed to always be cheap. */
5658 *cost = COSTS_N_INSNS (1);
5660 else
5662 /* BFM. */
5663 if (speed)
5664 *cost += extra_cost->alu.bfi;
5665 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5668 return true;
5670 default:
5671 /* We can't make sense of this, assume default cost. */
5672 *cost = COSTS_N_INSNS (1);
5673 return false;
5675 return false;
5677 case CONST_INT:
5678 /* If an instruction can incorporate a constant within the
5679 instruction, the instruction's expression avoids calling
5680 rtx_cost() on the constant. If rtx_cost() is called on a
5681 constant, then it is usually because the constant must be
5682 moved into a register by one or more instructions.
5684 The exception is constant 0, which can be expressed
5685 as XZR/WZR and is therefore free. The exception to this is
5686 if we have (set (reg) (const0_rtx)) in which case we must cost
5687 the move. However, we can catch that when we cost the SET, so
5688 we don't need to consider that here. */
5689 if (x == const0_rtx)
5690 *cost = 0;
5691 else
5693 /* To an approximation, the cost of building any other constant is
5694 proportional to the number of instructions required to build
5695 that constant. This is true whether we are compiling for
5696 SPEED or otherwise. */
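/* For instance (hypothetical value): a constant such as 0x123456789 has
   three non-zero 16-bit chunks and so typically takes a MOV plus two
   MOVKs, giving a cost of roughly COSTS_N_INSNS (3).  */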
5697 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5698 (NULL_RTX, x, false, mode));
5700 return true;
5702 case CONST_DOUBLE:
5703 if (speed)
5705 /* mov[df,sf]_aarch64. */
5706 if (aarch64_float_const_representable_p (x))
5707 /* FMOV (scalar immediate). */
5708 *cost += extra_cost->fp[mode == DFmode].fpconst;
5709 else if (!aarch64_float_const_zero_rtx_p (x))
5711 /* This will be a load from memory. */
5712 if (mode == DFmode)
5713 *cost += extra_cost->ldst.loadd;
5714 else
5715 *cost += extra_cost->ldst.loadf;
5717 else
5718 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5719 or MOV v0.s[0], wzr - neither of which are modeled by the
5720 cost tables. Just use the default cost. */
5725 return true;
5727 case MEM:
5728 if (speed)
5730 /* For loads we want the base cost of a load, plus an
5731 approximation for the additional cost of the addressing
5732 mode. */
5733 rtx address = XEXP (x, 0);
5734 if (GET_MODE_CLASS (mode) == MODE_INT)
5735 *cost += extra_cost->ldst.load;
5736 else if (mode == SFmode)
5737 *cost += extra_cost->ldst.loadf;
5738 else if (mode == DFmode)
5739 *cost += extra_cost->ldst.loadd;
5741 *cost +=
5742 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5743 0, speed));
5746 return true;
5748 case NEG:
5749 op0 = XEXP (x, 0);
5751 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5753 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5754 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5756 /* CSETM. */
5757 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5758 return true;
5761 /* Cost this as SUB wzr, X. */
5762 op0 = CONST0_RTX (GET_MODE (x));
5763 op1 = XEXP (x, 0);
5764 goto cost_minus;
5767 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5769 /* Support (neg(fma...)) as a single instruction only if
5770 sign of zeros is unimportant. This matches the decision
5771 making in aarch64.md. */
5772 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5774 /* FNMADD. */
5775 *cost = rtx_cost (op0, NEG, 0, speed);
5776 return true;
5778 if (speed)
5779 /* FNEG. */
5780 *cost += extra_cost->fp[mode == DFmode].neg;
5781 return false;
5784 return false;
5786 case CLRSB:
5787 case CLZ:
5788 if (speed)
5789 *cost += extra_cost->alu.clz;
5791 return false;
5793 case COMPARE:
5794 op0 = XEXP (x, 0);
5795 op1 = XEXP (x, 1);
5797 if (op1 == const0_rtx
5798 && GET_CODE (op0) == AND)
5800 x = op0;
5801 goto cost_logic;
5804 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5806 /* TODO: A write to the CC flags possibly costs extra, this
5807 needs encoding in the cost tables. */
5809 /* CC_ZESWPmode supports zero extend for free. */
5810 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5811 op0 = XEXP (op0, 0);
5813 /* ANDS. */
5814 if (GET_CODE (op0) == AND)
5816 x = op0;
5817 goto cost_logic;
5820 if (GET_CODE (op0) == PLUS)
5822 /* ADDS (and CMN alias). */
5823 x = op0;
5824 goto cost_plus;
5827 if (GET_CODE (op0) == MINUS)
5829 /* SUBS. */
5830 x = op0;
5831 goto cost_minus;
5834 if (GET_CODE (op1) == NEG)
5836 /* CMN. */
5837 if (speed)
5838 *cost += extra_cost->alu.arith;
5840 *cost += rtx_cost (op0, COMPARE, 0, speed);
5841 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5842 return true;
5845 /* CMP.
5847 Compare can freely swap the order of operands, and
5848 canonicalization puts the more complex operation first.
5849 But the integer MINUS logic expects the shift/extend
5850 operation in op1. */
5851 if (! (REG_P (op0)
5852 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5854 op0 = XEXP (x, 1);
5855 op1 = XEXP (x, 0);
5857 goto cost_minus;
5860 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5862 /* FCMP. */
5863 if (speed)
5864 *cost += extra_cost->fp[mode == DFmode].compare;
5866 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5868 *cost += rtx_cost (op0, COMPARE, 0, speed);
5869 /* FCMP supports constant 0.0 for no extra cost. */
5870 return true;
5872 return false;
5875 return false;
5877 case MINUS:
5879 op0 = XEXP (x, 0);
5880 op1 = XEXP (x, 1);
5882 cost_minus:
5883 *cost += rtx_cost (op0, MINUS, 0, speed);
5885 /* Detect valid immediates. */
5886 if ((GET_MODE_CLASS (mode) == MODE_INT
5887 || (GET_MODE_CLASS (mode) == MODE_CC
5888 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5889 && CONST_INT_P (op1)
5890 && aarch64_uimm12_shift (INTVAL (op1)))
5892 if (speed)
5893 /* SUB(S) (immediate). */
5894 *cost += extra_cost->alu.arith;
5895 return true;
5898 /* Look for SUB (extended register). */
5899 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5901 if (speed)
5902 *cost += extra_cost->alu.extend_arith;
5904 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5905 (enum rtx_code) GET_CODE (op1),
5906 0, speed);
5907 return true;
5910 rtx new_op1 = aarch64_strip_extend (op1);
5912 /* Cost this as an FMA-alike operation. */
5913 if ((GET_CODE (new_op1) == MULT
5914 || aarch64_shift_p (GET_CODE (new_op1)))
5915 && code != COMPARE)
5917 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5918 (enum rtx_code) code,
5919 speed);
5920 return true;
5923 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5925 if (speed)
5927 if (GET_MODE_CLASS (mode) == MODE_INT)
5928 /* SUB(S). */
5929 *cost += extra_cost->alu.arith;
5930 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5931 /* FSUB. */
5932 *cost += extra_cost->fp[mode == DFmode].addsub;
5934 return true;
5937 case PLUS:
5939 rtx new_op0;
5941 op0 = XEXP (x, 0);
5942 op1 = XEXP (x, 1);
5944 cost_plus:
5945 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5946 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5948 /* CSINC. */
5949 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5950 *cost += rtx_cost (op1, PLUS, 1, speed);
5951 return true;
5954 if (GET_MODE_CLASS (mode) == MODE_INT
5955 && CONST_INT_P (op1)
5956 && aarch64_uimm12_shift (INTVAL (op1)))
5958 *cost += rtx_cost (op0, PLUS, 0, speed);
5960 if (speed)
5961 /* ADD (immediate). */
5962 *cost += extra_cost->alu.arith;
5963 return true;
5966 *cost += rtx_cost (op1, PLUS, 1, speed);
5968 /* Look for ADD (extended register). */
5969 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5971 if (speed)
5972 *cost += extra_cost->alu.extend_arith;
5974 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5975 (enum rtx_code) GET_CODE (op0),
5976 0, speed);
5977 return true;
5980 /* Strip any extend, leave shifts behind as we will
5981 cost them through mult_cost. */
5982 new_op0 = aarch64_strip_extend (op0);
5984 if (GET_CODE (new_op0) == MULT
5985 || aarch64_shift_p (GET_CODE (new_op0)))
5987 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5988 speed);
5989 return true;
5992 *cost += rtx_cost (new_op0, PLUS, 0, speed);
5994 if (speed)
5996 if (GET_MODE_CLASS (mode) == MODE_INT)
5997 /* ADD. */
5998 *cost += extra_cost->alu.arith;
5999 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6000 /* FADD. */
6001 *cost += extra_cost->fp[mode == DFmode].addsub;
6003 return true;
6006 case BSWAP:
6007 *cost = COSTS_N_INSNS (1);
6009 if (speed)
6010 *cost += extra_cost->alu.rev;
6012 return false;
6014 case IOR:
6015 if (aarch_rev16_p (x))
6017 *cost = COSTS_N_INSNS (1);
6019 if (speed)
6020 *cost += extra_cost->alu.rev;
6022 return true;
6025 if (aarch64_extr_rtx_p (x, &op0, &op1))
6027 *cost += rtx_cost (op0, IOR, 0, speed)
6028 + rtx_cost (op1, IOR, 1, speed);
6029 if (speed)
6030 *cost += extra_cost->alu.shift;
6032 return true;
6034 /* Fall through. */
6035 case XOR:
6036 case AND:
6037 cost_logic:
6038 op0 = XEXP (x, 0);
6039 op1 = XEXP (x, 1);
6041 if (code == AND
6042 && GET_CODE (op0) == MULT
6043 && CONST_INT_P (XEXP (op0, 1))
6044 && CONST_INT_P (op1)
6045 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6046 INTVAL (op1)) != 0)
6048 /* This is a UBFM/SBFM. */
6049 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6050 if (speed)
6051 *cost += extra_cost->alu.bfx;
6052 return true;
6055 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6057 /* We possibly get the immediate for free, this is not
6058 modelled. */
6059 if (CONST_INT_P (op1)
6060 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6062 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6064 if (speed)
6065 *cost += extra_cost->alu.logical;
6067 return true;
6069 else
6071 rtx new_op0 = op0;
6073 /* Handle ORN, EON, or BIC. */
6074 if (GET_CODE (op0) == NOT)
6075 op0 = XEXP (op0, 0);
6077 new_op0 = aarch64_strip_shift (op0);
6079 /* If we had a shift on op0 then this is a logical-shift-
6080 by-register/immediate operation. Otherwise, this is just
6081 a logical operation. */
6082 if (speed)
6084 if (new_op0 != op0)
6086 /* Shift by immediate. */
6087 if (CONST_INT_P (XEXP (op0, 1)))
6088 *cost += extra_cost->alu.log_shift;
6089 else
6090 *cost += extra_cost->alu.log_shift_reg;
6092 else
6093 *cost += extra_cost->alu.logical;
6096 /* In both cases we want to cost both operands. */
6097 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6098 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6100 return true;
6103 return false;
6105 case NOT:
6106 x = XEXP (x, 0);
6107 op0 = aarch64_strip_shift (x);
6109 /* MVN-shifted-reg. */
6110 if (op0 != x)
6112 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6114 if (speed)
6115 *cost += extra_cost->alu.log_shift;
6117 return true;
6119 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6120 Handle the second form here taking care that 'a' in the above can
6121 be a shift. */
6122 else if (GET_CODE (op0) == XOR)
6124 rtx newop0 = XEXP (op0, 0);
6125 rtx newop1 = XEXP (op0, 1);
6126 rtx op0_stripped = aarch64_strip_shift (newop0);
6128 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6129 + rtx_cost (op0_stripped, XOR, 0, speed);
6131 if (speed)
6133 if (op0_stripped != newop0)
6134 *cost += extra_cost->alu.log_shift;
6135 else
6136 *cost += extra_cost->alu.logical;
6139 return true;
6141 /* MVN. */
6142 if (speed)
6143 *cost += extra_cost->alu.logical;
6145 return false;
6147 case ZERO_EXTEND:
6149 op0 = XEXP (x, 0);
6150 /* If a value is written in SI mode, then zero extended to DI
6151 mode, the operation will in general be free as a write to
6152 a 'w' register implicitly zeroes the upper bits of an 'x'
6153 register. However, if this is
6155 (set (reg) (zero_extend (reg)))
6157 we must cost the explicit register move. */
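/* e.g. (hypothetical RTL) (set (reg:DI) (zero_extend:DI (reg:SI))) still
   needs one register move, whose write to the 'w' register clears the
   upper half, so alu.extend is charged; a zero extend of a load, by
   contrast, is costed as just the load.  */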
6158 if (mode == DImode
6159 && GET_MODE (op0) == SImode
6160 && outer == SET)
6162 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6164 if (!op_cost && speed)
6165 /* MOV. */
6166 *cost += extra_cost->alu.extend;
6167 else
6168 /* Free, the cost is that of the SI mode operation. */
6169 *cost = op_cost;
6171 return true;
6173 else if (MEM_P (XEXP (x, 0)))
6175 /* All loads can zero extend to any size for free. */
6176 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6177 return true;
6180 /* UXTB/UXTH. */
6181 if (speed)
6182 *cost += extra_cost->alu.extend;
6184 return false;
6186 case SIGN_EXTEND:
6187 if (MEM_P (XEXP (x, 0)))
6189 /* LDRSH. */
6190 if (speed)
6192 rtx address = XEXP (XEXP (x, 0), 0);
6193 *cost += extra_cost->ldst.load_sign_extend;
6195 *cost +=
6196 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6197 0, speed));
6199 return true;
6202 if (speed)
6203 *cost += extra_cost->alu.extend;
6204 return false;
6206 case ASHIFT:
6207 op0 = XEXP (x, 0);
6208 op1 = XEXP (x, 1);
6210 if (CONST_INT_P (op1))
6212 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
6213 aliases. */
6214 if (speed)
6215 *cost += extra_cost->alu.shift;
6217 /* We can incorporate zero/sign extend for free. */
6218 if (GET_CODE (op0) == ZERO_EXTEND
6219 || GET_CODE (op0) == SIGN_EXTEND)
6220 op0 = XEXP (op0, 0);
6222 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6223 return true;
6225 else
6227 /* LSLV. */
6228 if (speed)
6229 *cost += extra_cost->alu.shift_reg;
6231 return false; /* All arguments need to be in registers. */
6234 case ROTATE:
6235 case ROTATERT:
6236 case LSHIFTRT:
6237 case ASHIFTRT:
6238 op0 = XEXP (x, 0);
6239 op1 = XEXP (x, 1);
6241 if (CONST_INT_P (op1))
6243 /* ASR (immediate) and friends. */
6244 if (speed)
6245 *cost += extra_cost->alu.shift;
6247 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6248 return true;
6250 else
6253 /* ASR (register) and friends. */
6254 if (speed)
6255 *cost += extra_cost->alu.shift_reg;
6257 return false; /* All arguments need to be in registers. */
6260 case SYMBOL_REF:
6262 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6264 /* LDR. */
6265 if (speed)
6266 *cost += extra_cost->ldst.load;
6268 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6269 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6271 /* ADRP, followed by ADD. */
6272 *cost += COSTS_N_INSNS (1);
6273 if (speed)
6274 *cost += 2 * extra_cost->alu.arith;
6276 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6277 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6279 /* ADR. */
6280 if (speed)
6281 *cost += extra_cost->alu.arith;
6284 if (flag_pic)
6286 /* One extra load instruction, after accessing the GOT. */
6287 *cost += COSTS_N_INSNS (1);
6288 if (speed)
6289 *cost += extra_cost->ldst.load;
6291 return true;
6293 case HIGH:
6294 case LO_SUM:
6295 /* ADRP/ADD (immediate). */
6296 if (speed)
6297 *cost += extra_cost->alu.arith;
6298 return true;
6300 case ZERO_EXTRACT:
6301 case SIGN_EXTRACT:
6302 /* UBFX/SBFX. */
6303 if (speed)
6304 *cost += extra_cost->alu.bfx;
6306 /* We can trust that the immediates used will be correct (there
6307 are no by-register forms), so we need only cost op0. */
6308 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6309 return true;
6311 case MULT:
6312 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6313 /* aarch64_rtx_mult_cost always handles recursion to its
6314 operands. */
6315 return true;
6317 case MOD:
6318 case UMOD:
6319 if (speed)
6321 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6322 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6323 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6324 else if (GET_MODE (x) == DFmode)
6325 *cost += (extra_cost->fp[1].mult
6326 + extra_cost->fp[1].div);
6327 else if (GET_MODE (x) == SFmode)
6328 *cost += (extra_cost->fp[0].mult
6329 + extra_cost->fp[0].div);
6331 return false; /* All arguments need to be in registers. */
6333 case DIV:
6334 case UDIV:
6335 case SQRT:
6336 if (speed)
6338 if (GET_MODE_CLASS (mode) == MODE_INT)
6339 /* There is no integer SQRT, so only DIV and UDIV can get
6340 here. */
6341 *cost += extra_cost->mult[mode == DImode].idiv;
6342 else
6343 *cost += extra_cost->fp[mode == DFmode].div;
6345 return false; /* All arguments need to be in registers. */
6347 case IF_THEN_ELSE:
6348 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6349 XEXP (x, 2), cost, speed);
6351 case EQ:
6352 case NE:
6353 case GT:
6354 case GTU:
6355 case LT:
6356 case LTU:
6357 case GE:
6358 case GEU:
6359 case LE:
6360 case LEU:
6362 return false; /* All arguments must be in registers. */
6364 case FMA:
6365 op0 = XEXP (x, 0);
6366 op1 = XEXP (x, 1);
6367 op2 = XEXP (x, 2);
6369 if (speed)
6370 *cost += extra_cost->fp[mode == DFmode].fma;
6372 /* FMSUB, FNMADD, and FNMSUB are free. */
6373 if (GET_CODE (op0) == NEG)
6374 op0 = XEXP (op0, 0);
6376 if (GET_CODE (op2) == NEG)
6377 op2 = XEXP (op2, 0);
6379 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6380 and the by-element operand as operand 0. */
6381 if (GET_CODE (op1) == NEG)
6382 op1 = XEXP (op1, 0);
6384 /* Catch vector-by-element operations. The by-element operand can
6385 either be (vec_duplicate (vec_select (x))) or just
6386 (vec_select (x)), depending on whether we are multiplying by
6387 a vector or a scalar.
6389 Canonicalization is not very good in these cases, FMA4 will put the
6390 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6391 if (GET_CODE (op0) == VEC_DUPLICATE)
6392 op0 = XEXP (op0, 0);
6393 else if (GET_CODE (op1) == VEC_DUPLICATE)
6394 op1 = XEXP (op1, 0);
6396 if (GET_CODE (op0) == VEC_SELECT)
6397 op0 = XEXP (op0, 0);
6398 else if (GET_CODE (op1) == VEC_SELECT)
6399 op1 = XEXP (op1, 0);
6401 /* If the remaining parameters are not registers,
6402 get the cost to put them into registers. */
6403 *cost += rtx_cost (op0, FMA, 0, speed);
6404 *cost += rtx_cost (op1, FMA, 1, speed);
6405 *cost += rtx_cost (op2, FMA, 2, speed);
6406 return true;
6408 case FLOAT_EXTEND:
6409 if (speed)
6410 *cost += extra_cost->fp[mode == DFmode].widen;
6411 return false;
6413 case FLOAT_TRUNCATE:
6414 if (speed)
6415 *cost += extra_cost->fp[mode == DFmode].narrow;
6416 return false;
6418 case FIX:
6419 case UNSIGNED_FIX:
6420 x = XEXP (x, 0);
6421 /* Strip the rounding part. They will all be implemented
6422 by the fcvt* family of instructions anyway. */
6423 if (GET_CODE (x) == UNSPEC)
6425 unsigned int uns_code = XINT (x, 1);
6427 if (uns_code == UNSPEC_FRINTA
6428 || uns_code == UNSPEC_FRINTM
6429 || uns_code == UNSPEC_FRINTN
6430 || uns_code == UNSPEC_FRINTP
6431 || uns_code == UNSPEC_FRINTZ)
6432 x = XVECEXP (x, 0, 0);
6435 if (speed)
6436 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6438 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6439 return true;
6441 case ABS:
6442 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6444 op0 = XEXP (x, 0);
6446 /* FABD, which is analogous to FADD. */
6447 if (GET_CODE (op0) == MINUS)
6449 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed);
6450 *cost += rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6451 if (speed)
6452 *cost += extra_cost->fp[mode == DFmode].addsub;
6454 return true;
6456 /* Simple FABS is analogous to FNEG. */
6457 if (speed)
6458 *cost += extra_cost->fp[mode == DFmode].neg;
6460 else
6462 /* Integer ABS will either be split to
6463 two arithmetic instructions, or will be an ABS
6464 (scalar), which we don't model. */
6465 *cost = COSTS_N_INSNS (2);
6466 if (speed)
6467 *cost += 2 * extra_cost->alu.arith;
6469 return false;
6471 case SMAX:
6472 case SMIN:
6473 if (speed)
6475 /* FMAXNM/FMINNM/FMAX/FMIN.
6476 TODO: This may not be accurate for all implementations, but
6477 we do not model this in the cost tables. */
6478 *cost += extra_cost->fp[mode == DFmode].addsub;
6480 return false;
6482 case UNSPEC:
6483 /* The floating point round to integer frint* instructions. */
6484 if (aarch64_frint_unspec_p (XINT (x, 1)))
6486 if (speed)
6487 *cost += extra_cost->fp[mode == DFmode].roundint;
6489 return false;
6492 if (XINT (x, 1) == UNSPEC_RBIT)
6494 if (speed)
6495 *cost += extra_cost->alu.rev;
6497 return false;
6499 break;
6501 case TRUNCATE:
6503 /* Decompose <su>muldi3_highpart. */
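/* That is, we are looking for the overall shape
     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))
   which the condition below checks piecewise.  */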
6504 if (/* (truncate:DI */
6505 mode == DImode
6506 /* (lshiftrt:TI */
6507 && GET_MODE (XEXP (x, 0)) == TImode
6508 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6509 /* (mult:TI */
6510 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6511 /* (ANY_EXTEND:TI (reg:DI))
6512 (ANY_EXTEND:TI (reg:DI))) */
6513 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6514 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6515 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6516 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6517 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6518 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6519 /* (const_int 64) */
6520 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6521 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6523 /* UMULH/SMULH. */
6524 if (speed)
6525 *cost += extra_cost->mult[mode == DImode].extend;
6526 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6527 MULT, 0, speed);
6528 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6529 MULT, 1, speed);
6530 return true;
6533 /* Fall through. */
6534 default:
6535 break;
6538 if (dump_file && (dump_flags & TDF_DETAILS))
6539 fprintf (dump_file,
6540 "\nFailed to cost RTX. Assuming default cost.\n");
6542 return true;
6545 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6546 calculated for X. This cost is stored in *COST. Returns true
6547 if the total cost of X was calculated. */
6548 static bool
6549 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6550 int param, int *cost, bool speed)
6552 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6554 if (dump_file && (dump_flags & TDF_DETAILS))
6556 print_rtl_single (dump_file, x);
6557 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6558 speed ? "Hot" : "Cold",
6559 *cost, result ? "final" : "partial");
6562 return result;
6565 static int
6566 aarch64_register_move_cost (machine_mode mode,
6567 reg_class_t from_i, reg_class_t to_i)
6569 enum reg_class from = (enum reg_class) from_i;
6570 enum reg_class to = (enum reg_class) to_i;
6571 const struct cpu_regmove_cost *regmove_cost
6572 = aarch64_tune_params->regmove_cost;
6574 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6575 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6576 to = GENERAL_REGS;
6578 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6579 from = GENERAL_REGS;
6581 /* Moving between GPR and stack cost is the same as GP2GP. */
6582 if ((from == GENERAL_REGS && to == STACK_REG)
6583 || (to == GENERAL_REGS && from == STACK_REG))
6584 return regmove_cost->GP2GP;
6586 /* To/From the stack register, we move via the gprs. */
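/* For example, moving a 64-bit value between FP_REGS and STACK_REG
   ends up costed as FP2GP + GP2GP.  */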
6587 if (to == STACK_REG || from == STACK_REG)
6588 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6589 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6591 if (GET_MODE_SIZE (mode) == 16)
6593 /* 128-bit operations on general registers require 2 instructions. */
6594 if (from == GENERAL_REGS && to == GENERAL_REGS)
6595 return regmove_cost->GP2GP * 2;
6596 else if (from == GENERAL_REGS)
6597 return regmove_cost->GP2FP * 2;
6598 else if (to == GENERAL_REGS)
6599 return regmove_cost->FP2GP * 2;
6601 /* When AdvSIMD instructions are disabled it is not possible to move
6602 a 128-bit value directly between Q registers. This is handled in
6603 secondary reload. A general register is used as a scratch to move
6604 the upper DI value and the lower DI value is moved directly,
6605 hence the cost is the sum of three moves. */
6606 if (! TARGET_SIMD)
6607 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6609 return regmove_cost->FP2FP;
6612 if (from == GENERAL_REGS && to == GENERAL_REGS)
6613 return regmove_cost->GP2GP;
6614 else if (from == GENERAL_REGS)
6615 return regmove_cost->GP2FP;
6616 else if (to == GENERAL_REGS)
6617 return regmove_cost->FP2GP;
6619 return regmove_cost->FP2FP;
6622 static int
6623 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6624 reg_class_t rclass ATTRIBUTE_UNUSED,
6625 bool in ATTRIBUTE_UNUSED)
6627 return aarch64_tune_params->memmov_cost;
6630 /* Return the number of instructions that can be issued per cycle. */
6631 static int
6632 aarch64_sched_issue_rate (void)
6634 return aarch64_tune_params->issue_rate;
6637 static int
6638 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6640 int issue_rate = aarch64_sched_issue_rate ();
6642 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6645 /* Vectorizer cost model target hooks. */
6647 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6648 static int
6649 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6650 tree vectype,
6651 int misalign ATTRIBUTE_UNUSED)
6653 unsigned elements;
6655 switch (type_of_cost)
6657 case scalar_stmt:
6658 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6660 case scalar_load:
6661 return aarch64_tune_params->vec_costs->scalar_load_cost;
6663 case scalar_store:
6664 return aarch64_tune_params->vec_costs->scalar_store_cost;
6666 case vector_stmt:
6667 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6669 case vector_load:
6670 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6672 case vector_store:
6673 return aarch64_tune_params->vec_costs->vec_store_cost;
6675 case vec_to_scalar:
6676 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6678 case scalar_to_vec:
6679 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6681 case unaligned_load:
6682 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6684 case unaligned_store:
6685 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6687 case cond_branch_taken:
6688 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6690 case cond_branch_not_taken:
6691 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6693 case vec_perm:
6694 case vec_promote_demote:
6695 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6697 case vec_construct:
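/* Roughly one operation per pair of elements, plus one; e.g. building
   a four-element vector is costed as 4/2 + 1 = 3.  */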
6698 elements = TYPE_VECTOR_SUBPARTS (vectype);
6699 return elements / 2 + 1;
6701 default:
6702 gcc_unreachable ();
6706 /* Implement targetm.vectorize.add_stmt_cost. */
6707 static unsigned
6708 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6709 struct _stmt_vec_info *stmt_info, int misalign,
6710 enum vect_cost_model_location where)
6712 unsigned *cost = (unsigned *) data;
6713 unsigned retval = 0;
6715 if (flag_vect_cost_model)
6717 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6718 int stmt_cost =
6719 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6721 /* Statements in an inner loop relative to the loop being
6722 vectorized are weighted more heavily. The value here is
6723 a function (linear for now) of the loop nest level. */
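/* For example, a statement whose containing loop sits at depth 2 in
   the loop tree contributes twice its base cost.  */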
6724 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6726 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6727 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6728 unsigned nest_level = loop_depth (loop);
6730 count *= nest_level;
6733 retval = (unsigned) (count * stmt_cost);
6734 cost[where] += retval;
6737 return retval;
6740 static void initialize_aarch64_code_model (void);
6742 /* Parse the architecture extension string. */
6744 static void
6745 aarch64_parse_extension (char *str)
6747 /* The extension string is parsed left to right. */
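/* Each extension is introduced by '+'; a leading "no" on the name
   removes the feature instead of adding it, i.e. "+<ext>" versus
   "+no<ext>".  */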
6748 const struct aarch64_option_extension *opt = NULL;
6750 /* Flag to say whether we are adding or removing an extension. */
6751 int adding_ext = -1;
6753 while (str != NULL && *str != 0)
6755 char *ext;
6756 size_t len;
6758 str++;
6759 ext = strchr (str, '+');
6761 if (ext != NULL)
6762 len = ext - str;
6763 else
6764 len = strlen (str);
6766 if (len >= 2 && strncmp (str, "no", 2) == 0)
6768 adding_ext = 0;
6769 len -= 2;
6770 str += 2;
6772 else if (len > 0)
6773 adding_ext = 1;
6775 if (len == 0)
6777 error ("missing feature modifier after %qs", adding_ext ? "+"
6778 : "+no");
6779 return;
6782 /* Scan over the extensions table trying to find an exact match. */
6783 for (opt = all_extensions; opt->name != NULL; opt++)
6785 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6787 /* Add or remove the extension. */
6788 if (adding_ext)
6789 aarch64_isa_flags |= opt->flags_on;
6790 else
6791 aarch64_isa_flags &= ~(opt->flags_off);
6792 break;
6796 if (opt->name == NULL)
6798 /* Extension not found in list. */
6799 error ("unknown feature modifier %qs", str);
6800 return;
6803 str = ext;
6806 return;
6809 /* Parse the ARCH string. */
6811 static void
6812 aarch64_parse_arch (void)
6814 char *ext;
6815 const struct processor *arch;
6816 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6817 size_t len;
6819 strcpy (str, aarch64_arch_string);
6821 ext = strchr (str, '+');
6823 if (ext != NULL)
6824 len = ext - str;
6825 else
6826 len = strlen (str);
6828 if (len == 0)
6830 error ("missing arch name in -march=%qs", str);
6831 return;
6834 /* Loop through the list of supported ARCHs to find a match. */
6835 for (arch = all_architectures; arch->name != NULL; arch++)
6837 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6839 selected_arch = arch;
6840 aarch64_isa_flags = selected_arch->flags;
6842 if (!selected_cpu)
6843 selected_cpu = &all_cores[selected_arch->core];
6845 if (ext != NULL)
6847 /* ARCH string contains at least one extension. */
6848 aarch64_parse_extension (ext);
6851 if (strcmp (selected_arch->arch, selected_cpu->arch))
6853 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6854 selected_cpu->name, selected_arch->name);
6857 return;
6861 /* ARCH name not found in list. */
6862 error ("unknown value %qs for -march", str);
6863 return;
6866 /* Parse the CPU string. */
6868 static void
6869 aarch64_parse_cpu (void)
6871 char *ext;
6872 const struct processor *cpu;
6873 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6874 size_t len;
6876 strcpy (str, aarch64_cpu_string);
6878 ext = strchr (str, '+');
6880 if (ext != NULL)
6881 len = ext - str;
6882 else
6883 len = strlen (str);
6885 if (len == 0)
6887 error ("missing cpu name in -mcpu=%qs", str);
6888 return;
6891 /* Loop through the list of supported CPUs to find a match. */
6892 for (cpu = all_cores; cpu->name != NULL; cpu++)
6894 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6896 selected_cpu = cpu;
6897 aarch64_isa_flags = selected_cpu->flags;
6899 if (ext != NULL)
6901 /* CPU string contains at least one extension. */
6902 aarch64_parse_extension (ext);
6905 return;
6909 /* CPU name not found in list. */
6910 error ("unknown value %qs for -mcpu", str);
6911 return;
6914 /* Parse the TUNE string. */
6916 static void
6917 aarch64_parse_tune (void)
6919 const struct processor *cpu;
6920 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6921 strcpy (str, aarch64_tune_string);
6923 /* Loop through the list of supported CPUs to find a match. */
6924 for (cpu = all_cores; cpu->name != NULL; cpu++)
6926 if (strcmp (cpu->name, str) == 0)
6928 selected_tune = cpu;
6929 return;
6933 /* CPU name not found in list. */
6934 error ("unknown value %qs for -mtune", str);
6935 return;
6939 /* Implement TARGET_OPTION_OVERRIDE. */
6941 static void
6942 aarch64_override_options (void)
6944 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6945 If either of -march or -mtune is given, they override their
6946 respective component of -mcpu.
6948 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6949 with -march: if -mcpu is not present on the command line, -march
6950 must set a sensible default CPU. */
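/* For example, -mcpu=cortex-a57 selects both the Cortex-A57's
   architecture and its tuning, while an explicit -march or -mtune
   overrides the corresponding part.  */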
6951 if (aarch64_cpu_string)
6953 aarch64_parse_cpu ();
6956 if (aarch64_arch_string)
6958 aarch64_parse_arch ();
6961 if (aarch64_tune_string)
6963 aarch64_parse_tune ();
6966 #ifndef HAVE_AS_MABI_OPTION
6967 /* The compiler may have been configured with 2.23.* binutils, which does
6968 not have support for ILP32. */
6969 if (TARGET_ILP32)
6970 error ("Assembler does not support -mabi=ilp32");
6971 #endif
6973 initialize_aarch64_code_model ();
6975 aarch64_build_bitmask_table ();
6977 /* This target defaults to strict volatile bitfields. */
6978 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6979 flag_strict_volatile_bitfields = 1;
6981 /* If the user did not specify a processor, choose the default
6982 one for them. This will be the CPU set during configuration using
6983 --with-cpu, otherwise it is "generic". */
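/* TARGET_CPU_DEFAULT packs the index of the default core into its low
   six bits and the default ISA flags into the bits above, hence the
   mask and shift below.  */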
6984 if (!selected_cpu)
6986 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6987 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6990 gcc_assert (selected_cpu);
6992 if (!selected_tune)
6993 selected_tune = selected_cpu;
6995 aarch64_tune_flags = selected_tune->flags;
6996 aarch64_tune = selected_tune->core;
6997 aarch64_tune_params = selected_tune->tune;
6998 aarch64_architecture_version = selected_cpu->architecture_version;
7000 if (aarch64_fix_a53_err835769 == 2)
7002 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7003 aarch64_fix_a53_err835769 = 1;
7004 #else
7005 aarch64_fix_a53_err835769 = 0;
7006 #endif
7009 /* If not optimizing for size, set the default
7010 alignment to what the target wants. */
7011 if (!optimize_size)
7013 if (align_loops <= 0)
7014 align_loops = aarch64_tune_params->loop_align;
7015 if (align_jumps <= 0)
7016 align_jumps = aarch64_tune_params->jump_align;
7017 if (align_functions <= 0)
7018 align_functions = aarch64_tune_params->function_align;
7021 if (AARCH64_TUNE_FMA_STEERING)
7022 aarch64_register_fma_steering ();
7024 aarch64_override_options_after_change ();
7027 /* Implement targetm.override_options_after_change. */
7029 static void
7030 aarch64_override_options_after_change (void)
7032 if (flag_omit_frame_pointer)
7033 flag_omit_leaf_frame_pointer = false;
7034 else if (flag_omit_leaf_frame_pointer)
7035 flag_omit_frame_pointer = true;
7038 static struct machine_function *
7039 aarch64_init_machine_status (void)
7041 struct machine_function *machine;
7042 machine = ggc_cleared_alloc<machine_function> ();
7043 return machine;
7046 void
7047 aarch64_init_expanders (void)
7049 init_machine_status = aarch64_init_machine_status;
7052 /* A checking mechanism for the implementation of the various code models. */
7053 static void
7054 initialize_aarch64_code_model (void)
7056 if (flag_pic)
7058 switch (aarch64_cmodel_var)
7060 case AARCH64_CMODEL_TINY:
7061 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7062 break;
7063 case AARCH64_CMODEL_SMALL:
7064 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7065 break;
7066 case AARCH64_CMODEL_LARGE:
7067 sorry ("code model %qs with -f%s", "large",
7068 flag_pic > 1 ? "PIC" : "pic");
7069 default:
7070 gcc_unreachable ();
7073 else
7074 aarch64_cmodel = aarch64_cmodel_var;
7077 /* Return true if SYMBOL_REF X binds locally. */
7079 static bool
7080 aarch64_symbol_binds_local_p (const_rtx x)
7082 return (SYMBOL_REF_DECL (x)
7083 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7084 : SYMBOL_REF_LOCAL_P (x));
7087 /* Return true if SYMBOL_REF X is thread local */
7088 static bool
7089 aarch64_tls_symbol_p (rtx x)
7091 if (! TARGET_HAVE_TLS)
7092 return false;
7094 if (GET_CODE (x) != SYMBOL_REF)
7095 return false;
7097 return SYMBOL_REF_TLS_MODEL (x) != 0;
7100 /* Classify a TLS symbol into one of the TLS kinds. */
7101 enum aarch64_symbol_type
7102 aarch64_classify_tls_symbol (rtx x)
7104 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7106 switch (tls_kind)
7108 case TLS_MODEL_GLOBAL_DYNAMIC:
7109 case TLS_MODEL_LOCAL_DYNAMIC:
7110 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7112 case TLS_MODEL_INITIAL_EXEC:
7113 return SYMBOL_SMALL_GOTTPREL;
7115 case TLS_MODEL_LOCAL_EXEC:
7116 return SYMBOL_SMALL_TPREL;
7118 case TLS_MODEL_EMULATED:
7119 case TLS_MODEL_NONE:
7120 return SYMBOL_FORCE_TO_MEM;
7122 default:
7123 gcc_unreachable ();
7127 /* Return the method that should be used to access SYMBOL_REF or
7128 LABEL_REF X in context CONTEXT. */
7130 enum aarch64_symbol_type
7131 aarch64_classify_symbol (rtx x, rtx offset,
7132 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7134 if (GET_CODE (x) == LABEL_REF)
7136 switch (aarch64_cmodel)
7138 case AARCH64_CMODEL_LARGE:
7139 return SYMBOL_FORCE_TO_MEM;
7141 case AARCH64_CMODEL_TINY_PIC:
7142 case AARCH64_CMODEL_TINY:
7143 return SYMBOL_TINY_ABSOLUTE;
7145 case AARCH64_CMODEL_SMALL_PIC:
7146 case AARCH64_CMODEL_SMALL:
7147 return SYMBOL_SMALL_ABSOLUTE;
7149 default:
7150 gcc_unreachable ();
7154 if (GET_CODE (x) == SYMBOL_REF)
7156 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7157 return SYMBOL_FORCE_TO_MEM;
7159 if (aarch64_tls_symbol_p (x))
7160 return aarch64_classify_tls_symbol (x);
7162 switch (aarch64_cmodel)
7164 case AARCH64_CMODEL_TINY:
7165 /* When we retrieve a symbol + offset address, we have to make sure
7166 the offset does not cause overflow of the final address. But
7167 we have no way of knowing the address of symbol at compile time
7168 so we can't accurately say if the distance between the PC and
7169 symbol + offset is outside the addressable range of +/-1M in the
7170 TINY code model. So we rely on images not being greater than
7171 1M, cap the offset at 1M, and anything beyond 1M will have to
7172 be loaded using an alternative mechanism. */
7173 if (SYMBOL_REF_WEAK (x)
7174 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7175 return SYMBOL_FORCE_TO_MEM;
7176 return SYMBOL_TINY_ABSOLUTE;
7178 case AARCH64_CMODEL_SMALL:
7179 /* Same reasoning as the tiny code model, but the offset cap here is
7180 4G. */
7181 if (SYMBOL_REF_WEAK (x)
7182 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7183 HOST_WIDE_INT_C (4294967264)))
7184 return SYMBOL_FORCE_TO_MEM;
7185 return SYMBOL_SMALL_ABSOLUTE;
7187 case AARCH64_CMODEL_TINY_PIC:
7188 if (!aarch64_symbol_binds_local_p (x))
7189 return SYMBOL_TINY_GOT;
7190 return SYMBOL_TINY_ABSOLUTE;
7192 case AARCH64_CMODEL_SMALL_PIC:
7193 if (!aarch64_symbol_binds_local_p (x))
7194 return SYMBOL_SMALL_GOT;
7195 return SYMBOL_SMALL_ABSOLUTE;
7197 default:
7198 gcc_unreachable ();
7202 /* By default push everything into the constant pool. */
7203 return SYMBOL_FORCE_TO_MEM;
7206 bool
7207 aarch64_constant_address_p (rtx x)
7209 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7212 bool
7213 aarch64_legitimate_pic_operand_p (rtx x)
7215 if (GET_CODE (x) == SYMBOL_REF
7216 || (GET_CODE (x) == CONST
7217 && GET_CODE (XEXP (x, 0)) == PLUS
7218 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7219 return false;
7221 return true;
7224 /* Return true if X holds either a quarter-precision or
7225 floating-point +0.0 constant. */
7226 static bool
7227 aarch64_valid_floating_const (machine_mode mode, rtx x)
7229 if (!CONST_DOUBLE_P (x))
7230 return false;
7232 /* TODO: We could handle moving 0.0 to a TFmode register,
7233 but first we would like to refactor the movtf_aarch64
7234 to be more amenable to splitting moves properly and
7235 correctly gating on TARGET_SIMD. For now, reject all
7236 constants which are not destined for SFmode or DFmode registers. */
7237 if (!(mode == SFmode || mode == DFmode))
7238 return false;
7240 if (aarch64_float_const_zero_rtx_p (x))
7241 return true;
7242 return aarch64_float_const_representable_p (x);
7245 static bool
7246 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7248 /* Do not allow vector struct mode constants. We could support
7249 0 and -1 easily, but they need support in aarch64-simd.md. */
7250 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7251 return false;
7253 /* This could probably go away because
7254 we now decompose CONST_INTs according to expand_mov_immediate. */
7255 if ((GET_CODE (x) == CONST_VECTOR
7256 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7257 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7258 return !targetm.cannot_force_const_mem (mode, x);
7260 if (GET_CODE (x) == HIGH
7261 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7262 return true;
7264 return aarch64_constant_address_p (x);
7268 aarch64_load_tp (rtx target)
7270 if (!target
7271 || GET_MODE (target) != Pmode
7272 || !register_operand (target, Pmode))
7273 target = gen_reg_rtx (Pmode);
7275 /* Can return in any reg. */
7276 emit_insn (gen_aarch64_load_tp_hard (target));
7277 return target;
7280 /* On AAPCS systems, this is the "struct __va_list". */
7281 static GTY(()) tree va_list_type;
7283 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7284 Return the type to use as __builtin_va_list.
7286 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7288 struct __va_list
7290 void *__stack;
7291 void *__gr_top;
7292 void *__vr_top;
7293 int __gr_offs;
7294 int __vr_offs;
7295 }; */
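/* __stack points at the next varargs argument passed on the stack;
   __gr_top and __vr_top point at the top of the GP and FP/SIMD
   register save areas; __gr_offs and __vr_offs hold the (negative)
   offsets from those tops to the next saved register argument (see
   aarch64_expand_builtin_va_start below).  */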
7297 static tree
7298 aarch64_build_builtin_va_list (void)
7300 tree va_list_name;
7301 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7303 /* Create the type. */
7304 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7305 /* Give it the required name. */
7306 va_list_name = build_decl (BUILTINS_LOCATION,
7307 TYPE_DECL,
7308 get_identifier ("__va_list"),
7309 va_list_type);
7310 DECL_ARTIFICIAL (va_list_name) = 1;
7311 TYPE_NAME (va_list_type) = va_list_name;
7312 TYPE_STUB_DECL (va_list_type) = va_list_name;
7314 /* Create the fields. */
7315 f_stack = build_decl (BUILTINS_LOCATION,
7316 FIELD_DECL, get_identifier ("__stack"),
7317 ptr_type_node);
7318 f_grtop = build_decl (BUILTINS_LOCATION,
7319 FIELD_DECL, get_identifier ("__gr_top"),
7320 ptr_type_node);
7321 f_vrtop = build_decl (BUILTINS_LOCATION,
7322 FIELD_DECL, get_identifier ("__vr_top"),
7323 ptr_type_node);
7324 f_groff = build_decl (BUILTINS_LOCATION,
7325 FIELD_DECL, get_identifier ("__gr_offs"),
7326 integer_type_node);
7327 f_vroff = build_decl (BUILTINS_LOCATION,
7328 FIELD_DECL, get_identifier ("__vr_offs"),
7329 integer_type_node);
7331 DECL_ARTIFICIAL (f_stack) = 1;
7332 DECL_ARTIFICIAL (f_grtop) = 1;
7333 DECL_ARTIFICIAL (f_vrtop) = 1;
7334 DECL_ARTIFICIAL (f_groff) = 1;
7335 DECL_ARTIFICIAL (f_vroff) = 1;
7337 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7338 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7339 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7340 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7341 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7343 TYPE_FIELDS (va_list_type) = f_stack;
7344 DECL_CHAIN (f_stack) = f_grtop;
7345 DECL_CHAIN (f_grtop) = f_vrtop;
7346 DECL_CHAIN (f_vrtop) = f_groff;
7347 DECL_CHAIN (f_groff) = f_vroff;
7349 /* Compute its layout. */
7350 layout_type (va_list_type);
7352 return va_list_type;
7355 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7356 static void
7357 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7359 const CUMULATIVE_ARGS *cum;
7360 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7361 tree stack, grtop, vrtop, groff, vroff;
7362 tree t;
7363 int gr_save_area_size;
7364 int vr_save_area_size;
7365 int vr_offset;
7367 cum = &crtl->args.info;
7368 gr_save_area_size
7369 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7370 vr_save_area_size
7371 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7373 if (TARGET_GENERAL_REGS_ONLY)
7375 if (cum->aapcs_nvrn > 0)
7376 sorry ("%qs and floating point or vector arguments",
7377 "-mgeneral-regs-only");
7378 vr_save_area_size = 0;
7381 f_stack = TYPE_FIELDS (va_list_type_node);
7382 f_grtop = DECL_CHAIN (f_stack);
7383 f_vrtop = DECL_CHAIN (f_grtop);
7384 f_groff = DECL_CHAIN (f_vrtop);
7385 f_vroff = DECL_CHAIN (f_groff);
7387 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7388 NULL_TREE);
7389 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7390 NULL_TREE);
7391 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7392 NULL_TREE);
7393 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7394 NULL_TREE);
7395 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7396 NULL_TREE);
7398 /* Emit code to initialize STACK, which points to the next varargs stack
7399 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7400 by named arguments. STACK is 8-byte aligned. */
7401 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7402 if (cum->aapcs_stack_size > 0)
7403 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7404 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7405 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7407 /* Emit code to initialize GRTOP, the top of the GR save area.
7408 virtual_incoming_args_rtx should have been 16 byte aligned. */
7409 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7410 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7411 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7413 /* Emit code to initialize VRTOP, the top of the VR save area.
7414 This address is gr_save_area_bytes below GRTOP, rounded
7415 down to the next 16-byte boundary. */
7416 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7417 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7418 STACK_BOUNDARY / BITS_PER_UNIT);
7420 if (vr_offset)
7421 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7422 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7423 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7425 /* Emit code to initialize GROFF, the offset from GRTOP of the
7426 next GPR argument. */
7427 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7428 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7429 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7431 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7432 of the next VR argument. */
7433 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7434 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7435 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7438 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7440 static tree
7441 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7442 gimple_seq *post_p ATTRIBUTE_UNUSED)
7444 tree addr;
7445 bool indirect_p;
7446 bool is_ha; /* is HFA or HVA. */
7447 bool dw_align; /* double-word align. */
7448 machine_mode ag_mode = VOIDmode;
7449 int nregs;
7450 machine_mode mode;
7452 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7453 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7454 HOST_WIDE_INT size, rsize, adjust, align;
7455 tree t, u, cond1, cond2;
7457 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7458 if (indirect_p)
7459 type = build_pointer_type (type);
7461 mode = TYPE_MODE (type);
7463 f_stack = TYPE_FIELDS (va_list_type_node);
7464 f_grtop = DECL_CHAIN (f_stack);
7465 f_vrtop = DECL_CHAIN (f_grtop);
7466 f_groff = DECL_CHAIN (f_vrtop);
7467 f_vroff = DECL_CHAIN (f_groff);
7469 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7470 f_stack, NULL_TREE);
7471 size = int_size_in_bytes (type);
7472 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7474 dw_align = false;
7475 adjust = 0;
7476 if (aarch64_vfp_is_call_or_return_candidate (mode,
7477 type,
7478 &ag_mode,
7479 &nregs,
7480 &is_ha))
7482 /* TYPE passed in fp/simd registers. */
7483 if (TARGET_GENERAL_REGS_ONLY)
7484 sorry ("%qs and floating point or vector arguments",
7485 "-mgeneral-regs-only");
7487 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7488 unshare_expr (valist), f_vrtop, NULL_TREE);
7489 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7490 unshare_expr (valist), f_vroff, NULL_TREE);
7492 rsize = nregs * UNITS_PER_VREG;
7494 if (is_ha)
7496 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7497 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7499 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7500 && size < UNITS_PER_VREG)
7502 adjust = UNITS_PER_VREG - size;
7505 else
7507 /* TYPE passed in general registers. */
7508 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7509 unshare_expr (valist), f_grtop, NULL_TREE);
7510 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7511 unshare_expr (valist), f_groff, NULL_TREE);
7512 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7513 nregs = rsize / UNITS_PER_WORD;
7515 if (align > 8)
7516 dw_align = true;
7518 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7519 && size < UNITS_PER_WORD)
7521 adjust = UNITS_PER_WORD - size;
7525 /* Get a local temporary for the field value. */
7526 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7528 /* Emit code to branch if off >= 0. */
7529 t = build2 (GE_EXPR, boolean_type_node, off,
7530 build_int_cst (TREE_TYPE (off), 0));
7531 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7533 if (dw_align)
7535 /* Emit: offs = (offs + 15) & -16. */
7536 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7537 build_int_cst (TREE_TYPE (off), 15));
7538 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7539 build_int_cst (TREE_TYPE (off), -16));
7540 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7542 else
7543 roundup = NULL;
7545 /* Update ap.__[g|v]r_offs */
7546 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7547 build_int_cst (TREE_TYPE (off), rsize));
7548 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7550 /* String up. */
7551 if (roundup)
7552 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7554 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7555 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7556 build_int_cst (TREE_TYPE (f_off), 0));
7557 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7559 /* String up: make sure the assignment happens before the use. */
7560 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7561 COND_EXPR_ELSE (cond1) = t;
7563 /* Prepare the trees handling the argument that is passed on the stack;
7564 the top level node will store in ON_STACK. */
7565 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7566 if (align > 8)
7568 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7569 t = fold_convert (intDI_type_node, arg);
7570 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7571 build_int_cst (TREE_TYPE (t), 15));
7572 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7573 build_int_cst (TREE_TYPE (t), -16));
7574 t = fold_convert (TREE_TYPE (arg), t);
7575 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7577 else
7578 roundup = NULL;
7579 /* Advance ap.__stack */
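/* The size is rounded up to a multiple of 8 so that __stack stays
   8-byte aligned, as assumed by the va_start code above.  */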
7580 t = fold_convert (intDI_type_node, arg);
7581 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7582 build_int_cst (TREE_TYPE (t), size + 7));
7583 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7584 build_int_cst (TREE_TYPE (t), -8));
7585 t = fold_convert (TREE_TYPE (arg), t);
7586 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7587 /* String up roundup and advance. */
7588 if (roundup)
7589 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7590 /* String up with arg */
7591 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7592 /* Big-endianness related address adjustment. */
7593 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7594 && size < UNITS_PER_WORD)
7596 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7597 size_int (UNITS_PER_WORD - size));
7598 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7601 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7602 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7604 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7605 t = off;
7606 if (adjust)
7607 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7608 build_int_cst (TREE_TYPE (off), adjust));
7610 t = fold_convert (sizetype, t);
7611 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7613 if (is_ha)
7615 /* type ha; // treat as "struct {ftype field[n];}"
7616 ... [computing offs]
7617 for (i = 0; i <nregs; ++i, offs += 16)
7618 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7619 return ha; */
7620 int i;
7621 tree tmp_ha, field_t, field_ptr_t;
7623 /* Declare a local variable. */
7624 tmp_ha = create_tmp_var_raw (type, "ha");
7625 gimple_add_tmp_var (tmp_ha);
7627 /* Establish the base type. */
7628 switch (ag_mode)
7630 case SFmode:
7631 field_t = float_type_node;
7632 field_ptr_t = float_ptr_type_node;
7633 break;
7634 case DFmode:
7635 field_t = double_type_node;
7636 field_ptr_t = double_ptr_type_node;
7637 break;
7638 case TFmode:
7639 field_t = long_double_type_node;
7640 field_ptr_t = long_double_ptr_type_node;
7641 break;
7642 /* The half precision and quad precision are not fully supported yet. Enable
7643 the following code after the support is complete. Need to find the correct
7644 type node for __fp16 *. */
7645 #if 0
7646 case HFmode:
7647 field_t = float_type_node;
7648 field_ptr_t = float_ptr_type_node;
7649 break;
7650 #endif
7651 case V2SImode:
7652 case V4SImode:
7654 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7655 field_t = build_vector_type_for_mode (innertype, ag_mode);
7656 field_ptr_t = build_pointer_type (field_t);
7658 break;
7659 default:
7660 gcc_assert (0);
7663 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7664 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7665 addr = t;
7666 t = fold_convert (field_ptr_t, addr);
7667 t = build2 (MODIFY_EXPR, field_t,
7668 build1 (INDIRECT_REF, field_t, tmp_ha),
7669 build1 (INDIRECT_REF, field_t, t));
7671 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7672 for (i = 1; i < nregs; ++i)
7674 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7675 u = fold_convert (field_ptr_t, addr);
7676 u = build2 (MODIFY_EXPR, field_t,
7677 build2 (MEM_REF, field_t, tmp_ha,
7678 build_int_cst (field_ptr_t,
7679 (i *
7680 int_size_in_bytes (field_t)))),
7681 build1 (INDIRECT_REF, field_t, u));
7682 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7685 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7686 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7689 COND_EXPR_ELSE (cond2) = t;
7690 addr = fold_convert (build_pointer_type (type), cond1);
7691 addr = build_va_arg_indirect_ref (addr);
7693 if (indirect_p)
7694 addr = build_va_arg_indirect_ref (addr);
7696 return addr;
7699 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7701 static void
7702 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7703 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7704 int no_rtl)
7706 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7707 CUMULATIVE_ARGS local_cum;
7708 int gr_saved, vr_saved;
7710 /* The caller has advanced CUM up to, but not beyond, the last named
7711 argument. Advance a local copy of CUM past the last "real" named
7712 argument, to find out how many registers are left over. */
7713 local_cum = *cum;
7714 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
7716 /* Find out how many registers we need to save. */
7717 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7718 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7720 if (TARGET_GENERAL_REGS_ONLY)
7722 if (local_cum.aapcs_nvrn > 0)
7723 sorry ("%qs and floating point or vector arguments",
7724 "-mgeneral-regs-only");
7725 vr_saved = 0;
7728 if (!no_rtl)
7730 if (gr_saved > 0)
7732 rtx ptr, mem;
7734 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7735 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7736 - gr_saved * UNITS_PER_WORD);
7737 mem = gen_frame_mem (BLKmode, ptr);
7738 set_mem_alias_set (mem, get_varargs_alias_set ());
7740 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7741 mem, gr_saved);
7743 if (vr_saved > 0)
7745 /* We can't use move_block_from_reg, because it will use
7746 the wrong mode, storing D regs only. */
7747 machine_mode mode = TImode;
7748 int off, i;
7750 /* Set OFF to the offset from virtual_incoming_args_rtx of
7751 the first vector register. The VR save area lies below
7752 the GR one, and is aligned to 16 bytes. */
7753 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7754 STACK_BOUNDARY / BITS_PER_UNIT);
7755 off -= vr_saved * UNITS_PER_VREG;
7757 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7759 rtx ptr, mem;
7761 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7762 mem = gen_frame_mem (mode, ptr);
7763 set_mem_alias_set (mem, get_varargs_alias_set ());
7764 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7765 off += UNITS_PER_VREG;
7770 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7771 any complication of having crtl->args.pretend_args_size changed. */
7772 cfun->machine->frame.saved_varargs_size
7773 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7774 STACK_BOUNDARY / BITS_PER_UNIT)
7775 + vr_saved * UNITS_PER_VREG);
7778 static void
7779 aarch64_conditional_register_usage (void)
7781 int i;
7782 if (!TARGET_FLOAT)
7784 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7786 fixed_regs[i] = 1;
7787 call_used_regs[i] = 1;
7792 /* Walk down the type tree of TYPE counting consecutive base elements.
7793 If *MODEP is VOIDmode, then set it to the first valid floating point
7794 type. If a non-floating point type is found, or if a floating point
7795 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7796 otherwise return the count in the sub-tree. */
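/* For example, struct { double x, y; } gives a count of 2 with *MODEP
   set to DFmode, whereas mixing element types, as in
   struct { float f; double d; }, returns -1.  */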
7797 static int
7798 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7800 machine_mode mode;
7801 HOST_WIDE_INT size;
7803 switch (TREE_CODE (type))
7805 case REAL_TYPE:
7806 mode = TYPE_MODE (type);
7807 if (mode != DFmode && mode != SFmode && mode != TFmode)
7808 return -1;
7810 if (*modep == VOIDmode)
7811 *modep = mode;
7813 if (*modep == mode)
7814 return 1;
7816 break;
7818 case COMPLEX_TYPE:
7819 mode = TYPE_MODE (TREE_TYPE (type));
7820 if (mode != DFmode && mode != SFmode && mode != TFmode)
7821 return -1;
7823 if (*modep == VOIDmode)
7824 *modep = mode;
7826 if (*modep == mode)
7827 return 2;
7829 break;
7831 case VECTOR_TYPE:
7832 /* Use V2SImode and V4SImode as representatives of all 64-bit
7833 and 128-bit vector types. */
7834 size = int_size_in_bytes (type);
7835 switch (size)
7837 case 8:
7838 mode = V2SImode;
7839 break;
7840 case 16:
7841 mode = V4SImode;
7842 break;
7843 default:
7844 return -1;
7847 if (*modep == VOIDmode)
7848 *modep = mode;
7850 /* Vector modes are considered to be opaque: two vectors are
7851 equivalent for the purposes of being homogeneous aggregates
7852 if they are the same size. */
7853 if (*modep == mode)
7854 return 1;
7856 break;
7858 case ARRAY_TYPE:
7860 int count;
7861 tree index = TYPE_DOMAIN (type);
7863 /* Can't handle incomplete types nor sizes that are not
7864 fixed. */
7865 if (!COMPLETE_TYPE_P (type)
7866 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7867 return -1;
7869 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7870 if (count == -1
7871 || !index
7872 || !TYPE_MAX_VALUE (index)
7873 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7874 || !TYPE_MIN_VALUE (index)
7875 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7876 || count < 0)
7877 return -1;
7879 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7880 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7882 /* There must be no padding. */
7883 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7884 return -1;
7886 return count;
7889 case RECORD_TYPE:
7891 int count = 0;
7892 int sub_count;
7893 tree field;
7895 /* Can't handle incomplete types nor sizes that are not
7896 fixed. */
7897 if (!COMPLETE_TYPE_P (type)
7898 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7899 return -1;
7901 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7903 if (TREE_CODE (field) != FIELD_DECL)
7904 continue;
7906 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7907 if (sub_count < 0)
7908 return -1;
7909 count += sub_count;
7912 /* There must be no padding. */
7913 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7914 return -1;
7916 return count;
7919 case UNION_TYPE:
7920 case QUAL_UNION_TYPE:
7922 /* These aren't very interesting except in a degenerate case. */
7923 int count = 0;
7924 int sub_count;
7925 tree field;
7927 /* Can't handle incomplete types nor sizes that are not
7928 fixed. */
7929 if (!COMPLETE_TYPE_P (type)
7930 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7931 return -1;
7933 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7935 if (TREE_CODE (field) != FIELD_DECL)
7936 continue;
7938 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7939 if (sub_count < 0)
7940 return -1;
7941 count = count > sub_count ? count : sub_count;
7944 /* There must be no padding. */
7945 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7946 return -1;
7948 return count;
7951 default:
7952 break;
7955 return -1;
7958 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7959 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7960 array types. The C99 floating-point complex types are also considered
7961 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7962 types, which are GCC extensions and out of the scope of AAPCS64, are
7963 treated as composite types here as well.
7965 Note that MODE itself is not sufficient in determining whether a type
7966 is such a composite type or not. This is because
7967 stor-layout.c:compute_record_mode may have already changed the MODE
7968 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7969 structure with only one field may have its MODE set to the mode of the
7970 field. Also an integer mode whose size matches the size of the
7971 RECORD_TYPE type may be used to substitute the original mode
7972 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7973 solely relied on. */
7975 static bool
7976 aarch64_composite_type_p (const_tree type,
7977 machine_mode mode)
7979 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7980 return true;
7982 if (mode == BLKmode
7983 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7984 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7985 return true;
7987 return false;
7990 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7991 type as described in AAPCS64 \S 4.1.2.
7993 See the comment above aarch64_composite_type_p for the notes on MODE. */
7995 static bool
7996 aarch64_short_vector_p (const_tree type,
7997 machine_mode mode)
7999 HOST_WIDE_INT size = -1;
8001 if (type && TREE_CODE (type) == VECTOR_TYPE)
8002 size = int_size_in_bytes (type);
8003 else if (!aarch64_composite_type_p (type, mode)
8004 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8005 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
8006 size = GET_MODE_SIZE (mode);
8008 return (size == 8 || size == 16) ? true : false;
8011 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8012 shall be passed or returned in simd/fp register(s) (providing these
8013 parameter passing registers are available).
8015 Upon successful return, *COUNT returns the number of needed registers,
8016 *BASE_MODE returns the mode of the individual register and when IS_HA
8017 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8018 floating-point aggregate or a homogeneous short-vector aggregate. */
8020 static bool
8021 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8022 const_tree type,
8023 machine_mode *base_mode,
8024 int *count,
8025 bool *is_ha)
8027 machine_mode new_mode = VOIDmode;
8028 bool composite_p = aarch64_composite_type_p (type, mode);
8030 if (is_ha != NULL) *is_ha = false;
8032 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8033 || aarch64_short_vector_p (type, mode))
8035 *count = 1;
8036 new_mode = mode;
8038 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8040 if (is_ha != NULL) *is_ha = true;
8041 *count = 2;
8042 new_mode = GET_MODE_INNER (mode);
8044 else if (type && composite_p)
8046 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8048 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8050 if (is_ha != NULL) *is_ha = true;
8051 *count = ag_count;
8053 else
8054 return false;
8056 else
8057 return false;
8059 *base_mode = new_mode;
8060 return true;
8063 /* Implement TARGET_STRUCT_VALUE_RTX. */
8065 static rtx
8066 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8067 int incoming ATTRIBUTE_UNUSED)
8069 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8072 /* Implements target hook vector_mode_supported_p. */
8073 static bool
8074 aarch64_vector_mode_supported_p (machine_mode mode)
8076 if (TARGET_SIMD
8077 && (mode == V4SImode || mode == V8HImode
8078 || mode == V16QImode || mode == V2DImode
8079 || mode == V2SImode || mode == V4HImode
8080 || mode == V8QImode || mode == V2SFmode
8081 || mode == V4SFmode || mode == V2DFmode
8082 || mode == V1DFmode))
8083 return true;
8085 return false;
8088 /* Return appropriate SIMD container
8089 for MODE within a vector of WIDTH bits. */
8090 static machine_mode
8091 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8093 gcc_assert (width == 64 || width == 128);
8094 if (TARGET_SIMD)
8096 if (width == 128)
8097 switch (mode)
8099 case DFmode:
8100 return V2DFmode;
8101 case SFmode:
8102 return V4SFmode;
8103 case SImode:
8104 return V4SImode;
8105 case HImode:
8106 return V8HImode;
8107 case QImode:
8108 return V16QImode;
8109 case DImode:
8110 return V2DImode;
8111 default:
8112 break;
8114 else
8115 switch (mode)
8117 case SFmode:
8118 return V2SFmode;
8119 case SImode:
8120 return V2SImode;
8121 case HImode:
8122 return V4HImode;
8123 case QImode:
8124 return V8QImode;
8125 default:
8126 break;
8129 return word_mode;
8132 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8133 static machine_mode
8134 aarch64_preferred_simd_mode (machine_mode mode)
8136 return aarch64_simd_container_mode (mode, 128);
8139 /* Return the bitmask of possible vector sizes for the vectorizer
8140 to iterate over. */
8141 static unsigned int
8142 aarch64_autovectorize_vector_sizes (void)
8144 return (16 | 8);
8147 /* Implement TARGET_MANGLE_TYPE. */
8149 static const char *
8150 aarch64_mangle_type (const_tree type)
8152 /* The AArch64 ABI documents say that "__va_list" has to be
8153 mangled as if it is in the "std" namespace. */
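/* "St9__va_list" is the Itanium C++ ABI mangling of "std::__va_list".  */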
8154 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8155 return "St9__va_list";
8157 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8158 builtin types. */
8159 if (TYPE_NAME (type) != NULL)
8160 return aarch64_mangle_builtin_type (type);
8162 /* Use the default mangling. */
8163 return NULL;
8167 /* Return true if the rtx_insn contains a MEM RTX somewhere
8168 in it. */
8170 static bool
8171 has_memory_op (rtx_insn *mem_insn)
8173 subrtx_iterator::array_type array;
8174 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8175 if (MEM_P (*iter))
8176 return true;
8178 return false;
8181 /* Find the first rtx_insn before insn that will generate an assembly
8182 instruction. */
8184 static rtx_insn *
8185 aarch64_prev_real_insn (rtx_insn *insn)
8187 if (!insn)
8188 return NULL;
8192 insn = prev_real_insn (insn);
8194 while (insn && recog_memoized (insn) < 0);
8196 return insn;
8199 static bool
8200 is_madd_op (enum attr_type t1)
8202 unsigned int i;
8203 /* A number of these may be AArch32 only. */
8204 enum attr_type mlatypes[] = {
8205 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8206 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8207 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8210 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8212 if (t1 == mlatypes[i])
8213 return true;
8216 return false;
8219 /* Check if there is a register dependency between a load and the insn
8220 for which we hold recog_data. */
8222 static bool
8223 dep_between_memop_and_curr (rtx memop)
8225 rtx load_reg;
8226 int opno;
8228 gcc_assert (GET_CODE (memop) == SET);
8230 if (!REG_P (SET_DEST (memop)))
8231 return false;
8233 load_reg = SET_DEST (memop);
8234 for (opno = 1; opno < recog_data.n_operands; opno++)
8236 rtx operand = recog_data.operand[opno];
8237 if (REG_P (operand)
8238 && reg_overlap_mentioned_p (load_reg, operand))
8239 return true;
8242 return false;
8246 /* When working around the Cortex-A53 erratum 835769,
8247 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8248 instruction and has a preceding memory instruction such that a NOP
8249 should be inserted between them. */
8251 bool
8252 aarch64_madd_needs_nop (rtx_insn* insn)
8254 enum attr_type attr_type;
8255 rtx_insn *prev;
8256 rtx body;
8258 if (!aarch64_fix_a53_err835769)
8259 return false;
8261 if (recog_memoized (insn) < 0)
8262 return false;
8264 attr_type = get_attr_type (insn);
8265 if (!is_madd_op (attr_type))
8266 return false;
8268 prev = aarch64_prev_real_insn (insn);
8269 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8270 Restore recog state to INSN to avoid state corruption. */
8271 extract_constrain_insn_cached (insn);
8273 if (!prev || !has_memory_op (prev))
8274 return false;
8276 body = single_set (prev);
8278 /* If the previous insn is a memory op and there is no dependency between
8279 it and the DImode madd, emit a NOP between them. If body is NULL then we
8280 have a complex memory operation, probably a load/store pair.
8281 Be conservative for now and emit a NOP. */
8282 if (GET_MODE (recog_data.operand[0]) == DImode
8283 && (!body || !dep_between_memop_and_curr (body)))
8284 return true;
8286 return false;
8291 /* Implement FINAL_PRESCAN_INSN. */
8293 void
8294 aarch64_final_prescan_insn (rtx_insn *insn)
8296 if (aarch64_madd_needs_nop (insn))
8297 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8301 /* Return the equivalent letter for size. */
8302 static char
8303 sizetochar (int size)
8305 switch (size)
8307 case 64: return 'd';
8308 case 32: return 's';
8309 case 16: return 'h';
8310 case 8 : return 'b';
8311 default: gcc_unreachable ();
8315 /* Return true iff x is a uniform vector of floating-point
8316 constants, and the constant can be represented in
8317 quarter-precision form. Note that, as aarch64_float_const_representable_p
8318 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8319 static bool
8320 aarch64_vect_float_const_representable_p (rtx x)
8322 int i = 0;
8323 REAL_VALUE_TYPE r0, ri;
8324 rtx x0, xi;
8326 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8327 return false;
8329 x0 = CONST_VECTOR_ELT (x, 0);
8330 if (!CONST_DOUBLE_P (x0))
8331 return false;
8333 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8335 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8337 xi = CONST_VECTOR_ELT (x, i);
8338 if (!CONST_DOUBLE_P (xi))
8339 return false;
8341 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8342 if (!REAL_VALUES_EQUAL (r0, ri))
8343 return false;
8346 return aarch64_float_const_representable_p (x0);
8349 /* Return true for valid and false for invalid. */
8350 bool
8351 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8352 struct simd_immediate_info *info)
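/* The CHECK macro below tries one candidate immediate encoding: TEST is
   evaluated on every STRIDE'th byte of the splatted constant and, if all
   of them match, the element size (ELSIZE), shift amount (SHIFT) and
   whether the inverted (MVN) form is wanted (NEG) are recorded.  */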
8354 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8355 matches = 1; \
8356 for (i = 0; i < idx; i += (STRIDE)) \
8357 if (!(TEST)) \
8358 matches = 0; \
8359 if (matches) \
8361 immtype = (CLASS); \
8362 elsize = (ELSIZE); \
8363 eshift = (SHIFT); \
8364 emvn = (NEG); \
8365 break; \
8368 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8369 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8370 unsigned char bytes[16];
8371 int immtype = -1, matches;
8372 unsigned int invmask = inverse ? 0xff : 0;
8373 int eshift, emvn;
8375 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8377 if (! (aarch64_simd_imm_zero_p (op, mode)
8378 || aarch64_vect_float_const_representable_p (op)))
8379 return false;
8381 if (info)
8383 info->value = CONST_VECTOR_ELT (op, 0);
8384 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8385 info->mvn = false;
8386 info->shift = 0;
8389 return true;
8392 /* Splat vector constant out into a byte vector. */
8393 for (i = 0; i < n_elts; i++)
8395 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8396 it must be laid out in the vector register in reverse order. */
8397 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8398 unsigned HOST_WIDE_INT elpart;
8399 unsigned int part, parts;
8401 if (CONST_INT_P (el))
8403 elpart = INTVAL (el);
8404 parts = 1;
8406 else if (GET_CODE (el) == CONST_DOUBLE)
8408 elpart = CONST_DOUBLE_LOW (el);
8409 parts = 2;
8411 else
8412 gcc_unreachable ();
8414 for (part = 0; part < parts; part++)
8416 unsigned int byte;
8417 for (byte = 0; byte < innersize; byte++)
8419 bytes[idx++] = (elpart & 0xff) ^ invmask;
8420 elpart >>= BITS_PER_UNIT;
8422 if (GET_CODE (el) == CONST_DOUBLE)
8423 elpart = CONST_DOUBLE_HIGH (el);
8427 /* Sanity check. */
8428 gcc_assert (idx == GET_MODE_SIZE (mode));
8432 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8433 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8435 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8436 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8438 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8439 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8441 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8442 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8444 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8446 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8448 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8449 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8451 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8452 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8454 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8455 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8457 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8458 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8460 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8462 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8464 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8465 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8467 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8468 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8470 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8471 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8473 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8474 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8476 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8478 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8479 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8481 while (0);
8483 if (immtype == -1)
8484 return false;
8486 if (info)
8488 info->element_width = elsize;
8489 info->mvn = emvn != 0;
8490 info->shift = eshift;
8492 unsigned HOST_WIDE_INT imm = 0;
8494 if (immtype >= 12 && immtype <= 15)
8495 info->msl = true;
8497 /* Un-invert bytes of recognized vector, if necessary. */
8498 if (invmask != 0)
8499 for (i = 0; i < idx; i++)
8500 bytes[i] ^= invmask;
8502 if (immtype == 17)
8504 /* FIXME: Broken on 32-bit H_W_I hosts. */
8505 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8507 for (i = 0; i < 8; i++)
8508 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8509 << (i * BITS_PER_UNIT);
8512 info->value = GEN_INT (imm);
8514 else
8516 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8517 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8519 /* Construct 'abcdefgh' because the assembler cannot handle
8520 generic constants. */
8521 if (info->mvn)
8522 imm = ~imm;
8523 imm = (imm >> info->shift) & 0xff;
8524 info->value = GEN_INT (imm);
8528 return true;
8529 #undef CHECK
8532 /* Check whether immediate shift constants are within range. */
8533 bool
8534 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8536 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8537 if (left)
8538 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8539 else
8540 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8543 /* Return true if X is a uniform vector where all elements
8544 are either the floating-point constant 0.0 or the
8545 integer constant 0. */
8546 bool
8547 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8549 return x == CONST0_RTX (mode);
8552 bool
8553 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8555 HOST_WIDE_INT imm = INTVAL (x);
8556 int i;
8558 for (i = 0; i < 8; i++)
8560 unsigned int byte = imm & 0xff;
8561 if (byte != 0xff && byte != 0)
8562 return false;
8563 imm >>= 8;
8566 return true;
8569 bool
8570 aarch64_mov_operand_p (rtx x,
8571 enum aarch64_symbol_context context,
8572 machine_mode mode)
8574 if (GET_CODE (x) == HIGH
8575 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8576 return true;
8578 if (CONST_INT_P (x))
8579 return true;
8581 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8582 return true;
8584 return aarch64_classify_symbolic_expression (x, context)
8585 == SYMBOL_TINY_ABSOLUTE;
8588 /* Return a const_int vector of VAL. */
8590 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8592 int nunits = GET_MODE_NUNITS (mode);
8593 rtvec v = rtvec_alloc (nunits);
8594 int i;
8596 for (i=0; i < nunits; i++)
8597 RTVEC_ELT (v, i) = GEN_INT (val);
8599 return gen_rtx_CONST_VECTOR (mode, v);
8602 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8604 bool
8605 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8607 machine_mode vmode;
8609 gcc_assert (!VECTOR_MODE_P (mode));
8610 vmode = aarch64_preferred_simd_mode (mode);
8611 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8612 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8615 /* Construct and return a PARALLEL RTX vector with elements numbering the
8616 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8617 the vector - from the perspective of the architecture. This does not
8618 line up with GCC's perspective on lane numbers, so we end up with
8619 different masks depending on our target endian-ness. The diagram
8620 below may help. We must draw the distinction when building masks
8621 which select one half of the vector. An instruction selecting
8622 architectural low-lanes for a big-endian target, must be described using
8623 a mask selecting GCC high-lanes.
8625 Big-Endian Little-Endian
8627 GCC 0 1 2 3 3 2 1 0
8628 | x | x | x | x | | x | x | x | x |
8629 Architecture 3 2 1 0 3 2 1 0
8631 Low Mask: { 2, 3 } { 0, 1 }
8632 High Mask: { 0, 1 } { 2, 3 }
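   For example, assuming V4SImode, HIGH == true gives { 2, 3 } on
   little-endian but { 0, 1 } on big-endian; both select the
   architectural high half, as the diagram shows.  */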
8636 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8638 int nunits = GET_MODE_NUNITS (mode);
8639 rtvec v = rtvec_alloc (nunits / 2);
8640 int high_base = nunits / 2;
8641 int low_base = 0;
8642 int base;
8643 rtx t1;
8644 int i;
8646 if (BYTES_BIG_ENDIAN)
8647 base = high ? low_base : high_base;
8648 else
8649 base = high ? high_base : low_base;
8651 for (i = 0; i < nunits / 2; i++)
8652 RTVEC_ELT (v, i) = GEN_INT (base + i);
8654 t1 = gen_rtx_PARALLEL (mode, v);
8655 return t1;
8658 /* Check OP for validity as a PARALLEL RTX vector with elements
8659 numbering either the high (HIGH == TRUE) or the low lanes,
8660 from the perspective of the architecture. See the diagram above
8661 aarch64_simd_vect_par_cnst_half for more details. */
8663 bool
8664 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8665 bool high)
8667 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8668 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8669 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8670 int i = 0;
8672 if (!VECTOR_MODE_P (mode))
8673 return false;
8675 if (count_op != count_ideal)
8676 return false;
8678 for (i = 0; i < count_ideal; i++)
8680 rtx elt_op = XVECEXP (op, 0, i);
8681 rtx elt_ideal = XVECEXP (ideal, 0, i);
8683 if (!CONST_INT_P (elt_op)
8684 || INTVAL (elt_ideal) != INTVAL (elt_op))
8685 return false;
8687 return true;
8690 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8691 HIGH (exclusive). */
8692 void
8693 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8694 const_tree exp)
8696 HOST_WIDE_INT lane;
8697 gcc_assert (CONST_INT_P (operand));
8698 lane = INTVAL (operand);
8700 if (lane < low || lane >= high)
8702 if (exp)
8703 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8704 else
8705 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8709 /* Return TRUE if OP is a valid vector addressing mode. */
8710 bool
8711 aarch64_simd_mem_operand_p (rtx op)
8713 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8714 || REG_P (XEXP (op, 0)));
8717 /* Emit a register copy from operand to operand, taking care not to
8718 early-clobber source registers in the process.
8720 COUNT is the number of components into which the copy needs to be
8721 decomposed. */
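/* For example, when the destination overlaps the source and starts at a
   higher register number, the component moves are emitted from the
   highest register downwards so that no source register is overwritten
   before it has been read; otherwise they are emitted lowest first.  */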
8722 void
8723 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8724 unsigned int count)
8726 unsigned int i;
8727 int rdest = REGNO (operands[0]);
8728 int rsrc = REGNO (operands[1]);
8730 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8731 || rdest < rsrc)
8732 for (i = 0; i < count; i++)
8733 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8734 gen_rtx_REG (mode, rsrc + i));
8735 else
8736 for (i = 0; i < count; i++)
8737 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8738 gen_rtx_REG (mode, rsrc + count - i - 1));
8741 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8742 one of VSTRUCT modes: OI, CI or XI. */
8744 aarch64_simd_attr_length_move (rtx_insn *insn)
8746 machine_mode mode;
8748 extract_insn_cached (insn);
8750 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8752 mode = GET_MODE (recog_data.operand[0]);
8753 switch (mode)
8755 case OImode:
8756 return 8;
8757 case CImode:
8758 return 12;
8759 case XImode:
8760 return 16;
8761 default:
8762 gcc_unreachable ();
8765 return 4;
8768 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8769 one of VSTRUCT modes: OI, CI, EI, or XI. */
8771 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8773 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8776 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8777 alignment of a vector to 128 bits. */
8778 static HOST_WIDE_INT
8779 aarch64_simd_vector_alignment (const_tree type)
8781 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8782 return MIN (align, 128);
8785 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8786 static bool
8787 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8789 if (is_packed)
8790 return false;
8792 /* We guarantee alignment for vectors up to 128-bits. */
8793 if (tree_int_cst_compare (TYPE_SIZE (type),
8794 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8795 return false;
8797 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8798 return true;
8801 /* If VALS is a vector constant that can be loaded into a register
8802 using DUP, generate instructions to do so and return an RTX to
8803 assign to the register. Otherwise return NULL_RTX. */
8804 static rtx
8805 aarch64_simd_dup_constant (rtx vals)
8807 machine_mode mode = GET_MODE (vals);
8808 machine_mode inner_mode = GET_MODE_INNER (mode);
8809 int n_elts = GET_MODE_NUNITS (mode);
8810 bool all_same = true;
8811 rtx x;
8812 int i;
8814 if (GET_CODE (vals) != CONST_VECTOR)
8815 return NULL_RTX;
8817 for (i = 1; i < n_elts; ++i)
8819 x = CONST_VECTOR_ELT (vals, i);
8820 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8821 all_same = false;
8824 if (!all_same)
8825 return NULL_RTX;
8827 /* We can load this constant by using DUP and a constant in a
8828 single ARM register. This will be cheaper than a vector
8829 load. */
8830 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8831 return gen_rtx_VEC_DUPLICATE (mode, x);
8835 /* Generate code to load VALS, which is a PARALLEL containing only
8836 constants (for vec_init) or CONST_VECTOR, efficiently into a
8837 register. Returns an RTX to copy into the register, or NULL_RTX
8838 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8839 static rtx
8840 aarch64_simd_make_constant (rtx vals)
8842 machine_mode mode = GET_MODE (vals);
8843 rtx const_dup;
8844 rtx const_vec = NULL_RTX;
8845 int n_elts = GET_MODE_NUNITS (mode);
8846 int n_const = 0;
8847 int i;
8849 if (GET_CODE (vals) == CONST_VECTOR)
8850 const_vec = vals;
8851 else if (GET_CODE (vals) == PARALLEL)
8853 /* A CONST_VECTOR must contain only CONST_INTs and
8854 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8855 Only store valid constants in a CONST_VECTOR. */
8856 for (i = 0; i < n_elts; ++i)
8858 rtx x = XVECEXP (vals, 0, i);
8859 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8860 n_const++;
8862 if (n_const == n_elts)
8863 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8865 else
8866 gcc_unreachable ();
8868 if (const_vec != NULL_RTX
8869 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8870 /* Load using MOVI/MVNI. */
8871 return const_vec;
8872 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8873 /* Loaded using DUP. */
8874 return const_dup;
8875 else if (const_vec != NULL_RTX)
8876 /* Load from constant pool. We cannot take advantage of single-cycle
8877 LD1 because we need a PC-relative addressing mode. */
8878 return const_vec;
8879 else
8880 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8881 We cannot construct an initializer. */
8882 return NULL_RTX;
8885 void
8886 aarch64_expand_vector_init (rtx target, rtx vals)
8888 machine_mode mode = GET_MODE (target);
8889 machine_mode inner_mode = GET_MODE_INNER (mode);
8890 int n_elts = GET_MODE_NUNITS (mode);
8891 int n_var = 0;
8892 rtx any_const = NULL_RTX;
8893 bool all_same = true;
8895 for (int i = 0; i < n_elts; ++i)
8897 rtx x = XVECEXP (vals, 0, i);
8898 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8899 ++n_var;
8900 else
8901 any_const = x;
8903 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8904 all_same = false;
8907 if (n_var == 0)
8909 rtx constant = aarch64_simd_make_constant (vals);
8910 if (constant != NULL_RTX)
8912 emit_move_insn (target, constant);
8913 return;
8917 /* Splat a single non-constant element if we can. */
8918 if (all_same)
8920 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8921 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8922 return;
8925 /* Half the fields (or less) are non-constant. Load constant then overwrite
8926 varying fields. Hope that this is more efficient than using the stack. */
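   /* For example, initializing V4SI { x, 1, 2, 3 } first loads the
      all-constant vector { 2, 1, 2, 3 } (each variable lane borrows a
      nearby constant so the constant is more likely to be cheap to
      materialize) and then overwrites lane 0 with x via vec_set.  */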
8927 if (n_var <= n_elts/2)
8929 rtx copy = copy_rtx (vals);
8931 /* Load constant part of vector. We really don't care what goes into the
8932 parts we will overwrite, but we're more likely to be able to load the
8933 constant efficiently if it has fewer, larger, repeating parts
8934 (see aarch64_simd_valid_immediate). */
8935 for (int i = 0; i < n_elts; i++)
8937 rtx x = XVECEXP (vals, 0, i);
8938 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8939 continue;
8940 rtx subst = any_const;
8941 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8943 /* Look in the copied vector, as more elements are const. */
8944 rtx test = XVECEXP (copy, 0, i ^ bit);
8945 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8947 subst = test;
8948 break;
8951 XVECEXP (copy, 0, i) = subst;
8953 aarch64_expand_vector_init (target, copy);
8955 /* Insert variables. */
8956 enum insn_code icode = optab_handler (vec_set_optab, mode);
8957 gcc_assert (icode != CODE_FOR_nothing);
8959 for (int i = 0; i < n_elts; i++)
8961 rtx x = XVECEXP (vals, 0, i);
8962 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8963 continue;
8964 x = copy_to_mode_reg (inner_mode, x);
8965 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8967 return;
8970 /* Construct the vector in memory one field at a time
8971 and load the whole vector. */
8972 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8973 for (int i = 0; i < n_elts; i++)
8974 emit_move_insn (adjust_address_nv (mem, inner_mode,
8975 i * GET_MODE_SIZE (inner_mode)),
8976 XVECEXP (vals, 0, i));
8977 emit_move_insn (target, mem);
8981 static unsigned HOST_WIDE_INT
8982 aarch64_shift_truncation_mask (machine_mode mode)
8984 return
8985 (aarch64_vector_mode_supported_p (mode)
8986 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8989 #ifndef TLS_SECTION_ASM_FLAG
8990 #define TLS_SECTION_ASM_FLAG 'T'
8991 #endif
8993 void
8994 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8995 tree decl ATTRIBUTE_UNUSED)
8997 char flagchars[10], *f = flagchars;
8999 /* If we have already declared this section, we can use an
9000 abbreviated form to switch back to it -- unless this section is
9001 part of a COMDAT group, in which case GAS requires the full
9002 declaration every time. */
9003 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9004 && (flags & SECTION_DECLARED))
9006 fprintf (asm_out_file, "\t.section\t%s\n", name);
9007 return;
9010 if (!(flags & SECTION_DEBUG))
9011 *f++ = 'a';
9012 if (flags & SECTION_WRITE)
9013 *f++ = 'w';
9014 if (flags & SECTION_CODE)
9015 *f++ = 'x';
9016 if (flags & SECTION_SMALL)
9017 *f++ = 's';
9018 if (flags & SECTION_MERGE)
9019 *f++ = 'M';
9020 if (flags & SECTION_STRINGS)
9021 *f++ = 'S';
9022 if (flags & SECTION_TLS)
9023 *f++ = TLS_SECTION_ASM_FLAG;
9024 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9025 *f++ = 'G';
9026 *f = '\0';
9028 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9030 if (!(flags & SECTION_NOTYPE))
9032 const char *type;
9033 const char *format;
9035 if (flags & SECTION_BSS)
9036 type = "nobits";
9037 else
9038 type = "progbits";
9040 #ifdef TYPE_OPERAND_FMT
9041 format = "," TYPE_OPERAND_FMT;
9042 #else
9043 format = ",@%s";
9044 #endif
9046 fprintf (asm_out_file, format, type);
9048 if (flags & SECTION_ENTSIZE)
9049 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9050 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9052 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9053 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9054 else
9055 fprintf (asm_out_file, ",%s,comdat",
9056 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9060 putc ('\n', asm_out_file);
9063 /* Select a format to encode pointers in exception handling data. */
9065 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9067 int type;
9068 switch (aarch64_cmodel)
9070 case AARCH64_CMODEL_TINY:
9071 case AARCH64_CMODEL_TINY_PIC:
9072 case AARCH64_CMODEL_SMALL:
9073 case AARCH64_CMODEL_SMALL_PIC:
9074 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9075 for everything. */
9076 type = DW_EH_PE_sdata4;
9077 break;
9078 default:
9079 /* No assumptions here. 8-byte relocs required. */
9080 type = DW_EH_PE_sdata8;
9081 break;
9083 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9086 /* Emit load exclusive. */
9088 static void
9089 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9090 rtx mem, rtx model_rtx)
9092 rtx (*gen) (rtx, rtx, rtx);
9094 switch (mode)
9096 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9097 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9098 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9099 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9100 default:
9101 gcc_unreachable ();
9104 emit_insn (gen (rval, mem, model_rtx));
9107 /* Emit store exclusive. */
9109 static void
9110 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9111 rtx rval, rtx mem, rtx model_rtx)
9113 rtx (*gen) (rtx, rtx, rtx, rtx);
9115 switch (mode)
9117 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9118 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9119 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9120 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9121 default:
9122 gcc_unreachable ();
9125 emit_insn (gen (bval, rval, mem, model_rtx));
9128 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
9130 static void
9131 aarch64_emit_unlikely_jump (rtx insn)
9133 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9135 insn = emit_jump_insn (insn);
9136 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9139 /* Expand a compare and swap pattern. */
9141 void
9142 aarch64_expand_compare_and_swap (rtx operands[])
9144 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9145 machine_mode mode, cmp_mode;
9146 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9148 bval = operands[0];
9149 rval = operands[1];
9150 mem = operands[2];
9151 oldval = operands[3];
9152 newval = operands[4];
9153 is_weak = operands[5];
9154 mod_s = operands[6];
9155 mod_f = operands[7];
9156 mode = GET_MODE (mem);
9157 cmp_mode = mode;
9159 /* Normally the succ memory model must be stronger than fail, but in the
9160 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9161 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9163 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9164 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9165 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9167 switch (mode)
9169 case QImode:
9170 case HImode:
9171 /* For short modes, we're going to perform the comparison in SImode,
9172 so do the zero-extension now. */
9173 cmp_mode = SImode;
9174 rval = gen_reg_rtx (SImode);
9175 oldval = convert_modes (SImode, mode, oldval, true);
9176 /* Fall through. */
9178 case SImode:
9179 case DImode:
9180 /* Force the value into a register if needed. */
9181 if (!aarch64_plus_operand (oldval, mode))
9182 oldval = force_reg (cmp_mode, oldval);
9183 break;
9185 default:
9186 gcc_unreachable ();
9189 switch (mode)
9191 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9192 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9193 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9194 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9195 default:
9196 gcc_unreachable ();
9199 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9201 if (mode == QImode || mode == HImode)
9202 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9204 x = gen_rtx_REG (CCmode, CC_REGNUM);
9205 x = gen_rtx_EQ (SImode, x, const0_rtx);
9206 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9209 /* Split a compare and swap pattern. */
9211 void
9212 aarch64_split_compare_and_swap (rtx operands[])
9214 rtx rval, mem, oldval, newval, scratch;
9215 machine_mode mode;
9216 bool is_weak;
9217 rtx_code_label *label1, *label2;
9218 rtx x, cond;
9220 rval = operands[0];
9221 mem = operands[1];
9222 oldval = operands[2];
9223 newval = operands[3];
9224 is_weak = (operands[4] != const0_rtx);
9225 scratch = operands[7];
9226 mode = GET_MODE (mem);
9228 label1 = NULL;
9229 if (!is_weak)
9231 label1 = gen_label_rtx ();
9232 emit_label (label1);
9234 label2 = gen_label_rtx ();
9236 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9238 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9239 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9240 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9241 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9242 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9244 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9246 if (!is_weak)
9248 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9249 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9250 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9251 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9253 else
9255 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9256 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9257 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9260 emit_label (label2);
9263 /* Split an atomic operation. */
9265 void
9266 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9267 rtx value, rtx model_rtx, rtx cond)
9269 machine_mode mode = GET_MODE (mem);
9270 machine_mode wmode = (mode == DImode ? DImode : SImode);
9271 rtx_code_label *label;
9272 rtx x;
9274 label = gen_label_rtx ();
9275 emit_label (label);
9277 if (new_out)
9278 new_out = gen_lowpart (wmode, new_out);
9279 if (old_out)
9280 old_out = gen_lowpart (wmode, old_out);
9281 else
9282 old_out = new_out;
9283 value = simplify_gen_subreg (wmode, value, mode, 0);
9285 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9287 switch (code)
9289 case SET:
9290 new_out = value;
9291 break;
9293 case NOT:
9294 x = gen_rtx_AND (wmode, old_out, value);
9295 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9296 x = gen_rtx_NOT (wmode, new_out);
9297 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9298 break;
9300 case MINUS:
9301 if (CONST_INT_P (value))
9303 value = GEN_INT (-INTVAL (value));
9304 code = PLUS;
9306 /* Fall through. */
9308 default:
9309 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9310 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9311 break;
9314 aarch64_emit_store_exclusive (mode, cond, mem,
9315 gen_lowpart (mode, new_out), model_rtx);
9317 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9318 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9319 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9320 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9323 static void
9324 aarch64_print_extension (void)
9326 const struct aarch64_option_extension *opt = NULL;
9328 for (opt = all_extensions; opt->name != NULL; opt++)
9329 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9330 asm_fprintf (asm_out_file, "+%s", opt->name);
9332 asm_fprintf (asm_out_file, "\n");
9335 static void
9336 aarch64_start_file (void)
9338 if (selected_arch)
9340 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9341 aarch64_print_extension ();
9343 else if (selected_cpu)
9345 const char *truncated_name
9346 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9347 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9348 aarch64_print_extension ();
9350 default_file_start();
9353 /* Target hook for c_mode_for_suffix. */
9354 static machine_mode
9355 aarch64_c_mode_for_suffix (char suffix)
9357 if (suffix == 'q')
9358 return TFmode;
9360 return VOIDmode;
9363 /* We can only represent floating point constants which will fit in
9364 "quarter-precision" values. These values are characterised by
9365 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9368 (-1)^s * (n/16) * 2^r
9370 Where:
9371 's' is the sign bit.
9372 'n' is an integer in the range 16 <= n <= 31.
9373 'r' is an integer in the range -3 <= r <= 4. */
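/* For example, 1.0 is (-1)^0 * (16/16) * 2^0 and 31.0 is
   (-1)^0 * (31/16) * 2^4, both representable, while 0.0 and 33.0
   are not.  */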
9375 /* Return true iff X can be represented by a quarter-precision
9376 floating point immediate operand. Note, we cannot represent 0.0. */
9377 bool
9378 aarch64_float_const_representable_p (rtx x)
9380 /* This represents our current view of how many bits
9381 make up the mantissa. */
9382 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9383 int exponent;
9384 unsigned HOST_WIDE_INT mantissa, mask;
9385 REAL_VALUE_TYPE r, m;
9386 bool fail;
9388 if (!CONST_DOUBLE_P (x))
9389 return false;
9391 if (GET_MODE (x) == VOIDmode)
9392 return false;
9394 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9396 /* We cannot represent infinities, NaNs or +/-zero. We won't
9397 know if we have +zero until we analyse the mantissa, but we
9398 can reject the other invalid values. */
9399 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9400 || REAL_VALUE_MINUS_ZERO (r))
9401 return false;
9403 /* Extract exponent. */
9404 r = real_value_abs (&r);
9405 exponent = REAL_EXP (&r);
9407 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9408 highest (sign) bit, with a fixed binary point at bit point_pos.
9409 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9410 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9411 bits for the mantissa, this can fail (low bits will be lost). */
9412 real_ldexp (&m, &r, point_pos - exponent);
9413 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9415 /* If the low part of the mantissa has bits set we cannot represent
9416 the value. */
9417 if (w.elt (0) != 0)
9418 return false;
9419 /* We have rejected the lower HOST_WIDE_INT, so update our
9420 understanding of how many bits lie in the mantissa and
9421 look only at the high HOST_WIDE_INT. */
9422 mantissa = w.elt (1);
9423 point_pos -= HOST_BITS_PER_WIDE_INT;
9425 /* We can only represent values with a mantissa of the form 1.xxxx. */
9426 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9427 if ((mantissa & mask) != 0)
9428 return false;
9430 /* Having filtered unrepresentable values, we may now remove all
9431 but the highest 5 bits. */
9432 mantissa >>= point_pos - 5;
9434 /* We cannot represent the value 0.0, so reject it. This is handled
9435 elsewhere. */
9436 if (mantissa == 0)
9437 return false;
9439 /* Then, as bit 4 is always set, we can mask it off, leaving
9440 the mantissa in the range [0, 15]. */
9441 mantissa &= ~(1 << 4);
9442 gcc_assert (mantissa <= 15);
9444 /* GCC internally does not use IEEE754-like encoding (where normalized
9445 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9446 Our mantissa values are shifted 4 places to the left relative to
9447 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9448 by 5 places to correct for GCC's representation. */
9449 exponent = 5 - exponent;
9451 return (exponent >= 0 && exponent <= 7);
9454 char*
9455 aarch64_output_simd_mov_immediate (rtx const_vector,
9456 machine_mode mode,
9457 unsigned width)
9459 bool is_valid;
9460 static char templ[40];
9461 const char *mnemonic;
9462 const char *shift_op;
9463 unsigned int lane_count = 0;
9464 char element_char;
9466 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9468 /* This will return true to show const_vector is legal for use as
9469 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
9470 also update INFO to show how the immediate should be generated. */
9471 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9472 gcc_assert (is_valid);
9474 element_char = sizetochar (info.element_width);
9475 lane_count = width / info.element_width;
9477 mode = GET_MODE_INNER (mode);
9478 if (mode == SFmode || mode == DFmode)
9480 gcc_assert (info.shift == 0 && ! info.mvn);
9481 if (aarch64_float_const_zero_rtx_p (info.value))
9482 info.value = GEN_INT (0);
9483 else
9485 #define buf_size 20
9486 REAL_VALUE_TYPE r;
9487 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9488 char float_buf[buf_size] = {'\0'};
9489 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9490 #undef buf_size
9492 if (lane_count == 1)
9493 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9494 else
9495 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9496 lane_count, element_char, float_buf);
9497 return templ;
9501 mnemonic = info.mvn ? "mvni" : "movi";
9502 shift_op = info.msl ? "msl" : "lsl";
9504 if (lane_count == 1)
9505 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9506 mnemonic, UINTVAL (info.value));
9507 else if (info.shift)
9508 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9509 ", %s %d", mnemonic, lane_count, element_char,
9510 UINTVAL (info.value), shift_op, info.shift);
9511 else
9512 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9513 mnemonic, lane_count, element_char, UINTVAL (info.value));
9514 return templ;
9517 char*
9518 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9519 machine_mode mode)
9521 machine_mode vmode;
9523 gcc_assert (!VECTOR_MODE_P (mode));
9524 vmode = aarch64_simd_container_mode (mode, 64);
9525 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9526 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9529 /* Split operands into moves from op[1] + op[2] into op[0]. */
9531 void
9532 aarch64_split_combinev16qi (rtx operands[3])
9534 unsigned int dest = REGNO (operands[0]);
9535 unsigned int src1 = REGNO (operands[1]);
9536 unsigned int src2 = REGNO (operands[2]);
9537 machine_mode halfmode = GET_MODE (operands[1]);
9538 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9539 rtx destlo, desthi;
9541 gcc_assert (halfmode == V16QImode);
9543 if (src1 == dest && src2 == dest + halfregs)
9545 /* No-op move. Can't split to nothing; emit something. */
9546 emit_note (NOTE_INSN_DELETED);
9547 return;
9550 /* Preserve register attributes for variable tracking. */
9551 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9552 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9553 GET_MODE_SIZE (halfmode));
9555 /* Special case of reversed high/low parts. */
9556 if (reg_overlap_mentioned_p (operands[2], destlo)
9557 && reg_overlap_mentioned_p (operands[1], desthi))
9559 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9560 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9561 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9563 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9565 /* Try to avoid unnecessary moves if part of the result
9566 is in the right place already. */
9567 if (src1 != dest)
9568 emit_move_insn (destlo, operands[1]);
9569 if (src2 != dest + halfregs)
9570 emit_move_insn (desthi, operands[2]);
9572 else
9574 if (src2 != dest + halfregs)
9575 emit_move_insn (desthi, operands[2]);
9576 if (src1 != dest)
9577 emit_move_insn (destlo, operands[1]);
9581 /* vec_perm support. */
9583 #define MAX_VECT_LEN 16
9585 struct expand_vec_perm_d
9587 rtx target, op0, op1;
9588 unsigned char perm[MAX_VECT_LEN];
9589 machine_mode vmode;
9590 unsigned char nelt;
9591 bool one_vector_p;
9592 bool testing_p;
9595 /* Generate a variable permutation. */
9597 static void
9598 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9600 machine_mode vmode = GET_MODE (target);
9601 bool one_vector_p = rtx_equal_p (op0, op1);
9603 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9604 gcc_checking_assert (GET_MODE (op0) == vmode);
9605 gcc_checking_assert (GET_MODE (op1) == vmode);
9606 gcc_checking_assert (GET_MODE (sel) == vmode);
9607 gcc_checking_assert (TARGET_SIMD);
9609 if (one_vector_p)
9611 if (vmode == V8QImode)
9613 /* Expand the argument to a V16QI mode by duplicating it. */
9614 rtx pair = gen_reg_rtx (V16QImode);
9615 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9616 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9618 else
9620 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9623 else
9625 rtx pair;
9627 if (vmode == V8QImode)
9629 pair = gen_reg_rtx (V16QImode);
9630 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9631 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9633 else
9635 pair = gen_reg_rtx (OImode);
9636 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9637 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9642 void
9643 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9645 machine_mode vmode = GET_MODE (target);
9646 unsigned int nelt = GET_MODE_NUNITS (vmode);
9647 bool one_vector_p = rtx_equal_p (op0, op1);
9648 rtx mask;
9650 /* The TBL instruction does not use a modulo index, so we must take care
9651 of that ourselves. */
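   /* An out-of-range TBL index yields zero rather than wrapping, so AND
      the selector with nelt - 1 (or 2 * nelt - 1 when two distinct input
      vectors are used) to get the modulo behaviour vec_perm requires.  */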
9652 mask = aarch64_simd_gen_const_vector_dup (vmode,
9653 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9654 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9656 /* For big-endian, we also need to reverse the index within the vector
9657 (but not which vector). */
9658 if (BYTES_BIG_ENDIAN)
9660 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9661 if (!one_vector_p)
9662 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9663 sel = expand_simple_binop (vmode, XOR, sel, mask,
9664 NULL, 0, OPTAB_LIB_WIDEN);
9666 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9669 /* Recognize patterns suitable for the TRN instructions. */
9670 static bool
9671 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9673 unsigned int i, odd, mask, nelt = d->nelt;
9674 rtx out, in0, in1, x;
9675 rtx (*gen) (rtx, rtx, rtx);
9676 machine_mode vmode = d->vmode;
9678 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9679 return false;
9681 /* Note that these are little-endian tests.
9682 We correct for big-endian later. */
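   /* For example, on V4SI the selector { 0, 4, 2, 6 } matches TRN1 and
      { 1, 5, 3, 7 } matches TRN2.  */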
9683 if (d->perm[0] == 0)
9684 odd = 0;
9685 else if (d->perm[0] == 1)
9686 odd = 1;
9687 else
9688 return false;
9689 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9691 for (i = 0; i < nelt; i += 2)
9693 if (d->perm[i] != i + odd)
9694 return false;
9695 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9696 return false;
9699 /* Success! */
9700 if (d->testing_p)
9701 return true;
9703 in0 = d->op0;
9704 in1 = d->op1;
9705 if (BYTES_BIG_ENDIAN)
9707 x = in0, in0 = in1, in1 = x;
9708 odd = !odd;
9710 out = d->target;
9712 if (odd)
9714 switch (vmode)
9716 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9717 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9718 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9719 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9720 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9721 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9722 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9723 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9724 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9725 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9726 default:
9727 return false;
9730 else
9732 switch (vmode)
9734 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9735 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9736 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9737 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9738 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9739 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9740 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9741 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9742 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9743 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9744 default:
9745 return false;
9749 emit_insn (gen (out, in0, in1));
9750 return true;
9753 /* Recognize patterns suitable for the UZP instructions. */
9754 static bool
9755 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9757 unsigned int i, odd, mask, nelt = d->nelt;
9758 rtx out, in0, in1, x;
9759 rtx (*gen) (rtx, rtx, rtx);
9760 machine_mode vmode = d->vmode;
9762 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9763 return false;
9765 /* Note that these are little-endian tests.
9766 We correct for big-endian later. */
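   /* For example, on V4SI the selector { 0, 2, 4, 6 } matches UZP1 and
      { 1, 3, 5, 7 } matches UZP2.  */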
9767 if (d->perm[0] == 0)
9768 odd = 0;
9769 else if (d->perm[0] == 1)
9770 odd = 1;
9771 else
9772 return false;
9773 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9775 for (i = 0; i < nelt; i++)
9777 unsigned elt = (i * 2 + odd) & mask;
9778 if (d->perm[i] != elt)
9779 return false;
9782 /* Success! */
9783 if (d->testing_p)
9784 return true;
9786 in0 = d->op0;
9787 in1 = d->op1;
9788 if (BYTES_BIG_ENDIAN)
9790 x = in0, in0 = in1, in1 = x;
9791 odd = !odd;
9793 out = d->target;
9795 if (odd)
9797 switch (vmode)
9799 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9800 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9801 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9802 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9803 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9804 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9805 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9806 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9807 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9808 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9809 default:
9810 return false;
9813 else
9815 switch (vmode)
9817 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9818 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9819 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9820 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9821 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9822 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9823 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9824 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9825 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9826 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9827 default:
9828 return false;
9832 emit_insn (gen (out, in0, in1));
9833 return true;
9836 /* Recognize patterns suitable for the ZIP instructions. */
9837 static bool
9838 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9840 unsigned int i, high, mask, nelt = d->nelt;
9841 rtx out, in0, in1, x;
9842 rtx (*gen) (rtx, rtx, rtx);
9843 machine_mode vmode = d->vmode;
9845 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9846 return false;
9848 /* Note that these are little-endian tests.
9849 We correct for big-endian later. */
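   /* For example, on V4SI the selector { 0, 4, 1, 5 } matches ZIP1 and
      { 2, 6, 3, 7 } matches ZIP2.  */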
9850 high = nelt / 2;
9851 if (d->perm[0] == high)
9852 /* Do Nothing. */
9854 else if (d->perm[0] == 0)
9855 high = 0;
9856 else
9857 return false;
9858 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9860 for (i = 0; i < nelt / 2; i++)
9862 unsigned elt = (i + high) & mask;
9863 if (d->perm[i * 2] != elt)
9864 return false;
9865 elt = (elt + nelt) & mask;
9866 if (d->perm[i * 2 + 1] != elt)
9867 return false;
9870 /* Success! */
9871 if (d->testing_p)
9872 return true;
9874 in0 = d->op0;
9875 in1 = d->op1;
9876 if (BYTES_BIG_ENDIAN)
9878 x = in0, in0 = in1, in1 = x;
9879 high = !high;
9881 out = d->target;
9883 if (high)
9885 switch (vmode)
9887 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9888 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9889 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9890 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9891 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9892 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9893 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9894 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9895 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9896 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9897 default:
9898 return false;
9901 else
9903 switch (vmode)
9905 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9906 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9907 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9908 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9909 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9910 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9911 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9912 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9913 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9914 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9915 default:
9916 return false;
9920 emit_insn (gen (out, in0, in1));
9921 return true;
9924 /* Recognize patterns for the EXT insn. */
9926 static bool
9927 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9929 unsigned int i, nelt = d->nelt;
9930 rtx (*gen) (rtx, rtx, rtx, rtx);
9931 rtx offset;
9933 unsigned int location = d->perm[0]; /* Always < nelt. */
9935 /* Check if the extracted indices are increasing by one. */
9936 for (i = 1; i < nelt; i++)
9938 unsigned int required = location + i;
9939 if (d->one_vector_p)
9941 /* We'll pass the same vector in twice, so allow indices to wrap. */
9942 required &= (nelt - 1);
9944 if (d->perm[i] != required)
9945 return false;
9948 switch (d->vmode)
9950 case V16QImode: gen = gen_aarch64_extv16qi; break;
9951 case V8QImode: gen = gen_aarch64_extv8qi; break;
9952 case V4HImode: gen = gen_aarch64_extv4hi; break;
9953 case V8HImode: gen = gen_aarch64_extv8hi; break;
9954 case V2SImode: gen = gen_aarch64_extv2si; break;
9955 case V4SImode: gen = gen_aarch64_extv4si; break;
9956 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9957 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9958 case V2DImode: gen = gen_aarch64_extv2di; break;
9959 case V2DFmode: gen = gen_aarch64_extv2df; break;
9960 default:
9961 return false;
9964 /* Success! */
9965 if (d->testing_p)
9966 return true;
9968 /* The case where (location == 0) is a no-op for both big- and little-endian,
9969 and is removed by the mid-end at optimization levels -O1 and higher. */
9971 if (BYTES_BIG_ENDIAN && (location != 0))
9973 /* After setup, we want the high elements of the first vector (stored
9974 at the LSB end of the register), and the low elements of the second
9975 vector (stored at the MSB end of the register). So swap. */
9976 std::swap (d->op0, d->op1);
9977 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9978 location = nelt - location;
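      /* E.g. for V4SI with location == 1 on big-endian, the operands are
         swapped and location becomes nelt - 1 == 3.  */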
9981 offset = GEN_INT (location);
9982 emit_insn (gen (d->target, d->op0, d->op1, offset));
9983 return true;
9986 /* Recognize patterns for the REV insns. */
9988 static bool
9989 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9991 unsigned int i, j, diff, nelt = d->nelt;
9992 rtx (*gen) (rtx, rtx);
9994 if (!d->one_vector_p)
9995 return false;
9997 diff = d->perm[0];
9998 switch (diff)
10000 case 7:
10001 switch (d->vmode)
10003 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10004 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10005 default:
10006 return false;
10008 break;
10009 case 3:
10010 switch (d->vmode)
10012 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10013 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10014 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10015 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10016 default:
10017 return false;
10019 break;
10020 case 1:
10021 switch (d->vmode)
10023 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10024 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10025 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10026 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10027 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10028 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10029 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10030 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10031 default:
10032 return false;
10034 break;
10035 default:
10036 return false;
10039 for (i = 0; i < nelt ; i += diff + 1)
10040 for (j = 0; j <= diff; j += 1)
10042 /* This is guaranteed to be true as the value of diff
10043 is 7, 3 or 1 and we should have enough elements in the
10044 queue to generate this. Getting a vector mask with a
10045 value of diff other than these values implies that
10046 something is wrong by the time we get here. */
10047 gcc_assert (i + j < nelt);
10048 if (d->perm[i + j] != i + diff - j)
10049 return false;
10052 /* Success! */
10053 if (d->testing_p)
10054 return true;
10056 emit_insn (gen (d->target, d->op0));
10057 return true;
10060 static bool
10061 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10063 rtx (*gen) (rtx, rtx, rtx);
10064 rtx out = d->target;
10065 rtx in0;
10066 machine_mode vmode = d->vmode;
10067 unsigned int i, elt, nelt = d->nelt;
10068 rtx lane;
10070 elt = d->perm[0];
10071 for (i = 1; i < nelt; i++)
10073 if (elt != d->perm[i])
10074 return false;
10077 /* The generic preparation in aarch64_expand_vec_perm_const_1
10078 swaps the operand order and the permute indices if it finds
10079 d->perm[0] to be in the second operand. Thus, we can always
10080 use d->op0 and need not do any extra arithmetic to get the
10081 correct lane number. */
10082 in0 = d->op0;
10083 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10085 switch (vmode)
10087 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10088 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10089 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10090 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10091 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10092 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10093 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10094 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10095 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10096 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10097 default:
10098 return false;
10101 emit_insn (gen (out, in0, lane));
10102 return true;
10105 static bool
10106 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10108 rtx rperm[MAX_VECT_LEN], sel;
10109 machine_mode vmode = d->vmode;
10110 unsigned int i, nelt = d->nelt;
10112 if (d->testing_p)
10113 return true;
10115 /* Generic code will try constant permutation twice: once with the
10116 original mode and again with the elements lowered to QImode.
10117 So wait and don't do the selector expansion ourselves. */
10118 if (vmode != V8QImode && vmode != V16QImode)
10119 return false;
10121 for (i = 0; i < nelt; ++i)
10123 int nunits = GET_MODE_NUNITS (vmode);
10125 /* If big-endian and two vectors we end up with a weird mixed-endian
10126 mode on NEON. Reverse the index within each word but not the word
10127 itself. */
10128 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10129 : d->perm[i]);
10131 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10132 sel = force_reg (vmode, sel);
10134 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10135 return true;
10138 static bool
10139 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10141 /* The pattern matching functions above are written to look for a small
10142 number to begin the sequence (0, 1, N/2). If we begin with an index
10143 from the second operand, we can swap the operands. */
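   /* For example, the V4SI selector { 4, 5, 6, 7 } uses only the second
      operand; after the XOR below it becomes { 0, 1, 2, 3 } applied to
      the swapped operands.  */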
10144 if (d->perm[0] >= d->nelt)
10146 unsigned i, nelt = d->nelt;
10148 gcc_assert (nelt == (nelt & -nelt));
10149 for (i = 0; i < nelt; ++i)
10150 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10152 std::swap (d->op0, d->op1);
10155 if (TARGET_SIMD)
10157 if (aarch64_evpc_rev (d))
10158 return true;
10159 else if (aarch64_evpc_ext (d))
10160 return true;
10161 else if (aarch64_evpc_dup (d))
10162 return true;
10163 else if (aarch64_evpc_zip (d))
10164 return true;
10165 else if (aarch64_evpc_uzp (d))
10166 return true;
10167 else if (aarch64_evpc_trn (d))
10168 return true;
10169 return aarch64_evpc_tbl (d);
10171 return false;
10174 /* Expand a vec_perm_const pattern. */
10176 bool
10177 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10179 struct expand_vec_perm_d d;
10180 int i, nelt, which;
10182 d.target = target;
10183 d.op0 = op0;
10184 d.op1 = op1;
10186 d.vmode = GET_MODE (target);
10187 gcc_assert (VECTOR_MODE_P (d.vmode));
10188 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10189 d.testing_p = false;
10191 for (i = which = 0; i < nelt; ++i)
10193 rtx e = XVECEXP (sel, 0, i);
10194 int ei = INTVAL (e) & (2 * nelt - 1);
10195 which |= (ei < nelt ? 1 : 2);
10196 d.perm[i] = ei;
10199 switch (which)
10201 default:
10202 gcc_unreachable ();
10204 case 3:
10205 d.one_vector_p = false;
10206 if (!rtx_equal_p (op0, op1))
10207 break;
10209 /* The elements of PERM do not suggest that only the first operand
10210 is used, but both operands are identical. Allow easier matching
10211 of the permutation by folding the permutation into the single
10212 input vector. */
10213 /* Fall Through. */
10214 case 2:
10215 for (i = 0; i < nelt; ++i)
10216 d.perm[i] &= nelt - 1;
10217 d.op0 = op1;
10218 d.one_vector_p = true;
10219 break;
10221 case 1:
10222 d.op1 = op0;
10223 d.one_vector_p = true;
10224 break;
10227 return aarch64_expand_vec_perm_const_1 (&d);
10230 static bool
10231 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10232 const unsigned char *sel)
10234 struct expand_vec_perm_d d;
10235 unsigned int i, nelt, which;
10236 bool ret;
10238 d.vmode = vmode;
10239 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10240 d.testing_p = true;
10241 memcpy (d.perm, sel, nelt);
10243 /* Calculate whether all elements are in one vector. */
10244 for (i = which = 0; i < nelt; ++i)
10246 unsigned char e = d.perm[i];
10247 gcc_assert (e < 2 * nelt);
10248 which |= (e < nelt ? 1 : 2);
10251 /* If all elements are from the second vector, reindex as if from the
10252 first vector. */
10253 if (which == 2)
10254 for (i = 0; i < nelt; ++i)
10255 d.perm[i] -= nelt;
10257 /* Check whether the mask can be applied to a single vector. */
10258 d.one_vector_p = (which != 3);
10260 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10261 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10262 if (!d.one_vector_p)
10263 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10265 start_sequence ();
10266 ret = aarch64_expand_vec_perm_const_1 (&d);
10267 end_sequence ();
10269 return ret;
10273 aarch64_reverse_mask (enum machine_mode mode)
10275 /* We have to reverse each vector because we don't have
10276 a permuted load that can reverse-load according to ABI rules. */
10277 rtx mask;
10278 rtvec v = rtvec_alloc (16);
10279 int i, j;
10280 int nunits = GET_MODE_NUNITS (mode);
10281 int usize = GET_MODE_UNIT_SIZE (mode);
10283 gcc_assert (BYTES_BIG_ENDIAN);
10284 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10286 for (i = 0; i < nunits; i++)
10287 for (j = 0; j < usize; j++)
10288 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10289 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10290 return force_reg (V16QImode, mask);
10293 /* Implement MODES_TIEABLE_P. */
10295 bool
10296 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10298 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10299 return true;
10301 /* We specifically want to allow elements of "structure" modes to
10302 be tieable to the structure. This more general condition allows
10303 other rarer situations too. */
10304 if (TARGET_SIMD
10305 && aarch64_vector_mode_p (mode1)
10306 && aarch64_vector_mode_p (mode2))
10307 return true;
10309 return false;
10312 /* Return a new RTX holding the result of moving POINTER forward by
10313 AMOUNT bytes. */
10315 static rtx
10316 aarch64_move_pointer (rtx pointer, int amount)
10318 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10320 return adjust_automodify_address (pointer, GET_MODE (pointer),
10321 next, amount);
10324 /* Return a new RTX holding the result of moving POINTER forward by the
10325 size of the mode it points to. */
10327 static rtx
10328 aarch64_progress_pointer (rtx pointer)
10330 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10332 return aarch64_move_pointer (pointer, amount);
10335 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10336 MODE bytes. */
10338 static void
10339 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10340 machine_mode mode)
10342 rtx reg = gen_reg_rtx (mode);
10344 /* "Cast" the pointers to the correct mode. */
10345 *src = adjust_address (*src, mode, 0);
10346 *dst = adjust_address (*dst, mode, 0);
10347 /* Emit the memcpy. */
10348 emit_move_insn (reg, *src);
10349 emit_move_insn (*dst, reg);
10350 /* Move the pointers forward. */
10351 *src = aarch64_progress_pointer (*src);
10352 *dst = aarch64_progress_pointer (*dst);
10355 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10356 we succeed, otherwise return false. */
10358 bool
10359 aarch64_expand_movmem (rtx *operands)
10361 unsigned int n;
10362 rtx dst = operands[0];
10363 rtx src = operands[1];
10364 rtx base;
10365 bool speed_p = !optimize_function_for_size_p (cfun);
10367 /* When optimizing for size, give a better estimate of the length of a
10368 memcpy call, but use the default otherwise. */
10369 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10371 /* We can't do anything smart if the amount to copy is not constant. */
10372 if (!CONST_INT_P (operands[2]))
10373 return false;
10375 n = UINTVAL (operands[2]);
10377 /* Try to keep the number of instructions low. For cases below 16 bytes we
10378 need to make at most two moves. For cases above 16 bytes it will be one
10379 move for each 16 byte chunk, then at most two additional moves. */
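   /* For example, a 35-byte copy is costed as 35 / 16 + 2 == 4 moves:
      two TImode chunks plus at most two further moves for the 3-byte
      tail.  */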
10380 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10381 return false;
10383 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10384 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10386 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10387 src = adjust_automodify_address (src, VOIDmode, base, 0);
10389 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10390 1-byte chunk. */
10391 if (n < 4)
10393 if (n >= 2)
10395 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10396 n -= 2;
10399 if (n == 1)
10400 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10402 return true;
10405 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10406 4-byte chunk, partially overlapping with the previously copied chunk. */
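   /* E.g. for n == 7 this copies bytes 0-3 and then bytes 3-6; byte 3 is
      written twice, but only two SImode moves are needed.  */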
10407 if (n < 8)
10409 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10410 n -= 4;
10411 if (n > 0)
10413 int move = n - 4;
10415 src = aarch64_move_pointer (src, move);
10416 dst = aarch64_move_pointer (dst, move);
10417 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10419 return true;
10422 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10423 them, then (if applicable) an 8-byte chunk. */
10424 while (n >= 8)
10426 if (n / 16)
10428 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10429 n -= 16;
10431 else
10433 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10434 n -= 8;
10438 /* Finish the final bytes of the copy. We can always do this in one
10439 instruction. We either copy the exact amount we need, or partially
10440 overlap with the previous chunk we copied and copy 8 bytes. */
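   /* E.g. with 5 bytes remaining we back the pointers up by 3 and emit a
      single DImode move that overlaps the tail of the previous chunk.  */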
10441 if (n == 0)
10442 return true;
10443 else if (n == 1)
10444 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10445 else if (n == 2)
10446 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10447 else if (n == 4)
10448 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10449 else
10451 if (n == 3)
10453 src = aarch64_move_pointer (src, -1);
10454 dst = aarch64_move_pointer (dst, -1);
10455 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10457 else
10459 int move = n - 8;
10461 src = aarch64_move_pointer (src, move);
10462 dst = aarch64_move_pointer (dst, move);
10463 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10467 return true;
10470 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10472 static unsigned HOST_WIDE_INT
10473 aarch64_asan_shadow_offset (void)
10475 return (HOST_WIDE_INT_1 << 36);
10478 static bool
10479 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10480 unsigned int align,
10481 enum by_pieces_operation op,
10482 bool speed_p)
10484 /* STORE_BY_PIECES can be used when copying a constant string, but
10485 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10486 For now we always fail this and let the move_by_pieces code copy
10487 the string from read-only memory. */
10488 if (op == STORE_BY_PIECES)
10489 return false;
10491 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10494 static enum machine_mode
10495 aarch64_code_to_ccmode (enum rtx_code code)
10497 switch (code)
10499 case NE:
10500 return CC_DNEmode;
10502 case EQ:
10503 return CC_DEQmode;
10505 case LE:
10506 return CC_DLEmode;
10508 case LT:
10509 return CC_DLTmode;
10511 case GE:
10512 return CC_DGEmode;
10514 case GT:
10515 return CC_DGTmode;
10517 case LEU:
10518 return CC_DLEUmode;
10520 case LTU:
10521 return CC_DLTUmode;
10523 case GEU:
10524 return CC_DGEUmode;
10526 case GTU:
10527 return CC_DGTUmode;
10529 default:
10530 return CCmode;
10534 static rtx
10535 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10536 int code, tree treeop0, tree treeop1)
10538 enum machine_mode op_mode, cmp_mode, cc_mode;
10539 rtx op0, op1, cmp, target;
10540 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10541 enum insn_code icode;
10542 struct expand_operand ops[4];
10544 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10545 if (cc_mode == CCmode)
10546 return NULL_RTX;
10548 start_sequence ();
10549 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10551 op_mode = GET_MODE (op0);
10552 if (op_mode == VOIDmode)
10553 op_mode = GET_MODE (op1);
10555 switch (op_mode)
10557 case QImode:
10558 case HImode:
10559 case SImode:
10560 cmp_mode = SImode;
10561 icode = CODE_FOR_cmpsi;
10562 break;
10564 case DImode:
10565 cmp_mode = DImode;
10566 icode = CODE_FOR_cmpdi;
10567 break;
10569 default:
10570 end_sequence ();
10571 return NULL_RTX;
10574 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10575 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10576 if (!op0 || !op1)
10578 end_sequence ();
10579 return NULL_RTX;
10581 *prep_seq = get_insns ();
10582 end_sequence ();
10584 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10585 target = gen_rtx_REG (CCmode, CC_REGNUM);
10587 create_output_operand (&ops[0], target, CCmode);
10588 create_fixed_operand (&ops[1], cmp);
10589 create_fixed_operand (&ops[2], op0);
10590 create_fixed_operand (&ops[3], op1);
10592 start_sequence ();
10593 if (!maybe_expand_insn (icode, 4, ops))
10595 end_sequence ();
10596 return NULL_RTX;
10598 *gen_seq = get_insns ();
10599 end_sequence ();
10601 return gen_rtx_REG (cc_mode, CC_REGNUM);
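/* Sketch of the expected result, based on the code above: for an integer
   comparison such as "a == b", *PREP_SEQ receives the insns that prepare
   the two operands, *GEN_SEQ receives the compare that writes the CC
   register, and the returned rtx is CC_REGNUM in CC_DEQmode, ready to be
   consumed by aarch64_gen_ccmp_next below.  */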
10604 static rtx
10605 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10606 tree treeop0, tree treeop1, int bit_code)
10608 rtx op0, op1, cmp0, cmp1, target;
10609 enum machine_mode op_mode, cmp_mode, cc_mode;
10610 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10611 enum insn_code icode = CODE_FOR_ccmp_andsi;
10612 struct expand_operand ops[6];
10614 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10615 if (cc_mode == CCmode)
10616 return NULL_RTX;
10618 push_to_sequence ((rtx_insn*) *prep_seq);
10619 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10621 op_mode = GET_MODE (op0);
10622 if (op_mode == VOIDmode)
10623 op_mode = GET_MODE (op1);
10625 switch (op_mode)
10627 case QImode:
10628 case HImode:
10629 case SImode:
10630 cmp_mode = SImode;
10631 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10632 : CODE_FOR_ccmp_iorsi;
10633 break;
10635 case DImode:
10636 cmp_mode = DImode;
10637 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10638 : CODE_FOR_ccmp_iordi;
10639 break;
10641 default:
10642 end_sequence ();
10643 return NULL_RTX;
10646 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10647 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10648 if (!op0 || !op1)
10650 end_sequence ();
10651 return NULL_RTX;
10653 *prep_seq = get_insns ();
10654 end_sequence ();
10656 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10657 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10658 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10660 create_fixed_operand (&ops[0], prev);
10661 create_fixed_operand (&ops[1], target);
10662 create_fixed_operand (&ops[2], op0);
10663 create_fixed_operand (&ops[3], op1);
10664 create_fixed_operand (&ops[4], cmp0);
10665 create_fixed_operand (&ops[5], cmp1);
10667 push_to_sequence ((rtx_insn*) *gen_seq);
10668 if (!maybe_expand_insn (icode, 6, ops))
10670 end_sequence ();
10671 return NULL_RTX;
10674 *gen_seq = get_insns ();
10675 end_sequence ();
10677 return target;
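/* Roughly, for the second half of a condition such as "a == 0 && b == 4"
   this emits a conditional compare, so the final assembly is expected to
   look something like:
     cmp  w0, #0
     ccmp w1, #4, #0, eq
   with the combined result left in the CC register (register numbers are
   illustrative only).  */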
10680 #undef TARGET_GEN_CCMP_FIRST
10681 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10683 #undef TARGET_GEN_CCMP_NEXT
10684 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10686 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
10687 instruction fusion of some sort. */
10689 static bool
10690 aarch64_macro_fusion_p (void)
10692 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10696 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10697 should be kept together during scheduling. */
10699 static bool
10700 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10702 rtx set_dest;
10703 rtx prev_set = single_set (prev);
10704 rtx curr_set = single_set (curr);
10705 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10706 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10708 if (!aarch64_macro_fusion_p ())
10709 return false;
10711 if (simple_sets_p
10712 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10714 /* We are trying to match:
10715 prev (mov) == (set (reg r0) (const_int imm16))
10716 curr (movk) == (set (zero_extract (reg r0)
10717 (const_int 16)
10718 (const_int 16))
10719 (const_int imm16_1)) */
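/* In assembly terms this corresponds to a pair along the lines of:
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16
   (illustrative values).  */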
10721 set_dest = SET_DEST (curr_set);
10723 if (GET_CODE (set_dest) == ZERO_EXTRACT
10724 && CONST_INT_P (SET_SRC (curr_set))
10725 && CONST_INT_P (SET_SRC (prev_set))
10726 && CONST_INT_P (XEXP (set_dest, 2))
10727 && INTVAL (XEXP (set_dest, 2)) == 16
10728 && REG_P (XEXP (set_dest, 0))
10729 && REG_P (SET_DEST (prev_set))
10730 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10732 return true;
10736 if (simple_sets_p
10737 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10740 /* We're trying to match:
10741 prev (adrp) == (set (reg r1)
10742 (high (symbol_ref ("SYM"))))
10743 curr (add) == (set (reg r0)
10744 (lo_sum (reg r1)
10745 (symbol_ref ("SYM"))))
10746 Note that r0 need not necessarily be the same as r1, especially
10747 during pre-regalloc scheduling. */
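/* In assembly terms, an illustrative pair would be:
     adrp x1, SYM
     add  x0, x1, :lo12:SYM  */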
10749 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10750 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10752 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10753 && REG_P (XEXP (SET_SRC (curr_set), 0))
10754 && REGNO (XEXP (SET_SRC (curr_set), 0))
10755 == REGNO (SET_DEST (prev_set))
10756 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10757 XEXP (SET_SRC (curr_set), 1)))
10758 return true;
10762 if (simple_sets_p
10763 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10766 /* We're trying to match:
10767 prev (movk) == (set (zero_extract (reg r0)
10768 (const_int 16)
10769 (const_int 32))
10770 (const_int imm16_1))
10771 curr (movk) == (set (zero_extract (reg r0)
10772 (const_int 16)
10773 (const_int 48))
10774 (const_int imm16_2)) */
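/* In assembly terms, an illustrative pair would be:
     movk x0, #0x1234, lsl 32
     movk x0, #0x5678, lsl 48  */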
10776 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10777 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10778 && REG_P (XEXP (SET_DEST (prev_set), 0))
10779 && REG_P (XEXP (SET_DEST (curr_set), 0))
10780 && REGNO (XEXP (SET_DEST (prev_set), 0))
10781 == REGNO (XEXP (SET_DEST (curr_set), 0))
10782 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10783 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10784 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10785 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10786 && CONST_INT_P (SET_SRC (prev_set))
10787 && CONST_INT_P (SET_SRC (curr_set)))
10788 return true;
10791 if (simple_sets_p
10792 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10794 /* We're trying to match:
10795 prev (adrp) == (set (reg r0)
10796 (high (symbol_ref ("SYM"))))
10797 curr (ldr) == (set (reg r1)
10798 (mem (lo_sum (reg r0)
10799 (symbol_ref ("SYM")))))
10801 curr (ldr) == (set (reg r1)
10802 (zero_extend (mem
10803 (lo_sum (reg r0)
10804 (symbol_ref ("SYM")))))) */
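/* In assembly terms, an illustrative pair would be:
     adrp x0, SYM
     ldr  w1, [x0, #:lo12:SYM]  */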
10805 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10806 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10808 rtx curr_src = SET_SRC (curr_set);
10810 if (GET_CODE (curr_src) == ZERO_EXTEND)
10811 curr_src = XEXP (curr_src, 0);
10813 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10814 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10815 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10816 == REGNO (SET_DEST (prev_set))
10817 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10818 XEXP (SET_SRC (prev_set), 0)))
10819 return true;
10823 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10824 && any_condjump_p (curr))
10826 enum attr_type prev_type = get_attr_type (prev);
10828 /* FIXME: this misses some cases that are considered simple arithmetic
10829 instructions for ThunderX. Simple shifts are missed here. */
10830 if (prev_type == TYPE_ALUS_SREG
10831 || prev_type == TYPE_ALUS_IMM
10832 || prev_type == TYPE_LOGICS_REG
10833 || prev_type == TYPE_LOGICS_IMM)
10834 return true;
10837 return false;
10840 /* If MEM is in the form of [base+offset], extract the two parts
10841 of the address and set BASE and OFFSET, otherwise return false
10842 after clearing BASE and OFFSET. */
10844 bool
10845 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10847 rtx addr;
10849 gcc_assert (MEM_P (mem));
10851 addr = XEXP (mem, 0);
10853 if (REG_P (addr))
10855 *base = addr;
10856 *offset = const0_rtx;
10857 return true;
10860 if (GET_CODE (addr) == PLUS
10861 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10863 *base = XEXP (addr, 0);
10864 *offset = XEXP (addr, 1);
10865 return true;
10868 *base = NULL_RTX;
10869 *offset = NULL_RTX;
10871 return false;
10874 /* Types for scheduling fusion. */
10875 enum sched_fusion_type
10877 SCHED_FUSION_NONE = 0,
10878 SCHED_FUSION_LD_SIGN_EXTEND,
10879 SCHED_FUSION_LD_ZERO_EXTEND,
10880 SCHED_FUSION_LD,
10881 SCHED_FUSION_ST,
10882 SCHED_FUSION_NUM
10885 /* If INSN is a load or store with an address in the form of [base+offset],
10886 extract the two parts and set BASE and OFFSET. Return the scheduling
10887 fusion type of this INSN. */
10889 static enum sched_fusion_type
10890 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10892 rtx x, dest, src;
10893 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10895 gcc_assert (INSN_P (insn));
10896 x = PATTERN (insn);
10897 if (GET_CODE (x) != SET)
10898 return SCHED_FUSION_NONE;
10900 src = SET_SRC (x);
10901 dest = SET_DEST (x);
10903 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10904 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10905 return SCHED_FUSION_NONE;
10907 if (GET_CODE (src) == SIGN_EXTEND)
10909 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10910 src = XEXP (src, 0);
10911 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10912 return SCHED_FUSION_NONE;
10914 else if (GET_CODE (src) == ZERO_EXTEND)
10916 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10917 src = XEXP (src, 0);
10918 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10919 return SCHED_FUSION_NONE;
10922 if (GET_CODE (src) == MEM && REG_P (dest))
10923 extract_base_offset_in_addr (src, base, offset);
10924 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10926 fusion = SCHED_FUSION_ST;
10927 extract_base_offset_in_addr (dest, base, offset);
10929 else
10930 return SCHED_FUSION_NONE;
10932 if (*base == NULL_RTX || *offset == NULL_RTX)
10933 fusion = SCHED_FUSION_NONE;
10935 return fusion;
10938 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10940 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10941 and PRI are only calculated for these instructions. For other instructions,
10942 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
10943 other instruction types can be added by returning different priorities.
10945 It's important that irrelevant instructions get the largest FUSION_PRI. */
10947 static void
10948 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10949 int *fusion_pri, int *pri)
10951 int tmp, off_val;
10952 rtx base, offset;
10953 enum sched_fusion_type fusion;
10955 gcc_assert (INSN_P (insn));
10957 tmp = max_pri - 1;
10958 fusion = fusion_load_store (insn, &base, &offset);
10959 if (fusion == SCHED_FUSION_NONE)
10961 *pri = tmp;
10962 *fusion_pri = tmp;
10963 return;
10966 /* Set FUSION_PRI according to fusion type and base register. */
10967 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10969 /* Calculate PRI. */
10970 tmp /= 2;
10972 /* INSN with smaller offset goes first. */
10973 off_val = (int)(INTVAL (offset));
10974 if (off_val >= 0)
10975 tmp -= (off_val & 0xfffff);
10976 else
10977 tmp += ((- off_val) & 0xfffff);
10979 *pri = tmp;
10980 return;
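/* Worked example of the intent, assuming two SImode loads from [x1, 4] and
   [x1, 8]: both get the same FUSION_PRI (same fusion type and base
   register), while the load with the smaller offset gets the larger PRI and
   is therefore preferred first, keeping candidate ldp pairs adjacent.  */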
10983 /* Given OPERANDS of consecutive load/store, check if we can merge
10984 them into ldp/stp. LOAD is true if they are load instructions.
10985 MODE is the mode of memory operands. */
10987 bool
10988 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10989 enum machine_mode mode)
10991 HOST_WIDE_INT offval_1, offval_2, msize;
10992 enum reg_class rclass_1, rclass_2;
10993 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10995 if (load)
10997 mem_1 = operands[1];
10998 mem_2 = operands[3];
10999 reg_1 = operands[0];
11000 reg_2 = operands[2];
11001 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11002 if (REGNO (reg_1) == REGNO (reg_2))
11003 return false;
11005 else
11007 mem_1 = operands[0];
11008 mem_2 = operands[2];
11009 reg_1 = operands[1];
11010 reg_2 = operands[3];
11013 /* The mems cannot be volatile. */
11014 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11015 return false;
11017 /* Check if the addresses are in the form of [base+offset]. */
11018 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11019 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11020 return false;
11021 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11022 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11023 return false;
11025 /* Check if the bases are the same. */
11026 if (!rtx_equal_p (base_1, base_2))
11027 return false;
11029 offval_1 = INTVAL (offset_1);
11030 offval_2 = INTVAL (offset_2);
11031 msize = GET_MODE_SIZE (mode);
11032 /* Check if the offsets are consecutive. */
11033 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11034 return false;
11036 /* Check if the addresses are clobbered by load. */
11037 if (load)
11039 if (reg_mentioned_p (reg_1, mem_1))
11040 return false;
11042 /* In increasing order, the last load can clobber the address. */
11043 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11044 return false;
11047 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11048 rclass_1 = FP_REGS;
11049 else
11050 rclass_1 = GENERAL_REGS;
11052 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11053 rclass_2 = FP_REGS;
11054 else
11055 rclass_2 = GENERAL_REGS;
11057 /* Check if the registers are of the same class. */
11058 if (rclass_1 != rclass_2)
11059 return false;
11061 return true;
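/* For instance, the checks above are presumably what allows
     ldr w0, [x2, 4]
     ldr w1, [x2, 8]
   to be merged into "ldp w0, w1, [x2, 4]" (illustrative operands).  */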
11064 /* Given OPERANDS of consecutive load/store, check if we can merge
11065 them into ldp/stp by adjusting the offset. LOAD is true if they
11066 are load instructions. MODE is the mode of memory operands.
11068 Given the following consecutive stores:
11070 str w1, [xb, 0x100]
11071 str w1, [xb, 0x104]
11072 str w1, [xb, 0x108]
11073 str w1, [xb, 0x10c]
11075 Though the offsets are out of the range supported by stp, we can
11076 still pair them after adjusting the offset, like:
11078 add scratch, xb, 0x100
11079 stp w1, w1, [scratch]
11080 stp w1, w1, [scratch, 0x8]
11082 The peephole patterns detecting this opportunity should guarantee
11083 the scratch register is available. */
11085 bool
11086 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11087 enum machine_mode mode)
11089 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11090 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11091 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11092 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11094 if (load)
11096 reg_1 = operands[0];
11097 mem_1 = operands[1];
11098 reg_2 = operands[2];
11099 mem_2 = operands[3];
11100 reg_3 = operands[4];
11101 mem_3 = operands[5];
11102 reg_4 = operands[6];
11103 mem_4 = operands[7];
11104 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11105 && REG_P (reg_3) && REG_P (reg_4));
11106 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11107 return false;
11109 else
11111 mem_1 = operands[0];
11112 reg_1 = operands[1];
11113 mem_2 = operands[2];
11114 reg_2 = operands[3];
11115 mem_3 = operands[4];
11116 reg_3 = operands[5];
11117 mem_4 = operands[6];
11118 reg_4 = operands[7];
11120 /* Skip if the memory operand is by itself valid for ldp/stp. */
11121 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11122 return false;
11124 /* The mems cannot be volatile. */
11125 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11126 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11127 return false;
11129 /* Check if the addresses are in the form of [base+offset]. */
11130 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11131 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11132 return false;
11133 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11134 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11135 return false;
11136 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11137 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11138 return false;
11139 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11140 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11141 return false;
11143 /* Check if the bases are the same. */
11144 if (!rtx_equal_p (base_1, base_2)
11145 || !rtx_equal_p (base_2, base_3)
11146 || !rtx_equal_p (base_3, base_4))
11147 return false;
11149 offval_1 = INTVAL (offset_1);
11150 offval_2 = INTVAL (offset_2);
11151 offval_3 = INTVAL (offset_3);
11152 offval_4 = INTVAL (offset_4);
11153 msize = GET_MODE_SIZE (mode);
11154 /* Check if the offsets are consecutive. */
11155 if ((offval_1 != (offval_2 + msize)
11156 || offval_1 != (offval_3 + msize * 2)
11157 || offval_1 != (offval_4 + msize * 3))
11158 && (offval_4 != (offval_3 + msize)
11159 || offval_4 != (offval_2 + msize * 2)
11160 || offval_4 != (offval_1 + msize * 3)))
11161 return false;
11163 /* Check if the addresses are clobbered by load. */
11164 if (load)
11166 if (reg_mentioned_p (reg_1, mem_1)
11167 || reg_mentioned_p (reg_2, mem_2)
11168 || reg_mentioned_p (reg_3, mem_3))
11169 return false;
11171 /* In increasing order, the last load can clobber the address. */
11172 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11173 return false;
11176 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11177 rclass_1 = FP_REGS;
11178 else
11179 rclass_1 = GENERAL_REGS;
11181 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11182 rclass_2 = FP_REGS;
11183 else
11184 rclass_2 = GENERAL_REGS;
11186 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11187 rclass_3 = FP_REGS;
11188 else
11189 rclass_3 = GENERAL_REGS;
11191 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11192 rclass_4 = FP_REGS;
11193 else
11194 rclass_4 = GENERAL_REGS;
11196 /* Check if the registers are of the same class. */
11197 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11198 return false;
11200 return true;
11203 /* Given OPERANDS of consecutive load/store, this function pairs them
11204 into ldp/stp after adjusting the offset. It depends on the fact
11205 that addresses of load/store instructions are in increasing order.
11206 MODE is the mode of memory operands. CODE is the rtl operator
11207 which should be applied to all memory operands; it is SIGN_EXTEND,
11208 ZERO_EXTEND or UNKNOWN. */
11210 bool
11211 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11212 enum machine_mode mode, RTX_CODE code)
11214 rtx base, offset, t1, t2;
11215 rtx mem_1, mem_2, mem_3, mem_4;
11216 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11218 if (load)
11220 mem_1 = operands[1];
11221 mem_2 = operands[3];
11222 mem_3 = operands[5];
11223 mem_4 = operands[7];
11225 else
11227 mem_1 = operands[0];
11228 mem_2 = operands[2];
11229 mem_3 = operands[4];
11230 mem_4 = operands[6];
11231 gcc_assert (code == UNKNOWN);
11234 extract_base_offset_in_addr (mem_1, &base, &offset);
11235 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11237 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11238 msize = GET_MODE_SIZE (mode);
11239 stp_off_limit = msize * 0x40;
11240 off_val = INTVAL (offset);
11241 abs_off = (off_val < 0) ? -off_val : off_val;
11242 new_off = abs_off % stp_off_limit;
11243 adj_off = abs_off - new_off;
11245 /* Further adjust to make sure all offsets are OK. */
11246 if ((new_off + msize * 2) >= stp_off_limit)
11248 adj_off += stp_off_limit;
11249 new_off -= stp_off_limit;
11252 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11253 if (adj_off >= 0x1000)
11254 return false;
11256 if (off_val < 0)
11258 adj_off = -adj_off;
11259 new_off = -new_off;
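/* Worked example matching the comment before
   aarch64_operands_adjust_ok_for_ldpstp: for SImode, msize == 4, so
   stp_off_limit == 0x100; an original offset of 0x100 gives adj_off == 0x100
   and new_off == 0, i.e. one "add scratch, base, 0x100" followed by stp
   accesses at [scratch] and [scratch, 8].  */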
11262 /* Create new memory references. */
11263 mem_1 = change_address (mem_1, VOIDmode,
11264 plus_constant (DImode, operands[8], new_off));
11266 /* Check if the adjusted address is OK for ldp/stp. */
11267 if (!aarch64_mem_pair_operand (mem_1, mode))
11268 return false;
11270 msize = GET_MODE_SIZE (mode);
11271 mem_2 = change_address (mem_2, VOIDmode,
11272 plus_constant (DImode,
11273 operands[8],
11274 new_off + msize));
11275 mem_3 = change_address (mem_3, VOIDmode,
11276 plus_constant (DImode,
11277 operands[8],
11278 new_off + msize * 2));
11279 mem_4 = change_address (mem_4, VOIDmode,
11280 plus_constant (DImode,
11281 operands[8],
11282 new_off + msize * 3));
11284 if (code == ZERO_EXTEND)
11286 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11287 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11288 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11289 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11291 else if (code == SIGN_EXTEND)
11293 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11294 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11295 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11296 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11299 if (load)
11301 operands[1] = mem_1;
11302 operands[3] = mem_2;
11303 operands[5] = mem_3;
11304 operands[7] = mem_4;
11306 else
11308 operands[0] = mem_1;
11309 operands[2] = mem_2;
11310 operands[4] = mem_3;
11311 operands[6] = mem_4;
11314 /* Emit adjusting instruction. */
11315 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11316 plus_constant (DImode, base, adj_off)));
11317 /* Emit ldp/stp instructions. */
11318 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11319 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11320 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11321 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11322 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11323 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11324 return true;
11327 #undef TARGET_ADDRESS_COST
11328 #define TARGET_ADDRESS_COST aarch64_address_cost
11330 /* This hook determines whether unnamed bitfields affect the alignment
11331 of the containing structure. The hook returns true if the structure
11332 should inherit the alignment requirements of an unnamed bitfield's
11333 type. */
11334 #undef TARGET_ALIGN_ANON_BITFIELD
11335 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11337 #undef TARGET_ASM_ALIGNED_DI_OP
11338 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11340 #undef TARGET_ASM_ALIGNED_HI_OP
11341 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11343 #undef TARGET_ASM_ALIGNED_SI_OP
11344 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11346 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11347 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11348 hook_bool_const_tree_hwi_hwi_const_tree_true
11350 #undef TARGET_ASM_FILE_START
11351 #define TARGET_ASM_FILE_START aarch64_start_file
11353 #undef TARGET_ASM_OUTPUT_MI_THUNK
11354 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11356 #undef TARGET_ASM_SELECT_RTX_SECTION
11357 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11359 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11360 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11362 #undef TARGET_BUILD_BUILTIN_VA_LIST
11363 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11365 #undef TARGET_CALLEE_COPIES
11366 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11368 #undef TARGET_CAN_ELIMINATE
11369 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11371 #undef TARGET_CANNOT_FORCE_CONST_MEM
11372 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11374 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11375 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11377 /* Only the least significant bit is used for initialization guard
11378 variables. */
11379 #undef TARGET_CXX_GUARD_MASK_BIT
11380 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11382 #undef TARGET_C_MODE_FOR_SUFFIX
11383 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11385 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11386 #undef TARGET_DEFAULT_TARGET_FLAGS
11387 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11388 #endif
11390 #undef TARGET_CLASS_MAX_NREGS
11391 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11393 #undef TARGET_BUILTIN_DECL
11394 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11396 #undef TARGET_EXPAND_BUILTIN
11397 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11399 #undef TARGET_EXPAND_BUILTIN_VA_START
11400 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11402 #undef TARGET_FOLD_BUILTIN
11403 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11405 #undef TARGET_FUNCTION_ARG
11406 #define TARGET_FUNCTION_ARG aarch64_function_arg
11408 #undef TARGET_FUNCTION_ARG_ADVANCE
11409 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11411 #undef TARGET_FUNCTION_ARG_BOUNDARY
11412 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11414 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11415 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11417 #undef TARGET_FUNCTION_VALUE
11418 #define TARGET_FUNCTION_VALUE aarch64_function_value
11420 #undef TARGET_FUNCTION_VALUE_REGNO_P
11421 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11423 #undef TARGET_FRAME_POINTER_REQUIRED
11424 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11426 #undef TARGET_GIMPLE_FOLD_BUILTIN
11427 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11429 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11430 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11432 #undef TARGET_INIT_BUILTINS
11433 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11435 #undef TARGET_LEGITIMATE_ADDRESS_P
11436 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11438 #undef TARGET_LEGITIMATE_CONSTANT_P
11439 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11441 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11442 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11444 #undef TARGET_LRA_P
11445 #define TARGET_LRA_P hook_bool_void_true
11447 #undef TARGET_MANGLE_TYPE
11448 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11450 #undef TARGET_MEMORY_MOVE_COST
11451 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11453 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11454 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11456 #undef TARGET_MUST_PASS_IN_STACK
11457 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11459 /* This target hook should return true if accesses to volatile bitfields
11460 should use the narrowest mode possible. It should return false if these
11461 accesses should use the bitfield container type. */
11462 #undef TARGET_NARROW_VOLATILE_BITFIELD
11463 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11465 #undef TARGET_OPTION_OVERRIDE
11466 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11468 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11469 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11470 aarch64_override_options_after_change
11472 #undef TARGET_PASS_BY_REFERENCE
11473 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11475 #undef TARGET_PREFERRED_RELOAD_CLASS
11476 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11478 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11479 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11481 #undef TARGET_SECONDARY_RELOAD
11482 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11484 #undef TARGET_SHIFT_TRUNCATION_MASK
11485 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11487 #undef TARGET_SETUP_INCOMING_VARARGS
11488 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11490 #undef TARGET_STRUCT_VALUE_RTX
11491 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11493 #undef TARGET_REGISTER_MOVE_COST
11494 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11496 #undef TARGET_RETURN_IN_MEMORY
11497 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11499 #undef TARGET_RETURN_IN_MSB
11500 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11502 #undef TARGET_RTX_COSTS
11503 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11505 #undef TARGET_SCHED_ISSUE_RATE
11506 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11508 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11509 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11510 aarch64_sched_first_cycle_multipass_dfa_lookahead
11512 #undef TARGET_TRAMPOLINE_INIT
11513 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11515 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11516 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11518 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11519 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11521 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11522 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11524 #undef TARGET_VECTORIZE_ADD_STMT_COST
11525 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11527 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11528 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11529 aarch64_builtin_vectorization_cost
11531 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11532 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11534 #undef TARGET_VECTORIZE_BUILTINS
11535 #define TARGET_VECTORIZE_BUILTINS
11537 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11538 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11539 aarch64_builtin_vectorized_function
11541 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11542 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11543 aarch64_autovectorize_vector_sizes
11545 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11546 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11547 aarch64_atomic_assign_expand_fenv
11549 /* Section anchor support. */
11551 #undef TARGET_MIN_ANCHOR_OFFSET
11552 #define TARGET_MIN_ANCHOR_OFFSET -256
11554 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11555 byte offset; we can do much more for larger data types, but have no way
11556 to determine the size of the access. We assume accesses are aligned. */
11557 #undef TARGET_MAX_ANCHOR_OFFSET
11558 #define TARGET_MAX_ANCHOR_OFFSET 4095
11560 #undef TARGET_VECTOR_ALIGNMENT
11561 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11563 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11564 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11565 aarch64_simd_vector_alignment_reachable
11567 /* vec_perm support. */
11569 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11570 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11571 aarch64_vectorize_vec_perm_const_ok
11574 #undef TARGET_FIXED_CONDITION_CODE_REGS
11575 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11577 #undef TARGET_FLAGS_REGNUM
11578 #define TARGET_FLAGS_REGNUM CC_REGNUM
11580 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11581 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11583 #undef TARGET_ASAN_SHADOW_OFFSET
11584 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11586 #undef TARGET_LEGITIMIZE_ADDRESS
11587 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11589 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11590 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11591 aarch64_use_by_pieces_infrastructure_p
11593 #undef TARGET_CAN_USE_DOLOOP_P
11594 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11596 #undef TARGET_SCHED_MACRO_FUSION_P
11597 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11599 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11600 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11602 #undef TARGET_SCHED_FUSION_PRIORITY
11603 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11605 struct gcc_target targetm = TARGET_INITIALIZER;
11607 #include "gt-aarch64.h"