[AArch64] Remember to cost operand 0 in FP compare-with-0.0 case
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
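/* For illustration, these classes roughly correspond to addressing
   forms such as:

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!  or  ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   a pc-relative literal load, e.g. ldr x0, .Lc0

   The exact instructions chosen depend on the mode and context. */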
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
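/* For illustration, these flags name instruction pairs that a core
   can macro-fuse when they are issued back to back, e.g.

     AARCH64_FUSE_MOV_MOVK    mov  x0, #0x1234
                              movk x0, #0x5678, lsl #16
     AARCH64_FUSE_ADRP_ADD    adrp x0, sym
                              add  x0, x0, :lo12:sym
     AARCH64_FUSE_MOVK_MOVK   two consecutive movk instructions
     AARCH64_FUSE_ADRP_LDR    adrp x0, sym
                              ldr  x1, [x0, #:lo12:sym]
     AARCH64_FUSE_CMP_BRANCH  a compare followed by a conditional branch

   The per-core fuseable_ops field below selects which pairs the
   scheduler tries to keep adjacent. */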
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as command-line arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
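/* For illustration, a "bitmask immediate" is built from a repeating
   2/4/8/16/32/64-bit element that is a rotated run of contiguous
   ones, e.g.

     0x00000000000000ff   a run of 8 ones
     0x0000ffff0000ffff   16 ones repeating with period 32
     0x5555555555555555   alternating bits (period 2)

   whereas a value such as 0x0000000000012345 is not encodable this
   way and has to be synthesized with MOVZ/MOVK sequences instead. */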
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
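/* For illustration, the enumeration above pairs each condition with
   its inverse so that the two differ only in bit 0, hence the XOR:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */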
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
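/* For illustration, assuming the usual AArch64 DWARF numbering from
   aarch64.h (x0-x30 -> 0-30, sp -> 31, v0-v31 -> 64-95), the call
   aarch64_dbx_register_number (V3_REGNUM) yields 67, while a regno
   with no DWARF equivalent (e.g. CC_REGNUM) falls through to
   DWARF_FRAME_REGISTERS. */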
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
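/* For illustration, with EXTRACT_IMM == 34 and MULT_IMM == 4 the
   pattern extracts the low 34 bits of (reg * 4); the multiply
   shifts reg left by 2, so this is the same as extending the low
   32 bits of reg and shifting the result left by 2. The test below
   checks exactly that shape: the extract width with its low three
   bits cleared must be a power of two (the extend size), and the
   multiplier must equal 1 << (the remaining shift). */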
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for the CC register in the proper mode. */
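/* A typical usage sketch (illustrative):

     rtx cc_reg = aarch64_gen_compare_reg (code, op0, op1);
     rtx cond = gen_rtx_fmt_ee (code, VOIDmode, cc_reg, const0_rtx);

   i.e. callers build a comparison of the returned CC register
   against zero using the same comparison code. */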
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We'll allow LO_SUMs in our legitimate addresses
740 so that combine can take care of combining addresses where
741 necessary, but for generation purposes, we'll generate the address
742 as follows:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. stored in memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. stored in memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
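/* Worked example (illustrative): for val == 0x1234ffff5678ffff,
   quarters 0 and 2 are 0xffff, so one_match == 2 and
   first_not_ffff_match == 16. We first move 0x1234ffffffffffff
   (val with that quarter forced to 0xffff, which is MOVN-encodable)
   and then re-insert 0x5678 at bit 16 with a MOVK, for a total of
   two instructions. */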
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
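/* Worked example (illustrative): 0x55ff55ff55ff55ff is the OR of
   two valid bitmask immediates, 0x5555555555555555 and
   0x00ff00ff00ff00ff, so (assuming no earlier, cheaper case has
   already matched) it can be built as a MOV of the first followed
   by an ORR with the second, instead of four MOVZ/MOVK insns. */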
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert (can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always passed by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
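/* For illustration: under these rules a plain 24-byte struct is
   passed by reference, a 16-byte struct is passed by value in two
   registers, and a 16-byte HFA such as struct { float a, b, c, d; }
   is passed by value in four FP/SIMD registers because the
   candidate check above fires first. */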
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C.6 - C.9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers.
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
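/* For illustration: in a call f (int x, __int128 y), x takes w0 and
   leaves ncrn == 1; y needs two registers and has 16-byte alignment,
   so ncrn is bumped to 2 and y is passed in x2/x3, leaving x1
   unused. */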
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
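/* For illustration: on a big-endian target a stack-passed char is
   padded downward and so occupies the highest-addressed byte of its
   slot, while a 3-byte struct is padded upward and occupies the
   lowest-addressed bytes. On little-endian targets everything is
   padded upward. */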
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 assuming the block is in the memory, padding upward means that
1978 the last element is padded after its most significant byte,
1979 while in downward padding, the last element is padded at
1980 its least significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
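/* For illustration: with a frame pointer, a function that also
   saves x19 and d8 gets reg_offset[R29] == 0, reg_offset[R30] == 8,
   reg_offset[R19] == 16 and reg_offset[V8] == 24, so
   saved_regs_size == 32; hard_fp_offset and frame_size then add the
   local frame and the outgoing argument area, each rounded up to
   STACK_BOUNDARY. */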
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
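/* Illustrative sketch (not part of the original source): the size
   computations above, redone as a standalone function with hypothetical
   inputs.  Assumes 8-byte registers and a 16-byte STACK_BOUNDARY, as on
   AArch64; names prefixed "sketch_" are invented for the example.  */

#define SKETCH_ROUND_UP(x, align) (((x) + (align) - 1) & -(long long) (align))

struct sketch_frame_sizes
{
  long long saved_regs_size;	/* Callee-saves plus alignment padding.  */
  long long hard_fp_offset;	/* Frame record up to the incoming SP.  */
  long long frame_size;		/* Total stack decrement for the function.  */
};

static struct sketch_frame_sizes
sketch_layout_frame (int n_callee_saves, long long varargs_save_size,
		     long long locals_size, long long outgoing_args_size,
		     int need_frame_record)
{
  struct sketch_frame_sizes f;
  long long offset = 0;

  /* FP and LR take the first two slots when a frame record is needed.  */
  if (need_frame_record)
    offset += 2 * 8;

  /* One 8-byte slot for every other callee-saved register.  */
  offset += 8LL * n_callee_saves;

  f.saved_regs_size = SKETCH_ROUND_UP (offset, 16);
  f.hard_fp_offset = SKETCH_ROUND_UP (varargs_save_size + locals_size
				      + f.saved_regs_size, 16);
  f.frame_size = SKETCH_ROUND_UP (f.hard_fp_offset + outgoing_args_size, 16);
  return f;
}

/* For example, a frame record, three further callee-saves, 40 bytes of
   locals and 16 bytes of outgoing arguments give saved_regs_size == 48,
   hard_fp_offset == 96 and frame_size == 112.  */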
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
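/* Illustrative sketch (not part of the original source): the pairing rule
   used in the save/restore loops, in isolation.  OFFSETS is assumed to be
   the increasing list of slot offsets assigned by aarch64_layout_frame;
   two saves are merged into one stp/ldp only when their slots are exactly
   one word (8 bytes) apart.  */

static int
sketch_count_save_insns (const long long *offsets, int n)
{
  int insns = 0, i = 0;

  while (i < n)
    {
      /* Adjacent slots can be covered by a single store pair ...  */
      if (i + 1 < n && offsets[i] + 8 == offsets[i + 1])
	i += 2;
      /* ... anything else falls back to a single store.  */
      else
	i += 1;
      insns++;
    }
  return insns;
}

/* E.g. slots at offsets 16, 24 and 40 take two instructions: one stp for
   the first pair and one str for the stray slot.  */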
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have a range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* when offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
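/* Illustrative sketch (not part of the original source): the frame_size
   splitting used above.  An AArch64 add/sub immediate is 12 bits wide,
   optionally shifted left by 12, so any adjustment below 0x1000000 fits
   in at most two sub instructions; larger frames are built in a scratch
   register (IP0) and added in one go.  */

static int
sketch_split_sp_adjust (long long frame_size,
			long long *hi_ofs, long long *lo_ofs)
{
  if (frame_size >= 0x1000000)
    return 0;			/* Needs the scratch-register path.  */

  *hi_ofs = frame_size & 0xfff000;	/* sub sp, sp, #hi_ofs (LSL #12 form).  */
  *lo_ofs = frame_size & 0x000fff;	/* sub sp, sp, #lo_ofs.  */
  return 1;
}

/* A hypothetical frame_size of 0x12345 splits into hi_ofs == 0x12000 and
   lo_ofs == 0x345, i.e. two subtractions on entry and two additions in
   the matching epilogue.  */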
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee-saved stack area is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass uses
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have a range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2730 rtx
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the fewest
2817 number of instructions, preferring MOVZ instructions when they are both
2818 the same. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
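/* Illustrative sketch (not part of the original source): the zcount/ncount
   heuristic above as a standalone instruction-count estimate, ignoring the
   earlier bitmask-immediate fast path.  Starting from a MOVZ of the low 16
   bits, every non-zero chunk above it needs a MOVK; starting from a MOVN,
   every chunk that is not 0xffff needs one.  */

static int
sketch_build_constant_cost (unsigned long long val)
{
  int zcount = 0, ncount = 0, i;

  for (i = 16; i < 64; i += 16)
    {
      unsigned long long chunk = (val >> i) & 0xffff;
      if (chunk != 0)
	zcount++;
      if (chunk != 0xffff)
	ncount++;
    }

  /* One MOVZ or MOVN plus the cheaper number of MOVKs; ties go to MOVZ.  */
  return 1 + (ncount < zcount ? ncount : zcount);
}

/* E.g. 0x1234567890abcdef costs 4 (MOVZ plus three MOVKs), while
   0xffffffffffff1234 costs 1 (a single MOVN).  */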
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
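/* Illustrative sketch (not part of the original source): a rough C model
   of the adjustment the emitted thunk performs on its first argument
   (arriving in x0) before tail-calling FUNCTION.  For a virtual base the
   extra adjustment is read from the object's vtable at VCALL_OFFSET.  */

static void *
sketch_thunk_adjust_this (void *this_ptr, long long delta,
			  long long vcall_offset)
{
  char *p = (char *) this_ptr + delta;

  if (vcall_offset != 0)
    {
      /* Load the vtable pointer from the delta-adjusted object, then add
	 the offset stored at VCALL_OFFSET inside the vtable.  */
      char *vtable = *(char **) p;
      p += *(long long *) (vtable + vcall_offset);
    }

  return p;			/* Becomes the new x0 for FUNCTION.  */
}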
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* Set s consecutive bits to 1 (s < 64). */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* Rotate right by r. */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* Replicate the constant depending on the element size. */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
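/* Illustrative sketch (not part of the original source): the same
   enumeration in isolation.  A "bitmask" (logical) immediate is a run of
   S consecutive ones inside an element of E bits (E a power of two from
   2 to 64, 0 < S < E), rotated right by R and replicated across all 64
   bits; the loops above generate exactly the 5334 values such triples
   produce.  */

static unsigned long long
sketch_bitmask_imm (unsigned e, unsigned s, unsigned r)
{
  unsigned long long mask = (e == 64) ? ~0ULL : (1ULL << e) - 1;
  unsigned long long imm = (1ULL << s) - 1;	/* S consecutive ones.  */
  unsigned width;

  if (r != 0)					/* Rotate right within E bits.  */
    imm = ((imm >> r) | (imm << (e - r))) & mask;

  for (width = e; width < 64; width *= 2)	/* Replicate to 64 bits.  */
    imm |= imm << width;

  return imm;
}

/* E.g. E == 8, S == 4, R == 1 gives the byte 0x87 and hence the
   immediate 0x8787878787878787, which instructions like AND and ORR can
   encode directly.  */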
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
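/* Illustrative sketch (not part of the original source): the MOVZ/MOVN
   half of the test above, for 64-bit values.  A constant is a single MOVZ
   when at most one 16-bit chunk is non-zero, and a single MOVN when at
   most one chunk differs from 0xffff; anything else must be a bitmask
   immediate or a longer sequence.  */

static int
sketch_single_mov_imm_p (unsigned long long val)
{
  int nonzero = 0, not_ones = 0, i;

  for (i = 0; i < 64; i += 16)
    {
      unsigned long long chunk = (val >> i) & 0xffff;
      nonzero += (chunk != 0);
      not_ones += (chunk != 0xffff);
    }

  return nonzero <= 1 || not_ones <= 1;
}

/* E.g. 0x00000000abcd0000 is one MOVZ and 0xffffffffffff0123 is one MOVN,
   but 0x0000000000010001 passes neither test (and is not a bitmask
   immediate either), so it needs MOVZ plus MOVK.  */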
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 to aarch64_expand_mov_immediate to handle it properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register for mode MODE.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register for mode MODE.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
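/* Illustrative sketch (not part of the original source): the final shift
   check above.  A register index is accepted either unscaled or scaled by
   the access size, i.e. shifted left by log2(size) for sizes of 1, 2, 4
   or 8 bytes.  */

static int
sketch_valid_index_shift_p (int shift, int mode_size)
{
  return shift == 0
	 || (shift > 0 && shift <= 3 && (1 << shift) == mode_size);
}

/* So a 4-byte access allows "[x0, x1]" or "[x0, x1, lsl #2]" (and the
   corresponding sxtw/uxtw forms), but not "lsl #1" or "lsl #3".  */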
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
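/* Illustrative sketch (not part of the original source): the three offset
   ranges above, parameterised by the access size in bytes.  For an 8-byte
   access the 7-bit signed scaled form is what limits ldp/stp offsets to
   the -512..504 range mentioned in the prologue and epilogue code.  */

static int
sketch_offset_7bit_signed_scaled_p (long long size, long long offset)
{
  return offset >= -64 * size && offset < 64 * size && offset % size == 0;
}

static int
sketch_offset_9bit_signed_unscaled_p (long long offset)
{
  return offset >= -256 && offset < 256;
}

static int
sketch_offset_12bit_unsigned_scaled_p (long long size, long long offset)
{
  return offset >= 0 && offset < 4096 * size && offset % size == 0;
}

/* E.g. with size == 8, an offset of 504 passes the 7-bit scaled check but
   512 does not, and 32760 is the largest offset the 12-bit unsigned
   scaled form accepts.  */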
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit a ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* load literal: pc-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
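/* Illustrative sketch (not part of the original source): why CC_SWPmode
   exists.  Canonicalization puts the shifted or extended operand first,
   so the compare instruction is emitted with its operands the other way
   round and the condition tested afterwards must be swapped, exactly as
   the CC_SWPmode entries in aarch64_get_condition_code_1 do.  */

enum sketch_cmp { SK_EQ, SK_NE, SK_LT, SK_LE, SK_GT, SK_GE };

static enum sketch_cmp
sketch_swap_comparison (enum sketch_cmp code)
{
  switch (code)
    {
    case SK_LT: return SK_GT;	/* x < y   becomes   y > x.  */
    case SK_LE: return SK_GE;	/* x <= y  becomes   y >= x.  */
    case SK_GT: return SK_LT;
    case SK_GE: return SK_LE;
    default:    return code;	/* EQ and NE are symmetric.  */
    }
}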
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3762 int
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;  /* Clear the least significant set bit. */
3960 return count;
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first code is for AND op and the other
3970 is for IOR op. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand, if it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
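/* Print address expression X, taken from a MEM whose machine mode was
   recorded in aarch64_memory_reference_mode, as an AArch64 addressing
   mode on stream F. */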
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
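/* Return true if X or any of its sub-rtxes contains a LABEL_REF.
   LABEL_REFs wrapped in UNSPEC_TLS are constant offsets rather than
   symbols and do not count. */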
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return true;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return true;
4565 return false;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
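/* For example, an aligned SImode access at X + 0x3450 is rewritten as
   (X + 0x3000) + 0x450, so neighbouring accesses with different offsets
   can CSE the X + 0x3000 base. */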
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256...255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
rtx
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with 12 bit offset field. */
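/* For example, an SImode access at reg + 0x12344 is rewritten as
   (reg + 0x12000) + 0x344: the high part fits an ADD immediate and the
   low part stays within the 12-bit offset field of the load/store. */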
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* Punt on BLKmode (zero-size) accesses: we cannot ascertain BLKmode
4691 alignment, so leave any non-zero offset to the generic reload code. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
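/* Return the class of register required as an intermediate when moving X
   of mode MODE into a register of class RCLASS. NO_REGS means either that
   no intermediate is needed or that SRI->icode has been set to a reload
   pattern that performs the move. */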
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via FP_REGS,
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
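/* Return true if register FROM may be eliminated in favour of register
   TO, given the current frame-pointer requirements. */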
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
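/* Return the offset to add when replacing eliminable register FROM with
   register TO, based on the frame laid out by aarch64_layout_frame. */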
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
rtx
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
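/* Output the code template for a trampoline: load IP1 and the static
   chain register from the literal words that follow the code, then
   branch through IP1. */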
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
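/* Initialise the trampoline at M_TRAMP: copy the code template, store
   the address of FNDECL and CHAIN_VALUE in the trailing pointer slots,
   then call __clear_cache on the range. */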
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words; we fill those in below. */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
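/* Return the number of consecutive hard registers of class REGCLASS
   needed to hold a value of mode MODE: one register per 16 bytes for
   vector modes, one per 8 bytes otherwise. */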
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
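/* Return the register class that X should preferably be reloaded into,
   given an initial choice of REGCLASS, or NO_REGS if REGCLASS cannot
   accept X at all. */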
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations, which
4942 use SP as the source and an FP_REG as the destination, so reject
4943 them outright here. */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
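/* Output the dispatch sequence for a casesi jump table: load the entry
   selected by operand 1 from the table at operand 0, add the scaled,
   sign-extended entry to the table address in operand 4, then branch
   through operand 3. */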
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below. */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
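/* For example, aarch64_uxt_size (1, 0x1fe) is 8, since the mask is 0xff
   shifted left by one, i.e. a zero-extended byte. */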
int
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Return true iff CODE is a shift supported in combination
5162 with arithmetic instructions. */
5164 static bool
5165 aarch64_shift_p (enum rtx_code code)
5167 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5170 /* Helper function for rtx cost calculation. Calculate the cost of
5171 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5172 Return the calculated cost of the expression, recursing manually in to
5173 operands where needed. */
5175 static int
5176 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5178 rtx op0, op1;
5179 const struct cpu_cost_table *extra_cost
5180 = aarch64_tune_params->insn_extra_cost;
5181 int cost = 0;
5182 bool compound_p = (outer == PLUS || outer == MINUS);
5183 machine_mode mode = GET_MODE (x);
5185 gcc_checking_assert (code == MULT);
5187 op0 = XEXP (x, 0);
5188 op1 = XEXP (x, 1);
5190 if (VECTOR_MODE_P (mode))
5191 mode = GET_MODE_INNER (mode);
5193 /* Integer multiply/fma. */
5194 if (GET_MODE_CLASS (mode) == MODE_INT)
5196 /* The multiply will be canonicalized as a shift, cost it as such. */
5197 if (aarch64_shift_p (GET_CODE (x))
5198 || (CONST_INT_P (op1)
5199 && exact_log2 (INTVAL (op1)) > 0))
5201 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5202 || GET_CODE (op0) == SIGN_EXTEND;
5203 if (speed)
5205 if (compound_p)
5207 if (REG_P (op1))
5208 /* ARITH + shift-by-register. */
5209 cost += extra_cost->alu.arith_shift_reg;
5210 else if (is_extend)
5211 /* ARITH + extended register. We don't have a cost field
5212 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5213 cost += extra_cost->alu.extend_arith;
5214 else
5215 /* ARITH + shift-by-immediate. */
5216 cost += extra_cost->alu.arith_shift;
5218 else
5219 /* LSL (immediate). */
5220 cost += extra_cost->alu.shift;
5223 /* Strip extends as we will have costed them in the case above. */
5224 if (is_extend)
5225 op0 = aarch64_strip_extend (op0);
5227 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5229 return cost;
5232 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5233 compound and let the below cases handle it. After all, MNEG is a
5234 special-case alias of MSUB. */
5235 if (GET_CODE (op0) == NEG)
5237 op0 = XEXP (op0, 0);
5238 compound_p = true;
5241 /* Integer multiplies or FMAs have zero/sign extending variants. */
5242 if ((GET_CODE (op0) == ZERO_EXTEND
5243 && GET_CODE (op1) == ZERO_EXTEND)
5244 || (GET_CODE (op0) == SIGN_EXTEND
5245 && GET_CODE (op1) == SIGN_EXTEND))
5247 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5248 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5250 if (speed)
5252 if (compound_p)
5253 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5254 cost += extra_cost->mult[0].extend_add;
5255 else
5256 /* MUL/SMULL/UMULL. */
5257 cost += extra_cost->mult[0].extend;
5260 return cost;
5263 /* This is either an integer multiply or a MADD. In both cases
5264 we want to recurse and cost the operands. */
5265 cost += rtx_cost (op0, MULT, 0, speed)
5266 + rtx_cost (op1, MULT, 1, speed);
5268 if (speed)
5270 if (compound_p)
5271 /* MADD/MSUB. */
5272 cost += extra_cost->mult[mode == DImode].add;
5273 else
5274 /* MUL. */
5275 cost += extra_cost->mult[mode == DImode].simple;
5278 return cost;
5280 else
5282 if (speed)
5284 /* Floating-point FMA/FMUL can also support negations of the
5285 operands. */
5286 if (GET_CODE (op0) == NEG)
5287 op0 = XEXP (op0, 0);
5288 if (GET_CODE (op1) == NEG)
5289 op1 = XEXP (op1, 0);
5291 if (compound_p)
5292 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5293 cost += extra_cost->fp[mode == DFmode].fma;
5294 else
5295 /* FMUL/FNMUL. */
5296 cost += extra_cost->fp[mode == DFmode].mult;
5299 cost += rtx_cost (op0, MULT, 0, speed)
5300 + rtx_cost (op1, MULT, 1, speed);
5301 return cost;
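/* Compute the cost of using X as an address for an access of mode MODE,
   based on the tuning-specific address cost tables. */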
5305 static int
5306 aarch64_address_cost (rtx x,
5307 machine_mode mode,
5308 addr_space_t as ATTRIBUTE_UNUSED,
5309 bool speed)
5311 enum rtx_code c = GET_CODE (x);
5312 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5313 struct aarch64_address_info info;
5314 int cost = 0;
5315 info.shift = 0;
5317 if (!aarch64_classify_address (&info, x, mode, c, false))
5319 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5321 /* This is a CONST or SYMBOL ref which will be split
5322 in a different way depending on the code model in use.
5323 Cost it through the generic infrastructure. */
5324 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5325 /* Divide through by the cost of one instruction to
5326 bring it to the same units as the address costs. */
5327 cost_symbol_ref /= COSTS_N_INSNS (1);
5328 /* The cost is then the cost of preparing the address,
5329 followed by an immediate (possibly 0) offset. */
5330 return cost_symbol_ref + addr_cost->imm_offset;
5332 else
5334 /* This is most likely a jump table from a case
5335 statement. */
5336 return addr_cost->register_offset;
5340 switch (info.type)
5342 case ADDRESS_LO_SUM:
5343 case ADDRESS_SYMBOLIC:
5344 case ADDRESS_REG_IMM:
5345 cost += addr_cost->imm_offset;
5346 break;
5348 case ADDRESS_REG_WB:
5349 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5350 cost += addr_cost->pre_modify;
5351 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5352 cost += addr_cost->post_modify;
5353 else
5354 gcc_unreachable ();
5356 break;
5358 case ADDRESS_REG_REG:
5359 cost += addr_cost->register_offset;
5360 break;
5362 case ADDRESS_REG_UXTW:
5363 case ADDRESS_REG_SXTW:
5364 cost += addr_cost->register_extend;
5365 break;
5367 default:
5368 gcc_unreachable ();
5372 if (info.shift > 0)
5374 /* For the sake of calculating the cost of the shifted register
5375 component, we can treat same sized modes in the same way. */
5376 switch (GET_MODE_BITSIZE (mode))
5378 case 16:
5379 cost += addr_cost->addr_scale_costs.hi;
5380 break;
5382 case 32:
5383 cost += addr_cost->addr_scale_costs.si;
5384 break;
5386 case 64:
5387 cost += addr_cost->addr_scale_costs.di;
5388 break;
5390 /* We can't tell, or this is a 128-bit vector. */
5391 default:
5392 cost += addr_cost->addr_scale_costs.ti;
5393 break;
5397 return cost;
5400 /* Return true if the RTX X in mode MODE is a zero or sign extract
5401 usable in an ADD or SUB (extended register) instruction. */
5402 static bool
5403 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5405 /* Catch add with a sign extract.
5406 This is add_<optab><mode>_multp2. */
5407 if (GET_CODE (x) == SIGN_EXTRACT
5408 || GET_CODE (x) == ZERO_EXTRACT)
5410 rtx op0 = XEXP (x, 0);
5411 rtx op1 = XEXP (x, 1);
5412 rtx op2 = XEXP (x, 2);
5414 if (GET_CODE (op0) == MULT
5415 && CONST_INT_P (op1)
5416 && op2 == const0_rtx
5417 && CONST_INT_P (XEXP (op0, 1))
5418 && aarch64_is_extend_from_extract (mode,
5419 XEXP (op0, 1),
5420 op1))
5422 return true;
5426 return false;
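/* Return true if U is the UNSPEC code for one of the FRINT* rounding
   instructions. */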
5429 static bool
5430 aarch64_frint_unspec_p (unsigned int u)
5432 switch (u)
5434 case UNSPEC_FRINTZ:
5435 case UNSPEC_FRINTP:
5436 case UNSPEC_FRINTM:
5437 case UNSPEC_FRINTA:
5438 case UNSPEC_FRINTN:
5439 case UNSPEC_FRINTX:
5440 case UNSPEC_FRINTI:
5441 return true;
5443 default:
5444 return false;
5448 /* Return true iff X is an rtx that will match an extr instruction,
5449 i.e. one described by the *extr<mode>5_insn family of patterns.
5450 OP0 and OP1 will be set to the operands of the shifts involved
5451 on success and will be NULL_RTX otherwise. */
5453 static bool
5454 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5456 rtx op0, op1;
5457 machine_mode mode = GET_MODE (x);
5459 *res_op0 = NULL_RTX;
5460 *res_op1 = NULL_RTX;
5462 if (GET_CODE (x) != IOR)
5463 return false;
5465 op0 = XEXP (x, 0);
5466 op1 = XEXP (x, 1);
5468 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5469 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5471 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5472 if (GET_CODE (op1) == ASHIFT)
5473 std::swap (op0, op1);
5475 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5476 return false;
5478 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5479 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5481 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5482 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5484 *res_op0 = XEXP (op0, 0);
5485 *res_op1 = XEXP (op1, 0);
5486 return true;
5490 return false;
5493 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5494 storing it in *COST. Result is true if the total cost of the operation
5495 has now been calculated. */
5496 static bool
5497 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5499 rtx inner;
5500 rtx comparator;
5501 enum rtx_code cmpcode;
5503 if (COMPARISON_P (op0))
5505 inner = XEXP (op0, 0);
5506 comparator = XEXP (op0, 1);
5507 cmpcode = GET_CODE (op0);
5509 else
5511 inner = op0;
5512 comparator = const0_rtx;
5513 cmpcode = NE;
5516 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5518 /* Conditional branch. */
5519 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5520 return true;
5521 else
5523 if (cmpcode == NE || cmpcode == EQ)
5525 if (comparator == const0_rtx)
5527 /* TBZ/TBNZ/CBZ/CBNZ. */
5528 if (GET_CODE (inner) == ZERO_EXTRACT)
5529 /* TBZ/TBNZ. */
5530 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5531 0, speed);
5532 else
5533 /* CBZ/CBNZ. */
5534 *cost += rtx_cost (inner, cmpcode, 0, speed);
5536 return true;
5539 else if (cmpcode == LT || cmpcode == GE)
5541 /* TBZ/TBNZ. */
5542 if (comparator == const0_rtx)
5543 return true;
5547 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5549 /* It's a conditional operation based on the status flags,
5550 so it must be some flavor of CSEL. */
5552 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5553 if (GET_CODE (op1) == NEG
5554 || GET_CODE (op1) == NOT
5555 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5556 op1 = XEXP (op1, 0);
5558 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5559 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5560 return true;
5563 /* We don't know what this is, cost all operands. */
5564 return false;
5567 /* Calculate the cost of calculating X, storing it in *COST. Result
5568 is true if the total cost of the operation has now been calculated. */
5569 static bool
5570 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5571 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5573 rtx op0, op1, op2;
5574 const struct cpu_cost_table *extra_cost
5575 = aarch64_tune_params->insn_extra_cost;
5576 machine_mode mode = GET_MODE (x);
5578 /* By default, assume that everything has equivalent cost to the
5579 cheapest instruction. Any additional costs are applied as a delta
5580 above this default. */
5581 *cost = COSTS_N_INSNS (1);
5583 /* TODO: The cost infrastructure currently does not handle
5584 vector operations. Assume that all vector operations
5585 are equally expensive. */
5586 if (VECTOR_MODE_P (mode))
5588 if (speed)
5589 *cost += extra_cost->vect.alu;
5590 return true;
5593 switch (code)
5595 case SET:
5596 /* The cost depends entirely on the operands to SET. */
5597 *cost = 0;
5598 op0 = SET_DEST (x);
5599 op1 = SET_SRC (x);
5601 switch (GET_CODE (op0))
5603 case MEM:
5604 if (speed)
5606 rtx address = XEXP (op0, 0);
5607 if (GET_MODE_CLASS (mode) == MODE_INT)
5608 *cost += extra_cost->ldst.store;
5609 else if (mode == SFmode)
5610 *cost += extra_cost->ldst.storef;
5611 else if (mode == DFmode)
5612 *cost += extra_cost->ldst.stored;
5614 *cost +=
5615 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5616 0, speed));
5619 *cost += rtx_cost (op1, SET, 1, speed);
5620 return true;
5622 case SUBREG:
5623 if (! REG_P (SUBREG_REG (op0)))
5624 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5626 /* Fall through. */
5627 case REG:
5628 /* const0_rtx is in general free, but we will use an
5629 instruction to set a register to 0. */
5630 if (REG_P (op1) || op1 == const0_rtx)
5632 /* The cost is 1 per register copied. */
5633 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5634 / UNITS_PER_WORD;
5635 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5637 else
5638 /* Cost is just the cost of the RHS of the set. */
5639 *cost += rtx_cost (op1, SET, 1, speed);
5640 return true;
5642 case ZERO_EXTRACT:
5643 case SIGN_EXTRACT:
5644 /* Bit-field insertion. Strip any redundant widening of
5645 the RHS to meet the width of the target. */
5646 if (GET_CODE (op1) == SUBREG)
5647 op1 = SUBREG_REG (op1);
5648 if ((GET_CODE (op1) == ZERO_EXTEND
5649 || GET_CODE (op1) == SIGN_EXTEND)
5650 && CONST_INT_P (XEXP (op0, 1))
5651 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5652 >= INTVAL (XEXP (op0, 1))))
5653 op1 = XEXP (op1, 0);
5655 if (CONST_INT_P (op1))
5657 /* MOV immediate is assumed to always be cheap. */
5658 *cost = COSTS_N_INSNS (1);
5660 else
5662 /* BFM. */
5663 if (speed)
5664 *cost += extra_cost->alu.bfi;
5665 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5668 return true;
5670 default:
5671 /* We can't make sense of this, assume default cost. */
5672 *cost = COSTS_N_INSNS (1);
5673 return false;
5675 return false;
5677 case CONST_INT:
5678 /* If an instruction can incorporate a constant within the
5679 instruction, the instruction's expression avoids calling
5680 rtx_cost() on the constant. If rtx_cost() is called on a
5681 constant, then it is usually because the constant must be
5682 moved into a register by one or more instructions.
5684 The exception is constant 0, which can be expressed
5685 as XZR/WZR and is therefore free. The one case where that does
5686 not hold is (set (reg) (const0_rtx)), where the move itself must
5687 be costed. However, we can catch that when we cost the SET, so
5688 we don't need to consider that here. */
5689 if (x == const0_rtx)
5690 *cost = 0;
5691 else
5693 /* To an approximation, the cost of building any other constant is
5694 proportional to the number of instructions
5695 required to build it. This is true whether we
5696 are compiling for SPEED or otherwise. */
5697 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5698 (NULL_RTX, x, false, mode));
5700 return true;
5702 case CONST_DOUBLE:
5703 if (speed)
5705 /* mov[df,sf]_aarch64. */
5706 if (aarch64_float_const_representable_p (x))
5707 /* FMOV (scalar immediate). */
5708 *cost += extra_cost->fp[mode == DFmode].fpconst;
5709 else if (!aarch64_float_const_zero_rtx_p (x))
5711 /* This will be a load from memory. */
5712 if (mode == DFmode)
5713 *cost += extra_cost->ldst.loadd;
5714 else
5715 *cost += extra_cost->ldst.loadf;
5717 else
5718 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5719 or MOV v0.s[0], wzr - neither of which is modeled by the
5720 cost tables. Just use the default cost. */
5725 return true;
5727 case MEM:
5728 if (speed)
5730 /* For loads we want the base cost of a load, plus an
5731 approximation for the additional cost of the addressing
5732 mode. */
5733 rtx address = XEXP (x, 0);
5734 if (GET_MODE_CLASS (mode) == MODE_INT)
5735 *cost += extra_cost->ldst.load;
5736 else if (mode == SFmode)
5737 *cost += extra_cost->ldst.loadf;
5738 else if (mode == DFmode)
5739 *cost += extra_cost->ldst.loadd;
5741 *cost +=
5742 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5743 0, speed));
5746 return true;
5748 case NEG:
5749 op0 = XEXP (x, 0);
5751 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5753 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5754 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5756 /* CSETM. */
5757 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5758 return true;
5761 /* Cost this as SUB wzr, X. */
5762 op0 = CONST0_RTX (GET_MODE (x));
5763 op1 = XEXP (x, 0);
5764 goto cost_minus;
5767 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5769 /* Support (neg(fma...)) as a single instruction only if
5770 sign of zeros is unimportant. This matches the decision
5771 making in aarch64.md. */
5772 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5774 /* FNMADD. */
5775 *cost = rtx_cost (op0, NEG, 0, speed);
5776 return true;
5778 if (speed)
5779 /* FNEG. */
5780 *cost += extra_cost->fp[mode == DFmode].neg;
5781 return false;
5784 return false;
5786 case CLRSB:
5787 case CLZ:
5788 if (speed)
5789 *cost += extra_cost->alu.clz;
5791 return false;
5793 case COMPARE:
5794 op0 = XEXP (x, 0);
5795 op1 = XEXP (x, 1);
5797 if (op1 == const0_rtx
5798 && GET_CODE (op0) == AND)
5800 x = op0;
5801 goto cost_logic;
5804 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5806 /* TODO: A write to the CC flags possibly costs extra; this
5807 needs encoding in the cost tables. */
5809 /* CC_ZESWPmode supports zero extend for free. */
5810 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5811 op0 = XEXP (op0, 0);
5813 /* ANDS. */
5814 if (GET_CODE (op0) == AND)
5816 x = op0;
5817 goto cost_logic;
5820 if (GET_CODE (op0) == PLUS)
5822 /* ADDS (and CMN alias). */
5823 x = op0;
5824 goto cost_plus;
5827 if (GET_CODE (op0) == MINUS)
5829 /* SUBS. */
5830 x = op0;
5831 goto cost_minus;
5834 if (GET_CODE (op1) == NEG)
5836 /* CMN. */
5837 if (speed)
5838 *cost += extra_cost->alu.arith;
5840 *cost += rtx_cost (op0, COMPARE, 0, speed);
5841 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5842 return true;
5845 /* CMP.
5847 Compare can freely swap the order of operands, and
5848 canonicalization puts the more complex operation first.
5849 But the integer MINUS logic expects the shift/extend
5850 operation in op1. */
5851 if (! (REG_P (op0)
5852 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5854 op0 = XEXP (x, 1);
5855 op1 = XEXP (x, 0);
5857 goto cost_minus;
5860 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5862 /* FCMP. */
5863 if (speed)
5864 *cost += extra_cost->fp[mode == DFmode].compare;
5866 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5868 *cost += rtx_cost (op0, COMPARE, 0, speed);
5869 /* FCMP supports constant 0.0 for no extra cost. */
5870 return true;
5872 return false;
5875 return false;
5877 case MINUS:
5879 op0 = XEXP (x, 0);
5880 op1 = XEXP (x, 1);
5882 cost_minus:
5883 /* Detect valid immediates. */
5884 if ((GET_MODE_CLASS (mode) == MODE_INT
5885 || (GET_MODE_CLASS (mode) == MODE_CC
5886 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5887 && CONST_INT_P (op1)
5888 && aarch64_uimm12_shift (INTVAL (op1)))
5890 *cost += rtx_cost (op0, MINUS, 0, speed);
5892 if (speed)
5893 /* SUB(S) (immediate). */
5894 *cost += extra_cost->alu.arith;
5895 return true;
5899 /* Look for SUB (extended register). */
5900 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5902 if (speed)
5903 *cost += extra_cost->alu.extend_arith;
5905 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5906 (enum rtx_code) GET_CODE (op1),
5907 0, speed);
5908 return true;
5911 rtx new_op1 = aarch64_strip_extend (op1);
5913 /* Cost this as an FMA-alike operation. */
5914 if ((GET_CODE (new_op1) == MULT
5915 || aarch64_shift_p (GET_CODE (new_op1)))
5916 && code != COMPARE)
5918 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5919 (enum rtx_code) code,
5920 speed);
5921 *cost += rtx_cost (op0, MINUS, 0, speed);
5922 return true;
5925 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5927 if (speed)
5929 if (GET_MODE_CLASS (mode) == MODE_INT)
5930 /* SUB(S). */
5931 *cost += extra_cost->alu.arith;
5932 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5933 /* FSUB. */
5934 *cost += extra_cost->fp[mode == DFmode].addsub;
5936 return true;
5939 case PLUS:
5941 rtx new_op0;
5943 op0 = XEXP (x, 0);
5944 op1 = XEXP (x, 1);
5946 cost_plus:
5947 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5948 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5950 /* CSINC. */
5951 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5952 *cost += rtx_cost (op1, PLUS, 1, speed);
5953 return true;
5956 if (GET_MODE_CLASS (mode) == MODE_INT
5957 && CONST_INT_P (op1)
5958 && aarch64_uimm12_shift (INTVAL (op1)))
5960 *cost += rtx_cost (op0, PLUS, 0, speed);
5962 if (speed)
5963 /* ADD (immediate). */
5964 *cost += extra_cost->alu.arith;
5965 return true;
5968 /* Look for ADD (extended register). */
5969 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5971 if (speed)
5972 *cost += extra_cost->alu.extend_arith;
5974 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5975 (enum rtx_code) GET_CODE (op0),
5976 0, speed);
5977 return true;
5980 /* Strip any extend, leave shifts behind as we will
5981 cost them through mult_cost. */
5982 new_op0 = aarch64_strip_extend (op0);
5984 if (GET_CODE (new_op0) == MULT
5985 || aarch64_shift_p (GET_CODE (new_op0)))
5987 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5988 speed);
5989 *cost += rtx_cost (op1, PLUS, 1, speed);
5990 return true;
5993 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5994 + rtx_cost (op1, PLUS, 1, speed));
5996 if (speed)
5998 if (GET_MODE_CLASS (mode) == MODE_INT)
5999 /* ADD. */
6000 *cost += extra_cost->alu.arith;
6001 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6002 /* FADD. */
6003 *cost += extra_cost->fp[mode == DFmode].addsub;
6005 return true;
6008 case BSWAP:
6009 *cost = COSTS_N_INSNS (1);
6011 if (speed)
6012 *cost += extra_cost->alu.rev;
6014 return false;
6016 case IOR:
6017 if (aarch_rev16_p (x))
6019 *cost = COSTS_N_INSNS (1);
6021 if (speed)
6022 *cost += extra_cost->alu.rev;
6024 return true;
6027 if (aarch64_extr_rtx_p (x, &op0, &op1))
6029 *cost += rtx_cost (op0, IOR, 0, speed)
6030 + rtx_cost (op1, IOR, 1, speed);
6031 if (speed)
6032 *cost += extra_cost->alu.shift;
6034 return true;
6036 /* Fall through. */
6037 case XOR:
6038 case AND:
6039 cost_logic:
6040 op0 = XEXP (x, 0);
6041 op1 = XEXP (x, 1);
6043 if (code == AND
6044 && GET_CODE (op0) == MULT
6045 && CONST_INT_P (XEXP (op0, 1))
6046 && CONST_INT_P (op1)
6047 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6048 INTVAL (op1)) != 0)
6050 /* This is a UBFM/SBFM. */
6051 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6052 if (speed)
6053 *cost += extra_cost->alu.bfx;
6054 return true;
6057 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6059 /* We possibly get the immediate for free, this is not
6060 modelled. */
6061 if (CONST_INT_P (op1)
6062 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6064 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6066 if (speed)
6067 *cost += extra_cost->alu.logical;
6069 return true;
6071 else
6073 rtx new_op0 = op0;
6075 /* Handle ORN, EON, or BIC. */
6076 if (GET_CODE (op0) == NOT)
6077 op0 = XEXP (op0, 0);
6079 new_op0 = aarch64_strip_shift (op0);
6081 /* If we had a shift on op0 then this is a logical-shift-
6082 by-register/immediate operation. Otherwise, this is just
6083 a logical operation. */
6084 if (speed)
6086 if (new_op0 != op0)
6088 /* Shift by immediate. */
6089 if (CONST_INT_P (XEXP (op0, 1)))
6090 *cost += extra_cost->alu.log_shift;
6091 else
6092 *cost += extra_cost->alu.log_shift_reg;
6094 else
6095 *cost += extra_cost->alu.logical;
6098 /* In both cases we want to cost both operands. */
6099 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6100 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6102 return true;
6105 return false;
6107 case NOT:
6108 x = XEXP (x, 0);
6109 op0 = aarch64_strip_shift (x);
6111 /* MVN-shifted-reg. */
6112 if (op0 != x)
6114 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6116 if (speed)
6117 *cost += extra_cost->alu.log_shift;
6119 return true;
6121 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6122 Handle the second form here taking care that 'a' in the above can
6123 be a shift. */
6124 else if (GET_CODE (op0) == XOR)
6126 rtx newop0 = XEXP (op0, 0);
6127 rtx newop1 = XEXP (op0, 1);
6128 rtx op0_stripped = aarch64_strip_shift (newop0);
6130 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6131 + rtx_cost (op0_stripped, XOR, 0, speed);
6133 if (speed)
6135 if (op0_stripped != newop0)
6136 *cost += extra_cost->alu.log_shift;
6137 else
6138 *cost += extra_cost->alu.logical;
6141 return true;
6143 /* MVN. */
6144 if (speed)
6145 *cost += extra_cost->alu.logical;
6147 return false;
6149 case ZERO_EXTEND:
6151 op0 = XEXP (x, 0);
6152 /* If a value is written in SI mode, then zero extended to DI
6153 mode, the operation will in general be free as a write to
6154 a 'w' register implicitly zeroes the upper bits of an 'x'
6155 register. However, if this is
6157 (set (reg) (zero_extend (reg)))
6159 we must cost the explicit register move. */
6160 if (mode == DImode
6161 && GET_MODE (op0) == SImode
6162 && outer == SET)
6164 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6166 if (!op_cost && speed)
6167 /* MOV. */
6168 *cost += extra_cost->alu.extend;
6169 else
6170 /* Free, the cost is that of the SI mode operation. */
6171 *cost = op_cost;
6173 return true;
6175 else if (MEM_P (XEXP (x, 0)))
6177 /* All loads can zero extend to any size for free. */
6178 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6179 return true;
6182 /* UXTB/UXTH. */
6183 if (speed)
6184 *cost += extra_cost->alu.extend;
6186 return false;
6188 case SIGN_EXTEND:
6189 if (MEM_P (XEXP (x, 0)))
6191 /* LDRSH. */
6192 if (speed)
6194 rtx address = XEXP (XEXP (x, 0), 0);
6195 *cost += extra_cost->ldst.load_sign_extend;
6197 *cost +=
6198 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6199 0, speed));
6201 return true;
6204 if (speed)
6205 *cost += extra_cost->alu.extend;
6206 return false;
6208 case ASHIFT:
6209 op0 = XEXP (x, 0);
6210 op1 = XEXP (x, 1);
6212 if (CONST_INT_P (op1))
6214 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6215 aliases. */
6216 if (speed)
6217 *cost += extra_cost->alu.shift;
6219 /* We can incorporate zero/sign extend for free. */
6220 if (GET_CODE (op0) == ZERO_EXTEND
6221 || GET_CODE (op0) == SIGN_EXTEND)
6222 op0 = XEXP (op0, 0);
6224 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6225 return true;
6227 else
6229 /* LSLV. */
6230 if (speed)
6231 *cost += extra_cost->alu.shift_reg;
6233 return false; /* All arguments need to be in registers. */
6236 case ROTATE:
6237 case ROTATERT:
6238 case LSHIFTRT:
6239 case ASHIFTRT:
6240 op0 = XEXP (x, 0);
6241 op1 = XEXP (x, 1);
6243 if (CONST_INT_P (op1))
6245 /* ASR (immediate) and friends. */
6246 if (speed)
6247 *cost += extra_cost->alu.shift;
6249 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6250 return true;
6252 else
6255 /* ASR (register) and friends. */
6256 if (speed)
6257 *cost += extra_cost->alu.shift_reg;
6259 return false; /* All arguments need to be in registers. */
6262 case SYMBOL_REF:
6264 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6266 /* LDR. */
6267 if (speed)
6268 *cost += extra_cost->ldst.load;
6270 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6271 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6273 /* ADRP, followed by ADD. */
6274 *cost += COSTS_N_INSNS (1);
6275 if (speed)
6276 *cost += 2 * extra_cost->alu.arith;
6278 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6279 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6281 /* ADR. */
6282 if (speed)
6283 *cost += extra_cost->alu.arith;
6286 if (flag_pic)
6288 /* One extra load instruction, after accessing the GOT. */
6289 *cost += COSTS_N_INSNS (1);
6290 if (speed)
6291 *cost += extra_cost->ldst.load;
6293 return true;
6295 case HIGH:
6296 case LO_SUM:
6297 /* ADRP/ADD (immediate). */
6298 if (speed)
6299 *cost += extra_cost->alu.arith;
6300 return true;
6302 case ZERO_EXTRACT:
6303 case SIGN_EXTRACT:
6304 /* UBFX/SBFX. */
6305 if (speed)
6306 *cost += extra_cost->alu.bfx;
6308 /* We can trust that the immediates used will be correct (there
6309 are no by-register forms), so we need only cost op0. */
6310 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6311 return true;
6313 case MULT:
6314 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6315 /* aarch64_rtx_mult_cost always handles recursion to its
6316 operands. */
6317 return true;
6319 case MOD:
6320 case UMOD:
6321 if (speed)
6323 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6324 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6325 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6326 else if (GET_MODE (x) == DFmode)
6327 *cost += (extra_cost->fp[1].mult
6328 + extra_cost->fp[1].div);
6329 else if (GET_MODE (x) == SFmode)
6330 *cost += (extra_cost->fp[0].mult
6331 + extra_cost->fp[0].div);
6333 return false; /* All arguments need to be in registers. */
6335 case DIV:
6336 case UDIV:
6337 case SQRT:
6338 if (speed)
6340 if (GET_MODE_CLASS (mode) == MODE_INT)
6341 /* There is no integer SQRT, so only DIV and UDIV can get
6342 here. */
6343 *cost += extra_cost->mult[mode == DImode].idiv;
6344 else
6345 *cost += extra_cost->fp[mode == DFmode].div;
6347 return false; /* All arguments need to be in registers. */
6349 case IF_THEN_ELSE:
6350 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6351 XEXP (x, 2), cost, speed);
6353 case EQ:
6354 case NE:
6355 case GT:
6356 case GTU:
6357 case LT:
6358 case LTU:
6359 case GE:
6360 case GEU:
6361 case LE:
6362 case LEU:
6364 return false; /* All arguments must be in registers. */
6366 case FMA:
6367 op0 = XEXP (x, 0);
6368 op1 = XEXP (x, 1);
6369 op2 = XEXP (x, 2);
6371 if (speed)
6372 *cost += extra_cost->fp[mode == DFmode].fma;
6374 /* FMSUB, FNMADD, and FNMSUB are free. */
6375 if (GET_CODE (op0) == NEG)
6376 op0 = XEXP (op0, 0);
6378 if (GET_CODE (op2) == NEG)
6379 op2 = XEXP (op2, 0);
6381 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6382 and the by-element operand as operand 0. */
6383 if (GET_CODE (op1) == NEG)
6384 op1 = XEXP (op1, 0);
6386 /* Catch vector-by-element operations. The by-element operand can
6387 either be (vec_duplicate (vec_select (x))) or just
6388 (vec_select (x)), depending on whether we are multiplying by
6389 a vector or a scalar.
6391 Canonicalization is not very good in these cases, FMA4 will put the
6392 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6393 if (GET_CODE (op0) == VEC_DUPLICATE)
6394 op0 = XEXP (op0, 0);
6395 else if (GET_CODE (op1) == VEC_DUPLICATE)
6396 op1 = XEXP (op1, 0);
6398 if (GET_CODE (op0) == VEC_SELECT)
6399 op0 = XEXP (op0, 0);
6400 else if (GET_CODE (op1) == VEC_SELECT)
6401 op1 = XEXP (op1, 0);
6403 /* If the remaining parameters are not registers,
6404 get the cost to put them into registers. */
6405 *cost += rtx_cost (op0, FMA, 0, speed);
6406 *cost += rtx_cost (op1, FMA, 1, speed);
6407 *cost += rtx_cost (op2, FMA, 2, speed);
6408 return true;
6410 case FLOAT_EXTEND:
6411 if (speed)
6412 *cost += extra_cost->fp[mode == DFmode].widen;
6413 return false;
6415 case FLOAT_TRUNCATE:
6416 if (speed)
6417 *cost += extra_cost->fp[mode == DFmode].narrow;
6418 return false;
6420 case FIX:
6421 case UNSIGNED_FIX:
6422 x = XEXP (x, 0);
6423 /* Strip the rounding part. They will all be implemented
6424 by the fcvt* family of instructions anyway. */
6425 if (GET_CODE (x) == UNSPEC)
6427 unsigned int uns_code = XINT (x, 1);
6429 if (uns_code == UNSPEC_FRINTA
6430 || uns_code == UNSPEC_FRINTM
6431 || uns_code == UNSPEC_FRINTN
6432 || uns_code == UNSPEC_FRINTP
6433 || uns_code == UNSPEC_FRINTZ)
6434 x = XVECEXP (x, 0, 0);
6437 if (speed)
6438 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6440 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6441 return true;
6443 case ABS:
6444 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6446 op0 = XEXP (x, 0);
6448 /* FABD, which is analogous to FADD. */
6449 if (GET_CODE (op0) == MINUS)
6451 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6452 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6453 if (speed)
6454 *cost += extra_cost->fp[mode == DFmode].addsub;
6456 return true;
6458 /* Simple FABS is analogous to FNEG. */
6459 if (speed)
6460 *cost += extra_cost->fp[mode == DFmode].neg;
6462 else
6464 /* Integer ABS will either be split into
6465 two arithmetic instructions, or will be an ABS
6466 (scalar), which we don't model. */
6467 *cost = COSTS_N_INSNS (2);
6468 if (speed)
6469 *cost += 2 * extra_cost->alu.arith;
6471 return false;
6473 case SMAX:
6474 case SMIN:
6475 if (speed)
6477 /* FMAXNM/FMINNM/FMAX/FMIN.
6478 TODO: This may not be accurate for all implementations, but
6479 we do not model this in the cost tables. */
6480 *cost += extra_cost->fp[mode == DFmode].addsub;
6482 return false;
6484 case UNSPEC:
6485 /* The floating point round to integer frint* instructions. */
6486 if (aarch64_frint_unspec_p (XINT (x, 1)))
6488 if (speed)
6489 *cost += extra_cost->fp[mode == DFmode].roundint;
6491 return false;
6494 if (XINT (x, 1) == UNSPEC_RBIT)
6496 if (speed)
6497 *cost += extra_cost->alu.rev;
6499 return false;
6501 break;
6503 case TRUNCATE:
6505 /* Decompose <su>muldi3_highpart. */
6506 if (/* (truncate:DI */
6507 mode == DImode
6508 /* (lshiftrt:TI */
6509 && GET_MODE (XEXP (x, 0)) == TImode
6510 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6511 /* (mult:TI */
6512 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6513 /* (ANY_EXTEND:TI (reg:DI))
6514 (ANY_EXTEND:TI (reg:DI))) */
6515 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6516 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6517 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6518 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6519 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6520 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6521 /* (const_int 64) */
6522 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6523 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6525 /* UMULH/SMULH. */
6526 if (speed)
6527 *cost += extra_cost->mult[mode == DImode].extend;
6528 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6529 MULT, 0, speed);
6530 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6531 MULT, 1, speed);
6532 return true;
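/* Illustrative shape of the RTL matched above for an unsigned highpart
multiply:
  (truncate:DI
    (lshiftrt:TI
      (mult:TI (zero_extend:TI (reg:DI x)) (zero_extend:TI (reg:DI y)))
      (const_int 64)))
which is costed as a single UMULH of the two DImode source registers. */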
6535 /* Fall through. */
6536 default:
6537 break;
6540 if (dump_file && (dump_flags & TDF_DETAILS))
6541 fprintf (dump_file,
6542 "\nFailed to cost RTX. Assuming default cost.\n");
6544 return true;
6547 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
6548 calculated for X. This cost is stored in *COST. Returns true
6549 if the total cost of X was calculated. */
6550 static bool
6551 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6552 int param, int *cost, bool speed)
6554 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6556 if (dump_file && (dump_flags & TDF_DETAILS))
6558 print_rtl_single (dump_file, x);
6559 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6560 speed ? "Hot" : "Cold",
6561 *cost, result ? "final" : "partial");
6564 return result;
6567 static int
6568 aarch64_register_move_cost (machine_mode mode,
6569 reg_class_t from_i, reg_class_t to_i)
6571 enum reg_class from = (enum reg_class) from_i;
6572 enum reg_class to = (enum reg_class) to_i;
6573 const struct cpu_regmove_cost *regmove_cost
6574 = aarch64_tune_params->regmove_cost;
6576 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6577 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6578 to = GENERAL_REGS;
6580 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6581 from = GENERAL_REGS;
6583 /* Moving between a GPR and the stack costs the same as GP2GP. */
6584 if ((from == GENERAL_REGS && to == STACK_REG)
6585 || (to == GENERAL_REGS && from == STACK_REG))
6586 return regmove_cost->GP2GP;
6588 /* To/From the stack register, we move via the gprs. */
6589 if (to == STACK_REG || from == STACK_REG)
6590 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6591 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6593 if (GET_MODE_SIZE (mode) == 16)
6595 /* 128-bit operations on general registers require 2 instructions. */
6596 if (from == GENERAL_REGS && to == GENERAL_REGS)
6597 return regmove_cost->GP2GP * 2;
6598 else if (from == GENERAL_REGS)
6599 return regmove_cost->GP2FP * 2;
6600 else if (to == GENERAL_REGS)
6601 return regmove_cost->FP2GP * 2;
6603 /* When AdvSIMD instructions are disabled it is not possible to move
6604 a 128-bit value directly between Q registers. This is handled in
6605 secondary reload. A general register is used as a scratch to move
6606 the upper DI value and the lower DI value is moved directly,
6607 hence the cost is the sum of three moves. */
6608 if (! TARGET_SIMD)
6609 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6611 return regmove_cost->FP2FP;
6614 if (from == GENERAL_REGS && to == GENERAL_REGS)
6615 return regmove_cost->GP2GP;
6616 else if (from == GENERAL_REGS)
6617 return regmove_cost->GP2FP;
6618 else if (to == GENERAL_REGS)
6619 return regmove_cost->FP2GP;
6621 return regmove_cost->FP2FP;
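/* For example, moving a 16-byte (TImode or 128-bit vector) value between
two general registers is costed as 2 * GP2GP and moving it from a general
register to an FP/SIMD register as 2 * GP2FP, since two 64-bit transfers
are needed; smaller modes use the plain GP2GP/GP2FP/FP2GP/FP2FP costs. */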
6624 static int
6625 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6626 reg_class_t rclass ATTRIBUTE_UNUSED,
6627 bool in ATTRIBUTE_UNUSED)
6629 return aarch64_tune_params->memmov_cost;
6632 /* Return the number of instructions that can be issued per cycle. */
6633 static int
6634 aarch64_sched_issue_rate (void)
6636 return aarch64_tune_params->issue_rate;
6639 static int
6640 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6642 int issue_rate = aarch64_sched_issue_rate ();
6644 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6647 /* Vectorizer cost model target hooks. */
6649 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6650 static int
6651 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6652 tree vectype,
6653 int misalign ATTRIBUTE_UNUSED)
6655 unsigned elements;
6657 switch (type_of_cost)
6659 case scalar_stmt:
6660 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6662 case scalar_load:
6663 return aarch64_tune_params->vec_costs->scalar_load_cost;
6665 case scalar_store:
6666 return aarch64_tune_params->vec_costs->scalar_store_cost;
6668 case vector_stmt:
6669 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6671 case vector_load:
6672 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6674 case vector_store:
6675 return aarch64_tune_params->vec_costs->vec_store_cost;
6677 case vec_to_scalar:
6678 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6680 case scalar_to_vec:
6681 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6683 case unaligned_load:
6684 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6686 case unaligned_store:
6687 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6689 case cond_branch_taken:
6690 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6692 case cond_branch_not_taken:
6693 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6695 case vec_perm:
6696 case vec_promote_demote:
6697 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6699 case vec_construct:
6700 elements = TYPE_VECTOR_SUBPARTS (vectype);
6701 return elements / 2 + 1;
6703 default:
6704 gcc_unreachable ();
6708 /* Implement targetm.vectorize.add_stmt_cost. */
6709 static unsigned
6710 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6711 struct _stmt_vec_info *stmt_info, int misalign,
6712 enum vect_cost_model_location where)
6714 unsigned *cost = (unsigned *) data;
6715 unsigned retval = 0;
6717 if (flag_vect_cost_model)
6719 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6720 int stmt_cost =
6721 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6723 /* Statements in an inner loop relative to the loop being
6724 vectorized are weighted more heavily. The value here is
6725 a function (linear for now) of the loop nest level. */
6726 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6728 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6729 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6730 unsigned nest_level = loop_depth (loop);
6732 count *= nest_level;
6735 retval = (unsigned) (count * stmt_cost);
6736 cost[where] += retval;
6739 return retval;
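/* For example, COUNT occurrences of a vector_stmt with per-statement cost C
contribute COUNT * C to the vect_body bucket of the cost array; if the
statement sits in a loop nested inside the loop being vectorized, that
contribution is additionally scaled by the loop depth computed above. */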
6742 static void initialize_aarch64_code_model (void);
6744 /* Parse the architecture extension string. */
6746 static void
6747 aarch64_parse_extension (char *str)
6749 /* The extension string is parsed left to right. */
6750 const struct aarch64_option_extension *opt = NULL;
6752 /* Flag to say whether we are adding or removing an extension. */
6753 int adding_ext = -1;
6755 while (str != NULL && *str != 0)
6757 char *ext;
6758 size_t len;
6760 str++;
6761 ext = strchr (str, '+');
6763 if (ext != NULL)
6764 len = ext - str;
6765 else
6766 len = strlen (str);
6768 if (len >= 2 && strncmp (str, "no", 2) == 0)
6770 adding_ext = 0;
6771 len -= 2;
6772 str += 2;
6774 else if (len > 0)
6775 adding_ext = 1;
6777 if (len == 0)
6779 error ("missing feature modifier after %qs", adding_ext ? "+"
6780 : "+no");
6781 return;
6784 /* Scan over the extensions table trying to find an exact match. */
6785 for (opt = all_extensions; opt->name != NULL; opt++)
6787 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6789 /* Add or remove the extension. */
6790 if (adding_ext)
6791 aarch64_isa_flags |= opt->flags_on;
6792 else
6793 aarch64_isa_flags &= ~(opt->flags_off);
6794 break;
6798 if (opt->name == NULL)
6800 /* Extension not found in list. */
6801 error ("unknown feature modifier %qs", str);
6802 return;
6805 str = ext;
6808 return;
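/* As an illustration, for -march=armv8-a+crc+nocrypto the string handed to
this function is "+crc+nocrypto": the first iteration sets the CRC feature
flags and the second clears the crypto ones. */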
6811 /* Parse the ARCH string. */
6813 static void
6814 aarch64_parse_arch (void)
6816 char *ext;
6817 const struct processor *arch;
6818 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6819 size_t len;
6821 strcpy (str, aarch64_arch_string);
6823 ext = strchr (str, '+');
6825 if (ext != NULL)
6826 len = ext - str;
6827 else
6828 len = strlen (str);
6830 if (len == 0)
6832 error ("missing arch name in -march=%qs", str);
6833 return;
6836 /* Loop through the list of supported ARCHs to find a match. */
6837 for (arch = all_architectures; arch->name != NULL; arch++)
6839 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6841 selected_arch = arch;
6842 aarch64_isa_flags = selected_arch->flags;
6844 if (!selected_cpu)
6845 selected_cpu = &all_cores[selected_arch->core];
6847 if (ext != NULL)
6849 /* ARCH string contains at least one extension. */
6850 aarch64_parse_extension (ext);
6853 if (strcmp (selected_arch->arch, selected_cpu->arch))
6855 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6856 selected_cpu->name, selected_arch->name);
6859 return;
6863 /* ARCH name not found in list. */
6864 error ("unknown value %qs for -march", str);
6865 return;
6868 /* Parse the CPU string. */
6870 static void
6871 aarch64_parse_cpu (void)
6873 char *ext;
6874 const struct processor *cpu;
6875 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6876 size_t len;
6878 strcpy (str, aarch64_cpu_string);
6880 ext = strchr (str, '+');
6882 if (ext != NULL)
6883 len = ext - str;
6884 else
6885 len = strlen (str);
6887 if (len == 0)
6889 error ("missing cpu name in -mcpu=%qs", str);
6890 return;
6893 /* Loop through the list of supported CPUs to find a match. */
6894 for (cpu = all_cores; cpu->name != NULL; cpu++)
6896 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6898 selected_cpu = cpu;
6899 aarch64_isa_flags = selected_cpu->flags;
6901 if (ext != NULL)
6903 /* CPU string contains at least one extension. */
6904 aarch64_parse_extension (ext);
6907 return;
6911 /* CPU name not found in list. */
6912 error ("unknown value %qs for -mcpu", str);
6913 return;
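/* For example, -mcpu=cortex-a57+crypto selects the cortex-a57 entry from
all_cores, takes its ISA flags, and then lets aarch64_parse_extension
enable the crypto extension on top of them. */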
6916 /* Parse the TUNE string. */
6918 static void
6919 aarch64_parse_tune (void)
6921 const struct processor *cpu;
6922 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6923 strcpy (str, aarch64_tune_string);
6925 /* Loop through the list of supported CPUs to find a match. */
6926 for (cpu = all_cores; cpu->name != NULL; cpu++)
6928 if (strcmp (cpu->name, str) == 0)
6930 selected_tune = cpu;
6931 return;
6935 /* CPU name not found in list. */
6936 error ("unknown value %qs for -mtune", str);
6937 return;
6941 /* Implement TARGET_OPTION_OVERRIDE. */
6943 static void
6944 aarch64_override_options (void)
6946 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6947 If either of -march or -mtune is given, they override their
6948 respective component of -mcpu.
6950 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6951 with -march: if -mcpu is not present on the command line, -march
6952 must set a sensible default CPU. */
6953 if (aarch64_cpu_string)
6955 aarch64_parse_cpu ();
6958 if (aarch64_arch_string)
6960 aarch64_parse_arch ();
6963 if (aarch64_tune_string)
6965 aarch64_parse_tune ();
6968 #ifndef HAVE_AS_MABI_OPTION
6969 /* The compiler may have been configured with 2.23.* binutils, which does
6970 not have support for ILP32. */
6971 if (TARGET_ILP32)
6972 error ("Assembler does not support -mabi=ilp32");
6973 #endif
6975 initialize_aarch64_code_model ();
6977 aarch64_build_bitmask_table ();
6979 /* This target defaults to strict volatile bitfields. */
6980 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6981 flag_strict_volatile_bitfields = 1;
6983 /* If the user did not specify a processor, choose the default
6984 one for them. This will be the CPU set during configuration using
6985 --with-cpu, otherwise it is "generic". */
6986 if (!selected_cpu)
6988 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6989 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6992 gcc_assert (selected_cpu);
6994 if (!selected_tune)
6995 selected_tune = selected_cpu;
6997 aarch64_tune_flags = selected_tune->flags;
6998 aarch64_tune = selected_tune->core;
6999 aarch64_tune_params = selected_tune->tune;
7000 aarch64_architecture_version = selected_cpu->architecture_version;
7002 if (aarch64_fix_a53_err835769 == 2)
7004 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7005 aarch64_fix_a53_err835769 = 1;
7006 #else
7007 aarch64_fix_a53_err835769 = 0;
7008 #endif
7011 /* If not optimizing for size, set the default
7012 alignment to what the target wants. */
7013 if (!optimize_size)
7015 if (align_loops <= 0)
7016 align_loops = aarch64_tune_params->loop_align;
7017 if (align_jumps <= 0)
7018 align_jumps = aarch64_tune_params->jump_align;
7019 if (align_functions <= 0)
7020 align_functions = aarch64_tune_params->function_align;
7023 if (AARCH64_TUNE_FMA_STEERING)
7024 aarch64_register_fma_steering ();
7026 aarch64_override_options_after_change ();
7029 /* Implement targetm.override_options_after_change. */
7031 static void
7032 aarch64_override_options_after_change (void)
7034 if (flag_omit_frame_pointer)
7035 flag_omit_leaf_frame_pointer = false;
7036 else if (flag_omit_leaf_frame_pointer)
7037 flag_omit_frame_pointer = true;
7040 static struct machine_function *
7041 aarch64_init_machine_status (void)
7043 struct machine_function *machine;
7044 machine = ggc_cleared_alloc<machine_function> ();
7045 return machine;
7048 void
7049 aarch64_init_expanders (void)
7051 init_machine_status = aarch64_init_machine_status;
7054 /* A checking mechanism for the implementation of the various code models. */
7055 static void
7056 initialize_aarch64_code_model (void)
7058 if (flag_pic)
7060 switch (aarch64_cmodel_var)
7062 case AARCH64_CMODEL_TINY:
7063 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7064 break;
7065 case AARCH64_CMODEL_SMALL:
7066 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7067 break;
7068 case AARCH64_CMODEL_LARGE:
7069 sorry ("code model %qs with -f%s", "large",
7070 flag_pic > 1 ? "PIC" : "pic");
7071 default:
7072 gcc_unreachable ();
7075 else
7076 aarch64_cmodel = aarch64_cmodel_var;
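/* For example, -mcmodel=small together with -fpic selects
AARCH64_CMODEL_SMALL_PIC, whereas -mcmodel=large with -fpic is rejected
above with a sorry () diagnostic. */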
7079 /* Return true if SYMBOL_REF X binds locally. */
7081 static bool
7082 aarch64_symbol_binds_local_p (const_rtx x)
7084 return (SYMBOL_REF_DECL (x)
7085 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7086 : SYMBOL_REF_LOCAL_P (x));
7089 /* Return true if SYMBOL_REF X is thread local */
7090 static bool
7091 aarch64_tls_symbol_p (rtx x)
7093 if (! TARGET_HAVE_TLS)
7094 return false;
7096 if (GET_CODE (x) != SYMBOL_REF)
7097 return false;
7099 return SYMBOL_REF_TLS_MODEL (x) != 0;
7102 /* Classify a TLS symbol into one of the TLS kinds. */
7103 enum aarch64_symbol_type
7104 aarch64_classify_tls_symbol (rtx x)
7106 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7108 switch (tls_kind)
7110 case TLS_MODEL_GLOBAL_DYNAMIC:
7111 case TLS_MODEL_LOCAL_DYNAMIC:
7112 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7114 case TLS_MODEL_INITIAL_EXEC:
7115 return SYMBOL_SMALL_GOTTPREL;
7117 case TLS_MODEL_LOCAL_EXEC:
7118 return SYMBOL_SMALL_TPREL;
7120 case TLS_MODEL_EMULATED:
7121 case TLS_MODEL_NONE:
7122 return SYMBOL_FORCE_TO_MEM;
7124 default:
7125 gcc_unreachable ();
7129 /* Return the method that should be used to access SYMBOL_REF or
7130 LABEL_REF X in context CONTEXT. */
7132 enum aarch64_symbol_type
7133 aarch64_classify_symbol (rtx x, rtx offset,
7134 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7136 if (GET_CODE (x) == LABEL_REF)
7138 switch (aarch64_cmodel)
7140 case AARCH64_CMODEL_LARGE:
7141 return SYMBOL_FORCE_TO_MEM;
7143 case AARCH64_CMODEL_TINY_PIC:
7144 case AARCH64_CMODEL_TINY:
7145 return SYMBOL_TINY_ABSOLUTE;
7147 case AARCH64_CMODEL_SMALL_PIC:
7148 case AARCH64_CMODEL_SMALL:
7149 return SYMBOL_SMALL_ABSOLUTE;
7151 default:
7152 gcc_unreachable ();
7156 if (GET_CODE (x) == SYMBOL_REF)
7158 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7159 return SYMBOL_FORCE_TO_MEM;
7161 if (aarch64_tls_symbol_p (x))
7162 return aarch64_classify_tls_symbol (x);
7164 switch (aarch64_cmodel)
7166 case AARCH64_CMODEL_TINY:
7167 /* When we retrieve a symbol + offset address, we have to make sure
7168 the offset does not cause overflow of the final address. But
7169 we have no way of knowing the address of the symbol at compile time,
7170 so we can't accurately say whether the distance between the PC and
7171 symbol + offset is outside the addressable range of +/-1M in the
7172 TINY code model. So we rely on images not being greater than
7173 1M, cap the offset at 1M, and require anything beyond 1M to
7174 be loaded using an alternative mechanism. */
7175 if (SYMBOL_REF_WEAK (x)
7176 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7177 return SYMBOL_FORCE_TO_MEM;
7178 return SYMBOL_TINY_ABSOLUTE;
7180 case AARCH64_CMODEL_SMALL:
7181 /* Same reasoning as the tiny code model, but the offset cap here is
7182 4G. */
7183 if (SYMBOL_REF_WEAK (x)
7184 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7185 HOST_WIDE_INT_C (4294967264)))
7186 return SYMBOL_FORCE_TO_MEM;
7187 return SYMBOL_SMALL_ABSOLUTE;
7189 case AARCH64_CMODEL_TINY_PIC:
7190 if (!aarch64_symbol_binds_local_p (x))
7191 return SYMBOL_TINY_GOT;
7192 return SYMBOL_TINY_ABSOLUTE;
7194 case AARCH64_CMODEL_SMALL_PIC:
7195 if (!aarch64_symbol_binds_local_p (x))
7196 return SYMBOL_SMALL_GOT;
7197 return SYMBOL_SMALL_ABSOLUTE;
7199 default:
7200 gcc_unreachable ();
7204 /* By default push everything into the constant pool. */
7205 return SYMBOL_FORCE_TO_MEM;
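/* For example, under the small code model a weak symbol, or a symbol whose
constant offset falls outside the roughly +/-4GB range checked above, is
classified as SYMBOL_FORCE_TO_MEM and is therefore loaded from the constant
pool rather than addressed directly. */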
7208 bool
7209 aarch64_constant_address_p (rtx x)
7211 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7214 bool
7215 aarch64_legitimate_pic_operand_p (rtx x)
7217 if (GET_CODE (x) == SYMBOL_REF
7218 || (GET_CODE (x) == CONST
7219 && GET_CODE (XEXP (x, 0)) == PLUS
7220 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7221 return false;
7223 return true;
7226 /* Return true if X holds either a quarter-precision constant or
7227 a floating-point +0.0 constant. */
7228 static bool
7229 aarch64_valid_floating_const (machine_mode mode, rtx x)
7231 if (!CONST_DOUBLE_P (x))
7232 return false;
7234 /* TODO: We could handle moving 0.0 to a TFmode register,
7235 but first we would like to refactor movtf_aarch64
7236 to be more amenable to splitting moves properly and
7237 to gating correctly on TARGET_SIMD. For now, reject all
7238 constants that are not destined for SFmode or DFmode registers. */
7239 if (!(mode == SFmode || mode == DFmode))
7240 return false;
7242 if (aarch64_float_const_zero_rtx_p (x))
7243 return true;
7244 return aarch64_float_const_representable_p (x);
7247 static bool
7248 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7250 /* Do not allow vector struct mode constants. We could support
7251 0 and -1 easily, but they need support in aarch64-simd.md. */
7252 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7253 return false;
7255 /* This could probably go away because
7256 we now decompose CONST_INTs according to expand_mov_immediate. */
7257 if ((GET_CODE (x) == CONST_VECTOR
7258 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7259 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7260 return !targetm.cannot_force_const_mem (mode, x);
7262 if (GET_CODE (x) == HIGH
7263 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7264 return true;
7266 return aarch64_constant_address_p (x);
7270 aarch64_load_tp (rtx target)
7272 if (!target
7273 || GET_MODE (target) != Pmode
7274 || !register_operand (target, Pmode))
7275 target = gen_reg_rtx (Pmode);
7277 /* Can return in any reg. */
7278 emit_insn (gen_aarch64_load_tp_hard (target));
7279 return target;
7282 /* On AAPCS systems, this is the "struct __va_list". */
7283 static GTY(()) tree va_list_type;
7285 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7286 Return the type to use as __builtin_va_list.
7288 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7290 struct __va_list
7292 void *__stack;
7293 void *__gr_top;
7294 void *__vr_top;
7295 int __gr_offs;
7296 int __vr_offs;
7297 }; */
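/* In this scheme __gr_offs and __vr_offs start out negative (minus the size
of the corresponding register save area, see aarch64_expand_builtin_va_start
below) and are incremented towards zero as register-passed arguments are
consumed; once an offset is non-negative, further arguments of that class
are fetched through __stack instead. */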
7299 static tree
7300 aarch64_build_builtin_va_list (void)
7302 tree va_list_name;
7303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7305 /* Create the type. */
7306 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7307 /* Give it the required name. */
7308 va_list_name = build_decl (BUILTINS_LOCATION,
7309 TYPE_DECL,
7310 get_identifier ("__va_list"),
7311 va_list_type);
7312 DECL_ARTIFICIAL (va_list_name) = 1;
7313 TYPE_NAME (va_list_type) = va_list_name;
7314 TYPE_STUB_DECL (va_list_type) = va_list_name;
7316 /* Create the fields. */
7317 f_stack = build_decl (BUILTINS_LOCATION,
7318 FIELD_DECL, get_identifier ("__stack"),
7319 ptr_type_node);
7320 f_grtop = build_decl (BUILTINS_LOCATION,
7321 FIELD_DECL, get_identifier ("__gr_top"),
7322 ptr_type_node);
7323 f_vrtop = build_decl (BUILTINS_LOCATION,
7324 FIELD_DECL, get_identifier ("__vr_top"),
7325 ptr_type_node);
7326 f_groff = build_decl (BUILTINS_LOCATION,
7327 FIELD_DECL, get_identifier ("__gr_offs"),
7328 integer_type_node);
7329 f_vroff = build_decl (BUILTINS_LOCATION,
7330 FIELD_DECL, get_identifier ("__vr_offs"),
7331 integer_type_node);
7333 DECL_ARTIFICIAL (f_stack) = 1;
7334 DECL_ARTIFICIAL (f_grtop) = 1;
7335 DECL_ARTIFICIAL (f_vrtop) = 1;
7336 DECL_ARTIFICIAL (f_groff) = 1;
7337 DECL_ARTIFICIAL (f_vroff) = 1;
7339 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7340 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7341 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7342 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7343 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7345 TYPE_FIELDS (va_list_type) = f_stack;
7346 DECL_CHAIN (f_stack) = f_grtop;
7347 DECL_CHAIN (f_grtop) = f_vrtop;
7348 DECL_CHAIN (f_vrtop) = f_groff;
7349 DECL_CHAIN (f_groff) = f_vroff;
7351 /* Compute its layout. */
7352 layout_type (va_list_type);
7354 return va_list_type;
7357 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7358 static void
7359 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7361 const CUMULATIVE_ARGS *cum;
7362 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7363 tree stack, grtop, vrtop, groff, vroff;
7364 tree t;
7365 int gr_save_area_size;
7366 int vr_save_area_size;
7367 int vr_offset;
7369 cum = &crtl->args.info;
7370 gr_save_area_size
7371 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7372 vr_save_area_size
7373 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7375 if (TARGET_GENERAL_REGS_ONLY)
7377 if (cum->aapcs_nvrn > 0)
7378 sorry ("%qs and floating point or vector arguments",
7379 "-mgeneral-regs-only");
7380 vr_save_area_size = 0;
7383 f_stack = TYPE_FIELDS (va_list_type_node);
7384 f_grtop = DECL_CHAIN (f_stack);
7385 f_vrtop = DECL_CHAIN (f_grtop);
7386 f_groff = DECL_CHAIN (f_vrtop);
7387 f_vroff = DECL_CHAIN (f_groff);
7389 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7390 NULL_TREE);
7391 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7392 NULL_TREE);
7393 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7394 NULL_TREE);
7395 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7396 NULL_TREE);
7397 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7398 NULL_TREE);
7400 /* Emit code to initialize STACK, which points to the next varargs stack
7401 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7402 by named arguments. STACK is 8-byte aligned. */
7403 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7404 if (cum->aapcs_stack_size > 0)
7405 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7406 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7407 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7409 /* Emit code to initialize GRTOP, the top of the GR save area.
7410 virtual_incoming_args_rtx should have been 16-byte aligned. */
7411 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7412 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7413 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7415 /* Emit code to initialize VRTOP, the top of the VR save area.
7416 This address is gr_save_area_bytes below GRTOP, rounded
7417 down to the next 16-byte boundary. */
7418 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7419 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7420 STACK_BOUNDARY / BITS_PER_UNIT);
7422 if (vr_offset)
7423 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7424 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7425 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7427 /* Emit code to initialize GROFF, the offset from GRTOP of the
7428 next GPR argument. */
7429 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7430 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7431 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7433 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7434 of the next VR argument. */
7435 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7436 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7437 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
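/* As a worked example: for a variadic function whose only named argument is
a single integer, cum->aapcs_ncrn is 1, so the code above initializes
__gr_offs to -(NUM_ARG_REGS - 1) * UNITS_PER_WORD = -56 and __gr_top to the
address just past the save area holding x1-x7, while __vr_offs is set to
minus the size of the saved V register area (or 0 with
-mgeneral-regs-only). */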
7440 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7442 static tree
7443 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7444 gimple_seq *post_p ATTRIBUTE_UNUSED)
7446 tree addr;
7447 bool indirect_p;
7448 bool is_ha; /* is HFA or HVA. */
7449 bool dw_align; /* double-word align. */
7450 machine_mode ag_mode = VOIDmode;
7451 int nregs;
7452 machine_mode mode;
7454 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7455 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7456 HOST_WIDE_INT size, rsize, adjust, align;
7457 tree t, u, cond1, cond2;
7459 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7460 if (indirect_p)
7461 type = build_pointer_type (type);
7463 mode = TYPE_MODE (type);
7465 f_stack = TYPE_FIELDS (va_list_type_node);
7466 f_grtop = DECL_CHAIN (f_stack);
7467 f_vrtop = DECL_CHAIN (f_grtop);
7468 f_groff = DECL_CHAIN (f_vrtop);
7469 f_vroff = DECL_CHAIN (f_groff);
7471 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7472 f_stack, NULL_TREE);
7473 size = int_size_in_bytes (type);
7474 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7476 dw_align = false;
7477 adjust = 0;
7478 if (aarch64_vfp_is_call_or_return_candidate (mode,
7479 type,
7480 &ag_mode,
7481 &nregs,
7482 &is_ha))
7484 /* TYPE passed in fp/simd registers. */
7485 if (TARGET_GENERAL_REGS_ONLY)
7486 sorry ("%qs and floating point or vector arguments",
7487 "-mgeneral-regs-only");
7489 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7490 unshare_expr (valist), f_vrtop, NULL_TREE);
7491 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7492 unshare_expr (valist), f_vroff, NULL_TREE);
7494 rsize = nregs * UNITS_PER_VREG;
7496 if (is_ha)
7498 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7499 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7501 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7502 && size < UNITS_PER_VREG)
7504 adjust = UNITS_PER_VREG - size;
7507 else
7509 /* TYPE passed in general registers. */
7510 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7511 unshare_expr (valist), f_grtop, NULL_TREE);
7512 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7513 unshare_expr (valist), f_groff, NULL_TREE);
7514 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7515 nregs = rsize / UNITS_PER_WORD;
7517 if (align > 8)
7518 dw_align = true;
7520 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7521 && size < UNITS_PER_WORD)
7523 adjust = UNITS_PER_WORD - size;
7527 /* Get a local temporary for the field value. */
7528 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7530 /* Emit code to branch if off >= 0. */
7531 t = build2 (GE_EXPR, boolean_type_node, off,
7532 build_int_cst (TREE_TYPE (off), 0));
7533 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7535 if (dw_align)
7537 /* Emit: offs = (offs + 15) & -16. */
7538 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7539 build_int_cst (TREE_TYPE (off), 15));
7540 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7541 build_int_cst (TREE_TYPE (off), -16));
7542 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7544 else
7545 roundup = NULL;
7547 /* Update ap.__[g|v]r_offs */
7548 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7549 build_int_cst (TREE_TYPE (off), rsize));
7550 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7552 /* String up. */
7553 if (roundup)
7554 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7556 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7557 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7558 build_int_cst (TREE_TYPE (f_off), 0));
7559 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7561 /* String up: make sure the assignment happens before the use. */
7562 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7563 COND_EXPR_ELSE (cond1) = t;
7565 /* Prepare the trees handling the argument that is passed on the stack;
7566 the top-level node will be stored in ON_STACK. */
7567 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7568 if (align > 8)
7570 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7571 t = fold_convert (intDI_type_node, arg);
7572 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7573 build_int_cst (TREE_TYPE (t), 15));
7574 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7575 build_int_cst (TREE_TYPE (t), -16));
7576 t = fold_convert (TREE_TYPE (arg), t);
7577 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7579 else
7580 roundup = NULL;
7581 /* Advance ap.__stack */
7582 t = fold_convert (intDI_type_node, arg);
7583 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7584 build_int_cst (TREE_TYPE (t), size + 7));
7585 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7586 build_int_cst (TREE_TYPE (t), -8));
7587 t = fold_convert (TREE_TYPE (arg), t);
7588 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7589 /* String up roundup and advance. */
7590 if (roundup)
7591 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7592 /* String up with arg */
7593 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7594 /* Big-endianness related address adjustment. */
7595 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7596 && size < UNITS_PER_WORD)
7598 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7599 size_int (UNITS_PER_WORD - size));
7600 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7603 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7604 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7606 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7607 t = off;
7608 if (adjust)
7609 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7610 build_int_cst (TREE_TYPE (off), adjust));
7612 t = fold_convert (sizetype, t);
7613 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7615 if (is_ha)
7617 /* type ha; // treat as "struct {ftype field[n];}"
7618 ... [computing offs]
7619 for (i = 0; i < nregs; ++i, offs += 16)
7620 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7621 return ha; */
7622 int i;
7623 tree tmp_ha, field_t, field_ptr_t;
7625 /* Declare a local variable. */
7626 tmp_ha = create_tmp_var_raw (type, "ha");
7627 gimple_add_tmp_var (tmp_ha);
7629 /* Establish the base type. */
7630 switch (ag_mode)
7632 case SFmode:
7633 field_t = float_type_node;
7634 field_ptr_t = float_ptr_type_node;
7635 break;
7636 case DFmode:
7637 field_t = double_type_node;
7638 field_ptr_t = double_ptr_type_node;
7639 break;
7640 case TFmode:
7641 field_t = long_double_type_node;
7642 field_ptr_t = long_double_ptr_type_node;
7643 break;
7644 /* Half precision and quad precision are not fully supported yet. Enable
7645 the following code once that support is complete; we still need to find
7646 the correct type node for __fp16 *. */
7647 #if 0
7648 case HFmode:
7649 field_t = float_type_node;
7650 field_ptr_t = float_ptr_type_node;
7651 break;
7652 #endif
7653 case V2SImode:
7654 case V4SImode:
7656 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7657 field_t = build_vector_type_for_mode (innertype, ag_mode);
7658 field_ptr_t = build_pointer_type (field_t);
7660 break;
7661 default:
7662 gcc_assert (0);
7665 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7666 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7667 addr = t;
7668 t = fold_convert (field_ptr_t, addr);
7669 t = build2 (MODIFY_EXPR, field_t,
7670 build1 (INDIRECT_REF, field_t, tmp_ha),
7671 build1 (INDIRECT_REF, field_t, t));
7673 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7674 for (i = 1; i < nregs; ++i)
7676 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7677 u = fold_convert (field_ptr_t, addr);
7678 u = build2 (MODIFY_EXPR, field_t,
7679 build2 (MEM_REF, field_t, tmp_ha,
7680 build_int_cst (field_ptr_t,
7681 (i *
7682 int_size_in_bytes (field_t)))),
7683 build1 (INDIRECT_REF, field_t, u));
7684 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7687 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7688 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7691 COND_EXPR_ELSE (cond2) = t;
7692 addr = fold_convert (build_pointer_type (type), cond1);
7693 addr = build_va_arg_indirect_ref (addr);
7695 if (indirect_p)
7696 addr = build_va_arg_indirect_ref (addr);
7698 return addr;
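/* For instance, a homogeneous floating-point aggregate such as
struct { float x, y, z; } has nregs == 3 and ag_mode == SFmode, so the
is_ha path above copies three single-precision values out of consecutive
16-byte slots of the VR save area into the local temporary "ha" before
returning its address. */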
7701 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7703 static void
7704 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7705 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7706 int no_rtl)
7708 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7709 CUMULATIVE_ARGS local_cum;
7710 int gr_saved, vr_saved;
7712 /* The caller has advanced CUM up to, but not beyond, the last named
7713 argument. Advance a local copy of CUM past the last "real" named
7714 argument, to find out how many registers are left over. */
7715 local_cum = *cum;
7716 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7718 /* Find out how many registers we need to save. */
7719 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7720 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7722 if (TARGET_GENERAL_REGS_ONLY)
7724 if (local_cum.aapcs_nvrn > 0)
7725 sorry ("%qs and floating point or vector arguments",
7726 "-mgeneral-regs-only");
7727 vr_saved = 0;
7730 if (!no_rtl)
7732 if (gr_saved > 0)
7734 rtx ptr, mem;
7736 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7737 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7738 - gr_saved * UNITS_PER_WORD);
7739 mem = gen_frame_mem (BLKmode, ptr);
7740 set_mem_alias_set (mem, get_varargs_alias_set ());
7742 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7743 mem, gr_saved);
7745 if (vr_saved > 0)
7747 /* We can't use move_block_from_reg, because it will use
7748 the wrong mode, storing D regs only. */
7749 machine_mode mode = TImode;
7750 int off, i;
7752 /* Set OFF to the offset from virtual_incoming_args_rtx of
7753 the first vector register. The VR save area lies below
7754 the GR one, and is aligned to 16 bytes. */
7755 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7756 STACK_BOUNDARY / BITS_PER_UNIT);
7757 off -= vr_saved * UNITS_PER_VREG;
7759 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7761 rtx ptr, mem;
7763 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7764 mem = gen_frame_mem (mode, ptr);
7765 set_mem_alias_set (mem, get_varargs_alias_set ());
7766 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7767 off += UNITS_PER_VREG;
7772 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7773 any complication of having crtl->args.pretend_args_size changed. */
7774 cfun->machine->frame.saved_varargs_size
7775 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7776 STACK_BOUNDARY / BITS_PER_UNIT)
7777 + vr_saved * UNITS_PER_VREG);
7780 static void
7781 aarch64_conditional_register_usage (void)
7783 int i;
7784 if (!TARGET_FLOAT)
7786 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7788 fixed_regs[i] = 1;
7789 call_used_regs[i] = 1;
7794 /* Walk down the type tree of TYPE counting consecutive base elements.
7795 If *MODEP is VOIDmode, then set it to the first valid floating point
7796 type. If a non-floating point type is found, or if a floating point
7797 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7798 otherwise return the count in the sub-tree. */
7799 static int
7800 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7802 machine_mode mode;
7803 HOST_WIDE_INT size;
7805 switch (TREE_CODE (type))
7807 case REAL_TYPE:
7808 mode = TYPE_MODE (type);
7809 if (mode != DFmode && mode != SFmode && mode != TFmode)
7810 return -1;
7812 if (*modep == VOIDmode)
7813 *modep = mode;
7815 if (*modep == mode)
7816 return 1;
7818 break;
7820 case COMPLEX_TYPE:
7821 mode = TYPE_MODE (TREE_TYPE (type));
7822 if (mode != DFmode && mode != SFmode && mode != TFmode)
7823 return -1;
7825 if (*modep == VOIDmode)
7826 *modep = mode;
7828 if (*modep == mode)
7829 return 2;
7831 break;
7833 case VECTOR_TYPE:
7834 /* Use V2SImode and V4SImode as representatives of all 64-bit
7835 and 128-bit vector types. */
7836 size = int_size_in_bytes (type);
7837 switch (size)
7839 case 8:
7840 mode = V2SImode;
7841 break;
7842 case 16:
7843 mode = V4SImode;
7844 break;
7845 default:
7846 return -1;
7849 if (*modep == VOIDmode)
7850 *modep = mode;
7852 /* Vector modes are considered to be opaque: two vectors are
7853 equivalent for the purposes of being homogeneous aggregates
7854 if they are the same size. */
7855 if (*modep == mode)
7856 return 1;
7858 break;
7860 case ARRAY_TYPE:
7862 int count;
7863 tree index = TYPE_DOMAIN (type);
7865 /* Can't handle incomplete types nor sizes that are not
7866 fixed. */
7867 if (!COMPLETE_TYPE_P (type)
7868 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7869 return -1;
7871 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7872 if (count == -1
7873 || !index
7874 || !TYPE_MAX_VALUE (index)
7875 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7876 || !TYPE_MIN_VALUE (index)
7877 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7878 || count < 0)
7879 return -1;
7881 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7882 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7884 /* There must be no padding. */
7885 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7886 return -1;
7888 return count;
7891 case RECORD_TYPE:
7893 int count = 0;
7894 int sub_count;
7895 tree field;
7897 /* Can't handle incomplete types nor sizes that are not
7898 fixed. */
7899 if (!COMPLETE_TYPE_P (type)
7900 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7901 return -1;
7903 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7905 if (TREE_CODE (field) != FIELD_DECL)
7906 continue;
7908 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7909 if (sub_count < 0)
7910 return -1;
7911 count += sub_count;
7914 /* There must be no padding. */
7915 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7916 return -1;
7918 return count;
7921 case UNION_TYPE:
7922 case QUAL_UNION_TYPE:
7924 /* These aren't very interesting except in a degenerate case. */
7925 int count = 0;
7926 int sub_count;
7927 tree field;
7929 /* Can't handle incomplete types nor sizes that are not
7930 fixed. */
7931 if (!COMPLETE_TYPE_P (type)
7932 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7933 return -1;
7935 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7937 if (TREE_CODE (field) != FIELD_DECL)
7938 continue;
7940 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7941 if (sub_count < 0)
7942 return -1;
7943 count = count > sub_count ? count : sub_count;
7946 /* There must be no padding. */
7947 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7948 return -1;
7950 return count;
7953 default:
7954 break;
7957 return -1;
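/* For example, struct { double d[3]; } yields a count of 3 with *MODEP set
to DFmode, whereas struct { float f; double d; } returns -1 because the
field modes do not agree. */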
7960 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7961 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7962 array types. The C99 floating-point complex types are also considered
7963 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7964 types, which are GCC extensions and out of the scope of AAPCS64, are
7965 treated as composite types here as well.
7967 Note that MODE itself is not sufficient in determining whether a type
7968 is such a composite type or not. This is because
7969 stor-layout.c:compute_record_mode may have already changed the MODE
7970 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7971 structure with only one field may have its MODE set to the mode of the
7972 field. Also an integer mode whose size matches the size of the
7973 RECORD_TYPE type may be used to substitute the original mode
7974 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7975 solely relied on. */
7977 static bool
7978 aarch64_composite_type_p (const_tree type,
7979 machine_mode mode)
7981 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7982 return true;
7984 if (mode == BLKmode
7985 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7986 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7987 return true;
7989 return false;
7992 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7993 type as described in AAPCS64 \S 4.1.2.
7995 See the comment above aarch64_composite_type_p for the notes on MODE. */
7997 static bool
7998 aarch64_short_vector_p (const_tree type,
7999 machine_mode mode)
8001 HOST_WIDE_INT size = -1;
8003 if (type && TREE_CODE (type) == VECTOR_TYPE)
8004 size = int_size_in_bytes (type);
8005 else if (!aarch64_composite_type_p (type, mode)
8006 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8007 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
8008 size = GET_MODE_SIZE (mode);
8010 return (size == 8 || size == 16) ? true : false;
8013 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8014 shall be passed or returned in simd/fp register(s) (providing these
8015 parameter passing registers are available).
8017 Upon successful return, *COUNT returns the number of needed registers,
8018 *BASE_MODE returns the mode of the individual register and, when IS_HA
8019 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8020 floating-point aggregate or a homogeneous short-vector aggregate. */
8022 static bool
8023 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8024 const_tree type,
8025 machine_mode *base_mode,
8026 int *count,
8027 bool *is_ha)
8029 machine_mode new_mode = VOIDmode;
8030 bool composite_p = aarch64_composite_type_p (type, mode);
8032 if (is_ha != NULL) *is_ha = false;
8034 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8035 || aarch64_short_vector_p (type, mode))
8037 *count = 1;
8038 new_mode = mode;
8040 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8042 if (is_ha != NULL) *is_ha = true;
8043 *count = 2;
8044 new_mode = GET_MODE_INNER (mode);
8046 else if (type && composite_p)
8048 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8050 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8052 if (is_ha != NULL) *is_ha = true;
8053 *count = ag_count;
8055 else
8056 return false;
8058 else
8059 return false;
8061 *base_mode = new_mode;
8062 return true;
8065 /* Implement TARGET_STRUCT_VALUE_RTX. */
8067 static rtx
8068 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8069 int incoming ATTRIBUTE_UNUSED)
8071 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8074 /* Implements target hook vector_mode_supported_p. */
8075 static bool
8076 aarch64_vector_mode_supported_p (machine_mode mode)
8078 if (TARGET_SIMD
8079 && (mode == V4SImode || mode == V8HImode
8080 || mode == V16QImode || mode == V2DImode
8081 || mode == V2SImode || mode == V4HImode
8082 || mode == V8QImode || mode == V2SFmode
8083 || mode == V4SFmode || mode == V2DFmode
8084 || mode == V1DFmode))
8085 return true;
8087 return false;
8090 /* Return appropriate SIMD container
8091 for MODE within a vector of WIDTH bits. */
8092 static machine_mode
8093 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8095 gcc_assert (width == 64 || width == 128);
8096 if (TARGET_SIMD)
8098 if (width == 128)
8099 switch (mode)
8101 case DFmode:
8102 return V2DFmode;
8103 case SFmode:
8104 return V4SFmode;
8105 case SImode:
8106 return V4SImode;
8107 case HImode:
8108 return V8HImode;
8109 case QImode:
8110 return V16QImode;
8111 case DImode:
8112 return V2DImode;
8113 default:
8114 break;
8116 else
8117 switch (mode)
8119 case SFmode:
8120 return V2SFmode;
8121 case SImode:
8122 return V2SImode;
8123 case HImode:
8124 return V4HImode;
8125 case QImode:
8126 return V8QImode;
8127 default:
8128 break;
8131 return word_mode;
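/* For example, SImode maps to V4SImode for a 128-bit container and to
V2SImode for a 64-bit one, while DFmode only has the 128-bit V2DFmode
container and otherwise falls back to word_mode. */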
8134 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8135 static machine_mode
8136 aarch64_preferred_simd_mode (machine_mode mode)
8138 return aarch64_simd_container_mode (mode, 128);
8141 /* Return the bitmask of possible vector sizes for the vectorizer
8142 to iterate over. */
8143 static unsigned int
8144 aarch64_autovectorize_vector_sizes (void)
8146 return (16 | 8);
8149 /* Implement TARGET_MANGLE_TYPE. */
8151 static const char *
8152 aarch64_mangle_type (const_tree type)
8154 /* The AArch64 ABI documents say that "__va_list" has to be
8155 mangled as if it were in the "std" namespace. */
8156 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8157 return "St9__va_list";
8159 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8160 builtin types. */
8161 if (TYPE_NAME (type) != NULL)
8162 return aarch64_mangle_builtin_type (type);
8164 /* Use the default mangling. */
8165 return NULL;
8169 /* Return true if the rtx_insn contains a MEM RTX somewhere
8170 in it. */
8172 static bool
8173 has_memory_op (rtx_insn *mem_insn)
8175 subrtx_iterator::array_type array;
8176 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8177 if (MEM_P (*iter))
8178 return true;
8180 return false;
8183 /* Find the first rtx_insn before insn that will generate an assembly
8184 instruction. */
8186 static rtx_insn *
8187 aarch64_prev_real_insn (rtx_insn *insn)
8189 if (!insn)
8190 return NULL;
8194 insn = prev_real_insn (insn);
8196 while (insn && recog_memoized (insn) < 0);
8198 return insn;
8201 static bool
8202 is_madd_op (enum attr_type t1)
8204 unsigned int i;
8205 /* A number of these may be AArch32 only. */
8206 enum attr_type mlatypes[] = {
8207 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8208 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8209 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8212 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8214 if (t1 == mlatypes[i])
8215 return true;
8218 return false;
8221 /* Check if there is a register dependency between a load and the insn
8222 for which we hold recog_data. */
8224 static bool
8225 dep_between_memop_and_curr (rtx memop)
8227 rtx load_reg;
8228 int opno;
8230 gcc_assert (GET_CODE (memop) == SET);
8232 if (!REG_P (SET_DEST (memop)))
8233 return false;
8235 load_reg = SET_DEST (memop);
8236 for (opno = 1; opno < recog_data.n_operands; opno++)
8238 rtx operand = recog_data.operand[opno];
8239 if (REG_P (operand)
8240 && reg_overlap_mentioned_p (load_reg, operand))
8241 return true;
8244 return false;
8248 /* When working around the Cortex-A53 erratum 835769,
8249 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8250 instruction and has a preceding memory instruction such that a NOP
8251 should be inserted between them. */
8253 bool
8254 aarch64_madd_needs_nop (rtx_insn* insn)
8256 enum attr_type attr_type;
8257 rtx_insn *prev;
8258 rtx body;
8260 if (!aarch64_fix_a53_err835769)
8261 return false;
8263 if (recog_memoized (insn) < 0)
8264 return false;
8266 attr_type = get_attr_type (insn);
8267 if (!is_madd_op (attr_type))
8268 return false;
8270 prev = aarch64_prev_real_insn (insn);
8271 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8272 Restore recog state to INSN to avoid state corruption. */
8273 extract_constrain_insn_cached (insn);
8275 if (!prev || !has_memory_op (prev))
8276 return false;
8278 body = single_set (prev);
8280 /* If the previous insn is a memory op and there is no dependency between
8281 it and the DImode madd, emit a NOP between them. If body is NULL then we
8282 have a complex memory operation, probably a load/store pair.
8283 Be conservative for now and emit a NOP. */
8284 if (GET_MODE (recog_data.operand[0]) == DImode
8285 && (!body || !dep_between_memop_and_curr (body)))
8286 return true;
8288 return false;
8293 /* Implement FINAL_PRESCAN_INSN. */
8295 void
8296 aarch64_final_prescan_insn (rtx_insn *insn)
8298 if (aarch64_madd_needs_nop (insn))
8299 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
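/* For instance, with the workaround enabled a sequence along the lines of
     ldr  x2, [x0]
     madd x3, x4, x5, x6
(a load followed by a 64-bit multiply-accumulate with no register
dependency between them) gets the "nop" above emitted between the two
instructions. */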
8303 /* Return the equivalent letter for size. */
8304 static char
8305 sizetochar (int size)
8307 switch (size)
8309 case 64: return 'd';
8310 case 32: return 's';
8311 case 16: return 'h';
8312 case 8 : return 'b';
8313 default: gcc_unreachable ();
8317 /* Return true iff x is a uniform vector of floating-point
8318 constants, and the constant can be represented in
8319 quarter-precision form. Note that, as aarch64_float_const_representable_p
8320 rejects both +0.0 and -0.0, we will also reject them here. */
8321 static bool
8322 aarch64_vect_float_const_representable_p (rtx x)
8324 int i = 0;
8325 REAL_VALUE_TYPE r0, ri;
8326 rtx x0, xi;
8328 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8329 return false;
8331 x0 = CONST_VECTOR_ELT (x, 0);
8332 if (!CONST_DOUBLE_P (x0))
8333 return false;
8335 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8337 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8339 xi = CONST_VECTOR_ELT (x, i);
8340 if (!CONST_DOUBLE_P (xi))
8341 return false;
8343 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8344 if (!REAL_VALUES_EQUAL (r0, ri))
8345 return false;
8348 return aarch64_float_const_representable_p (x0);
8351 /* Return true for valid and false for invalid. */
8352 bool
8353 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8354 struct simd_immediate_info *info)
8356 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8357 matches = 1; \
8358 for (i = 0; i < idx; i += (STRIDE)) \
8359 if (!(TEST)) \
8360 matches = 0; \
8361 if (matches) \
8363 immtype = (CLASS); \
8364 elsize = (ELSIZE); \
8365 eshift = (SHIFT); \
8366 emvn = (NEG); \
8367 break; \
8370 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8371 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8372 unsigned char bytes[16];
8373 int immtype = -1, matches;
8374 unsigned int invmask = inverse ? 0xff : 0;
8375 int eshift, emvn;
8377 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8379 if (! (aarch64_simd_imm_zero_p (op, mode)
8380 || aarch64_vect_float_const_representable_p (op)))
8381 return false;
8383 if (info)
8385 info->value = CONST_VECTOR_ELT (op, 0);
8386 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8387 info->mvn = false;
8388 info->shift = 0;
8391 return true;
8394 /* Splat vector constant out into a byte vector. */
8395 for (i = 0; i < n_elts; i++)
8397 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8398 it must be laid out in the vector register in reverse order. */
8399 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8400 unsigned HOST_WIDE_INT elpart;
8401 unsigned int part, parts;
8403 if (CONST_INT_P (el))
8405 elpart = INTVAL (el);
8406 parts = 1;
8408 else if (GET_CODE (el) == CONST_DOUBLE)
8410 elpart = CONST_DOUBLE_LOW (el);
8411 parts = 2;
8413 else
8414 gcc_unreachable ();
8416 for (part = 0; part < parts; part++)
8418 unsigned int byte;
8419 for (byte = 0; byte < innersize; byte++)
8421 bytes[idx++] = (elpart & 0xff) ^ invmask;
8422 elpart >>= BITS_PER_UNIT;
8424 if (GET_CODE (el) == CONST_DOUBLE)
8425 elpart = CONST_DOUBLE_HIGH (el);
8429 /* Sanity check. */
8430 gcc_assert (idx == GET_MODE_SIZE (mode));
8434 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8435 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8437 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8438 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8440 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8441 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8443 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8444 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8446 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8448 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8450 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8451 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8453 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8454 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8456 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8457 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8459 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8460 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8462 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8464 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8466 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8467 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8469 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8470 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8472 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8473 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8475 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8476 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8478 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8480 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8481 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8483 while (0);
8485 if (immtype == -1)
8486 return false;
8488 if (info)
8490 info->element_width = elsize;
8491 info->mvn = emvn != 0;
8492 info->shift = eshift;
8494 unsigned HOST_WIDE_INT imm = 0;
8496 if (immtype >= 12 && immtype <= 15)
8497 info->msl = true;
8499 /* Un-invert bytes of recognized vector, if necessary. */
8500 if (invmask != 0)
8501 for (i = 0; i < idx; i++)
8502 bytes[i] ^= invmask;
8504 if (immtype == 17)
8506 /* FIXME: Broken on 32-bit H_W_I hosts. */
8507 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8509 for (i = 0; i < 8; i++)
8510 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8511 << (i * BITS_PER_UNIT);
8514 info->value = GEN_INT (imm);
8516 else
8518 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8519 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8521 /* Construct 'abcdefgh' because the assembler cannot handle
8522 generic constants. */
8523 if (info->mvn)
8524 imm = ~imm;
8525 imm = (imm >> info->shift) & 0xff;
8526 info->value = GEN_INT (imm);
8530 return true;
8531 #undef CHECK
8534 /* Check if immediate shift constants are within range. */
8535 bool
8536 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8538 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8539 if (left)
8540 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8541 else
8542 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
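/* Illustrative example for the shift-immediate check above (not part of the
   original source): for V4SImode the element width is 32 bits, so a constant
   vector of left-shift amounts is accepted in the range [0, 31], while
   right-shift amounts must lie in [1, 32], matching the AdvSIMD SHL and
   SSHR/USHR immediate encodings.  */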
8545 /* Return true if X is a uniform vector where all elements
8546 are either the floating-point constant 0.0 or the
8547 integer constant 0. */
8548 bool
8549 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8551 return x == CONST0_RTX (mode);
8554 bool
8555 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8557 HOST_WIDE_INT imm = INTVAL (x);
8558 int i;
8560 for (i = 0; i < 8; i++)
8562 unsigned int byte = imm & 0xff;
8563 if (byte != 0xff && byte != 0)
8564 return false;
8565 imm >>= 8;
8568 return true;
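/* Illustrative example for the scalar-immediate check above (not part of the
   original source): 0x00ff00ff00ff00ff is accepted because each of its eight
   bytes is either 0x00 or 0xff, whereas 0x0102030405060708 is rejected.  */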
8571 bool
8572 aarch64_mov_operand_p (rtx x,
8573 enum aarch64_symbol_context context,
8574 machine_mode mode)
8576 if (GET_CODE (x) == HIGH
8577 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8578 return true;
8580 if (CONST_INT_P (x))
8581 return true;
8583 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8584 return true;
8586 return aarch64_classify_symbolic_expression (x, context)
8587 == SYMBOL_TINY_ABSOLUTE;
8590 /* Return a const_int vector of VAL. */
8592 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8594 int nunits = GET_MODE_NUNITS (mode);
8595 rtvec v = rtvec_alloc (nunits);
8596 int i;
8598 for (i = 0; i < nunits; i++)
8599 RTVEC_ELT (v, i) = GEN_INT (val);
8601 return gen_rtx_CONST_VECTOR (mode, v);
8604 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8606 bool
8607 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8609 machine_mode vmode;
8611 gcc_assert (!VECTOR_MODE_P (mode));
8612 vmode = aarch64_preferred_simd_mode (mode);
8613 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8614 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8617 /* Construct and return a PARALLEL RTX vector with elements numbering the
8618 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8619 the vector - from the perspective of the architecture. This does not
8620 line up with GCC's perspective on lane numbers, so we end up with
8621 different masks depending on our target endian-ness. The diagram
8622 below may help. We must draw the distinction when building masks
8623 which select one half of the vector. An instruction selecting
8624 architectural low-lanes for a big-endian target must be described using
8625 a mask selecting GCC high-lanes.
8627 Big-Endian Little-Endian
8629 GCC 0 1 2 3 3 2 1 0
8630 | x | x | x | x | | x | x | x | x |
8631 Architecture 3 2 1 0 3 2 1 0
8633 Low Mask: { 2, 3 } { 0, 1 }
8634 High Mask: { 0, 1 } { 2, 3 }
8638 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8640 int nunits = GET_MODE_NUNITS (mode);
8641 rtvec v = rtvec_alloc (nunits / 2);
8642 int high_base = nunits / 2;
8643 int low_base = 0;
8644 int base;
8645 rtx t1;
8646 int i;
8648 if (BYTES_BIG_ENDIAN)
8649 base = high ? low_base : high_base;
8650 else
8651 base = high ? high_base : low_base;
8653 for (i = 0; i < nunits / 2; i++)
8654 RTVEC_ELT (v, i) = GEN_INT (base + i);
8656 t1 = gen_rtx_PARALLEL (mode, v);
8657 return t1;
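/* Worked example for the function above (illustrative, not part of the
   original source): for V4SImode, HIGH == true yields the PARALLEL { 2, 3 }
   on little-endian but { 0, 1 } on big-endian, matching the diagram.  */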
8660 /* Check OP for validity as a PARALLEL RTX vector with elements
8661 numbering the lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE)
8662 half of the vector, from the perspective of the architecture. See the
8663 diagram above aarch64_simd_vect_par_cnst_half for more details. */
8665 bool
8666 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8667 bool high)
8669 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8670 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8671 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8672 int i = 0;
8674 if (!VECTOR_MODE_P (mode))
8675 return false;
8677 if (count_op != count_ideal)
8678 return false;
8680 for (i = 0; i < count_ideal; i++)
8682 rtx elt_op = XVECEXP (op, 0, i);
8683 rtx elt_ideal = XVECEXP (ideal, 0, i);
8685 if (!CONST_INT_P (elt_op)
8686 || INTVAL (elt_ideal) != INTVAL (elt_op))
8687 return false;
8689 return true;
8692 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8693 HIGH (exclusive). */
8694 void
8695 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8696 const_tree exp)
8698 HOST_WIDE_INT lane;
8699 gcc_assert (CONST_INT_P (operand));
8700 lane = INTVAL (operand);
8702 if (lane < low || lane >= high)
8704 if (exp)
8705 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8706 else
8707 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8711 /* Return TRUE if OP is a valid vector addressing mode. */
8712 bool
8713 aarch64_simd_mem_operand_p (rtx op)
8715 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8716 || REG_P (XEXP (op, 0)));
8719 /* Emit a register copy from operand to operand, taking care not to
8720 early-clobber source registers in the process.
8722 COUNT is the number of components into which the copy needs to be
8723 decomposed. */
8724 void
8725 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8726 unsigned int count)
8728 unsigned int i;
8729 int rdest = REGNO (operands[0]);
8730 int rsrc = REGNO (operands[1]);
8732 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8733 || rdest < rsrc)
8734 for (i = 0; i < count; i++)
8735 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8736 gen_rtx_REG (mode, rsrc + i));
8737 else
8738 for (i = 0; i < count; i++)
8739 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8740 gen_rtx_REG (mode, rsrc + count - i - 1));
8743 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8744 one of VSTRUCT modes: OI, CI or XI. */
8746 aarch64_simd_attr_length_move (rtx_insn *insn)
8748 machine_mode mode;
8750 extract_insn_cached (insn);
8752 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8754 mode = GET_MODE (recog_data.operand[0]);
8755 switch (mode)
8757 case OImode:
8758 return 8;
8759 case CImode:
8760 return 12;
8761 case XImode:
8762 return 16;
8763 default:
8764 gcc_unreachable ();
8767 return 4;
8770 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8771 one of VSTRUCT modes: OI, CI, EI, or XI. */
8773 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8775 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8778 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8779 alignment of a vector to 128 bits. */
8780 static HOST_WIDE_INT
8781 aarch64_simd_vector_alignment (const_tree type)
8783 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8784 return MIN (align, 128);
8787 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8788 static bool
8789 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8791 if (is_packed)
8792 return false;
8794 /* We guarantee alignment for vectors up to 128 bits. */
8795 if (tree_int_cst_compare (TYPE_SIZE (type),
8796 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8797 return false;
8799 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8800 return true;
8803 /* If VALS is a vector constant that can be loaded into a register
8804 using DUP, generate instructions to do so and return an RTX to
8805 assign to the register. Otherwise return NULL_RTX. */
8806 static rtx
8807 aarch64_simd_dup_constant (rtx vals)
8809 machine_mode mode = GET_MODE (vals);
8810 machine_mode inner_mode = GET_MODE_INNER (mode);
8811 int n_elts = GET_MODE_NUNITS (mode);
8812 bool all_same = true;
8813 rtx x;
8814 int i;
8816 if (GET_CODE (vals) != CONST_VECTOR)
8817 return NULL_RTX;
8819 for (i = 1; i < n_elts; ++i)
8821 x = CONST_VECTOR_ELT (vals, i);
8822 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8823 all_same = false;
8826 if (!all_same)
8827 return NULL_RTX;
8829 /* We can load this constant by using DUP and a constant in a
8830 single ARM register. This will be cheaper than a vector
8831 load. */
8832 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8833 return gen_rtx_VEC_DUPLICATE (mode, x);
8837 /* Generate code to load VALS, which is a PARALLEL containing only
8838 constants (for vec_init) or CONST_VECTOR, efficiently into a
8839 register. Returns an RTX to copy into the register, or NULL_RTX
8840 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8841 static rtx
8842 aarch64_simd_make_constant (rtx vals)
8844 machine_mode mode = GET_MODE (vals);
8845 rtx const_dup;
8846 rtx const_vec = NULL_RTX;
8847 int n_elts = GET_MODE_NUNITS (mode);
8848 int n_const = 0;
8849 int i;
8851 if (GET_CODE (vals) == CONST_VECTOR)
8852 const_vec = vals;
8853 else if (GET_CODE (vals) == PARALLEL)
8855 /* A CONST_VECTOR must contain only CONST_INTs and
8856 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8857 Only store valid constants in a CONST_VECTOR. */
8858 for (i = 0; i < n_elts; ++i)
8860 rtx x = XVECEXP (vals, 0, i);
8861 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8862 n_const++;
8864 if (n_const == n_elts)
8865 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8867 else
8868 gcc_unreachable ();
8870 if (const_vec != NULL_RTX
8871 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8872 /* Load using MOVI/MVNI. */
8873 return const_vec;
8874 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8875 /* Loaded using DUP. */
8876 return const_dup;
8877 else if (const_vec != NULL_RTX)
8878 /* Load from constant pool. We cannot take advantage of single-cycle
8879 LD1 because we need a PC-relative addressing mode. */
8880 return const_vec;
8881 else
8882 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8883 We cannot construct an initializer. */
8884 return NULL_RTX;
8887 void
8888 aarch64_expand_vector_init (rtx target, rtx vals)
8890 machine_mode mode = GET_MODE (target);
8891 machine_mode inner_mode = GET_MODE_INNER (mode);
8892 int n_elts = GET_MODE_NUNITS (mode);
8893 int n_var = 0;
8894 rtx any_const = NULL_RTX;
8895 bool all_same = true;
8897 for (int i = 0; i < n_elts; ++i)
8899 rtx x = XVECEXP (vals, 0, i);
8900 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8901 ++n_var;
8902 else
8903 any_const = x;
8905 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8906 all_same = false;
8909 if (n_var == 0)
8911 rtx constant = aarch64_simd_make_constant (vals);
8912 if (constant != NULL_RTX)
8914 emit_move_insn (target, constant);
8915 return;
8919 /* Splat a single non-constant element if we can. */
8920 if (all_same)
8922 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8923 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8924 return;
8927 /* Half the fields (or fewer) are non-constant. Load the constant part, then
8928 overwrite the varying fields. Hope that this is more efficient than using the stack. */
8929 if (n_var <= n_elts/2)
8931 rtx copy = copy_rtx (vals);
8933 /* Load constant part of vector. We really don't care what goes into the
8934 parts we will overwrite, but we're more likely to be able to load the
8935 constant efficiently if it has fewer, larger, repeating parts
8936 (see aarch64_simd_valid_immediate). */
8937 for (int i = 0; i < n_elts; i++)
8939 rtx x = XVECEXP (vals, 0, i);
8940 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8941 continue;
8942 rtx subst = any_const;
8943 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8945 /* Look in the copied vector, as more elements are const. */
8946 rtx test = XVECEXP (copy, 0, i ^ bit);
8947 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8949 subst = test;
8950 break;
8953 XVECEXP (copy, 0, i) = subst;
8955 aarch64_expand_vector_init (target, copy);
8957 /* Insert variables. */
8958 enum insn_code icode = optab_handler (vec_set_optab, mode);
8959 gcc_assert (icode != CODE_FOR_nothing);
8961 for (int i = 0; i < n_elts; i++)
8963 rtx x = XVECEXP (vals, 0, i);
8964 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8965 continue;
8966 x = copy_to_mode_reg (inner_mode, x);
8967 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8969 return;
8972 /* Construct the vector in memory one field at a time
8973 and load the whole vector. */
8974 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8975 for (int i = 0; i < n_elts; i++)
8976 emit_move_insn (adjust_address_nv (mem, inner_mode,
8977 i * GET_MODE_SIZE (inner_mode)),
8978 XVECEXP (vals, 0, i));
8979 emit_move_insn (target, mem);
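/* Worked example for the expander above (illustrative, not part of the
   original source): for a V4SImode initialiser { x, 1, 2, 3 } with a single
   variable element, the half-constant path first loads the constant vector
   { 2, 1, 2, 3 } (lane 0 borrows a neighbouring constant from the copied
   vector) and then overwrites lane 0 with x via vec_set, avoiding a round
   trip through the stack.  */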
8983 static unsigned HOST_WIDE_INT
8984 aarch64_shift_truncation_mask (machine_mode mode)
8986 return
8987 (aarch64_vector_mode_supported_p (mode)
8988 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
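/* Illustrative note on the hook above (not part of the original source): for
   scalar DImode it returns 63, so a variable shift by 64 + k is known to
   behave like a shift by k; for AdvSIMD vector modes it returns 0 because
   vector shifts do not truncate the count modulo the element width.  */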
8991 #ifndef TLS_SECTION_ASM_FLAG
8992 #define TLS_SECTION_ASM_FLAG 'T'
8993 #endif
8995 void
8996 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8997 tree decl ATTRIBUTE_UNUSED)
8999 char flagchars[10], *f = flagchars;
9001 /* If we have already declared this section, we can use an
9002 abbreviated form to switch back to it -- unless this section is
9003 part of a COMDAT group, in which case GAS requires the full
9004 declaration every time. */
9005 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9006 && (flags & SECTION_DECLARED))
9008 fprintf (asm_out_file, "\t.section\t%s\n", name);
9009 return;
9012 if (!(flags & SECTION_DEBUG))
9013 *f++ = 'a';
9014 if (flags & SECTION_WRITE)
9015 *f++ = 'w';
9016 if (flags & SECTION_CODE)
9017 *f++ = 'x';
9018 if (flags & SECTION_SMALL)
9019 *f++ = 's';
9020 if (flags & SECTION_MERGE)
9021 *f++ = 'M';
9022 if (flags & SECTION_STRINGS)
9023 *f++ = 'S';
9024 if (flags & SECTION_TLS)
9025 *f++ = TLS_SECTION_ASM_FLAG;
9026 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9027 *f++ = 'G';
9028 *f = '\0';
9030 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9032 if (!(flags & SECTION_NOTYPE))
9034 const char *type;
9035 const char *format;
9037 if (flags & SECTION_BSS)
9038 type = "nobits";
9039 else
9040 type = "progbits";
9042 #ifdef TYPE_OPERAND_FMT
9043 format = "," TYPE_OPERAND_FMT;
9044 #else
9045 format = ",@%s";
9046 #endif
9048 fprintf (asm_out_file, format, type);
9050 if (flags & SECTION_ENTSIZE)
9051 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9052 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9054 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9055 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9056 else
9057 fprintf (asm_out_file, ",%s,comdat",
9058 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9062 putc ('\n', asm_out_file);
9065 /* Select a format to encode pointers in exception handling data. */
9067 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9069 int type;
9070 switch (aarch64_cmodel)
9072 case AARCH64_CMODEL_TINY:
9073 case AARCH64_CMODEL_TINY_PIC:
9074 case AARCH64_CMODEL_SMALL:
9075 case AARCH64_CMODEL_SMALL_PIC:
9076 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9077 for everything. */
9078 type = DW_EH_PE_sdata4;
9079 break;
9080 default:
9081 /* No assumptions here. 8-byte relocs required. */
9082 type = DW_EH_PE_sdata8;
9083 break;
9085 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9088 /* Emit load exclusive. */
9090 static void
9091 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9092 rtx mem, rtx model_rtx)
9094 rtx (*gen) (rtx, rtx, rtx);
9096 switch (mode)
9098 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9099 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9100 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9101 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9102 default:
9103 gcc_unreachable ();
9106 emit_insn (gen (rval, mem, model_rtx));
9109 /* Emit store exclusive. */
9111 static void
9112 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9113 rtx rval, rtx mem, rtx model_rtx)
9115 rtx (*gen) (rtx, rtx, rtx, rtx);
9117 switch (mode)
9119 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9120 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9121 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9122 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9123 default:
9124 gcc_unreachable ();
9127 emit_insn (gen (bval, rval, mem, model_rtx));
9130 /* Mark the previous jump instruction as unlikely. */
9132 static void
9133 aarch64_emit_unlikely_jump (rtx insn)
9135 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9137 insn = emit_jump_insn (insn);
9138 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9141 /* Expand a compare and swap pattern. */
9143 void
9144 aarch64_expand_compare_and_swap (rtx operands[])
9146 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9147 machine_mode mode, cmp_mode;
9148 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9150 bval = operands[0];
9151 rval = operands[1];
9152 mem = operands[2];
9153 oldval = operands[3];
9154 newval = operands[4];
9155 is_weak = operands[5];
9156 mod_s = operands[6];
9157 mod_f = operands[7];
9158 mode = GET_MODE (mem);
9159 cmp_mode = mode;
9161 /* Normally the succ memory model must be stronger than fail, but in the
9162 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9163 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9165 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9166 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9167 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9169 switch (mode)
9171 case QImode:
9172 case HImode:
9173 /* For short modes, we're going to perform the comparison in SImode,
9174 so do the zero-extension now. */
9175 cmp_mode = SImode;
9176 rval = gen_reg_rtx (SImode);
9177 oldval = convert_modes (SImode, mode, oldval, true);
9178 /* Fall through. */
9180 case SImode:
9181 case DImode:
9182 /* Force the value into a register if needed. */
9183 if (!aarch64_plus_operand (oldval, mode))
9184 oldval = force_reg (cmp_mode, oldval);
9185 break;
9187 default:
9188 gcc_unreachable ();
9191 switch (mode)
9193 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9194 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9195 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9196 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9197 default:
9198 gcc_unreachable ();
9201 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9203 if (mode == QImode || mode == HImode)
9204 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9206 x = gen_rtx_REG (CCmode, CC_REGNUM);
9207 x = gen_rtx_EQ (SImode, x, const0_rtx);
9208 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
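/* Illustrative note on the memory-model promotion above (not part of the
   original source): a call such as
   __atomic_compare_exchange_n (p, &expected, desired, 0,
				__ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
   is expanded with a success model of ACQ_REL, so the acquire semantics of
   the failure path are preserved.  */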
9211 /* Split a compare and swap pattern. */
9213 void
9214 aarch64_split_compare_and_swap (rtx operands[])
9216 rtx rval, mem, oldval, newval, scratch;
9217 machine_mode mode;
9218 bool is_weak;
9219 rtx_code_label *label1, *label2;
9220 rtx x, cond;
9222 rval = operands[0];
9223 mem = operands[1];
9224 oldval = operands[2];
9225 newval = operands[3];
9226 is_weak = (operands[4] != const0_rtx);
9227 scratch = operands[7];
9228 mode = GET_MODE (mem);
9230 label1 = NULL;
9231 if (!is_weak)
9233 label1 = gen_label_rtx ();
9234 emit_label (label1);
9236 label2 = gen_label_rtx ();
9238 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9240 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9241 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9242 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9243 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9244 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9246 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9248 if (!is_weak)
9250 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9251 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9252 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9253 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9255 else
9257 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9258 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9259 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9262 emit_label (label2);
9265 /* Split an atomic operation. */
9267 void
9268 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9269 rtx value, rtx model_rtx, rtx cond)
9271 machine_mode mode = GET_MODE (mem);
9272 machine_mode wmode = (mode == DImode ? DImode : SImode);
9273 rtx_code_label *label;
9274 rtx x;
9276 label = gen_label_rtx ();
9277 emit_label (label);
9279 if (new_out)
9280 new_out = gen_lowpart (wmode, new_out);
9281 if (old_out)
9282 old_out = gen_lowpart (wmode, old_out);
9283 else
9284 old_out = new_out;
9285 value = simplify_gen_subreg (wmode, value, mode, 0);
9287 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9289 switch (code)
9291 case SET:
9292 new_out = value;
9293 break;
9295 case NOT:
9296 x = gen_rtx_AND (wmode, old_out, value);
9297 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9298 x = gen_rtx_NOT (wmode, new_out);
9299 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9300 break;
9302 case MINUS:
9303 if (CONST_INT_P (value))
9305 value = GEN_INT (-INTVAL (value));
9306 code = PLUS;
9308 /* Fall through. */
9310 default:
9311 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9312 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9313 break;
9316 aarch64_emit_store_exclusive (mode, cond, mem,
9317 gen_lowpart (mode, new_out), model_rtx);
9319 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9320 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9321 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9322 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9325 static void
9326 aarch64_print_extension (void)
9328 const struct aarch64_option_extension *opt = NULL;
9330 for (opt = all_extensions; opt->name != NULL; opt++)
9331 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9332 asm_fprintf (asm_out_file, "+%s", opt->name);
9334 asm_fprintf (asm_out_file, "\n");
9337 static void
9338 aarch64_start_file (void)
9340 if (selected_arch)
9342 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9343 aarch64_print_extension ();
9345 else if (selected_cpu)
9347 const char *truncated_name
9348 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9349 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9350 aarch64_print_extension ();
9352 default_file_start ();
9355 /* Target hook for c_mode_for_suffix. */
9356 static machine_mode
9357 aarch64_c_mode_for_suffix (char suffix)
9359 if (suffix == 'q')
9360 return TFmode;
9362 return VOIDmode;
9365 /* We can only represent floating point constants which will fit in
9366 "quarter-precision" values. These values are characterised by
9367 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9370 (-1)^s * (n/16) * 2^r
9372 Where:
9373 's' is the sign bit.
9374 'n' is an integer in the range 16 <= n <= 31.
9375 'r' is an integer in the range -3 <= r <= 4. */
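/* Worked example (illustrative, not part of the original source):

   1.0   = 16/16 * 2^0
   0.125 = 16/16 * 2^-3   (smallest non-zero magnitude)
   31.0  = 31/16 * 2^4    (largest magnitude)

   whereas 0.0, 0.1 and 1.0/3.0 have no such encoding and are rejected.  */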
9377 /* Return true iff X can be represented by a quarter-precision
9378 floating point immediate operand. Note, we cannot represent 0.0. */
9379 bool
9380 aarch64_float_const_representable_p (rtx x)
9382 /* This represents our current view of how many bits
9383 make up the mantissa. */
9384 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9385 int exponent;
9386 unsigned HOST_WIDE_INT mantissa, mask;
9387 REAL_VALUE_TYPE r, m;
9388 bool fail;
9390 if (!CONST_DOUBLE_P (x))
9391 return false;
9393 if (GET_MODE (x) == VOIDmode)
9394 return false;
9396 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9398 /* We cannot represent infinities, NaNs or +/-zero. We won't
9399 know if we have +zero until we analyse the mantissa, but we
9400 can reject the other invalid values. */
9401 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9402 || REAL_VALUE_MINUS_ZERO (r))
9403 return false;
9405 /* Extract exponent. */
9406 r = real_value_abs (&r);
9407 exponent = REAL_EXP (&r);
9409 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9410 highest (sign) bit, with a fixed binary point at bit point_pos.
9411 m1 holds the low part of the mantissa, m2 the high part.
9412 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9413 bits for the mantissa, this can fail (low bits will be lost). */
9414 real_ldexp (&m, &r, point_pos - exponent);
9415 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9417 /* If the low part of the mantissa has bits set we cannot represent
9418 the value. */
9419 if (w.elt (0) != 0)
9420 return false;
9421 /* We have rejected the lower HOST_WIDE_INT, so update our
9422 understanding of how many bits lie in the mantissa and
9423 look only at the high HOST_WIDE_INT. */
9424 mantissa = w.elt (1);
9425 point_pos -= HOST_BITS_PER_WIDE_INT;
9427 /* We can only represent values with a mantissa of the form 1.xxxx. */
9428 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9429 if ((mantissa & mask) != 0)
9430 return false;
9432 /* Having filtered unrepresentable values, we may now remove all
9433 but the highest 5 bits. */
9434 mantissa >>= point_pos - 5;
9436 /* We cannot represent the value 0.0, so reject it. This is handled
9437 elsewhere. */
9438 if (mantissa == 0)
9439 return false;
9441 /* Then, as bit 4 is always set, we can mask it off, leaving
9442 the mantissa in the range [0, 15]. */
9443 mantissa &= ~(1 << 4);
9444 gcc_assert (mantissa <= 15);
9446 /* GCC internally does not use IEEE754-like encoding (where normalized
9447 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
9448 Our mantissa values are shifted 4 places to the left relative to
9449 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9450 by 5 places to correct for GCC's representation. */
9451 exponent = 5 - exponent;
9453 return (exponent >= 0 && exponent <= 7);
9456 char*
9457 aarch64_output_simd_mov_immediate (rtx const_vector,
9458 machine_mode mode,
9459 unsigned width)
9461 bool is_valid;
9462 static char templ[40];
9463 const char *mnemonic;
9464 const char *shift_op;
9465 unsigned int lane_count = 0;
9466 char element_char;
9468 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9470 /* This will return true to show const_vector is legal for use as either
9471 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9472 also update INFO to show how the immediate should be generated. */
9473 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9474 gcc_assert (is_valid);
9476 element_char = sizetochar (info.element_width);
9477 lane_count = width / info.element_width;
9479 mode = GET_MODE_INNER (mode);
9480 if (mode == SFmode || mode == DFmode)
9482 gcc_assert (info.shift == 0 && ! info.mvn);
9483 if (aarch64_float_const_zero_rtx_p (info.value))
9484 info.value = GEN_INT (0);
9485 else
9487 #define buf_size 20
9488 REAL_VALUE_TYPE r;
9489 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9490 char float_buf[buf_size] = {'\0'};
9491 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9492 #undef buf_size
9494 if (lane_count == 1)
9495 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9496 else
9497 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9498 lane_count, element_char, float_buf);
9499 return templ;
9503 mnemonic = info.mvn ? "mvni" : "movi";
9504 shift_op = info.msl ? "msl" : "lsl";
9506 if (lane_count == 1)
9507 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9508 mnemonic, UINTVAL (info.value));
9509 else if (info.shift)
9510 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9511 ", %s %d", mnemonic, lane_count, element_char,
9512 UINTVAL (info.value), shift_op, info.shift);
9513 else
9514 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9515 mnemonic, lane_count, element_char, UINTVAL (info.value));
9516 return templ;
9519 char*
9520 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9521 machine_mode mode)
9523 machine_mode vmode;
9525 gcc_assert (!VECTOR_MODE_P (mode));
9526 vmode = aarch64_simd_container_mode (mode, 64);
9527 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9528 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9531 /* Split operands into moves from op[1] + op[2] into op[0]. */
9533 void
9534 aarch64_split_combinev16qi (rtx operands[3])
9536 unsigned int dest = REGNO (operands[0]);
9537 unsigned int src1 = REGNO (operands[1]);
9538 unsigned int src2 = REGNO (operands[2]);
9539 machine_mode halfmode = GET_MODE (operands[1]);
9540 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9541 rtx destlo, desthi;
9543 gcc_assert (halfmode == V16QImode);
9545 if (src1 == dest && src2 == dest + halfregs)
9547 /* No-op move. Can't split to nothing; emit something. */
9548 emit_note (NOTE_INSN_DELETED);
9549 return;
9552 /* Preserve register attributes for variable tracking. */
9553 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9554 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9555 GET_MODE_SIZE (halfmode));
9557 /* Special case of reversed high/low parts. */
9558 if (reg_overlap_mentioned_p (operands[2], destlo)
9559 && reg_overlap_mentioned_p (operands[1], desthi))
9561 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9562 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9563 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9565 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9567 /* Try to avoid unnecessary moves if part of the result
9568 is in the right place already. */
9569 if (src1 != dest)
9570 emit_move_insn (destlo, operands[1]);
9571 if (src2 != dest + halfregs)
9572 emit_move_insn (desthi, operands[2]);
9574 else
9576 if (src2 != dest + halfregs)
9577 emit_move_insn (desthi, operands[2]);
9578 if (src1 != dest)
9579 emit_move_insn (destlo, operands[1]);
9583 /* vec_perm support. */
9585 #define MAX_VECT_LEN 16
9587 struct expand_vec_perm_d
9589 rtx target, op0, op1;
9590 unsigned char perm[MAX_VECT_LEN];
9591 machine_mode vmode;
9592 unsigned char nelt;
9593 bool one_vector_p;
9594 bool testing_p;
9597 /* Generate a variable permutation. */
9599 static void
9600 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9602 machine_mode vmode = GET_MODE (target);
9603 bool one_vector_p = rtx_equal_p (op0, op1);
9605 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9606 gcc_checking_assert (GET_MODE (op0) == vmode);
9607 gcc_checking_assert (GET_MODE (op1) == vmode);
9608 gcc_checking_assert (GET_MODE (sel) == vmode);
9609 gcc_checking_assert (TARGET_SIMD);
9611 if (one_vector_p)
9613 if (vmode == V8QImode)
9615 /* Expand the argument to a V16QI mode by duplicating it. */
9616 rtx pair = gen_reg_rtx (V16QImode);
9617 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9618 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9620 else
9622 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9625 else
9627 rtx pair;
9629 if (vmode == V8QImode)
9631 pair = gen_reg_rtx (V16QImode);
9632 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9633 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9635 else
9637 pair = gen_reg_rtx (OImode);
9638 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9639 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9644 void
9645 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9647 machine_mode vmode = GET_MODE (target);
9648 unsigned int nelt = GET_MODE_NUNITS (vmode);
9649 bool one_vector_p = rtx_equal_p (op0, op1);
9650 rtx mask;
9652 /* The TBL instruction does not use a modulo index, so we must take care
9653 of that ourselves. */
9654 mask = aarch64_simd_gen_const_vector_dup (vmode,
9655 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9656 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9658 /* For big-endian, we also need to reverse the index within the vector
9659 (but not which vector). */
9660 if (BYTES_BIG_ENDIAN)
9662 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9663 if (!one_vector_p)
9664 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9665 sel = expand_simple_binop (vmode, XOR, sel, mask,
9666 NULL, 0, OPTAB_LIB_WIDEN);
9668 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
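/* Worked example for the expander above (illustrative, not part of the
   original source): for a V16QImode two-operand permute the selector is
   ANDed with 31 to give the modulo-index semantics that vec_perm requires
   but TBL does not (TBL writes zero for out-of-range indices); on big-endian
   each index is then XORed with 15 to undo the byte reversal within each
   input register.  */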
9671 /* Recognize patterns suitable for the TRN instructions. */
9672 static bool
9673 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9675 unsigned int i, odd, mask, nelt = d->nelt;
9676 rtx out, in0, in1, x;
9677 rtx (*gen) (rtx, rtx, rtx);
9678 machine_mode vmode = d->vmode;
9680 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9681 return false;
9683 /* Note that these are little-endian tests.
9684 We correct for big-endian later. */
9685 if (d->perm[0] == 0)
9686 odd = 0;
9687 else if (d->perm[0] == 1)
9688 odd = 1;
9689 else
9690 return false;
9691 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9693 for (i = 0; i < nelt; i += 2)
9695 if (d->perm[i] != i + odd)
9696 return false;
9697 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9698 return false;
9701 /* Success! */
9702 if (d->testing_p)
9703 return true;
9705 in0 = d->op0;
9706 in1 = d->op1;
9707 if (BYTES_BIG_ENDIAN)
9709 x = in0, in0 = in1, in1 = x;
9710 odd = !odd;
9712 out = d->target;
9714 if (odd)
9716 switch (vmode)
9718 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9719 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9720 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9721 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9722 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9723 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9724 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9725 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9726 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9727 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9728 default:
9729 return false;
9732 else
9734 switch (vmode)
9736 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9737 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9738 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9739 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9740 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9741 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9742 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9743 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9744 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9745 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9746 default:
9747 return false;
9751 emit_insn (gen (out, in0, in1));
9752 return true;
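/* Worked example (illustrative, not part of the original source): for
   V4SImode with two distinct operands, TRN1 is recognised from the
   little-endian selector { 0, 4, 2, 6 } and TRN2 from { 1, 5, 3, 7 };
   the big-endian correction above swaps the inputs and flips ODD.  */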
9755 /* Recognize patterns suitable for the UZP instructions. */
9756 static bool
9757 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9759 unsigned int i, odd, mask, nelt = d->nelt;
9760 rtx out, in0, in1, x;
9761 rtx (*gen) (rtx, rtx, rtx);
9762 machine_mode vmode = d->vmode;
9764 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9765 return false;
9767 /* Note that these are little-endian tests.
9768 We correct for big-endian later. */
9769 if (d->perm[0] == 0)
9770 odd = 0;
9771 else if (d->perm[0] == 1)
9772 odd = 1;
9773 else
9774 return false;
9775 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9777 for (i = 0; i < nelt; i++)
9779 unsigned elt = (i * 2 + odd) & mask;
9780 if (d->perm[i] != elt)
9781 return false;
9784 /* Success! */
9785 if (d->testing_p)
9786 return true;
9788 in0 = d->op0;
9789 in1 = d->op1;
9790 if (BYTES_BIG_ENDIAN)
9792 x = in0, in0 = in1, in1 = x;
9793 odd = !odd;
9795 out = d->target;
9797 if (odd)
9799 switch (vmode)
9801 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9802 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9803 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9804 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9805 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9806 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9807 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9808 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9809 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9810 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9811 default:
9812 return false;
9815 else
9817 switch (vmode)
9819 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9820 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9821 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9822 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9823 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9824 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9825 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9826 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9827 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9828 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9829 default:
9830 return false;
9834 emit_insn (gen (out, in0, in1));
9835 return true;
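/* Worked example (illustrative, not part of the original source): for
   V4SImode with two distinct operands, UZP1 is recognised from the
   little-endian selector { 0, 2, 4, 6 } and UZP2 from { 1, 3, 5, 7 }.  */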
9838 /* Recognize patterns suitable for the ZIP instructions. */
9839 static bool
9840 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9842 unsigned int i, high, mask, nelt = d->nelt;
9843 rtx out, in0, in1, x;
9844 rtx (*gen) (rtx, rtx, rtx);
9845 machine_mode vmode = d->vmode;
9847 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9848 return false;
9850 /* Note that these are little-endian tests.
9851 We correct for big-endian later. */
9852 high = nelt / 2;
9853 if (d->perm[0] == high)
9854 /* Do Nothing. */
9856 else if (d->perm[0] == 0)
9857 high = 0;
9858 else
9859 return false;
9860 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9862 for (i = 0; i < nelt / 2; i++)
9864 unsigned elt = (i + high) & mask;
9865 if (d->perm[i * 2] != elt)
9866 return false;
9867 elt = (elt + nelt) & mask;
9868 if (d->perm[i * 2 + 1] != elt)
9869 return false;
9872 /* Success! */
9873 if (d->testing_p)
9874 return true;
9876 in0 = d->op0;
9877 in1 = d->op1;
9878 if (BYTES_BIG_ENDIAN)
9880 x = in0, in0 = in1, in1 = x;
9881 high = !high;
9883 out = d->target;
9885 if (high)
9887 switch (vmode)
9889 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9890 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9891 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9892 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9893 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9894 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9895 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9896 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9897 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9898 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9899 default:
9900 return false;
9903 else
9905 switch (vmode)
9907 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9908 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9909 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9910 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9911 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9912 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9913 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9914 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9915 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9916 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9917 default:
9918 return false;
9922 emit_insn (gen (out, in0, in1));
9923 return true;
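/* Worked example (illustrative, not part of the original source): for
   V4SImode with two distinct operands, ZIP1 is recognised from the
   little-endian selector { 0, 4, 1, 5 } and ZIP2 from { 2, 6, 3, 7 }.  */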
9926 /* Recognize patterns for the EXT insn. */
9928 static bool
9929 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9931 unsigned int i, nelt = d->nelt;
9932 rtx (*gen) (rtx, rtx, rtx, rtx);
9933 rtx offset;
9935 unsigned int location = d->perm[0]; /* Always < nelt. */
9937 /* Check if the extracted indices are increasing by one. */
9938 for (i = 1; i < nelt; i++)
9940 unsigned int required = location + i;
9941 if (d->one_vector_p)
9943 /* We'll pass the same vector in twice, so allow indices to wrap. */
9944 required &= (nelt - 1);
9946 if (d->perm[i] != required)
9947 return false;
9950 switch (d->vmode)
9952 case V16QImode: gen = gen_aarch64_extv16qi; break;
9953 case V8QImode: gen = gen_aarch64_extv8qi; break;
9954 case V4HImode: gen = gen_aarch64_extv4hi; break;
9955 case V8HImode: gen = gen_aarch64_extv8hi; break;
9956 case V2SImode: gen = gen_aarch64_extv2si; break;
9957 case V4SImode: gen = gen_aarch64_extv4si; break;
9958 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9959 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9960 case V2DImode: gen = gen_aarch64_extv2di; break;
9961 case V2DFmode: gen = gen_aarch64_extv2df; break;
9962 default:
9963 return false;
9966 /* Success! */
9967 if (d->testing_p)
9968 return true;
9970 /* The case where (location == 0) is a no-op for both big- and little-endian,
9971 and is removed by the mid-end at optimization levels -O1 and higher. */
9973 if (BYTES_BIG_ENDIAN && (location != 0))
9975 /* After setup, we want the high elements of the first vector (stored
9976 at the LSB end of the register), and the low elements of the second
9977 vector (stored at the MSB end of the register). So swap. */
9978 std::swap (d->op0, d->op1);
9979 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9980 location = nelt - location;
9983 offset = GEN_INT (location);
9984 emit_insn (gen (d->target, d->op0, d->op1, offset));
9985 return true;
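/* Worked example (illustrative, not part of the original source): for
   V8QImode the selector { 3, 4, 5, 6, 7, 8, 9, 10 } is matched as EXT with
   offset 3; on big-endian the operands are swapped and the offset becomes
   nelt - 3 = 5, as described above.  */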
9988 /* Recognize patterns for the REV insns. */
9990 static bool
9991 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9993 unsigned int i, j, diff, nelt = d->nelt;
9994 rtx (*gen) (rtx, rtx);
9996 if (!d->one_vector_p)
9997 return false;
9999 diff = d->perm[0];
10000 switch (diff)
10002 case 7:
10003 switch (d->vmode)
10005 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10006 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10007 default:
10008 return false;
10010 break;
10011 case 3:
10012 switch (d->vmode)
10014 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10015 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10016 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10017 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10018 default:
10019 return false;
10021 break;
10022 case 1:
10023 switch (d->vmode)
10025 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10026 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10027 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10028 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10029 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10030 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10031 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10032 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10033 default:
10034 return false;
10036 break;
10037 default:
10038 return false;
10041 for (i = 0; i < nelt ; i += diff + 1)
10042 for (j = 0; j <= diff; j += 1)
10044 /* This is guaranteed to be true as the value of diff
10045 is 7, 3 or 1 and we should have enough elements in the
10046 queue to generate this. Getting a vector mask with a
10047 value of diff other than these values implies that
10048 something is wrong by the time we get here. */
10049 gcc_assert (i + j < nelt);
10050 if (d->perm[i + j] != i + diff - j)
10051 return false;
10054 /* Success! */
10055 if (d->testing_p)
10056 return true;
10058 emit_insn (gen (d->target, d->op0));
10059 return true;
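/* Worked example (illustrative, not part of the original source): for
   V8HImode the selector { 3, 2, 1, 0, 7, 6, 5, 4 } has diff == 3 and is
   matched as REV64, reversing the 16-bit elements within each 64-bit
   chunk.  */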
10062 static bool
10063 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10065 rtx (*gen) (rtx, rtx, rtx);
10066 rtx out = d->target;
10067 rtx in0;
10068 machine_mode vmode = d->vmode;
10069 unsigned int i, elt, nelt = d->nelt;
10070 rtx lane;
10072 elt = d->perm[0];
10073 for (i = 1; i < nelt; i++)
10075 if (elt != d->perm[i])
10076 return false;
10079 /* The generic preparation in aarch64_expand_vec_perm_const_1
10080 swaps the operand order and the permute indices if it finds
10081 d->perm[0] to be in the second operand. Thus, we can always
10082 use d->op0 and need not do any extra arithmetic to get the
10083 correct lane number. */
10084 in0 = d->op0;
10085 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10087 switch (vmode)
10089 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10090 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10091 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10092 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10093 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10094 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10095 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10096 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10097 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10098 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10099 default:
10100 return false;
10103 emit_insn (gen (out, in0, lane));
10104 return true;
10107 static bool
10108 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10110 rtx rperm[MAX_VECT_LEN], sel;
10111 machine_mode vmode = d->vmode;
10112 unsigned int i, nelt = d->nelt;
10114 if (d->testing_p)
10115 return true;
10117 /* Generic code will try constant permutation twice: once with the
10118 original mode and again with the elements lowered to QImode.
10119 So wait and don't do the selector expansion ourselves. */
10120 if (vmode != V8QImode && vmode != V16QImode)
10121 return false;
10123 for (i = 0; i < nelt; ++i)
10125 int nunits = GET_MODE_NUNITS (vmode);
10127 /* If big-endian and two vectors, we end up with a weird mixed-endian
10128 mode on NEON. Reverse the index within each word but not the word
10129 itself. */
10130 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10131 : d->perm[i]);
10133 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10134 sel = force_reg (vmode, sel);
10136 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10137 return true;
10140 static bool
10141 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10143 /* The pattern matching functions above are written to look for a small
10144 number to begin the sequence (0, 1, N/2). If we begin with an index
10145 from the second operand, we can swap the operands. */
10146 if (d->perm[0] >= d->nelt)
10148 unsigned i, nelt = d->nelt;
10150 gcc_assert (nelt == (nelt & -nelt));
10151 for (i = 0; i < nelt; ++i)
10152 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10154 std::swap (d->op0, d->op1);
10157 if (TARGET_SIMD)
10159 if (aarch64_evpc_rev (d))
10160 return true;
10161 else if (aarch64_evpc_ext (d))
10162 return true;
10163 else if (aarch64_evpc_dup (d))
10164 return true;
10165 else if (aarch64_evpc_zip (d))
10166 return true;
10167 else if (aarch64_evpc_uzp (d))
10168 return true;
10169 else if (aarch64_evpc_trn (d))
10170 return true;
10171 return aarch64_evpc_tbl (d);
10173 return false;
10176 /* Expand a vec_perm_const pattern. */
10178 bool
10179 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10181 struct expand_vec_perm_d d;
10182 int i, nelt, which;
10184 d.target = target;
10185 d.op0 = op0;
10186 d.op1 = op1;
10188 d.vmode = GET_MODE (target);
10189 gcc_assert (VECTOR_MODE_P (d.vmode));
10190 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10191 d.testing_p = false;
10193 for (i = which = 0; i < nelt; ++i)
10195 rtx e = XVECEXP (sel, 0, i);
10196 int ei = INTVAL (e) & (2 * nelt - 1);
10197 which |= (ei < nelt ? 1 : 2);
10198 d.perm[i] = ei;
10201 switch (which)
10203 default:
10204 gcc_unreachable ();
10206 case 3:
10207 d.one_vector_p = false;
10208 if (!rtx_equal_p (op0, op1))
10209 break;
10211 /* The elements of PERM do not suggest that only the first operand
10212 is used, but both operands are identical. Allow easier matching
10213 of the permutation by folding the permutation into the single
10214 input vector. */
10215 /* Fall Through. */
10216 case 2:
10217 for (i = 0; i < nelt; ++i)
10218 d.perm[i] &= nelt - 1;
10219 d.op0 = op1;
10220 d.one_vector_p = true;
10221 break;
10223 case 1:
10224 d.op1 = op0;
10225 d.one_vector_p = true;
10226 break;
10229 return aarch64_expand_vec_perm_const_1 (&d);
10232 static bool
10233 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10234 const unsigned char *sel)
10236 struct expand_vec_perm_d d;
10237 unsigned int i, nelt, which;
10238 bool ret;
10240 d.vmode = vmode;
10241 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10242 d.testing_p = true;
10243 memcpy (d.perm, sel, nelt);
10245 /* Calculate whether all elements are in one vector. */
10246 for (i = which = 0; i < nelt; ++i)
10248 unsigned char e = d.perm[i];
10249 gcc_assert (e < 2 * nelt);
10250 which |= (e < nelt ? 1 : 2);
10253 /* If all elements are from the second vector, reindex as if from the
10254 first vector. */
10255 if (which == 2)
10256 for (i = 0; i < nelt; ++i)
10257 d.perm[i] -= nelt;
10259 /* Check whether the mask can be applied to a single vector. */
10260 d.one_vector_p = (which != 3);
10262 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10263 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10264 if (!d.one_vector_p)
10265 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10267 start_sequence ();
10268 ret = aarch64_expand_vec_perm_const_1 (&d);
10269 end_sequence ();
10271 return ret;
10275 aarch64_reverse_mask (enum machine_mode mode)
10277 /* We have to reverse each vector because we don't have
10278 a permuted load that can reverse-load according to ABI rules. */
10279 rtx mask;
10280 rtvec v = rtvec_alloc (16);
10281 int i, j;
10282 int nunits = GET_MODE_NUNITS (mode);
10283 int usize = GET_MODE_UNIT_SIZE (mode);
10285 gcc_assert (BYTES_BIG_ENDIAN);
10286 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10288 for (i = 0; i < nunits; i++)
10289 for (j = 0; j < usize; j++)
10290 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10291 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10292 return force_reg (V16QImode, mask);
10295 /* Implement MODES_TIEABLE_P. */
10297 bool
10298 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10300 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10301 return true;
10303 /* We specifically want to allow elements of "structure" modes to
10304 be tieable to the structure. This more general condition allows
10305 other rarer situations too. */
10306 if (TARGET_SIMD
10307 && aarch64_vector_mode_p (mode1)
10308 && aarch64_vector_mode_p (mode2))
10309 return true;
10311 return false;
10314 /* Return a new RTX holding the result of moving POINTER forward by
10315 AMOUNT bytes. */
10317 static rtx
10318 aarch64_move_pointer (rtx pointer, int amount)
10320 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10322 return adjust_automodify_address (pointer, GET_MODE (pointer),
10323 next, amount);
10326 /* Return a new RTX holding the result of moving POINTER forward by the
10327 size of the mode it points to. */
10329 static rtx
10330 aarch64_progress_pointer (rtx pointer)
10332 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10334 return aarch64_move_pointer (pointer, amount);
10337 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10338 MODE bytes. */
10340 static void
10341 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10342 machine_mode mode)
10344 rtx reg = gen_reg_rtx (mode);
10346 /* "Cast" the pointers to the correct mode. */
10347 *src = adjust_address (*src, mode, 0);
10348 *dst = adjust_address (*dst, mode, 0);
10349 /* Emit the memcpy. */
10350 emit_move_insn (reg, *src);
10351 emit_move_insn (*dst, reg);
10352 /* Move the pointers forward. */
10353 *src = aarch64_progress_pointer (*src);
10354 *dst = aarch64_progress_pointer (*dst);
10357 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10358 we succeed, otherwise return false. */
10360 bool
10361 aarch64_expand_movmem (rtx *operands)
10363 unsigned int n;
10364 rtx dst = operands[0];
10365 rtx src = operands[1];
10366 rtx base;
10367 bool speed_p = !optimize_function_for_size_p (cfun);
10369 /* When optimizing for size, give a better estimate of the length of a
10370 memcpy call, but use the default otherwise. */
10371 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10373 /* We can't do anything smart if the amount to copy is not constant. */
10374 if (!CONST_INT_P (operands[2]))
10375 return false;
10377 n = UINTVAL (operands[2]);
10379 /* Try to keep the number of instructions low. For cases below 16 bytes we
10380 need to make at most two moves. For cases above 16 bytes it will be one
10381 move for each 16-byte chunk, then at most two additional moves. */
10382 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10383 return false;
10385 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10386 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10388 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10389 src = adjust_automodify_address (src, VOIDmode, base, 0);
10391 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10392 1-byte chunk. */
10393 if (n < 4)
10395 if (n >= 2)
10397 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10398 n -= 2;
10401 if (n == 1)
10402 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10404 return true;
10407 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10408 4-byte chunk, partially overlapping with the previously copied chunk. */
10409 if (n < 8)
10411 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10412 n -= 4;
10413 if (n > 0)
10415 int move = n - 4;
10417 src = aarch64_move_pointer (src, move);
10418 dst = aarch64_move_pointer (dst, move);
10419 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10421 return true;
10424 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10425 them, then (if applicable) an 8-byte chunk. */
10426 while (n >= 8)
10428 if (n / 16)
10430 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10431 n -= 16;
10433 else
10435 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10436 n -= 8;
10440 /* Finish the final bytes of the copy. We can always do this in one
10441 instruction. We either copy the exact amount we need, or partially
10442 overlap with the previous chunk we copied and copy 8 bytes. */
10443 if (n == 0)
10444 return true;
10445 else if (n == 1)
10446 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10447 else if (n == 2)
10448 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10449 else if (n == 4)
10450 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10451 else
10453 if (n == 3)
10455 src = aarch64_move_pointer (src, -1);
10456 dst = aarch64_move_pointer (dst, -1);
10457 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10459 else
10461 int move = n - 8;
10463 src = aarch64_move_pointer (src, move);
10464 dst = aarch64_move_pointer (dst, move);
10465 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10469 return true;
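/* Worked example for the expander above (illustrative, not part of the
   original source): a 7-byte copy emits an SImode load/store of bytes 0-3
   followed by a second SImode load/store of bytes 3-6, overlapping one byte
   rather than issuing separate 2-byte and 1-byte tail copies.  */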
10472 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10474 static unsigned HOST_WIDE_INT
10475 aarch64_asan_shadow_offset (void)
10477 return (HOST_WIDE_INT_1 << 36);
10480 static bool
10481 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10482 unsigned int align,
10483 enum by_pieces_operation op,
10484 bool speed_p)
10486 /* STORE_BY_PIECES can be used when copying a constant string, but
10487 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10488 For now we always fail this and let the move_by_pieces code copy
10489 the string from read-only memory. */
10490 if (op == STORE_BY_PIECES)
10491 return false;
10493 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10496 static enum machine_mode
10497 aarch64_code_to_ccmode (enum rtx_code code)
10499 switch (code)
10501 case NE:
10502 return CC_DNEmode;
10504 case EQ:
10505 return CC_DEQmode;
10507 case LE:
10508 return CC_DLEmode;
10510 case LT:
10511 return CC_DLTmode;
10513 case GE:
10514 return CC_DGEmode;
10516 case GT:
10517 return CC_DGTmode;
10519 case LEU:
10520 return CC_DLEUmode;
10522 case LTU:
10523 return CC_DLTUmode;
10525 case GEU:
10526 return CC_DGEUmode;
10528 case GTU:
10529 return CC_DGTUmode;
10531 default:
10532 return CCmode;
10536 static rtx
10537 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10538 int code, tree treeop0, tree treeop1)
10540 enum machine_mode op_mode, cmp_mode, cc_mode;
10541 rtx op0, op1, cmp, target;
10542 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10543 enum insn_code icode;
10544 struct expand_operand ops[4];
10546 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10547 if (cc_mode == CCmode)
10548 return NULL_RTX;
10550 start_sequence ();
10551 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10553 op_mode = GET_MODE (op0);
10554 if (op_mode == VOIDmode)
10555 op_mode = GET_MODE (op1);
10557 switch (op_mode)
10559 case QImode:
10560 case HImode:
10561 case SImode:
10562 cmp_mode = SImode;
10563 icode = CODE_FOR_cmpsi;
10564 break;
10566 case DImode:
10567 cmp_mode = DImode;
10568 icode = CODE_FOR_cmpdi;
10569 break;
10571 default:
10572 end_sequence ();
10573 return NULL_RTX;
10576 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10577 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10578 if (!op0 || !op1)
10580 end_sequence ();
10581 return NULL_RTX;
10583 *prep_seq = get_insns ();
10584 end_sequence ();
10586 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10587 target = gen_rtx_REG (CCmode, CC_REGNUM);
10589 create_output_operand (&ops[0], target, CCmode);
10590 create_fixed_operand (&ops[1], cmp);
10591 create_fixed_operand (&ops[2], op0);
10592 create_fixed_operand (&ops[3], op1);
10594 start_sequence ();
10595 if (!maybe_expand_insn (icode, 4, ops))
10597 end_sequence ();
10598 return NULL_RTX;
10600 *gen_seq = get_insns ();
10601 end_sequence ();
10603 return gen_rtx_REG (cc_mode, CC_REGNUM);
10606 static rtx
10607 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10608 tree treeop0, tree treeop1, int bit_code)
10610 rtx op0, op1, cmp0, cmp1, target;
10611 enum machine_mode op_mode, cmp_mode, cc_mode;
10612 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10613 enum insn_code icode = CODE_FOR_ccmp_andsi;
10614 struct expand_operand ops[6];
10616 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10617 if (cc_mode == CCmode)
10618 return NULL_RTX;
10620 push_to_sequence ((rtx_insn*) *prep_seq);
10621 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10623 op_mode = GET_MODE (op0);
10624 if (op_mode == VOIDmode)
10625 op_mode = GET_MODE (op1);
10627 switch (op_mode)
10629 case QImode:
10630 case HImode:
10631 case SImode:
10632 cmp_mode = SImode;
10633 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10634 : CODE_FOR_ccmp_iorsi;
10635 break;
10637 case DImode:
10638 cmp_mode = DImode;
10639 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10640 : CODE_FOR_ccmp_iordi;
10641 break;
10643 default:
10644 end_sequence ();
10645 return NULL_RTX;
10648 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10649 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10650 if (!op0 || !op1)
10652 end_sequence ();
10653 return NULL_RTX;
10655 *prep_seq = get_insns ();
10656 end_sequence ();
10658 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10659 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10660 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10662 create_fixed_operand (&ops[0], prev);
10663 create_fixed_operand (&ops[1], target);
10664 create_fixed_operand (&ops[2], op0);
10665 create_fixed_operand (&ops[3], op1);
10666 create_fixed_operand (&ops[4], cmp0);
10667 create_fixed_operand (&ops[5], cmp1);
10669 push_to_sequence ((rtx_insn*) *gen_seq);
10670 if (!maybe_expand_insn (icode, 6, ops))
10672 end_sequence ();
10673 return NULL_RTX;
10676 *gen_seq = get_insns ();
10677 end_sequence ();
10679 return target;
10682 #undef TARGET_GEN_CCMP_FIRST
10683 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10685 #undef TARGET_GEN_CCMP_NEXT
10686 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
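/* Illustrative sketch, not part of the original source and guarded out of
   the build: the two hooks above let a chain of comparisons joined by && or
   || be expanded as one CMP followed by conditional compares, so a function
   like the one below can be evaluated without intermediate branches.  The
   assembly in the comment is only an approximation of what the ccmp
   patterns produce.  */
#if 0
static int
ccmp_example (int a, int b)
{
  /* Roughly:
       cmp   w0, #0
       ccmp  w1, #42, #4, eq   // compare b only if a == 0, else force "gt" false
       cset  w0, gt  */
  return a == 0 && b > 42;
}
#endif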
10688 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
10689 instruction fusion of some sort. */
10691 static bool
10692 aarch64_macro_fusion_p (void)
10694 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10698 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10699 should be kept together during scheduling. */
10701 static bool
10702 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10704 rtx set_dest;
10705 rtx prev_set = single_set (prev);
10706 rtx curr_set = single_set (curr);
10707 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10708 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10710 if (!aarch64_macro_fusion_p ())
10711 return false;
10713 if (simple_sets_p
10714 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10716 /* We are trying to match:
10717 prev (mov) == (set (reg r0) (const_int imm16))
10718 curr (movk) == (set (zero_extract (reg r0)
10719 (const_int 16)
10720 (const_int 16))
10721 (const_int imm16_1)) */
10723 set_dest = SET_DEST (curr_set);
10725 if (GET_CODE (set_dest) == ZERO_EXTRACT
10726 && CONST_INT_P (SET_SRC (curr_set))
10727 && CONST_INT_P (SET_SRC (prev_set))
10728 && CONST_INT_P (XEXP (set_dest, 2))
10729 && INTVAL (XEXP (set_dest, 2)) == 16
10730 && REG_P (XEXP (set_dest, 0))
10731 && REG_P (SET_DEST (prev_set))
10732 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10734 return true;
10738 if (simple_sets_p
10739 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10742 /* We're trying to match:
10743 prev (adrp) == (set (reg r1)
10744 (high (symbol_ref ("SYM"))))
10745 curr (add) == (set (reg r0)
10746 (lo_sum (reg r1)
10747 (symbol_ref ("SYM"))))
10748 Note that r0 need not necessarily be the same as r1, especially
10749 during pre-regalloc scheduling. */
10751 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10752 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10754 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10755 && REG_P (XEXP (SET_SRC (curr_set), 0))
10756 && REGNO (XEXP (SET_SRC (curr_set), 0))
10757 == REGNO (SET_DEST (prev_set))
10758 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10759 XEXP (SET_SRC (curr_set), 1)))
10760 return true;
10764 if (simple_sets_p
10765 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10768 /* We're trying to match:
10769 prev (movk) == (set (zero_extract (reg r0)
10770 (const_int 16)
10771 (const_int 32))
10772 (const_int imm16_1))
10773 curr (movk) == (set (zero_extract (reg r0)
10774 (const_int 16)
10775 (const_int 48))
10776 (const_int imm16_2)) */
10778 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10779 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10780 && REG_P (XEXP (SET_DEST (prev_set), 0))
10781 && REG_P (XEXP (SET_DEST (curr_set), 0))
10782 && REGNO (XEXP (SET_DEST (prev_set), 0))
10783 == REGNO (XEXP (SET_DEST (curr_set), 0))
10784 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10785 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10786 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10787 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10788 && CONST_INT_P (SET_SRC (prev_set))
10789 && CONST_INT_P (SET_SRC (curr_set)))
10790 return true;
10793 if (simple_sets_p
10794 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10796 /* We're trying to match:
10797 prev (adrp) == (set (reg r0)
10798 (high (symbol_ref ("SYM"))))
10799 curr (ldr) == (set (reg r1)
10800 (mem (lo_sum (reg r0)
10801 (symbol_ref ("SYM")))))
10803 curr (ldr) == (set (reg r1)
10804 (zero_extend (mem
10805 (lo_sum (reg r0)
10806 (symbol_ref ("SYM")))))) */
10807 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10808 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10810 rtx curr_src = SET_SRC (curr_set);
10812 if (GET_CODE (curr_src) == ZERO_EXTEND)
10813 curr_src = XEXP (curr_src, 0);
10815 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10816 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10817 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10818 == REGNO (SET_DEST (prev_set))
10819 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10820 XEXP (SET_SRC (prev_set), 0)))
10821 return true;
10825 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10826 && any_condjump_p (curr))
10828 enum attr_type prev_type = get_attr_type (prev);
10830 /* FIXME: this misses some cases that are considered simple arithmetic
10831 instructions for ThunderX. Simple shifts are missed here. */
10832 if (prev_type == TYPE_ALUS_SREG
10833 || prev_type == TYPE_ALUS_IMM
10834 || prev_type == TYPE_LOGICS_REG
10835 || prev_type == TYPE_LOGICS_IMM)
10836 return true;
10839 return false;
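/* Illustrative sketch, not part of the original source and guarded out of
   the build: the fusion pairs recognised above correspond to back-to-back
   instruction sequences such as the ones sketched in the comments below.
   The variable name and the exact assembly are assumptions shown only for
   illustration.  */
#if 0
static unsigned int some_global;

static unsigned int
fusion_candidates_example (void)
{
  /* AARCH64_FUSE_MOV_MOVK: building a 32-bit immediate, e.g.
       mov  w0, #0x5678
       movk w0, #0x1234, lsl #16  */
  unsigned int imm = 0x12345678u;

  /* AARCH64_FUSE_ADRP_ADD / AARCH64_FUSE_ADRP_LDR: forming or loading a
     global address, e.g.
       adrp x1, some_global
       add  x1, x1, :lo12:some_global  */
  return imm + some_global;
}
#endif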
10842 /* If MEM is in the form of [base+offset], extract the two parts
10843 of the address and store them in BASE and OFFSET; otherwise return false
10844 after clearing BASE and OFFSET. */
10846 bool
10847 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10849 rtx addr;
10851 gcc_assert (MEM_P (mem));
10853 addr = XEXP (mem, 0);
10855 if (REG_P (addr))
10857 *base = addr;
10858 *offset = const0_rtx;
10859 return true;
10862 if (GET_CODE (addr) == PLUS
10863 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10865 *base = XEXP (addr, 0);
10866 *offset = XEXP (addr, 1);
10867 return true;
10870 *base = NULL_RTX;
10871 *offset = NULL_RTX;
10873 return false;
10876 /* Types for scheduling fusion. */
10877 enum sched_fusion_type
10879 SCHED_FUSION_NONE = 0,
10880 SCHED_FUSION_LD_SIGN_EXTEND,
10881 SCHED_FUSION_LD_ZERO_EXTEND,
10882 SCHED_FUSION_LD,
10883 SCHED_FUSION_ST,
10884 SCHED_FUSION_NUM
10887 /* If INSN is a load or store whose address is in the form [base+offset],
10888 extract the two parts and store them in BASE and OFFSET. Return the
10889 scheduling fusion type of this INSN. */
10891 static enum sched_fusion_type
10892 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10894 rtx x, dest, src;
10895 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10897 gcc_assert (INSN_P (insn));
10898 x = PATTERN (insn);
10899 if (GET_CODE (x) != SET)
10900 return SCHED_FUSION_NONE;
10902 src = SET_SRC (x);
10903 dest = SET_DEST (x);
10905 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10906 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10907 return SCHED_FUSION_NONE;
10909 if (GET_CODE (src) == SIGN_EXTEND)
10911 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10912 src = XEXP (src, 0);
10913 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10914 return SCHED_FUSION_NONE;
10916 else if (GET_CODE (src) == ZERO_EXTEND)
10918 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10919 src = XEXP (src, 0);
10920 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10921 return SCHED_FUSION_NONE;
10924 if (GET_CODE (src) == MEM && REG_P (dest))
10925 extract_base_offset_in_addr (src, base, offset);
10926 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10928 fusion = SCHED_FUSION_ST;
10929 extract_base_offset_in_addr (dest, base, offset);
10931 else
10932 return SCHED_FUSION_NONE;
10934 if (*base == NULL_RTX || *offset == NULL_RTX)
10935 fusion = SCHED_FUSION_NONE;
10937 return fusion;
10940 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10942 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10943 and PRI are only calculated for these instructions. For other instructions,
10944 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
10945 other instruction types can be added by returning different priorities.
10947 It's important that irrelevant instructions get the largest FUSION_PRI. */
10949 static void
10950 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10951 int *fusion_pri, int *pri)
10953 int tmp, off_val;
10954 rtx base, offset;
10955 enum sched_fusion_type fusion;
10957 gcc_assert (INSN_P (insn));
10959 tmp = max_pri - 1;
10960 fusion = fusion_load_store (insn, &base, &offset);
10961 if (fusion == SCHED_FUSION_NONE)
10963 *pri = tmp;
10964 *fusion_pri = tmp;
10965 return;
10968 /* Set FUSION_PRI according to fusion type and base register. */
10969 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10971 /* Calculate PRI. */
10972 tmp /= 2;
10974 /* INSN with smaller offset goes first. */
10975 off_val = (int)(INTVAL (offset));
10976 if (off_val >= 0)
10977 tmp -= (off_val & 0xfffff);
10978 else
10979 tmp += ((- off_val) & 0xfffff);
10981 *pri = tmp;
10982 return;
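/* Illustrative sketch, not part of the original source and guarded out of
   the build: the priority scheme above groups loads/stores first by fusion
   type and base register (FUSION_PRI) and then orders accesses to the same
   base by ascending offset (PRI).  The helper below redoes the arithmetic
   with a made-up stand-in constant so the grouping is easier to see.  */
#if 0
static void
fusion_priority_example (int max_pri, int fusion_type, int base_regno,
			 long long offset, int *fusion_pri, int *pri)
{
  const int first_pseudo_register = 100;  /* Stand-in for FIRST_PSEUDO_REGISTER.  */
  int tmp = max_pri - 1;

  /* Same fusion type and same base register land in the same bucket.  */
  *fusion_pri = tmp - fusion_type * first_pseudo_register - base_regno;

  /* Within a bucket, smaller offsets get a larger PRI and go first.  */
  tmp /= 2;
  if (offset >= 0)
    tmp -= (int) (offset & 0xfffff);
  else
    tmp += (int) ((-offset) & 0xfffff);
  *pri = tmp;
}
#endif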
10985 /* Given OPERANDS of consecutive load/store, check if we can merge
10986 them into ldp/stp. LOAD is true if they are load instructions.
10987 MODE is the mode of memory operands. */
10989 bool
10990 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10991 enum machine_mode mode)
10993 HOST_WIDE_INT offval_1, offval_2, msize;
10994 enum reg_class rclass_1, rclass_2;
10995 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10997 if (load)
10999 mem_1 = operands[1];
11000 mem_2 = operands[3];
11001 reg_1 = operands[0];
11002 reg_2 = operands[2];
11003 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11004 if (REGNO (reg_1) == REGNO (reg_2))
11005 return false;
11007 else
11009 mem_1 = operands[0];
11010 mem_2 = operands[2];
11011 reg_1 = operands[1];
11012 reg_2 = operands[3];
11015 /* The mems cannot be volatile. */
11016 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11017 return false;
11019 /* Check if the addresses are in the form of [base+offset]. */
11020 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11021 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11022 return false;
11023 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11024 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11025 return false;
11027 /* Check if the bases are the same. */
11028 if (!rtx_equal_p (base_1, base_2))
11029 return false;
11031 offval_1 = INTVAL (offset_1);
11032 offval_2 = INTVAL (offset_2);
11033 msize = GET_MODE_SIZE (mode);
11034 /* Check if the offsets are consecutive. */
11035 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11036 return false;
11038 /* Check if the addresses are clobbered by load. */
11039 if (load)
11041 if (reg_mentioned_p (reg_1, mem_1))
11042 return false;
11044 /* In increasing order, the last load can clobber the address. */
11045 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11046 return false;
11049 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11050 rclass_1 = FP_REGS;
11051 else
11052 rclass_1 = GENERAL_REGS;
11054 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11055 rclass_2 = FP_REGS;
11056 else
11057 rclass_2 = GENERAL_REGS;
11059 /* Check if the registers are of the same class. */
11060 if (rclass_1 != rclass_2)
11061 return false;
11063 return true;
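/* Illustrative sketch, not part of the original source and guarded out of
   the build: when the checks above succeed, two accesses with the same base
   and consecutive offsets can be emitted as a single ldp/stp.  The assembly
   in the comment is an assumption shown only for illustration.  */
#if 0
static long long
ldp_candidate_example (const int *p)
{
  /* Two SImode loads from [p] and [p, #4], e.g.
       ldr w1, [x0]
       ldr w2, [x0, 4]
     can be paired into
       ldp w1, w2, [x0]  */
  return (long long) p[0] + p[1];
}
#endif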
11066 /* Given OPERANDS of consecutive load/store, check if we can merge
11067 them into ldp/stp by adjusting the offset. LOAD is true if they
11068 are load instructions. MODE is the mode of memory operands.
11070 Given the following consecutive stores:
11072 str w1, [xb, 0x100]
11073 str w1, [xb, 0x104]
11074 str w1, [xb, 0x108]
11075 str w1, [xb, 0x10c]
11077 Though the offsets are out of the range supported by stp, we can
11078 still pair them after adjusting the offset, like:
11080 add scratch, xb, 0x100
11081 stp w1, w1, [scratch]
11082 stp w1, w1, [scratch, 0x8]
11084 The peephole patterns detecting this opportunity should guarantee
11085 the scratch register is available. */
11087 bool
11088 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11089 enum machine_mode mode)
11091 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11092 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11093 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11094 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11096 if (load)
11098 reg_1 = operands[0];
11099 mem_1 = operands[1];
11100 reg_2 = operands[2];
11101 mem_2 = operands[3];
11102 reg_3 = operands[4];
11103 mem_3 = operands[5];
11104 reg_4 = operands[6];
11105 mem_4 = operands[7];
11106 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11107 && REG_P (reg_3) && REG_P (reg_4));
11108 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11109 return false;
11111 else
11113 mem_1 = operands[0];
11114 reg_1 = operands[1];
11115 mem_2 = operands[2];
11116 reg_2 = operands[3];
11117 mem_3 = operands[4];
11118 reg_3 = operands[5];
11119 mem_4 = operands[6];
11120 reg_4 = operands[7];
11122 /* Skip if the memory operand is by itself valid for ldp/stp. */
11123 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11124 return false;
11126 /* The mems cannot be volatile. */
11127 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11128 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11129 return false;
11131 /* Check if the addresses are in the form of [base+offset]. */
11132 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11133 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11134 return false;
11135 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11136 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11137 return false;
11138 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11139 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11140 return false;
11141 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11142 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11143 return false;
11145 /* Check if the bases are the same. */
11146 if (!rtx_equal_p (base_1, base_2)
11147 || !rtx_equal_p (base_2, base_3)
11148 || !rtx_equal_p (base_3, base_4))
11149 return false;
11151 offval_1 = INTVAL (offset_1);
11152 offval_2 = INTVAL (offset_2);
11153 offval_3 = INTVAL (offset_3);
11154 offval_4 = INTVAL (offset_4);
11155 msize = GET_MODE_SIZE (mode);
11156 /* Check if the offsets are consecutive. */
11157 if ((offval_1 != (offval_2 + msize)
11158 || offval_1 != (offval_3 + msize * 2)
11159 || offval_1 != (offval_4 + msize * 3))
11160 && (offval_4 != (offval_3 + msize)
11161 || offval_4 != (offval_2 + msize * 2)
11162 || offval_4 != (offval_1 + msize * 3)))
11163 return false;
11165 /* Check if the addresses are clobbered by load. */
11166 if (load)
11168 if (reg_mentioned_p (reg_1, mem_1)
11169 || reg_mentioned_p (reg_2, mem_2)
11170 || reg_mentioned_p (reg_3, mem_3))
11171 return false;
11173 /* In increasing order, the last load can clobber the address. */
11174 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11175 return false;
11178 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11179 rclass_1 = FP_REGS;
11180 else
11181 rclass_1 = GENERAL_REGS;
11183 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11184 rclass_2 = FP_REGS;
11185 else
11186 rclass_2 = GENERAL_REGS;
11188 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11189 rclass_3 = FP_REGS;
11190 else
11191 rclass_3 = GENERAL_REGS;
11193 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11194 rclass_4 = FP_REGS;
11195 else
11196 rclass_4 = GENERAL_REGS;
11198 /* Check if the registers are of the same class. */
11199 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11200 return false;
11202 return true;
11205 /* Given OPERANDS of consecutive load/store, this function pairs them
11206 into ldp/stp after adjusting the offset. It depends on the fact
11207 that addresses of load/store instructions are in increasing order.
11208 MODE is the mode of the memory operands. CODE is the rtl operator
11209 which should be applied to all memory operands; it is SIGN_EXTEND,
11210 ZERO_EXTEND or UNKNOWN. */
11212 bool
11213 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11214 enum machine_mode mode, RTX_CODE code)
11216 rtx base, offset, t1, t2;
11217 rtx mem_1, mem_2, mem_3, mem_4;
11218 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11220 if (load)
11222 mem_1 = operands[1];
11223 mem_2 = operands[3];
11224 mem_3 = operands[5];
11225 mem_4 = operands[7];
11227 else
11229 mem_1 = operands[0];
11230 mem_2 = operands[2];
11231 mem_3 = operands[4];
11232 mem_4 = operands[6];
11233 gcc_assert (code == UNKNOWN);
11236 extract_base_offset_in_addr (mem_1, &base, &offset);
11237 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11239 /* Adjust the offset so it can fit in an ldp/stp instruction. */
11240 msize = GET_MODE_SIZE (mode);
11241 stp_off_limit = msize * 0x40;
11242 off_val = INTVAL (offset);
11243 abs_off = (off_val < 0) ? -off_val : off_val;
11244 new_off = abs_off % stp_off_limit;
11245 adj_off = abs_off - new_off;
11247 /* Further adjust to make sure all offsets are OK. */
11248 if ((new_off + msize * 2) >= stp_off_limit)
11250 adj_off += stp_off_limit;
11251 new_off -= stp_off_limit;
11254 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11255 if (adj_off >= 0x1000)
11256 return false;
11258 if (off_val < 0)
11260 adj_off = -adj_off;
11261 new_off = -new_off;
11264 /* Create new memory references. */
11265 mem_1 = change_address (mem_1, VOIDmode,
11266 plus_constant (DImode, operands[8], new_off));
11268 /* Check if the adjusted address is OK for ldp/stp. */
11269 if (!aarch64_mem_pair_operand (mem_1, mode))
11270 return false;
11272 msize = GET_MODE_SIZE (mode);
11273 mem_2 = change_address (mem_2, VOIDmode,
11274 plus_constant (DImode,
11275 operands[8],
11276 new_off + msize));
11277 mem_3 = change_address (mem_3, VOIDmode,
11278 plus_constant (DImode,
11279 operands[8],
11280 new_off + msize * 2));
11281 mem_4 = change_address (mem_4, VOIDmode,
11282 plus_constant (DImode,
11283 operands[8],
11284 new_off + msize * 3));
11286 if (code == ZERO_EXTEND)
11288 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11289 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11290 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11291 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11293 else if (code == SIGN_EXTEND)
11295 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11296 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11297 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11298 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11301 if (load)
11303 operands[1] = mem_1;
11304 operands[3] = mem_2;
11305 operands[5] = mem_3;
11306 operands[7] = mem_4;
11308 else
11310 operands[0] = mem_1;
11311 operands[2] = mem_2;
11312 operands[4] = mem_3;
11313 operands[6] = mem_4;
11316 /* Emit adjusting instruction. */
11317 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11318 plus_constant (DImode, base, adj_off)));
11319 /* Emit ldp/stp instructions. */
11320 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11321 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11322 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11323 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11324 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11325 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11326 return true;
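/* Illustrative sketch, not part of the original source and guarded out of
   the build: the offset adjustment above splits a large offset into a base
   adjustment that an ADD/SUB immediate can materialise and a small residue
   that fits the ldp/stp immediate range.  The helper below repeats that
   arithmetic for a single offset; the names are made up for the example.  */
#if 0
static int
split_ldpstp_offset (long long off_val, long long msize,
		     long long *adj_off, long long *new_off)
{
  long long stp_off_limit = msize * 0x40;
  long long abs_off = (off_val < 0) ? -off_val : off_val;

  *new_off = abs_off % stp_off_limit;
  *adj_off = abs_off - *new_off;

  /* Keep room for the second pair at NEW_OFF + 2 * MSIZE.  */
  if (*new_off + msize * 2 >= stp_off_limit)
    {
      *adj_off += stp_off_limit;
      *new_off -= stp_off_limit;
    }

  /* The adjustment itself must be a legal ADD/SUB immediate.  */
  if (*adj_off >= 0x1000)
    return 0;

  if (off_val < 0)
    {
      *adj_off = -*adj_off;
      *new_off = -*new_off;
    }
  return 1;
}
#endif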
11329 #undef TARGET_ADDRESS_COST
11330 #define TARGET_ADDRESS_COST aarch64_address_cost
11332 /* This hook determines whether unnamed bitfields affect the alignment
11333 of the containing structure. The hook returns true if the structure
11334 should inherit the alignment requirements of an unnamed bitfield's
11335 type. */
11336 #undef TARGET_ALIGN_ANON_BITFIELD
11337 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11339 #undef TARGET_ASM_ALIGNED_DI_OP
11340 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11342 #undef TARGET_ASM_ALIGNED_HI_OP
11343 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11345 #undef TARGET_ASM_ALIGNED_SI_OP
11346 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11348 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11349 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11350 hook_bool_const_tree_hwi_hwi_const_tree_true
11352 #undef TARGET_ASM_FILE_START
11353 #define TARGET_ASM_FILE_START aarch64_start_file
11355 #undef TARGET_ASM_OUTPUT_MI_THUNK
11356 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11358 #undef TARGET_ASM_SELECT_RTX_SECTION
11359 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11361 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11362 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11364 #undef TARGET_BUILD_BUILTIN_VA_LIST
11365 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11367 #undef TARGET_CALLEE_COPIES
11368 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11370 #undef TARGET_CAN_ELIMINATE
11371 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11373 #undef TARGET_CANNOT_FORCE_CONST_MEM
11374 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11376 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11377 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11379 /* Only the least significant bit is used for initialization guard
11380 variables. */
11381 #undef TARGET_CXX_GUARD_MASK_BIT
11382 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11384 #undef TARGET_C_MODE_FOR_SUFFIX
11385 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11387 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11388 #undef TARGET_DEFAULT_TARGET_FLAGS
11389 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11390 #endif
11392 #undef TARGET_CLASS_MAX_NREGS
11393 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11395 #undef TARGET_BUILTIN_DECL
11396 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11398 #undef TARGET_EXPAND_BUILTIN
11399 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11401 #undef TARGET_EXPAND_BUILTIN_VA_START
11402 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11404 #undef TARGET_FOLD_BUILTIN
11405 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11407 #undef TARGET_FUNCTION_ARG
11408 #define TARGET_FUNCTION_ARG aarch64_function_arg
11410 #undef TARGET_FUNCTION_ARG_ADVANCE
11411 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11413 #undef TARGET_FUNCTION_ARG_BOUNDARY
11414 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11416 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11417 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11419 #undef TARGET_FUNCTION_VALUE
11420 #define TARGET_FUNCTION_VALUE aarch64_function_value
11422 #undef TARGET_FUNCTION_VALUE_REGNO_P
11423 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11425 #undef TARGET_FRAME_POINTER_REQUIRED
11426 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11428 #undef TARGET_GIMPLE_FOLD_BUILTIN
11429 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11431 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11432 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11434 #undef TARGET_INIT_BUILTINS
11435 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11437 #undef TARGET_LEGITIMATE_ADDRESS_P
11438 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11440 #undef TARGET_LEGITIMATE_CONSTANT_P
11441 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11443 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11444 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11446 #undef TARGET_LRA_P
11447 #define TARGET_LRA_P hook_bool_void_true
11449 #undef TARGET_MANGLE_TYPE
11450 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11452 #undef TARGET_MEMORY_MOVE_COST
11453 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11455 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11456 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11458 #undef TARGET_MUST_PASS_IN_STACK
11459 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11461 /* This target hook should return true if accesses to volatile bitfields
11462 should use the narrowest mode possible. It should return false if these
11463 accesses should use the bitfield container type. */
11464 #undef TARGET_NARROW_VOLATILE_BITFIELD
11465 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11467 #undef TARGET_OPTION_OVERRIDE
11468 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11470 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11471 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11472 aarch64_override_options_after_change
11474 #undef TARGET_PASS_BY_REFERENCE
11475 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11477 #undef TARGET_PREFERRED_RELOAD_CLASS
11478 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11480 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11481 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11483 #undef TARGET_SECONDARY_RELOAD
11484 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11486 #undef TARGET_SHIFT_TRUNCATION_MASK
11487 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11489 #undef TARGET_SETUP_INCOMING_VARARGS
11490 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11492 #undef TARGET_STRUCT_VALUE_RTX
11493 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11495 #undef TARGET_REGISTER_MOVE_COST
11496 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11498 #undef TARGET_RETURN_IN_MEMORY
11499 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11501 #undef TARGET_RETURN_IN_MSB
11502 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11504 #undef TARGET_RTX_COSTS
11505 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11507 #undef TARGET_SCHED_ISSUE_RATE
11508 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11510 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11511 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11512 aarch64_sched_first_cycle_multipass_dfa_lookahead
11514 #undef TARGET_TRAMPOLINE_INIT
11515 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11517 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11518 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11520 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11521 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11523 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11524 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11526 #undef TARGET_VECTORIZE_ADD_STMT_COST
11527 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11529 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11530 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11531 aarch64_builtin_vectorization_cost
11533 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11534 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11536 #undef TARGET_VECTORIZE_BUILTINS
11537 #define TARGET_VECTORIZE_BUILTINS
11539 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11540 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11541 aarch64_builtin_vectorized_function
11543 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11544 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11545 aarch64_autovectorize_vector_sizes
11547 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11548 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11549 aarch64_atomic_assign_expand_fenv
11551 /* Section anchor support. */
11553 #undef TARGET_MIN_ANCHOR_OFFSET
11554 #define TARGET_MIN_ANCHOR_OFFSET -256
11556 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11557 byte offset; we can do much more for larger data types, but have no way
11558 to determine the size of the access. We assume accesses are aligned. */
11559 #undef TARGET_MAX_ANCHOR_OFFSET
11560 #define TARGET_MAX_ANCHOR_OFFSET 4095
11562 #undef TARGET_VECTOR_ALIGNMENT
11563 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11565 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11566 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11567 aarch64_simd_vector_alignment_reachable
11569 /* vec_perm support. */
11571 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11572 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11573 aarch64_vectorize_vec_perm_const_ok
11576 #undef TARGET_FIXED_CONDITION_CODE_REGS
11577 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11579 #undef TARGET_FLAGS_REGNUM
11580 #define TARGET_FLAGS_REGNUM CC_REGNUM
11582 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11583 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11585 #undef TARGET_ASAN_SHADOW_OFFSET
11586 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11588 #undef TARGET_LEGITIMIZE_ADDRESS
11589 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11591 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11592 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11593 aarch64_use_by_pieces_infrastructure_p
11595 #undef TARGET_CAN_USE_DOLOOP_P
11596 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11598 #undef TARGET_SCHED_MACRO_FUSION_P
11599 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11601 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11602 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11604 #undef TARGET_SCHED_FUSION_PRIORITY
11605 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11607 struct gcc_target targetm = TARGET_INITIALIZER;
11609 #include "gt-aarch64.h"