gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "alias.h"
29 #include "symtab.h"
30 #include "tree.h"
31 #include "fold-const.h"
32 #include "stringpool.h"
33 #include "stor-layout.h"
34 #include "calls.h"
35 #include "varasm.h"
36 #include "regs.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfgrtl.h"
40 #include "cfganal.h"
41 #include "lcm.h"
42 #include "cfgbuild.h"
43 #include "cfgcleanup.h"
44 #include "predict.h"
45 #include "basic-block.h"
46 #include "df.h"
47 #include "hard-reg-set.h"
48 #include "output.h"
49 #include "function.h"
50 #include "flags.h"
51 #include "insn-config.h"
52 #include "expmed.h"
53 #include "dojump.h"
54 #include "explow.h"
55 #include "emit-rtl.h"
56 #include "stmt.h"
57 #include "expr.h"
58 #include "reload.h"
59 #include "toplev.h"
60 #include "target.h"
61 #include "targhooks.h"
62 #include "tm_p.h"
63 #include "recog.h"
64 #include "langhooks.h"
65 #include "diagnostic-core.h"
66 #include "tree-ssa-alias.h"
67 #include "internal-fn.h"
68 #include "gimple-fold.h"
69 #include "tree-eh.h"
70 #include "gimple-expr.h"
71 #include "gimple.h"
72 #include "gimplify.h"
73 #include "optabs.h"
74 #include "dwarf2.h"
75 #include "cfgloop.h"
76 #include "tree-vectorizer.h"
77 #include "aarch64-cost-tables.h"
78 #include "dumpfile.h"
79 #include "builtins.h"
80 #include "rtl-iter.h"
81 #include "tm-constrs.h"
82 #include "sched-int.h"
83 #include "cortex-a57-fma-steering.h"
85 #include "target-def.h"
87 /* Defined for convenience. */
88 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
90 /* Classifies an address.
92 ADDRESS_REG_IMM
93 A simple base register plus immediate offset.
95 ADDRESS_REG_WB
96 A base register indexed by immediate offset with writeback.
98 ADDRESS_REG_REG
99 A base register indexed by (optionally scaled) register.
101 ADDRESS_REG_UXTW
102 A base register indexed by (optionally scaled) zero-extended register.
104 ADDRESS_REG_SXTW
105 A base register indexed by (optionally scaled) sign-extended register.
107 ADDRESS_LO_SUM
108 A LO_SUM rtx with a base register and "LO12" symbol relocation.
 110   ADDRESS_SYMBOLIC
111 A constant symbolic address, in pc-relative literal pool. */
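/* For illustration (an editor's sketch, not part of the original source):
   typical assembly forms that fall into these classes are

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!    or    ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, label   (pc-relative literal load)  */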
113 enum aarch64_address_type {
114 ADDRESS_REG_IMM,
115 ADDRESS_REG_WB,
116 ADDRESS_REG_REG,
117 ADDRESS_REG_UXTW,
118 ADDRESS_REG_SXTW,
119 ADDRESS_LO_SUM,
120 ADDRESS_SYMBOLIC
123 struct aarch64_address_info {
124 enum aarch64_address_type type;
125 rtx base;
126 rtx offset;
127 int shift;
128 enum aarch64_symbol_type symbol_type;
131 struct simd_immediate_info
133 rtx value;
134 int shift;
135 int element_width;
136 bool mvn;
137 bool msl;
140 /* The current code model. */
141 enum aarch64_code_model aarch64_cmodel;
143 #ifdef HAVE_AS_TLS
144 #undef TARGET_HAVE_TLS
145 #define TARGET_HAVE_TLS 1
146 #endif
148 static bool aarch64_composite_type_p (const_tree, machine_mode);
149 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
150 const_tree,
151 machine_mode *, int *,
152 bool *);
153 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
154 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
155 static void aarch64_override_options_after_change (void);
156 static bool aarch64_vector_mode_supported_p (machine_mode);
157 static unsigned bit_count (unsigned HOST_WIDE_INT);
158 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
159 const unsigned char *sel);
160 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 /* Major revision number of the ARM Architecture implemented by the target. */
163 unsigned aarch64_architecture_version;
165 /* The processor for which instructions should be scheduled. */
166 enum aarch64_processor aarch64_tune = cortexa53;
168 /* The current tuning set. */
169 const struct tune_params *aarch64_tune_params;
171 /* Mask to specify which instructions we are allowed to generate. */
172 unsigned long aarch64_isa_flags = 0;
174 /* Mask to specify which instruction scheduling options should be used. */
175 unsigned long aarch64_tune_flags = 0;
177 /* Tuning parameters. */
179 static const struct cpu_addrcost_table generic_addrcost_table =
182 0, /* hi */
183 0, /* si */
184 0, /* di */
185 0, /* ti */
187 0, /* pre_modify */
188 0, /* post_modify */
189 0, /* register_offset */
190 0, /* register_extend */
191 0 /* imm_offset */
194 static const struct cpu_addrcost_table cortexa57_addrcost_table =
197 1, /* hi */
198 0, /* si */
199 0, /* di */
200 1, /* ti */
202 0, /* pre_modify */
203 0, /* post_modify */
204 0, /* register_offset */
205 0, /* register_extend */
206 0, /* imm_offset */
209 static const struct cpu_addrcost_table xgene1_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 1, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 1, /* register_extend */
221 0, /* imm_offset */
224 static const struct cpu_regmove_cost generic_regmove_cost =
226 1, /* GP2GP */
227 /* Avoid the use of slow int<->fp moves for spilling by setting
228 their cost higher than memmov_cost. */
229 5, /* GP2FP */
230 5, /* FP2GP */
231 2 /* FP2FP */
234 static const struct cpu_regmove_cost cortexa57_regmove_cost =
236 1, /* GP2GP */
237 /* Avoid the use of slow int<->fp moves for spilling by setting
238 their cost higher than memmov_cost. */
239 5, /* GP2FP */
240 5, /* FP2GP */
241 2 /* FP2FP */
244 static const struct cpu_regmove_cost cortexa53_regmove_cost =
246 1, /* GP2GP */
247 /* Avoid the use of slow int<->fp moves for spilling by setting
248 their cost higher than memmov_cost. */
249 5, /* GP2FP */
250 5, /* FP2GP */
251 2 /* FP2FP */
254 static const struct cpu_regmove_cost thunderx_regmove_cost =
256 2, /* GP2GP */
257 2, /* GP2FP */
258 6, /* FP2GP */
259 4 /* FP2FP */
262 static const struct cpu_regmove_cost xgene1_regmove_cost =
264 1, /* GP2GP */
265 /* Avoid the use of slow int<->fp moves for spilling by setting
266 their cost higher than memmov_cost. */
267 8, /* GP2FP */
268 8, /* FP2GP */
269 2 /* FP2FP */
272 /* Generic costs for vector insn classes. */
273 static const struct cpu_vector_cost generic_vector_cost =
275 1, /* scalar_stmt_cost */
276 1, /* scalar_load_cost */
277 1, /* scalar_store_cost */
278 1, /* vec_stmt_cost */
279 1, /* vec_to_scalar_cost */
280 1, /* scalar_to_vec_cost */
281 1, /* vec_align_load_cost */
282 1, /* vec_unalign_load_cost */
283 1, /* vec_unalign_store_cost */
284 1, /* vec_store_cost */
285 3, /* cond_taken_branch_cost */
286 1 /* cond_not_taken_branch_cost */
 289 /* Costs for vector insn classes for Cortex-A57. */
290 static const struct cpu_vector_cost cortexa57_vector_cost =
292 1, /* scalar_stmt_cost */
293 4, /* scalar_load_cost */
294 1, /* scalar_store_cost */
295 3, /* vec_stmt_cost */
296 8, /* vec_to_scalar_cost */
297 8, /* scalar_to_vec_cost */
298 5, /* vec_align_load_cost */
299 5, /* vec_unalign_load_cost */
300 1, /* vec_unalign_store_cost */
301 1, /* vec_store_cost */
302 1, /* cond_taken_branch_cost */
303 1 /* cond_not_taken_branch_cost */
 306 /* Costs for vector insn classes for X-Gene 1. */
307 static const struct cpu_vector_cost xgene1_vector_cost =
309 1, /* scalar_stmt_cost */
310 5, /* scalar_load_cost */
311 1, /* scalar_store_cost */
312 2, /* vec_stmt_cost */
313 4, /* vec_to_scalar_cost */
314 4, /* scalar_to_vec_cost */
315 10, /* vec_align_load_cost */
316 10, /* vec_unalign_load_cost */
317 2, /* vec_unalign_store_cost */
318 2, /* vec_store_cost */
319 2, /* cond_taken_branch_cost */
320 1 /* cond_not_taken_branch_cost */
323 #define AARCH64_FUSE_NOTHING (0)
324 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
325 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
326 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
327 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
328 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
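/* For illustration (an editor's sketch, not part of the original source):
   AARCH64_FUSE_ADRP_ADD describes keeping a pair such as
       adrp x0, sym
       add  x0, x0, :lo12:sym
   adjacent, and AARCH64_FUSE_MOV_MOVK a pair such as
       mov  x0, #0x1234
       movk x0, #0x5678, lsl #16
   so that cores which can fuse them execute the pair more cheaply.  */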
330 /* Generic costs for branch instructions. */
331 static const struct cpu_branch_cost generic_branch_cost =
333 2, /* Predictable. */
334 2 /* Unpredictable. */
337 static const struct tune_params generic_tunings =
339 &cortexa57_extra_costs,
340 &generic_addrcost_table,
341 &generic_regmove_cost,
342 &generic_vector_cost,
343 &generic_branch_cost,
344 4, /* memmov_cost */
345 2, /* issue_rate */
346 AARCH64_FUSE_NOTHING, /* fusible_ops */
347 8, /* function_align. */
348 8, /* jump_align. */
349 4, /* loop_align. */
350 2, /* int_reassoc_width. */
351 4, /* fp_reassoc_width. */
352 1, /* vec_reassoc_width. */
353 2, /* min_div_recip_mul_sf. */
354 2 /* min_div_recip_mul_df. */
357 static const struct tune_params cortexa53_tunings =
359 &cortexa53_extra_costs,
360 &generic_addrcost_table,
361 &cortexa53_regmove_cost,
362 &generic_vector_cost,
363 &generic_branch_cost,
364 4, /* memmov_cost */
365 2, /* issue_rate */
366 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
367 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
368 8, /* function_align. */
369 8, /* jump_align. */
370 4, /* loop_align. */
371 2, /* int_reassoc_width. */
372 4, /* fp_reassoc_width. */
373 1, /* vec_reassoc_width. */
374 2, /* min_div_recip_mul_sf. */
375 2 /* min_div_recip_mul_df. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 &generic_branch_cost,
385 4, /* memmov_cost */
386 3, /* issue_rate */
387 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
388 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
389 16, /* function_align. */
390 8, /* jump_align. */
391 4, /* loop_align. */
392 2, /* int_reassoc_width. */
393 4, /* fp_reassoc_width. */
394 1, /* vec_reassoc_width. */
395 2, /* min_div_recip_mul_sf. */
396 2 /* min_div_recip_mul_df. */
399 static const struct tune_params thunderx_tunings =
401 &thunderx_extra_costs,
402 &generic_addrcost_table,
403 &thunderx_regmove_cost,
404 &generic_vector_cost,
405 &generic_branch_cost,
406 6, /* memmov_cost */
407 2, /* issue_rate */
408 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
409 8, /* function_align. */
410 8, /* jump_align. */
411 8, /* loop_align. */
412 2, /* int_reassoc_width. */
413 4, /* fp_reassoc_width. */
414 1, /* vec_reassoc_width. */
415 2, /* min_div_recip_mul_sf. */
416 2 /* min_div_recip_mul_df. */
419 static const struct tune_params xgene1_tunings =
421 &xgene1_extra_costs,
422 &xgene1_addrcost_table,
423 &xgene1_regmove_cost,
424 &xgene1_vector_cost,
425 &generic_branch_cost,
426 6, /* memmov_cost */
427 4, /* issue_rate */
428 AARCH64_FUSE_NOTHING, /* fusible_ops */
429 16, /* function_align. */
430 8, /* jump_align. */
431 16, /* loop_align. */
432 2, /* int_reassoc_width. */
433 4, /* fp_reassoc_width. */
434 1, /* vec_reassoc_width. */
435 2, /* min_div_recip_mul_sf. */
436 2 /* min_div_recip_mul_df. */
439 /* A processor implementing AArch64. */
440 struct processor
442 const char *const name;
443 enum aarch64_processor core;
444 const char *arch;
445 unsigned architecture_version;
446 const unsigned long flags;
447 const struct tune_params *const tune;
450 /* Processor cores implementing AArch64. */
451 static const struct processor all_cores[] =
453 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
454 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
455 #include "aarch64-cores.def"
456 #undef AARCH64_CORE
457 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
458 {NULL, aarch64_none, NULL, 0, 0, NULL}
461 /* Architectures implementing AArch64. */
462 static const struct processor all_architectures[] =
464 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
465 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
466 #include "aarch64-arches.def"
467 #undef AARCH64_ARCH
468 {NULL, aarch64_none, NULL, 0, 0, NULL}
 471 /* Target specification. These are populated as command-line arguments
 472 are processed, or NULL if not specified. */
473 static const struct processor *selected_arch;
474 static const struct processor *selected_cpu;
475 static const struct processor *selected_tune;
477 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
479 /* An ISA extension in the co-processor and main instruction set space. */
480 struct aarch64_option_extension
482 const char *const name;
483 const unsigned long flags_on;
484 const unsigned long flags_off;
487 /* ISA extensions in AArch64. */
488 static const struct aarch64_option_extension all_extensions[] =
490 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
491 {NAME, FLAGS_ON, FLAGS_OFF},
492 #include "aarch64-option-extensions.def"
493 #undef AARCH64_OPT_EXTENSION
494 {NULL, 0, 0}
497 /* Used to track the size of an address when generating a pre/post
498 increment address. */
499 static machine_mode aarch64_memory_reference_mode;
501 /* A table of valid AArch64 "bitmask immediate" values for
502 logical instructions. */
504 #define AARCH64_NUM_BITMASKS 5334
505 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
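/* Editor's note (illustrative, not in the original source): AArch64 logical
   instructions only accept immediates that are a replicated, rotated run of
   ones, e.g. 0x00ff00ff00ff00ff or 0x0003fffc0003fffc.  There are 5334 such
   distinct 64-bit values, hence AARCH64_NUM_BITMASKS above; values such as 0
   or an arbitrary constant like 0x12345678 are not encodable this way.  */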
507 typedef enum aarch64_cond_code
509 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
510 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
511 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
513 aarch64_cc;
515 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
517 /* The condition codes of the processor, and the inverse function. */
518 static const char * const aarch64_condition_codes[] =
520 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
521 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
524 void
525 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
527 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
528 if (TARGET_GENERAL_REGS_ONLY)
529 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
530 else
531 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
534 static unsigned int
535 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
537 if (GET_MODE_UNIT_SIZE (mode) == 4)
538 return aarch64_tune_params->min_div_recip_mul_sf;
539 return aarch64_tune_params->min_div_recip_mul_df;
542 static int
543 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
544 enum machine_mode mode)
546 if (VECTOR_MODE_P (mode))
547 return aarch64_tune_params->vec_reassoc_width;
548 if (INTEGRAL_MODE_P (mode))
549 return aarch64_tune_params->int_reassoc_width;
550 if (FLOAT_MODE_P (mode))
551 return aarch64_tune_params->fp_reassoc_width;
552 return 1;
555 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
556 unsigned
557 aarch64_dbx_register_number (unsigned regno)
559 if (GP_REGNUM_P (regno))
560 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
561 else if (regno == SP_REGNUM)
562 return AARCH64_DWARF_SP;
563 else if (FP_REGNUM_P (regno))
564 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
566 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
567 equivalent DWARF register. */
568 return DWARF_FRAME_REGISTERS;
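/* Editor's note (illustrative, assuming the usual AArch64 DWARF numbering
   where AARCH64_DWARF_R0 is 0, AARCH64_DWARF_SP is 31 and AARCH64_DWARF_V0
   is 64): x0-x30 map to DWARF registers 0-30, sp to 31, and v0-v31 to 64-95;
   any other register reports DWARF_FRAME_REGISTERS, i.e. "no equivalent
   DWARF register".  */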
571 /* Return TRUE if MODE is any of the large INT modes. */
572 static bool
573 aarch64_vect_struct_mode_p (machine_mode mode)
575 return mode == OImode || mode == CImode || mode == XImode;
578 /* Return TRUE if MODE is any of the vector modes. */
579 static bool
580 aarch64_vector_mode_p (machine_mode mode)
582 return aarch64_vector_mode_supported_p (mode)
583 || aarch64_vect_struct_mode_p (mode);
586 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
587 static bool
588 aarch64_array_mode_supported_p (machine_mode mode,
589 unsigned HOST_WIDE_INT nelems)
591 if (TARGET_SIMD
592 && AARCH64_VALID_SIMD_QREG_MODE (mode)
593 && (nelems >= 2 && nelems <= 4))
594 return true;
596 return false;
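/* Editor's illustration (not in the original source): this hook is what lets
   arrays of 2, 3 or 4 full 128-bit Advanced SIMD vectors be given the
   OI/CI/XI opaque modes, matching the register lists used by LD2/ST2,
   LD3/ST3 and LD4/ST4.  */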
599 /* Implement HARD_REGNO_NREGS. */
602 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
604 switch (aarch64_regno_regclass (regno))
606 case FP_REGS:
607 case FP_LO_REGS:
608 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
609 default:
610 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
612 gcc_unreachable ();
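/* Editor's example (a sketch, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16 as on AArch64): a TImode value occupies two general
   registers, while a V4SImode value fits in a single FP/SIMD register and an
   OImode structure mode needs two.  */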
615 /* Implement HARD_REGNO_MODE_OK. */
618 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
620 if (GET_MODE_CLASS (mode) == MODE_CC)
621 return regno == CC_REGNUM;
623 if (regno == SP_REGNUM)
624 /* The purpose of comparing with ptr_mode is to support the
625 global register variable associated with the stack pointer
626 register via the syntax of asm ("wsp") in ILP32. */
627 return mode == Pmode || mode == ptr_mode;
629 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
630 return mode == Pmode;
632 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
633 return 1;
635 if (FP_REGNUM_P (regno))
637 if (aarch64_vect_struct_mode_p (mode))
638 return
639 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
640 else
641 return 1;
644 return 0;
647 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
648 machine_mode
649 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
650 machine_mode mode)
652 /* Handle modes that fit within single registers. */
653 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
655 if (GET_MODE_SIZE (mode) >= 4)
656 return mode;
657 else
658 return SImode;
660 /* Fall back to generic for multi-reg and very large modes. */
661 else
662 return choose_hard_reg_mode (regno, nregs, false);
665 /* Return true if calls to DECL should be treated as
 666 long-calls (i.e. called via a register). */
667 static bool
668 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
670 return false;
673 /* Return true if calls to symbol-ref SYM should be treated as
 674 long-calls (i.e. called via a register). */
675 bool
676 aarch64_is_long_call_p (rtx sym)
678 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
681 /* Return true if the offsets to a zero/sign-extract operation
682 represent an expression that matches an extend operation. The
 683 operands represent the parameters from
685 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
686 bool
687 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
688 rtx extract_imm)
690 HOST_WIDE_INT mult_val, extract_val;
692 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
693 return false;
695 mult_val = INTVAL (mult_imm);
696 extract_val = INTVAL (extract_imm);
698 if (extract_val > 8
699 && extract_val < GET_MODE_BITSIZE (mode)
700 && exact_log2 (extract_val & ~7) > 0
701 && (extract_val & 7) <= 4
702 && mult_val == (1 << (extract_val & 7)))
703 return true;
705 return false;
708 /* Emit an insn that's a simple single-set. Both the operands must be
709 known to be valid. */
710 inline static rtx
711 emit_set_insn (rtx x, rtx y)
713 return emit_insn (gen_rtx_SET (x, y));
716 /* X and Y are two things to compare using CODE. Emit the compare insn and
717 return the rtx for register 0 in the proper mode. */
719 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
721 machine_mode mode = SELECT_CC_MODE (code, x, y);
722 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
724 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
725 return cc_reg;
728 /* Build the SYMBOL_REF for __tls_get_addr. */
730 static GTY(()) rtx tls_get_addr_libfunc;
733 aarch64_tls_get_addr (void)
735 if (!tls_get_addr_libfunc)
736 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
737 return tls_get_addr_libfunc;
740 /* Return the TLS model to use for ADDR. */
742 static enum tls_model
743 tls_symbolic_operand_type (rtx addr)
745 enum tls_model tls_kind = TLS_MODEL_NONE;
746 rtx sym, addend;
748 if (GET_CODE (addr) == CONST)
750 split_const (addr, &sym, &addend);
751 if (GET_CODE (sym) == SYMBOL_REF)
752 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
754 else if (GET_CODE (addr) == SYMBOL_REF)
755 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
757 return tls_kind;
 760 /* We allow lo_sum expressions in our legitimate addresses so that
 761 combine can take care of combining addresses where necessary, but
 762 for generation purposes we generate the address
 763 as:
764 RTL Absolute
765 tmp = hi (symbol_ref); adrp x1, foo
766 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
769 PIC TLS
770 adrp x1, :got:foo adrp tmp, :tlsgd:foo
771 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
772 bl __tls_get_addr
775 Load TLS symbol, depending on TLS mechanism and TLS access model.
777 Global Dynamic - Traditional TLS:
778 adrp tmp, :tlsgd:imm
779 add dest, tmp, #:tlsgd_lo12:imm
780 bl __tls_get_addr
782 Global Dynamic - TLS Descriptors:
783 adrp dest, :tlsdesc:imm
784 ldr tmp, [dest, #:tlsdesc_lo12:imm]
785 add dest, dest, #:tlsdesc_lo12:imm
786 blr tmp
787 mrs tp, tpidr_el0
788 add dest, dest, tp
790 Initial Exec:
791 mrs tp, tpidr_el0
792 adrp tmp, :gottprel:imm
793 ldr dest, [tmp, #:gottprel_lo12:imm]
794 add dest, dest, tp
796 Local Exec:
797 mrs tp, tpidr_el0
798 add t0, tp, #:tprel_hi12:imm, lsl #12
799 add t0, t0, #:tprel_lo12_nc:imm
802 static void
803 aarch64_load_symref_appropriately (rtx dest, rtx imm,
804 enum aarch64_symbol_type type)
806 switch (type)
808 case SYMBOL_SMALL_ABSOLUTE:
810 /* In ILP32, the mode of dest can be either SImode or DImode. */
811 rtx tmp_reg = dest;
812 machine_mode mode = GET_MODE (dest);
814 gcc_assert (mode == Pmode || mode == ptr_mode);
816 if (can_create_pseudo_p ())
817 tmp_reg = gen_reg_rtx (mode);
819 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
820 emit_insn (gen_add_losym (dest, tmp_reg, imm));
821 return;
824 case SYMBOL_TINY_ABSOLUTE:
825 emit_insn (gen_rtx_SET (dest, imm));
826 return;
828 case SYMBOL_SMALL_GOT:
830 /* In ILP32, the mode of dest can be either SImode or DImode,
831 while the got entry is always of SImode size. The mode of
832 dest depends on how dest is used: if dest is assigned to a
 833 pointer (e.g. stored in memory), it has SImode; it may have
 834 DImode if dest is dereferenced to access memory.
835 This is why we have to handle three different ldr_got_small
836 patterns here (two patterns for ILP32). */
837 rtx tmp_reg = dest;
838 machine_mode mode = GET_MODE (dest);
840 if (can_create_pseudo_p ())
841 tmp_reg = gen_reg_rtx (mode);
843 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
844 if (mode == ptr_mode)
846 if (mode == DImode)
847 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
848 else
849 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
851 else
853 gcc_assert (mode == Pmode);
854 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
857 return;
860 case SYMBOL_SMALL_TLSGD:
862 rtx_insn *insns;
863 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
865 start_sequence ();
866 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
867 insns = get_insns ();
868 end_sequence ();
870 RTL_CONST_CALL_P (insns) = 1;
871 emit_libcall_block (insns, dest, result, imm);
872 return;
875 case SYMBOL_SMALL_TLSDESC:
877 machine_mode mode = GET_MODE (dest);
878 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
879 rtx tp;
881 gcc_assert (mode == Pmode || mode == ptr_mode);
883 /* In ILP32, the got entry is always of SImode size. Unlike
884 small GOT, the dest is fixed at reg 0. */
885 if (TARGET_ILP32)
886 emit_insn (gen_tlsdesc_small_si (imm));
887 else
888 emit_insn (gen_tlsdesc_small_di (imm));
889 tp = aarch64_load_tp (NULL);
891 if (mode != Pmode)
892 tp = gen_lowpart (mode, tp);
894 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
895 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
896 return;
899 case SYMBOL_SMALL_GOTTPREL:
901 /* In ILP32, the mode of dest can be either SImode or DImode,
902 while the got entry is always of SImode size. The mode of
903 dest depends on how dest is used: if dest is assigned to a
 904 pointer (e.g. stored in memory), it has SImode; it may have
 905 DImode if dest is dereferenced to access memory.
906 This is why we have to handle three different tlsie_small
907 patterns here (two patterns for ILP32). */
908 machine_mode mode = GET_MODE (dest);
909 rtx tmp_reg = gen_reg_rtx (mode);
910 rtx tp = aarch64_load_tp (NULL);
912 if (mode == ptr_mode)
914 if (mode == DImode)
915 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
916 else
918 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
919 tp = gen_lowpart (mode, tp);
922 else
924 gcc_assert (mode == Pmode);
925 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
928 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
929 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
930 return;
933 case SYMBOL_SMALL_TPREL:
935 rtx tp = aarch64_load_tp (NULL);
937 if (GET_MODE (dest) != Pmode)
938 tp = gen_lowpart (GET_MODE (dest), tp);
940 emit_insn (gen_tlsle_small (dest, tp, imm));
941 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
942 return;
945 case SYMBOL_TINY_GOT:
946 emit_insn (gen_ldr_got_tiny (dest, imm));
947 return;
949 default:
950 gcc_unreachable ();
954 /* Emit a move from SRC to DEST. Assume that the move expanders can
955 handle all moves if !can_create_pseudo_p (). The distinction is
956 important because, unlike emit_move_insn, the move expanders know
957 how to force Pmode objects into the constant pool even when the
958 constant pool address is not itself legitimate. */
959 static rtx
960 aarch64_emit_move (rtx dest, rtx src)
962 return (can_create_pseudo_p ()
963 ? emit_move_insn (dest, src)
964 : emit_move_insn_1 (dest, src));
967 /* Split a 128-bit move operation into two 64-bit move operations,
968 taking care to handle partial overlap of register to register
969 copies. Special cases are needed when moving between GP regs and
970 FP regs. SRC can be a register, constant or memory; DST a register
971 or memory. If either operand is memory it must not have any side
972 effects. */
973 void
974 aarch64_split_128bit_move (rtx dst, rtx src)
976 rtx dst_lo, dst_hi;
977 rtx src_lo, src_hi;
979 machine_mode mode = GET_MODE (dst);
981 gcc_assert (mode == TImode || mode == TFmode);
982 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
983 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
985 if (REG_P (dst) && REG_P (src))
987 int src_regno = REGNO (src);
988 int dst_regno = REGNO (dst);
990 /* Handle FP <-> GP regs. */
991 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
993 src_lo = gen_lowpart (word_mode, src);
994 src_hi = gen_highpart (word_mode, src);
996 if (mode == TImode)
998 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
999 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1001 else
1003 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1004 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1006 return;
1008 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1010 dst_lo = gen_lowpart (word_mode, dst);
1011 dst_hi = gen_highpart (word_mode, dst);
1013 if (mode == TImode)
1015 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1016 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1018 else
1020 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1021 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1023 return;
1027 dst_lo = gen_lowpart (word_mode, dst);
1028 dst_hi = gen_highpart (word_mode, dst);
1029 src_lo = gen_lowpart (word_mode, src);
1030 src_hi = gen_highpart_mode (word_mode, mode, src);
1032 /* At most one pairing may overlap. */
1033 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1035 aarch64_emit_move (dst_hi, src_hi);
1036 aarch64_emit_move (dst_lo, src_lo);
1038 else
1040 aarch64_emit_move (dst_lo, src_lo);
1041 aarch64_emit_move (dst_hi, src_hi);
1045 bool
1046 aarch64_split_128bit_move_p (rtx dst, rtx src)
1048 return (! REG_P (src)
1049 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1052 /* Split a complex SIMD combine. */
1054 void
1055 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1057 machine_mode src_mode = GET_MODE (src1);
1058 machine_mode dst_mode = GET_MODE (dst);
1060 gcc_assert (VECTOR_MODE_P (dst_mode));
1062 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1064 rtx (*gen) (rtx, rtx, rtx);
1066 switch (src_mode)
1068 case V8QImode:
1069 gen = gen_aarch64_simd_combinev8qi;
1070 break;
1071 case V4HImode:
1072 gen = gen_aarch64_simd_combinev4hi;
1073 break;
1074 case V2SImode:
1075 gen = gen_aarch64_simd_combinev2si;
1076 break;
1077 case V2SFmode:
1078 gen = gen_aarch64_simd_combinev2sf;
1079 break;
1080 case DImode:
1081 gen = gen_aarch64_simd_combinedi;
1082 break;
1083 case DFmode:
1084 gen = gen_aarch64_simd_combinedf;
1085 break;
1086 default:
1087 gcc_unreachable ();
1090 emit_insn (gen (dst, src1, src2));
1091 return;
1095 /* Split a complex SIMD move. */
1097 void
1098 aarch64_split_simd_move (rtx dst, rtx src)
1100 machine_mode src_mode = GET_MODE (src);
1101 machine_mode dst_mode = GET_MODE (dst);
1103 gcc_assert (VECTOR_MODE_P (dst_mode));
1105 if (REG_P (dst) && REG_P (src))
1107 rtx (*gen) (rtx, rtx);
1109 gcc_assert (VECTOR_MODE_P (src_mode));
1111 switch (src_mode)
1113 case V16QImode:
1114 gen = gen_aarch64_split_simd_movv16qi;
1115 break;
1116 case V8HImode:
1117 gen = gen_aarch64_split_simd_movv8hi;
1118 break;
1119 case V4SImode:
1120 gen = gen_aarch64_split_simd_movv4si;
1121 break;
1122 case V2DImode:
1123 gen = gen_aarch64_split_simd_movv2di;
1124 break;
1125 case V4SFmode:
1126 gen = gen_aarch64_split_simd_movv4sf;
1127 break;
1128 case V2DFmode:
1129 gen = gen_aarch64_split_simd_movv2df;
1130 break;
1131 default:
1132 gcc_unreachable ();
1135 emit_insn (gen (dst, src));
1136 return;
1140 static rtx
1141 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1143 if (can_create_pseudo_p ())
1144 return force_reg (mode, value);
1145 else
1147 x = aarch64_emit_move (x, value);
1148 return x;
1153 static rtx
1154 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1156 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1158 rtx high;
1159 /* Load the full offset into a register. This
1160 might be improvable in the future. */
1161 high = GEN_INT (offset);
1162 offset = 0;
1163 high = aarch64_force_temporary (mode, temp, high);
1164 reg = aarch64_force_temporary (mode, temp,
1165 gen_rtx_PLUS (mode, high, reg));
1167 return plus_constant (mode, reg, offset);
1170 static int
1171 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1172 machine_mode mode)
1174 unsigned HOST_WIDE_INT mask;
1175 int i;
1176 bool first;
1177 unsigned HOST_WIDE_INT val;
1178 bool subtargets;
1179 rtx subtarget;
1180 int one_match, zero_match, first_not_ffff_match;
1181 int num_insns = 0;
1183 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1185 if (generate)
1186 emit_insn (gen_rtx_SET (dest, imm));
1187 num_insns++;
1188 return num_insns;
1191 if (mode == SImode)
1193 /* We know we can't do this in 1 insn, and we must be able to do it
1194 in two; so don't mess around looking for sequences that don't buy
1195 us anything. */
1196 if (generate)
1198 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1199 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1200 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1202 num_insns += 2;
1203 return num_insns;
1206 /* Remaining cases are all for DImode. */
1208 val = INTVAL (imm);
1209 subtargets = optimize && can_create_pseudo_p ();
1211 one_match = 0;
1212 zero_match = 0;
1213 mask = 0xffff;
1214 first_not_ffff_match = -1;
1216 for (i = 0; i < 64; i += 16, mask <<= 16)
1218 if ((val & mask) == mask)
1219 one_match++;
1220 else
1222 if (first_not_ffff_match < 0)
1223 first_not_ffff_match = i;
1224 if ((val & mask) == 0)
1225 zero_match++;
1229 if (one_match == 2)
1231 /* Set one of the quarters and then insert back into result. */
1232 mask = 0xffffll << first_not_ffff_match;
1233 if (generate)
1235 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1236 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1237 GEN_INT ((val >> first_not_ffff_match)
1238 & 0xffff)));
1240 num_insns += 2;
1241 return num_insns;
1244 if (zero_match == 2)
1245 goto simple_sequence;
1247 mask = 0x0ffff0000UL;
1248 for (i = 16; i < 64; i += 16, mask <<= 16)
1250 HOST_WIDE_INT comp = mask & ~(mask - 1);
1252 if (aarch64_uimm12_shift (val - (val & mask)))
1254 if (generate)
1256 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1257 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1258 emit_insn (gen_adddi3 (dest, subtarget,
1259 GEN_INT (val - (val & mask))));
1261 num_insns += 2;
1262 return num_insns;
1264 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1266 if (generate)
1268 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1269 emit_insn (gen_rtx_SET (subtarget,
1270 GEN_INT ((val + comp) & mask)));
1271 emit_insn (gen_adddi3 (dest, subtarget,
1272 GEN_INT (val - ((val + comp) & mask))));
1274 num_insns += 2;
1275 return num_insns;
1277 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1279 if (generate)
1281 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1282 emit_insn (gen_rtx_SET (subtarget,
1283 GEN_INT ((val - comp) | ~mask)));
1284 emit_insn (gen_adddi3 (dest, subtarget,
1285 GEN_INT (val - ((val - comp) | ~mask))));
1287 num_insns += 2;
1288 return num_insns;
1290 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1292 if (generate)
1294 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1295 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1296 emit_insn (gen_adddi3 (dest, subtarget,
1297 GEN_INT (val - (val | ~mask))));
1299 num_insns += 2;
1300 return num_insns;
1304 /* See if we can do it by arithmetically combining two
1305 immediates. */
1306 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1308 int j;
1309 mask = 0xffff;
1311 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1312 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1314 if (generate)
1316 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1317 emit_insn (gen_rtx_SET (subtarget,
1318 GEN_INT (aarch64_bitmasks[i])));
1319 emit_insn (gen_adddi3 (dest, subtarget,
1320 GEN_INT (val - aarch64_bitmasks[i])));
1322 num_insns += 2;
1323 return num_insns;
1326 for (j = 0; j < 64; j += 16, mask <<= 16)
1328 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1330 if (generate)
1332 emit_insn (gen_rtx_SET (dest,
1333 GEN_INT (aarch64_bitmasks[i])));
1334 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1335 GEN_INT ((val >> j) & 0xffff)));
1337 num_insns += 2;
1338 return num_insns;
1343 /* See if we can do it by logically combining two immediates. */
1344 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1346 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1348 int j;
1350 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1351 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1353 if (generate)
1355 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1356 emit_insn (gen_rtx_SET (subtarget,
1357 GEN_INT (aarch64_bitmasks[i])));
1358 emit_insn (gen_iordi3 (dest, subtarget,
1359 GEN_INT (aarch64_bitmasks[j])));
1361 num_insns += 2;
1362 return num_insns;
1365 else if ((val & aarch64_bitmasks[i]) == val)
1367 int j;
1369 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1370 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1372 if (generate)
1374 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1375 emit_insn (gen_rtx_SET (subtarget,
1376 GEN_INT (aarch64_bitmasks[j])));
1377 emit_insn (gen_anddi3 (dest, subtarget,
1378 GEN_INT (aarch64_bitmasks[i])));
1380 num_insns += 2;
1381 return num_insns;
1386 if (one_match > zero_match)
1388 /* Set either first three quarters or all but the third. */
1389 mask = 0xffffll << (16 - first_not_ffff_match);
1390 if (generate)
1391 emit_insn (gen_rtx_SET (dest,
1392 GEN_INT (val | mask | 0xffffffff00000000ull)));
1393 num_insns ++;
1395 /* Now insert other two quarters. */
1396 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1397 i < 64; i += 16, mask <<= 16)
1399 if ((val & mask) != mask)
1401 if (generate)
1402 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1403 GEN_INT ((val >> i) & 0xffff)));
1404 num_insns ++;
1407 return num_insns;
1410 simple_sequence:
1411 first = true;
1412 mask = 0xffff;
1413 for (i = 0; i < 64; i += 16, mask <<= 16)
1415 if ((val & mask) != 0)
1417 if (first)
1419 if (generate)
1420 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1421 num_insns ++;
1422 first = false;
1424 else
1426 if (generate)
1427 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1428 GEN_INT ((val >> i) & 0xffff)));
1429 num_insns ++;
1434 return num_insns;
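/* Editor's illustration (not part of the original source): the simple
   fallback above synthesizes a constant 16 bits at a time, e.g.
     0x0000123400005678  ->  mov  x0, #0x5678
                             movk x0, #0x1234, lsl #32
   so an arbitrary 64-bit constant needs at most four mov/movk instructions;
   the earlier cases try to do better than that.  */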
1438 void
1439 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1441 machine_mode mode = GET_MODE (dest);
1443 gcc_assert (mode == SImode || mode == DImode);
 1445 /* Check what type of symbol it is. */
1446 if (GET_CODE (imm) == SYMBOL_REF
1447 || GET_CODE (imm) == LABEL_REF
1448 || GET_CODE (imm) == CONST)
1450 rtx mem, base, offset;
1451 enum aarch64_symbol_type sty;
1453 /* If we have (const (plus symbol offset)), separate out the offset
1454 before we start classifying the symbol. */
1455 split_const (imm, &base, &offset);
1457 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1458 switch (sty)
1460 case SYMBOL_FORCE_TO_MEM:
1461 if (offset != const0_rtx
1462 && targetm.cannot_force_const_mem (mode, imm))
1464 gcc_assert (can_create_pseudo_p ());
1465 base = aarch64_force_temporary (mode, dest, base);
1466 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1467 aarch64_emit_move (dest, base);
1468 return;
1470 mem = force_const_mem (ptr_mode, imm);
1471 gcc_assert (mem);
1472 if (mode != ptr_mode)
1473 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1474 emit_insn (gen_rtx_SET (dest, mem));
1475 return;
1477 case SYMBOL_SMALL_TLSGD:
1478 case SYMBOL_SMALL_TLSDESC:
1479 case SYMBOL_SMALL_GOTTPREL:
1480 case SYMBOL_SMALL_GOT:
1481 case SYMBOL_TINY_GOT:
1482 if (offset != const0_rtx)
 1484 gcc_assert (can_create_pseudo_p ());
1485 base = aarch64_force_temporary (mode, dest, base);
1486 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1487 aarch64_emit_move (dest, base);
1488 return;
1490 /* FALLTHRU */
1492 case SYMBOL_SMALL_TPREL:
1493 case SYMBOL_SMALL_ABSOLUTE:
1494 case SYMBOL_TINY_ABSOLUTE:
1495 aarch64_load_symref_appropriately (dest, imm, sty);
1496 return;
1498 default:
1499 gcc_unreachable ();
1503 if (!CONST_INT_P (imm))
1505 if (GET_CODE (imm) == HIGH)
1506 emit_insn (gen_rtx_SET (dest, imm));
1507 else
1509 rtx mem = force_const_mem (mode, imm);
1510 gcc_assert (mem);
1511 emit_insn (gen_rtx_SET (dest, mem));
1514 return;
1517 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1520 static bool
1521 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1522 tree exp ATTRIBUTE_UNUSED)
1524 /* Currently, always true. */
1525 return true;
1528 /* Implement TARGET_PASS_BY_REFERENCE. */
1530 static bool
1531 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1532 machine_mode mode,
1533 const_tree type,
1534 bool named ATTRIBUTE_UNUSED)
1536 HOST_WIDE_INT size;
1537 machine_mode dummymode;
1538 int nregs;
1540 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1541 size = (mode == BLKmode && type)
1542 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1544 /* Aggregates are passed by reference based on their size. */
1545 if (type && AGGREGATE_TYPE_P (type))
1547 size = int_size_in_bytes (type);
 1550 /* Variable-sized arguments are always passed by reference. */
1551 if (size < 0)
1552 return true;
1554 /* Can this be a candidate to be passed in fp/simd register(s)? */
1555 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1556 &dummymode, &nregs,
1557 NULL))
1558 return false;
1560 /* Arguments which are variable sized or larger than 2 registers are
 1561 passed by reference unless they are a homogeneous floating-point
1562 aggregate. */
1563 return size > 2 * UNITS_PER_WORD;
1566 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1567 static bool
1568 aarch64_return_in_msb (const_tree valtype)
1570 machine_mode dummy_mode;
1571 int dummy_int;
1573 /* Never happens in little-endian mode. */
1574 if (!BYTES_BIG_ENDIAN)
1575 return false;
 1577 /* Only composite types smaller than or equal to 16 bytes can
 1578 potentially be returned in registers. */
1579 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1580 || int_size_in_bytes (valtype) <= 0
1581 || int_size_in_bytes (valtype) > 16)
1582 return false;
1584 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1585 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1586 is always passed/returned in the least significant bits of fp/simd
1587 register(s). */
1588 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1589 &dummy_mode, &dummy_int, NULL))
1590 return false;
1592 return true;
1595 /* Implement TARGET_FUNCTION_VALUE.
1596 Define how to find the value returned by a function. */
1598 static rtx
1599 aarch64_function_value (const_tree type, const_tree func,
1600 bool outgoing ATTRIBUTE_UNUSED)
1602 machine_mode mode;
1603 int unsignedp;
1604 int count;
1605 machine_mode ag_mode;
1607 mode = TYPE_MODE (type);
1608 if (INTEGRAL_TYPE_P (type))
1609 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1611 if (aarch64_return_in_msb (type))
1613 HOST_WIDE_INT size = int_size_in_bytes (type);
1615 if (size % UNITS_PER_WORD != 0)
1617 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1618 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1622 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1623 &ag_mode, &count, NULL))
1625 if (!aarch64_composite_type_p (type, mode))
1627 gcc_assert (count == 1 && mode == ag_mode);
1628 return gen_rtx_REG (mode, V0_REGNUM);
1630 else
1632 int i;
1633 rtx par;
1635 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1636 for (i = 0; i < count; i++)
1638 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1639 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1640 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1641 XVECEXP (par, 0, i) = tmp;
1643 return par;
1646 else
1647 return gen_rtx_REG (mode, R0_REGNUM);
1650 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1651 Return true if REGNO is the number of a hard register in which the values
1652 of called function may come back. */
1654 static bool
1655 aarch64_function_value_regno_p (const unsigned int regno)
 1657 /* A maximum of 16 bytes can be returned in the general registers. Examples
1658 of 16-byte return values are: 128-bit integers and 16-byte small
1659 structures (excluding homogeneous floating-point aggregates). */
1660 if (regno == R0_REGNUM || regno == R1_REGNUM)
1661 return true;
1663 /* Up to four fp/simd registers can return a function value, e.g. a
1664 homogeneous floating-point aggregate having four members. */
1665 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1666 return TARGET_FLOAT;
1668 return false;
1671 /* Implement TARGET_RETURN_IN_MEMORY.
1673 If the type T of the result of a function is such that
1674 void func (T arg)
1675 would require that arg be passed as a value in a register (or set of
1676 registers) according to the parameter passing rules, then the result
1677 is returned in the same registers as would be used for such an
1678 argument. */
1680 static bool
1681 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1683 HOST_WIDE_INT size;
1684 machine_mode ag_mode;
1685 int count;
1687 if (!AGGREGATE_TYPE_P (type)
1688 && TREE_CODE (type) != COMPLEX_TYPE
1689 && TREE_CODE (type) != VECTOR_TYPE)
 1690 /* Simple scalar types are always returned in registers. */
1691 return false;
1693 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1694 type,
1695 &ag_mode,
1696 &count,
1697 NULL))
1698 return false;
 1700 /* Types larger than 2 registers are returned in memory. */
1701 size = int_size_in_bytes (type);
1702 return (size < 0 || size > 2 * UNITS_PER_WORD);
1705 static bool
1706 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1707 const_tree type, int *nregs)
1709 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1710 return aarch64_vfp_is_call_or_return_candidate (mode,
1711 type,
1712 &pcum->aapcs_vfp_rmode,
1713 nregs,
1714 NULL);
1717 /* Given MODE and TYPE of a function argument, return the alignment in
1718 bits. The idea is to suppress any stronger alignment requested by
1719 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1720 This is a helper function for local use only. */
1722 static unsigned int
1723 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1725 unsigned int alignment;
1727 if (type)
1729 if (!integer_zerop (TYPE_SIZE (type)))
1731 if (TYPE_MODE (type) == mode)
1732 alignment = TYPE_ALIGN (type);
1733 else
1734 alignment = GET_MODE_ALIGNMENT (mode);
1736 else
1737 alignment = 0;
1739 else
1740 alignment = GET_MODE_ALIGNMENT (mode);
1742 return alignment;
1745 /* Layout a function argument according to the AAPCS64 rules. The rule
1746 numbers refer to the rule numbers in the AAPCS64. */
1748 static void
1749 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1750 const_tree type,
1751 bool named ATTRIBUTE_UNUSED)
1753 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1754 int ncrn, nvrn, nregs;
1755 bool allocate_ncrn, allocate_nvrn;
1756 HOST_WIDE_INT size;
1758 /* We need to do this once per argument. */
1759 if (pcum->aapcs_arg_processed)
1760 return;
1762 pcum->aapcs_arg_processed = true;
1764 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1765 size
1766 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1767 UNITS_PER_WORD);
1769 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1770 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1771 mode,
1772 type,
1773 &nregs);
 1775 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1776 The following code thus handles passing by SIMD/FP registers first. */
1778 nvrn = pcum->aapcs_nvrn;
 1780 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
 1781 and homogeneous short-vector aggregates (HVA). */
1782 if (allocate_nvrn)
1784 if (!TARGET_FLOAT)
1785 aarch64_err_no_fpadvsimd (mode, "argument");
1787 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1789 pcum->aapcs_nextnvrn = nvrn + nregs;
1790 if (!aarch64_composite_type_p (type, mode))
1792 gcc_assert (nregs == 1);
1793 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1795 else
1797 rtx par;
1798 int i;
1799 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1800 for (i = 0; i < nregs; i++)
1802 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1803 V0_REGNUM + nvrn + i);
1804 tmp = gen_rtx_EXPR_LIST
1805 (VOIDmode, tmp,
1806 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1807 XVECEXP (par, 0, i) = tmp;
1809 pcum->aapcs_reg = par;
1811 return;
1813 else
1815 /* C.3 NSRN is set to 8. */
1816 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1817 goto on_stack;
1821 ncrn = pcum->aapcs_ncrn;
1822 nregs = size / UNITS_PER_WORD;
 1824 /* C6 - C9, though the sign and zero extension semantics are
 1825 handled elsewhere. This is the case where the argument fits
 1826 entirely in general registers. */
1827 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1829 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1831 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1833 /* C.8 if the argument has an alignment of 16 then the NGRN is
1834 rounded up to the next even number. */
1835 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1837 ++ncrn;
1838 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1840 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1841 A reg is still generated for it, but the caller should be smart
1842 enough not to use it. */
1843 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1845 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1847 else
1849 rtx par;
1850 int i;
1852 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1853 for (i = 0; i < nregs; i++)
1855 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1856 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1857 GEN_INT (i * UNITS_PER_WORD));
1858 XVECEXP (par, 0, i) = tmp;
1860 pcum->aapcs_reg = par;
1863 pcum->aapcs_nextncrn = ncrn + nregs;
1864 return;
1867 /* C.11 */
1868 pcum->aapcs_nextncrn = NUM_ARG_REGS;
 1870 /* The argument is passed on the stack; record the needed number of words
 1871 for this argument and align the total size if necessary. */
1872 on_stack:
1873 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1874 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1875 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1876 16 / UNITS_PER_WORD);
1877 return;
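/* Editor's example (illustrative, not in the original source): for a call
   such as  f (int a, __int128 b)  the int is passed in w0; because __int128
   has 16-byte alignment, rule C.8 above rounds the NGRN up to an even
   number, so b is passed in the x2/x3 pair and x1 is left unused.  */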
1880 /* Implement TARGET_FUNCTION_ARG. */
1882 static rtx
1883 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1884 const_tree type, bool named)
1886 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1887 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1889 if (mode == VOIDmode)
1890 return NULL_RTX;
1892 aarch64_layout_arg (pcum_v, mode, type, named);
1893 return pcum->aapcs_reg;
1896 void
1897 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1898 const_tree fntype ATTRIBUTE_UNUSED,
1899 rtx libname ATTRIBUTE_UNUSED,
1900 const_tree fndecl ATTRIBUTE_UNUSED,
1901 unsigned n_named ATTRIBUTE_UNUSED)
1903 pcum->aapcs_ncrn = 0;
1904 pcum->aapcs_nvrn = 0;
1905 pcum->aapcs_nextncrn = 0;
1906 pcum->aapcs_nextnvrn = 0;
1907 pcum->pcs_variant = ARM_PCS_AAPCS64;
1908 pcum->aapcs_reg = NULL_RTX;
1909 pcum->aapcs_arg_processed = false;
1910 pcum->aapcs_stack_words = 0;
1911 pcum->aapcs_stack_size = 0;
1913 if (!TARGET_FLOAT
1914 && fndecl && TREE_PUBLIC (fndecl)
1915 && fntype && fntype != error_mark_node)
1917 const_tree type = TREE_TYPE (fntype);
1918 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
1919 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
1920 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
1921 &mode, &nregs, NULL))
1922 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
1924 return;
1927 static void
1928 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1929 machine_mode mode,
1930 const_tree type,
1931 bool named)
1933 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1934 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1936 aarch64_layout_arg (pcum_v, mode, type, named);
1937 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1938 != (pcum->aapcs_stack_words != 0));
1939 pcum->aapcs_arg_processed = false;
1940 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1941 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1942 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1943 pcum->aapcs_stack_words = 0;
1944 pcum->aapcs_reg = NULL_RTX;
1948 bool
1949 aarch64_function_arg_regno_p (unsigned regno)
1951 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1952 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1955 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1956 PARM_BOUNDARY bits of alignment, but will be given anything up
1957 to STACK_BOUNDARY bits if the type requires it. This makes sure
1958 that both before and after the layout of each argument, the Next
1959 Stacked Argument Address (NSAA) will have a minimum alignment of
1960 8 bytes. */
1962 static unsigned int
1963 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1965 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1967 if (alignment < PARM_BOUNDARY)
1968 alignment = PARM_BOUNDARY;
1969 if (alignment > STACK_BOUNDARY)
1970 alignment = STACK_BOUNDARY;
1971 return alignment;
1974 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1976 Return true if an argument passed on the stack should be padded upwards,
1977 i.e. if the least-significant byte of the stack slot has useful data.
1979 Small aggregate types are placed in the lowest memory address.
1981 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1983 bool
1984 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1986 /* On little-endian targets, the least significant byte of every stack
1987 argument is passed at the lowest byte address of the stack slot. */
1988 if (!BYTES_BIG_ENDIAN)
1989 return true;
1991 /* Otherwise, integral, floating-point and pointer types are padded downward:
1992 the least significant byte of a stack argument is passed at the highest
1993 byte address of the stack slot. */
1994 if (type
1995 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1996 || POINTER_TYPE_P (type))
1997 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1998 return false;
2000 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2001 return true;
2004 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
 2006 It specifies the padding for the last (possibly the only)
 2007 element of a block move between registers and memory. Assuming
 2008 the block is in memory, padding upward means that the last element
 2009 is padded after its most significant byte, while with downward
 2010 padding the last element is padded at its least significant byte
 2011 side.
2013 Small aggregates and small complex types are always padded
2014 upwards.
2016 We don't need to worry about homogeneous floating-point or
2017 short-vector aggregates; their move is not affected by the
2018 padding direction determined here. Regardless of endianness,
2019 each element of such an aggregate is put in the least
2020 significant bits of a fp/simd register.
2022 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2023 register has useful data, and return the opposite if the most
2024 significant byte does. */
2026 bool
2027 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2028 bool first ATTRIBUTE_UNUSED)
2031 /* Small composite types are always padded upward. */
2032 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2034 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2035 : GET_MODE_SIZE (mode));
2036 if (size < 2 * UNITS_PER_WORD)
2037 return true;
2040 /* Otherwise, use the default padding. */
2041 return !BYTES_BIG_ENDIAN;
2044 static machine_mode
2045 aarch64_libgcc_cmp_return_mode (void)
2047 return SImode;
2050 static bool
2051 aarch64_frame_pointer_required (void)
2053 /* In aarch64_override_options_after_change
2054 flag_omit_leaf_frame_pointer turns off the frame pointer by
2055 default. Turn it back on now if we've not got a leaf
2056 function. */
2057 if (flag_omit_leaf_frame_pointer
2058 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2059 return true;
2061 return false;
2064 /* Mark the registers that need to be saved by the callee and calculate
2065 the size of the callee-saved registers area and frame record (both FP
2066 and LR may be omitted). */
2067 static void
2068 aarch64_layout_frame (void)
2070 HOST_WIDE_INT offset = 0;
2071 int regno;
2073 if (reload_completed && cfun->machine->frame.laid_out)
2074 return;
2076 #define SLOT_NOT_REQUIRED (-2)
2077 #define SLOT_REQUIRED (-1)
2079 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2080 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2082 /* First mark all the registers that really need to be saved... */
2083 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2084 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2086 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2087 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2089 /* ... that includes the eh data registers (if needed)... */
2090 if (crtl->calls_eh_return)
2091 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2092 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2093 = SLOT_REQUIRED;
2095 /* ... and any callee saved register that dataflow says is live. */
2096 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2097 if (df_regs_ever_live_p (regno)
2098 && (regno == R30_REGNUM
2099 || !call_used_regs[regno]))
2100 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2102 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2103 if (df_regs_ever_live_p (regno)
2104 && !call_used_regs[regno])
2105 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2107 if (frame_pointer_needed)
2109 /* FP and LR are placed in the linkage record. */
2110 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2111 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2112 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2113 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2114 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2115 offset += 2 * UNITS_PER_WORD;
2118 /* Now assign stack slots for them. */
2119 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2120 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2122 cfun->machine->frame.reg_offset[regno] = offset;
2123 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2124 cfun->machine->frame.wb_candidate1 = regno;
2125 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2126 cfun->machine->frame.wb_candidate2 = regno;
2127 offset += UNITS_PER_WORD;
2130 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2131 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2133 cfun->machine->frame.reg_offset[regno] = offset;
2134 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2135 cfun->machine->frame.wb_candidate1 = regno;
2136 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2137 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2138 cfun->machine->frame.wb_candidate2 = regno;
2139 offset += UNITS_PER_WORD;
2142 cfun->machine->frame.padding0 =
2143 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2144 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2146 cfun->machine->frame.saved_regs_size = offset;
2148 cfun->machine->frame.hard_fp_offset
2149 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2150 + get_frame_size ()
2151 + cfun->machine->frame.saved_regs_size,
2152 STACK_BOUNDARY / BITS_PER_UNIT);
2154 cfun->machine->frame.frame_size
2155 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2156 + crtl->outgoing_args_size,
2157 STACK_BOUNDARY / BITS_PER_UNIT);
2159 cfun->machine->frame.laid_out = true;
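/* A worked example of the layout above, with hypothetical numbers: a
   function that needs a frame pointer, saves x19, x20 and d8, has 16
   bytes of locals, no varargs save area and no outgoing arguments gets:

     reg_offset[x29] = 0,  reg_offset[x30] = 8      (frame record)
     reg_offset[x19] = 16, reg_offset[x20] = 24, reg_offset[d8] = 32
     padding0        = 8                            (round 40 up to 48)
     saved_regs_size = 48
     hard_fp_offset  = ROUND_UP (0 + 16 + 48, 16) = 64
     frame_size      = ROUND_UP (64 + 0, 16)      = 64  */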
2162 static bool
2163 aarch64_register_saved_on_entry (int regno)
2165 return cfun->machine->frame.reg_offset[regno] >= 0;
2168 static unsigned
2169 aarch64_next_callee_save (unsigned regno, unsigned limit)
2171 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2172 regno ++;
2173 return regno;
2176 static void
2177 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2178 HOST_WIDE_INT adjustment)
2180 rtx base_rtx = stack_pointer_rtx;
2181 rtx insn, reg, mem;
2183 reg = gen_rtx_REG (mode, regno);
2184 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2185 plus_constant (Pmode, base_rtx, -adjustment));
2186 mem = gen_rtx_MEM (mode, mem);
2188 insn = emit_move_insn (mem, reg);
2189 RTX_FRAME_RELATED_P (insn) = 1;
2192 static rtx
2193 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2194 HOST_WIDE_INT adjustment)
2196 switch (mode)
2198 case DImode:
2199 return gen_storewb_pairdi_di (base, base, reg, reg2,
2200 GEN_INT (-adjustment),
2201 GEN_INT (UNITS_PER_WORD - adjustment));
2202 case DFmode:
2203 return gen_storewb_pairdf_di (base, base, reg, reg2,
2204 GEN_INT (-adjustment),
2205 GEN_INT (UNITS_PER_WORD - adjustment));
2206 default:
2207 gcc_unreachable ();
2211 static void
2212 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2213 unsigned regno2, HOST_WIDE_INT adjustment)
2215 rtx_insn *insn;
2216 rtx reg1 = gen_rtx_REG (mode, regno1);
2217 rtx reg2 = gen_rtx_REG (mode, regno2);
2219 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2220 reg2, adjustment));
2221 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2222 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2223 RTX_FRAME_RELATED_P (insn) = 1;
2226 static rtx
2227 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2228 HOST_WIDE_INT adjustment)
2230 switch (mode)
2232 case DImode:
2233 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2234 GEN_INT (UNITS_PER_WORD));
2235 case DFmode:
2236 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2237 GEN_INT (UNITS_PER_WORD));
2238 default:
2239 gcc_unreachable ();
2243 static rtx
2244 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2245 rtx reg2)
2247 switch (mode)
2249 case DImode:
2250 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2252 case DFmode:
2253 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2255 default:
2256 gcc_unreachable ();
2260 static rtx
2261 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2262 rtx mem2)
2264 switch (mode)
2266 case DImode:
2267 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2269 case DFmode:
2270 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2272 default:
2273 gcc_unreachable ();
2278 static void
2279 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2280 unsigned start, unsigned limit, bool skip_wb)
2282 rtx_insn *insn;
2283 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2284 ? gen_frame_mem : gen_rtx_MEM);
2285 unsigned regno;
2286 unsigned regno2;
2288 for (regno = aarch64_next_callee_save (start, limit);
2289 regno <= limit;
2290 regno = aarch64_next_callee_save (regno + 1, limit))
2292 rtx reg, mem;
2293 HOST_WIDE_INT offset;
2295 if (skip_wb
2296 && (regno == cfun->machine->frame.wb_candidate1
2297 || regno == cfun->machine->frame.wb_candidate2))
2298 continue;
2300 reg = gen_rtx_REG (mode, regno);
2301 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2302 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2303 offset));
2305 regno2 = aarch64_next_callee_save (regno + 1, limit);
2307 if (regno2 <= limit
2308 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2309 == cfun->machine->frame.reg_offset[regno2]))
2312 rtx reg2 = gen_rtx_REG (mode, regno2);
2313 rtx mem2;
2315 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2316 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2317 offset));
2318 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2319 reg2));
2321 /* The first part of a frame-related parallel insn is
2322 always assumed to be relevant to the frame
2323 calculations; subsequent parts, are only
2324 frame-related if explicitly marked. */
2325 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2326 regno = regno2;
2328 else
2329 insn = emit_move_insn (mem, reg);
2331 RTX_FRAME_RELATED_P (insn) = 1;
2335 static void
2336 aarch64_restore_callee_saves (machine_mode mode,
2337 HOST_WIDE_INT start_offset, unsigned start,
2338 unsigned limit, bool skip_wb, rtx *cfi_ops)
2340 rtx base_rtx = stack_pointer_rtx;
2341 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2342 ? gen_frame_mem : gen_rtx_MEM);
2343 unsigned regno;
2344 unsigned regno2;
2345 HOST_WIDE_INT offset;
2347 for (regno = aarch64_next_callee_save (start, limit);
2348 regno <= limit;
2349 regno = aarch64_next_callee_save (regno + 1, limit))
2351 rtx reg, mem;
2353 if (skip_wb
2354 && (regno == cfun->machine->frame.wb_candidate1
2355 || regno == cfun->machine->frame.wb_candidate2))
2356 continue;
2358 reg = gen_rtx_REG (mode, regno);
2359 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2360 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2362 regno2 = aarch64_next_callee_save (regno + 1, limit);
2364 if (regno2 <= limit
2365 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2366 == cfun->machine->frame.reg_offset[regno2]))
2368 rtx reg2 = gen_rtx_REG (mode, regno2);
2369 rtx mem2;
2371 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2372 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2373 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2375 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2376 regno = regno2;
2378 else
2379 emit_move_insn (reg, mem);
2380 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2384 /* AArch64 stack frames generated by this compiler look like:
2386 +-------------------------------+
2388 | incoming stack arguments |
2390 +-------------------------------+
2391 | | <-- incoming stack pointer (aligned)
2392 | callee-allocated save area |
2393 | for register varargs |
2395 +-------------------------------+
2396 | local variables | <-- frame_pointer_rtx
2398 +-------------------------------+
2399 | padding0 | \
2400 +-------------------------------+ |
2401 | callee-saved registers | | frame.saved_regs_size
2402 +-------------------------------+ |
2403 | LR' | |
2404 +-------------------------------+ |
2405 | FP' | / <- hard_frame_pointer_rtx (aligned)
2406 +-------------------------------+
2407 | dynamic allocation |
2408 +-------------------------------+
2409 | padding |
2410 +-------------------------------+
2411 | outgoing stack arguments | <-- arg_pointer
2413 +-------------------------------+
2414 | | <-- stack_pointer_rtx (aligned)
2416 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2417 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2418 unchanged. */
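/* As a concrete illustration of the picture above (hypothetical
   function: frame_size of 64 bytes, a frame record, x19/x20 and d8 to
   save, no outgoing arguments), the prologue typically materializes
   this layout as:

     stp  x29, x30, [sp, #-64]!    allocate frame, store frame record
     add  x29, sp, #0              set up the frame pointer
     stp  x19, x20, [sp, #16]      callee-saved general registers
     str  d8, [sp, #32]            callee-saved FP/SIMD register  */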
2420 /* Generate the prologue instructions for entry into a function.
2421 Establish the stack frame by decreasing the stack pointer with a
2422 properly calculated size and, if necessary, create a frame record
2423 filled with the values of LR and previous frame pointer. The
2424 current FP is also set up if it is in use. */
2426 void
2427 aarch64_expand_prologue (void)
2429 /* sub sp, sp, #<frame_size>
2430 stp {fp, lr}, [sp, #<frame_size> - 16]
2431 add fp, sp, #<frame_size> - hardfp_offset
2432 stp {cs_reg}, [fp, #-16] etc.
2434 sub sp, sp, <final_adjustment_if_any>
2436 HOST_WIDE_INT frame_size, offset;
2437 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2438 HOST_WIDE_INT hard_fp_offset;
2439 rtx_insn *insn;
2441 aarch64_layout_frame ();
2443 offset = frame_size = cfun->machine->frame.frame_size;
2444 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2445 fp_offset = frame_size - hard_fp_offset;
2447 if (flag_stack_usage_info)
2448 current_function_static_stack_size = frame_size;
2450 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2451 if (offset >= 512)
2453 /* When the frame has a large size, an initial decrease is done on
2454 the stack pointer to jump over the callee-allocated save area for
2455 register varargs, the local variable area and/or the callee-saved
2456 register area. This will allow the pre-index write-back
2457 store pair instructions to be used for setting up the stack frame
2458 efficiently. */
2459 offset = hard_fp_offset;
2460 if (offset >= 512)
2461 offset = cfun->machine->frame.saved_regs_size;
2463 frame_size -= (offset + crtl->outgoing_args_size);
2464 fp_offset = 0;
2466 if (frame_size >= 0x1000000)
2468 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2469 emit_move_insn (op0, GEN_INT (-frame_size));
2470 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2472 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2473 gen_rtx_SET (stack_pointer_rtx,
2474 plus_constant (Pmode, stack_pointer_rtx,
2475 -frame_size)));
2476 RTX_FRAME_RELATED_P (insn) = 1;
2478 else if (frame_size > 0)
2480 int hi_ofs = frame_size & 0xfff000;
2481 int lo_ofs = frame_size & 0x000fff;
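/* For example (hypothetical value): frame_size = 0x12345 gives
   hi_ofs = 0x12000 and lo_ofs = 0x345, so the adjustment is emitted as
   two subtractions whose immediates each fit the 12-bit, optionally
   12-bit-shifted, ADD/SUB encoding:

     sub  sp, sp, #0x12000
     sub  sp, sp, #0x345  */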
2483 if (hi_ofs)
2485 insn = emit_insn (gen_add2_insn
2486 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2487 RTX_FRAME_RELATED_P (insn) = 1;
2489 if (lo_ofs)
2491 insn = emit_insn (gen_add2_insn
2492 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2493 RTX_FRAME_RELATED_P (insn) = 1;
2497 else
2498 frame_size = -1;
2500 if (offset > 0)
2502 bool skip_wb = false;
2504 if (frame_pointer_needed)
2506 skip_wb = true;
2508 if (fp_offset)
2510 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2511 GEN_INT (-offset)));
2512 RTX_FRAME_RELATED_P (insn) = 1;
2514 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2515 R30_REGNUM, false);
2517 else
2518 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2520 /* Set up frame pointer to point to the location of the
2521 previous frame pointer on the stack. */
2522 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2523 stack_pointer_rtx,
2524 GEN_INT (fp_offset)));
2525 RTX_FRAME_RELATED_P (insn) = 1;
2526 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2528 else
2530 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2531 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2533 if (fp_offset
2534 || reg1 == FIRST_PSEUDO_REGISTER
2535 || (reg2 == FIRST_PSEUDO_REGISTER
2536 && offset >= 256))
2538 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2539 GEN_INT (-offset)));
2540 RTX_FRAME_RELATED_P (insn) = 1;
2542 else
2544 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2546 skip_wb = true;
2548 if (reg2 == FIRST_PSEUDO_REGISTER)
2549 aarch64_pushwb_single_reg (mode1, reg1, offset);
2550 else
2551 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2555 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2556 skip_wb);
2557 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2558 skip_wb);
2561 /* when offset >= 512,
2562 sub sp, sp, #<outgoing_args_size> */
2563 if (frame_size > -1)
2565 if (crtl->outgoing_args_size > 0)
2567 insn = emit_insn (gen_add2_insn
2568 (stack_pointer_rtx,
2569 GEN_INT (- crtl->outgoing_args_size)));
2570 RTX_FRAME_RELATED_P (insn) = 1;
2575 /* Return TRUE if we can use a simple_return insn.
2577 This function checks whether the callee-saved register area is empty,
2578 which means no restore actions are needed. The pro_and_epilogue pass
2579 uses this to check whether the shrink-wrapping optimization is feasible. */
2581 bool
2582 aarch64_use_return_insn_p (void)
2584 if (!reload_completed)
2585 return false;
2587 if (crtl->profile)
2588 return false;
2590 aarch64_layout_frame ();
2592 return cfun->machine->frame.frame_size == 0;
2595 /* Generate the epilogue instructions for returning from a function. */
2596 void
2597 aarch64_expand_epilogue (bool for_sibcall)
2599 HOST_WIDE_INT frame_size, offset;
2600 HOST_WIDE_INT fp_offset;
2601 HOST_WIDE_INT hard_fp_offset;
2602 rtx_insn *insn;
2603 /* We need a memory barrier to prevent reads from the deallocated stack. */
2604 bool need_barrier_p = (get_frame_size () != 0
2605 || cfun->machine->frame.saved_varargs_size);
2607 aarch64_layout_frame ();
2609 offset = frame_size = cfun->machine->frame.frame_size;
2610 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2611 fp_offset = frame_size - hard_fp_offset;
2613 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2614 if (offset >= 512)
2616 offset = hard_fp_offset;
2617 if (offset >= 512)
2618 offset = cfun->machine->frame.saved_regs_size;
2620 frame_size -= (offset + crtl->outgoing_args_size);
2621 fp_offset = 0;
2622 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2624 insn = emit_insn (gen_add2_insn
2625 (stack_pointer_rtx,
2626 GEN_INT (crtl->outgoing_args_size)));
2627 RTX_FRAME_RELATED_P (insn) = 1;
2630 else
2631 frame_size = -1;
2633 /* If there were outgoing arguments or we've done dynamic stack
2634 allocation, then restore the stack pointer from the frame
2635 pointer. This is at most one insn and more efficient than using
2636 GCC's internal mechanism. */
2637 if (frame_pointer_needed
2638 && (crtl->outgoing_args_size || cfun->calls_alloca))
2640 if (cfun->calls_alloca)
2641 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2643 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2644 hard_frame_pointer_rtx,
2645 GEN_INT (0)));
2646 offset = offset - fp_offset;
2649 if (offset > 0)
2651 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2652 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2653 bool skip_wb = true;
2654 rtx cfi_ops = NULL;
2656 if (frame_pointer_needed)
2657 fp_offset = 0;
2658 else if (fp_offset
2659 || reg1 == FIRST_PSEUDO_REGISTER
2660 || (reg2 == FIRST_PSEUDO_REGISTER
2661 && offset >= 256))
2662 skip_wb = false;
2664 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2665 skip_wb, &cfi_ops);
2666 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2667 skip_wb, &cfi_ops);
2669 if (need_barrier_p)
2670 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2672 if (skip_wb)
2674 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2675 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2677 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2678 if (reg2 == FIRST_PSEUDO_REGISTER)
2680 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2681 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2682 mem = gen_rtx_MEM (mode1, mem);
2683 insn = emit_move_insn (rreg1, mem);
2685 else
2687 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2689 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2690 insn = emit_insn (aarch64_gen_loadwb_pair
2691 (mode1, stack_pointer_rtx, rreg1,
2692 rreg2, offset));
2695 else
2697 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2698 GEN_INT (offset)));
2701 /* Reset the CFA to be SP + FRAME_SIZE. */
2702 rtx new_cfa = stack_pointer_rtx;
2703 if (frame_size > 0)
2704 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2705 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2706 REG_NOTES (insn) = cfi_ops;
2707 RTX_FRAME_RELATED_P (insn) = 1;
2710 if (frame_size > 0)
2712 if (need_barrier_p)
2713 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2715 if (frame_size >= 0x1000000)
2717 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2718 emit_move_insn (op0, GEN_INT (frame_size));
2719 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2721 else
2723 int hi_ofs = frame_size & 0xfff000;
2724 int lo_ofs = frame_size & 0x000fff;
2726 if (hi_ofs && lo_ofs)
2728 insn = emit_insn (gen_add2_insn
2729 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2730 RTX_FRAME_RELATED_P (insn) = 1;
2731 frame_size = lo_ofs;
2733 insn = emit_insn (gen_add2_insn
2734 (stack_pointer_rtx, GEN_INT (frame_size)));
2737 /* Reset the CFA to be SP + 0. */
2738 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2739 RTX_FRAME_RELATED_P (insn) = 1;
2742 /* Stack adjustment for exception handler. */
2743 if (crtl->calls_eh_return)
2745 /* We need to unwind the stack by the offset computed by
2746 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2747 to be SP; letting the CFA move during this adjustment
2748 is just as correct as retaining the CFA from the body
2749 of the function. Therefore, do nothing special. */
2750 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2753 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2754 if (!for_sibcall)
2755 emit_jump_insn (ret_rtx);
2758 /* Return the place to copy the exception unwinding return address to.
2759 This will probably be a stack slot, but could (in theory) be the
2760 return register. */
2762 aarch64_final_eh_return_addr (void)
2764 HOST_WIDE_INT fp_offset;
2766 aarch64_layout_frame ();
2768 fp_offset = cfun->machine->frame.frame_size
2769 - cfun->machine->frame.hard_fp_offset;
2771 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2772 return gen_rtx_REG (DImode, LR_REGNUM);
2774 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2775 result in a store to save LR introduced by builtin_eh_return () being
2776 incorrectly deleted because the alias is not detected.
2777 So in the calculation of the address to copy the exception unwinding
2778 return address to, we note 2 cases.
2779 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2780 we return a SP-relative location since all the addresses are SP-relative
2781 in this case. This prevents the store from being optimized away.
2782 If the fp_offset is not 0, then the addresses will be FP-relative and
2783 therefore we return a FP-relative location. */
2785 if (frame_pointer_needed)
2787 if (fp_offset)
2788 return gen_frame_mem (DImode,
2789 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2790 else
2791 return gen_frame_mem (DImode,
2792 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2795 /* If FP is not needed, we calculate the location of LR, which would be
2796 at the top of the saved registers block. */
2798 return gen_frame_mem (DImode,
2799 plus_constant (Pmode,
2800 stack_pointer_rtx,
2801 fp_offset
2802 + cfun->machine->frame.saved_regs_size
2803 - 2 * UNITS_PER_WORD));
2806 /* Possibly output code to build up a constant in a register. For
2807 the benefit of the costs infrastructure, returns the number of
2808 instructions which would be emitted. GENERATE inhibits or
2809 enables code generation. */
2811 static int
2812 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2814 int insns = 0;
2816 if (aarch64_bitmask_imm (val, DImode))
2818 if (generate)
2819 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2820 insns = 1;
2822 else
2824 int i;
2825 int ncount = 0;
2826 int zcount = 0;
2827 HOST_WIDE_INT valp = val >> 16;
2828 HOST_WIDE_INT valm;
2829 HOST_WIDE_INT tval;
2831 for (i = 16; i < 64; i += 16)
2833 valm = (valp & 0xffff);
2835 if (valm != 0)
2836 ++ zcount;
2838 if (valm != 0xffff)
2839 ++ ncount;
2841 valp >>= 16;
2844 /* zcount contains the number of additional MOVK instructions
2845 required if the constant is built up with an initial MOVZ instruction,
2846 while ncount is the number of MOVK instructions required if starting
2847 with a MOVN instruction. Choose the sequence that yields the fewest
2848 number of instructions, preferring MOVZ instructions when they are both
2849 the same. */
2850 if (ncount < zcount)
2852 if (generate)
2853 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2854 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2855 tval = 0xffff;
2856 insns++;
2858 else
2860 if (generate)
2861 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2862 GEN_INT (val & 0xffff));
2863 tval = 0;
2864 insns++;
2867 val >>= 16;
2869 for (i = 16; i < 64; i += 16)
2871 if ((val & 0xffff) != tval)
2873 if (generate)
2874 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2875 GEN_INT (i),
2876 GEN_INT (val & 0xffff)));
2877 insns++;
2879 val >>= 16;
2882 return insns;
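/* Illustrative standalone sketch, not used by the compiler: count the
   MOV instructions the scheme above needs for a 64-bit constant,
   ignoring the single-instruction bitmask-immediate case.  One MOVZ (or
   MOVN) sets the low 16 bits; every higher 16-bit chunk that differs
   from the background value (0 for MOVZ, 0xffff for MOVN) then costs
   one MOVK.  The function name is hypothetical.  */

static int
example_mov_sequence_length (unsigned long long val)
{
  int zcount = 0, ncount = 0;
  int i;

  for (i = 16; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;

      if (chunk != 0)
        zcount++;   /* MOVK needed after an initial MOVZ.  */
      if (chunk != 0xffff)
        ncount++;   /* MOVK needed after an initial MOVN.  */
    }

  /* One MOVZ/MOVN plus the cheaper set of MOVKs; ties prefer MOVZ,
     as in aarch64_build_constant above.  */
  return 1 + (ncount < zcount ? ncount : zcount);
}

/* For example, 0x0000ffffffff0000 needs two instructions: a MOVN that
   yields 0xffffffffffff0000 followed by one MOVK that zeroes the top
   16-bit chunk.  */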
2885 static void
2886 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2888 HOST_WIDE_INT mdelta = delta;
2889 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2890 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2892 if (mdelta < 0)
2893 mdelta = -mdelta;
2895 if (mdelta >= 4096 * 4096)
2897 (void) aarch64_build_constant (scratchreg, delta, true);
2898 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2900 else if (mdelta > 0)
2902 if (mdelta >= 4096)
2904 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2905 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2906 if (delta < 0)
2907 emit_insn (gen_rtx_SET (this_rtx,
2908 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2909 else
2910 emit_insn (gen_rtx_SET (this_rtx,
2911 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2913 if (mdelta % 4096 != 0)
2915 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2916 emit_insn (gen_rtx_SET (this_rtx,
2917 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
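/* As an illustration with hypothetical arguments:
   aarch64_add_constant (R0_REGNUM, IP0_REGNUM, 20000) splits the
   constant around the 4096 boundary (20000 = 4 * 4096 + 3616) and emits
   roughly:

     mov  x16, #4
     add  x0, x0, x16, lsl #12
     add  x0, x0, #3616  */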
2922 /* Output code to add DELTA to the first argument, and then jump
2923 to FUNCTION. Used for C++ multiple inheritance. */
2924 static void
2925 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2926 HOST_WIDE_INT delta,
2927 HOST_WIDE_INT vcall_offset,
2928 tree function)
2930 /* The this pointer is always in x0. Note that this differs from
2931 Arm, where the this pointer may be bumped to r1 if r0 is required
2932 to return a pointer to an aggregate. On AArch64 a result value
2933 pointer will be in x8. */
2934 int this_regno = R0_REGNUM;
2935 rtx this_rtx, temp0, temp1, addr, funexp;
2936 rtx_insn *insn;
2938 reload_completed = 1;
2939 emit_note (NOTE_INSN_PROLOGUE_END);
2941 if (vcall_offset == 0)
2942 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2943 else
2945 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2947 this_rtx = gen_rtx_REG (Pmode, this_regno);
2948 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2949 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2951 addr = this_rtx;
2952 if (delta != 0)
2954 if (delta >= -256 && delta < 256)
2955 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2956 plus_constant (Pmode, this_rtx, delta));
2957 else
2958 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2961 if (Pmode == ptr_mode)
2962 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2963 else
2964 aarch64_emit_move (temp0,
2965 gen_rtx_ZERO_EXTEND (Pmode,
2966 gen_rtx_MEM (ptr_mode, addr)));
2968 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2969 addr = plus_constant (Pmode, temp0, vcall_offset);
2970 else
2972 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2973 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2976 if (Pmode == ptr_mode)
2977 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2978 else
2979 aarch64_emit_move (temp1,
2980 gen_rtx_SIGN_EXTEND (Pmode,
2981 gen_rtx_MEM (ptr_mode, addr)));
2983 emit_insn (gen_add2_insn (this_rtx, temp1));
2986 /* Generate a tail call to the target function. */
2987 if (!TREE_USED (function))
2989 assemble_external (function);
2990 TREE_USED (function) = 1;
2992 funexp = XEXP (DECL_RTL (function), 0);
2993 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2994 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2995 SIBLING_CALL_P (insn) = 1;
2997 insn = get_insns ();
2998 shorten_branches (insn);
2999 final_start_function (insn, file, 1);
3000 final (insn, file, 1);
3001 final_end_function ();
3003 /* Stop pretending to be a post-reload pass. */
3004 reload_completed = 0;
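/* As an illustration: for a simple non-virtual thunk (vcall_offset of 0)
   with a small positive delta such as 16, the code above reduces to an
   adjustment of the incoming this pointer followed by a tail call:

     add  x0, x0, #16
     b    <function>  */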
3007 static bool
3008 aarch64_tls_referenced_p (rtx x)
3010 if (!TARGET_HAVE_TLS)
3011 return false;
3012 subrtx_iterator::array_type array;
3013 FOR_EACH_SUBRTX (iter, array, x, ALL)
3015 const_rtx x = *iter;
3016 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3017 return true;
3018 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3019 TLS offsets, not real symbol references. */
3020 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3021 iter.skip_subrtxes ();
3023 return false;
3027 static int
3028 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3030 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3031 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3033 if (*imm1 < *imm2)
3034 return -1;
3035 if (*imm1 > *imm2)
3036 return +1;
3037 return 0;
3041 static void
3042 aarch64_build_bitmask_table (void)
3044 unsigned HOST_WIDE_INT mask, imm;
3045 unsigned int log_e, e, s, r;
3046 unsigned int nimms = 0;
3048 for (log_e = 1; log_e <= 6; log_e++)
3050 e = 1 << log_e;
3051 if (e == 64)
3052 mask = ~(HOST_WIDE_INT) 0;
3053 else
3054 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3055 for (s = 1; s < e; s++)
3057 for (r = 0; r < e; r++)
3059 /* set s consecutive bits to 1 (s < 64) */
3060 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3061 /* rotate right by r */
3062 if (r != 0)
3063 imm = ((imm >> r) | (imm << (e - r))) & mask;
3064 /* replicate the constant depending on SIMD size */
3065 switch (log_e) {
3066 case 1: imm |= (imm << 2);
3067 case 2: imm |= (imm << 4);
3068 case 3: imm |= (imm << 8);
3069 case 4: imm |= (imm << 16);
3070 case 5: imm |= (imm << 32);
3071 case 6:
3072 break;
3073 default:
3074 gcc_unreachable ();
3076 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3077 aarch64_bitmasks[nimms++] = imm;
3082 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3083 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3084 aarch64_bitmasks_cmp);
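/* Illustrative standalone sketch, not used by the compiler: test whether
   VAL is a valid 64-bit logical (bitmask) immediate by regenerating
   candidates exactly as the table above is built -- S consecutive ones,
   rotated right by R within an element of size E, then replicated to 64
   bits.  Narrower modes would first be replicated to 64 bits, as
   aarch64_bitmask_imm does below.  The function name is hypothetical.  */

static int
example_is_bitmask_imm (unsigned long long val)
{
  unsigned int e, s, r;

  for (e = 2; e <= 64; e *= 2)
    {
      unsigned long long mask = (e == 64) ? ~0ull : (1ull << e) - 1;

      for (s = 1; s < e; s++)
        for (r = 0; r < e; r++)
          {
            /* S consecutive ones...  */
            unsigned long long imm = (1ull << s) - 1;
            unsigned int rep;

            /* ...rotated right by R within the element...  */
            if (r != 0)
              imm = ((imm >> r) | (imm << (e - r))) & mask;

            /* ...and replicated to fill the 64-bit register.  */
            for (rep = e; rep < 64; rep *= 2)
              imm |= imm << rep;

            if (imm == val)
              return 1;
          }
    }
  return 0;
}

/* For example, 0x00ff00ff00ff00ff (eight ones replicated at element
   size 16) is accepted, whereas 0x1234567812345678 is not.  */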
3088 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3089 a left shift of 0 or 12 bits. */
3090 bool
3091 aarch64_uimm12_shift (HOST_WIDE_INT val)
3093 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3094 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
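/* For example, 0xabc and 0xfff are accepted with a shift of 0, and
   0xabc000 with a shift of 12, while 0x1001 is rejected because its set
   bits straddle the two 12-bit fields.  */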
3099 /* Return true if val is an immediate that can be loaded into a
3100 register by a MOVZ instruction. */
3101 static bool
3102 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3104 if (GET_MODE_SIZE (mode) > 4)
3106 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3107 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3108 return 1;
3110 else
3112 /* Ignore sign extension. */
3113 val &= (HOST_WIDE_INT) 0xffffffff;
3115 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3116 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
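/* For example, 0x12340000 can be loaded with a single
   "movz w0, #0x1234, lsl #16" and so is accepted for SImode, whereas
   0x12340001 would also need a MOVK and is rejected here (it can still
   be synthesized by aarch64_expand_mov_immediate).  */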
3120 /* Return true if val is a valid bitmask immediate. */
3121 bool
3122 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3124 if (GET_MODE_SIZE (mode) < 8)
3126 /* Replicate bit pattern. */
3127 val &= (HOST_WIDE_INT) 0xffffffff;
3128 val |= val << 32;
3130 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3131 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3135 /* Return true if val is an immediate that can be loaded into a
3136 register in a single instruction. */
3137 bool
3138 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3140 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3141 return 1;
3142 return aarch64_bitmask_imm (val, mode);
3145 static bool
3146 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3148 rtx base, offset;
3150 if (GET_CODE (x) == HIGH)
3151 return true;
3153 split_const (x, &base, &offset);
3154 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3156 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3157 != SYMBOL_FORCE_TO_MEM)
3158 return true;
3159 else
3160 /* Avoid generating a 64-bit relocation in ILP32; leave it
3161 to aarch64_expand_mov_immediate to handle properly. */
3162 return mode != ptr_mode;
3165 return aarch64_tls_referenced_p (x);
3168 /* Return true if register REGNO is a valid index register.
3169 STRICT_P is true if REG_OK_STRICT is in effect. */
3171 bool
3172 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3174 if (!HARD_REGISTER_NUM_P (regno))
3176 if (!strict_p)
3177 return true;
3179 if (!reg_renumber)
3180 return false;
3182 regno = reg_renumber[regno];
3184 return GP_REGNUM_P (regno);
3187 /* Return true if register REGNO is a valid base register for mode MODE.
3188 STRICT_P is true if REG_OK_STRICT is in effect. */
3190 bool
3191 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3193 if (!HARD_REGISTER_NUM_P (regno))
3195 if (!strict_p)
3196 return true;
3198 if (!reg_renumber)
3199 return false;
3201 regno = reg_renumber[regno];
3204 /* The fake registers will be eliminated to either the stack or
3205 hard frame pointer, both of which are usually valid base registers.
3206 Reload deals with the cases where the eliminated form isn't valid. */
3207 return (GP_REGNUM_P (regno)
3208 || regno == SP_REGNUM
3209 || regno == FRAME_POINTER_REGNUM
3210 || regno == ARG_POINTER_REGNUM);
3213 /* Return true if X is a valid base register for mode MODE.
3214 STRICT_P is true if REG_OK_STRICT is in effect. */
3216 static bool
3217 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3219 if (!strict_p && GET_CODE (x) == SUBREG)
3220 x = SUBREG_REG (x);
3222 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3225 /* Return true if address offset is a valid index. If it is, fill in INFO
3226 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3228 static bool
3229 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3230 machine_mode mode, bool strict_p)
3232 enum aarch64_address_type type;
3233 rtx index;
3234 int shift;
3236 /* (reg:P) */
3237 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3238 && GET_MODE (x) == Pmode)
3240 type = ADDRESS_REG_REG;
3241 index = x;
3242 shift = 0;
3244 /* (sign_extend:DI (reg:SI)) */
3245 else if ((GET_CODE (x) == SIGN_EXTEND
3246 || GET_CODE (x) == ZERO_EXTEND)
3247 && GET_MODE (x) == DImode
3248 && GET_MODE (XEXP (x, 0)) == SImode)
3250 type = (GET_CODE (x) == SIGN_EXTEND)
3251 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3252 index = XEXP (x, 0);
3253 shift = 0;
3255 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3256 else if (GET_CODE (x) == MULT
3257 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3258 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3259 && GET_MODE (XEXP (x, 0)) == DImode
3260 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3261 && CONST_INT_P (XEXP (x, 1)))
3263 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3264 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3265 index = XEXP (XEXP (x, 0), 0);
3266 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3268 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3269 else if (GET_CODE (x) == ASHIFT
3270 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3271 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3272 && GET_MODE (XEXP (x, 0)) == DImode
3273 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3274 && CONST_INT_P (XEXP (x, 1)))
3276 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3277 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3278 index = XEXP (XEXP (x, 0), 0);
3279 shift = INTVAL (XEXP (x, 1));
3281 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == MULT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == MULT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3313 else if ((GET_CODE (x) == SIGN_EXTRACT
3314 || GET_CODE (x) == ZERO_EXTRACT)
3315 && GET_MODE (x) == DImode
3316 && GET_CODE (XEXP (x, 0)) == ASHIFT
3317 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3318 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3320 type = (GET_CODE (x) == SIGN_EXTRACT)
3321 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3322 index = XEXP (XEXP (x, 0), 0);
3323 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3324 if (INTVAL (XEXP (x, 1)) != 32 + shift
3325 || INTVAL (XEXP (x, 2)) != 0)
3326 shift = -1;
3328 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3329 (const_int 0xffffffff<<shift)) */
3330 else if (GET_CODE (x) == AND
3331 && GET_MODE (x) == DImode
3332 && GET_CODE (XEXP (x, 0)) == ASHIFT
3333 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3334 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3335 && CONST_INT_P (XEXP (x, 1)))
3337 type = ADDRESS_REG_UXTW;
3338 index = XEXP (XEXP (x, 0), 0);
3339 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3340 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3341 shift = -1;
3343 /* (mult:P (reg:P) (const_int scale)) */
3344 else if (GET_CODE (x) == MULT
3345 && GET_MODE (x) == Pmode
3346 && GET_MODE (XEXP (x, 0)) == Pmode
3347 && CONST_INT_P (XEXP (x, 1)))
3349 type = ADDRESS_REG_REG;
3350 index = XEXP (x, 0);
3351 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3353 /* (ashift:P (reg:P) (const_int shift)) */
3354 else if (GET_CODE (x) == ASHIFT
3355 && GET_MODE (x) == Pmode
3356 && GET_MODE (XEXP (x, 0)) == Pmode
3357 && CONST_INT_P (XEXP (x, 1)))
3359 type = ADDRESS_REG_REG;
3360 index = XEXP (x, 0);
3361 shift = INTVAL (XEXP (x, 1));
3363 else
3364 return false;
3366 if (GET_CODE (index) == SUBREG)
3367 index = SUBREG_REG (index);
3369 if ((shift == 0 ||
3370 (shift > 0 && shift <= 3
3371 && (1 << shift) == GET_MODE_SIZE (mode)))
3372 && REG_P (index)
3373 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3375 info->type = type;
3376 info->offset = index;
3377 info->shift = shift;
3378 return true;
3381 return false;
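/* As an illustration, the index expression

     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8))

   is classified as ADDRESS_REG_SXTW with shift 3; combined with a base
   register it corresponds to an address operand such as
   [x0, w1, sxtw #3] for an 8-byte access.  */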
3384 bool
3385 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3387 return (offset >= -64 * GET_MODE_SIZE (mode)
3388 && offset < 64 * GET_MODE_SIZE (mode)
3389 && offset % GET_MODE_SIZE (mode) == 0);
3392 static inline bool
3393 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3394 HOST_WIDE_INT offset)
3396 return offset >= -256 && offset < 256;
3399 static inline bool
3400 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3402 return (offset >= 0
3403 && offset < 4096 * GET_MODE_SIZE (mode)
3404 && offset % GET_MODE_SIZE (mode) == 0);
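/* Concretely, for an 8-byte (DImode) access the three predicates above
   accept, respectively:

     7-bit signed scaled:    multiples of 8 in [-512, 504]
     9-bit signed unscaled:  any offset in [-256, 255]
     12-bit unsigned scaled: multiples of 8 in [0, 32760]  */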
3407 /* Return true if X is a valid address for machine mode MODE. If it is,
3408 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3409 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3411 static bool
3412 aarch64_classify_address (struct aarch64_address_info *info,
3413 rtx x, machine_mode mode,
3414 RTX_CODE outer_code, bool strict_p)
3416 enum rtx_code code = GET_CODE (x);
3417 rtx op0, op1;
3419 /* On BE, we use load/store pair for all large int mode load/stores. */
3420 bool load_store_pair_p = (outer_code == PARALLEL
3421 || (BYTES_BIG_ENDIAN
3422 && aarch64_vect_struct_mode_p (mode)));
3424 bool allow_reg_index_p =
3425 !load_store_pair_p
3426 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3427 && !aarch64_vect_struct_mode_p (mode);
3429 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3430 REG addressing. */
3431 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3432 && (code != POST_INC && code != REG))
3433 return false;
3435 switch (code)
3437 case REG:
3438 case SUBREG:
3439 info->type = ADDRESS_REG_IMM;
3440 info->base = x;
3441 info->offset = const0_rtx;
3442 return aarch64_base_register_rtx_p (x, strict_p);
3444 case PLUS:
3445 op0 = XEXP (x, 0);
3446 op1 = XEXP (x, 1);
3448 if (! strict_p
3449 && REG_P (op0)
3450 && (op0 == virtual_stack_vars_rtx
3451 || op0 == frame_pointer_rtx
3452 || op0 == arg_pointer_rtx)
3453 && CONST_INT_P (op1))
3455 info->type = ADDRESS_REG_IMM;
3456 info->base = op0;
3457 info->offset = op1;
3459 return true;
3462 if (GET_MODE_SIZE (mode) != 0
3463 && CONST_INT_P (op1)
3464 && aarch64_base_register_rtx_p (op0, strict_p))
3466 HOST_WIDE_INT offset = INTVAL (op1);
3468 info->type = ADDRESS_REG_IMM;
3469 info->base = op0;
3470 info->offset = op1;
3472 /* TImode and TFmode values are allowed in both pairs of X
3473 registers and individual Q registers. The available
3474 address modes are:
3475 X,X: 7-bit signed scaled offset
3476 Q: 9-bit signed offset
3477 We conservatively require an offset representable in either mode.
3479 if (mode == TImode || mode == TFmode)
3480 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3481 && offset_9bit_signed_unscaled_p (mode, offset));
3483 /* A 7-bit offset check because OImode will emit an ldp/stp
3484 instruction (only big endian will get here).
3485 For ldp/stp instructions, the offset is scaled for the size of a
3486 single element of the pair. */
3487 if (mode == OImode)
3488 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3490 /* Three 9/12-bit offset checks because CImode will emit three
3491 ldr/str instructions (only big endian will get here). */
3492 if (mode == CImode)
3493 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3494 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3495 || offset_12bit_unsigned_scaled_p (V16QImode,
3496 offset + 32)));
3498 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3499 instructions (only big endian will get here). */
3500 if (mode == XImode)
3501 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3502 && aarch64_offset_7bit_signed_scaled_p (TImode,
3503 offset + 32));
3505 if (load_store_pair_p)
3506 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3507 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3508 else
3509 return (offset_9bit_signed_unscaled_p (mode, offset)
3510 || offset_12bit_unsigned_scaled_p (mode, offset));
3513 if (allow_reg_index_p)
3515 /* Look for base + (scaled/extended) index register. */
3516 if (aarch64_base_register_rtx_p (op0, strict_p)
3517 && aarch64_classify_index (info, op1, mode, strict_p))
3519 info->base = op0;
3520 return true;
3522 if (aarch64_base_register_rtx_p (op1, strict_p)
3523 && aarch64_classify_index (info, op0, mode, strict_p))
3525 info->base = op1;
3526 return true;
3530 return false;
3532 case POST_INC:
3533 case POST_DEC:
3534 case PRE_INC:
3535 case PRE_DEC:
3536 info->type = ADDRESS_REG_WB;
3537 info->base = XEXP (x, 0);
3538 info->offset = NULL_RTX;
3539 return aarch64_base_register_rtx_p (info->base, strict_p);
3541 case POST_MODIFY:
3542 case PRE_MODIFY:
3543 info->type = ADDRESS_REG_WB;
3544 info->base = XEXP (x, 0);
3545 if (GET_CODE (XEXP (x, 1)) == PLUS
3546 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3547 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3548 && aarch64_base_register_rtx_p (info->base, strict_p))
3550 HOST_WIDE_INT offset;
3551 info->offset = XEXP (XEXP (x, 1), 1);
3552 offset = INTVAL (info->offset);
3554 /* TImode and TFmode values are allowed in both pairs of X
3555 registers and individual Q registers. The available
3556 address modes are:
3557 X,X: 7-bit signed scaled offset
3558 Q: 9-bit signed offset
3559 We conservatively require an offset representable in either mode.
3561 if (mode == TImode || mode == TFmode)
3562 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3563 && offset_9bit_signed_unscaled_p (mode, offset));
3565 if (load_store_pair_p)
3566 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3567 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3568 else
3569 return offset_9bit_signed_unscaled_p (mode, offset);
3571 return false;
3573 case CONST:
3574 case SYMBOL_REF:
3575 case LABEL_REF:
3576 /* load literal: pc-relative constant pool entry. Only supported
3577 for SI mode or larger. */
3578 info->type = ADDRESS_SYMBOLIC;
3580 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3582 rtx sym, addend;
3584 split_const (x, &sym, &addend);
3585 return (GET_CODE (sym) == LABEL_REF
3586 || (GET_CODE (sym) == SYMBOL_REF
3587 && CONSTANT_POOL_ADDRESS_P (sym)));
3589 return false;
3591 case LO_SUM:
3592 info->type = ADDRESS_LO_SUM;
3593 info->base = XEXP (x, 0);
3594 info->offset = XEXP (x, 1);
3595 if (allow_reg_index_p
3596 && aarch64_base_register_rtx_p (info->base, strict_p))
3598 rtx sym, offs;
3599 split_const (info->offset, &sym, &offs);
3600 if (GET_CODE (sym) == SYMBOL_REF
3601 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3602 == SYMBOL_SMALL_ABSOLUTE))
3604 /* The symbol and offset must be aligned to the access size. */
3605 unsigned int align;
3606 unsigned int ref_size;
3608 if (CONSTANT_POOL_ADDRESS_P (sym))
3609 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3610 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3612 tree exp = SYMBOL_REF_DECL (sym);
3613 align = TYPE_ALIGN (TREE_TYPE (exp));
3614 align = CONSTANT_ALIGNMENT (exp, align);
3616 else if (SYMBOL_REF_DECL (sym))
3617 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3618 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3619 && SYMBOL_REF_BLOCK (sym) != NULL)
3620 align = SYMBOL_REF_BLOCK (sym)->alignment;
3621 else
3622 align = BITS_PER_UNIT;
3624 ref_size = GET_MODE_SIZE (mode);
3625 if (ref_size == 0)
3626 ref_size = GET_MODE_SIZE (DImode);
3628 return ((INTVAL (offs) & (ref_size - 1)) == 0
3629 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3632 return false;
3634 default:
3635 return false;
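/* As an illustration, for a DImode access the classification above
   accepts address forms such as:

     [x0]                      ADDRESS_REG_IMM, offset 0
     [x0, #504]                ADDRESS_REG_IMM, 12-bit scaled offset
     [x0, x1, lsl #3]          ADDRESS_REG_REG, shift 3
     [x0, w1, sxtw #3]         ADDRESS_REG_SXTW
     [x0, #16]! and [x0], #16  ADDRESS_REG_WB (pre/post-modify)
     a constant-pool literal   ADDRESS_SYMBOLIC  */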
3639 bool
3640 aarch64_symbolic_address_p (rtx x)
3642 rtx offset;
3644 split_const (x, &x, &offset);
3645 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3648 /* Classify the base of symbolic expression X, given that X appears in
3649 context CONTEXT. */
3651 enum aarch64_symbol_type
3652 aarch64_classify_symbolic_expression (rtx x,
3653 enum aarch64_symbol_context context)
3655 rtx offset;
3657 split_const (x, &x, &offset);
3658 return aarch64_classify_symbol (x, offset, context);
3662 /* Return TRUE if X is a legitimate address for accessing memory in
3663 mode MODE. */
3664 static bool
3665 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3667 struct aarch64_address_info addr;
3669 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3672 /* Return TRUE if X is a legitimate address for accessing memory in
3673 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3674 pair operation. */
3675 bool
3676 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3677 RTX_CODE outer_code, bool strict_p)
3679 struct aarch64_address_info addr;
3681 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3684 /* Return TRUE if rtx X is immediate constant 0.0 */
3685 bool
3686 aarch64_float_const_zero_rtx_p (rtx x)
3688 REAL_VALUE_TYPE r;
3690 if (GET_MODE (x) == VOIDmode)
3691 return false;
3693 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3694 if (REAL_VALUE_MINUS_ZERO (r))
3695 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3696 return REAL_VALUES_EQUAL (r, dconst0);
3699 /* Return the fixed registers used for condition codes. */
3701 static bool
3702 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3704 *p1 = CC_REGNUM;
3705 *p2 = INVALID_REGNUM;
3706 return true;
3709 /* Emit call insn with PAT and do aarch64-specific handling. */
3711 void
3712 aarch64_emit_call_insn (rtx pat)
3714 rtx insn = emit_call_insn (pat);
3716 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3717 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3718 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3721 machine_mode
3722 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3724 /* All floating point compares return CCFP if it is an equality
3725 comparison, and CCFPE otherwise. */
3726 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3728 switch (code)
3730 case EQ:
3731 case NE:
3732 case UNORDERED:
3733 case ORDERED:
3734 case UNLT:
3735 case UNLE:
3736 case UNGT:
3737 case UNGE:
3738 case UNEQ:
3739 case LTGT:
3740 return CCFPmode;
3742 case LT:
3743 case LE:
3744 case GT:
3745 case GE:
3746 return CCFPEmode;
3748 default:
3749 gcc_unreachable ();
3753 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3754 && y == const0_rtx
3755 && (code == EQ || code == NE || code == LT || code == GE)
3756 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3757 || GET_CODE (x) == NEG))
3758 return CC_NZmode;
3760 /* A compare with a shifted operand. Because of canonicalization,
3761 the comparison will have to be swapped when we emit the assembly
3762 code. */
3763 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3764 && (REG_P (y) || GET_CODE (y) == SUBREG)
3765 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3766 || GET_CODE (x) == LSHIFTRT
3767 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3768 return CC_SWPmode;
3770 /* Similarly for a negated operand, but we can only do this for
3771 equalities. */
3772 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3773 && (REG_P (y) || GET_CODE (y) == SUBREG)
3774 && (code == EQ || code == NE)
3775 && GET_CODE (x) == NEG)
3776 return CC_Zmode;
3778 /* A compare of a mode narrower than SI mode against zero can be done
3779 by extending the value in the comparison. */
3780 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3781 && y == const0_rtx)
3782 /* Only use sign-extension if we really need it. */
3783 return ((code == GT || code == GE || code == LE || code == LT)
3784 ? CC_SESWPmode : CC_ZESWPmode);
3786 /* For everything else, return CCmode. */
3787 return CCmode;
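/* As an illustration: comparing (plus:DI x y) against zero for EQ, NE,
   LT or GE selects CC_NZmode, since only the N and Z flags of an ADDS
   are needed and the compare can be folded into the addition.  A
   compare whose first operand is a shift or extension of a register
   selects CC_SWPmode, recording that canonicalization put the operands
   the "wrong" way round and the condition must be swapped when the
   instruction is output.  */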
3790 static int
3791 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3794 aarch64_get_condition_code (rtx x)
3796 machine_mode mode = GET_MODE (XEXP (x, 0));
3797 enum rtx_code comp_code = GET_CODE (x);
3799 if (GET_MODE_CLASS (mode) != MODE_CC)
3800 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3801 return aarch64_get_condition_code_1 (mode, comp_code);
3804 static int
3805 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3807 int ne = -1, eq = -1;
3808 switch (mode)
3810 case CCFPmode:
3811 case CCFPEmode:
3812 switch (comp_code)
3814 case GE: return AARCH64_GE;
3815 case GT: return AARCH64_GT;
3816 case LE: return AARCH64_LS;
3817 case LT: return AARCH64_MI;
3818 case NE: return AARCH64_NE;
3819 case EQ: return AARCH64_EQ;
3820 case ORDERED: return AARCH64_VC;
3821 case UNORDERED: return AARCH64_VS;
3822 case UNLT: return AARCH64_LT;
3823 case UNLE: return AARCH64_LE;
3824 case UNGT: return AARCH64_HI;
3825 case UNGE: return AARCH64_PL;
3826 default: return -1;
3828 break;
3830 case CC_DNEmode:
3831 ne = AARCH64_NE;
3832 eq = AARCH64_EQ;
3833 break;
3835 case CC_DEQmode:
3836 ne = AARCH64_EQ;
3837 eq = AARCH64_NE;
3838 break;
3840 case CC_DGEmode:
3841 ne = AARCH64_GE;
3842 eq = AARCH64_LT;
3843 break;
3845 case CC_DLTmode:
3846 ne = AARCH64_LT;
3847 eq = AARCH64_GE;
3848 break;
3850 case CC_DGTmode:
3851 ne = AARCH64_GT;
3852 eq = AARCH64_LE;
3853 break;
3855 case CC_DLEmode:
3856 ne = AARCH64_LE;
3857 eq = AARCH64_GT;
3858 break;
3860 case CC_DGEUmode:
3861 ne = AARCH64_CS;
3862 eq = AARCH64_CC;
3863 break;
3865 case CC_DLTUmode:
3866 ne = AARCH64_CC;
3867 eq = AARCH64_CS;
3868 break;
3870 case CC_DGTUmode:
3871 ne = AARCH64_HI;
3872 eq = AARCH64_LS;
3873 break;
3875 case CC_DLEUmode:
3876 ne = AARCH64_LS;
3877 eq = AARCH64_HI;
3878 break;
3880 case CCmode:
3881 switch (comp_code)
3883 case NE: return AARCH64_NE;
3884 case EQ: return AARCH64_EQ;
3885 case GE: return AARCH64_GE;
3886 case GT: return AARCH64_GT;
3887 case LE: return AARCH64_LE;
3888 case LT: return AARCH64_LT;
3889 case GEU: return AARCH64_CS;
3890 case GTU: return AARCH64_HI;
3891 case LEU: return AARCH64_LS;
3892 case LTU: return AARCH64_CC;
3893 default: return -1;
3895 break;
3897 case CC_SWPmode:
3898 case CC_ZESWPmode:
3899 case CC_SESWPmode:
3900 switch (comp_code)
3902 case NE: return AARCH64_NE;
3903 case EQ: return AARCH64_EQ;
3904 case GE: return AARCH64_LE;
3905 case GT: return AARCH64_LT;
3906 case LE: return AARCH64_GE;
3907 case LT: return AARCH64_GT;
3908 case GEU: return AARCH64_LS;
3909 case GTU: return AARCH64_CC;
3910 case LEU: return AARCH64_CS;
3911 case LTU: return AARCH64_HI;
3912 default: return -1;
3914 break;
3916 case CC_NZmode:
3917 switch (comp_code)
3919 case NE: return AARCH64_NE;
3920 case EQ: return AARCH64_EQ;
3921 case GE: return AARCH64_PL;
3922 case LT: return AARCH64_MI;
3923 default: return -1;
3925 break;
3927 case CC_Zmode:
3928 switch (comp_code)
3930 case NE: return AARCH64_NE;
3931 case EQ: return AARCH64_EQ;
3932 default: return -1;
3934 break;
3936 default:
3937 return -1;
3938 break;
3941 if (comp_code == NE)
3942 return ne;
3944 if (comp_code == EQ)
3945 return eq;
3947 return -1;
3950 bool
3951 aarch64_const_vec_all_same_in_range_p (rtx x,
3952 HOST_WIDE_INT minval,
3953 HOST_WIDE_INT maxval)
3955 HOST_WIDE_INT firstval;
3956 int count, i;
3958 if (GET_CODE (x) != CONST_VECTOR
3959 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3960 return false;
3962 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3963 if (firstval < minval || firstval > maxval)
3964 return false;
3966 count = CONST_VECTOR_NUNITS (x);
3967 for (i = 1; i < count; i++)
3968 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3969 return false;
3971 return true;
3974 bool
3975 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3977 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3980 static unsigned
3981 bit_count (unsigned HOST_WIDE_INT value)
3983 unsigned count = 0;
3985 while (value)
3987 count++;
3988 value &= value - 1;
3991 return count;
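/* The loop above is the classic trick: "value &= value - 1" clears the
   lowest set bit on each iteration, so e.g. 0b101100 takes exactly
   three iterations (101100 -> 101000 -> 100000 -> 0).  */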
3994 /* N Z C V. */
3995 #define AARCH64_CC_V 1
3996 #define AARCH64_CC_C (1 << 1)
3997 #define AARCH64_CC_Z (1 << 2)
3998 #define AARCH64_CC_N (1 << 3)
4000 /* N Z C V flags for ccmp. The first code is for AND op and the other
4001 is for IOR op. Indexed by AARCH64_COND_CODE. */
4002 static const int aarch64_nzcv_codes[][2] =
4004 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4005 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4006 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4007 {0, AARCH64_CC_C}, /* CC, C == 0. */
4008 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4009 {0, AARCH64_CC_N}, /* PL, N == 0. */
4010 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4011 {0, AARCH64_CC_V}, /* VC, V == 0. */
4012 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4013 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4014 {0, AARCH64_CC_V}, /* GE, N == V. */
4015 {AARCH64_CC_V, 0}, /* LT, N != V. */
4016 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4017 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4018 {0, 0}, /* AL, Any. */
4019 {0, 0}, /* NV, Any. */
4023 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4025 switch (mode)
4027 case CC_DNEmode:
4028 return NE;
4030 case CC_DEQmode:
4031 return EQ;
4033 case CC_DLEmode:
4034 return LE;
4036 case CC_DGTmode:
4037 return GT;
4039 case CC_DLTmode:
4040 return LT;
4042 case CC_DGEmode:
4043 return GE;
4045 case CC_DLEUmode:
4046 return LEU;
4048 case CC_DGTUmode:
4049 return GTU;
4051 case CC_DLTUmode:
4052 return LTU;
4054 case CC_DGEUmode:
4055 return GEU;
4057 default:
4058 gcc_unreachable ();
4063 void
4064 aarch64_print_operand (FILE *f, rtx x, char code)
4066 switch (code)
4068 /* An integer or symbol address without a preceding # sign. */
4069 case 'c':
4070 switch (GET_CODE (x))
4072 case CONST_INT:
4073 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4074 break;
4076 case SYMBOL_REF:
4077 output_addr_const (f, x);
4078 break;
4080 case CONST:
4081 if (GET_CODE (XEXP (x, 0)) == PLUS
4082 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4084 output_addr_const (f, x);
4085 break;
4087 /* Fall through. */
4089 default:
4090 output_operand_lossage ("Unsupported operand for code '%c'", code);
4092 break;
4094 case 'e':
4095 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4097 int n;
4099 if (!CONST_INT_P (x)
4100 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4102 output_operand_lossage ("invalid operand for '%%%c'", code);
4103 return;
4106 switch (n)
4108 case 3:
4109 fputc ('b', f);
4110 break;
4111 case 4:
4112 fputc ('h', f);
4113 break;
4114 case 5:
4115 fputc ('w', f);
4116 break;
4117 default:
4118 output_operand_lossage ("invalid operand for '%%%c'", code);
4119 return;
4122 break;
4124 case 'p':
4126 int n;
4128 /* Print N such that 2^N == X. */
4129 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4131 output_operand_lossage ("invalid operand for '%%%c'", code);
4132 return;
4135 asm_fprintf (f, "%d", n);
4137 break;
4139 case 'P':
4140 /* Print the number of non-zero bits in X (a const_int). */
4141 if (!CONST_INT_P (x))
4143 output_operand_lossage ("invalid operand for '%%%c'", code);
4144 return;
4147 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4148 break;
4150 case 'H':
4151 /* Print the higher numbered register of a pair (TImode) of regs. */
4152 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4154 output_operand_lossage ("invalid operand for '%%%c'", code);
4155 return;
4158 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4159 break;
4161 case 'm':
4163 int cond_code;
4164 /* Print a condition (eq, ne, etc). */
4166 /* CONST_TRUE_RTX means always -- that's the default. */
4167 if (x == const_true_rtx)
4168 return;
4170 if (!COMPARISON_P (x))
4172 output_operand_lossage ("invalid operand for '%%%c'", code);
4173 return;
4176 cond_code = aarch64_get_condition_code (x);
4177 gcc_assert (cond_code >= 0);
4178 fputs (aarch64_condition_codes[cond_code], f);
4180 break;
4182 case 'M':
4184 int cond_code;
4185 /* Print the inverse of a condition (eq <-> ne, etc). */
4187 /* CONST_TRUE_RTX means never -- that's the default. */
4188 if (x == const_true_rtx)
4190 fputs ("nv", f);
4191 return;
4194 if (!COMPARISON_P (x))
4196 output_operand_lossage ("invalid operand for '%%%c'", code);
4197 return;
4199 cond_code = aarch64_get_condition_code (x);
4200 gcc_assert (cond_code >= 0);
4201 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4202 (cond_code)], f);
4204 break;
4206 case 'b':
4207 case 'h':
4208 case 's':
4209 case 'd':
4210 case 'q':
4211 /* Print a scalar FP/SIMD register name. */
4212 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4214 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4215 return;
4217 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4218 break;
4220 case 'S':
4221 case 'T':
4222 case 'U':
4223 case 'V':
4224 /* Print the first FP/SIMD register name in a list. */
4225 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4227 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4228 return;
4230 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4231 break;
4233 case 'R':
4234 /* Print a scalar FP/SIMD register name + 1. */
4235 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4237 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4238 return;
4240 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4241 break;
4243 case 'X':
4244 /* Print bottom 16 bits of integer constant in hex. */
4245 if (!CONST_INT_P (x))
4247 output_operand_lossage ("invalid operand for '%%%c'", code);
4248 return;
4250 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4251 break;
4253 case 'w':
4254 case 'x':
4255 /* Print a general register name or the zero register (32-bit or
4256 64-bit). */
4257 if (x == const0_rtx
4258 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4260 asm_fprintf (f, "%czr", code);
4261 break;
4264 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4266 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4267 break;
4270 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4272 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4273 break;
4276 /* Fall through */
4278 case 0:
4279 /* Print a normal operand, if it's a general register, then we
4280 assume DImode. */
4281 if (x == NULL)
4283 output_operand_lossage ("missing operand");
4284 return;
4287 switch (GET_CODE (x))
4289 case REG:
4290 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4291 break;
4293 case MEM:
4294 aarch64_memory_reference_mode = GET_MODE (x);
4295 output_address (XEXP (x, 0));
4296 break;
4298 case LABEL_REF:
4299 case SYMBOL_REF:
4300 output_addr_const (asm_out_file, x);
4301 break;
4303 case CONST_INT:
4304 asm_fprintf (f, "%wd", INTVAL (x));
4305 break;
4307 case CONST_VECTOR:
4308 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4310 gcc_assert (
4311 aarch64_const_vec_all_same_in_range_p (x,
4312 HOST_WIDE_INT_MIN,
4313 HOST_WIDE_INT_MAX));
4314 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4316 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4318 fputc ('0', f);
4320 else
4321 gcc_unreachable ();
4322 break;
4324 case CONST_DOUBLE:
4325 /* CONST_DOUBLE can represent a double-width integer.
4326 In this case, the mode of x is VOIDmode. */
4327 if (GET_MODE (x) == VOIDmode)
4328 ; /* Do Nothing. */
4329 else if (aarch64_float_const_zero_rtx_p (x))
4331 fputc ('0', f);
4332 break;
4334 else if (aarch64_float_const_representable_p (x))
4336 #define buf_size 20
4337 char float_buf[buf_size] = {'\0'};
4338 REAL_VALUE_TYPE r;
4339 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4340 real_to_decimal_for_mode (float_buf, &r,
4341 buf_size, buf_size,
4342 1, GET_MODE (x));
4343 asm_fprintf (asm_out_file, "%s", float_buf);
4344 break;
4345 #undef buf_size
4347 output_operand_lossage ("invalid constant");
4348 return;
4349 default:
4350 output_operand_lossage ("invalid operand");
4351 return;
4353 break;
4355 case 'A':
4356 if (GET_CODE (x) == HIGH)
4357 x = XEXP (x, 0);
4359 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4361 case SYMBOL_SMALL_GOT:
4362 asm_fprintf (asm_out_file, ":got:");
4363 break;
4365 case SYMBOL_SMALL_TLSGD:
4366 asm_fprintf (asm_out_file, ":tlsgd:");
4367 break;
4369 case SYMBOL_SMALL_TLSDESC:
4370 asm_fprintf (asm_out_file, ":tlsdesc:");
4371 break;
4373 case SYMBOL_SMALL_GOTTPREL:
4374 asm_fprintf (asm_out_file, ":gottprel:");
4375 break;
4377 case SYMBOL_SMALL_TPREL:
4378 asm_fprintf (asm_out_file, ":tprel:");
4379 break;
4381 case SYMBOL_TINY_GOT:
4382 gcc_unreachable ();
4383 break;
4385 default:
4386 break;
4388 output_addr_const (asm_out_file, x);
4389 break;
4391 case 'L':
4392 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4394 case SYMBOL_SMALL_GOT:
4395 asm_fprintf (asm_out_file, ":lo12:");
4396 break;
4398 case SYMBOL_SMALL_TLSGD:
4399 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4400 break;
4402 case SYMBOL_SMALL_TLSDESC:
4403 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4404 break;
4406 case SYMBOL_SMALL_GOTTPREL:
4407 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4408 break;
4410 case SYMBOL_SMALL_TPREL:
4411 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4412 break;
4414 case SYMBOL_TINY_GOT:
4415 asm_fprintf (asm_out_file, ":got:");
4416 break;
4418 default:
4419 break;
4421 output_addr_const (asm_out_file, x);
4422 break;
4424 case 'G':
4426 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4428 case SYMBOL_SMALL_TPREL:
4429 asm_fprintf (asm_out_file, ":tprel_hi12:");
4430 break;
4431 default:
4432 break;
4434 output_addr_const (asm_out_file, x);
4435 break;
4437 case 'K':
4439 int cond_code;
4440 /* Print nzcv. */
4442 if (!COMPARISON_P (x))
4444 output_operand_lossage ("invalid operand for '%%%c'", code);
4445 return;
4448 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4449 gcc_assert (cond_code >= 0);
4450 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4452 break;
4454 case 'k':
4456 int cond_code;
4457 /* Print nzcv. */
4459 if (!COMPARISON_P (x))
4461 output_operand_lossage ("invalid operand for '%%%c'", code);
4462 return;
4465 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4466 gcc_assert (cond_code >= 0);
4467 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4469 break;
4471 default:
4472 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4473 return;
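/* For reference, the address syntaxes printed below look like the
   following (base/index registers and offsets are arbitrary examples):

     ADDRESS_REG_IMM    [x0]             [x0, 16]
     ADDRESS_REG_REG    [x0, x1]         [x0, x1, lsl 3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw]   [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw]   [x0, w1, sxtw 2]
     ADDRESS_REG_WB     [x0, 16]!        [x0], 16
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]  */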
4477 void
4478 aarch64_print_operand_address (FILE *f, rtx x)
4480 struct aarch64_address_info addr;
4482 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4483 MEM, true))
4484 switch (addr.type)
4486 case ADDRESS_REG_IMM:
4487 if (addr.offset == const0_rtx)
4488 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4489 else
4490 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4491 INTVAL (addr.offset));
4492 return;
4494 case ADDRESS_REG_REG:
4495 if (addr.shift == 0)
4496 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4497 reg_names [REGNO (addr.offset)]);
4498 else
4499 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4500 reg_names [REGNO (addr.offset)], addr.shift);
4501 return;
4503 case ADDRESS_REG_UXTW:
4504 if (addr.shift == 0)
4505 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4506 REGNO (addr.offset) - R0_REGNUM);
4507 else
4508 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4509 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4510 return;
4512 case ADDRESS_REG_SXTW:
4513 if (addr.shift == 0)
4514 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4515 REGNO (addr.offset) - R0_REGNUM);
4516 else
4517 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4518 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4519 return;
4521 case ADDRESS_REG_WB:
4522 switch (GET_CODE (x))
4524 case PRE_INC:
4525 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4526 GET_MODE_SIZE (aarch64_memory_reference_mode));
4527 return;
4528 case POST_INC:
4529 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4530 GET_MODE_SIZE (aarch64_memory_reference_mode));
4531 return;
4532 case PRE_DEC:
4533 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4534 GET_MODE_SIZE (aarch64_memory_reference_mode));
4535 return;
4536 case POST_DEC:
4537 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4538 GET_MODE_SIZE (aarch64_memory_reference_mode));
4539 return;
4540 case PRE_MODIFY:
4541 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4542 INTVAL (addr.offset));
4543 return;
4544 case POST_MODIFY:
4545 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4546 INTVAL (addr.offset));
4547 return;
4548 default:
4549 break;
4551 break;
4553 case ADDRESS_LO_SUM:
4554 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4555 output_addr_const (f, addr.offset);
4556 asm_fprintf (f, "]");
4557 return;
4559 case ADDRESS_SYMBOLIC:
4560 break;
4563 output_addr_const (f, x);
4566 bool
4567 aarch64_label_mentioned_p (rtx x)
4569 const char *fmt;
4570 int i;
4572 if (GET_CODE (x) == LABEL_REF)
4573 return true;
4575 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4576 referencing instruction, but they are constant offsets, not
4577 symbols. */
4578 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4579 return false;
4581 fmt = GET_RTX_FORMAT (GET_CODE (x));
4582 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4584 if (fmt[i] == 'E')
4586 int j;
4588 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4589 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4590 return 1;
4592 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4593 return 1;
4596 return 0;
4599 /* Implement REGNO_REG_CLASS. */
4601 enum reg_class
4602 aarch64_regno_regclass (unsigned regno)
4604 if (GP_REGNUM_P (regno))
4605 return GENERAL_REGS;
4607 if (regno == SP_REGNUM)
4608 return STACK_REG;
4610 if (regno == FRAME_POINTER_REGNUM
4611 || regno == ARG_POINTER_REGNUM)
4612 return POINTER_REGS;
4614 if (FP_REGNUM_P (regno))
4615 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4617 return NO_REGS;
4620 static rtx
4621 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4623 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4624 where mask is selected by alignment and size of the offset.
4625 We try to pick as large a range for the offset as possible to
4626 maximize the chance of a CSE. However, for aligned addresses
4627 we limit the range to 4k so that structures with different sized
4628 elements are likely to use the same base. */
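/* As a worked example (the constants are illustrative): an aligned SImode
   access to X + 0x13004 is split as

     base = X + 0x13000    (offset & ~0xfff)
     addr = base + 0x4

   so that other accesses in the same 4k window, X + 0x13000 to
   X + 0x13fff, can CSE the base computation.  */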
4630 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4632 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4633 HOST_WIDE_INT base_offset;
4635 /* Does it look like we'll need a load/store-pair operation? */
4636 if (GET_MODE_SIZE (mode) > 16
4637 || mode == TImode)
4638 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4639 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4640 /* For offsets that aren't a multiple of the access size, the limit is
4641 -256...255. */
4642 else if (offset & (GET_MODE_SIZE (mode) - 1))
4643 base_offset = (offset + 0x100) & ~0x1ff;
4644 else
4645 base_offset = offset & ~0xfff;
4647 if (base_offset == 0)
4648 return x;
4650 offset -= base_offset;
4651 rtx base_reg = gen_reg_rtx (Pmode);
4652 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4653 NULL_RTX);
4654 emit_move_insn (base_reg, val);
4655 x = plus_constant (Pmode, base_reg, offset);
4658 return x;
4661 /* Try a machine-dependent way of reloading an illegitimate address
4662 operand. If we find one, push the reload and return the new rtx. */
4665 aarch64_legitimize_reload_address (rtx *x_p,
4666 machine_mode mode,
4667 int opnum, int type,
4668 int ind_levels ATTRIBUTE_UNUSED)
4670 rtx x = *x_p;
4672 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4673 if (aarch64_vect_struct_mode_p (mode)
4674 && GET_CODE (x) == PLUS
4675 && REG_P (XEXP (x, 0))
4676 && CONST_INT_P (XEXP (x, 1)))
4678 rtx orig_rtx = x;
4679 x = copy_rtx (x);
4680 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4681 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4682 opnum, (enum reload_type) type);
4683 return x;
4686 /* We must recognize output that we have already generated ourselves. */
4687 if (GET_CODE (x) == PLUS
4688 && GET_CODE (XEXP (x, 0)) == PLUS
4689 && REG_P (XEXP (XEXP (x, 0), 0))
4690 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4691 && CONST_INT_P (XEXP (x, 1)))
4693 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4694 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4695 opnum, (enum reload_type) type);
4696 return x;
4699 /* We wish to handle large displacements off a base register by splitting
4700 the addend across an add and the mem insn. This can cut the number of
4701 extra insns needed from 3 to 1. It is only useful for load/store of a
4702 single register with 12 bit offset field. */
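/* As a rough sketch of the transformation (register names are
   placeholders): for an SImode access at BASE + 0x13004 we have
   val = 0x13004, low = 0x004, high = 0x13000.  HIGH fits a shifted
   12-bit immediate, so reload ends up emitting approximately

     add  scratch, base, #0x13000
     ldr  w0, [scratch, #4]

   i.e. a single extra ADD instead of materialising the whole constant.  */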
4703 if (GET_CODE (x) == PLUS
4704 && REG_P (XEXP (x, 0))
4705 && CONST_INT_P (XEXP (x, 1))
4706 && HARD_REGISTER_P (XEXP (x, 0))
4707 && mode != TImode
4708 && mode != TFmode
4709 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4711 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4712 HOST_WIDE_INT low = val & 0xfff;
4713 HOST_WIDE_INT high = val - low;
4714 HOST_WIDE_INT offs;
4715 rtx cst;
4716 machine_mode xmode = GET_MODE (x);
4718 /* In ILP32, xmode can be either DImode or SImode. */
4719 gcc_assert (xmode == DImode || xmode == SImode);
4721 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4722 BLKmode alignment. */
4723 if (GET_MODE_SIZE (mode) == 0)
4724 return NULL_RTX;
4726 offs = low % GET_MODE_SIZE (mode);
4728 /* Align misaligned offset by adjusting high part to compensate. */
4729 if (offs != 0)
4731 if (aarch64_uimm12_shift (high + offs))
4733 /* Align down. */
4734 low = low - offs;
4735 high = high + offs;
4737 else
4739 /* Align up. */
4740 offs = GET_MODE_SIZE (mode) - offs;
4741 low = low + offs;
4742 high = high + (low & 0x1000) - offs;
4743 low &= 0xfff;
4747 /* Check for overflow. */
4748 if (high + low != val)
4749 return NULL_RTX;
4751 cst = GEN_INT (high);
4752 if (!aarch64_uimm12_shift (high))
4753 cst = force_const_mem (xmode, cst);
4755 /* Reload high part into base reg, leaving the low part
4756 in the mem instruction.
4757 Note that replacing this gen_rtx_PLUS with plus_constant is
4758 wrong in this case because we rely on the
4759 (plus (plus reg c1) c2) structure being preserved so that
4760 XEXP (*p, 0) in push_reload below uses the correct term. */
4761 x = gen_rtx_PLUS (xmode,
4762 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4763 GEN_INT (low));
4765 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4766 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4767 opnum, (enum reload_type) type);
4768 return x;
4771 return NULL_RTX;
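/* As an illustration of the secondary-reload case handled first below:
   without TARGET_SIMD there is no direct instruction to copy a TImode or
   TFmode value between two FP/SIMD (Q) registers, so we return NO_REGS
   and point reload at the aarch64_reload_movti / aarch64_reload_movtf
   expanders, which perform the copy via a scratch register.  */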
4775 static reg_class_t
4776 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4777 reg_class_t rclass,
4778 machine_mode mode,
4779 secondary_reload_info *sri)
4781 /* Without the TARGET_SIMD instructions we cannot move a Q register
4782 to a Q register directly. We need a scratch. */
4783 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4784 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4785 && reg_class_subset_p (rclass, FP_REGS))
4787 if (mode == TFmode)
4788 sri->icode = CODE_FOR_aarch64_reload_movtf;
4789 else if (mode == TImode)
4790 sri->icode = CODE_FOR_aarch64_reload_movti;
4791 return NO_REGS;
4794 /* A TFmode or TImode memory access should be handled via an FP_REGS
4795 because AArch64 has richer addressing modes for LDR/STR instructions
4796 than LDP/STP instructions. */
4797 if (TARGET_FLOAT && rclass == GENERAL_REGS
4798 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4799 return FP_REGS;
4801 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4802 return GENERAL_REGS;
4804 return NO_REGS;
4807 static bool
4808 aarch64_can_eliminate (const int from, const int to)
4810 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4811 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4813 if (frame_pointer_needed)
4815 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4816 return true;
4817 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4818 return false;
4819 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4820 && !cfun->calls_alloca)
4821 return true;
4822 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4823 return true;
4825 return false;
4827 else
4829 /* If we decided that we didn't need a leaf frame pointer but then used
4830 LR in the function, then we'll want a frame pointer after all, so
4831 prevent this elimination to ensure a frame pointer is used. */
4832 if (to == STACK_POINTER_REGNUM
4833 && flag_omit_leaf_frame_pointer
4834 && df_regs_ever_live_p (LR_REGNUM))
4835 return false;
4838 return true;
4841 HOST_WIDE_INT
4842 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4844 aarch64_layout_frame ();
4846 if (to == HARD_FRAME_POINTER_REGNUM)
4848 if (from == ARG_POINTER_REGNUM)
4849 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4851 if (from == FRAME_POINTER_REGNUM)
4852 return (cfun->machine->frame.hard_fp_offset
4853 - cfun->machine->frame.saved_varargs_size);
4856 if (to == STACK_POINTER_REGNUM)
4858 if (from == FRAME_POINTER_REGNUM)
4859 return (cfun->machine->frame.frame_size
4860 - cfun->machine->frame.saved_varargs_size);
4863 return cfun->machine->frame.frame_size;
4866 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4867 previous frame. */
4870 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4872 if (count != 0)
4873 return const0_rtx;
4874 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
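/* For reference, the LP64 trampoline emitted below is laid out as

      0:  ldr  x17, .+16     load the target function address
      4:  ldr  x18, .+20     load the static chain value
      8:  br   x17
     12:  <4 bytes of padding>
     16:  <target function address>
     24:  <static chain value>

   aarch64_trampoline_init copies the 16 code bytes and then stores the two
   data words at offsets 16 and 16 + POINTER_BYTES; the ILP32 variant loads
   w registers and uses 4-byte data slots.  */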
4878 static void
4879 aarch64_asm_trampoline_template (FILE *f)
4881 if (TARGET_ILP32)
4883 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4884 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4886 else
4888 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4889 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4891 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4892 assemble_aligned_integer (4, const0_rtx);
4893 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4894 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4897 static void
4898 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4900 rtx fnaddr, mem, a_tramp;
4901 const int tramp_code_sz = 16;
4903 /* Don't need to copy the trailing D-words; we fill those in below. */
4904 emit_block_move (m_tramp, assemble_trampoline_template (),
4905 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4906 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4907 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4908 if (GET_MODE (fnaddr) != ptr_mode)
4909 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4910 emit_move_insn (mem, fnaddr);
4912 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4913 emit_move_insn (mem, chain_value);
4915 /* XXX We should really define a "clear_cache" pattern and use
4916 gen_clear_cache(). */
4917 a_tramp = XEXP (m_tramp, 0);
4918 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4919 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4920 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4921 ptr_mode);
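/* As illustrative examples of the counts computed below: a 16-byte TImode
   value occupies two registers in any of the classes in the first group
   (GET_MODE_SIZE / UNITS_PER_WORD), whereas a 16-byte V4SImode vector
   occupies a single FP/SIMD register (GET_MODE_SIZE / UNITS_PER_VREG),
   and STACK_REG always counts as one.  */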
4924 static unsigned char
4925 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4927 switch (regclass)
4929 case CALLER_SAVE_REGS:
4930 case POINTER_REGS:
4931 case GENERAL_REGS:
4932 case ALL_REGS:
4933 case FP_REGS:
4934 case FP_LO_REGS:
4935 return
4936 aarch64_vector_mode_p (mode)
4937 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
4938 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4939 case STACK_REG:
4940 return 1;
4942 case NO_REGS:
4943 return 0;
4945 default:
4946 break;
4948 gcc_unreachable ();
4951 static reg_class_t
4952 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4954 if (regclass == POINTER_REGS)
4955 return GENERAL_REGS;
4957 if (regclass == STACK_REG)
4959 if (REG_P(x)
4960 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4961 return regclass;
4963 return NO_REGS;
4966 /* If it's an integer immediate that MOVI can't handle, then
4967 FP_REGS is not an option, so we return NO_REGS instead. */
4968 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4969 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4970 return NO_REGS;
4972 /* Register elimination can result in a request for
4973 SP+constant->FP_REGS. We cannot support such operations, which
4974 use SP as source and an FP_REG as destination, so reject them
4975 right away. */
4976 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4978 rtx lhs = XEXP (x, 0);
4980 /* Look through a possible SUBREG introduced by ILP32. */
4981 if (GET_CODE (lhs) == SUBREG)
4982 lhs = SUBREG_REG (lhs);
4984 gcc_assert (REG_P (lhs));
4985 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4986 POINTER_REGS));
4987 return NO_REGS;
4990 return regclass;
4993 void
4994 aarch64_asm_output_labelref (FILE* f, const char *name)
4996 asm_fprintf (f, "%U%s", name);
4999 static void
5000 aarch64_elf_asm_constructor (rtx symbol, int priority)
5002 if (priority == DEFAULT_INIT_PRIORITY)
5003 default_ctor_section_asm_out_constructor (symbol, priority);
5004 else
5006 section *s;
5007 char buf[18];
5008 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5009 s = get_section (buf, SECTION_WRITE, NULL);
5010 switch_to_section (s);
5011 assemble_align (POINTER_SIZE);
5012 assemble_aligned_integer (POINTER_BYTES, symbol);
5016 static void
5017 aarch64_elf_asm_destructor (rtx symbol, int priority)
5019 if (priority == DEFAULT_INIT_PRIORITY)
5020 default_dtor_section_asm_out_destructor (symbol, priority);
5021 else
5023 section *s;
5024 char buf[18];
5025 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5026 s = get_section (buf, SECTION_WRITE, NULL);
5027 switch_to_section (s);
5028 assemble_align (POINTER_SIZE);
5029 assemble_aligned_integer (POINTER_BYTES, symbol);
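/* For illustration, with a 2-byte (HImode) dispatch table the sequence
   emitted by aarch64_output_casesi is approximately

     ldrh  %w3, [%0, %w1, uxtw #1]
     adr   %4, .Lrtx<N>
     add   %3, %4, %w3, sxth #2
     br    %3
   .Lrtx<N>:

   where <N> is the label number of operands[2] and the table entries hold
   each target's offset from .Lrtx<N>, pre-scaled to match the final left
   shift by 2.  */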
5033 const char*
5034 aarch64_output_casesi (rtx *operands)
5036 char buf[100];
5037 char label[100];
5038 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5039 int index;
5040 static const char *const patterns[4][2] =
5043 "ldrb\t%w3, [%0,%w1,uxtw]",
5044 "add\t%3, %4, %w3, sxtb #2"
5047 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5048 "add\t%3, %4, %w3, sxth #2"
5051 "ldr\t%w3, [%0,%w1,uxtw #2]",
5052 "add\t%3, %4, %w3, sxtw #2"
5054 /* We assume that DImode is only generated when not optimizing and
5055 that we don't really need 64-bit address offsets. That would
5056 imply an object file with 8GB of code in a single function! */
5058 "ldr\t%w3, [%0,%w1,uxtw #2]",
5059 "add\t%3, %4, %w3, sxtw #2"
5063 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5065 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5067 gcc_assert (index >= 0 && index <= 3);
5069 /* Need to implement table size reduction, by changing the code below. */
5070 output_asm_insn (patterns[index][0], operands);
5071 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5072 snprintf (buf, sizeof (buf),
5073 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5074 output_asm_insn (buf, operands);
5075 output_asm_insn (patterns[index][1], operands);
5076 output_asm_insn ("br\t%3", operands);
5077 assemble_label (asm_out_file, label);
5078 return "";
5082 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5083 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5084 operator. */
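/* For example, aarch64_uxt_size (1, 0x1fe) is 8: the mask is 0xff shifted
   left by one, i.e. an operand zero-extended from 8 bits and then shifted
   (uxtb #1).  Similarly aarch64_uxt_size (0, 0xffffffff) is 32 (a plain
   uxtw), and any mask that is not a shifted 8/16/32-bit block gives 0.  */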
5087 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5089 if (shift >= 0 && shift <= 3)
5091 int size;
5092 for (size = 8; size <= 32; size *= 2)
5094 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5095 if (mask == bits << shift)
5096 return size;
5099 return 0;
5102 static bool
5103 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5104 const_rtx x ATTRIBUTE_UNUSED)
5106 /* We can't use blocks for constants when we're using a per-function
5107 constant pool. */
5108 return false;
5111 static section *
5112 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5113 rtx x ATTRIBUTE_UNUSED,
5114 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5116 /* Force all constant pool entries into the current function section. */
5117 return function_section (current_function_decl);
5121 /* Costs. */
5123 /* Helper function for rtx cost calculation. Strip a shift expression
5124 from X. Returns the inner operand if successful, or the original
5125 expression on failure. */
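/* For example, both (ashift (reg X) (const_int 3)) and its canonical
   multiply form (mult (reg X) (const_int 8)) strip to (reg X), since
   either can be folded into the shifted-operand field of an ALU
   instruction.  */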
5126 static rtx
5127 aarch64_strip_shift (rtx x)
5129 rtx op = x;
5131 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5132 we can convert both to ROR during final output. */
5133 if ((GET_CODE (op) == ASHIFT
5134 || GET_CODE (op) == ASHIFTRT
5135 || GET_CODE (op) == LSHIFTRT
5136 || GET_CODE (op) == ROTATERT
5137 || GET_CODE (op) == ROTATE)
5138 && CONST_INT_P (XEXP (op, 1)))
5139 return XEXP (op, 0);
5141 if (GET_CODE (op) == MULT
5142 && CONST_INT_P (XEXP (op, 1))
5143 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5144 return XEXP (op, 0);
5146 return x;
5149 /* Helper function for rtx cost calculation. Strip an extend
5150 expression from X. Returns the inner operand if successful, or the
5151 original expression on failure. We deal with a number of possible
5152 canonicalization variations here. */
5153 static rtx
5154 aarch64_strip_extend (rtx x)
5156 rtx op = x;
5158 /* Zero and sign extraction of a widened value. */
5159 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5160 && XEXP (op, 2) == const0_rtx
5161 && GET_CODE (XEXP (op, 0)) == MULT
5162 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5163 XEXP (op, 1)))
5164 return XEXP (XEXP (op, 0), 0);
5166 /* It can also be represented (for zero-extend) as an AND with an
5167 immediate. */
5168 if (GET_CODE (op) == AND
5169 && GET_CODE (XEXP (op, 0)) == MULT
5170 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5171 && CONST_INT_P (XEXP (op, 1))
5172 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5173 INTVAL (XEXP (op, 1))) != 0)
5174 return XEXP (XEXP (op, 0), 0);
5176 /* Now handle extended register, as this may also have an optional
5177 left shift by 1..4. */
5178 if (GET_CODE (op) == ASHIFT
5179 && CONST_INT_P (XEXP (op, 1))
5180 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5181 op = XEXP (op, 0);
5183 if (GET_CODE (op) == ZERO_EXTEND
5184 || GET_CODE (op) == SIGN_EXTEND)
5185 op = XEXP (op, 0);
5187 if (op != x)
5188 return op;
5190 return x;
5193 /* Return true iff CODE is a shift supported in combination
5194 with arithmetic instructions. */
5196 static bool
5197 aarch64_shift_p (enum rtx_code code)
5199 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5202 /* Helper function for rtx cost calculation. Calculate the cost of
5203 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5204 Return the calculated cost of the expression, recursing manually in to
5205 operands where needed. */
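/* For example, (plus (mult (reg) (const_int 4)) (reg)) reaches this
   function with OUTER == PLUS, so the multiply is costed as the
   shift-by-immediate part of a single ADD-with-shift (alu.arith_shift)
   rather than as a separate MUL plus an ADD.  */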
5207 static int
5208 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5210 rtx op0, op1;
5211 const struct cpu_cost_table *extra_cost
5212 = aarch64_tune_params->insn_extra_cost;
5213 int cost = 0;
5214 bool compound_p = (outer == PLUS || outer == MINUS);
5215 machine_mode mode = GET_MODE (x);
5217 gcc_checking_assert (code == MULT);
5219 op0 = XEXP (x, 0);
5220 op1 = XEXP (x, 1);
5222 if (VECTOR_MODE_P (mode))
5223 mode = GET_MODE_INNER (mode);
5225 /* Integer multiply/fma. */
5226 if (GET_MODE_CLASS (mode) == MODE_INT)
5228 /* The multiply will be canonicalized as a shift, cost it as such. */
5229 if (aarch64_shift_p (GET_CODE (x))
5230 || (CONST_INT_P (op1)
5231 && exact_log2 (INTVAL (op1)) > 0))
5233 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5234 || GET_CODE (op0) == SIGN_EXTEND;
5235 if (speed)
5237 if (compound_p)
5239 if (REG_P (op1))
5240 /* ARITH + shift-by-register. */
5241 cost += extra_cost->alu.arith_shift_reg;
5242 else if (is_extend)
5243 /* ARITH + extended register. We don't have a cost field
5244 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5245 cost += extra_cost->alu.extend_arith;
5246 else
5247 /* ARITH + shift-by-immediate. */
5248 cost += extra_cost->alu.arith_shift;
5250 else
5251 /* LSL (immediate). */
5252 cost += extra_cost->alu.shift;
5255 /* Strip extends as we will have costed them in the case above. */
5256 if (is_extend)
5257 op0 = aarch64_strip_extend (op0);
5259 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5261 return cost;
5264 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5265 compound and let the below cases handle it. After all, MNEG is a
5266 special-case alias of MSUB. */
5267 if (GET_CODE (op0) == NEG)
5269 op0 = XEXP (op0, 0);
5270 compound_p = true;
5273 /* Integer multiplies or FMAs have zero/sign extending variants. */
5274 if ((GET_CODE (op0) == ZERO_EXTEND
5275 && GET_CODE (op1) == ZERO_EXTEND)
5276 || (GET_CODE (op0) == SIGN_EXTEND
5277 && GET_CODE (op1) == SIGN_EXTEND))
5279 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5280 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5282 if (speed)
5284 if (compound_p)
5285 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5286 cost += extra_cost->mult[0].extend_add;
5287 else
5288 /* MUL/SMULL/UMULL. */
5289 cost += extra_cost->mult[0].extend;
5292 return cost;
5295 /* This is either an integer multiply or a MADD. In both cases
5296 we want to recurse and cost the operands. */
5297 cost += rtx_cost (op0, MULT, 0, speed)
5298 + rtx_cost (op1, MULT, 1, speed);
5300 if (speed)
5302 if (compound_p)
5303 /* MADD/MSUB. */
5304 cost += extra_cost->mult[mode == DImode].add;
5305 else
5306 /* MUL. */
5307 cost += extra_cost->mult[mode == DImode].simple;
5310 return cost;
5312 else
5314 if (speed)
5316 /* Floating-point FMA/FMUL can also support negations of the
5317 operands. */
5318 if (GET_CODE (op0) == NEG)
5319 op0 = XEXP (op0, 0);
5320 if (GET_CODE (op1) == NEG)
5321 op1 = XEXP (op1, 0);
5323 if (compound_p)
5324 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5325 cost += extra_cost->fp[mode == DFmode].fma;
5326 else
5327 /* FMUL/FNMUL. */
5328 cost += extra_cost->fp[mode == DFmode].mult;
5331 cost += rtx_cost (op0, MULT, 0, speed)
5332 + rtx_cost (op1, MULT, 1, speed);
5333 return cost;
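/* Compute the cost of an address.  As an illustration of the cases below,
   an immediate-offset address such as (plus (reg) (const_int 16)) costs
   imm_offset, while a scaled register index such as [base, index, lsl #3]
   used for a DImode access costs register_offset plus
   addr_cost->addr_scale_costs.di.  */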
5337 static int
5338 aarch64_address_cost (rtx x,
5339 machine_mode mode,
5340 addr_space_t as ATTRIBUTE_UNUSED,
5341 bool speed)
5343 enum rtx_code c = GET_CODE (x);
5344 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5345 struct aarch64_address_info info;
5346 int cost = 0;
5347 info.shift = 0;
5349 if (!aarch64_classify_address (&info, x, mode, c, false))
5351 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5353 /* This is a CONST or SYMBOL ref which will be split
5354 in a different way depending on the code model in use.
5355 Cost it through the generic infrastructure. */
5356 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5357 /* Divide through by the cost of one instruction to
5358 bring it to the same units as the address costs. */
5359 cost_symbol_ref /= COSTS_N_INSNS (1);
5360 /* The cost is then the cost of preparing the address,
5361 followed by an immediate (possibly 0) offset. */
5362 return cost_symbol_ref + addr_cost->imm_offset;
5364 else
5366 /* This is most likely a jump table from a case
5367 statement. */
5368 return addr_cost->register_offset;
5372 switch (info.type)
5374 case ADDRESS_LO_SUM:
5375 case ADDRESS_SYMBOLIC:
5376 case ADDRESS_REG_IMM:
5377 cost += addr_cost->imm_offset;
5378 break;
5380 case ADDRESS_REG_WB:
5381 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5382 cost += addr_cost->pre_modify;
5383 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5384 cost += addr_cost->post_modify;
5385 else
5386 gcc_unreachable ();
5388 break;
5390 case ADDRESS_REG_REG:
5391 cost += addr_cost->register_offset;
5392 break;
5394 case ADDRESS_REG_UXTW:
5395 case ADDRESS_REG_SXTW:
5396 cost += addr_cost->register_extend;
5397 break;
5399 default:
5400 gcc_unreachable ();
5404 if (info.shift > 0)
5406 /* For the sake of calculating the cost of the shifted register
5407 component, we can treat same sized modes in the same way. */
5408 switch (GET_MODE_BITSIZE (mode))
5410 case 16:
5411 cost += addr_cost->addr_scale_costs.hi;
5412 break;
5414 case 32:
5415 cost += addr_cost->addr_scale_costs.si;
5416 break;
5418 case 64:
5419 cost += addr_cost->addr_scale_costs.di;
5420 break;
5422 /* We can't tell, or this is a 128-bit vector. */
5423 default:
5424 cost += addr_cost->addr_scale_costs.ti;
5425 break;
5429 return cost;
5432 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5433 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5434 to be taken. */
5437 aarch64_branch_cost (bool speed_p, bool predictable_p)
5439 /* When optimizing for speed, use the cost of unpredictable branches. */
5440 const struct cpu_branch_cost *branch_costs =
5441 aarch64_tune_params->branch_costs;
5443 if (!speed_p || predictable_p)
5444 return branch_costs->predictable;
5445 else
5446 return branch_costs->unpredictable;
5449 /* Return true if the RTX X in mode MODE is a zero or sign extract
5450 usable in an ADD or SUB (extended register) instruction. */
5451 static bool
5452 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5454 /* Catch add with a sign extract.
5455 This is add_<optab><mode>_multp2. */
5456 if (GET_CODE (x) == SIGN_EXTRACT
5457 || GET_CODE (x) == ZERO_EXTRACT)
5459 rtx op0 = XEXP (x, 0);
5460 rtx op1 = XEXP (x, 1);
5461 rtx op2 = XEXP (x, 2);
5463 if (GET_CODE (op0) == MULT
5464 && CONST_INT_P (op1)
5465 && op2 == const0_rtx
5466 && CONST_INT_P (XEXP (op0, 1))
5467 && aarch64_is_extend_from_extract (mode,
5468 XEXP (op0, 1),
5469 op1))
5471 return true;
5475 return false;
5478 static bool
5479 aarch64_frint_unspec_p (unsigned int u)
5481 switch (u)
5483 case UNSPEC_FRINTZ:
5484 case UNSPEC_FRINTP:
5485 case UNSPEC_FRINTM:
5486 case UNSPEC_FRINTA:
5487 case UNSPEC_FRINTN:
5488 case UNSPEC_FRINTX:
5489 case UNSPEC_FRINTI:
5490 return true;
5492 default:
5493 return false;
5497 /* Return true iff X is an rtx that will match an extr instruction,
5498 i.e. as described in the *extr<mode>5_insn family of patterns.
5499 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
5500 on success and will be NULL_RTX otherwise. */
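/* For example, in DImode (ior (ashift (reg A) (const_int 48))
   (lshiftrt (reg B) (const_int 16))) matches, setting *RES_OP0 to A and
   *RES_OP1 to B; the shift amounts must sum to the mode width, and the
   whole expression corresponds to an EXTR with a shift immediate of 16.  */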
5502 static bool
5503 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5505 rtx op0, op1;
5506 machine_mode mode = GET_MODE (x);
5508 *res_op0 = NULL_RTX;
5509 *res_op1 = NULL_RTX;
5511 if (GET_CODE (x) != IOR)
5512 return false;
5514 op0 = XEXP (x, 0);
5515 op1 = XEXP (x, 1);
5517 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5518 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5520 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5521 if (GET_CODE (op1) == ASHIFT)
5522 std::swap (op0, op1);
5524 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5525 return false;
5527 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5528 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5530 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5531 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5533 *res_op0 = XEXP (op0, 0);
5534 *res_op1 = XEXP (op1, 0);
5535 return true;
5539 return false;
5542 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5543 storing it in *COST. Result is true if the total cost of the operation
5544 has now been calculated. */
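/* For instance, a branch such as (if_then_else (eq (reg) (const_int 0))
   (label_ref ...) (pc)) can be implemented as a single CBZ, so only the
   inner register operand is costed; a single-bit test expressed with
   ZERO_EXTRACT maps to TBZ/TBNZ in the same way.  */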
5545 static bool
5546 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5548 rtx inner;
5549 rtx comparator;
5550 enum rtx_code cmpcode;
5552 if (COMPARISON_P (op0))
5554 inner = XEXP (op0, 0);
5555 comparator = XEXP (op0, 1);
5556 cmpcode = GET_CODE (op0);
5558 else
5560 inner = op0;
5561 comparator = const0_rtx;
5562 cmpcode = NE;
5565 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5567 /* Conditional branch. */
5568 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5569 return true;
5570 else
5572 if (cmpcode == NE || cmpcode == EQ)
5574 if (comparator == const0_rtx)
5576 /* TBZ/TBNZ/CBZ/CBNZ. */
5577 if (GET_CODE (inner) == ZERO_EXTRACT)
5578 /* TBZ/TBNZ. */
5579 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5580 0, speed);
5581 else
5582 /* CBZ/CBNZ. */
5583 *cost += rtx_cost (inner, cmpcode, 0, speed);
5585 return true;
5588 else if (cmpcode == LT || cmpcode == GE)
5590 /* TBZ/TBNZ. */
5591 if (comparator == const0_rtx)
5592 return true;
5596 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5598 /* It's a conditional operation based on the status flags,
5599 so it must be some flavor of CSEL. */
5601 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5602 if (GET_CODE (op1) == NEG
5603 || GET_CODE (op1) == NOT
5604 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5605 op1 = XEXP (op1, 0);
5607 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5608 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5609 return true;
5612 /* We don't know what this is, cost all operands. */
5613 return false;
5616 /* Calculate the cost of calculating X, storing it in *COST. Result
5617 is true if the total cost of the operation has now been calculated. */
5618 static bool
5619 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5620 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5622 rtx op0, op1, op2;
5623 const struct cpu_cost_table *extra_cost
5624 = aarch64_tune_params->insn_extra_cost;
5625 machine_mode mode = GET_MODE (x);
5627 /* By default, assume that everything has equivalent cost to the
5628 cheapest instruction. Any additional costs are applied as a delta
5629 above this default. */
5630 *cost = COSTS_N_INSNS (1);
5632 switch (code)
5634 case SET:
5635 /* The cost depends entirely on the operands to SET. */
5636 *cost = 0;
5637 op0 = SET_DEST (x);
5638 op1 = SET_SRC (x);
5640 switch (GET_CODE (op0))
5642 case MEM:
5643 if (speed)
5645 rtx address = XEXP (op0, 0);
5646 if (VECTOR_MODE_P (mode))
5647 *cost += extra_cost->ldst.storev;
5648 else if (GET_MODE_CLASS (mode) == MODE_INT)
5649 *cost += extra_cost->ldst.store;
5650 else if (mode == SFmode)
5651 *cost += extra_cost->ldst.storef;
5652 else if (mode == DFmode)
5653 *cost += extra_cost->ldst.stored;
5655 *cost +=
5656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5657 0, speed));
5660 *cost += rtx_cost (op1, SET, 1, speed);
5661 return true;
5663 case SUBREG:
5664 if (! REG_P (SUBREG_REG (op0)))
5665 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5667 /* Fall through. */
5668 case REG:
5669 /* The cost is one per vector-register copied. */
5670 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5672 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5673 / GET_MODE_SIZE (V4SImode);
5674 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5676 /* const0_rtx is in general free, but we will use an
5677 instruction to set a register to 0. */
5678 else if (REG_P (op1) || op1 == const0_rtx)
5680 /* The cost is 1 per register copied. */
5681 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5682 / UNITS_PER_WORD;
5683 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5685 else
5686 /* Cost is just the cost of the RHS of the set. */
5687 *cost += rtx_cost (op1, SET, 1, speed);
5688 return true;
5690 case ZERO_EXTRACT:
5691 case SIGN_EXTRACT:
5692 /* Bit-field insertion. Strip any redundant widening of
5693 the RHS to meet the width of the target. */
5694 if (GET_CODE (op1) == SUBREG)
5695 op1 = SUBREG_REG (op1);
5696 if ((GET_CODE (op1) == ZERO_EXTEND
5697 || GET_CODE (op1) == SIGN_EXTEND)
5698 && CONST_INT_P (XEXP (op0, 1))
5699 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5700 >= INTVAL (XEXP (op0, 1))))
5701 op1 = XEXP (op1, 0);
5703 if (CONST_INT_P (op1))
5705 /* MOV immediate is assumed to always be cheap. */
5706 *cost = COSTS_N_INSNS (1);
5708 else
5710 /* BFM. */
5711 if (speed)
5712 *cost += extra_cost->alu.bfi;
5713 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5716 return true;
5718 default:
5719 /* We can't make sense of this, assume default cost. */
5720 *cost = COSTS_N_INSNS (1);
5721 return false;
5723 return false;
5725 case CONST_INT:
5726 /* If an instruction can incorporate a constant within the
5727 instruction, the instruction's expression avoids calling
5728 rtx_cost() on the constant. If rtx_cost() is called on a
5729 constant, then it is usually because the constant must be
5730 moved into a register by one or more instructions.
5732 The exception is constant 0, which can be expressed
5733 as XZR/WZR and is therefore free. The one caveat is
5734 (set (reg) (const0_rtx)), in which case we must cost
5735 the move. However, we can catch that when we cost the SET, so
5736 we don't need to consider it here. */
5737 if (x == const0_rtx)
5738 *cost = 0;
5739 else
5741 /* To an approximation, building any other constant is
5742 proportionally expensive to the number of instructions
5743 required to build that constant. This is true whether we
5744 are compiling for SPEED or otherwise. */
5745 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5746 (NULL_RTX, x, false, mode));
5748 return true;
5750 case CONST_DOUBLE:
5751 if (speed)
5753 /* mov[df,sf]_aarch64. */
5754 if (aarch64_float_const_representable_p (x))
5755 /* FMOV (scalar immediate). */
5756 *cost += extra_cost->fp[mode == DFmode].fpconst;
5757 else if (!aarch64_float_const_zero_rtx_p (x))
5759 /* This will be a load from memory. */
5760 if (mode == DFmode)
5761 *cost += extra_cost->ldst.loadd;
5762 else
5763 *cost += extra_cost->ldst.loadf;
5765 else
5766 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5767 or MOV v0.s[0], wzr - neither of which is modeled by the
5768 cost tables. Just use the default cost. */
5773 return true;
5775 case MEM:
5776 if (speed)
5778 /* For loads we want the base cost of a load, plus an
5779 approximation for the additional cost of the addressing
5780 mode. */
5781 rtx address = XEXP (x, 0);
5782 if (VECTOR_MODE_P (mode))
5783 *cost += extra_cost->ldst.loadv;
5784 else if (GET_MODE_CLASS (mode) == MODE_INT)
5785 *cost += extra_cost->ldst.load;
5786 else if (mode == SFmode)
5787 *cost += extra_cost->ldst.loadf;
5788 else if (mode == DFmode)
5789 *cost += extra_cost->ldst.loadd;
5791 *cost +=
5792 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5793 0, speed));
5796 return true;
5798 case NEG:
5799 op0 = XEXP (x, 0);
5801 if (VECTOR_MODE_P (mode))
5803 if (speed)
5805 /* FNEG. */
5806 *cost += extra_cost->vect.alu;
5808 return false;
5811 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5813 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5814 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5816 /* CSETM. */
5817 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5818 return true;
5821 /* Cost this as SUB wzr, X. */
5822 op0 = CONST0_RTX (GET_MODE (x));
5823 op1 = XEXP (x, 0);
5824 goto cost_minus;
5827 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5829 /* Support (neg(fma...)) as a single instruction only if
5830 sign of zeros is unimportant. This matches the decision
5831 making in aarch64.md. */
5832 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5834 /* FNMADD. */
5835 *cost = rtx_cost (op0, NEG, 0, speed);
5836 return true;
5838 if (speed)
5839 /* FNEG. */
5840 *cost += extra_cost->fp[mode == DFmode].neg;
5841 return false;
5844 return false;
5846 case CLRSB:
5847 case CLZ:
5848 if (speed)
5850 if (VECTOR_MODE_P (mode))
5851 *cost += extra_cost->vect.alu;
5852 else
5853 *cost += extra_cost->alu.clz;
5856 return false;
5858 case COMPARE:
5859 op0 = XEXP (x, 0);
5860 op1 = XEXP (x, 1);
5862 if (op1 == const0_rtx
5863 && GET_CODE (op0) == AND)
5865 x = op0;
5866 goto cost_logic;
5869 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5871 /* TODO: A write to the CC flags possibly costs extra, this
5872 needs encoding in the cost tables. */
5874 /* CC_ZESWPmode supports zero extend for free. */
5875 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5876 op0 = XEXP (op0, 0);
5878 /* ANDS. */
5879 if (GET_CODE (op0) == AND)
5881 x = op0;
5882 goto cost_logic;
5885 if (GET_CODE (op0) == PLUS)
5887 /* ADDS (and CMN alias). */
5888 x = op0;
5889 goto cost_plus;
5892 if (GET_CODE (op0) == MINUS)
5894 /* SUBS. */
5895 x = op0;
5896 goto cost_minus;
5899 if (GET_CODE (op1) == NEG)
5901 /* CMN. */
5902 if (speed)
5903 *cost += extra_cost->alu.arith;
5905 *cost += rtx_cost (op0, COMPARE, 0, speed);
5906 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5907 return true;
5910 /* CMP.
5912 Compare can freely swap the order of operands, and
5913 canonicalization puts the more complex operation first.
5914 But the integer MINUS logic expects the shift/extend
5915 operation in op1. */
5916 if (! (REG_P (op0)
5917 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5919 op0 = XEXP (x, 1);
5920 op1 = XEXP (x, 0);
5922 goto cost_minus;
5925 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5927 /* FCMP. */
5928 if (speed)
5929 *cost += extra_cost->fp[mode == DFmode].compare;
5931 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5933 *cost += rtx_cost (op0, COMPARE, 0, speed);
5934 /* FCMP supports constant 0.0 for no extra cost. */
5935 return true;
5937 return false;
5940 if (VECTOR_MODE_P (mode))
5942 /* Vector compare. */
5943 if (speed)
5944 *cost += extra_cost->vect.alu;
5946 if (aarch64_float_const_zero_rtx_p (op1))
5948 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
5949 cost. */
5950 return true;
5952 return false;
5954 return false;
5956 case MINUS:
5958 op0 = XEXP (x, 0);
5959 op1 = XEXP (x, 1);
5961 cost_minus:
5962 *cost += rtx_cost (op0, MINUS, 0, speed);
5964 /* Detect valid immediates. */
5965 if ((GET_MODE_CLASS (mode) == MODE_INT
5966 || (GET_MODE_CLASS (mode) == MODE_CC
5967 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5968 && CONST_INT_P (op1)
5969 && aarch64_uimm12_shift (INTVAL (op1)))
5971 if (speed)
5972 /* SUB(S) (immediate). */
5973 *cost += extra_cost->alu.arith;
5974 return true;
5977 /* Look for SUB (extended register). */
5978 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5980 if (speed)
5981 *cost += extra_cost->alu.extend_arith;
5983 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5984 (enum rtx_code) GET_CODE (op1),
5985 0, speed);
5986 return true;
5989 rtx new_op1 = aarch64_strip_extend (op1);
5991 /* Cost this as an FMA-alike operation. */
5992 if ((GET_CODE (new_op1) == MULT
5993 || aarch64_shift_p (GET_CODE (new_op1)))
5994 && code != COMPARE)
5996 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5997 (enum rtx_code) code,
5998 speed);
5999 return true;
6002 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6004 if (speed)
6006 if (VECTOR_MODE_P (mode))
6008 /* Vector SUB. */
6009 *cost += extra_cost->vect.alu;
6011 else if (GET_MODE_CLASS (mode) == MODE_INT)
6013 /* SUB(S). */
6014 *cost += extra_cost->alu.arith;
6016 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6018 /* FSUB. */
6019 *cost += extra_cost->fp[mode == DFmode].addsub;
6022 return true;
6025 case PLUS:
6027 rtx new_op0;
6029 op0 = XEXP (x, 0);
6030 op1 = XEXP (x, 1);
6032 cost_plus:
6033 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6034 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6036 /* CSINC. */
6037 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6038 *cost += rtx_cost (op1, PLUS, 1, speed);
6039 return true;
6042 if (GET_MODE_CLASS (mode) == MODE_INT
6043 && CONST_INT_P (op1)
6044 && aarch64_uimm12_shift (INTVAL (op1)))
6046 *cost += rtx_cost (op0, PLUS, 0, speed);
6048 if (speed)
6049 /* ADD (immediate). */
6050 *cost += extra_cost->alu.arith;
6051 return true;
6054 *cost += rtx_cost (op1, PLUS, 1, speed);
6056 /* Look for ADD (extended register). */
6057 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6059 if (speed)
6060 *cost += extra_cost->alu.extend_arith;
6062 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6063 (enum rtx_code) GET_CODE (op0),
6064 0, speed);
6065 return true;
6068 /* Strip any extend, leave shifts behind as we will
6069 cost them through mult_cost. */
6070 new_op0 = aarch64_strip_extend (op0);
6072 if (GET_CODE (new_op0) == MULT
6073 || aarch64_shift_p (GET_CODE (new_op0)))
6075 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6076 speed);
6077 return true;
6080 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6082 if (speed)
6084 if (VECTOR_MODE_P (mode))
6086 /* Vector ADD. */
6087 *cost += extra_cost->vect.alu;
6089 else if (GET_MODE_CLASS (mode) == MODE_INT)
6091 /* ADD. */
6092 *cost += extra_cost->alu.arith;
6094 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6096 /* FADD. */
6097 *cost += extra_cost->fp[mode == DFmode].addsub;
6100 return true;
6103 case BSWAP:
6104 *cost = COSTS_N_INSNS (1);
6106 if (speed)
6108 if (VECTOR_MODE_P (mode))
6109 *cost += extra_cost->vect.alu;
6110 else
6111 *cost += extra_cost->alu.rev;
6113 return false;
6115 case IOR:
6116 if (aarch_rev16_p (x))
6118 *cost = COSTS_N_INSNS (1);
6120 if (speed)
6122 if (VECTOR_MODE_P (mode))
6123 *cost += extra_cost->vect.alu;
6124 else
6125 *cost += extra_cost->alu.rev;
6127 return true;
6130 if (aarch64_extr_rtx_p (x, &op0, &op1))
6132 *cost += rtx_cost (op0, IOR, 0, speed)
6133 + rtx_cost (op1, IOR, 1, speed);
6134 if (speed)
6135 *cost += extra_cost->alu.shift;
6137 return true;
6139 /* Fall through. */
6140 case XOR:
6141 case AND:
6142 cost_logic:
6143 op0 = XEXP (x, 0);
6144 op1 = XEXP (x, 1);
6146 if (VECTOR_MODE_P (mode))
6148 if (speed)
6149 *cost += extra_cost->vect.alu;
6150 return true;
6153 if (code == AND
6154 && GET_CODE (op0) == MULT
6155 && CONST_INT_P (XEXP (op0, 1))
6156 && CONST_INT_P (op1)
6157 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6158 INTVAL (op1)) != 0)
6160 /* This is a UBFM/SBFM. */
6161 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6162 if (speed)
6163 *cost += extra_cost->alu.bfx;
6164 return true;
6167 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6169 /* We possibly get the immediate for free; this is not
6170 modelled. */
6171 if (CONST_INT_P (op1)
6172 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6174 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6176 if (speed)
6177 *cost += extra_cost->alu.logical;
6179 return true;
6181 else
6183 rtx new_op0 = op0;
6185 /* Handle ORN, EON, or BIC. */
6186 if (GET_CODE (op0) == NOT)
6187 op0 = XEXP (op0, 0);
6189 new_op0 = aarch64_strip_shift (op0);
6191 /* If we had a shift on op0 then this is a logical-shift-
6192 by-register/immediate operation. Otherwise, this is just
6193 a logical operation. */
6194 if (speed)
6196 if (new_op0 != op0)
6198 /* Shift by immediate. */
6199 if (CONST_INT_P (XEXP (op0, 1)))
6200 *cost += extra_cost->alu.log_shift;
6201 else
6202 *cost += extra_cost->alu.log_shift_reg;
6204 else
6205 *cost += extra_cost->alu.logical;
6208 /* In both cases we want to cost both operands. */
6209 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6210 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6212 return true;
6215 return false;
6217 case NOT:
6218 x = XEXP (x, 0);
6219 op0 = aarch64_strip_shift (x);
6221 if (VECTOR_MODE_P (mode))
6223 /* Vector NOT. */
6224 *cost += extra_cost->vect.alu;
6225 return false;
6228 /* MVN-shifted-reg. */
6229 if (op0 != x)
6231 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6233 if (speed)
6234 *cost += extra_cost->alu.log_shift;
6236 return true;
6238 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6239 Handle the second form here taking care that 'a' in the above can
6240 be a shift. */
6241 else if (GET_CODE (op0) == XOR)
6243 rtx newop0 = XEXP (op0, 0);
6244 rtx newop1 = XEXP (op0, 1);
6245 rtx op0_stripped = aarch64_strip_shift (newop0);
6247 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6248 + rtx_cost (op0_stripped, XOR, 0, speed);
6250 if (speed)
6252 if (op0_stripped != newop0)
6253 *cost += extra_cost->alu.log_shift;
6254 else
6255 *cost += extra_cost->alu.logical;
6258 return true;
6260 /* MVN. */
6261 if (speed)
6262 *cost += extra_cost->alu.logical;
6264 return false;
6266 case ZERO_EXTEND:
6268 op0 = XEXP (x, 0);
6269 /* If a value is written in SI mode, then zero extended to DI
6270 mode, the operation will in general be free as a write to
6271 a 'w' register implicitly zeroes the upper bits of an 'x'
6272 register. However, if this is
6274 (set (reg) (zero_extend (reg)))
6276 we must cost the explicit register move. */
6277 if (mode == DImode
6278 && GET_MODE (op0) == SImode
6279 && outer == SET)
6281 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6283 if (!op_cost && speed)
6284 /* MOV. */
6285 *cost += extra_cost->alu.extend;
6286 else
6287 /* Free, the cost is that of the SI mode operation. */
6288 *cost = op_cost;
6290 return true;
6292 else if (MEM_P (XEXP (x, 0)))
6294 /* All loads can zero extend to any size for free. */
6295 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6296 return true;
6299 if (speed)
6301 if (VECTOR_MODE_P (mode))
6303 /* UMOV. */
6304 *cost += extra_cost->vect.alu;
6306 else
6308 /* UXTB/UXTH. */
6309 *cost += extra_cost->alu.extend;
6312 return false;
6314 case SIGN_EXTEND:
6315 if (MEM_P (XEXP (x, 0)))
6317 /* LDRSH. */
6318 if (speed)
6320 rtx address = XEXP (XEXP (x, 0), 0);
6321 *cost += extra_cost->ldst.load_sign_extend;
6323 *cost +=
6324 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6325 0, speed));
6327 return true;
6330 if (speed)
6332 if (VECTOR_MODE_P (mode))
6333 *cost += extra_cost->vect.alu;
6334 else
6335 *cost += extra_cost->alu.extend;
6337 return false;
6339 case ASHIFT:
6340 op0 = XEXP (x, 0);
6341 op1 = XEXP (x, 1);
6343 if (CONST_INT_P (op1))
6345 if (speed)
6347 if (VECTOR_MODE_P (mode))
6349 /* Vector shift (immediate). */
6350 *cost += extra_cost->vect.alu;
6352 else
6354 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6355 aliases. */
6356 *cost += extra_cost->alu.shift;
6360 /* We can incorporate zero/sign extend for free. */
6361 if (GET_CODE (op0) == ZERO_EXTEND
6362 || GET_CODE (op0) == SIGN_EXTEND)
6363 op0 = XEXP (op0, 0);
6365 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6366 return true;
6368 else
6370 if (speed)
6372 if (VECTOR_MODE_P (mode))
6374 /* Vector shift (register). */
6375 *cost += extra_cost->vect.alu;
6377 else
6379 /* LSLV. */
6380 *cost += extra_cost->alu.shift_reg;
6383 return false; /* All arguments need to be in registers. */
6386 case ROTATE:
6387 case ROTATERT:
6388 case LSHIFTRT:
6389 case ASHIFTRT:
6390 op0 = XEXP (x, 0);
6391 op1 = XEXP (x, 1);
6393 if (CONST_INT_P (op1))
6395 /* ASR (immediate) and friends. */
6396 if (speed)
6398 if (VECTOR_MODE_P (mode))
6399 *cost += extra_cost->vect.alu;
6400 else
6401 *cost += extra_cost->alu.shift;
6404 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6405 return true;
6407 else
6410 /* ASR (register) and friends. */
6411 if (speed)
6413 if (VECTOR_MODE_P (mode))
6414 *cost += extra_cost->vect.alu;
6415 else
6416 *cost += extra_cost->alu.shift_reg;
6418 return false; /* All arguments need to be in registers. */
6421 case SYMBOL_REF:
6423 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6425 /* LDR. */
6426 if (speed)
6427 *cost += extra_cost->ldst.load;
6429 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6430 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6432 /* ADRP, followed by ADD. */
6433 *cost += COSTS_N_INSNS (1);
6434 if (speed)
6435 *cost += 2 * extra_cost->alu.arith;
6437 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6438 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6440 /* ADR. */
6441 if (speed)
6442 *cost += extra_cost->alu.arith;
6445 if (flag_pic)
6447 /* One extra load instruction, after accessing the GOT. */
6448 *cost += COSTS_N_INSNS (1);
6449 if (speed)
6450 *cost += extra_cost->ldst.load;
6452 return true;
6454 case HIGH:
6455 case LO_SUM:
6456 /* ADRP/ADD (immediate). */
6457 if (speed)
6458 *cost += extra_cost->alu.arith;
6459 return true;
6461 case ZERO_EXTRACT:
6462 case SIGN_EXTRACT:
6463 /* UBFX/SBFX. */
6464 if (speed)
6466 if (VECTOR_MODE_P (mode))
6467 *cost += extra_cost->vect.alu;
6468 else
6469 *cost += extra_cost->alu.bfx;
6472 /* We can trust that the immediates used will be correct (there
6473 are no by-register forms), so we need only cost op0. */
6474 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6475 return true;
6477 case MULT:
6478 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6479 /* aarch64_rtx_mult_cost always handles recursion to its
6480 operands. */
6481 return true;
6483 case MOD:
6484 case UMOD:
6485 if (speed)
6487 if (VECTOR_MODE_P (mode))
6488 *cost += extra_cost->vect.alu;
6489 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6490 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6491 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6492 else if (GET_MODE (x) == DFmode)
6493 *cost += (extra_cost->fp[1].mult
6494 + extra_cost->fp[1].div);
6495 else if (GET_MODE (x) == SFmode)
6496 *cost += (extra_cost->fp[0].mult
6497 + extra_cost->fp[0].div);
6499 return false; /* All arguments need to be in registers. */
6501 case DIV:
6502 case UDIV:
6503 case SQRT:
6504 if (speed)
6506 if (VECTOR_MODE_P (mode))
6507 *cost += extra_cost->vect.alu;
6508 else if (GET_MODE_CLASS (mode) == MODE_INT)
6509 /* There is no integer SQRT, so only DIV and UDIV can get
6510 here. */
6511 *cost += extra_cost->mult[mode == DImode].idiv;
6512 else
6513 *cost += extra_cost->fp[mode == DFmode].div;
6515 return false; /* All arguments need to be in registers. */
6517 case IF_THEN_ELSE:
6518 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6519 XEXP (x, 2), cost, speed);
6521 case EQ:
6522 case NE:
6523 case GT:
6524 case GTU:
6525 case LT:
6526 case LTU:
6527 case GE:
6528 case GEU:
6529 case LE:
6530 case LEU:
6532 return false; /* All arguments must be in registers. */
6534 case FMA:
6535 op0 = XEXP (x, 0);
6536 op1 = XEXP (x, 1);
6537 op2 = XEXP (x, 2);
6539 if (speed)
6541 if (VECTOR_MODE_P (mode))
6542 *cost += extra_cost->vect.alu;
6543 else
6544 *cost += extra_cost->fp[mode == DFmode].fma;
6547 /* FMSUB, FNMADD, and FNMSUB are free. */
6548 if (GET_CODE (op0) == NEG)
6549 op0 = XEXP (op0, 0);
6551 if (GET_CODE (op2) == NEG)
6552 op2 = XEXP (op2, 0);
6554 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6555 and the by-element operand as operand 0. */
6556 if (GET_CODE (op1) == NEG)
6557 op1 = XEXP (op1, 0);
6559 /* Catch vector-by-element operations. The by-element operand can
6560 either be (vec_duplicate (vec_select (x))) or just
6561 (vec_select (x)), depending on whether we are multiplying by
6562 a vector or a scalar.
6564 Canonicalization is not very good in these cases: FMA4 will put the
6565 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6566 if (GET_CODE (op0) == VEC_DUPLICATE)
6567 op0 = XEXP (op0, 0);
6568 else if (GET_CODE (op1) == VEC_DUPLICATE)
6569 op1 = XEXP (op1, 0);
6571 if (GET_CODE (op0) == VEC_SELECT)
6572 op0 = XEXP (op0, 0);
6573 else if (GET_CODE (op1) == VEC_SELECT)
6574 op1 = XEXP (op1, 0);
6576 /* If the remaining parameters are not registers,
6577 get the cost to put them into registers. */
6578 *cost += rtx_cost (op0, FMA, 0, speed);
6579 *cost += rtx_cost (op1, FMA, 1, speed);
6580 *cost += rtx_cost (op2, FMA, 2, speed);
6581 return true;
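/* Illustrative example (not part of the original source): an rtx such as
   (fma:DF (neg:DF a) b (neg:DF c)) is covered by the FMSUB/FNMADD/FNMSUB
   forms, so the NEGs stripped above add no extra cost; only the remaining
   operands need to be costed into registers.  */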
6583 case FLOAT:
6584 case UNSIGNED_FLOAT:
6585 if (speed)
6586 *cost += extra_cost->fp[mode == DFmode].fromint;
6587 return false;
6589 case FLOAT_EXTEND:
6590 if (speed)
6592 if (VECTOR_MODE_P (mode))
6594 /* Vector widening conversion. */
6595 *cost += extra_cost->vect.alu;
6597 else
6598 *cost += extra_cost->fp[mode == DFmode].widen;
6600 return false;
6602 case FLOAT_TRUNCATE:
6603 if (speed)
6605 if (VECTOR_MODE_P (mode))
6607 /* Vector narrowing conversion. */
6608 *cost += extra_cost->vect.alu;
6610 else
6611 *cost += extra_cost->fp[mode == DFmode].narrow;
6613 return false;
6615 case FIX:
6616 case UNSIGNED_FIX:
6617 x = XEXP (x, 0);
6618 /* Strip the rounding part. They will all be implemented
6619 by the fcvt* family of instructions anyway. */
6620 if (GET_CODE (x) == UNSPEC)
6622 unsigned int uns_code = XINT (x, 1);
6624 if (uns_code == UNSPEC_FRINTA
6625 || uns_code == UNSPEC_FRINTM
6626 || uns_code == UNSPEC_FRINTN
6627 || uns_code == UNSPEC_FRINTP
6628 || uns_code == UNSPEC_FRINTZ)
6629 x = XVECEXP (x, 0, 0);
6632 if (speed)
6634 if (VECTOR_MODE_P (mode))
6635 *cost += extra_cost->vect.alu;
6636 else
6637 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6639 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6640 return true;
6642 case ABS:
6643 if (VECTOR_MODE_P (mode))
6645 /* ABS (vector). */
6646 if (speed)
6647 *cost += extra_cost->vect.alu;
6649 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6651 op0 = XEXP (x, 0);
6653 /* FABD, which is analogous to FADD. */
6654 if (GET_CODE (op0) == MINUS)
6656 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6657 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6658 if (speed)
6659 *cost += extra_cost->fp[mode == DFmode].addsub;
6661 return true;
6663 /* Simple FABS is analogous to FNEG. */
6664 if (speed)
6665 *cost += extra_cost->fp[mode == DFmode].neg;
6667 else
6669 /* Integer ABS will either be split to
6670 two arithmetic instructions, or will be an ABS
6671 (scalar), which we don't model. */
6672 *cost = COSTS_N_INSNS (2);
6673 if (speed)
6674 *cost += 2 * extra_cost->alu.arith;
6676 return false;
6678 case SMAX:
6679 case SMIN:
6680 if (speed)
6682 if (VECTOR_MODE_P (mode))
6683 *cost += extra_cost->vect.alu;
6684 else
6686 /* FMAXNM/FMINNM/FMAX/FMIN.
6687 TODO: This may not be accurate for all implementations, but
6688 we do not model this in the cost tables. */
6689 *cost += extra_cost->fp[mode == DFmode].addsub;
6692 return false;
6694 case UNSPEC:
6695 /* The floating point round to integer frint* instructions. */
6696 if (aarch64_frint_unspec_p (XINT (x, 1)))
6698 if (speed)
6699 *cost += extra_cost->fp[mode == DFmode].roundint;
6701 return false;
6704 if (XINT (x, 1) == UNSPEC_RBIT)
6706 if (speed)
6707 *cost += extra_cost->alu.rev;
6709 return false;
6711 break;
6713 case TRUNCATE:
6715 /* Decompose <su>muldi3_highpart. */
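/* Illustrative example (not part of the original source): with int64_t
   operands a and b, C code such as
       int64_t hi = (int64_t) (((__int128) a * b) >> 64);
   expands to exactly the pattern matched below and is implemented with a
   single SMULH (UMULH for the unsigned variant).  */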
6716 if (/* (truncate:DI */
6717 mode == DImode
6718 /* (lshiftrt:TI */
6719 && GET_MODE (XEXP (x, 0)) == TImode
6720 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6721 /* (mult:TI */
6722 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6723 /* (ANY_EXTEND:TI (reg:DI))
6724 (ANY_EXTEND:TI (reg:DI))) */
6725 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6726 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6727 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6728 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6729 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6730 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6731 /* (const_int 64) */
6732 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6733 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6735 /* UMULH/SMULH. */
6736 if (speed)
6737 *cost += extra_cost->mult[mode == DImode].extend;
6738 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6739 MULT, 0, speed);
6740 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6741 MULT, 1, speed);
6742 return true;
6745 /* Fall through. */
6746 default:
6747 break;
6750 if (dump_file && (dump_flags & TDF_DETAILS))
6751 fprintf (dump_file,
6752 "\nFailed to cost RTX. Assuming default cost.\n");
6754 return true;
6757 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
6758 calculated for X. This cost is stored in *COST. Returns true
6759 if the total cost of X was calculated. */
6760 static bool
6761 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6762 int param, int *cost, bool speed)
6764 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6766 if (dump_file && (dump_flags & TDF_DETAILS))
6768 print_rtl_single (dump_file, x);
6769 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6770 speed ? "Hot" : "Cold",
6771 *cost, result ? "final" : "partial");
6774 return result;
6777 static int
6778 aarch64_register_move_cost (machine_mode mode,
6779 reg_class_t from_i, reg_class_t to_i)
6781 enum reg_class from = (enum reg_class) from_i;
6782 enum reg_class to = (enum reg_class) to_i;
6783 const struct cpu_regmove_cost *regmove_cost
6784 = aarch64_tune_params->regmove_cost;
6786 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6787 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6788 to = GENERAL_REGS;
6790 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6791 from = GENERAL_REGS;
6793 /* Moving between GPR and stack cost is the same as GP2GP. */
6794 if ((from == GENERAL_REGS && to == STACK_REG)
6795 || (to == GENERAL_REGS && from == STACK_REG))
6796 return regmove_cost->GP2GP;
6798 /* To/From the stack register, we move via the gprs. */
6799 if (to == STACK_REG || from == STACK_REG)
6800 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6801 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6803 if (GET_MODE_SIZE (mode) == 16)
6805 /* 128-bit operations on general registers require 2 instructions. */
6806 if (from == GENERAL_REGS && to == GENERAL_REGS)
6807 return regmove_cost->GP2GP * 2;
6808 else if (from == GENERAL_REGS)
6809 return regmove_cost->GP2FP * 2;
6810 else if (to == GENERAL_REGS)
6811 return regmove_cost->FP2GP * 2;
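/* Worked example (illustrative, not part of the original source): a TImode
   move from GENERAL_REGS to FP_REGS is costed as GP2FP * 2 above,
   reflecting the two 64-bit transfers needed for the low and high
   halves.  */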
6813 /* When AdvSIMD instructions are disabled it is not possible to move
6814 a 128-bit value directly between Q registers. This is handled in
6815 secondary reload. A general register is used as a scratch to move
6816 the upper DI value and the lower DI value is moved directly,
6817 hence the cost is the sum of three moves. */
6818 if (! TARGET_SIMD)
6819 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6821 return regmove_cost->FP2FP;
6824 if (from == GENERAL_REGS && to == GENERAL_REGS)
6825 return regmove_cost->GP2GP;
6826 else if (from == GENERAL_REGS)
6827 return regmove_cost->GP2FP;
6828 else if (to == GENERAL_REGS)
6829 return regmove_cost->FP2GP;
6831 return regmove_cost->FP2FP;
6834 static int
6835 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6836 reg_class_t rclass ATTRIBUTE_UNUSED,
6837 bool in ATTRIBUTE_UNUSED)
6839 return aarch64_tune_params->memmov_cost;
6842 /* Return the number of instructions that can be issued per cycle. */
6843 static int
6844 aarch64_sched_issue_rate (void)
6846 return aarch64_tune_params->issue_rate;
6849 static int
6850 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6852 int issue_rate = aarch64_sched_issue_rate ();
6854 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6857 /* Vectorizer cost model target hooks. */
6859 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6860 static int
6861 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6862 tree vectype,
6863 int misalign ATTRIBUTE_UNUSED)
6865 unsigned elements;
6867 switch (type_of_cost)
6869 case scalar_stmt:
6870 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6872 case scalar_load:
6873 return aarch64_tune_params->vec_costs->scalar_load_cost;
6875 case scalar_store:
6876 return aarch64_tune_params->vec_costs->scalar_store_cost;
6878 case vector_stmt:
6879 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6881 case vector_load:
6882 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6884 case vector_store:
6885 return aarch64_tune_params->vec_costs->vec_store_cost;
6887 case vec_to_scalar:
6888 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6890 case scalar_to_vec:
6891 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6893 case unaligned_load:
6894 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6896 case unaligned_store:
6897 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6899 case cond_branch_taken:
6900 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6902 case cond_branch_not_taken:
6903 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6905 case vec_perm:
6906 case vec_promote_demote:
6907 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6909 case vec_construct:
6910 elements = TYPE_VECTOR_SUBPARTS (vectype);
6911 return elements / 2 + 1;
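/* Illustrative example (not part of the original source): constructing a
   4-element vector from scalars is costed as 4 / 2 + 1 == 3 units,
   approximating the insert sequence needed to assemble it.  */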
6913 default:
6914 gcc_unreachable ();
6918 /* Implement targetm.vectorize.add_stmt_cost. */
6919 static unsigned
6920 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6921 struct _stmt_vec_info *stmt_info, int misalign,
6922 enum vect_cost_model_location where)
6924 unsigned *cost = (unsigned *) data;
6925 unsigned retval = 0;
6927 if (flag_vect_cost_model)
6929 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6930 int stmt_cost =
6931 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6933 /* Statements in an inner loop relative to the loop being
6934 vectorized are weighted more heavily. The value here is
6935 a function (linear for now) of the loop nest level. */
6936 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6938 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6939 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6940 unsigned nest_level = loop_depth (loop);
6942 count *= nest_level;
6945 retval = (unsigned) (count * stmt_cost);
6946 cost[where] += retval;
6949 return retval;
6952 static void initialize_aarch64_code_model (void);
6954 /* Parse the architecture extension string. */
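/* Illustrative example (not part of the original source): given
   "-march=armv8-a+crc+nocrypto", this routine is handed the substring
   "+crc+nocrypto" and consumes one "+ext" or "+noext" token per
   iteration, setting or clearing the corresponding bits in
   aarch64_isa_flags.  */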
6956 static void
6957 aarch64_parse_extension (char *str)
6959 /* The extension string is parsed left to right. */
6960 const struct aarch64_option_extension *opt = NULL;
6962 /* Flag to say whether we are adding or removing an extension. */
6963 int adding_ext = -1;
6965 while (str != NULL && *str != 0)
6967 char *ext;
6968 size_t len;
6970 str++;
6971 ext = strchr (str, '+');
6973 if (ext != NULL)
6974 len = ext - str;
6975 else
6976 len = strlen (str);
6978 if (len >= 2 && strncmp (str, "no", 2) == 0)
6980 adding_ext = 0;
6981 len -= 2;
6982 str += 2;
6984 else if (len > 0)
6985 adding_ext = 1;
6987 if (len == 0)
6989 error ("missing feature modifier after %qs", adding_ext ? "+"
6990 : "+no");
6991 return;
6994 /* Scan over the extensions table trying to find an exact match. */
6995 for (opt = all_extensions; opt->name != NULL; opt++)
6997 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6999 /* Add or remove the extension. */
7000 if (adding_ext)
7001 aarch64_isa_flags |= opt->flags_on;
7002 else
7003 aarch64_isa_flags &= ~(opt->flags_off);
7004 break;
7008 if (opt->name == NULL)
7010 /* Extension not found in list. */
7011 error ("unknown feature modifier %qs", str);
7012 return;
7015 str = ext;
7018 return;
7021 /* Parse the ARCH string. */
7023 static void
7024 aarch64_parse_arch (void)
7026 char *ext;
7027 const struct processor *arch;
7028 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7029 size_t len;
7031 strcpy (str, aarch64_arch_string);
7033 ext = strchr (str, '+');
7035 if (ext != NULL)
7036 len = ext - str;
7037 else
7038 len = strlen (str);
7040 if (len == 0)
7042 error ("missing arch name in -march=%qs", str);
7043 return;
7046 /* Loop through the list of supported ARCHs to find a match. */
7047 for (arch = all_architectures; arch->name != NULL; arch++)
7049 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7051 selected_arch = arch;
7052 aarch64_isa_flags = selected_arch->flags;
7054 if (!selected_cpu)
7055 selected_cpu = &all_cores[selected_arch->core];
7057 if (ext != NULL)
7059 /* ARCH string contains at least one extension. */
7060 aarch64_parse_extension (ext);
7063 if (strcmp (selected_arch->arch, selected_cpu->arch))
7065 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7066 selected_cpu->name, selected_arch->name);
7069 return;
7073 /* ARCH name not found in list. */
7074 error ("unknown value %qs for -march", str);
7075 return;
7078 /* Parse the CPU string. */
7080 static void
7081 aarch64_parse_cpu (void)
7083 char *ext;
7084 const struct processor *cpu;
7085 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7086 size_t len;
7088 strcpy (str, aarch64_cpu_string);
7090 ext = strchr (str, '+');
7092 if (ext != NULL)
7093 len = ext - str;
7094 else
7095 len = strlen (str);
7097 if (len == 0)
7099 error ("missing cpu name in -mcpu=%qs", str);
7100 return;
7103 /* Loop through the list of supported CPUs to find a match. */
7104 for (cpu = all_cores; cpu->name != NULL; cpu++)
7106 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7108 selected_cpu = cpu;
7109 aarch64_isa_flags = selected_cpu->flags;
7111 if (ext != NULL)
7113 /* CPU string contains at least one extension. */
7114 aarch64_parse_extension (ext);
7117 return;
7121 /* CPU name not found in list. */
7122 error ("unknown value %qs for -mcpu", str);
7123 return;
7126 /* Parse the TUNE string. */
7128 static void
7129 aarch64_parse_tune (void)
7131 const struct processor *cpu;
7132 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7133 strcpy (str, aarch64_tune_string);
7135 /* Loop through the list of supported CPUs to find a match. */
7136 for (cpu = all_cores; cpu->name != NULL; cpu++)
7138 if (strcmp (cpu->name, str) == 0)
7140 selected_tune = cpu;
7141 return;
7145 /* CPU name not found in list. */
7146 error ("unknown value %qs for -mtune", str);
7147 return;
7151 /* Implement TARGET_OPTION_OVERRIDE. */
7153 static void
7154 aarch64_override_options (void)
7156 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7157 If either of -march or -mtune is given, they override their
7158 respective component of -mcpu.
7160 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7161 with -march: if -mcpu is not present on the command line, -march
7162 must set a sensible default CPU. */
7163 if (aarch64_cpu_string)
7165 aarch64_parse_cpu ();
7168 if (aarch64_arch_string)
7170 aarch64_parse_arch ();
7173 if (aarch64_tune_string)
7175 aarch64_parse_tune ();
7178 #ifndef HAVE_AS_MABI_OPTION
7179 /* The compiler may have been configured with 2.23.* binutils, which does
7180 not have support for ILP32. */
7181 if (TARGET_ILP32)
7182 error ("Assembler does not support -mabi=ilp32");
7183 #endif
7185 initialize_aarch64_code_model ();
7187 aarch64_build_bitmask_table ();
7189 /* This target defaults to strict volatile bitfields. */
7190 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7191 flag_strict_volatile_bitfields = 1;
7193 /* If the user did not specify a processor, choose the default
7194 one for them. This will be the CPU set during configuration using
7195 --with-cpu, otherwise it is "generic". */
7196 if (!selected_cpu)
7198 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7199 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7202 gcc_assert (selected_cpu);
7204 if (!selected_tune)
7205 selected_tune = selected_cpu;
7207 aarch64_tune_flags = selected_tune->flags;
7208 aarch64_tune = selected_tune->core;
7209 aarch64_tune_params = selected_tune->tune;
7210 aarch64_architecture_version = selected_cpu->architecture_version;
7212 if (aarch64_fix_a53_err835769 == 2)
7214 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7215 aarch64_fix_a53_err835769 = 1;
7216 #else
7217 aarch64_fix_a53_err835769 = 0;
7218 #endif
7221 aarch64_register_fma_steering ();
7223 aarch64_override_options_after_change ();
7226 /* Implement targetm.override_options_after_change. */
7228 static void
7229 aarch64_override_options_after_change (void)
7231 if (flag_omit_frame_pointer)
7232 flag_omit_leaf_frame_pointer = false;
7233 else if (flag_omit_leaf_frame_pointer)
7234 flag_omit_frame_pointer = true;
7236 /* If not optimizing for size, set the default
7237 alignment to what the target wants. */
7238 if (!optimize_size)
7240 if (align_loops <= 0)
7241 align_loops = aarch64_tune_params->loop_align;
7242 if (align_jumps <= 0)
7243 align_jumps = aarch64_tune_params->jump_align;
7244 if (align_functions <= 0)
7245 align_functions = aarch64_tune_params->function_align;
7249 static struct machine_function *
7250 aarch64_init_machine_status (void)
7252 struct machine_function *machine;
7253 machine = ggc_cleared_alloc<machine_function> ();
7254 return machine;
7257 void
7258 aarch64_init_expanders (void)
7260 init_machine_status = aarch64_init_machine_status;
7263 /* A checking mechanism for the implementation of the various code models. */
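/* Illustrative mapping (not part of the original source): -mcmodel=small
   with -fpic selects AARCH64_CMODEL_SMALL_PIC, -mcmodel=tiny with -fpic
   selects AARCH64_CMODEL_TINY_PIC, and -mcmodel=large together with
   -fpic/-fPIC is rejected with a "sorry" diagnostic; without -fpic the
   user's choice is used unchanged.  */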
7264 static void
7265 initialize_aarch64_code_model (void)
7267 if (flag_pic)
7269 switch (aarch64_cmodel_var)
7271 case AARCH64_CMODEL_TINY:
7272 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7273 break;
7274 case AARCH64_CMODEL_SMALL:
7275 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7276 break;
7277 case AARCH64_CMODEL_LARGE:
7278 sorry ("code model %qs with -f%s", "large",
7279 flag_pic > 1 ? "PIC" : "pic");
break;
7280 default:
7281 gcc_unreachable ();
7284 else
7285 aarch64_cmodel = aarch64_cmodel_var;
7288 /* Return true if SYMBOL_REF X binds locally. */
7290 static bool
7291 aarch64_symbol_binds_local_p (const_rtx x)
7293 return (SYMBOL_REF_DECL (x)
7294 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7295 : SYMBOL_REF_LOCAL_P (x));
7298 /* Return true if SYMBOL_REF X is thread local */
7299 static bool
7300 aarch64_tls_symbol_p (rtx x)
7302 if (! TARGET_HAVE_TLS)
7303 return false;
7305 if (GET_CODE (x) != SYMBOL_REF)
7306 return false;
7308 return SYMBOL_REF_TLS_MODEL (x) != 0;
7311 /* Classify a TLS symbol into one of the TLS kinds. */
7312 enum aarch64_symbol_type
7313 aarch64_classify_tls_symbol (rtx x)
7315 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7317 switch (tls_kind)
7319 case TLS_MODEL_GLOBAL_DYNAMIC:
7320 case TLS_MODEL_LOCAL_DYNAMIC:
7321 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7323 case TLS_MODEL_INITIAL_EXEC:
7324 return SYMBOL_SMALL_GOTTPREL;
7326 case TLS_MODEL_LOCAL_EXEC:
7327 return SYMBOL_SMALL_TPREL;
7329 case TLS_MODEL_EMULATED:
7330 case TLS_MODEL_NONE:
7331 return SYMBOL_FORCE_TO_MEM;
7333 default:
7334 gcc_unreachable ();
7338 /* Return the method that should be used to access SYMBOL_REF or
7339 LABEL_REF X in context CONTEXT. */
7341 enum aarch64_symbol_type
7342 aarch64_classify_symbol (rtx x, rtx offset,
7343 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7345 if (GET_CODE (x) == LABEL_REF)
7347 switch (aarch64_cmodel)
7349 case AARCH64_CMODEL_LARGE:
7350 return SYMBOL_FORCE_TO_MEM;
7352 case AARCH64_CMODEL_TINY_PIC:
7353 case AARCH64_CMODEL_TINY:
7354 return SYMBOL_TINY_ABSOLUTE;
7356 case AARCH64_CMODEL_SMALL_PIC:
7357 case AARCH64_CMODEL_SMALL:
7358 return SYMBOL_SMALL_ABSOLUTE;
7360 default:
7361 gcc_unreachable ();
7365 if (GET_CODE (x) == SYMBOL_REF)
7367 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7368 return SYMBOL_FORCE_TO_MEM;
7370 if (aarch64_tls_symbol_p (x))
7371 return aarch64_classify_tls_symbol (x);
7373 switch (aarch64_cmodel)
7375 case AARCH64_CMODEL_TINY:
7376 /* When we retrieve a symbol + offset address, we have to make sure
7377 the offset does not cause overflow of the final address. But
7378 we have no way of knowing the address of symbol at compile time
7379 so we can't accurately say if the distance between the PC and
7380 symbol + offset is outside the addressable range of +/-1M in the
7381 TINY code model. So we rely on images not being greater than
7382 1M, cap the offset at 1M, and anything beyond 1M will have to
7383 be loaded using an alternative mechanism. */
7384 if (SYMBOL_REF_WEAK (x)
7385 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7386 return SYMBOL_FORCE_TO_MEM;
7387 return SYMBOL_TINY_ABSOLUTE;
7389 case AARCH64_CMODEL_SMALL:
7390 /* Same reasoning as the tiny code model, but the offset cap here is
7391 4G. */
7392 if (SYMBOL_REF_WEAK (x)
7393 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7394 HOST_WIDE_INT_C (4294967264)))
7395 return SYMBOL_FORCE_TO_MEM;
7396 return SYMBOL_SMALL_ABSOLUTE;
7398 case AARCH64_CMODEL_TINY_PIC:
7399 if (!aarch64_symbol_binds_local_p (x))
7400 return SYMBOL_TINY_GOT;
7401 return SYMBOL_TINY_ABSOLUTE;
7403 case AARCH64_CMODEL_SMALL_PIC:
7404 if (!aarch64_symbol_binds_local_p (x))
7405 return SYMBOL_SMALL_GOT;
7406 return SYMBOL_SMALL_ABSOLUTE;
7408 default:
7409 gcc_unreachable ();
7413 /* By default push everything into the constant pool. */
7414 return SYMBOL_FORCE_TO_MEM;
7417 bool
7418 aarch64_constant_address_p (rtx x)
7420 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7423 bool
7424 aarch64_legitimate_pic_operand_p (rtx x)
7426 if (GET_CODE (x) == SYMBOL_REF
7427 || (GET_CODE (x) == CONST
7428 && GET_CODE (XEXP (x, 0)) == PLUS
7429 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7430 return false;
7432 return true;
7435 /* Return true if X holds either a quarter-precision or
7436 floating-point +0.0 constant. */
7437 static bool
7438 aarch64_valid_floating_const (machine_mode mode, rtx x)
7440 if (!CONST_DOUBLE_P (x))
7441 return false;
7443 if (aarch64_float_const_zero_rtx_p (x))
7444 return true;
7446 /* TFmode is only handled for 0.0, which was accepted above; otherwise require SFmode or DFmode. */
7447 if (!(mode == SFmode || mode == DFmode))
7448 return false;
7450 return aarch64_float_const_representable_p (x);
7453 static bool
7454 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7456 /* Do not allow vector struct mode constants. We could support
7457 0 and -1 easily, but they need support in aarch64-simd.md. */
7458 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7459 return false;
7461 /* This could probably go away because
7462 we now decompose CONST_INTs according to expand_mov_immediate. */
7463 if ((GET_CODE (x) == CONST_VECTOR
7464 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7465 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7466 return !targetm.cannot_force_const_mem (mode, x);
7468 if (GET_CODE (x) == HIGH
7469 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7470 return true;
7472 return aarch64_constant_address_p (x);
7476 aarch64_load_tp (rtx target)
7478 if (!target
7479 || GET_MODE (target) != Pmode
7480 || !register_operand (target, Pmode))
7481 target = gen_reg_rtx (Pmode);
7483 /* Can return in any reg. */
7484 emit_insn (gen_aarch64_load_tp_hard (target));
7485 return target;
7488 /* On AAPCS systems, this is the "struct __va_list". */
7489 static GTY(()) tree va_list_type;
7491 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7492 Return the type to use as __builtin_va_list.
7494 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7496 struct __va_list
7498 void *__stack;
7499 void *__gr_top;
7500 void *__vr_top;
7501 int __gr_offs;
7502 int __vr_offs;
7503 }; */
7505 static tree
7506 aarch64_build_builtin_va_list (void)
7508 tree va_list_name;
7509 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7511 /* Create the type. */
7512 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7513 /* Give it the required name. */
7514 va_list_name = build_decl (BUILTINS_LOCATION,
7515 TYPE_DECL,
7516 get_identifier ("__va_list"),
7517 va_list_type);
7518 DECL_ARTIFICIAL (va_list_name) = 1;
7519 TYPE_NAME (va_list_type) = va_list_name;
7520 TYPE_STUB_DECL (va_list_type) = va_list_name;
7522 /* Create the fields. */
7523 f_stack = build_decl (BUILTINS_LOCATION,
7524 FIELD_DECL, get_identifier ("__stack"),
7525 ptr_type_node);
7526 f_grtop = build_decl (BUILTINS_LOCATION,
7527 FIELD_DECL, get_identifier ("__gr_top"),
7528 ptr_type_node);
7529 f_vrtop = build_decl (BUILTINS_LOCATION,
7530 FIELD_DECL, get_identifier ("__vr_top"),
7531 ptr_type_node);
7532 f_groff = build_decl (BUILTINS_LOCATION,
7533 FIELD_DECL, get_identifier ("__gr_offs"),
7534 integer_type_node);
7535 f_vroff = build_decl (BUILTINS_LOCATION,
7536 FIELD_DECL, get_identifier ("__vr_offs"),
7537 integer_type_node);
7539 DECL_ARTIFICIAL (f_stack) = 1;
7540 DECL_ARTIFICIAL (f_grtop) = 1;
7541 DECL_ARTIFICIAL (f_vrtop) = 1;
7542 DECL_ARTIFICIAL (f_groff) = 1;
7543 DECL_ARTIFICIAL (f_vroff) = 1;
7545 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7546 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7547 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7548 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7549 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7551 TYPE_FIELDS (va_list_type) = f_stack;
7552 DECL_CHAIN (f_stack) = f_grtop;
7553 DECL_CHAIN (f_grtop) = f_vrtop;
7554 DECL_CHAIN (f_vrtop) = f_groff;
7555 DECL_CHAIN (f_groff) = f_vroff;
7557 /* Compute its layout. */
7558 layout_type (va_list_type);
7560 return va_list_type;
7563 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
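/* Sketch of the effect (illustrative, not part of the original source),
   assuming the AAPCS64 values NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16: for a callee whose named
   arguments consumed 2 general registers and 1 vector register, va_start
   leaves
       __gr_offs = -(8 - 2) * 8  ==  -48
       __vr_offs = -(8 - 1) * 16 == -112
   with __gr_top/__vr_top pointing just past the respective register save
   areas and __stack at the first stack-passed vararg.  */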
7564 static void
7565 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7567 const CUMULATIVE_ARGS *cum;
7568 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7569 tree stack, grtop, vrtop, groff, vroff;
7570 tree t;
7571 int gr_save_area_size;
7572 int vr_save_area_size;
7573 int vr_offset;
7575 cum = &crtl->args.info;
7576 gr_save_area_size
7577 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7578 vr_save_area_size
7579 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7581 if (!TARGET_FLOAT)
7583 gcc_assert (cum->aapcs_nvrn == 0);
7584 vr_save_area_size = 0;
7587 f_stack = TYPE_FIELDS (va_list_type_node);
7588 f_grtop = DECL_CHAIN (f_stack);
7589 f_vrtop = DECL_CHAIN (f_grtop);
7590 f_groff = DECL_CHAIN (f_vrtop);
7591 f_vroff = DECL_CHAIN (f_groff);
7593 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7594 NULL_TREE);
7595 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7596 NULL_TREE);
7597 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7598 NULL_TREE);
7599 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7600 NULL_TREE);
7601 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7602 NULL_TREE);
7604 /* Emit code to initialize STACK, which points to the next varargs stack
7605 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7606 by named arguments. STACK is 8-byte aligned. */
7607 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7608 if (cum->aapcs_stack_size > 0)
7609 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7610 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7611 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7613 /* Emit code to initialize GRTOP, the top of the GR save area.
7614 virtual_incoming_args_rtx should have been 16-byte aligned. */
7615 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7616 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7617 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7619 /* Emit code to initialize VRTOP, the top of the VR save area.
7620 This address is gr_save_area_bytes below GRTOP, rounded
7621 down to the next 16-byte boundary. */
7622 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7623 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7624 STACK_BOUNDARY / BITS_PER_UNIT);
7626 if (vr_offset)
7627 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7628 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7629 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7631 /* Emit code to initialize GROFF, the offset from GRTOP of the
7632 next GPR argument. */
7633 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7634 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7635 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7637 /* Likewise emit code to initialize VROFF, the offset from FTOP
7638 of the next VR argument. */
7639 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7640 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7641 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7644 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7646 static tree
7647 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7648 gimple_seq *post_p ATTRIBUTE_UNUSED)
7650 tree addr;
7651 bool indirect_p;
7652 bool is_ha; /* is HFA or HVA. */
7653 bool dw_align; /* double-word align. */
7654 machine_mode ag_mode = VOIDmode;
7655 int nregs;
7656 machine_mode mode;
7658 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7659 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7660 HOST_WIDE_INT size, rsize, adjust, align;
7661 tree t, u, cond1, cond2;
7663 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7664 if (indirect_p)
7665 type = build_pointer_type (type);
7667 mode = TYPE_MODE (type);
7669 f_stack = TYPE_FIELDS (va_list_type_node);
7670 f_grtop = DECL_CHAIN (f_stack);
7671 f_vrtop = DECL_CHAIN (f_grtop);
7672 f_groff = DECL_CHAIN (f_vrtop);
7673 f_vroff = DECL_CHAIN (f_groff);
7675 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7676 f_stack, NULL_TREE);
7677 size = int_size_in_bytes (type);
7678 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7680 dw_align = false;
7681 adjust = 0;
7682 if (aarch64_vfp_is_call_or_return_candidate (mode,
7683 type,
7684 &ag_mode,
7685 &nregs,
7686 &is_ha))
7688 /* TYPE passed in fp/simd registers. */
7689 if (!TARGET_FLOAT)
7690 aarch64_err_no_fpadvsimd (mode, "varargs");
7692 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7693 unshare_expr (valist), f_vrtop, NULL_TREE);
7694 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7695 unshare_expr (valist), f_vroff, NULL_TREE);
7697 rsize = nregs * UNITS_PER_VREG;
7699 if (is_ha)
7701 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7702 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7704 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7705 && size < UNITS_PER_VREG)
7707 adjust = UNITS_PER_VREG - size;
7710 else
7712 /* TYPE passed in general registers. */
7713 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7714 unshare_expr (valist), f_grtop, NULL_TREE);
7715 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7716 unshare_expr (valist), f_groff, NULL_TREE);
7717 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7718 nregs = rsize / UNITS_PER_WORD;
7720 if (align > 8)
7721 dw_align = true;
7723 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7724 && size < UNITS_PER_WORD)
7726 adjust = UNITS_PER_WORD - size;
7730 /* Get a local temporary for the field value. */
7731 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7733 /* Emit code to branch if off >= 0. */
7734 t = build2 (GE_EXPR, boolean_type_node, off,
7735 build_int_cst (TREE_TYPE (off), 0));
7736 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7738 if (dw_align)
7740 /* Emit: offs = (offs + 15) & -16. */
7741 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7742 build_int_cst (TREE_TYPE (off), 15));
7743 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7744 build_int_cst (TREE_TYPE (off), -16));
7745 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7747 else
7748 roundup = NULL;
7750 /* Update ap.__[g|v]r_offs */
7751 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7752 build_int_cst (TREE_TYPE (off), rsize));
7753 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7755 /* String up. */
7756 if (roundup)
7757 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7759 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7760 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7761 build_int_cst (TREE_TYPE (f_off), 0));
7762 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7764 /* String up: make sure the assignment happens before the use. */
7765 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7766 COND_EXPR_ELSE (cond1) = t;
7768 /* Prepare the trees handling the argument that is passed on the stack;
7769 the top level node will be stored in ON_STACK. */
7770 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7771 if (align > 8)
7773 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7774 t = fold_convert (intDI_type_node, arg);
7775 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7776 build_int_cst (TREE_TYPE (t), 15));
7777 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7778 build_int_cst (TREE_TYPE (t), -16));
7779 t = fold_convert (TREE_TYPE (arg), t);
7780 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7782 else
7783 roundup = NULL;
7784 /* Advance ap.__stack */
7785 t = fold_convert (intDI_type_node, arg);
7786 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7787 build_int_cst (TREE_TYPE (t), size + 7));
7788 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7789 build_int_cst (TREE_TYPE (t), -8));
7790 t = fold_convert (TREE_TYPE (arg), t);
7791 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7792 /* String up roundup and advance. */
7793 if (roundup)
7794 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7795 /* String up with arg */
7796 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7797 /* Big-endianness related address adjustment. */
7798 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7799 && size < UNITS_PER_WORD)
7801 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7802 size_int (UNITS_PER_WORD - size));
7803 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7806 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7807 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7809 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7810 t = off;
7811 if (adjust)
7812 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7813 build_int_cst (TREE_TYPE (off), adjust));
7815 t = fold_convert (sizetype, t);
7816 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7818 if (is_ha)
7820 /* type ha; // treat as "struct {ftype field[n];}"
7821 ... [computing offs]
7822 for (i = 0; i < nregs; ++i, offs += 16)
7823 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7824 return ha; */
7825 int i;
7826 tree tmp_ha, field_t, field_ptr_t;
7828 /* Declare a local variable. */
7829 tmp_ha = create_tmp_var_raw (type, "ha");
7830 gimple_add_tmp_var (tmp_ha);
7832 /* Establish the base type. */
7833 switch (ag_mode)
7835 case SFmode:
7836 field_t = float_type_node;
7837 field_ptr_t = float_ptr_type_node;
7838 break;
7839 case DFmode:
7840 field_t = double_type_node;
7841 field_ptr_t = double_ptr_type_node;
7842 break;
7843 case TFmode:
7844 field_t = long_double_type_node;
7845 field_ptr_t = long_double_ptr_type_node;
7846 break;
7847 /* Half-precision and quad-precision are not fully supported yet. Enable
7848 the following code once support is complete; the correct type node
7849 for __fp16 * still needs to be found. */
7850 #if 0
7851 case HFmode:
7852 field_t = float_type_node;
7853 field_ptr_t = float_ptr_type_node;
7854 break;
7855 #endif
7856 case V2SImode:
7857 case V4SImode:
7859 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7860 field_t = build_vector_type_for_mode (innertype, ag_mode);
7861 field_ptr_t = build_pointer_type (field_t);
7863 break;
7864 default:
7865 gcc_assert (0);
7868 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7869 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7870 addr = t;
7871 t = fold_convert (field_ptr_t, addr);
7872 t = build2 (MODIFY_EXPR, field_t,
7873 build1 (INDIRECT_REF, field_t, tmp_ha),
7874 build1 (INDIRECT_REF, field_t, t));
7876 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7877 for (i = 1; i < nregs; ++i)
7879 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7880 u = fold_convert (field_ptr_t, addr);
7881 u = build2 (MODIFY_EXPR, field_t,
7882 build2 (MEM_REF, field_t, tmp_ha,
7883 build_int_cst (field_ptr_t,
7884 (i *
7885 int_size_in_bytes (field_t)))),
7886 build1 (INDIRECT_REF, field_t, u));
7887 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7890 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7891 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7894 COND_EXPR_ELSE (cond2) = t;
7895 addr = fold_convert (build_pointer_type (type), cond1);
7896 addr = build_va_arg_indirect_ref (addr);
7898 if (indirect_p)
7899 addr = build_va_arg_indirect_ref (addr);
7901 return addr;
7904 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7906 static void
7907 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7908 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7909 int no_rtl)
7911 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7912 CUMULATIVE_ARGS local_cum;
7913 int gr_saved, vr_saved;
7915 /* The caller has advanced CUM up to, but not beyond, the last named
7916 argument. Advance a local copy of CUM past the last "real" named
7917 argument, to find out how many registers are left over. */
7918 local_cum = *cum;
7919 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7921 /* Find out how many registers we need to save. */
7922 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7923 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7925 if (!TARGET_FLOAT)
7927 gcc_assert (local_cum.aapcs_nvrn == 0);
7928 vr_saved = 0;
7931 if (!no_rtl)
7933 if (gr_saved > 0)
7935 rtx ptr, mem;
7937 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7938 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7939 - gr_saved * UNITS_PER_WORD);
7940 mem = gen_frame_mem (BLKmode, ptr);
7941 set_mem_alias_set (mem, get_varargs_alias_set ());
7943 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7944 mem, gr_saved);
7946 if (vr_saved > 0)
7948 /* We can't use move_block_from_reg, because it will use
7949 the wrong mode, storing D regs only. */
7950 machine_mode mode = TImode;
7951 int off, i;
7953 /* Set OFF to the offset from virtual_incoming_args_rtx of
7954 the first vector register. The VR save area lies below
7955 the GR one, and is aligned to 16 bytes. */
7956 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7957 STACK_BOUNDARY / BITS_PER_UNIT);
7958 off -= vr_saved * UNITS_PER_VREG;
7960 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7962 rtx ptr, mem;
7964 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7965 mem = gen_frame_mem (mode, ptr);
7966 set_mem_alias_set (mem, get_varargs_alias_set ());
7967 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7968 off += UNITS_PER_VREG;
7973 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7974 any complication of having crtl->args.pretend_args_size changed. */
7975 cfun->machine->frame.saved_varargs_size
7976 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7977 STACK_BOUNDARY / BITS_PER_UNIT)
7978 + vr_saved * UNITS_PER_VREG);
7981 static void
7982 aarch64_conditional_register_usage (void)
7984 int i;
7985 if (!TARGET_FLOAT)
7987 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7989 fixed_regs[i] = 1;
7990 call_used_regs[i] = 1;
7995 /* Walk down the type tree of TYPE counting consecutive base elements.
7996 If *MODEP is VOIDmode, then set it to the first valid floating point
7997 type. If a non-floating point type is found, or if a floating point
7998 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7999 otherwise return the count in the sub-tree. */
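/* Illustrative example (not part of the original source): for
       struct { double x; double y; double z; };
   this returns 3 with *MODEP set to DFmode, i.e. a homogeneous
   floating-point aggregate of three elements; adding a float member would
   make the element modes mismatch and the function would return -1.  */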
8000 static int
8001 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8003 machine_mode mode;
8004 HOST_WIDE_INT size;
8006 switch (TREE_CODE (type))
8008 case REAL_TYPE:
8009 mode = TYPE_MODE (type);
8010 if (mode != DFmode && mode != SFmode && mode != TFmode)
8011 return -1;
8013 if (*modep == VOIDmode)
8014 *modep = mode;
8016 if (*modep == mode)
8017 return 1;
8019 break;
8021 case COMPLEX_TYPE:
8022 mode = TYPE_MODE (TREE_TYPE (type));
8023 if (mode != DFmode && mode != SFmode && mode != TFmode)
8024 return -1;
8026 if (*modep == VOIDmode)
8027 *modep = mode;
8029 if (*modep == mode)
8030 return 2;
8032 break;
8034 case VECTOR_TYPE:
8035 /* Use V2SImode and V4SImode as representatives of all 64-bit
8036 and 128-bit vector types. */
8037 size = int_size_in_bytes (type);
8038 switch (size)
8040 case 8:
8041 mode = V2SImode;
8042 break;
8043 case 16:
8044 mode = V4SImode;
8045 break;
8046 default:
8047 return -1;
8050 if (*modep == VOIDmode)
8051 *modep = mode;
8053 /* Vector modes are considered to be opaque: two vectors are
8054 equivalent for the purposes of being homogeneous aggregates
8055 if they are the same size. */
8056 if (*modep == mode)
8057 return 1;
8059 break;
8061 case ARRAY_TYPE:
8063 int count;
8064 tree index = TYPE_DOMAIN (type);
8066 /* Can't handle incomplete types nor sizes that are not
8067 fixed. */
8068 if (!COMPLETE_TYPE_P (type)
8069 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8070 return -1;
8072 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8073 if (count == -1
8074 || !index
8075 || !TYPE_MAX_VALUE (index)
8076 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8077 || !TYPE_MIN_VALUE (index)
8078 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8079 || count < 0)
8080 return -1;
8082 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8083 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8085 /* There must be no padding. */
8086 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8087 return -1;
8089 return count;
8092 case RECORD_TYPE:
8094 int count = 0;
8095 int sub_count;
8096 tree field;
8098 /* Can't handle incomplete types nor sizes that are not
8099 fixed. */
8100 if (!COMPLETE_TYPE_P (type)
8101 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8102 return -1;
8104 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8106 if (TREE_CODE (field) != FIELD_DECL)
8107 continue;
8109 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8110 if (sub_count < 0)
8111 return -1;
8112 count += sub_count;
8115 /* There must be no padding. */
8116 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8117 return -1;
8119 return count;
8122 case UNION_TYPE:
8123 case QUAL_UNION_TYPE:
8125 /* These aren't very interesting except in a degenerate case. */
8126 int count = 0;
8127 int sub_count;
8128 tree field;
8130 /* Can't handle incomplete types nor sizes that are not
8131 fixed. */
8132 if (!COMPLETE_TYPE_P (type)
8133 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8134 return -1;
8136 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8138 if (TREE_CODE (field) != FIELD_DECL)
8139 continue;
8141 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8142 if (sub_count < 0)
8143 return -1;
8144 count = count > sub_count ? count : sub_count;
8147 /* There must be no padding. */
8148 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8149 return -1;
8151 return count;
8154 default:
8155 break;
8158 return -1;
8161 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8162 type as described in AAPCS64 \S 4.1.2.
8164 See the comment above aarch64_composite_type_p for the notes on MODE. */
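/* Illustrative example (not part of the original source): the ACLE types
   int32x2_t (8 bytes) and int32x4_t (16 bytes) are short vectors in this
   sense, whereas a 32-byte GNU vector type is not.  */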
8166 static bool
8167 aarch64_short_vector_p (const_tree type,
8168 machine_mode mode)
8170 HOST_WIDE_INT size = -1;
8172 if (type && TREE_CODE (type) == VECTOR_TYPE)
8173 size = int_size_in_bytes (type);
8174 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8175 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8176 size = GET_MODE_SIZE (mode);
8178 return (size == 8 || size == 16);
8181 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8182 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8183 array types. The C99 floating-point complex types are also considered
8184 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8185 types, which are GCC extensions and out of the scope of AAPCS64, are
8186 treated as composite types here as well.
8188 Note that MODE itself is not sufficient in determining whether a type
8189 is such a composite type or not. This is because
8190 stor-layout.c:compute_record_mode may have already changed the MODE
8191 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8192 structure with only one field may have its MODE set to the mode of the
8193 field. Also an integer mode whose size matches the size of the
8194 RECORD_TYPE type may be used to substitute the original mode
8195 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8196 solely relied on. */
8198 static bool
8199 aarch64_composite_type_p (const_tree type,
8200 machine_mode mode)
8202 if (aarch64_short_vector_p (type, mode))
8203 return false;
8205 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8206 return true;
8208 if (mode == BLKmode
8209 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8210 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8211 return true;
8213 return false;
8216 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8217 shall be passed or returned in simd/fp register(s) (providing these
8218 parameter passing registers are available).
8220 Upon successful return, *COUNT returns the number of needed registers,
8221 *BASE_MODE returns the mode of the individual register and when IS_HA
8222 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8223 floating-point aggregate or a homogeneous short-vector aggregate. */
8225 static bool
8226 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8227 const_tree type,
8228 machine_mode *base_mode,
8229 int *count,
8230 bool *is_ha)
8232 machine_mode new_mode = VOIDmode;
8233 bool composite_p = aarch64_composite_type_p (type, mode);
8235 if (is_ha != NULL) *is_ha = false;
8237 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8238 || aarch64_short_vector_p (type, mode))
8240 *count = 1;
8241 new_mode = mode;
8243 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8245 if (is_ha != NULL) *is_ha = true;
8246 *count = 2;
8247 new_mode = GET_MODE_INNER (mode);
8249 else if (type && composite_p)
8251 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8253 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8255 if (is_ha != NULL) *is_ha = true;
8256 *count = ag_count;
8258 else
8259 return false;
8261 else
8262 return false;
8264 *base_mode = new_mode;
8265 return true;
8268 /* Implement TARGET_STRUCT_VALUE_RTX. */
8270 static rtx
8271 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8272 int incoming ATTRIBUTE_UNUSED)
8274 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8277 /* Implements target hook vector_mode_supported_p. */
8278 static bool
8279 aarch64_vector_mode_supported_p (machine_mode mode)
8281 if (TARGET_SIMD
8282 && (mode == V4SImode || mode == V8HImode
8283 || mode == V16QImode || mode == V2DImode
8284 || mode == V2SImode || mode == V4HImode
8285 || mode == V8QImode || mode == V2SFmode
8286 || mode == V4SFmode || mode == V2DFmode
8287 || mode == V1DFmode))
8288 return true;
8290 return false;
8293 /* Return appropriate SIMD container
8294 for MODE within a vector of WIDTH bits. */
8295 static machine_mode
8296 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8298 gcc_assert (width == 64 || width == 128);
8299 if (TARGET_SIMD)
8301 if (width == 128)
8302 switch (mode)
8304 case DFmode:
8305 return V2DFmode;
8306 case SFmode:
8307 return V4SFmode;
8308 case SImode:
8309 return V4SImode;
8310 case HImode:
8311 return V8HImode;
8312 case QImode:
8313 return V16QImode;
8314 case DImode:
8315 return V2DImode;
8316 default:
8317 break;
8319 else
8320 switch (mode)
8322 case SFmode:
8323 return V2SFmode;
8324 case SImode:
8325 return V2SImode;
8326 case HImode:
8327 return V4HImode;
8328 case QImode:
8329 return V8QImode;
8330 default:
8331 break;
8334 return word_mode;
8337 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8338 static machine_mode
8339 aarch64_preferred_simd_mode (machine_mode mode)
8341 return aarch64_simd_container_mode (mode, 128);
8344 /* Return the bitmask of possible vector sizes for the vectorizer
8345 to iterate over. */
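/* Illustrative note (not part of the original source): the bitmask below,
   16 | 8, advertises 128-bit and 64-bit vectors; together with
   aarch64_preferred_simd_mode above, this lets the vectorizer try Q
   registers first and fall back to D registers.  */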
8346 static unsigned int
8347 aarch64_autovectorize_vector_sizes (void)
8349 return (16 | 8);
8352 /* Implement TARGET_MANGLE_TYPE. */
8354 static const char *
8355 aarch64_mangle_type (const_tree type)
8357 /* The AArch64 ABI documents say that "__va_list" has to be
8358 mangled as if it is in the "std" namespace. */
8359 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8360 return "St9__va_list";
8362 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8363 builtin types. */
8364 if (TYPE_NAME (type) != NULL)
8365 return aarch64_mangle_builtin_type (type);
8367 /* Use the default mangling. */
8368 return NULL;
8372 /* Return true if the rtx_insn contains a MEM RTX somewhere
8373 in it. */
8375 static bool
8376 has_memory_op (rtx_insn *mem_insn)
8378 subrtx_iterator::array_type array;
8379 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8380 if (MEM_P (*iter))
8381 return true;
8383 return false;
8386 /* Find the first rtx_insn before insn that will generate an assembly
8387 instruction. */
8389 static rtx_insn *
8390 aarch64_prev_real_insn (rtx_insn *insn)
8392 if (!insn)
8393 return NULL;
8397 insn = prev_real_insn (insn);
8399 while (insn && recog_memoized (insn) < 0);
8401 return insn;
8404 static bool
8405 is_madd_op (enum attr_type t1)
8407 unsigned int i;
8408 /* A number of these may be AArch32 only. */
8409 enum attr_type mlatypes[] = {
8410 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8411 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8412 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8415 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8417 if (t1 == mlatypes[i])
8418 return true;
8421 return false;
8424 /* Check if there is a register dependency between a load and the insn
8425 for which we hold recog_data. */
8427 static bool
8428 dep_between_memop_and_curr (rtx memop)
8430 rtx load_reg;
8431 int opno;
8433 gcc_assert (GET_CODE (memop) == SET);
8435 if (!REG_P (SET_DEST (memop)))
8436 return false;
8438 load_reg = SET_DEST (memop);
8439 for (opno = 1; opno < recog_data.n_operands; opno++)
8441 rtx operand = recog_data.operand[opno];
8442 if (REG_P (operand)
8443 && reg_overlap_mentioned_p (load_reg, operand))
8444 return true;
8447 return false;
8451 /* When working around the Cortex-A53 erratum 835769,
8452 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8453 instruction and has a preceding memory instruction such that a NOP
8454 should be inserted between them. */
8456 bool
8457 aarch64_madd_needs_nop (rtx_insn* insn)
8459 enum attr_type attr_type;
8460 rtx_insn *prev;
8461 rtx body;
8463 if (!aarch64_fix_a53_err835769)
8464 return false;
8466 if (recog_memoized (insn) < 0)
8467 return false;
8469 attr_type = get_attr_type (insn);
8470 if (!is_madd_op (attr_type))
8471 return false;
8473 prev = aarch64_prev_real_insn (insn);
8474 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8475 Restore recog state to INSN to avoid state corruption. */
8476 extract_constrain_insn_cached (insn);
8478 if (!prev || !has_memory_op (prev))
8479 return false;
8481 body = single_set (prev);
8483 /* If the previous insn is a memory op and there is no dependency between
8484 it and the DImode madd, emit a NOP between them. If body is NULL then we
8485 have a complex memory operation, probably a load/store pair.
8486 Be conservative for now and emit a NOP. */
8487 if (GET_MODE (recog_data.operand[0]) == DImode
8488 && (!body || !dep_between_memop_and_curr (body)))
8489 return true;
8491 return false;
8496 /* Implement FINAL_PRESCAN_INSN. */
8498 void
8499 aarch64_final_prescan_insn (rtx_insn *insn)
8501 if (aarch64_madd_needs_nop (insn))
8502 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8506 /* Return the equivalent letter for size. */
8507 static char
8508 sizetochar (int size)
8510 switch (size)
8512 case 64: return 'd';
8513 case 32: return 's';
8514 case 16: return 'h';
8515 case 8 : return 'b';
8516 default: gcc_unreachable ();
8520 /* Return true iff x is a uniform vector of floating-point
8521 constants, and the constant can be represented in
8522 quarter-precision form. Note, as aarch64_float_const_representable
8523 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8524 static bool
8525 aarch64_vect_float_const_representable_p (rtx x)
8527 int i = 0;
8528 REAL_VALUE_TYPE r0, ri;
8529 rtx x0, xi;
8531 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8532 return false;
8534 x0 = CONST_VECTOR_ELT (x, 0);
8535 if (!CONST_DOUBLE_P (x0))
8536 return false;
8538 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8540 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8542 xi = CONST_VECTOR_ELT (x, i);
8543 if (!CONST_DOUBLE_P (xi))
8544 return false;
8546 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8547 if (!REAL_VALUES_EQUAL (r0, ri))
8548 return false;
8551 return aarch64_float_const_representable_p (x0);
8554 /* Return true for valid and false for invalid. */
8555 bool
8556 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8557 struct simd_immediate_info *info)
8559 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8560 matches = 1; \
8561 for (i = 0; i < idx; i += (STRIDE)) \
8562 if (!(TEST)) \
8563 matches = 0; \
8564 if (matches) \
8566 immtype = (CLASS); \
8567 elsize = (ELSIZE); \
8568 eshift = (SHIFT); \
8569 emvn = (NEG); \
8570 break; \
8573 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8574 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8575 unsigned char bytes[16];
8576 int immtype = -1, matches;
8577 unsigned int invmask = inverse ? 0xff : 0;
8578 int eshift, emvn;
8580 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8582 if (! (aarch64_simd_imm_zero_p (op, mode)
8583 || aarch64_vect_float_const_representable_p (op)))
8584 return false;
8586 if (info)
8588 info->value = CONST_VECTOR_ELT (op, 0);
8589 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8590 info->mvn = false;
8591 info->shift = 0;
8594 return true;
8597 /* Splat vector constant out into a byte vector. */
8598 for (i = 0; i < n_elts; i++)
8600 /* The vector is provided in GCC's endian-neutral fashion. For aarch64_be,
8601 it must be laid out in the vector register in reverse order. */
8602 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8603 unsigned HOST_WIDE_INT elpart;
8604 unsigned int part, parts;
8606 if (CONST_INT_P (el))
8608 elpart = INTVAL (el);
8609 parts = 1;
8611 else if (GET_CODE (el) == CONST_DOUBLE)
8613 elpart = CONST_DOUBLE_LOW (el);
8614 parts = 2;
8616 else
8617 gcc_unreachable ();
8619 for (part = 0; part < parts; part++)
8621 unsigned int byte;
8622 for (byte = 0; byte < innersize; byte++)
8624 bytes[idx++] = (elpart & 0xff) ^ invmask;
8625 elpart >>= BITS_PER_UNIT;
8627 if (GET_CODE (el) == CONST_DOUBLE)
8628 elpart = CONST_DOUBLE_HIGH (el);
8632 /* Sanity check. */
8633 gcc_assert (idx == GET_MODE_SIZE (mode));
8637 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8638 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8640 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8641 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8643 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8644 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8646 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8647 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8649 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8651 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8653 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8654 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8656 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8657 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8659 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8660 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8662 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8663 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8665 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8667 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8669 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8670 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8672 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8673 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8675 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8676 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8678 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8679 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8681 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8683 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8684 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8686 while (0);
8688 if (immtype == -1)
8689 return false;
8691 if (info)
8693 info->element_width = elsize;
8694 info->mvn = emvn != 0;
8695 info->shift = eshift;
8697 unsigned HOST_WIDE_INT imm = 0;
8699 if (immtype >= 12 && immtype <= 15)
8700 info->msl = true;
8702 /* Un-invert bytes of recognized vector, if necessary. */
8703 if (invmask != 0)
8704 for (i = 0; i < idx; i++)
8705 bytes[i] ^= invmask;
8707 if (immtype == 17)
8709 /* FIXME: Broken on 32-bit H_W_I hosts. */
8710 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8712 for (i = 0; i < 8; i++)
8713 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8714 << (i * BITS_PER_UNIT);
8717 info->value = GEN_INT (imm);
8719 else
8721 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8722 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8724 /* Construct 'abcdefgh' because the assembler cannot handle
8725 generic constants. */
8726 if (info->mvn)
8727 imm = ~imm;
8728 imm = (imm >> info->shift) & 0xff;
8729 info->value = GEN_INT (imm);
8733 return true;
8734 #undef CHECK
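/* Worked example for the CHECK table above (values invented for
   illustration): a V4SImode vector whose four lanes all hold 0x00AB0000
   splats to the byte pattern { 00, 00, AB, 00, ... }, which matches the
   class-2 test (32-bit element, non-zero byte in position 2).  INFO is
   then filled in with element_width = 32, shift = 16, mvn = false and
   value = 0xAB, i.e. the immediate the assembler sees in
   "movi v0.4s, 0xab, lsl 16".  */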
8737 /* Check if immediate shift constants are within range. */
8738 bool
8739 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8741 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8742 if (left)
8743 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8744 else
8745 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8748 /* Return true if X is a uniform vector where all elements
8749 are either the floating-point constant 0.0 or the
8750 integer constant 0. */
8751 bool
8752 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8754 return x == CONST0_RTX (mode);
8757 bool
8758 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8760 HOST_WIDE_INT imm = INTVAL (x);
8761 int i;
8763 for (i = 0; i < 8; i++)
8765 unsigned int byte = imm & 0xff;
8766 if (byte != 0xff && byte != 0)
8767 return false;
8768 imm >>= 8;
8771 return true;
8774 bool
8775 aarch64_mov_operand_p (rtx x,
8776 enum aarch64_symbol_context context,
8777 machine_mode mode)
8779 if (GET_CODE (x) == HIGH
8780 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8781 return true;
8783 if (CONST_INT_P (x))
8784 return true;
8786 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8787 return true;
8789 return aarch64_classify_symbolic_expression (x, context)
8790 == SYMBOL_TINY_ABSOLUTE;
8793 /* Return a const_int vector of VAL. */
8795 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8797 int nunits = GET_MODE_NUNITS (mode);
8798 rtvec v = rtvec_alloc (nunits);
8799 int i;
8801 for (i=0; i < nunits; i++)
8802 RTVEC_ELT (v, i) = GEN_INT (val);
8804 return gen_rtx_CONST_VECTOR (mode, v);
8807 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8809 bool
8810 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8812 machine_mode vmode;
8814 gcc_assert (!VECTOR_MODE_P (mode));
8815 vmode = aarch64_preferred_simd_mode (mode);
8816 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8817 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8820 /* Construct and return a PARALLEL RTX vector with elements numbering the
8821 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8822 the vector - from the perspective of the architecture. This does not
8823 line up with GCC's perspective on lane numbers, so we end up with
8824 different masks depending on our target endianness. The diagram
8825 below may help. We must draw the distinction when building masks
8826 which select one half of the vector. An instruction selecting
8827 architectural low-lanes for a big-endian target must be described using
8828 a mask selecting GCC high-lanes.
8830 Big-Endian Little-Endian
8832 GCC 0 1 2 3 3 2 1 0
8833 | x | x | x | x | | x | x | x | x |
8834 Architecture 3 2 1 0 3 2 1 0
8836 Low Mask: { 2, 3 } { 0, 1 }
8837 High Mask: { 0, 1 } { 2, 3 }
8841 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8843 int nunits = GET_MODE_NUNITS (mode);
8844 rtvec v = rtvec_alloc (nunits / 2);
8845 int high_base = nunits / 2;
8846 int low_base = 0;
8847 int base;
8848 rtx t1;
8849 int i;
8851 if (BYTES_BIG_ENDIAN)
8852 base = high ? low_base : high_base;
8853 else
8854 base = high ? high_base : low_base;
8856 for (i = 0; i < nunits / 2; i++)
8857 RTVEC_ELT (v, i) = GEN_INT (base + i);
8859 t1 = gen_rtx_PARALLEL (mode, v);
8860 return t1;
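/* For instance (V4SImode, so four lanes): asking for the high half
   yields the PARALLEL [2 3] on little-endian but [0 1] on big-endian,
   matching the "High Mask" row of the diagram above.  */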
8863 /* Check OP for validity as a PARALLEL RTX vector with elements
8864 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8865 from the perspective of the architecture. See the diagram above
8866 aarch64_simd_vect_par_cnst_half for more details. */
8868 bool
8869 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8870 bool high)
8872 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8873 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8874 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8875 int i = 0;
8877 if (!VECTOR_MODE_P (mode))
8878 return false;
8880 if (count_op != count_ideal)
8881 return false;
8883 for (i = 0; i < count_ideal; i++)
8885 rtx elt_op = XVECEXP (op, 0, i);
8886 rtx elt_ideal = XVECEXP (ideal, 0, i);
8888 if (!CONST_INT_P (elt_op)
8889 || INTVAL (elt_ideal) != INTVAL (elt_op))
8890 return false;
8892 return true;
8895 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8896 HIGH (exclusive). */
8897 void
8898 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8899 const_tree exp)
8901 HOST_WIDE_INT lane;
8902 gcc_assert (CONST_INT_P (operand));
8903 lane = INTVAL (operand);
8905 if (lane < low || lane >= high)
8907 if (exp)
8908 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
8909 else
8910 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
8914 /* Return TRUE if OP is a valid vector addressing mode. */
8915 bool
8916 aarch64_simd_mem_operand_p (rtx op)
8918 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8919 || REG_P (XEXP (op, 0)));
8922 /* Emit a register copy from operand to operand, taking care not to
8923 early-clobber source registers in the process.
8925 COUNT is the number of components into which the copy needs to be
8926 decomposed. */
8927 void
8928 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8929 unsigned int count)
8931 unsigned int i;
8932 int rdest = REGNO (operands[0]);
8933 int rsrc = REGNO (operands[1]);
8935 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8936 || rdest < rsrc)
8937 for (i = 0; i < count; i++)
8938 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8939 gen_rtx_REG (mode, rsrc + i));
8940 else
8941 for (i = 0; i < count; i++)
8942 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8943 gen_rtx_REG (mode, rsrc + count - i - 1));
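/* Example of the ordering rule above (register numbers invented):
   copying a two-component value from {v1,v2} to {v2,v3} overlaps with
   rdest > rsrc, so the copy is emitted highest-numbered component first
   (v3 <- v2, then v2 <- v1); copying from {v2,v3} to {v1,v2} can safely
   proceed lowest component first.  */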
8946 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8947 one of VSTRUCT modes: OI, CI or XI. */
8949 aarch64_simd_attr_length_move (rtx_insn *insn)
8951 machine_mode mode;
8953 extract_insn_cached (insn);
8955 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8957 mode = GET_MODE (recog_data.operand[0]);
8958 switch (mode)
8960 case OImode:
8961 return 8;
8962 case CImode:
8963 return 12;
8964 case XImode:
8965 return 16;
8966 default:
8967 gcc_unreachable ();
8970 return 4;
8973 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8974 one of VSTRUCT modes: OI, CI, EI, or XI. */
8976 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8978 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8981 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8982 alignment of a vector to 128 bits. */
8983 static HOST_WIDE_INT
8984 aarch64_simd_vector_alignment (const_tree type)
8986 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8987 return MIN (align, 128);
8990 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8991 static bool
8992 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8994 if (is_packed)
8995 return false;
8997 /* We guarantee alignment for vectors up to 128-bits. */
8998 if (tree_int_cst_compare (TYPE_SIZE (type),
8999 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
9000 return false;
9002 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9003 return true;
9006 /* If VALS is a vector constant that can be loaded into a register
9007 using DUP, generate instructions to do so and return an RTX to
9008 assign to the register. Otherwise return NULL_RTX. */
9009 static rtx
9010 aarch64_simd_dup_constant (rtx vals)
9012 machine_mode mode = GET_MODE (vals);
9013 machine_mode inner_mode = GET_MODE_INNER (mode);
9014 int n_elts = GET_MODE_NUNITS (mode);
9015 bool all_same = true;
9016 rtx x;
9017 int i;
9019 if (GET_CODE (vals) != CONST_VECTOR)
9020 return NULL_RTX;
9022 for (i = 1; i < n_elts; ++i)
9024 x = CONST_VECTOR_ELT (vals, i);
9025 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9026 all_same = false;
9029 if (!all_same)
9030 return NULL_RTX;
9032 /* We can load this constant by using DUP and a constant in a
9033 single general-purpose register. This will be cheaper than a vector
9034 load. */
9035 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9036 return gen_rtx_VEC_DUPLICATE (mode, x);
9040 /* Generate code to load VALS, which is a PARALLEL containing only
9041 constants (for vec_init) or CONST_VECTOR, efficiently into a
9042 register. Returns an RTX to copy into the register, or NULL_RTX
9043 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9044 static rtx
9045 aarch64_simd_make_constant (rtx vals)
9047 machine_mode mode = GET_MODE (vals);
9048 rtx const_dup;
9049 rtx const_vec = NULL_RTX;
9050 int n_elts = GET_MODE_NUNITS (mode);
9051 int n_const = 0;
9052 int i;
9054 if (GET_CODE (vals) == CONST_VECTOR)
9055 const_vec = vals;
9056 else if (GET_CODE (vals) == PARALLEL)
9058 /* A CONST_VECTOR must contain only CONST_INTs and
9059 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9060 Only store valid constants in a CONST_VECTOR. */
9061 for (i = 0; i < n_elts; ++i)
9063 rtx x = XVECEXP (vals, 0, i);
9064 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9065 n_const++;
9067 if (n_const == n_elts)
9068 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9070 else
9071 gcc_unreachable ();
9073 if (const_vec != NULL_RTX
9074 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9075 /* Load using MOVI/MVNI. */
9076 return const_vec;
9077 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9078 /* Loaded using DUP. */
9079 return const_dup;
9080 else if (const_vec != NULL_RTX)
9081 /* Load from constant pool. We can not take advantage of single-cycle
9082 LD1 because we need a PC-relative addressing mode. */
9083 return const_vec;
9084 else
9085 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9086 We can not construct an initializer. */
9087 return NULL_RTX;
9090 void
9091 aarch64_expand_vector_init (rtx target, rtx vals)
9093 machine_mode mode = GET_MODE (target);
9094 machine_mode inner_mode = GET_MODE_INNER (mode);
9095 int n_elts = GET_MODE_NUNITS (mode);
9096 int n_var = 0;
9097 rtx any_const = NULL_RTX;
9098 bool all_same = true;
9100 for (int i = 0; i < n_elts; ++i)
9102 rtx x = XVECEXP (vals, 0, i);
9103 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9104 ++n_var;
9105 else
9106 any_const = x;
9108 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9109 all_same = false;
9112 if (n_var == 0)
9114 rtx constant = aarch64_simd_make_constant (vals);
9115 if (constant != NULL_RTX)
9117 emit_move_insn (target, constant);
9118 return;
9122 /* Splat a single non-constant element if we can. */
9123 if (all_same)
9125 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9126 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9127 return;
9130 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9131 varying fields. Hope that this is more efficient than using the stack. */
9132 if (n_var <= n_elts/2)
9134 rtx copy = copy_rtx (vals);
9136 /* Load constant part of vector. We really don't care what goes into the
9137 parts we will overwrite, but we're more likely to be able to load the
9138 constant efficiently if it has fewer, larger, repeating parts
9139 (see aarch64_simd_valid_immediate). */
9140 for (int i = 0; i < n_elts; i++)
9142 rtx x = XVECEXP (vals, 0, i);
9143 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9144 continue;
9145 rtx subst = any_const;
9146 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9148 /* Look in the copied vector, as more elements are const. */
9149 rtx test = XVECEXP (copy, 0, i ^ bit);
9150 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9152 subst = test;
9153 break;
9156 XVECEXP (copy, 0, i) = subst;
9158 aarch64_expand_vector_init (target, copy);
9160 /* Insert variables. */
9161 enum insn_code icode = optab_handler (vec_set_optab, mode);
9162 gcc_assert (icode != CODE_FOR_nothing);
9164 for (int i = 0; i < n_elts; i++)
9166 rtx x = XVECEXP (vals, 0, i);
9167 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9168 continue;
9169 x = copy_to_mode_reg (inner_mode, x);
9170 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9172 return;
9175 /* Construct the vector in memory one field at a time
9176 and load the whole vector. */
9177 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9178 for (int i = 0; i < n_elts; i++)
9179 emit_move_insn (adjust_address_nv (mem, inner_mode,
9180 i * GET_MODE_SIZE (inner_mode)),
9181 XVECEXP (vals, 0, i));
9182 emit_move_insn (target, mem);
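/* A small walk-through of the strategy above (lane values invented for
   illustration): initialising a V4SImode vector with { x, 1, 2, 3 },
   where only x is non-constant, first loads the constant vector
   { 2, 1, 2, 3 } (the variable lane borrows a nearby constant so the
   immediate keeps repeating parts), then overwrites lane 0 with x via
   the vec_set pattern.  Fully variable vectors fall back to building
   the value in a stack temporary and loading it whole.  */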
9186 static unsigned HOST_WIDE_INT
9187 aarch64_shift_truncation_mask (machine_mode mode)
9189 return
9190 (aarch64_vector_mode_supported_p (mode)
9191 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9194 #ifndef TLS_SECTION_ASM_FLAG
9195 #define TLS_SECTION_ASM_FLAG 'T'
9196 #endif
9198 void
9199 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9200 tree decl ATTRIBUTE_UNUSED)
9202 char flagchars[10], *f = flagchars;
9204 /* If we have already declared this section, we can use an
9205 abbreviated form to switch back to it -- unless this section is
9206 part of a COMDAT group, in which case GAS requires the full
9207 declaration every time. */
9208 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9209 && (flags & SECTION_DECLARED))
9211 fprintf (asm_out_file, "\t.section\t%s\n", name);
9212 return;
9215 if (!(flags & SECTION_DEBUG))
9216 *f++ = 'a';
9217 if (flags & SECTION_WRITE)
9218 *f++ = 'w';
9219 if (flags & SECTION_CODE)
9220 *f++ = 'x';
9221 if (flags & SECTION_SMALL)
9222 *f++ = 's';
9223 if (flags & SECTION_MERGE)
9224 *f++ = 'M';
9225 if (flags & SECTION_STRINGS)
9226 *f++ = 'S';
9227 if (flags & SECTION_TLS)
9228 *f++ = TLS_SECTION_ASM_FLAG;
9229 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9230 *f++ = 'G';
9231 *f = '\0';
9233 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9235 if (!(flags & SECTION_NOTYPE))
9237 const char *type;
9238 const char *format;
9240 if (flags & SECTION_BSS)
9241 type = "nobits";
9242 else
9243 type = "progbits";
9245 #ifdef TYPE_OPERAND_FMT
9246 format = "," TYPE_OPERAND_FMT;
9247 #else
9248 format = ",@%s";
9249 #endif
9251 fprintf (asm_out_file, format, type);
9253 if (flags & SECTION_ENTSIZE)
9254 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9255 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9257 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9258 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9259 else
9260 fprintf (asm_out_file, ",%s,comdat",
9261 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9265 putc ('\n', asm_out_file);
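/* For a COMDAT text section this ends up emitting something like
   (section and group names invented for the example):

       .section .text._Z3foov,"axG",@progbits,_Z3foov,comdat

   i.e. flags 'a', 'x' and 'G' from the loop above, the default type
   operand, and the comdat group name appended.  */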
9268 /* Select a format to encode pointers in exception handling data. */
9270 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9272 int type;
9273 switch (aarch64_cmodel)
9275 case AARCH64_CMODEL_TINY:
9276 case AARCH64_CMODEL_TINY_PIC:
9277 case AARCH64_CMODEL_SMALL:
9278 case AARCH64_CMODEL_SMALL_PIC:
9279 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9280 for everything. */
9281 type = DW_EH_PE_sdata4;
9282 break;
9283 default:
9284 /* No assumptions here. 8-byte relocs required. */
9285 type = DW_EH_PE_sdata8;
9286 break;
9288 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9291 /* Emit load exclusive. */
9293 static void
9294 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9295 rtx mem, rtx model_rtx)
9297 rtx (*gen) (rtx, rtx, rtx);
9299 switch (mode)
9301 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9302 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9303 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9304 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9305 default:
9306 gcc_unreachable ();
9309 emit_insn (gen (rval, mem, model_rtx));
9312 /* Emit store exclusive. */
9314 static void
9315 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9316 rtx rval, rtx mem, rtx model_rtx)
9318 rtx (*gen) (rtx, rtx, rtx, rtx);
9320 switch (mode)
9322 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9323 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9324 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9325 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9326 default:
9327 gcc_unreachable ();
9330 emit_insn (gen (bval, rval, mem, model_rtx));
9333 /* Emit a jump instruction and mark it as unlikely to be taken. */
9335 static void
9336 aarch64_emit_unlikely_jump (rtx insn)
9338 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9340 insn = emit_jump_insn (insn);
9341 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9344 /* Expand a compare and swap pattern. */
9346 void
9347 aarch64_expand_compare_and_swap (rtx operands[])
9349 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9350 machine_mode mode, cmp_mode;
9351 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9353 bval = operands[0];
9354 rval = operands[1];
9355 mem = operands[2];
9356 oldval = operands[3];
9357 newval = operands[4];
9358 is_weak = operands[5];
9359 mod_s = operands[6];
9360 mod_f = operands[7];
9361 mode = GET_MODE (mem);
9362 cmp_mode = mode;
9364 /* Normally the succ memory model must be stronger than fail, but in the
9365 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9366 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9368 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9369 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9370 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9372 switch (mode)
9374 case QImode:
9375 case HImode:
9376 /* For short modes, we're going to perform the comparison in SImode,
9377 so do the zero-extension now. */
9378 cmp_mode = SImode;
9379 rval = gen_reg_rtx (SImode);
9380 oldval = convert_modes (SImode, mode, oldval, true);
9381 /* Fall through. */
9383 case SImode:
9384 case DImode:
9385 /* Force the value into a register if needed. */
9386 if (!aarch64_plus_operand (oldval, mode))
9387 oldval = force_reg (cmp_mode, oldval);
9388 break;
9390 default:
9391 gcc_unreachable ();
9394 switch (mode)
9396 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9397 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9398 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9399 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9400 default:
9401 gcc_unreachable ();
9404 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9406 if (mode == QImode || mode == HImode)
9407 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9409 x = gen_rtx_REG (CCmode, CC_REGNUM);
9410 x = gen_rtx_EQ (SImode, x, const0_rtx);
9411 emit_insn (gen_rtx_SET (bval, x));
9414 /* Emit a barrier appropriate for memory model MODEL at the end of a
9415 sequence implementing an atomic operation. */
9417 static void
9418 aarch64_emit_post_barrier (enum memmodel model)
9420 const enum memmodel base_model = memmodel_base (model);
9422 if (is_mm_sync (model)
9423 && (base_model == MEMMODEL_ACQUIRE
9424 || base_model == MEMMODEL_ACQ_REL
9425 || base_model == MEMMODEL_SEQ_CST))
9427 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9431 /* Split a compare and swap pattern. */
9433 void
9434 aarch64_split_compare_and_swap (rtx operands[])
9436 rtx rval, mem, oldval, newval, scratch;
9437 machine_mode mode;
9438 bool is_weak;
9439 rtx_code_label *label1, *label2;
9440 rtx x, cond;
9441 enum memmodel model;
9442 rtx model_rtx;
9444 rval = operands[0];
9445 mem = operands[1];
9446 oldval = operands[2];
9447 newval = operands[3];
9448 is_weak = (operands[4] != const0_rtx);
9449 model_rtx = operands[5];
9450 scratch = operands[7];
9451 mode = GET_MODE (mem);
9452 model = memmodel_from_int (INTVAL (model_rtx));
9454 label1 = NULL;
9455 if (!is_weak)
9457 label1 = gen_label_rtx ();
9458 emit_label (label1);
9460 label2 = gen_label_rtx ();
9462 /* The initial load can be relaxed for a __sync operation since a final
9463 barrier will be emitted to stop code hoisting. */
9464 if (is_mm_sync (model))
9465 aarch64_emit_load_exclusive (mode, rval, mem,
9466 GEN_INT (MEMMODEL_RELAXED));
9467 else
9468 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9470 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9471 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9472 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9473 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9474 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9476 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9478 if (!is_weak)
9480 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9481 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9482 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9483 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9485 else
9487 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9488 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9489 emit_insn (gen_rtx_SET (cond, x));
9492 emit_label (label2);
9494 /* Emit any final barrier needed for a __sync operation. */
9495 if (is_mm_sync (model))
9496 aarch64_emit_post_barrier (model);
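/* Shape of the sequence produced above for a strong compare-and-swap
   (illustrative only; register names are invented and the exact
   acquire/release load/store-exclusive variants depend on the memory
   model):

   retry:
       ldaxr  w0, [x_mem]
       cmp    w0, w_expected
       b.ne   done
       stlxr  w_tmp, w_desired, [x_mem]
       cbnz   w_tmp, retry
   done:

   For a weak CAS the backward branch is dropped and the store-exclusive
   status register is instead just compared against zero.  */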
9499 /* Split an atomic operation. */
9501 void
9502 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9503 rtx value, rtx model_rtx, rtx cond)
9505 machine_mode mode = GET_MODE (mem);
9506 machine_mode wmode = (mode == DImode ? DImode : SImode);
9507 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9508 const bool is_sync = is_mm_sync (model);
9509 rtx_code_label *label;
9510 rtx x;
9512 label = gen_label_rtx ();
9513 emit_label (label);
9515 if (new_out)
9516 new_out = gen_lowpart (wmode, new_out);
9517 if (old_out)
9518 old_out = gen_lowpart (wmode, old_out);
9519 else
9520 old_out = new_out;
9521 value = simplify_gen_subreg (wmode, value, mode, 0);
9523 /* The initial load can be relaxed for a __sync operation since a final
9524 barrier will be emitted to stop code hoisting. */
9525 if (is_sync)
9526 aarch64_emit_load_exclusive (mode, old_out, mem,
9527 GEN_INT (MEMMODEL_RELAXED));
9528 else
9529 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9531 switch (code)
9533 case SET:
9534 new_out = value;
9535 break;
9537 case NOT:
9538 x = gen_rtx_AND (wmode, old_out, value);
9539 emit_insn (gen_rtx_SET (new_out, x));
9540 x = gen_rtx_NOT (wmode, new_out);
9541 emit_insn (gen_rtx_SET (new_out, x));
9542 break;
9544 case MINUS:
9545 if (CONST_INT_P (value))
9547 value = GEN_INT (-INTVAL (value));
9548 code = PLUS;
9550 /* Fall through. */
9552 default:
9553 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9554 emit_insn (gen_rtx_SET (new_out, x));
9555 break;
9558 aarch64_emit_store_exclusive (mode, cond, mem,
9559 gen_lowpart (mode, new_out), model_rtx);
9561 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9562 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9563 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9564 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9566 /* Emit any final barrier needed for a __sync operation. */
9567 if (is_sync)
9568 aarch64_emit_post_barrier (model);
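/* The loop generated above, e.g. for an atomic fetch-and-add (register
   names invented; acquire/release variants again depend on the model):

   retry:
       ldxr   w_old, [x_mem]
       add    w_new, w_old, w_val
       stxr   w_cond, w_new, [x_mem]
       cbnz   w_cond, retry
*/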
9571 static void
9572 aarch64_print_extension (void)
9574 const struct aarch64_option_extension *opt = NULL;
9576 for (opt = all_extensions; opt->name != NULL; opt++)
9577 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9578 asm_fprintf (asm_out_file, "+%s", opt->name);
9580 asm_fprintf (asm_out_file, "\n");
9583 static void
9584 aarch64_start_file (void)
9586 if (selected_arch)
9588 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9589 aarch64_print_extension ();
9591 else if (selected_cpu)
9593 const char *truncated_name
9594 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9595 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9596 aarch64_print_extension ();
9598 default_file_start();
9601 /* Target hook for c_mode_for_suffix. */
9602 static machine_mode
9603 aarch64_c_mode_for_suffix (char suffix)
9605 if (suffix == 'q')
9606 return TFmode;
9608 return VOIDmode;
9611 /* We can only represent floating point constants which will fit in
9612 "quarter-precision" values. These values are characterised by
9613 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9616 (-1)^s * (n/16) * 2^r
9618 Where:
9619 's' is the sign bit.
9620 'n' is an integer in the range 16 <= n <= 31.
9621 'r' is an integer in the range -3 <= r <= 4. */
9623 /* Return true iff X can be represented by a quarter-precision
9624 floating point immediate operand. Note, we cannot represent 0.0. */
9625 bool
9626 aarch64_float_const_representable_p (rtx x)
9628 /* This represents our current view of how many bits
9629 make up the mantissa. */
9630 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9631 int exponent;
9632 unsigned HOST_WIDE_INT mantissa, mask;
9633 REAL_VALUE_TYPE r, m;
9634 bool fail;
9636 if (!CONST_DOUBLE_P (x))
9637 return false;
9639 if (GET_MODE (x) == VOIDmode)
9640 return false;
9642 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9644 /* We cannot represent infinities, NaNs or +/-zero. We won't
9645 know if we have +zero until we analyse the mantissa, but we
9646 can reject the other invalid values. */
9647 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9648 || REAL_VALUE_MINUS_ZERO (r))
9649 return false;
9651 /* Extract exponent. */
9652 r = real_value_abs (&r);
9653 exponent = REAL_EXP (&r);
9655 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9656 highest (sign) bit, with a fixed binary point at bit point_pos.
9657 m1 holds the low part of the mantissa, m2 the high part.
9658 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9659 bits for the mantissa, this can fail (low bits will be lost). */
9660 real_ldexp (&m, &r, point_pos - exponent);
9661 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9663 /* If the low part of the mantissa has bits set we cannot represent
9664 the value. */
9665 if (w.elt (0) != 0)
9666 return false;
9667 /* We have rejected the lower HOST_WIDE_INT, so update our
9668 understanding of how many bits lie in the mantissa and
9669 look only at the high HOST_WIDE_INT. */
9670 mantissa = w.elt (1);
9671 point_pos -= HOST_BITS_PER_WIDE_INT;
9673 /* We can only represent values with a mantissa of the form 1.xxxx. */
9674 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9675 if ((mantissa & mask) != 0)
9676 return false;
9678 /* Having filtered unrepresentable values, we may now remove all
9679 but the highest 5 bits. */
9680 mantissa >>= point_pos - 5;
9682 /* We cannot represent the value 0.0, so reject it. This is handled
9683 elsewhere. */
9684 if (mantissa == 0)
9685 return false;
9687 /* Then, as bit 4 is always set, we can mask it off, leaving
9688 the mantissa in the range [0, 15]. */
9689 mantissa &= ~(1 << 4);
9690 gcc_assert (mantissa <= 15);
9692 /* GCC internally does not use IEEE754-like encoding (where normalized
9693 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9694 Our mantissa values are shifted 4 places to the left relative to
9695 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9696 by 5 places to correct for GCC's representation. */
9697 exponent = 5 - exponent;
9699 return (exponent >= 0 && exponent <= 7);
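/* A few data points for the (-1)^s * (n/16) * 2^r form checked above:
   1.0 (n = 16, r = 0), 0.125 (n = 16, r = -3, the smallest positive
   value) and 31.0 (n = 31, r = 4, the largest) are representable,
   whereas 0.2 and any zero, NaN or infinity are not.  */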
9702 char*
9703 aarch64_output_simd_mov_immediate (rtx const_vector,
9704 machine_mode mode,
9705 unsigned width)
9707 bool is_valid;
9708 static char templ[40];
9709 const char *mnemonic;
9710 const char *shift_op;
9711 unsigned int lane_count = 0;
9712 char element_char;
9714 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9716 /* This will return true to show const_vector is legal for use as either
9717 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9718 also update INFO to show how the immediate should be generated. */
9719 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9720 gcc_assert (is_valid);
9722 element_char = sizetochar (info.element_width);
9723 lane_count = width / info.element_width;
9725 mode = GET_MODE_INNER (mode);
9726 if (mode == SFmode || mode == DFmode)
9728 gcc_assert (info.shift == 0 && ! info.mvn);
9729 if (aarch64_float_const_zero_rtx_p (info.value))
9730 info.value = GEN_INT (0);
9731 else
9733 #define buf_size 20
9734 REAL_VALUE_TYPE r;
9735 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9736 char float_buf[buf_size] = {'\0'};
9737 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9738 #undef buf_size
9740 if (lane_count == 1)
9741 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9742 else
9743 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9744 lane_count, element_char, float_buf);
9745 return templ;
9749 mnemonic = info.mvn ? "mvni" : "movi";
9750 shift_op = info.msl ? "msl" : "lsl";
9752 if (lane_count == 1)
9753 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9754 mnemonic, UINTVAL (info.value));
9755 else if (info.shift)
9756 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9757 ", %s %d", mnemonic, lane_count, element_char,
9758 UINTVAL (info.value), shift_op, info.shift);
9759 else
9760 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9761 mnemonic, lane_count, element_char, UINTVAL (info.value));
9762 return templ;
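/* Sample strings this returns (operand numbers as in the patterns that
   use it):  "movi\t%0.4s, 0xab, lsl 16" for a V4SI vector of 0x00AB0000,
   "mvni\t%0.4s, 0xff" for a V4SI vector of 0xFFFFFF00, and on the
   floating-point path something like "fmov\t%0.2d, 1.0e+0" for a V2DF
   vector of 1.0.  */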
9765 char*
9766 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9767 machine_mode mode)
9769 machine_mode vmode;
9771 gcc_assert (!VECTOR_MODE_P (mode));
9772 vmode = aarch64_simd_container_mode (mode, 64);
9773 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9774 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9777 /* Split operands into moves from op[1] + op[2] into op[0]. */
9779 void
9780 aarch64_split_combinev16qi (rtx operands[3])
9782 unsigned int dest = REGNO (operands[0]);
9783 unsigned int src1 = REGNO (operands[1]);
9784 unsigned int src2 = REGNO (operands[2]);
9785 machine_mode halfmode = GET_MODE (operands[1]);
9786 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9787 rtx destlo, desthi;
9789 gcc_assert (halfmode == V16QImode);
9791 if (src1 == dest && src2 == dest + halfregs)
9793 /* No-op move. Can't split to nothing; emit something. */
9794 emit_note (NOTE_INSN_DELETED);
9795 return;
9798 /* Preserve register attributes for variable tracking. */
9799 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9800 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9801 GET_MODE_SIZE (halfmode));
9803 /* Special case of reversed high/low parts. */
9804 if (reg_overlap_mentioned_p (operands[2], destlo)
9805 && reg_overlap_mentioned_p (operands[1], desthi))
9807 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9808 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9809 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9811 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9813 /* Try to avoid unnecessary moves if part of the result
9814 is in the right place already. */
9815 if (src1 != dest)
9816 emit_move_insn (destlo, operands[1]);
9817 if (src2 != dest + halfregs)
9818 emit_move_insn (desthi, operands[2]);
9820 else
9822 if (src2 != dest + halfregs)
9823 emit_move_insn (desthi, operands[2]);
9824 if (src1 != dest)
9825 emit_move_insn (destlo, operands[1]);
9829 /* vec_perm support. */
9831 #define MAX_VECT_LEN 16
9833 struct expand_vec_perm_d
9835 rtx target, op0, op1;
9836 unsigned char perm[MAX_VECT_LEN];
9837 machine_mode vmode;
9838 unsigned char nelt;
9839 bool one_vector_p;
9840 bool testing_p;
9843 /* Generate a variable permutation. */
9845 static void
9846 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9848 machine_mode vmode = GET_MODE (target);
9849 bool one_vector_p = rtx_equal_p (op0, op1);
9851 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9852 gcc_checking_assert (GET_MODE (op0) == vmode);
9853 gcc_checking_assert (GET_MODE (op1) == vmode);
9854 gcc_checking_assert (GET_MODE (sel) == vmode);
9855 gcc_checking_assert (TARGET_SIMD);
9857 if (one_vector_p)
9859 if (vmode == V8QImode)
9861 /* Expand the argument to a V16QI mode by duplicating it. */
9862 rtx pair = gen_reg_rtx (V16QImode);
9863 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9864 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9866 else
9868 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9871 else
9873 rtx pair;
9875 if (vmode == V8QImode)
9877 pair = gen_reg_rtx (V16QImode);
9878 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9879 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9881 else
9883 pair = gen_reg_rtx (OImode);
9884 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9885 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9890 void
9891 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9893 machine_mode vmode = GET_MODE (target);
9894 unsigned int nelt = GET_MODE_NUNITS (vmode);
9895 bool one_vector_p = rtx_equal_p (op0, op1);
9896 rtx mask;
9898 /* The TBL instruction does not use a modulo index, so we must take care
9899 of that ourselves. */
9900 mask = aarch64_simd_gen_const_vector_dup (vmode,
9901 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9902 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9904 /* For big-endian, we also need to reverse the index within the vector
9905 (but not which vector). */
9906 if (BYTES_BIG_ENDIAN)
9908 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9909 if (!one_vector_p)
9910 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9911 sel = expand_simple_binop (vmode, XOR, sel, mask,
9912 NULL, 0, OPTAB_LIB_WIDEN);
9914 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9917 /* Recognize patterns suitable for the TRN instructions. */
9918 static bool
9919 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9921 unsigned int i, odd, mask, nelt = d->nelt;
9922 rtx out, in0, in1, x;
9923 rtx (*gen) (rtx, rtx, rtx);
9924 machine_mode vmode = d->vmode;
9926 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9927 return false;
9929 /* Note that these are little-endian tests.
9930 We correct for big-endian later. */
9931 if (d->perm[0] == 0)
9932 odd = 0;
9933 else if (d->perm[0] == 1)
9934 odd = 1;
9935 else
9936 return false;
9937 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9939 for (i = 0; i < nelt; i += 2)
9941 if (d->perm[i] != i + odd)
9942 return false;
9943 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9944 return false;
9947 /* Success! */
9948 if (d->testing_p)
9949 return true;
9951 in0 = d->op0;
9952 in1 = d->op1;
9953 if (BYTES_BIG_ENDIAN)
9955 x = in0, in0 = in1, in1 = x;
9956 odd = !odd;
9958 out = d->target;
9960 if (odd)
9962 switch (vmode)
9964 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9965 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9966 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9967 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9968 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9969 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9970 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9971 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9972 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9973 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9974 default:
9975 return false;
9978 else
9980 switch (vmode)
9982 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9983 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9984 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9985 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9986 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9987 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9988 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9989 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9990 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9991 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9992 default:
9993 return false;
9997 emit_insn (gen (out, in0, in1));
9998 return true;
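/* Example of a permutation this accepts (indices illustrative): for
   V4SImode the selector { 0, 4, 2, 6 } (odd == 0) becomes
   "trn1 v_out.4s, v_a.4s, v_b.4s", while { 1, 5, 3, 7 } selects TRN2.
   The UZP and ZIP recognizers below follow the same structure with
   different index tests.  */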
10001 /* Recognize patterns suitable for the UZP instructions. */
10002 static bool
10003 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10005 unsigned int i, odd, mask, nelt = d->nelt;
10006 rtx out, in0, in1, x;
10007 rtx (*gen) (rtx, rtx, rtx);
10008 machine_mode vmode = d->vmode;
10010 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10011 return false;
10013 /* Note that these are little-endian tests.
10014 We correct for big-endian later. */
10015 if (d->perm[0] == 0)
10016 odd = 0;
10017 else if (d->perm[0] == 1)
10018 odd = 1;
10019 else
10020 return false;
10021 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10023 for (i = 0; i < nelt; i++)
10025 unsigned elt = (i * 2 + odd) & mask;
10026 if (d->perm[i] != elt)
10027 return false;
10030 /* Success! */
10031 if (d->testing_p)
10032 return true;
10034 in0 = d->op0;
10035 in1 = d->op1;
10036 if (BYTES_BIG_ENDIAN)
10038 x = in0, in0 = in1, in1 = x;
10039 odd = !odd;
10041 out = d->target;
10043 if (odd)
10045 switch (vmode)
10047 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10048 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10049 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10050 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10051 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10052 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10053 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10054 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10055 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10056 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10057 default:
10058 return false;
10061 else
10063 switch (vmode)
10065 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10066 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10067 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10068 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10069 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10070 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10071 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10072 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10073 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10074 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10075 default:
10076 return false;
10080 emit_insn (gen (out, in0, in1));
10081 return true;
10084 /* Recognize patterns suitable for the ZIP instructions. */
10085 static bool
10086 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10088 unsigned int i, high, mask, nelt = d->nelt;
10089 rtx out, in0, in1, x;
10090 rtx (*gen) (rtx, rtx, rtx);
10091 machine_mode vmode = d->vmode;
10093 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10094 return false;
10096 /* Note that these are little-endian tests.
10097 We correct for big-endian later. */
10098 high = nelt / 2;
10099 if (d->perm[0] == high)
10100 /* Do Nothing. */
10102 else if (d->perm[0] == 0)
10103 high = 0;
10104 else
10105 return false;
10106 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10108 for (i = 0; i < nelt / 2; i++)
10110 unsigned elt = (i + high) & mask;
10111 if (d->perm[i * 2] != elt)
10112 return false;
10113 elt = (elt + nelt) & mask;
10114 if (d->perm[i * 2 + 1] != elt)
10115 return false;
10118 /* Success! */
10119 if (d->testing_p)
10120 return true;
10122 in0 = d->op0;
10123 in1 = d->op1;
10124 if (BYTES_BIG_ENDIAN)
10126 x = in0, in0 = in1, in1 = x;
10127 high = !high;
10129 out = d->target;
10131 if (high)
10133 switch (vmode)
10135 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10136 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10137 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10138 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10139 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10140 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10141 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10142 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10143 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10144 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10145 default:
10146 return false;
10149 else
10151 switch (vmode)
10153 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10154 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10155 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10156 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10157 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10158 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10159 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10160 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10161 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10162 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10163 default:
10164 return false;
10168 emit_insn (gen (out, in0, in1));
10169 return true;
10172 /* Recognize patterns for the EXT insn. */
10174 static bool
10175 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10177 unsigned int i, nelt = d->nelt;
10178 rtx (*gen) (rtx, rtx, rtx, rtx);
10179 rtx offset;
10181 unsigned int location = d->perm[0]; /* Always < nelt. */
10183 /* Check if the extracted indices are increasing by one. */
10184 for (i = 1; i < nelt; i++)
10186 unsigned int required = location + i;
10187 if (d->one_vector_p)
10189 /* We'll pass the same vector in twice, so allow indices to wrap. */
10190 required &= (nelt - 1);
10192 if (d->perm[i] != required)
10193 return false;
10196 switch (d->vmode)
10198 case V16QImode: gen = gen_aarch64_extv16qi; break;
10199 case V8QImode: gen = gen_aarch64_extv8qi; break;
10200 case V4HImode: gen = gen_aarch64_extv4hi; break;
10201 case V8HImode: gen = gen_aarch64_extv8hi; break;
10202 case V2SImode: gen = gen_aarch64_extv2si; break;
10203 case V4SImode: gen = gen_aarch64_extv4si; break;
10204 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10205 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10206 case V2DImode: gen = gen_aarch64_extv2di; break;
10207 case V2DFmode: gen = gen_aarch64_extv2df; break;
10208 default:
10209 return false;
10212 /* Success! */
10213 if (d->testing_p)
10214 return true;
10216 /* The case where (location == 0) is a no-op for both big- and little-endian,
10217 and is removed by the mid-end at optimization levels -O1 and higher. */
10219 if (BYTES_BIG_ENDIAN && (location != 0))
10221 /* After setup, we want the high elements of the first vector (stored
10222 at the LSB end of the register), and the low elements of the second
10223 vector (stored at the MSB end of the register). So swap. */
10224 std::swap (d->op0, d->op1);
10225 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10226 location = nelt - location;
10229 offset = GEN_INT (location);
10230 emit_insn (gen (d->target, d->op0, d->op1, offset));
10231 return true;
10234 /* Recognize patterns for the REV insns. */
10236 static bool
10237 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10239 unsigned int i, j, diff, nelt = d->nelt;
10240 rtx (*gen) (rtx, rtx);
10242 if (!d->one_vector_p)
10243 return false;
10245 diff = d->perm[0];
10246 switch (diff)
10248 case 7:
10249 switch (d->vmode)
10251 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10252 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10253 default:
10254 return false;
10256 break;
10257 case 3:
10258 switch (d->vmode)
10260 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10261 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10262 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10263 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10264 default:
10265 return false;
10267 break;
10268 case 1:
10269 switch (d->vmode)
10271 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10272 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10273 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10274 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10275 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10276 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10277 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10278 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10279 default:
10280 return false;
10282 break;
10283 default:
10284 return false;
10287 for (i = 0; i < nelt ; i += diff + 1)
10288 for (j = 0; j <= diff; j += 1)
10290 /* This is guaranteed to be true as the value of diff
10291 is 7, 3 or 1 and we should have enough elements in the
10292 queue to generate this. Getting a vector mask with a
10293 value of diff other than these values implies that
10294 something is wrong by the time we get here. */
10295 gcc_assert (i + j < nelt);
10296 if (d->perm[i + j] != i + diff - j)
10297 return false;
10300 /* Success! */
10301 if (d->testing_p)
10302 return true;
10304 emit_insn (gen (d->target, d->op0));
10305 return true;
10308 static bool
10309 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10311 rtx (*gen) (rtx, rtx, rtx);
10312 rtx out = d->target;
10313 rtx in0;
10314 machine_mode vmode = d->vmode;
10315 unsigned int i, elt, nelt = d->nelt;
10316 rtx lane;
10318 elt = d->perm[0];
10319 for (i = 1; i < nelt; i++)
10321 if (elt != d->perm[i])
10322 return false;
10325 /* The generic preparation in aarch64_expand_vec_perm_const_1
10326 swaps the operand order and the permute indices if it finds
10327 d->perm[0] to be in the second operand. Thus, we can always
10328 use d->op0 and need not do any extra arithmetic to get the
10329 correct lane number. */
10330 in0 = d->op0;
10331 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10333 switch (vmode)
10335 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10336 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10337 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10338 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10339 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10340 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10341 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10342 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10343 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10344 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10345 default:
10346 return false;
10349 emit_insn (gen (out, in0, lane));
10350 return true;
10353 static bool
10354 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10356 rtx rperm[MAX_VECT_LEN], sel;
10357 machine_mode vmode = d->vmode;
10358 unsigned int i, nelt = d->nelt;
10360 if (d->testing_p)
10361 return true;
10363 /* Generic code will try constant permutation twice. Once with the
10364 original mode and again with the elements lowered to QImode.
10365 So wait and don't do the selector expansion ourselves. */
10366 if (vmode != V8QImode && vmode != V16QImode)
10367 return false;
10369 for (i = 0; i < nelt; ++i)
10371 int nunits = GET_MODE_NUNITS (vmode);
10373 /* If big-endian and two vectors, we end up with a weird mixed-endian
10374 mode on NEON. Reverse the index within each word but not the word
10375 itself. */
10376 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10377 : d->perm[i]);
10379 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10380 sel = force_reg (vmode, sel);
10382 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10383 return true;
10386 static bool
10387 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10389 /* The pattern matching functions above are written to look for a small
10390 number to begin the sequence (0, 1, N/2). If we begin with an index
10391 from the second operand, we can swap the operands. */
10392 if (d->perm[0] >= d->nelt)
10394 unsigned i, nelt = d->nelt;
10396 gcc_assert (nelt == (nelt & -nelt));
10397 for (i = 0; i < nelt; ++i)
10398 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10400 std::swap (d->op0, d->op1);
10403 if (TARGET_SIMD)
10405 if (aarch64_evpc_rev (d))
10406 return true;
10407 else if (aarch64_evpc_ext (d))
10408 return true;
10409 else if (aarch64_evpc_dup (d))
10410 return true;
10411 else if (aarch64_evpc_zip (d))
10412 return true;
10413 else if (aarch64_evpc_uzp (d))
10414 return true;
10415 else if (aarch64_evpc_trn (d))
10416 return true;
10417 return aarch64_evpc_tbl (d);
10419 return false;
10422 /* Expand a vec_perm_const pattern. */
10424 bool
10425 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10427 struct expand_vec_perm_d d;
10428 int i, nelt, which;
10430 d.target = target;
10431 d.op0 = op0;
10432 d.op1 = op1;
10434 d.vmode = GET_MODE (target);
10435 gcc_assert (VECTOR_MODE_P (d.vmode));
10436 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10437 d.testing_p = false;
10439 for (i = which = 0; i < nelt; ++i)
10441 rtx e = XVECEXP (sel, 0, i);
10442 int ei = INTVAL (e) & (2 * nelt - 1);
10443 which |= (ei < nelt ? 1 : 2);
10444 d.perm[i] = ei;
10447 switch (which)
10449 default:
10450 gcc_unreachable ();
10452 case 3:
10453 d.one_vector_p = false;
10454 if (!rtx_equal_p (op0, op1))
10455 break;
10457 /* The elements of PERM do not suggest that only the first operand
10458 is used, but both operands are identical. Allow easier matching
10459 of the permutation by folding the permutation into the single
10460 input vector. */
10461 /* Fall Through. */
10462 case 2:
10463 for (i = 0; i < nelt; ++i)
10464 d.perm[i] &= nelt - 1;
10465 d.op0 = op1;
10466 d.one_vector_p = true;
10467 break;
10469 case 1:
10470 d.op1 = op0;
10471 d.one_vector_p = true;
10472 break;
10475 return aarch64_expand_vec_perm_const_1 (&d);
10478 static bool
10479 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10480 const unsigned char *sel)
10482 struct expand_vec_perm_d d;
10483 unsigned int i, nelt, which;
10484 bool ret;
10486 d.vmode = vmode;
10487 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10488 d.testing_p = true;
10489 memcpy (d.perm, sel, nelt);
10491 /* Calculate whether all elements are in one vector. */
10492 for (i = which = 0; i < nelt; ++i)
10494 unsigned char e = d.perm[i];
10495 gcc_assert (e < 2 * nelt);
10496 which |= (e < nelt ? 1 : 2);
10499 /* If all elements are from the second vector, reindex as if from the
10500 first vector. */
10501 if (which == 2)
10502 for (i = 0; i < nelt; ++i)
10503 d.perm[i] -= nelt;
10505 /* Check whether the mask can be applied to a single vector. */
10506 d.one_vector_p = (which != 3);
10508 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10509 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10510 if (!d.one_vector_p)
10511 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10513 start_sequence ();
10514 ret = aarch64_expand_vec_perm_const_1 (&d);
10515 end_sequence ();
10517 return ret;
10521 aarch64_reverse_mask (enum machine_mode mode)
10523 /* We have to reverse each vector because we don't have
10524 a permuted load that can reverse-load according to ABI rules. */
10525 rtx mask;
10526 rtvec v = rtvec_alloc (16);
10527 int i, j;
10528 int nunits = GET_MODE_NUNITS (mode);
10529 int usize = GET_MODE_UNIT_SIZE (mode);
10531 gcc_assert (BYTES_BIG_ENDIAN);
10532 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10534 for (i = 0; i < nunits; i++)
10535 for (j = 0; j < usize; j++)
10536 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10537 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10538 return force_reg (V16QImode, mask);
10541 /* Implement MODES_TIEABLE_P. */
10543 bool
10544 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10546 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10547 return true;
10549 /* We specifically want to allow elements of "structure" modes to
10550 be tieable to the structure. This more general condition allows
10551 other rarer situations too. */
10552 if (TARGET_SIMD
10553 && aarch64_vector_mode_p (mode1)
10554 && aarch64_vector_mode_p (mode2))
10555 return true;
10557 return false;
10560 /* Return a new RTX holding the result of moving POINTER forward by
10561 AMOUNT bytes. */
10563 static rtx
10564 aarch64_move_pointer (rtx pointer, int amount)
10566 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10568 return adjust_automodify_address (pointer, GET_MODE (pointer),
10569 next, amount);
10572 /* Return a new RTX holding the result of moving POINTER forward by the
10573 size of the mode it points to. */
10575 static rtx
10576 aarch64_progress_pointer (rtx pointer)
10578 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10580 return aarch64_move_pointer (pointer, amount);
10583 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10584 MODE bytes. */
10586 static void
10587 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10588 machine_mode mode)
10590 rtx reg = gen_reg_rtx (mode);
10592 /* "Cast" the pointers to the correct mode. */
10593 *src = adjust_address (*src, mode, 0);
10594 *dst = adjust_address (*dst, mode, 0);
10595 /* Emit the memcpy. */
10596 emit_move_insn (reg, *src);
10597 emit_move_insn (*dst, reg);
10598 /* Move the pointers forward. */
10599 *src = aarch64_progress_pointer (*src);
10600 *dst = aarch64_progress_pointer (*dst);
10603 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10604 we succeed, otherwise return false. */
10606 bool
10607 aarch64_expand_movmem (rtx *operands)
10609 unsigned int n;
10610 rtx dst = operands[0];
10611 rtx src = operands[1];
10612 rtx base;
10613 bool speed_p = !optimize_function_for_size_p (cfun);
10615 /* When optimizing for size, give a better estimate of the length of a
10616 memcpy call, but use the default otherwise. */
10617 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10619 /* We can't do anything smart if the amount to copy is not constant. */
10620 if (!CONST_INT_P (operands[2]))
10621 return false;
10623 n = UINTVAL (operands[2]);
10625 /* Try to keep the number of instructions low. For cases below 16 bytes we
10626 need to make at most two moves. For cases above 16 bytes it will be one
10627 move for each 16 byte chunk, then at most two additional moves. */
10628 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10629 return false;
10631 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10632 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10634 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10635 src = adjust_automodify_address (src, VOIDmode, base, 0);
10637 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10638 1-byte chunk. */
10639 if (n < 4)
10641 if (n >= 2)
10643 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10644 n -= 2;
10647 if (n == 1)
10648 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10650 return true;
10653 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10654 4-byte chunk, partially overlapping with the previously copied chunk. */
10655 if (n < 8)
10657 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10658 n -= 4;
10659 if (n > 0)
10661 int move = n - 4;
10663 src = aarch64_move_pointer (src, move);
10664 dst = aarch64_move_pointer (dst, move);
10665 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10667 return true;
10670 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10671 them, then (if applicable) an 8-byte chunk. */
10672 while (n >= 8)
10674 if (n / 16)
10676 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10677 n -= 16;
10679 else
10681 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10682 n -= 8;
10686 /* Finish the final bytes of the copy. We can always do this in one
10687 instruction. We either copy the exact amount we need, or partially
10688 overlap with the previous chunk we copied and copy 8 bytes (a worked example follows this function). */
10689 if (n == 0)
10690 return true;
10691 else if (n == 1)
10692 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10693 else if (n == 2)
10694 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10695 else if (n == 4)
10696 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10697 else
10699 if (n == 3)
10701 src = aarch64_move_pointer (src, -1);
10702 dst = aarch64_move_pointer (dst, -1);
10703 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10705 else
10707 int move = n - 8;
10709 src = aarch64_move_pointer (src, move);
10710 dst = aarch64_move_pointer (dst, move);
10711 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10715 return true;
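/* Worked example of the expansion above (illustrative only, not actual
   generated output): for a 23-byte copy the main loop emits one 16-byte
   TImode block move at offset 0, leaving n == 7; the tail code then
   rewinds both pointers by one byte and emits an 8-byte DImode block
   move at offset 15, overlapping the previous chunk by one byte.  The
   whole copy is therefore just two load/store pairs.  */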
10718 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10720 static unsigned HOST_WIDE_INT
10721 aarch64_asan_shadow_offset (void)
10723 return (HOST_WIDE_INT_1 << 36);
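/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */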
10726 static bool
10727 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10728 unsigned int align,
10729 enum by_pieces_operation op,
10730 bool speed_p)
10732 /* STORE_BY_PIECES can be used when copying a constant string, but
10733 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10734 For now we always fail this and let the move_by_pieces code copy
10735 the string from read-only memory. */
10736 if (op == STORE_BY_PIECES)
10737 return false;
10739 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
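/* Map comparison CODE onto the corresponding CC_D mode used for AArch64
   conditional compares, or return CCmode if there is no direct mapping;
   callers treat a CCmode result as "not supported".  */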
10742 static enum machine_mode
10743 aarch64_code_to_ccmode (enum rtx_code code)
10745 switch (code)
10747 case NE:
10748 return CC_DNEmode;
10750 case EQ:
10751 return CC_DEQmode;
10753 case LE:
10754 return CC_DLEmode;
10756 case LT:
10757 return CC_DLTmode;
10759 case GE:
10760 return CC_DGEmode;
10762 case GT:
10763 return CC_DGTmode;
10765 case LEU:
10766 return CC_DLEUmode;
10768 case LTU:
10769 return CC_DLTUmode;
10771 case GEU:
10772 return CC_DGEUmode;
10774 case GTU:
10775 return CC_DGTUmode;
10777 default:
10778 return CCmode;
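/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first compare of a
   conditional-compare chain: the insns that prepare the operands are
   returned in *PREP_SEQ and the compare itself in *GEN_SEQ.  Return the
   CC register holding the result, or NULL_RTX if the comparison cannot
   be handled.  */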
10782 static rtx
10783 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10784 int code, tree treeop0, tree treeop1)
10786 enum machine_mode op_mode, cmp_mode, cc_mode;
10787 rtx op0, op1, cmp, target;
10788 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10789 enum insn_code icode;
10790 struct expand_operand ops[4];
10792 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10793 if (cc_mode == CCmode)
10794 return NULL_RTX;
10796 start_sequence ();
10797 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10799 op_mode = GET_MODE (op0);
10800 if (op_mode == VOIDmode)
10801 op_mode = GET_MODE (op1);
10803 switch (op_mode)
10805 case QImode:
10806 case HImode:
10807 case SImode:
10808 cmp_mode = SImode;
10809 icode = CODE_FOR_cmpsi;
10810 break;
10812 case DImode:
10813 cmp_mode = DImode;
10814 icode = CODE_FOR_cmpdi;
10815 break;
10817 default:
10818 end_sequence ();
10819 return NULL_RTX;
10822 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10823 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10824 if (!op0 || !op1)
10826 end_sequence ();
10827 return NULL_RTX;
10829 *prep_seq = get_insns ();
10830 end_sequence ();
10832 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10833 target = gen_rtx_REG (CCmode, CC_REGNUM);
10835 create_output_operand (&ops[0], target, CCmode);
10836 create_fixed_operand (&ops[1], cmp);
10837 create_fixed_operand (&ops[2], op0);
10838 create_fixed_operand (&ops[3], op1);
10840 start_sequence ();
10841 if (!maybe_expand_insn (icode, 4, ops))
10843 end_sequence ();
10844 return NULL_RTX;
10846 *gen_seq = get_insns ();
10847 end_sequence ();
10849 return gen_rtx_REG (cc_mode, CC_REGNUM);
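/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent compare in a
   conditional-compare chain, combining it with the CC result PREV
   according to BIT_CODE (AND or IOR).  The preparation insns are
   appended to *PREP_SEQ and the conditional compare itself to *GEN_SEQ.
   Return the CC register holding the combined result, or NULL_RTX on
   failure.  */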
10852 static rtx
10853 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10854 tree treeop0, tree treeop1, int bit_code)
10856 rtx op0, op1, cmp0, cmp1, target;
10857 enum machine_mode op_mode, cmp_mode, cc_mode;
10858 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10859 enum insn_code icode = CODE_FOR_ccmp_andsi;
10860 struct expand_operand ops[6];
10862 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10863 if (cc_mode == CCmode)
10864 return NULL_RTX;
10866 push_to_sequence ((rtx_insn*) *prep_seq);
10867 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10869 op_mode = GET_MODE (op0);
10870 if (op_mode == VOIDmode)
10871 op_mode = GET_MODE (op1);
10873 switch (op_mode)
10875 case QImode:
10876 case HImode:
10877 case SImode:
10878 cmp_mode = SImode;
10879 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10880 : CODE_FOR_ccmp_iorsi;
10881 break;
10883 case DImode:
10884 cmp_mode = DImode;
10885 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10886 : CODE_FOR_ccmp_iordi;
10887 break;
10889 default:
10890 end_sequence ();
10891 return NULL_RTX;
10894 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10895 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10896 if (!op0 || !op1)
10898 end_sequence ();
10899 return NULL_RTX;
10901 *prep_seq = get_insns ();
10902 end_sequence ();
10904 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10905 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10906 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10908 create_fixed_operand (&ops[0], prev);
10909 create_fixed_operand (&ops[1], target);
10910 create_fixed_operand (&ops[2], op0);
10911 create_fixed_operand (&ops[3], op1);
10912 create_fixed_operand (&ops[4], cmp0);
10913 create_fixed_operand (&ops[5], cmp1);
10915 push_to_sequence ((rtx_insn*) *gen_seq);
10916 if (!maybe_expand_insn (icode, 6, ops))
10918 end_sequence ();
10919 return NULL_RTX;
10922 *gen_seq = get_insns ();
10923 end_sequence ();
10925 return target;
10928 #undef TARGET_GEN_CCMP_FIRST
10929 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10931 #undef TARGET_GEN_CCMP_NEXT
10932 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10934 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10935 instruction fusion of some sort. */
10937 static bool
10938 aarch64_macro_fusion_p (void)
10940 return aarch64_tune_params->fusible_ops != AARCH64_FUSE_NOTHING;
10944 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10945 should be kept together during scheduling. */
10947 static bool
10948 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10950 rtx set_dest;
10951 rtx prev_set = single_set (prev);
10952 rtx curr_set = single_set (curr);
10953 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10954 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10956 if (!aarch64_macro_fusion_p ())
10957 return false;
10959 if (simple_sets_p
10960 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOV_MOVK))
10962 /* We are trying to match:
10963 prev (mov) == (set (reg r0) (const_int imm16))
10964 curr (movk) == (set (zero_extract (reg r0)
10965 (const_int 16)
10966 (const_int 16))
10967 (const_int imm16_1)) */
10969 set_dest = SET_DEST (curr_set);
10971 if (GET_CODE (set_dest) == ZERO_EXTRACT
10972 && CONST_INT_P (SET_SRC (curr_set))
10973 && CONST_INT_P (SET_SRC (prev_set))
10974 && CONST_INT_P (XEXP (set_dest, 2))
10975 && INTVAL (XEXP (set_dest, 2)) == 16
10976 && REG_P (XEXP (set_dest, 0))
10977 && REG_P (SET_DEST (prev_set))
10978 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10980 return true;
10984 if (simple_sets_p
10985 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_ADD))
10988 /* We're trying to match:
10989 prev (adrp) == (set (reg r1)
10990 (high (symbol_ref ("SYM"))))
10991 curr (add) == (set (reg r0)
10992 (lo_sum (reg r1)
10993 (symbol_ref ("SYM"))))
10994 Note that r0 need not necessarily be the same as r1, especially
10995 during pre-regalloc scheduling. */
10997 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10998 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11000 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11001 && REG_P (XEXP (SET_SRC (curr_set), 0))
11002 && REGNO (XEXP (SET_SRC (curr_set), 0))
11003 == REGNO (SET_DEST (prev_set))
11004 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11005 XEXP (SET_SRC (curr_set), 1)))
11006 return true;
11010 if (simple_sets_p
11011 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11014 /* We're trying to match:
11015 prev (movk) == (set (zero_extract (reg r0)
11016 (const_int 16)
11017 (const_int 32))
11018 (const_int imm16_1))
11019 curr (movk) == (set (zero_extract (reg r0)
11020 (const_int 16)
11021 (const_int 48))
11022 (const_int imm16_2)) */
11024 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11025 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11026 && REG_P (XEXP (SET_DEST (prev_set), 0))
11027 && REG_P (XEXP (SET_DEST (curr_set), 0))
11028 && REGNO (XEXP (SET_DEST (prev_set), 0))
11029 == REGNO (XEXP (SET_DEST (curr_set), 0))
11030 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11031 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11032 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11033 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11034 && CONST_INT_P (SET_SRC (prev_set))
11035 && CONST_INT_P (SET_SRC (curr_set)))
11036 return true;
11039 if (simple_sets_p
11040 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_LDR))
11042 /* We're trying to match:
11043 prev (adrp) == (set (reg r0)
11044 (high (symbol_ref ("SYM"))))
11045 curr (ldr) == (set (reg r1)
11046 (mem (lo_sum (reg r0)
11047 (symbol_ref ("SYM")))))
11049 curr (ldr) == (set (reg r1)
11050 (zero_extend (mem
11051 (lo_sum (reg r0)
11052 (symbol_ref ("SYM")))))) */
11053 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11054 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11056 rtx curr_src = SET_SRC (curr_set);
11058 if (GET_CODE (curr_src) == ZERO_EXTEND)
11059 curr_src = XEXP (curr_src, 0);
11061 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11062 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11063 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11064 == REGNO (SET_DEST (prev_set))
11065 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11066 XEXP (SET_SRC (prev_set), 0)))
11067 return true;
11071 if ((aarch64_tune_params->fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11072 && any_condjump_p (curr))
11074 enum attr_type prev_type = get_attr_type (prev);
11076 /* FIXME: this misses some cases that are considered simple
11077 arithmetic instructions for ThunderX. Simple shifts are missed here. */
11078 if (prev_type == TYPE_ALUS_SREG
11079 || prev_type == TYPE_ALUS_IMM
11080 || prev_type == TYPE_LOGICS_REG
11081 || prev_type == TYPE_LOGICS_IMM)
11082 return true;
11085 return false;
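/* For reference, the RTL patterns matched above correspond to fused
   instruction pairs such as the following (illustrative assembly only):

     mov  w0, #0x1234             adrp x1, sym
     movk w0, #0x5678, lsl 16     add  x0, x1, :lo12:sym

     movk x0, #0x1234, lsl 32     adrp x0, sym
     movk x0, #0x5678, lsl 48     ldr  x1, [x0, :lo12:sym]

   plus, for AARCH64_FUSE_CMP_BRANCH, a flag-setting ALU instruction
   followed by a conditional branch.  */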
11088 /* If MEM is in the form of [base+offset], extract the two parts
11089 of the address into BASE and OFFSET and return true; otherwise
11090 return false after clearing BASE and OFFSET. */
11092 bool
11093 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11095 rtx addr;
11097 gcc_assert (MEM_P (mem));
11099 addr = XEXP (mem, 0);
11101 if (REG_P (addr))
11103 *base = addr;
11104 *offset = const0_rtx;
11105 return true;
11108 if (GET_CODE (addr) == PLUS
11109 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11111 *base = XEXP (addr, 0);
11112 *offset = XEXP (addr, 1);
11113 return true;
11116 *base = NULL_RTX;
11117 *offset = NULL_RTX;
11119 return false;
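/* For example, a MEM whose address is (plus (reg x1) (const_int 8))
   yields *BASE == (reg x1) and *OFFSET == (const_int 8), while a bare
   (reg x1) address yields *OFFSET == const0_rtx.  */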
11122 /* Types for scheduling fusion. */
11123 enum sched_fusion_type
11125 SCHED_FUSION_NONE = 0,
11126 SCHED_FUSION_LD_SIGN_EXTEND,
11127 SCHED_FUSION_LD_ZERO_EXTEND,
11128 SCHED_FUSION_LD,
11129 SCHED_FUSION_ST,
11130 SCHED_FUSION_NUM
11133 /* If INSN is a load or store whose address is in the form of
11134 [base+offset], extract the two parts into BASE and OFFSET. Return
11135 the scheduling fusion type of this INSN. */
11137 static enum sched_fusion_type
11138 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11140 rtx x, dest, src;
11141 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11143 gcc_assert (INSN_P (insn));
11144 x = PATTERN (insn);
11145 if (GET_CODE (x) != SET)
11146 return SCHED_FUSION_NONE;
11148 src = SET_SRC (x);
11149 dest = SET_DEST (x);
11151 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11152 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11153 return SCHED_FUSION_NONE;
11155 if (GET_CODE (src) == SIGN_EXTEND)
11157 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11158 src = XEXP (src, 0);
11159 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11160 return SCHED_FUSION_NONE;
11162 else if (GET_CODE (src) == ZERO_EXTEND)
11164 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11165 src = XEXP (src, 0);
11166 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11167 return SCHED_FUSION_NONE;
11170 if (GET_CODE (src) == MEM && REG_P (dest))
11171 extract_base_offset_in_addr (src, base, offset);
11172 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11174 fusion = SCHED_FUSION_ST;
11175 extract_base_offset_in_addr (dest, base, offset);
11177 else
11178 return SCHED_FUSION_NONE;
11180 if (*base == NULL_RTX || *offset == NULL_RTX)
11181 fusion = SCHED_FUSION_NONE;
11183 return fusion;
11186 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11188 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11189 and PRI are only calculated for these instructions. For other instructions,
11190 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
11191 types of instruction fusion can be added by returning different priorities.
11193 It's important that irrelevant instructions get the largest FUSION_PRI. */
11195 static void
11196 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11197 int *fusion_pri, int *pri)
11199 int tmp, off_val;
11200 rtx base, offset;
11201 enum sched_fusion_type fusion;
11203 gcc_assert (INSN_P (insn));
11205 tmp = max_pri - 1;
11206 fusion = fusion_load_store (insn, &base, &offset);
11207 if (fusion == SCHED_FUSION_NONE)
11209 *pri = tmp;
11210 *fusion_pri = tmp;
11211 return;
11214 /* Set FUSION_PRI according to fusion type and base register. */
11215 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11217 /* Calculate PRI. */
11218 tmp /= 2;
11220 /* INSN with smaller offset goes first. */
11221 off_val = (int)(INTVAL (offset));
11222 if (off_val >= 0)
11223 tmp -= (off_val & 0xfffff);
11224 else
11225 tmp += ((- off_val) & 0xfffff);
11227 *pri = tmp;
11228 return;
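/* For example, two loads such as "ldr w1, [x3, 4]" and "ldr w2, [x3, 8]"
   receive the same FUSION_PRI (same fusion type and base register), while
   their PRI values differ by the offset difference, so the scheduler keeps
   them adjacent with the smaller offset first.  */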
11231 /* Given OPERANDS of consecutive load/store, check if we can merge
11232 them into ldp/stp. LOAD is true if they are load instructions.
11233 MODE is the mode of memory operands. */
11235 bool
11236 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11237 enum machine_mode mode)
11239 HOST_WIDE_INT offval_1, offval_2, msize;
11240 enum reg_class rclass_1, rclass_2;
11241 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11243 if (load)
11245 mem_1 = operands[1];
11246 mem_2 = operands[3];
11247 reg_1 = operands[0];
11248 reg_2 = operands[2];
11249 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11250 if (REGNO (reg_1) == REGNO (reg_2))
11251 return false;
11253 else
11255 mem_1 = operands[0];
11256 mem_2 = operands[2];
11257 reg_1 = operands[1];
11258 reg_2 = operands[3];
11261 /* The mems cannot be volatile. */
11262 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11263 return false;
11265 /* Check if the addresses are in the form of [base+offset]. */
11266 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11267 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11268 return false;
11269 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11270 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11271 return false;
11273 /* Check if the bases are the same. */
11274 if (!rtx_equal_p (base_1, base_2))
11275 return false;
11277 offval_1 = INTVAL (offset_1);
11278 offval_2 = INTVAL (offset_2);
11279 msize = GET_MODE_SIZE (mode);
11280 /* Check if the offsets are consecutive. */
11281 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11282 return false;
11284 /* Check if the addresses are clobbered by load. */
11285 if (load)
11287 if (reg_mentioned_p (reg_1, mem_1))
11288 return false;
11290 /* In increasing order, the last load can clobber the address. */
11291 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11292 return false;
11295 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11296 rclass_1 = FP_REGS;
11297 else
11298 rclass_1 = GENERAL_REGS;
11300 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11301 rclass_2 = FP_REGS;
11302 else
11303 rclass_2 = GENERAL_REGS;
11305 /* Check if the registers are of the same class. */
11306 if (rclass_1 != rclass_2)
11307 return false;
11309 return true;
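/* When the checks above succeed, the ldp/stp peepholes can combine, for
   instance (illustrative assembly):

     ldr  w0, [x2]
     ldr  w1, [x2, 4]

   into a single "ldp w0, w1, [x2]".  */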
11312 /* Given OPERANDS of consecutive load/store, check if we can merge
11313 them into ldp/stp by adjusting the offset. LOAD is true if they
11314 are load instructions. MODE is the mode of memory operands.
11316 Given the consecutive stores below:
11318 str w1, [xb, 0x100]
11319 str w1, [xb, 0x104]
11320 str w1, [xb, 0x108]
11321 str w1, [xb, 0x10c]
11323 Though the offsets are out of the range supported by stp, we can
11324 still pair them after adjusting the offset, like:
11326 add scratch, xb, 0x100
11327 stp w1, w1, [scratch]
11328 stp w1, w1, [scratch, 0x8]
11330 The peephole patterns detecting this opportunity should guarantee
11331 the scratch register is available. */
11333 bool
11334 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11335 enum machine_mode mode)
11337 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11338 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11339 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11340 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11342 if (load)
11344 reg_1 = operands[0];
11345 mem_1 = operands[1];
11346 reg_2 = operands[2];
11347 mem_2 = operands[3];
11348 reg_3 = operands[4];
11349 mem_3 = operands[5];
11350 reg_4 = operands[6];
11351 mem_4 = operands[7];
11352 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11353 && REG_P (reg_3) && REG_P (reg_4));
11354 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11355 return false;
11357 else
11359 mem_1 = operands[0];
11360 reg_1 = operands[1];
11361 mem_2 = operands[2];
11362 reg_2 = operands[3];
11363 mem_3 = operands[4];
11364 reg_3 = operands[5];
11365 mem_4 = operands[6];
11366 reg_4 = operands[7];
11368 /* Skip if the memory operand is by itself valid for ldp/stp. */
11369 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11370 return false;
11372 /* The mems cannot be volatile. */
11373 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11374 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4)
11375 return false;
11377 /* Check if the addresses are in the form of [base+offset]. */
11378 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11379 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11380 return false;
11381 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11382 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11383 return false;
11384 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11385 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11386 return false;
11387 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11388 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11389 return false;
11391 /* Check if the bases are the same. */
11392 if (!rtx_equal_p (base_1, base_2)
11393 || !rtx_equal_p (base_2, base_3)
11394 || !rtx_equal_p (base_3, base_4))
11395 return false;
11397 offval_1 = INTVAL (offset_1);
11398 offval_2 = INTVAL (offset_2);
11399 offval_3 = INTVAL (offset_3);
11400 offval_4 = INTVAL (offset_4);
11401 msize = GET_MODE_SIZE (mode);
11402 /* Check if the offsets are consecutive. */
11403 if ((offval_1 != (offval_2 + msize)
11404 || offval_1 != (offval_3 + msize * 2)
11405 || offval_1 != (offval_4 + msize * 3))
11406 && (offval_4 != (offval_3 + msize)
11407 || offval_4 != (offval_2 + msize * 2)
11408 || offval_4 != (offval_1 + msize * 3)))
11409 return false;
11411 /* Check if the addresses are clobbered by load. */
11412 if (load)
11414 if (reg_mentioned_p (reg_1, mem_1)
11415 || reg_mentioned_p (reg_2, mem_2)
11416 || reg_mentioned_p (reg_3, mem_3))
11417 return false;
11419 /* In increasing order, the last load can clobber the address. */
11420 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11421 return false;
11424 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11425 rclass_1 = FP_REGS;
11426 else
11427 rclass_1 = GENERAL_REGS;
11429 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11430 rclass_2 = FP_REGS;
11431 else
11432 rclass_2 = GENERAL_REGS;
11434 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11435 rclass_3 = FP_REGS;
11436 else
11437 rclass_3 = GENERAL_REGS;
11439 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11440 rclass_4 = FP_REGS;
11441 else
11442 rclass_4 = GENERAL_REGS;
11444 /* Check if the registers are of the same class. */
11445 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11446 return false;
11448 return true;
11451 /* Given OPERANDS of consecutive load/store, this function pairs them
11452 into ldp/stp after adjusting the offset. It depends on the fact
11453 that addresses of load/store instructions are in increasing order.
11454 MODE is the mode of memory operands. CODE is the rtl operator
11455 which should be applied to all memory operands; it is SIGN_EXTEND,
11456 ZERO_EXTEND or UNKNOWN. */
11458 bool
11459 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11460 enum machine_mode mode, RTX_CODE code)
11462 rtx base, offset, t1, t2;
11463 rtx mem_1, mem_2, mem_3, mem_4;
11464 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11466 if (load)
11468 mem_1 = operands[1];
11469 mem_2 = operands[3];
11470 mem_3 = operands[5];
11471 mem_4 = operands[7];
11473 else
11475 mem_1 = operands[0];
11476 mem_2 = operands[2];
11477 mem_3 = operands[4];
11478 mem_4 = operands[6];
11479 gcc_assert (code == UNKNOWN);
11482 extract_base_offset_in_addr (mem_1, &base, &offset);
11483 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11485 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11486 msize = GET_MODE_SIZE (mode);
11487 stp_off_limit = msize * 0x40;
11488 off_val = INTVAL (offset);
11489 abs_off = (off_val < 0) ? -off_val : off_val;
11490 new_off = abs_off % stp_off_limit;
11491 adj_off = abs_off - new_off;
11493 /* Further adjust to make sure all offsets are OK. */
11494 if ((new_off + msize * 2) >= stp_off_limit)
11496 adj_off += stp_off_limit;
11497 new_off -= stp_off_limit;
11500 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11501 if (adj_off >= 0x1000)
11502 return false;
11504 if (off_val < 0)
11506 adj_off = -adj_off;
11507 new_off = -new_off;
11510 /* Create new memory references. */
11511 mem_1 = change_address (mem_1, VOIDmode,
11512 plus_constant (DImode, operands[8], new_off));
11514 /* Check if the adjusted address is OK for ldp/stp. */
11515 if (!aarch64_mem_pair_operand (mem_1, mode))
11516 return false;
11518 msize = GET_MODE_SIZE (mode);
11519 mem_2 = change_address (mem_2, VOIDmode,
11520 plus_constant (DImode,
11521 operands[8],
11522 new_off + msize));
11523 mem_3 = change_address (mem_3, VOIDmode,
11524 plus_constant (DImode,
11525 operands[8],
11526 new_off + msize * 2));
11527 mem_4 = change_address (mem_4, VOIDmode,
11528 plus_constant (DImode,
11529 operands[8],
11530 new_off + msize * 3));
11532 if (code == ZERO_EXTEND)
11534 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11535 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11536 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11537 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11539 else if (code == SIGN_EXTEND)
11541 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11542 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11543 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11544 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11547 if (load)
11549 operands[1] = mem_1;
11550 operands[3] = mem_2;
11551 operands[5] = mem_3;
11552 operands[7] = mem_4;
11554 else
11556 operands[0] = mem_1;
11557 operands[2] = mem_2;
11558 operands[4] = mem_3;
11559 operands[6] = mem_4;
11562 /* Emit adjusting instruction. */
11563 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11564 /* Emit ldp/stp instructions. */
11565 t1 = gen_rtx_SET (operands[0], operands[1]);
11566 t2 = gen_rtx_SET (operands[2], operands[3]);
11567 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11568 t1 = gen_rtx_SET (operands[4], operands[5]);
11569 t2 = gen_rtx_SET (operands[6], operands[7]);
11570 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11571 return true;
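/* Worked example of the adjustment above for SImode (msize == 4, so
   stp_off_limit == 256): four stores at offsets 0x104, 0x108, 0x10c and
   0x110 give abs_off == 0x104, new_off == 4 and adj_off == 0x100, so we
   emit "add scratch, base, #0x100" followed by two stp instructions
   addressed at [scratch, 4] and [scratch, 12].  */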
11574 #undef TARGET_ADDRESS_COST
11575 #define TARGET_ADDRESS_COST aarch64_address_cost
11577 /* This hook determines whether unnamed bitfields affect the alignment
11578 of the containing structure. The hook returns true if the structure
11579 should inherit the alignment requirements of an unnamed bitfield's
11580 type. */
11581 #undef TARGET_ALIGN_ANON_BITFIELD
11582 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11584 #undef TARGET_ASM_ALIGNED_DI_OP
11585 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11587 #undef TARGET_ASM_ALIGNED_HI_OP
11588 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11590 #undef TARGET_ASM_ALIGNED_SI_OP
11591 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11593 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11594 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11595 hook_bool_const_tree_hwi_hwi_const_tree_true
11597 #undef TARGET_ASM_FILE_START
11598 #define TARGET_ASM_FILE_START aarch64_start_file
11600 #undef TARGET_ASM_OUTPUT_MI_THUNK
11601 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11603 #undef TARGET_ASM_SELECT_RTX_SECTION
11604 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11606 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11607 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11609 #undef TARGET_BUILD_BUILTIN_VA_LIST
11610 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11612 #undef TARGET_CALLEE_COPIES
11613 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11615 #undef TARGET_CAN_ELIMINATE
11616 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11618 #undef TARGET_CANNOT_FORCE_CONST_MEM
11619 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11621 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11622 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11624 /* Only the least significant bit is used for initialization guard
11625 variables. */
11626 #undef TARGET_CXX_GUARD_MASK_BIT
11627 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11629 #undef TARGET_C_MODE_FOR_SUFFIX
11630 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11632 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11633 #undef TARGET_DEFAULT_TARGET_FLAGS
11634 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11635 #endif
11637 #undef TARGET_CLASS_MAX_NREGS
11638 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11640 #undef TARGET_BUILTIN_DECL
11641 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11643 #undef TARGET_EXPAND_BUILTIN
11644 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11646 #undef TARGET_EXPAND_BUILTIN_VA_START
11647 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11649 #undef TARGET_FOLD_BUILTIN
11650 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11652 #undef TARGET_FUNCTION_ARG
11653 #define TARGET_FUNCTION_ARG aarch64_function_arg
11655 #undef TARGET_FUNCTION_ARG_ADVANCE
11656 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11658 #undef TARGET_FUNCTION_ARG_BOUNDARY
11659 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11661 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11662 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11664 #undef TARGET_FUNCTION_VALUE
11665 #define TARGET_FUNCTION_VALUE aarch64_function_value
11667 #undef TARGET_FUNCTION_VALUE_REGNO_P
11668 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11670 #undef TARGET_FRAME_POINTER_REQUIRED
11671 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11673 #undef TARGET_GIMPLE_FOLD_BUILTIN
11674 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11676 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11677 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11679 #undef TARGET_INIT_BUILTINS
11680 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11682 #undef TARGET_LEGITIMATE_ADDRESS_P
11683 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11685 #undef TARGET_LEGITIMATE_CONSTANT_P
11686 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11688 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11689 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11691 #undef TARGET_LRA_P
11692 #define TARGET_LRA_P hook_bool_void_true
11694 #undef TARGET_MANGLE_TYPE
11695 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11697 #undef TARGET_MEMORY_MOVE_COST
11698 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11700 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11701 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11703 #undef TARGET_MUST_PASS_IN_STACK
11704 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11706 /* This target hook should return true if accesses to volatile bitfields
11707 should use the narrowest mode possible. It should return false if these
11708 accesses should use the bitfield container type. */
11709 #undef TARGET_NARROW_VOLATILE_BITFIELD
11710 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11712 #undef TARGET_OPTION_OVERRIDE
11713 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11715 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11716 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11717 aarch64_override_options_after_change
11719 #undef TARGET_PASS_BY_REFERENCE
11720 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11722 #undef TARGET_PREFERRED_RELOAD_CLASS
11723 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11725 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11726 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11728 #undef TARGET_SECONDARY_RELOAD
11729 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11731 #undef TARGET_SHIFT_TRUNCATION_MASK
11732 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11734 #undef TARGET_SETUP_INCOMING_VARARGS
11735 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11737 #undef TARGET_STRUCT_VALUE_RTX
11738 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11740 #undef TARGET_REGISTER_MOVE_COST
11741 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11743 #undef TARGET_RETURN_IN_MEMORY
11744 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11746 #undef TARGET_RETURN_IN_MSB
11747 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11749 #undef TARGET_RTX_COSTS
11750 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11752 #undef TARGET_SCHED_ISSUE_RATE
11753 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11755 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11756 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11757 aarch64_sched_first_cycle_multipass_dfa_lookahead
11759 #undef TARGET_TRAMPOLINE_INIT
11760 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11762 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11763 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11765 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11766 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11768 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11769 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11771 #undef TARGET_VECTORIZE_ADD_STMT_COST
11772 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11774 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11775 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11776 aarch64_builtin_vectorization_cost
11778 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11779 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11781 #undef TARGET_VECTORIZE_BUILTINS
11782 #define TARGET_VECTORIZE_BUILTINS
11784 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11785 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11786 aarch64_builtin_vectorized_function
11788 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11789 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11790 aarch64_autovectorize_vector_sizes
11792 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11793 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11794 aarch64_atomic_assign_expand_fenv
11796 /* Section anchor support. */
11798 #undef TARGET_MIN_ANCHOR_OFFSET
11799 #define TARGET_MIN_ANCHOR_OFFSET -256
11801 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11802 byte offset; we can do much more for larger data types, but have no way
11803 to determine the size of the access. We assume accesses are aligned. */
11804 #undef TARGET_MAX_ANCHOR_OFFSET
11805 #define TARGET_MAX_ANCHOR_OFFSET 4095
11807 #undef TARGET_VECTOR_ALIGNMENT
11808 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11810 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11811 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11812 aarch64_simd_vector_alignment_reachable
11814 /* vec_perm support. */
11816 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11817 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11818 aarch64_vectorize_vec_perm_const_ok
11821 #undef TARGET_FIXED_CONDITION_CODE_REGS
11822 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11824 #undef TARGET_FLAGS_REGNUM
11825 #define TARGET_FLAGS_REGNUM CC_REGNUM
11827 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11828 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11830 #undef TARGET_ASAN_SHADOW_OFFSET
11831 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11833 #undef TARGET_LEGITIMIZE_ADDRESS
11834 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11836 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11837 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11838 aarch64_use_by_pieces_infrastructure_p
11840 #undef TARGET_CAN_USE_DOLOOP_P
11841 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11843 #undef TARGET_SCHED_MACRO_FUSION_P
11844 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11846 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11847 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11849 #undef TARGET_SCHED_FUSION_PRIORITY
11850 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11852 struct gcc_target targetm = TARGET_INITIALIZER;
11854 #include "gt-aarch64.h"