[AArch64] Add branch-cost to cpu tuning information.
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 7bc28ae7cf2f8d6f5ef136158c6c43d39e540435
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
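/* As a rough illustration only (standard AArch64 assembly syntax, not
   taken from this file), the classes above correspond to addressing
   forms such as:
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!  or  ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, <pc-relative literal>  */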
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
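/* Illustrative note: the GP2FP/FP2GP costs above (5) deliberately exceed
   the generic memmov_cost (4, see generic_tunings below), which is how the
   comment's intent is realised: the register allocator is steered towards
   spilling GP values to memory rather than bouncing them through FP
   registers.  */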
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
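/* Purely illustrative examples of instruction pairs the fusion flags above
   refer to (standard AArch64 assembly):
     AARCH64_FUSE_MOV_MOVK     mov  x0, #0x1234
                               movk x0, #0x5678, lsl #16
     AARCH64_FUSE_ADRP_ADD     adrp x0, sym
                               add  x0, x0, :lo12:sym
     AARCH64_FUSE_MOVK_MOVK    movk x0, #0x1234, lsl #16
                               movk x0, #0x5678, lsl #32
     AARCH64_FUSE_ADRP_LDR     adrp x0, sym
                               ldr  x1, [x0, #:lo12:sym]
     AARCH64_FUSE_CMP_BRANCH   cmp  x0, #1
                               b.eq .L1  */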
343 /* Generic costs for branch instructions. */
344 static const struct cpu_branch_cost generic_branch_cost =
346 2, /* Predictable. */
347 2 /* Unpredictable. */
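/* The two fields above are meant to be selected on branch predictability,
   presumably via the backend's BRANCH_COST definition, which GCC queries
   as BRANCH_COST (speed_p, predictable_p); the exact consumer is defined
   outside this file.  */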
350 static const struct tune_params generic_tunings =
352 &cortexa57_extra_costs,
353 &generic_addrcost_table,
354 &generic_regmove_cost,
355 &generic_vector_cost,
356 &generic_branch_cost,
357 4, /* memmov_cost */
358 2, /* issue_rate */
359 AARCH64_FUSE_NOTHING, /* fuseable_ops */
360 8, /* function_align. */
361 8, /* jump_align. */
362 4, /* loop_align. */
363 2, /* int_reassoc_width. */
364 4, /* fp_reassoc_width. */
365 1, /* vec_reassoc_width. */
366 2, /* min_div_recip_mul_sf. */
367 2 /* min_div_recip_mul_df. */
370 static const struct tune_params cortexa53_tunings =
372 &cortexa53_extra_costs,
373 &generic_addrcost_table,
374 &cortexa53_regmove_cost,
375 &generic_vector_cost,
376 &generic_branch_cost,
377 4, /* memmov_cost */
378 2, /* issue_rate */
379 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
381 8, /* function_align. */
382 8, /* jump_align. */
383 4, /* loop_align. */
384 2, /* int_reassoc_width. */
385 4, /* fp_reassoc_width. */
386 1, /* vec_reassoc_width. */
387 2, /* min_div_recip_mul_sf. */
388 2 /* min_div_recip_mul_df. */
391 static const struct tune_params cortexa57_tunings =
393 &cortexa57_extra_costs,
394 &cortexa57_addrcost_table,
395 &cortexa57_regmove_cost,
396 &cortexa57_vector_cost,
397 &generic_branch_cost,
398 4, /* memmov_cost */
399 3, /* issue_rate */
400 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
401 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
402 16, /* function_align. */
403 8, /* jump_align. */
404 4, /* loop_align. */
405 2, /* int_reassoc_width. */
406 4, /* fp_reassoc_width. */
407 1, /* vec_reassoc_width. */
408 2, /* min_div_recip_mul_sf. */
409 2 /* min_div_recip_mul_df. */
412 static const struct tune_params thunderx_tunings =
414 &thunderx_extra_costs,
415 &generic_addrcost_table,
416 &thunderx_regmove_cost,
417 &generic_vector_cost,
418 &generic_branch_cost,
419 6, /* memmov_cost */
420 2, /* issue_rate */
421 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
422 8, /* function_align. */
423 8, /* jump_align. */
424 8, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1, /* vec_reassoc_width. */
428 2, /* min_div_recip_mul_sf. */
429 2 /* min_div_recip_mul_df. */
432 static const struct tune_params xgene1_tunings =
434 &xgene1_extra_costs,
435 &xgene1_addrcost_table,
436 &xgene1_regmove_cost,
437 &xgene1_vector_cost,
438 &generic_branch_cost,
439 6, /* memmov_cost */
440 4, /* issue_rate */
441 AARCH64_FUSE_NOTHING, /* fuseable_ops */
442 16, /* function_align. */
443 8, /* jump_align. */
444 16, /* loop_align. */
445 2, /* int_reassoc_width. */
446 4, /* fp_reassoc_width. */
447 1, /* vec_reassoc_width. */
448 2, /* min_div_recip_mul_sf. */
449 2 /* min_div_recip_mul_df. */
452 /* A processor implementing AArch64. */
453 struct processor
455 const char *const name;
456 enum aarch64_processor core;
457 const char *arch;
458 unsigned architecture_version;
459 const unsigned long flags;
460 const struct tune_params *const tune;
463 /* Processor cores implementing AArch64. */
464 static const struct processor all_cores[] =
466 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
467 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
468 #include "aarch64-cores.def"
469 #undef AARCH64_CORE
470 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
471 {NULL, aarch64_none, NULL, 0, 0, NULL}
474 /* Architectures implementing AArch64. */
475 static const struct processor all_architectures[] =
477 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
478 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
479 #include "aarch64-arches.def"
480 #undef AARCH64_ARCH
481 {NULL, aarch64_none, NULL, 0, 0, NULL}
484 /* Target specification. These are populated as command-line arguments
485 are processed, or NULL if not specified. */
486 static const struct processor *selected_arch;
487 static const struct processor *selected_cpu;
488 static const struct processor *selected_tune;
490 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
492 /* An ISA extension in the co-processor and main instruction set space. */
493 struct aarch64_option_extension
495 const char *const name;
496 const unsigned long flags_on;
497 const unsigned long flags_off;
500 /* ISA extensions in AArch64. */
501 static const struct aarch64_option_extension all_extensions[] =
503 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
504 {NAME, FLAGS_ON, FLAGS_OFF},
505 #include "aarch64-option-extensions.def"
506 #undef AARCH64_OPT_EXTENSION
507 {NULL, 0, 0}
510 /* Used to track the size of an address when generating a pre/post
511 increment address. */
512 static machine_mode aarch64_memory_reference_mode;
514 /* A table of valid AArch64 "bitmask immediate" values for
515 logical instructions. */
517 #define AARCH64_NUM_BITMASKS 5334
518 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
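/* For illustration, values such as 0x00000000000000ff, 0x0000ffff0000ffff,
   0x5555555555555555 or 0x7f7f7f7f7f7f7f7f are valid bitmask immediates
   (a rotated run of set bits replicated across the register), whereas an
   arbitrary constant such as 0x0000000000001234 is not.  */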
520 typedef enum aarch64_cond_code
522 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
523 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
524 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
526 aarch64_cc;
528 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
530 /* The condition codes of the processor, and the inverse function. */
531 static const char * const aarch64_condition_codes[] =
533 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
534 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
537 static unsigned int
538 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
540 if (GET_MODE_UNIT_SIZE (mode) == 4)
541 return aarch64_tune_params->min_div_recip_mul_sf;
542 return aarch64_tune_params->min_div_recip_mul_df;
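/* For example (illustrative, and only under -freciprocal-math style
   options): with min_div_recip_mul_sf == 2, a block computing both a/c and
   b/c in single precision may be rewritten to compute 1.0f/c once and
   multiply, while a lone division is left alone.  */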
545 static int
546 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
547 enum machine_mode mode)
549 if (VECTOR_MODE_P (mode))
550 return aarch64_tune_params->vec_reassoc_width;
551 if (INTEGRAL_MODE_P (mode))
552 return aarch64_tune_params->int_reassoc_width;
553 if (FLOAT_MODE_P (mode))
554 return aarch64_tune_params->fp_reassoc_width;
555 return 1;
558 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
559 unsigned
560 aarch64_dbx_register_number (unsigned regno)
562 if (GP_REGNUM_P (regno))
563 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
564 else if (regno == SP_REGNUM)
565 return AARCH64_DWARF_SP;
566 else if (FP_REGNUM_P (regno))
567 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
569 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
570 equivalent DWARF register. */
571 return DWARF_FRAME_REGISTERS;
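/* For example, x0..x30 map to DWARF registers 0..30, sp to 31 and v0..v31
   to 64..95 (illustrative; the precise bases are the AARCH64_DWARF_*
   constants used above).  */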
574 /* Return TRUE if MODE is any of the large INT modes. */
575 static bool
576 aarch64_vect_struct_mode_p (machine_mode mode)
578 return mode == OImode || mode == CImode || mode == XImode;
581 /* Return TRUE if MODE is any of the vector modes. */
582 static bool
583 aarch64_vector_mode_p (machine_mode mode)
585 return aarch64_vector_mode_supported_p (mode)
586 || aarch64_vect_struct_mode_p (mode);
589 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
590 static bool
591 aarch64_array_mode_supported_p (machine_mode mode,
592 unsigned HOST_WIDE_INT nelems)
594 if (TARGET_SIMD
595 && AARCH64_VALID_SIMD_QREG_MODE (mode)
596 && (nelems >= 2 && nelems <= 4))
597 return true;
599 return false;
602 /* Implement HARD_REGNO_NREGS. */
605 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
607 switch (aarch64_regno_regclass (regno))
609 case FP_REGS:
610 case FP_LO_REGS:
611 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
612 default:
613 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
615 gcc_unreachable ();
618 /* Implement HARD_REGNO_MODE_OK. */
621 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
623 if (GET_MODE_CLASS (mode) == MODE_CC)
624 return regno == CC_REGNUM;
626 if (regno == SP_REGNUM)
627 /* The purpose of comparing with ptr_mode is to support the
628 global register variable associated with the stack pointer
629 register via the syntax of asm ("wsp") in ILP32. */
630 return mode == Pmode || mode == ptr_mode;
632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
633 return mode == Pmode;
635 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
636 return 1;
638 if (FP_REGNUM_P (regno))
640 if (aarch64_vect_struct_mode_p (mode))
641 return
642 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
643 else
644 return 1;
647 return 0;
650 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
651 machine_mode
652 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
653 machine_mode mode)
655 /* Handle modes that fit within single registers. */
656 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
658 if (GET_MODE_SIZE (mode) >= 4)
659 return mode;
660 else
661 return SImode;
663 /* Fall back to generic for multi-reg and very large modes. */
664 else
665 return choose_hard_reg_mode (regno, nregs, false);
668 /* Return true if calls to DECL should be treated as
669 long-calls (i.e. called via a register). */
670 static bool
671 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
673 return false;
676 /* Return true if calls to symbol-ref SYM should be treated as
677 long-calls (i.e. called via a register). */
678 bool
679 aarch64_is_long_call_p (rtx sym)
681 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
684 /* Return true if the offsets to a zero/sign-extract operation
685 represent an expression that matches an extend operation. The
686 operands represent the parameters from
688 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
689 bool
690 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
691 rtx extract_imm)
693 HOST_WIDE_INT mult_val, extract_val;
695 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
696 return false;
698 mult_val = INTVAL (mult_imm);
699 extract_val = INTVAL (extract_imm);
701 if (extract_val > 8
702 && extract_val < GET_MODE_BITSIZE (mode)
703 && exact_log2 (extract_val & ~7) > 0
704 && (extract_val & 7) <= 4
705 && mult_val == (1 << (extract_val & 7)))
706 return true;
708 return false;
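/* Worked example (illustrative): EXTRACT_IMM == 34 with MULT_IMM == 4 is
   accepted above: the low three bits of EXTRACT_IMM give a shift of 2, the
   remaining bits give a 32-bit extend, and MULT_IMM == 1 << 2.  That is the
   shape corresponding to an extended-register operand such as
   "w1, sxtw #2" in an add/sub instruction.  */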
711 /* Emit an insn that's a simple single-set. Both the operands must be
712 known to be valid. */
713 inline static rtx
714 emit_set_insn (rtx x, rtx y)
716 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
719 /* X and Y are two things to compare using CODE. Emit the compare insn and
720 return the rtx for register 0 in the proper mode. */
722 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
724 machine_mode mode = SELECT_CC_MODE (code, x, y);
725 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
727 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
728 return cc_reg;
731 /* Build the SYMBOL_REF for __tls_get_addr. */
733 static GTY(()) rtx tls_get_addr_libfunc;
736 aarch64_tls_get_addr (void)
738 if (!tls_get_addr_libfunc)
739 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
740 return tls_get_addr_libfunc;
743 /* Return the TLS model to use for ADDR. */
745 static enum tls_model
746 tls_symbolic_operand_type (rtx addr)
748 enum tls_model tls_kind = TLS_MODEL_NONE;
749 rtx sym, addend;
751 if (GET_CODE (addr) == CONST)
753 split_const (addr, &sym, &addend);
754 if (GET_CODE (sym) == SYMBOL_REF)
755 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
757 else if (GET_CODE (addr) == SYMBOL_REF)
758 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
760 return tls_kind;
763 /* We'll allow LO_SUMs in our legitimate addresses so that combine
764 can take care of combining addresses where necessary, but for
765 generation purposes, we'll generate the address
766 as:
767 RTL Absolute
768 tmp = hi (symbol_ref); adrp x1, foo
769 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
772 PIC TLS
773 adrp x1, :got:foo adrp tmp, :tlsgd:foo
774 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
775 bl __tls_get_addr
778 Load TLS symbol, depending on TLS mechanism and TLS access model.
780 Global Dynamic - Traditional TLS:
781 adrp tmp, :tlsgd:imm
782 add dest, tmp, #:tlsgd_lo12:imm
783 bl __tls_get_addr
785 Global Dynamic - TLS Descriptors:
786 adrp dest, :tlsdesc:imm
787 ldr tmp, [dest, #:tlsdesc_lo12:imm]
788 add dest, dest, #:tlsdesc_lo12:imm
789 blr tmp
790 mrs tp, tpidr_el0
791 add dest, dest, tp
793 Initial Exec:
794 mrs tp, tpidr_el0
795 adrp tmp, :gottprel:imm
796 ldr dest, [tmp, #:gottprel_lo12:imm]
797 add dest, dest, tp
799 Local Exec:
800 mrs tp, tpidr_el0
801 add t0, tp, #:tprel_hi12:imm, lsl #12
802 add t0, t0, #:tprel_lo12_nc:imm
805 static void
806 aarch64_load_symref_appropriately (rtx dest, rtx imm,
807 enum aarch64_symbol_type type)
809 switch (type)
811 case SYMBOL_SMALL_ABSOLUTE:
813 /* In ILP32, the mode of dest can be either SImode or DImode. */
814 rtx tmp_reg = dest;
815 machine_mode mode = GET_MODE (dest);
817 gcc_assert (mode == Pmode || mode == ptr_mode);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 emit_insn (gen_add_losym (dest, tmp_reg, imm));
824 return;
827 case SYMBOL_TINY_ABSOLUTE:
828 emit_insn (gen_rtx_SET (Pmode, dest, imm));
829 return;
831 case SYMBOL_SMALL_GOT:
833 /* In ILP32, the mode of dest can be either SImode or DImode,
834 while the got entry is always of SImode size. The mode of
835 dest depends on how dest is used: if dest is assigned to a
836 pointer (e.g. in the memory), it has SImode; it may have
837 DImode if dest is dereferenced to access the memory.
838 This is why we have to handle three different ldr_got_small
839 patterns here (two patterns for ILP32). */
840 rtx tmp_reg = dest;
841 machine_mode mode = GET_MODE (dest);
843 if (can_create_pseudo_p ())
844 tmp_reg = gen_reg_rtx (mode);
846 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
847 if (mode == ptr_mode)
849 if (mode == DImode)
850 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
851 else
852 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
854 else
856 gcc_assert (mode == Pmode);
857 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
860 return;
863 case SYMBOL_SMALL_TLSGD:
865 rtx_insn *insns;
866 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
868 start_sequence ();
869 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
870 insns = get_insns ();
871 end_sequence ();
873 RTL_CONST_CALL_P (insns) = 1;
874 emit_libcall_block (insns, dest, result, imm);
875 return;
878 case SYMBOL_SMALL_TLSDESC:
880 machine_mode mode = GET_MODE (dest);
881 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
882 rtx tp;
884 gcc_assert (mode == Pmode || mode == ptr_mode);
886 /* In ILP32, the got entry is always of SImode size. Unlike
887 small GOT, the dest is fixed at reg 0. */
888 if (TARGET_ILP32)
889 emit_insn (gen_tlsdesc_small_si (imm));
890 else
891 emit_insn (gen_tlsdesc_small_di (imm));
892 tp = aarch64_load_tp (NULL);
894 if (mode != Pmode)
895 tp = gen_lowpart (mode, tp);
897 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
898 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
899 return;
902 case SYMBOL_SMALL_GOTTPREL:
904 /* In ILP32, the mode of dest can be either SImode or DImode,
905 while the got entry is always of SImode size. The mode of
906 dest depends on how dest is used: if dest is assigned to a
907 pointer (e.g. in the memory), it has SImode; it may have
908 DImode if dest is dereferenced to access the memory.
909 This is why we have to handle three different tlsie_small
910 patterns here (two patterns for ILP32). */
911 machine_mode mode = GET_MODE (dest);
912 rtx tmp_reg = gen_reg_rtx (mode);
913 rtx tp = aarch64_load_tp (NULL);
915 if (mode == ptr_mode)
917 if (mode == DImode)
918 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
919 else
921 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
922 tp = gen_lowpart (mode, tp);
925 else
927 gcc_assert (mode == Pmode);
928 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
931 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
932 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
933 return;
936 case SYMBOL_SMALL_TPREL:
938 rtx tp = aarch64_load_tp (NULL);
940 if (GET_MODE (dest) != Pmode)
941 tp = gen_lowpart (GET_MODE (dest), tp);
943 emit_insn (gen_tlsle_small (dest, tp, imm));
944 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
945 return;
948 case SYMBOL_TINY_GOT:
949 emit_insn (gen_ldr_got_tiny (dest, imm));
950 return;
952 default:
953 gcc_unreachable ();
957 /* Emit a move from SRC to DEST. Assume that the move expanders can
958 handle all moves if !can_create_pseudo_p (). The distinction is
959 important because, unlike emit_move_insn, the move expanders know
960 how to force Pmode objects into the constant pool even when the
961 constant pool address is not itself legitimate. */
962 static rtx
963 aarch64_emit_move (rtx dest, rtx src)
965 return (can_create_pseudo_p ()
966 ? emit_move_insn (dest, src)
967 : emit_move_insn_1 (dest, src));
970 /* Split a 128-bit move operation into two 64-bit move operations,
971 taking care to handle partial overlap of register to register
972 copies. Special cases are needed when moving between GP regs and
973 FP regs. SRC can be a register, constant or memory; DST a register
974 or memory. If either operand is memory it must not have any side
975 effects. */
976 void
977 aarch64_split_128bit_move (rtx dst, rtx src)
979 rtx dst_lo, dst_hi;
980 rtx src_lo, src_hi;
982 machine_mode mode = GET_MODE (dst);
984 gcc_assert (mode == TImode || mode == TFmode);
985 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
986 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
988 if (REG_P (dst) && REG_P (src))
990 int src_regno = REGNO (src);
991 int dst_regno = REGNO (dst);
993 /* Handle FP <-> GP regs. */
994 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
996 src_lo = gen_lowpart (word_mode, src);
997 src_hi = gen_highpart (word_mode, src);
999 if (mode == TImode)
1001 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1002 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1004 else
1006 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1007 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1009 return;
1011 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1013 dst_lo = gen_lowpart (word_mode, dst);
1014 dst_hi = gen_highpart (word_mode, dst);
1016 if (mode == TImode)
1018 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1019 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1021 else
1023 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1024 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1026 return;
1030 dst_lo = gen_lowpart (word_mode, dst);
1031 dst_hi = gen_highpart (word_mode, dst);
1032 src_lo = gen_lowpart (word_mode, src);
1033 src_hi = gen_highpart_mode (word_mode, mode, src);
1035 /* At most one pairing may overlap. */
1036 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1038 aarch64_emit_move (dst_hi, src_hi);
1039 aarch64_emit_move (dst_lo, src_lo);
1041 else
1043 aarch64_emit_move (dst_lo, src_lo);
1044 aarch64_emit_move (dst_hi, src_hi);
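/* For example (illustrative), splitting a TImode copy from {x0,x1} into
   {x1,x2} overlaps in x1 (low half of the destination, high half of the
   source), so the high-half move is emitted first; copies between a GP
   register pair and an FP/SIMD register instead go through the
   movti{low,high}/movdi_ti{low,high} patterns used earlier in this
   function.  */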
1048 bool
1049 aarch64_split_128bit_move_p (rtx dst, rtx src)
1051 return (! REG_P (src)
1052 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1055 /* Split a complex SIMD combine. */
1057 void
1058 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1060 machine_mode src_mode = GET_MODE (src1);
1061 machine_mode dst_mode = GET_MODE (dst);
1063 gcc_assert (VECTOR_MODE_P (dst_mode));
1065 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1067 rtx (*gen) (rtx, rtx, rtx);
1069 switch (src_mode)
1071 case V8QImode:
1072 gen = gen_aarch64_simd_combinev8qi;
1073 break;
1074 case V4HImode:
1075 gen = gen_aarch64_simd_combinev4hi;
1076 break;
1077 case V2SImode:
1078 gen = gen_aarch64_simd_combinev2si;
1079 break;
1080 case V2SFmode:
1081 gen = gen_aarch64_simd_combinev2sf;
1082 break;
1083 case DImode:
1084 gen = gen_aarch64_simd_combinedi;
1085 break;
1086 case DFmode:
1087 gen = gen_aarch64_simd_combinedf;
1088 break;
1089 default:
1090 gcc_unreachable ();
1093 emit_insn (gen (dst, src1, src2));
1094 return;
1098 /* Split a complex SIMD move. */
1100 void
1101 aarch64_split_simd_move (rtx dst, rtx src)
1103 machine_mode src_mode = GET_MODE (src);
1104 machine_mode dst_mode = GET_MODE (dst);
1106 gcc_assert (VECTOR_MODE_P (dst_mode));
1108 if (REG_P (dst) && REG_P (src))
1110 rtx (*gen) (rtx, rtx);
1112 gcc_assert (VECTOR_MODE_P (src_mode));
1114 switch (src_mode)
1116 case V16QImode:
1117 gen = gen_aarch64_split_simd_movv16qi;
1118 break;
1119 case V8HImode:
1120 gen = gen_aarch64_split_simd_movv8hi;
1121 break;
1122 case V4SImode:
1123 gen = gen_aarch64_split_simd_movv4si;
1124 break;
1125 case V2DImode:
1126 gen = gen_aarch64_split_simd_movv2di;
1127 break;
1128 case V4SFmode:
1129 gen = gen_aarch64_split_simd_movv4sf;
1130 break;
1131 case V2DFmode:
1132 gen = gen_aarch64_split_simd_movv2df;
1133 break;
1134 default:
1135 gcc_unreachable ();
1138 emit_insn (gen (dst, src));
1139 return;
1143 static rtx
1144 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1146 if (can_create_pseudo_p ())
1147 return force_reg (mode, value);
1148 else
1150 x = aarch64_emit_move (x, value);
1151 return x;
1156 static rtx
1157 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1159 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1161 rtx high;
1162 /* Load the full offset into a register. This
1163 might be improvable in the future. */
1164 high = GEN_INT (offset);
1165 offset = 0;
1166 high = aarch64_force_temporary (mode, temp, high);
1167 reg = aarch64_force_temporary (mode, temp,
1168 gen_rtx_PLUS (mode, high, reg));
1170 return plus_constant (mode, reg, offset);
1173 static int
1174 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1175 machine_mode mode)
1177 unsigned HOST_WIDE_INT mask;
1178 int i;
1179 bool first;
1180 unsigned HOST_WIDE_INT val;
1181 bool subtargets;
1182 rtx subtarget;
1183 int one_match, zero_match, first_not_ffff_match;
1184 int num_insns = 0;
1186 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1188 if (generate)
1189 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1190 num_insns++;
1191 return num_insns;
1194 if (mode == SImode)
1196 /* We know we can't do this in 1 insn, and we must be able to do it
1197 in two; so don't mess around looking for sequences that don't buy
1198 us anything. */
1199 if (generate)
1201 emit_insn (gen_rtx_SET (VOIDmode, dest,
1202 GEN_INT (INTVAL (imm) & 0xffff)));
1203 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1204 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1206 num_insns += 2;
1207 return num_insns;
1210 /* Remaining cases are all for DImode. */
1212 val = INTVAL (imm);
1213 subtargets = optimize && can_create_pseudo_p ();
1215 one_match = 0;
1216 zero_match = 0;
1217 mask = 0xffff;
1218 first_not_ffff_match = -1;
1220 for (i = 0; i < 64; i += 16, mask <<= 16)
1222 if ((val & mask) == mask)
1223 one_match++;
1224 else
1226 if (first_not_ffff_match < 0)
1227 first_not_ffff_match = i;
1228 if ((val & mask) == 0)
1229 zero_match++;
1233 if (one_match == 2)
1235 /* Set one of the quarters and then insert back into result. */
1236 mask = 0xffffll << first_not_ffff_match;
1237 if (generate)
1239 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1240 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1241 GEN_INT ((val >> first_not_ffff_match)
1242 & 0xffff)));
1244 num_insns += 2;
1245 return num_insns;
1248 if (zero_match == 2)
1249 goto simple_sequence;
1251 mask = 0x0ffff0000UL;
1252 for (i = 16; i < 64; i += 16, mask <<= 16)
1254 HOST_WIDE_INT comp = mask & ~(mask - 1);
1256 if (aarch64_uimm12_shift (val - (val & mask)))
1258 if (generate)
1260 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1261 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1262 GEN_INT (val & mask)));
1263 emit_insn (gen_adddi3 (dest, subtarget,
1264 GEN_INT (val - (val & mask))));
1266 num_insns += 2;
1267 return num_insns;
1269 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1271 if (generate)
1273 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1274 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1275 GEN_INT ((val + comp) & mask)));
1276 emit_insn (gen_adddi3 (dest, subtarget,
1277 GEN_INT (val - ((val + comp) & mask))));
1279 num_insns += 2;
1280 return num_insns;
1282 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1284 if (generate)
1286 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1287 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1288 GEN_INT ((val - comp) | ~mask)));
1289 emit_insn (gen_adddi3 (dest, subtarget,
1290 GEN_INT (val - ((val - comp) | ~mask))));
1292 num_insns += 2;
1293 return num_insns;
1295 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1297 if (generate)
1299 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1300 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1301 GEN_INT (val | ~mask)));
1302 emit_insn (gen_adddi3 (dest, subtarget,
1303 GEN_INT (val - (val | ~mask))));
1305 num_insns += 2;
1306 return num_insns;
1310 /* See if we can do it by arithmetically combining two
1311 immediates. */
1312 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1314 int j;
1315 mask = 0xffff;
1317 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1318 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1320 if (generate)
1322 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1323 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1324 GEN_INT (aarch64_bitmasks[i])));
1325 emit_insn (gen_adddi3 (dest, subtarget,
1326 GEN_INT (val - aarch64_bitmasks[i])));
1328 num_insns += 2;
1329 return num_insns;
1332 for (j = 0; j < 64; j += 16, mask <<= 16)
1334 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1336 if (generate)
1338 emit_insn (gen_rtx_SET (VOIDmode, dest,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1341 GEN_INT ((val >> j) & 0xffff)));
1343 num_insns += 2;
1344 return num_insns;
1349 /* See if we can do it by logically combining two immediates. */
1350 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1352 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1354 int j;
1356 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1357 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1359 if (generate)
1361 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1362 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1363 GEN_INT (aarch64_bitmasks[i])));
1364 emit_insn (gen_iordi3 (dest, subtarget,
1365 GEN_INT (aarch64_bitmasks[j])));
1367 num_insns += 2;
1368 return num_insns;
1371 else if ((val & aarch64_bitmasks[i]) == val)
1373 int j;
1375 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1376 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1378 if (generate)
1380 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1381 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1382 GEN_INT (aarch64_bitmasks[j])));
1383 emit_insn (gen_anddi3 (dest, subtarget,
1384 GEN_INT (aarch64_bitmasks[i])));
1386 num_insns += 2;
1387 return num_insns;
1392 if (one_match > zero_match)
1394 /* Set either first three quarters or all but the third. */
1395 mask = 0xffffll << (16 - first_not_ffff_match);
1396 if (generate)
1397 emit_insn (gen_rtx_SET (VOIDmode, dest,
1398 GEN_INT (val | mask | 0xffffffff00000000ull)));
1399 num_insns ++;
1401 /* Now insert other two quarters. */
1402 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1403 i < 64; i += 16, mask <<= 16)
1405 if ((val & mask) != mask)
1407 if (generate)
1408 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1409 GEN_INT ((val >> i) & 0xffff)));
1410 num_insns ++;
1413 return num_insns;
1416 simple_sequence:
1417 first = true;
1418 mask = 0xffff;
1419 for (i = 0; i < 64; i += 16, mask <<= 16)
1421 if ((val & mask) != 0)
1423 if (first)
1425 if (generate)
1426 emit_insn (gen_rtx_SET (VOIDmode, dest,
1427 GEN_INT (val & mask)));
1428 num_insns ++;
1429 first = false;
1431 else
1433 if (generate)
1434 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1435 GEN_INT ((val >> i) & 0xffff)));
1436 num_insns ++;
1441 return num_insns;
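/* Illustrative DImode cases for the routine above: 0x0000000012340000 is a
   single MOVZ; 0xffff00001234ffff takes the one_match == 2 path, becoming
   the MOVN-representable 0xffff0000ffffffff followed by one MOVK of 0x1234;
   a fully general 64-bit constant falls through to the closing MOV plus
   up-to-three MOVK sequence.  */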
1445 void
1446 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1448 machine_mode mode = GET_MODE (dest);
1450 gcc_assert (mode == SImode || mode == DImode);
1452 /* Check on what type of symbol it is. */
1453 if (GET_CODE (imm) == SYMBOL_REF
1454 || GET_CODE (imm) == LABEL_REF
1455 || GET_CODE (imm) == CONST)
1457 rtx mem, base, offset;
1458 enum aarch64_symbol_type sty;
1460 /* If we have (const (plus symbol offset)), separate out the offset
1461 before we start classifying the symbol. */
1462 split_const (imm, &base, &offset);
1464 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1465 switch (sty)
1467 case SYMBOL_FORCE_TO_MEM:
1468 if (offset != const0_rtx
1469 && targetm.cannot_force_const_mem (mode, imm))
1471 gcc_assert (can_create_pseudo_p ());
1472 base = aarch64_force_temporary (mode, dest, base);
1473 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1474 aarch64_emit_move (dest, base);
1475 return;
1477 mem = force_const_mem (ptr_mode, imm);
1478 gcc_assert (mem);
1479 if (mode != ptr_mode)
1480 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1481 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1482 return;
1484 case SYMBOL_SMALL_TLSGD:
1485 case SYMBOL_SMALL_TLSDESC:
1486 case SYMBOL_SMALL_GOTTPREL:
1487 case SYMBOL_SMALL_GOT:
1488 case SYMBOL_TINY_GOT:
1489 if (offset != const0_rtx)
1491 gcc_assert (can_create_pseudo_p ());
1492 base = aarch64_force_temporary (mode, dest, base);
1493 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1494 aarch64_emit_move (dest, base);
1495 return;
1497 /* FALLTHRU */
1499 case SYMBOL_SMALL_TPREL:
1500 case SYMBOL_SMALL_ABSOLUTE:
1501 case SYMBOL_TINY_ABSOLUTE:
1502 aarch64_load_symref_appropriately (dest, imm, sty);
1503 return;
1505 default:
1506 gcc_unreachable ();
1510 if (!CONST_INT_P (imm))
1512 if (GET_CODE (imm) == HIGH)
1513 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1514 else
1516 rtx mem = force_const_mem (mode, imm);
1517 gcc_assert (mem);
1518 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1521 return;
1524 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1527 static bool
1528 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1529 tree exp ATTRIBUTE_UNUSED)
1531 /* Currently, always true. */
1532 return true;
1535 /* Implement TARGET_PASS_BY_REFERENCE. */
1537 static bool
1538 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1539 machine_mode mode,
1540 const_tree type,
1541 bool named ATTRIBUTE_UNUSED)
1543 HOST_WIDE_INT size;
1544 machine_mode dummymode;
1545 int nregs;
1547 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1548 size = (mode == BLKmode && type)
1549 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1551 /* Aggregates are passed by reference based on their size. */
1552 if (type && AGGREGATE_TYPE_P (type))
1554 size = int_size_in_bytes (type);
1557 /* Variable sized arguments are always returned by reference. */
1558 if (size < 0)
1559 return true;
1561 /* Can this be a candidate to be passed in fp/simd register(s)? */
1562 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1563 &dummymode, &nregs,
1564 NULL))
1565 return false;
1567 /* Arguments which are variable sized or larger than 2 registers are
1568 passed by reference unless they are a homogeneous floating point
1569 aggregate. */
1570 return size > 2 * UNITS_PER_WORD;
1573 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1574 static bool
1575 aarch64_return_in_msb (const_tree valtype)
1577 machine_mode dummy_mode;
1578 int dummy_int;
1580 /* Never happens in little-endian mode. */
1581 if (!BYTES_BIG_ENDIAN)
1582 return false;
1584 /* Only composite types smaller than or equal to 16 bytes can
1585 be potentially returned in registers. */
1586 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1587 || int_size_in_bytes (valtype) <= 0
1588 || int_size_in_bytes (valtype) > 16)
1589 return false;
1591 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1592 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1593 is always passed/returned in the least significant bits of fp/simd
1594 register(s). */
1595 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1596 &dummy_mode, &dummy_int, NULL))
1597 return false;
1599 return true;
1602 /* Implement TARGET_FUNCTION_VALUE.
1603 Define how to find the value returned by a function. */
1605 static rtx
1606 aarch64_function_value (const_tree type, const_tree func,
1607 bool outgoing ATTRIBUTE_UNUSED)
1609 machine_mode mode;
1610 int unsignedp;
1611 int count;
1612 machine_mode ag_mode;
1614 mode = TYPE_MODE (type);
1615 if (INTEGRAL_TYPE_P (type))
1616 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1618 if (aarch64_return_in_msb (type))
1620 HOST_WIDE_INT size = int_size_in_bytes (type);
1622 if (size % UNITS_PER_WORD != 0)
1624 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1625 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1629 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1630 &ag_mode, &count, NULL))
1632 if (!aarch64_composite_type_p (type, mode))
1634 gcc_assert (count == 1 && mode == ag_mode);
1635 return gen_rtx_REG (mode, V0_REGNUM);
1637 else
1639 int i;
1640 rtx par;
1642 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1643 for (i = 0; i < count; i++)
1645 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1646 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1647 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1648 XVECEXP (par, 0, i) = tmp;
1650 return par;
1653 else
1654 return gen_rtx_REG (mode, R0_REGNUM);
1657 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1658 Return true if REGNO is the number of a hard register in which the values
1659 of called function may come back. */
1661 static bool
1662 aarch64_function_value_regno_p (const unsigned int regno)
1664 /* Maximum of 16 bytes can be returned in the general registers. Examples
1665 of 16-byte return values are: 128-bit integers and 16-byte small
1666 structures (excluding homogeneous floating-point aggregates). */
1667 if (regno == R0_REGNUM || regno == R1_REGNUM)
1668 return true;
1670 /* Up to four fp/simd registers can return a function value, e.g. a
1671 homogeneous floating-point aggregate having four members. */
1672 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1673 return !TARGET_GENERAL_REGS_ONLY;
1675 return false;
1678 /* Implement TARGET_RETURN_IN_MEMORY.
1680 If the type T of the result of a function is such that
1681 void func (T arg)
1682 would require that arg be passed as a value in a register (or set of
1683 registers) according to the parameter passing rules, then the result
1684 is returned in the same registers as would be used for such an
1685 argument. */
1687 static bool
1688 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1690 HOST_WIDE_INT size;
1691 machine_mode ag_mode;
1692 int count;
1694 if (!AGGREGATE_TYPE_P (type)
1695 && TREE_CODE (type) != COMPLEX_TYPE
1696 && TREE_CODE (type) != VECTOR_TYPE)
1697 /* Simple scalar types are always returned in registers. */
1698 return false;
1700 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1701 type,
1702 &ag_mode,
1703 &count,
1704 NULL))
1705 return false;
1707 /* Types larger than 2 registers are returned in memory. */
1708 size = int_size_in_bytes (type);
1709 return (size < 0 || size > 2 * UNITS_PER_WORD);
1712 static bool
1713 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1714 const_tree type, int *nregs)
1716 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1717 return aarch64_vfp_is_call_or_return_candidate (mode,
1718 type,
1719 &pcum->aapcs_vfp_rmode,
1720 nregs,
1721 NULL);
1724 /* Given MODE and TYPE of a function argument, return the alignment in
1725 bits. The idea is to suppress any stronger alignment requested by
1726 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1727 This is a helper function for local use only. */
1729 static unsigned int
1730 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1732 unsigned int alignment;
1734 if (type)
1736 if (!integer_zerop (TYPE_SIZE (type)))
1738 if (TYPE_MODE (type) == mode)
1739 alignment = TYPE_ALIGN (type);
1740 else
1741 alignment = GET_MODE_ALIGNMENT (mode);
1743 else
1744 alignment = 0;
1746 else
1747 alignment = GET_MODE_ALIGNMENT (mode);
1749 return alignment;
1752 /* Layout a function argument according to the AAPCS64 rules. The rule
1753 numbers refer to the rule numbers in the AAPCS64. */
1755 static void
1756 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1757 const_tree type,
1758 bool named ATTRIBUTE_UNUSED)
1760 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1761 int ncrn, nvrn, nregs;
1762 bool allocate_ncrn, allocate_nvrn;
1763 HOST_WIDE_INT size;
1765 /* We need to do this once per argument. */
1766 if (pcum->aapcs_arg_processed)
1767 return;
1769 pcum->aapcs_arg_processed = true;
1771 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1772 size
1773 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1774 UNITS_PER_WORD);
1776 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1777 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1778 mode,
1779 type,
1780 &nregs);
1782 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1783 The following code thus handles passing by SIMD/FP registers first. */
1785 nvrn = pcum->aapcs_nvrn;
1787 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1788 and homogeneous short-vector aggregates (HVA). */
1789 if (allocate_nvrn)
1791 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1793 pcum->aapcs_nextnvrn = nvrn + nregs;
1794 if (!aarch64_composite_type_p (type, mode))
1796 gcc_assert (nregs == 1);
1797 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1799 else
1801 rtx par;
1802 int i;
1803 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1804 for (i = 0; i < nregs; i++)
1806 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1807 V0_REGNUM + nvrn + i);
1808 tmp = gen_rtx_EXPR_LIST
1809 (VOIDmode, tmp,
1810 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1811 XVECEXP (par, 0, i) = tmp;
1813 pcum->aapcs_reg = par;
1815 return;
1817 else
1819 /* C.3 NSRN is set to 8. */
1820 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1821 goto on_stack;
1825 ncrn = pcum->aapcs_ncrn;
1826 nregs = size / UNITS_PER_WORD;
1828 /* C6 - C9, though the sign and zero extension semantics are
1829 handled elsewhere. This is the case where the argument fits
1830 entirely in general registers. */
1831 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1833 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1835 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1837 /* C.8 if the argument has an alignment of 16 then the NGRN is
1838 rounded up to the next even number. */
1839 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1841 ++ncrn;
1842 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1844 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1845 A reg is still generated for it, but the caller should be smart
1846 enough not to use it. */
1847 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1849 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1851 else
1853 rtx par;
1854 int i;
1856 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1857 for (i = 0; i < nregs; i++)
1859 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1860 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1861 GEN_INT (i * UNITS_PER_WORD));
1862 XVECEXP (par, 0, i) = tmp;
1864 pcum->aapcs_reg = par;
1867 pcum->aapcs_nextncrn = ncrn + nregs;
1868 return;
1871 /* C.11 */
1872 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1874 /* The argument is passed on stack; record the needed number of words for
1875 this argument and align the total size if necessary. */
1876 on_stack:
1877 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1878 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1879 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1880 16 / UNITS_PER_WORD);
1881 return;
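/* For example (illustrative), for "void f (int a, __int128 b)" A is passed
   in W0, while B has 16-byte alignment, so rule C.8 above rounds the NGRN
   up from 1 to 2 and B is passed in the even-aligned pair X2/X3 rather
   than X1/X2.  */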
1884 /* Implement TARGET_FUNCTION_ARG. */
1886 static rtx
1887 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1888 const_tree type, bool named)
1890 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1891 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1893 if (mode == VOIDmode)
1894 return NULL_RTX;
1896 aarch64_layout_arg (pcum_v, mode, type, named);
1897 return pcum->aapcs_reg;
1900 void
1901 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1902 const_tree fntype ATTRIBUTE_UNUSED,
1903 rtx libname ATTRIBUTE_UNUSED,
1904 const_tree fndecl ATTRIBUTE_UNUSED,
1905 unsigned n_named ATTRIBUTE_UNUSED)
1907 pcum->aapcs_ncrn = 0;
1908 pcum->aapcs_nvrn = 0;
1909 pcum->aapcs_nextncrn = 0;
1910 pcum->aapcs_nextnvrn = 0;
1911 pcum->pcs_variant = ARM_PCS_AAPCS64;
1912 pcum->aapcs_reg = NULL_RTX;
1913 pcum->aapcs_arg_processed = false;
1914 pcum->aapcs_stack_words = 0;
1915 pcum->aapcs_stack_size = 0;
1917 return;
1920 static void
1921 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1922 machine_mode mode,
1923 const_tree type,
1924 bool named)
1926 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1927 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1929 aarch64_layout_arg (pcum_v, mode, type, named);
1930 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1931 != (pcum->aapcs_stack_words != 0));
1932 pcum->aapcs_arg_processed = false;
1933 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1934 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1935 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1936 pcum->aapcs_stack_words = 0;
1937 pcum->aapcs_reg = NULL_RTX;
1941 bool
1942 aarch64_function_arg_regno_p (unsigned regno)
1944 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1945 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1948 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1949 PARM_BOUNDARY bits of alignment, but will be given anything up
1950 to STACK_BOUNDARY bits if the type requires it. This makes sure
1951 that both before and after the layout of each argument, the Next
1952 Stacked Argument Address (NSAA) will have a minimum alignment of
1953 8 bytes. */
1955 static unsigned int
1956 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1958 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1960 if (alignment < PARM_BOUNDARY)
1961 alignment = PARM_BOUNDARY;
1962 if (alignment > STACK_BOUNDARY)
1963 alignment = STACK_BOUNDARY;
1964 return alignment;
1967 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1969 Return true if an argument passed on the stack should be padded upwards,
1970 i.e. if the least-significant byte of the stack slot has useful data.
1972 Small aggregate types are placed in the lowest memory address.
1974 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1976 bool
1977 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1979 /* On little-endian targets, the least significant byte of every stack
1980 argument is passed at the lowest byte address of the stack slot. */
1981 if (!BYTES_BIG_ENDIAN)
1982 return true;
1984 /* Otherwise, integral, floating-point and pointer types are padded downward:
1985 the least significant byte of a stack argument is passed at the highest
1986 byte address of the stack slot. */
1987 if (type
1988 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1989 || POINTER_TYPE_P (type))
1990 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1991 return false;
1993 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1994 return true;
1997 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1999 It specifies padding for the last (may also be the only)
2000 element of a block move between registers and memory.
2001 Assuming the block is in memory, padding upward means that
2002 the last element is padded after its highest significant byte,
2003 while in downward padding, the last element is padded at
2004 its least significant byte side.
2006 Small aggregates and small complex types are always padded
2007 upwards.
2009 We don't need to worry about homogeneous floating-point or
2010 short-vector aggregates; their move is not affected by the
2011 padding direction determined here. Regardless of endianness,
2012 each element of such an aggregate is put in the least
2013 significant bits of a fp/simd register.
2015 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2016 register has useful data, and return the opposite if the most
2017 significant byte does. */
2019 bool
2020 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2021 bool first ATTRIBUTE_UNUSED)
2024 /* Small composite types are always padded upward. */
2025 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2027 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2028 : GET_MODE_SIZE (mode));
2029 if (size < 2 * UNITS_PER_WORD)
2030 return true;
2033 /* Otherwise, use the default padding. */
2034 return !BYTES_BIG_ENDIAN;
2037 static machine_mode
2038 aarch64_libgcc_cmp_return_mode (void)
2040 return SImode;
2043 static bool
2044 aarch64_frame_pointer_required (void)
2046 /* In aarch64_override_options_after_change
2047 flag_omit_leaf_frame_pointer turns off the frame pointer by
2048 default. Turn it back on now if we've not got a leaf
2049 function. */
2050 if (flag_omit_leaf_frame_pointer
2051 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2052 return true;
2054 return false;
2057 /* Mark the registers that need to be saved by the callee and calculate
2058 the size of the callee-saved registers area and frame record (both FP
2059 and LR may be omitted). */
2060 static void
2061 aarch64_layout_frame (void)
2063 HOST_WIDE_INT offset = 0;
2064 int regno;
2066 if (reload_completed && cfun->machine->frame.laid_out)
2067 return;
2069 #define SLOT_NOT_REQUIRED (-2)
2070 #define SLOT_REQUIRED (-1)
2072 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2073 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2075 /* First mark all the registers that really need to be saved... */
2076 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2077 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2079 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2080 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2082 /* ... that includes the eh data registers (if needed)... */
2083 if (crtl->calls_eh_return)
2084 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2085 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2086 = SLOT_REQUIRED;
2088 /* ... and any callee saved register that dataflow says is live. */
2089 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2090 if (df_regs_ever_live_p (regno)
2091 && (regno == R30_REGNUM
2092 || !call_used_regs[regno]))
2093 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2095 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2096 if (df_regs_ever_live_p (regno)
2097 && !call_used_regs[regno])
2098 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2100 if (frame_pointer_needed)
2102 /* FP and LR are placed in the linkage record. */
2103 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2104 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2105 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2106 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2107 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2108 offset += 2 * UNITS_PER_WORD;
2111 /* Now assign stack slots for them. */
2112 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2113 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2115 cfun->machine->frame.reg_offset[regno] = offset;
2116 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2117 cfun->machine->frame.wb_candidate1 = regno;
2118 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2119 cfun->machine->frame.wb_candidate2 = regno;
2120 offset += UNITS_PER_WORD;
2123 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2124 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2126 cfun->machine->frame.reg_offset[regno] = offset;
2127 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2128 cfun->machine->frame.wb_candidate1 = regno;
2129 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2130 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2131 cfun->machine->frame.wb_candidate2 = regno;
2132 offset += UNITS_PER_WORD;
2135 cfun->machine->frame.padding0 =
2136 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2137 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2139 cfun->machine->frame.saved_regs_size = offset;
2141 cfun->machine->frame.hard_fp_offset
2142 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2143 + get_frame_size ()
2144 + cfun->machine->frame.saved_regs_size,
2145 STACK_BOUNDARY / BITS_PER_UNIT);
2147 cfun->machine->frame.frame_size
2148 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2149 + crtl->outgoing_args_size,
2150 STACK_BOUNDARY / BITS_PER_UNIT);
2152 cfun->machine->frame.laid_out = true;
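/* Editor's sketch, not part of this file: the offset arithmetic above for a
   hypothetical function with 24 bytes of locals, two callee-saved GPRs
   besides FP/LR, no varargs save area and no outgoing arguments, assuming
   the frame pointer is needed, UNITS_PER_WORD == 8 and a 16-byte
   STACK_BOUNDARY.  round_up stands in for AARCH64_ROUND_UP.  */
#include <stdio.h>

static long
round_up (long x, long align)
{
  return (x + align - 1) & -align;
}

int
main (void)
{
  long saved = 2 * 8 + 2 * 8;                     /* FP/LR plus two GPRs */
  long saved_regs_size = round_up (saved, 16);    /* 32 */
  long hard_fp_offset = round_up (0 + 24 + saved_regs_size, 16);  /* 64 */
  long frame_size = round_up (hard_fp_offset + 0, 16);            /* 64 */
  printf ("%ld %ld %ld\n", saved_regs_size, hard_fp_offset, frame_size);
  return 0;
}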
2155 static bool
2156 aarch64_register_saved_on_entry (int regno)
2158 return cfun->machine->frame.reg_offset[regno] >= 0;
2161 static unsigned
2162 aarch64_next_callee_save (unsigned regno, unsigned limit)
2164 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2165 regno ++;
2166 return regno;
2169 static void
2170 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2171 HOST_WIDE_INT adjustment)
2173 rtx base_rtx = stack_pointer_rtx;
2174 rtx insn, reg, mem;
2176 reg = gen_rtx_REG (mode, regno);
2177 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2178 plus_constant (Pmode, base_rtx, -adjustment));
2179 mem = gen_rtx_MEM (mode, mem);
2181 insn = emit_move_insn (mem, reg);
2182 RTX_FRAME_RELATED_P (insn) = 1;
2185 static rtx
2186 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2187 HOST_WIDE_INT adjustment)
2189 switch (mode)
2191 case DImode:
2192 return gen_storewb_pairdi_di (base, base, reg, reg2,
2193 GEN_INT (-adjustment),
2194 GEN_INT (UNITS_PER_WORD - adjustment));
2195 case DFmode:
2196 return gen_storewb_pairdf_di (base, base, reg, reg2,
2197 GEN_INT (-adjustment),
2198 GEN_INT (UNITS_PER_WORD - adjustment));
2199 default:
2200 gcc_unreachable ();
2204 static void
2205 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2206 unsigned regno2, HOST_WIDE_INT adjustment)
2208 rtx_insn *insn;
2209 rtx reg1 = gen_rtx_REG (mode, regno1);
2210 rtx reg2 = gen_rtx_REG (mode, regno2);
2212 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2213 reg2, adjustment));
2214 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2215 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2216 RTX_FRAME_RELATED_P (insn) = 1;
2219 static rtx
2220 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2221 HOST_WIDE_INT adjustment)
2223 switch (mode)
2225 case DImode:
2226 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2227 GEN_INT (UNITS_PER_WORD));
2228 case DFmode:
2229 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2230 GEN_INT (UNITS_PER_WORD));
2231 default:
2232 gcc_unreachable ();
2236 static rtx
2237 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2238 rtx reg2)
2240 switch (mode)
2242 case DImode:
2243 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2245 case DFmode:
2246 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2248 default:
2249 gcc_unreachable ();
2253 static rtx
2254 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2255 rtx mem2)
2257 switch (mode)
2259 case DImode:
2260 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2262 case DFmode:
2263 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2265 default:
2266 gcc_unreachable ();
2271 static void
2272 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2273 unsigned start, unsigned limit, bool skip_wb)
2275 rtx_insn *insn;
2276 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2277 ? gen_frame_mem : gen_rtx_MEM);
2278 unsigned regno;
2279 unsigned regno2;
2281 for (regno = aarch64_next_callee_save (start, limit);
2282 regno <= limit;
2283 regno = aarch64_next_callee_save (regno + 1, limit))
2285 rtx reg, mem;
2286 HOST_WIDE_INT offset;
2288 if (skip_wb
2289 && (regno == cfun->machine->frame.wb_candidate1
2290 || regno == cfun->machine->frame.wb_candidate2))
2291 continue;
2293 reg = gen_rtx_REG (mode, regno);
2294 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2295 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2296 offset));
2298 regno2 = aarch64_next_callee_save (regno + 1, limit);
2300 if (regno2 <= limit
2301 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2302 == cfun->machine->frame.reg_offset[regno2]))
2305 rtx reg2 = gen_rtx_REG (mode, regno2);
2306 rtx mem2;
2308 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2309 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2310 offset));
2311 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2312 reg2));
2314 /* The first part of a frame-related parallel insn is
2315 always assumed to be relevant to the frame
2316 calculations; subsequent parts, are only
2317 frame-related if explicitly marked. */
2318 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2319 regno = regno2;
2321 else
2322 insn = emit_move_insn (mem, reg);
2324 RTX_FRAME_RELATED_P (insn) = 1;
2328 static void
2329 aarch64_restore_callee_saves (machine_mode mode,
2330 HOST_WIDE_INT start_offset, unsigned start,
2331 unsigned limit, bool skip_wb, rtx *cfi_ops)
2333 rtx base_rtx = stack_pointer_rtx;
2334 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2335 ? gen_frame_mem : gen_rtx_MEM);
2336 unsigned regno;
2337 unsigned regno2;
2338 HOST_WIDE_INT offset;
2340 for (regno = aarch64_next_callee_save (start, limit);
2341 regno <= limit;
2342 regno = aarch64_next_callee_save (regno + 1, limit))
2344 rtx reg, mem;
2346 if (skip_wb
2347 && (regno == cfun->machine->frame.wb_candidate1
2348 || regno == cfun->machine->frame.wb_candidate2))
2349 continue;
2351 reg = gen_rtx_REG (mode, regno);
2352 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2353 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2355 regno2 = aarch64_next_callee_save (regno + 1, limit);
2357 if (regno2 <= limit
2358 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2359 == cfun->machine->frame.reg_offset[regno2]))
2361 rtx reg2 = gen_rtx_REG (mode, regno2);
2362 rtx mem2;
2364 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2365 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2366 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2368 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2369 regno = regno2;
2371 else
2372 emit_move_insn (reg, mem);
2373 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2377 /* AArch64 stack frames generated by this compiler look like:
2379 +-------------------------------+
2381 | incoming stack arguments |
2383 +-------------------------------+
2384 | | <-- incoming stack pointer (aligned)
2385 | callee-allocated save area |
2386 | for register varargs |
2388 +-------------------------------+
2389 | local variables | <-- frame_pointer_rtx
2391 +-------------------------------+
2392 | padding0 | \
2393 +-------------------------------+ |
2394 | callee-saved registers | | frame.saved_regs_size
2395 +-------------------------------+ |
2396 | LR' | |
2397 +-------------------------------+ |
2398 | FP' | / <- hard_frame_pointer_rtx (aligned)
2399 +-------------------------------+
2400 | dynamic allocation |
2401 +-------------------------------+
2402 | padding |
2403 +-------------------------------+
2404 | outgoing stack arguments | <-- arg_pointer
2406 +-------------------------------+
2407 | | <-- stack_pointer_rtx (aligned)
2409 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2410 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2411 unchanged. */
2413 /* Generate the prologue instructions for entry into a function.
2414 Establish the stack frame by decreasing the stack pointer with a
2415 properly calculated size and, if necessary, create a frame record
2416 filled with the values of LR and previous frame pointer. The
2417 current FP is also set up if it is in use. */
2419 void
2420 aarch64_expand_prologue (void)
2422 /* sub sp, sp, #<frame_size>
2423 stp {fp, lr}, [sp, #<frame_size> - 16]
2424 add fp, sp, #<frame_size> - hardfp_offset
2425 stp {cs_reg}, [fp, #-16] etc.
2427 sub sp, sp, <final_adjustment_if_any>
2429 HOST_WIDE_INT frame_size, offset;
2430 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2431 HOST_WIDE_INT hard_fp_offset;
2432 rtx_insn *insn;
2434 aarch64_layout_frame ();
2436 offset = frame_size = cfun->machine->frame.frame_size;
2437 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2438 fp_offset = frame_size - hard_fp_offset;
2440 if (flag_stack_usage_info)
2441 current_function_static_stack_size = frame_size;
2443 /* Store pairs and load pairs have an immediate offset range of only -512 to 504. */
2444 if (offset >= 512)
2446 /* When the frame is large, an initial decrement of the stack
2447 pointer is emitted to skip over the callee-allocated save area for
2448 register varargs, the local variable area and/or the callee-saved
2449 register area. This allows the pre-index write-back
2450 store pair instructions to be used to set up the stack frame
2451 efficiently. */
2452 offset = hard_fp_offset;
2453 if (offset >= 512)
2454 offset = cfun->machine->frame.saved_regs_size;
2456 frame_size -= (offset + crtl->outgoing_args_size);
2457 fp_offset = 0;
2459 if (frame_size >= 0x1000000)
2461 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2462 emit_move_insn (op0, GEN_INT (-frame_size));
2463 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2465 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2466 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2467 plus_constant (Pmode, stack_pointer_rtx,
2468 -frame_size)));
2469 RTX_FRAME_RELATED_P (insn) = 1;
2471 else if (frame_size > 0)
2473 int hi_ofs = frame_size & 0xfff000;
2474 int lo_ofs = frame_size & 0x000fff;
2476 if (hi_ofs)
2478 insn = emit_insn (gen_add2_insn
2479 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2480 RTX_FRAME_RELATED_P (insn) = 1;
2482 if (lo_ofs)
2484 insn = emit_insn (gen_add2_insn
2485 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2486 RTX_FRAME_RELATED_P (insn) = 1;
2490 else
2491 frame_size = -1;
2493 if (offset > 0)
2495 bool skip_wb = false;
2497 if (frame_pointer_needed)
2499 skip_wb = true;
2501 if (fp_offset)
2503 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2504 GEN_INT (-offset)));
2505 RTX_FRAME_RELATED_P (insn) = 1;
2507 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2508 R30_REGNUM, false);
2510 else
2511 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2513 /* Set up frame pointer to point to the location of the
2514 previous frame pointer on the stack. */
2515 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2516 stack_pointer_rtx,
2517 GEN_INT (fp_offset)));
2518 RTX_FRAME_RELATED_P (insn) = 1;
2519 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2521 else
2523 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2524 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2526 if (fp_offset
2527 || reg1 == FIRST_PSEUDO_REGISTER
2528 || (reg2 == FIRST_PSEUDO_REGISTER
2529 && offset >= 256))
2531 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2532 GEN_INT (-offset)));
2533 RTX_FRAME_RELATED_P (insn) = 1;
2535 else
2537 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2539 skip_wb = true;
2541 if (reg2 == FIRST_PSEUDO_REGISTER)
2542 aarch64_pushwb_single_reg (mode1, reg1, offset);
2543 else
2544 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2548 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2549 skip_wb);
2550 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2551 skip_wb);
2554 /* When offset >= 512,
2555 sub sp, sp, #<outgoing_args_size>. */
2556 if (frame_size > -1)
2558 if (crtl->outgoing_args_size > 0)
2560 insn = emit_insn (gen_add2_insn
2561 (stack_pointer_rtx,
2562 GEN_INT (- crtl->outgoing_args_size)));
2563 RTX_FRAME_RELATED_P (insn) = 1;
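/* Editor's sketch, not part of this file: the hi/lo immediate split used
   above for initial stack adjustments below 0x1000000, shown for a
   hypothetical frame size.  */
#include <stdio.h>

int
main (void)
{
  unsigned long frame_size = 0x12345UL;           /* hypothetical */
  unsigned long hi_ofs = frame_size & 0xfff000;   /* 0x12000: sub sp, sp, #0x12, lsl #12 */
  unsigned long lo_ofs = frame_size & 0x000fff;   /* 0x345:   sub sp, sp, #0x345 */
  printf ("0x%lx = 0x%lx + 0x%lx\n", frame_size, hi_ofs, lo_ofs);
  return 0;
}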
2568 /* Return TRUE if we can use a simple_return insn.
2570 This function checks whether the callee-saved stack is empty, which
2571 means no restore actions are needed. The pro_and_epilogue pass uses
2572 this to check whether the shrink-wrapping optimization is feasible. */
2574 bool
2575 aarch64_use_return_insn_p (void)
2577 if (!reload_completed)
2578 return false;
2580 if (crtl->profile)
2581 return false;
2583 aarch64_layout_frame ();
2585 return cfun->machine->frame.frame_size == 0;
2588 /* Generate the epilogue instructions for returning from a function. */
2589 void
2590 aarch64_expand_epilogue (bool for_sibcall)
2592 HOST_WIDE_INT frame_size, offset;
2593 HOST_WIDE_INT fp_offset;
2594 HOST_WIDE_INT hard_fp_offset;
2595 rtx_insn *insn;
2596 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2597 bool need_barrier_p = (get_frame_size () != 0
2598 || cfun->machine->frame.saved_varargs_size);
2600 aarch64_layout_frame ();
2602 offset = frame_size = cfun->machine->frame.frame_size;
2603 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2604 fp_offset = frame_size - hard_fp_offset;
2606 /* Store pairs and load pairs have an immediate offset range of only -512 to 504. */
2607 if (offset >= 512)
2609 offset = hard_fp_offset;
2610 if (offset >= 512)
2611 offset = cfun->machine->frame.saved_regs_size;
2613 frame_size -= (offset + crtl->outgoing_args_size);
2614 fp_offset = 0;
2615 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2617 insn = emit_insn (gen_add2_insn
2618 (stack_pointer_rtx,
2619 GEN_INT (crtl->outgoing_args_size)));
2620 RTX_FRAME_RELATED_P (insn) = 1;
2623 else
2624 frame_size = -1;
2626 /* If there were outgoing arguments or we've done dynamic stack
2627 allocation, then restore the stack pointer from the frame
2628 pointer. This is at most one insn and more efficient than using
2629 GCC's internal mechanism. */
2630 if (frame_pointer_needed
2631 && (crtl->outgoing_args_size || cfun->calls_alloca))
2633 if (cfun->calls_alloca)
2634 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2636 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2637 hard_frame_pointer_rtx,
2638 GEN_INT (0)));
2639 offset = offset - fp_offset;
2642 if (offset > 0)
2644 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2645 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2646 bool skip_wb = true;
2647 rtx cfi_ops = NULL;
2649 if (frame_pointer_needed)
2650 fp_offset = 0;
2651 else if (fp_offset
2652 || reg1 == FIRST_PSEUDO_REGISTER
2653 || (reg2 == FIRST_PSEUDO_REGISTER
2654 && offset >= 256))
2655 skip_wb = false;
2657 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2658 skip_wb, &cfi_ops);
2659 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2660 skip_wb, &cfi_ops);
2662 if (need_barrier_p)
2663 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2665 if (skip_wb)
2667 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2668 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2670 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2671 if (reg2 == FIRST_PSEUDO_REGISTER)
2673 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2674 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2675 mem = gen_rtx_MEM (mode1, mem);
2676 insn = emit_move_insn (rreg1, mem);
2678 else
2680 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2682 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2683 insn = emit_insn (aarch64_gen_loadwb_pair
2684 (mode1, stack_pointer_rtx, rreg1,
2685 rreg2, offset));
2688 else
2690 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2691 GEN_INT (offset)));
2694 /* Reset the CFA to be SP + FRAME_SIZE. */
2695 rtx new_cfa = stack_pointer_rtx;
2696 if (frame_size > 0)
2697 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2698 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2699 REG_NOTES (insn) = cfi_ops;
2700 RTX_FRAME_RELATED_P (insn) = 1;
2703 if (frame_size > 0)
2705 if (need_barrier_p)
2706 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2708 if (frame_size >= 0x1000000)
2710 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2711 emit_move_insn (op0, GEN_INT (frame_size));
2712 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2714 else
2716 int hi_ofs = frame_size & 0xfff000;
2717 int lo_ofs = frame_size & 0x000fff;
2719 if (hi_ofs && lo_ofs)
2721 insn = emit_insn (gen_add2_insn
2722 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2723 RTX_FRAME_RELATED_P (insn) = 1;
2724 frame_size = lo_ofs;
2726 insn = emit_insn (gen_add2_insn
2727 (stack_pointer_rtx, GEN_INT (frame_size)));
2730 /* Reset the CFA to be SP + 0. */
2731 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2732 RTX_FRAME_RELATED_P (insn) = 1;
2735 /* Stack adjustment for exception handler. */
2736 if (crtl->calls_eh_return)
2738 /* We need to unwind the stack by the offset computed by
2739 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2740 to be SP; letting the CFA move during this adjustment
2741 is just as correct as retaining the CFA from the body
2742 of the function. Therefore, do nothing special. */
2743 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2746 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2747 if (!for_sibcall)
2748 emit_jump_insn (ret_rtx);
2751 /* Return the place to copy the exception unwinding return address to.
2752 This will probably be a stack slot, but could (in theory) be the
2753 return register. */
2754 rtx
2755 aarch64_final_eh_return_addr (void)
2757 HOST_WIDE_INT fp_offset;
2759 aarch64_layout_frame ();
2761 fp_offset = cfun->machine->frame.frame_size
2762 - cfun->machine->frame.hard_fp_offset;
2764 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2765 return gen_rtx_REG (DImode, LR_REGNUM);
2767 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2768 result in a store to save LR introduced by builtin_eh_return () being
2769 incorrectly deleted because the alias is not detected.
2770 So in the calculation of the address to copy the exception unwinding
2771 return address to, we note 2 cases.
2772 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2773 we return a SP-relative location since all the addresses are SP-relative
2774 in this case. This prevents the store from being optimized away.
2775 If the fp_offset is not 0, then the addresses will be FP-relative and
2776 therefore we return a FP-relative location. */
2778 if (frame_pointer_needed)
2780 if (fp_offset)
2781 return gen_frame_mem (DImode,
2782 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2783 else
2784 return gen_frame_mem (DImode,
2785 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2788 /* If FP is not needed, we calculate the location of LR, which would be
2789 at the top of the saved registers block. */
2791 return gen_frame_mem (DImode,
2792 plus_constant (Pmode,
2793 stack_pointer_rtx,
2794 fp_offset
2795 + cfun->machine->frame.saved_regs_size
2796 - 2 * UNITS_PER_WORD));
2799 /* Possibly output code to build up a constant in a register. For
2800 the benefit of the costs infrastructure, returns the number of
2801 instructions which would be emitted. GENERATE inhibits or
2802 enables code generation. */
2804 static int
2805 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2807 int insns = 0;
2809 if (aarch64_bitmask_imm (val, DImode))
2811 if (generate)
2812 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2813 insns = 1;
2815 else
2817 int i;
2818 int ncount = 0;
2819 int zcount = 0;
2820 HOST_WIDE_INT valp = val >> 16;
2821 HOST_WIDE_INT valm;
2822 HOST_WIDE_INT tval;
2824 for (i = 16; i < 64; i += 16)
2826 valm = (valp & 0xffff);
2828 if (valm != 0)
2829 ++ zcount;
2831 if (valm != 0xffff)
2832 ++ ncount;
2834 valp >>= 16;
2837 /* zcount is the number of additional MOVK instructions
2838 required if the constant is built up with an initial MOVZ instruction,
2839 while ncount is the number of MOVK instructions required if starting
2840 with a MOVN instruction. Choose the sequence that needs the smaller
2841 number of instructions, preferring MOVZ when the two counts are
2842 equal. */
2843 if (ncount < zcount)
2845 if (generate)
2846 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2847 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2848 tval = 0xffff;
2849 insns++;
2851 else
2853 if (generate)
2854 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2855 GEN_INT (val & 0xffff));
2856 tval = 0;
2857 insns++;
2860 val >>= 16;
2862 for (i = 16; i < 64; i += 16)
2864 if ((val & 0xffff) != tval)
2866 if (generate)
2867 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2868 GEN_INT (i),
2869 GEN_INT (val & 0xffff)));
2870 insns++;
2872 val >>= 16;
2875 return insns;
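/* Editor's sketch, not part of this file: counting the MOVK instructions
   needed after an initial MOVZ versus an initial MOVN, as the loop above
   does, for a hypothetical constant.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t val = 0xffffffffffff1234ULL;   /* hypothetical constant */
  int zcount = 0, ncount = 0;
  for (int i = 16; i < 64; i += 16)
    {
      uint64_t half = (val >> i) & 0xffff;
      if (half != 0)
        zcount++;        /* MOVK needed after a MOVZ of the low halfword */
      if (half != 0xffff)
        ncount++;        /* MOVK needed after a MOVN of the low halfword */
    }
  /* Here zcount == 3 and ncount == 0, so the MOVN sequence wins.  */
  printf ("movz path: %d insns, movn path: %d insns\n", 1 + zcount, 1 + ncount);
  return 0;
}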
2878 static void
2879 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2881 HOST_WIDE_INT mdelta = delta;
2882 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2883 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2885 if (mdelta < 0)
2886 mdelta = -mdelta;
2888 if (mdelta >= 4096 * 4096)
2890 (void) aarch64_build_constant (scratchreg, delta, true);
2891 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2893 else if (mdelta > 0)
2895 if (mdelta >= 4096)
2897 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2898 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2899 if (delta < 0)
2900 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2901 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2902 else
2903 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2904 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2906 if (mdelta % 4096 != 0)
2908 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2909 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2910 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
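/* Editor's sketch, not part of this file: the same split as plain
   arithmetic for a hypothetical delta below 4096 * 4096.  The 4096-multiple
   part is added with a shifted immediate and the remainder with a plain
   add, mirroring the two emit_insn calls above.  */
#include <stdio.h>

int
main (void)
{
  long delta = 8200;                  /* hypothetical adjustment */
  long hi = (delta / 4096) * 4096;    /* 8192: add x0, x0, #2, lsl #12 */
  long lo = delta % 4096;             /* 8:    add x0, x0, #8 */
  printf ("%ld + %ld = %ld\n", hi, lo, hi + lo);
  return 0;
}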
2915 /* Output code to add DELTA to the first argument, and then jump
2916 to FUNCTION. Used for C++ multiple inheritance. */
2917 static void
2918 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2919 HOST_WIDE_INT delta,
2920 HOST_WIDE_INT vcall_offset,
2921 tree function)
2923 /* The this pointer is always in x0. Note that this differs from
2924 ARM, where the this pointer may be bumped to r1 if r0 is required
2925 to return a pointer to an aggregate. On AArch64 a result value
2926 pointer will be in x8. */
2927 int this_regno = R0_REGNUM;
2928 rtx this_rtx, temp0, temp1, addr, funexp;
2929 rtx_insn *insn;
2931 reload_completed = 1;
2932 emit_note (NOTE_INSN_PROLOGUE_END);
2934 if (vcall_offset == 0)
2935 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2936 else
2938 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2940 this_rtx = gen_rtx_REG (Pmode, this_regno);
2941 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2942 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2944 addr = this_rtx;
2945 if (delta != 0)
2947 if (delta >= -256 && delta < 256)
2948 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2949 plus_constant (Pmode, this_rtx, delta));
2950 else
2951 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2954 if (Pmode == ptr_mode)
2955 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2956 else
2957 aarch64_emit_move (temp0,
2958 gen_rtx_ZERO_EXTEND (Pmode,
2959 gen_rtx_MEM (ptr_mode, addr)));
2961 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2962 addr = plus_constant (Pmode, temp0, vcall_offset);
2963 else
2965 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2966 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2969 if (Pmode == ptr_mode)
2970 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2971 else
2972 aarch64_emit_move (temp1,
2973 gen_rtx_SIGN_EXTEND (Pmode,
2974 gen_rtx_MEM (ptr_mode, addr)));
2976 emit_insn (gen_add2_insn (this_rtx, temp1));
2979 /* Generate a tail call to the target function. */
2980 if (!TREE_USED (function))
2982 assemble_external (function);
2983 TREE_USED (function) = 1;
2985 funexp = XEXP (DECL_RTL (function), 0);
2986 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2987 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2988 SIBLING_CALL_P (insn) = 1;
2990 insn = get_insns ();
2991 shorten_branches (insn);
2992 final_start_function (insn, file, 1);
2993 final (insn, file, 1);
2994 final_end_function ();
2996 /* Stop pretending to be a post-reload pass. */
2997 reload_completed = 0;
3000 static bool
3001 aarch64_tls_referenced_p (rtx x)
3003 if (!TARGET_HAVE_TLS)
3004 return false;
3005 subrtx_iterator::array_type array;
3006 FOR_EACH_SUBRTX (iter, array, x, ALL)
3008 const_rtx x = *iter;
3009 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3010 return true;
3011 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3012 TLS offsets, not real symbol references. */
3013 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3014 iter.skip_subrtxes ();
3016 return false;
3020 static int
3021 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3023 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3024 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3026 if (*imm1 < *imm2)
3027 return -1;
3028 if (*imm1 > *imm2)
3029 return +1;
3030 return 0;
3034 static void
3035 aarch64_build_bitmask_table (void)
3037 unsigned HOST_WIDE_INT mask, imm;
3038 unsigned int log_e, e, s, r;
3039 unsigned int nimms = 0;
3041 for (log_e = 1; log_e <= 6; log_e++)
3043 e = 1 << log_e;
3044 if (e == 64)
3045 mask = ~(HOST_WIDE_INT) 0;
3046 else
3047 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3048 for (s = 1; s < e; s++)
3050 for (r = 0; r < e; r++)
3052 /* set s consecutive bits to 1 (s < 64) */
3053 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3054 /* rotate right by r */
3055 if (r != 0)
3056 imm = ((imm >> r) | (imm << (e - r))) & mask;
3057 /* replicate the constant depending on SIMD size */
3058 switch (log_e) {
3059 case 1: imm |= (imm << 2);
3060 case 2: imm |= (imm << 4);
3061 case 3: imm |= (imm << 8);
3062 case 4: imm |= (imm << 16);
3063 case 5: imm |= (imm << 32);
3064 case 6:
3065 break;
3066 default:
3067 gcc_unreachable ();
3069 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3070 aarch64_bitmasks[nimms++] = imm;
3075 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3076 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3077 aarch64_bitmasks_cmp);
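/* Editor's sketch, not part of this file: rather than building and sorting
   the full table, this checks the element-size-64 case of the encoding
   directly -- a value is encodable iff it is a rotation of a contiguous run
   of ones.  The names below are hypothetical.  */
#include <stdint.h>
#include <stdio.h>

static int
is_rotated_run_of_ones (uint64_t imm)
{
  for (unsigned s = 1; s < 64; s++)
    {
      uint64_t run = (1ULL << s) - 1;   /* s consecutive low bits set */
      for (unsigned r = 0; r < 64; r++)
        {
          uint64_t rot = r ? ((run >> r) | (run << (64 - r))) : run;
          if (rot == imm)
            return 1;
        }
    }
  return 0;
}

int
main (void)
{
  printf ("%d\n", is_rotated_run_of_ones (0x00000000ffff0000ULL)); /* 1 */
  printf ("%d\n", is_rotated_run_of_ones (0x0123456789abcdefULL)); /* 0 */
  return 0;
}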
3081 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3082 a left shift of 0 or 12 bits. */
3083 bool
3084 aarch64_uimm12_shift (HOST_WIDE_INT val)
3086 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3087 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
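/* Editor's sketch, not part of this file: the same 12-bit test in
   isolation, with a few example values.  */
#include <stdio.h>

static int
uimm12_shift_p (long long val)
{
  return (val & 0xfffLL) == val || (val & (0xfffLL << 12)) == val;
}

int
main (void)
{
  printf ("%d %d %d\n",
          uimm12_shift_p (0xabc),      /* 1: add #0xabc */
          uimm12_shift_p (0xabc000),   /* 1: add #0xabc, lsl #12 */
          uimm12_shift_p (0x1001));    /* 0: needs more than one add */
  return 0;
}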
3092 /* Return true if val is an immediate that can be loaded into a
3093 register by a MOVZ instruction. */
3094 static bool
3095 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3097 if (GET_MODE_SIZE (mode) > 4)
3099 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3100 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3101 return 1;
3103 else
3105 /* Ignore sign extension. */
3106 val &= (HOST_WIDE_INT) 0xffffffff;
3108 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3109 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3113 /* Return true if val is a valid bitmask immediate. */
3114 bool
3115 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3117 if (GET_MODE_SIZE (mode) < 8)
3119 /* Replicate bit pattern. */
3120 val &= (HOST_WIDE_INT) 0xffffffff;
3121 val |= val << 32;
3123 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3124 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3128 /* Return true if val is an immediate that can be loaded into a
3129 register in a single instruction. */
3130 bool
3131 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3133 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3134 return 1;
3135 return aarch64_bitmask_imm (val, mode);
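/* Editor's sketch, not part of this file: the MOVZ/MOVN halves of the test
   above for 64-bit values.  A value is a MOVZ immediate if at most one
   16-bit halfword is non-zero; a MOVN immediate if its complement is.  The
   bitmask half corresponds to aarch64_bitmask_imm, sketched earlier.  */
#include <stdint.h>
#include <stdio.h>

static int
single_halfword_p (uint64_t v)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((v & (0xffffULL << shift)) == v)
      return 1;
  return 0;
}

int
main (void)
{
  uint64_t a = 0x0000abcd00000000ULL;   /* MOVZ x0, #0xabcd, lsl #32 */
  uint64_t b = ~0x00000000abcd0000ULL;  /* MOVN x0, #0xabcd, lsl #16 */
  printf ("%d %d\n", single_halfword_p (a), single_halfword_p (~b));
  return 0;
}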
3138 static bool
3139 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3141 rtx base, offset;
3143 if (GET_CODE (x) == HIGH)
3144 return true;
3146 split_const (x, &base, &offset);
3147 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3149 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3150 != SYMBOL_FORCE_TO_MEM)
3151 return true;
3152 else
3153 /* Avoid generating a 64-bit relocation in ILP32; leave it
3154 to aarch64_expand_mov_immediate to handle properly. */
3155 return mode != ptr_mode;
3158 return aarch64_tls_referenced_p (x);
3161 /* Return true if register REGNO is a valid index register.
3162 STRICT_P is true if REG_OK_STRICT is in effect. */
3164 bool
3165 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3167 if (!HARD_REGISTER_NUM_P (regno))
3169 if (!strict_p)
3170 return true;
3172 if (!reg_renumber)
3173 return false;
3175 regno = reg_renumber[regno];
3177 return GP_REGNUM_P (regno);
3180 /* Return true if register REGNO is a valid base register.
3181 STRICT_P is true if REG_OK_STRICT is in effect. */
3183 bool
3184 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3186 if (!HARD_REGISTER_NUM_P (regno))
3188 if (!strict_p)
3189 return true;
3191 if (!reg_renumber)
3192 return false;
3194 regno = reg_renumber[regno];
3197 /* The fake registers will be eliminated to either the stack or
3198 hard frame pointer, both of which are usually valid base registers.
3199 Reload deals with the cases where the eliminated form isn't valid. */
3200 return (GP_REGNUM_P (regno)
3201 || regno == SP_REGNUM
3202 || regno == FRAME_POINTER_REGNUM
3203 || regno == ARG_POINTER_REGNUM);
3206 /* Return true if X is a valid base register.
3207 STRICT_P is true if REG_OK_STRICT is in effect. */
3209 static bool
3210 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3212 if (!strict_p && GET_CODE (x) == SUBREG)
3213 x = SUBREG_REG (x);
3215 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3218 /* Return true if address offset is a valid index. If it is, fill in INFO
3219 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3221 static bool
3222 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3223 machine_mode mode, bool strict_p)
3225 enum aarch64_address_type type;
3226 rtx index;
3227 int shift;
3229 /* (reg:P) */
3230 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3231 && GET_MODE (x) == Pmode)
3233 type = ADDRESS_REG_REG;
3234 index = x;
3235 shift = 0;
3237 /* (sign_extend:DI (reg:SI)) */
3238 else if ((GET_CODE (x) == SIGN_EXTEND
3239 || GET_CODE (x) == ZERO_EXTEND)
3240 && GET_MODE (x) == DImode
3241 && GET_MODE (XEXP (x, 0)) == SImode)
3243 type = (GET_CODE (x) == SIGN_EXTEND)
3244 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3245 index = XEXP (x, 0);
3246 shift = 0;
3248 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3249 else if (GET_CODE (x) == MULT
3250 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3251 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3252 && GET_MODE (XEXP (x, 0)) == DImode
3253 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3254 && CONST_INT_P (XEXP (x, 1)))
3256 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3257 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3258 index = XEXP (XEXP (x, 0), 0);
3259 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3261 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3262 else if (GET_CODE (x) == ASHIFT
3263 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3264 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3265 && GET_MODE (XEXP (x, 0)) == DImode
3266 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3267 && CONST_INT_P (XEXP (x, 1)))
3269 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3270 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3271 index = XEXP (XEXP (x, 0), 0);
3272 shift = INTVAL (XEXP (x, 1));
3274 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3275 else if ((GET_CODE (x) == SIGN_EXTRACT
3276 || GET_CODE (x) == ZERO_EXTRACT)
3277 && GET_MODE (x) == DImode
3278 && GET_CODE (XEXP (x, 0)) == MULT
3279 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3280 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3282 type = (GET_CODE (x) == SIGN_EXTRACT)
3283 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3284 index = XEXP (XEXP (x, 0), 0);
3285 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3286 if (INTVAL (XEXP (x, 1)) != 32 + shift
3287 || INTVAL (XEXP (x, 2)) != 0)
3288 shift = -1;
3290 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3291 (const_int 0xffffffff<<shift)) */
3292 else if (GET_CODE (x) == AND
3293 && GET_MODE (x) == DImode
3294 && GET_CODE (XEXP (x, 0)) == MULT
3295 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3296 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3297 && CONST_INT_P (XEXP (x, 1)))
3299 type = ADDRESS_REG_UXTW;
3300 index = XEXP (XEXP (x, 0), 0);
3301 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3302 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3303 shift = -1;
3305 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3306 else if ((GET_CODE (x) == SIGN_EXTRACT
3307 || GET_CODE (x) == ZERO_EXTRACT)
3308 && GET_MODE (x) == DImode
3309 && GET_CODE (XEXP (x, 0)) == ASHIFT
3310 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3311 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3313 type = (GET_CODE (x) == SIGN_EXTRACT)
3314 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3315 index = XEXP (XEXP (x, 0), 0);
3316 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3317 if (INTVAL (XEXP (x, 1)) != 32 + shift
3318 || INTVAL (XEXP (x, 2)) != 0)
3319 shift = -1;
3321 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3322 (const_int 0xffffffff<<shift)) */
3323 else if (GET_CODE (x) == AND
3324 && GET_MODE (x) == DImode
3325 && GET_CODE (XEXP (x, 0)) == ASHIFT
3326 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3327 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3328 && CONST_INT_P (XEXP (x, 1)))
3330 type = ADDRESS_REG_UXTW;
3331 index = XEXP (XEXP (x, 0), 0);
3332 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3333 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3334 shift = -1;
3336 /* (mult:P (reg:P) (const_int scale)) */
3337 else if (GET_CODE (x) == MULT
3338 && GET_MODE (x) == Pmode
3339 && GET_MODE (XEXP (x, 0)) == Pmode
3340 && CONST_INT_P (XEXP (x, 1)))
3342 type = ADDRESS_REG_REG;
3343 index = XEXP (x, 0);
3344 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3346 /* (ashift:P (reg:P) (const_int shift)) */
3347 else if (GET_CODE (x) == ASHIFT
3348 && GET_MODE (x) == Pmode
3349 && GET_MODE (XEXP (x, 0)) == Pmode
3350 && CONST_INT_P (XEXP (x, 1)))
3352 type = ADDRESS_REG_REG;
3353 index = XEXP (x, 0);
3354 shift = INTVAL (XEXP (x, 1));
3356 else
3357 return false;
3359 if (GET_CODE (index) == SUBREG)
3360 index = SUBREG_REG (index);
3362 if ((shift == 0 ||
3363 (shift > 0 && shift <= 3
3364 && (1 << shift) == GET_MODE_SIZE (mode)))
3365 && REG_P (index)
3366 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3368 info->type = type;
3369 info->offset = index;
3370 info->shift = shift;
3371 return true;
3374 return false;
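/* Editor's sketch, not part of this file: the scaled-index rule checked
   above.  For an N-byte access the index register may be unscaled or
   scaled by exactly N (a left shift whose result equals the access size).  */
#include <stdio.h>

static int
index_shift_ok (int shift, int mode_size)
{
  return shift == 0 || (shift > 0 && shift <= 3 && (1 << shift) == mode_size);
}

int
main (void)
{
  printf ("%d %d %d\n",
          index_shift_ok (3, 8),   /* 1: ldr x0, [x1, x2, lsl #3] */
          index_shift_ok (0, 8),   /* 1: ldr x0, [x1, x2] */
          index_shift_ok (2, 8));  /* 0: scale must match the access size */
  return 0;
}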
3377 bool
3378 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3380 return (offset >= -64 * GET_MODE_SIZE (mode)
3381 && offset < 64 * GET_MODE_SIZE (mode)
3382 && offset % GET_MODE_SIZE (mode) == 0);
3385 static inline bool
3386 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3387 HOST_WIDE_INT offset)
3389 return offset >= -256 && offset < 256;
3392 static inline bool
3393 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3395 return (offset >= 0
3396 && offset < 4096 * GET_MODE_SIZE (mode)
3397 && offset % GET_MODE_SIZE (mode) == 0);
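/* Editor's sketch, not part of this file: the three offset ranges above,
   instantiated for 8-byte (DImode-sized) accesses.  */
#include <stdio.h>

static int ofs7_scaled_p (long ofs)   { return ofs >= -64 * 8 && ofs < 64 * 8 && ofs % 8 == 0; }
static int ofs9_unscaled_p (long ofs) { return ofs >= -256 && ofs < 256; }
static int ofs12_scaled_p (long ofs)  { return ofs >= 0 && ofs < 4096 * 8 && ofs % 8 == 0; }

int
main (void)
{
  /* LDP/STP: -512..504 in steps of 8; LDUR/STUR: -256..255;
     LDR/STR (unsigned offset): 0..32760 in steps of 8.  */
  printf ("%d %d %d\n",
          ofs7_scaled_p (504), ofs9_unscaled_p (-256), ofs12_scaled_p (32760));
  return 0;
}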
3400 /* Return true if X is a valid address for machine mode MODE. If it is,
3401 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3402 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3404 static bool
3405 aarch64_classify_address (struct aarch64_address_info *info,
3406 rtx x, machine_mode mode,
3407 RTX_CODE outer_code, bool strict_p)
3409 enum rtx_code code = GET_CODE (x);
3410 rtx op0, op1;
3412 /* On BE, we use load/store pair for all large int mode load/stores. */
3413 bool load_store_pair_p = (outer_code == PARALLEL
3414 || (BYTES_BIG_ENDIAN
3415 && aarch64_vect_struct_mode_p (mode)));
3417 bool allow_reg_index_p =
3418 !load_store_pair_p
3419 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3420 && !aarch64_vect_struct_mode_p (mode);
3422 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3423 REG addressing. */
3424 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3425 && (code != POST_INC && code != REG))
3426 return false;
3428 switch (code)
3430 case REG:
3431 case SUBREG:
3432 info->type = ADDRESS_REG_IMM;
3433 info->base = x;
3434 info->offset = const0_rtx;
3435 return aarch64_base_register_rtx_p (x, strict_p);
3437 case PLUS:
3438 op0 = XEXP (x, 0);
3439 op1 = XEXP (x, 1);
3441 if (! strict_p
3442 && REG_P (op0)
3443 && (op0 == virtual_stack_vars_rtx
3444 || op0 == frame_pointer_rtx
3445 || op0 == arg_pointer_rtx)
3446 && CONST_INT_P (op1))
3448 info->type = ADDRESS_REG_IMM;
3449 info->base = op0;
3450 info->offset = op1;
3452 return true;
3455 if (GET_MODE_SIZE (mode) != 0
3456 && CONST_INT_P (op1)
3457 && aarch64_base_register_rtx_p (op0, strict_p))
3459 HOST_WIDE_INT offset = INTVAL (op1);
3461 info->type = ADDRESS_REG_IMM;
3462 info->base = op0;
3463 info->offset = op1;
3465 /* TImode and TFmode values are allowed in both pairs of X
3466 registers and individual Q registers. The available
3467 address modes are:
3468 X,X: 7-bit signed scaled offset
3469 Q: 9-bit signed offset
3470 We conservatively require an offset representable in both modes. */
3472 if (mode == TImode || mode == TFmode)
3473 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3474 && offset_9bit_signed_unscaled_p (mode, offset));
3476 /* A 7-bit offset check because OImode will emit an ldp/stp
3477 instruction (only big endian will get here).
3478 For ldp/stp instructions, the offset is scaled by the size of a
3479 single element of the pair. */
3480 if (mode == OImode)
3481 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3483 /* Three 9/12-bit offset checks because CImode will emit three
3484 ldr/str instructions (only big endian will get here). */
3485 if (mode == CImode)
3486 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3487 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3488 || offset_12bit_unsigned_scaled_p (V16QImode,
3489 offset + 32)));
3491 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3492 instructions (only big endian will get here). */
3493 if (mode == XImode)
3494 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3495 && aarch64_offset_7bit_signed_scaled_p (TImode,
3496 offset + 32));
3498 if (load_store_pair_p)
3499 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3500 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3501 else
3502 return (offset_9bit_signed_unscaled_p (mode, offset)
3503 || offset_12bit_unsigned_scaled_p (mode, offset));
3506 if (allow_reg_index_p)
3508 /* Look for base + (scaled/extended) index register. */
3509 if (aarch64_base_register_rtx_p (op0, strict_p)
3510 && aarch64_classify_index (info, op1, mode, strict_p))
3512 info->base = op0;
3513 return true;
3515 if (aarch64_base_register_rtx_p (op1, strict_p)
3516 && aarch64_classify_index (info, op0, mode, strict_p))
3518 info->base = op1;
3519 return true;
3523 return false;
3525 case POST_INC:
3526 case POST_DEC:
3527 case PRE_INC:
3528 case PRE_DEC:
3529 info->type = ADDRESS_REG_WB;
3530 info->base = XEXP (x, 0);
3531 info->offset = NULL_RTX;
3532 return aarch64_base_register_rtx_p (info->base, strict_p);
3534 case POST_MODIFY:
3535 case PRE_MODIFY:
3536 info->type = ADDRESS_REG_WB;
3537 info->base = XEXP (x, 0);
3538 if (GET_CODE (XEXP (x, 1)) == PLUS
3539 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3540 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3541 && aarch64_base_register_rtx_p (info->base, strict_p))
3543 HOST_WIDE_INT offset;
3544 info->offset = XEXP (XEXP (x, 1), 1);
3545 offset = INTVAL (info->offset);
3547 /* TImode and TFmode values are allowed in both pairs of X
3548 registers and individual Q registers. The available
3549 address modes are:
3550 X,X: 7-bit signed scaled offset
3551 Q: 9-bit signed offset
3552 We conservatively require an offset representable in both modes. */
3554 if (mode == TImode || mode == TFmode)
3555 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3556 && offset_9bit_signed_unscaled_p (mode, offset));
3558 if (load_store_pair_p)
3559 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3560 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3561 else
3562 return offset_9bit_signed_unscaled_p (mode, offset);
3564 return false;
3566 case CONST:
3567 case SYMBOL_REF:
3568 case LABEL_REF:
3569 /* load literal: pc-relative constant pool entry. Only supported
3570 for SI mode or larger. */
3571 info->type = ADDRESS_SYMBOLIC;
3573 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3575 rtx sym, addend;
3577 split_const (x, &sym, &addend);
3578 return (GET_CODE (sym) == LABEL_REF
3579 || (GET_CODE (sym) == SYMBOL_REF
3580 && CONSTANT_POOL_ADDRESS_P (sym)));
3582 return false;
3584 case LO_SUM:
3585 info->type = ADDRESS_LO_SUM;
3586 info->base = XEXP (x, 0);
3587 info->offset = XEXP (x, 1);
3588 if (allow_reg_index_p
3589 && aarch64_base_register_rtx_p (info->base, strict_p))
3591 rtx sym, offs;
3592 split_const (info->offset, &sym, &offs);
3593 if (GET_CODE (sym) == SYMBOL_REF
3594 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3595 == SYMBOL_SMALL_ABSOLUTE))
3597 /* The symbol and offset must be aligned to the access size. */
3598 unsigned int align;
3599 unsigned int ref_size;
3601 if (CONSTANT_POOL_ADDRESS_P (sym))
3602 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3603 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3605 tree exp = SYMBOL_REF_DECL (sym);
3606 align = TYPE_ALIGN (TREE_TYPE (exp));
3607 align = CONSTANT_ALIGNMENT (exp, align);
3609 else if (SYMBOL_REF_DECL (sym))
3610 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3611 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3612 && SYMBOL_REF_BLOCK (sym) != NULL)
3613 align = SYMBOL_REF_BLOCK (sym)->alignment;
3614 else
3615 align = BITS_PER_UNIT;
3617 ref_size = GET_MODE_SIZE (mode);
3618 if (ref_size == 0)
3619 ref_size = GET_MODE_SIZE (DImode);
3621 return ((INTVAL (offs) & (ref_size - 1)) == 0
3622 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3625 return false;
3627 default:
3628 return false;
3632 bool
3633 aarch64_symbolic_address_p (rtx x)
3635 rtx offset;
3637 split_const (x, &x, &offset);
3638 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3641 /* Classify the base of symbolic expression X, given that X appears in
3642 context CONTEXT. */
3644 enum aarch64_symbol_type
3645 aarch64_classify_symbolic_expression (rtx x,
3646 enum aarch64_symbol_context context)
3648 rtx offset;
3650 split_const (x, &x, &offset);
3651 return aarch64_classify_symbol (x, offset, context);
3655 /* Return TRUE if X is a legitimate address for accessing memory in
3656 mode MODE. */
3657 static bool
3658 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3660 struct aarch64_address_info addr;
3662 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3665 /* Return TRUE if X is a legitimate address for accessing memory in
3666 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3667 pair operation. */
3668 bool
3669 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3670 RTX_CODE outer_code, bool strict_p)
3672 struct aarch64_address_info addr;
3674 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3677 /* Return TRUE if rtx X is immediate constant 0.0 */
3678 bool
3679 aarch64_float_const_zero_rtx_p (rtx x)
3681 REAL_VALUE_TYPE r;
3683 if (GET_MODE (x) == VOIDmode)
3684 return false;
3686 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3687 if (REAL_VALUE_MINUS_ZERO (r))
3688 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3689 return REAL_VALUES_EQUAL (r, dconst0);
3692 /* Return the fixed registers used for condition codes. */
3694 static bool
3695 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3697 *p1 = CC_REGNUM;
3698 *p2 = INVALID_REGNUM;
3699 return true;
3702 /* Emit call insn with PAT and do aarch64-specific handling. */
3704 void
3705 aarch64_emit_call_insn (rtx pat)
3707 rtx insn = emit_call_insn (pat);
3709 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3710 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3711 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3714 machine_mode
3715 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3717 /* All floating point compares return CCFP if it is an equality
3718 comparison, and CCFPE otherwise. */
3719 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3721 switch (code)
3723 case EQ:
3724 case NE:
3725 case UNORDERED:
3726 case ORDERED:
3727 case UNLT:
3728 case UNLE:
3729 case UNGT:
3730 case UNGE:
3731 case UNEQ:
3732 case LTGT:
3733 return CCFPmode;
3735 case LT:
3736 case LE:
3737 case GT:
3738 case GE:
3739 return CCFPEmode;
3741 default:
3742 gcc_unreachable ();
3746 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3747 && y == const0_rtx
3748 && (code == EQ || code == NE || code == LT || code == GE)
3749 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3750 || GET_CODE (x) == NEG))
3751 return CC_NZmode;
3753 /* A compare with a shifted operand. Because of canonicalization,
3754 the comparison will have to be swapped when we emit the assembly
3755 code. */
3756 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3757 && (REG_P (y) || GET_CODE (y) == SUBREG)
3758 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3759 || GET_CODE (x) == LSHIFTRT
3760 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3761 return CC_SWPmode;
3763 /* Similarly for a negated operand, but we can only do this for
3764 equalities. */
3765 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3766 && (REG_P (y) || GET_CODE (y) == SUBREG)
3767 && (code == EQ || code == NE)
3768 && GET_CODE (x) == NEG)
3769 return CC_Zmode;
3771 /* A compare of a mode narrower than SI mode against zero can be done
3772 by extending the value in the comparison. */
3773 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3774 && y == const0_rtx)
3775 /* Only use sign-extension if we really need it. */
3776 return ((code == GT || code == GE || code == LE || code == LT)
3777 ? CC_SESWPmode : CC_ZESWPmode);
3779 /* For everything else, return CCmode. */
3780 return CCmode;
3783 static int
3784 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3786 int
3787 aarch64_get_condition_code (rtx x)
3789 machine_mode mode = GET_MODE (XEXP (x, 0));
3790 enum rtx_code comp_code = GET_CODE (x);
3792 if (GET_MODE_CLASS (mode) != MODE_CC)
3793 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3794 return aarch64_get_condition_code_1 (mode, comp_code);
3797 static int
3798 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3800 int ne = -1, eq = -1;
3801 switch (mode)
3803 case CCFPmode:
3804 case CCFPEmode:
3805 switch (comp_code)
3807 case GE: return AARCH64_GE;
3808 case GT: return AARCH64_GT;
3809 case LE: return AARCH64_LS;
3810 case LT: return AARCH64_MI;
3811 case NE: return AARCH64_NE;
3812 case EQ: return AARCH64_EQ;
3813 case ORDERED: return AARCH64_VC;
3814 case UNORDERED: return AARCH64_VS;
3815 case UNLT: return AARCH64_LT;
3816 case UNLE: return AARCH64_LE;
3817 case UNGT: return AARCH64_HI;
3818 case UNGE: return AARCH64_PL;
3819 default: return -1;
3821 break;
3823 case CC_DNEmode:
3824 ne = AARCH64_NE;
3825 eq = AARCH64_EQ;
3826 break;
3828 case CC_DEQmode:
3829 ne = AARCH64_EQ;
3830 eq = AARCH64_NE;
3831 break;
3833 case CC_DGEmode:
3834 ne = AARCH64_GE;
3835 eq = AARCH64_LT;
3836 break;
3838 case CC_DLTmode:
3839 ne = AARCH64_LT;
3840 eq = AARCH64_GE;
3841 break;
3843 case CC_DGTmode:
3844 ne = AARCH64_GT;
3845 eq = AARCH64_LE;
3846 break;
3848 case CC_DLEmode:
3849 ne = AARCH64_LE;
3850 eq = AARCH64_GT;
3851 break;
3853 case CC_DGEUmode:
3854 ne = AARCH64_CS;
3855 eq = AARCH64_CC;
3856 break;
3858 case CC_DLTUmode:
3859 ne = AARCH64_CC;
3860 eq = AARCH64_CS;
3861 break;
3863 case CC_DGTUmode:
3864 ne = AARCH64_HI;
3865 eq = AARCH64_LS;
3866 break;
3868 case CC_DLEUmode:
3869 ne = AARCH64_LS;
3870 eq = AARCH64_HI;
3871 break;
3873 case CCmode:
3874 switch (comp_code)
3876 case NE: return AARCH64_NE;
3877 case EQ: return AARCH64_EQ;
3878 case GE: return AARCH64_GE;
3879 case GT: return AARCH64_GT;
3880 case LE: return AARCH64_LE;
3881 case LT: return AARCH64_LT;
3882 case GEU: return AARCH64_CS;
3883 case GTU: return AARCH64_HI;
3884 case LEU: return AARCH64_LS;
3885 case LTU: return AARCH64_CC;
3886 default: return -1;
3888 break;
3890 case CC_SWPmode:
3891 case CC_ZESWPmode:
3892 case CC_SESWPmode:
3893 switch (comp_code)
3895 case NE: return AARCH64_NE;
3896 case EQ: return AARCH64_EQ;
3897 case GE: return AARCH64_LE;
3898 case GT: return AARCH64_LT;
3899 case LE: return AARCH64_GE;
3900 case LT: return AARCH64_GT;
3901 case GEU: return AARCH64_LS;
3902 case GTU: return AARCH64_CC;
3903 case LEU: return AARCH64_CS;
3904 case LTU: return AARCH64_HI;
3905 default: return -1;
3907 break;
3909 case CC_NZmode:
3910 switch (comp_code)
3912 case NE: return AARCH64_NE;
3913 case EQ: return AARCH64_EQ;
3914 case GE: return AARCH64_PL;
3915 case LT: return AARCH64_MI;
3916 default: return -1;
3918 break;
3920 case CC_Zmode:
3921 switch (comp_code)
3923 case NE: return AARCH64_NE;
3924 case EQ: return AARCH64_EQ;
3925 default: return -1;
3927 break;
3929 default:
3930 return -1;
3931 break;
3934 if (comp_code == NE)
3935 return ne;
3937 if (comp_code == EQ)
3938 return eq;
3940 return -1;
3943 bool
3944 aarch64_const_vec_all_same_in_range_p (rtx x,
3945 HOST_WIDE_INT minval,
3946 HOST_WIDE_INT maxval)
3948 HOST_WIDE_INT firstval;
3949 int count, i;
3951 if (GET_CODE (x) != CONST_VECTOR
3952 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3953 return false;
3955 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3956 if (firstval < minval || firstval > maxval)
3957 return false;
3959 count = CONST_VECTOR_NUNITS (x);
3960 for (i = 1; i < count; i++)
3961 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3962 return false;
3964 return true;
3967 bool
3968 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3970 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3973 static unsigned
3974 bit_count (unsigned HOST_WIDE_INT value)
3976 unsigned count = 0;
3978 while (value)
3980 count++;
3981 value &= value - 1;
3984 return count;
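/* Editor's sketch, not part of this file: the value &= value - 1 step
   above clears the lowest set bit, so the loop iterates once per set bit.  */
#include <stdio.h>

int
main (void)
{
  unsigned v = 0xb4;                    /* 1011 0100: four bits set */
  int count = 0;
  while (v)
    {
      v &= v - 1;
      count++;
    }
  printf ("%d\n", count);               /* prints 4 */
  return 0;
}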
3987 /* N Z C V. */
3988 #define AARCH64_CC_V 1
3989 #define AARCH64_CC_C (1 << 1)
3990 #define AARCH64_CC_Z (1 << 2)
3991 #define AARCH64_CC_N (1 << 3)
3993 /* N Z C V flags for ccmp. The first value is for the AND case and the
3994 second is for the IOR case. Indexed by AARCH64_COND_CODE. */
3995 static const int aarch64_nzcv_codes[][2] =
3997 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3998 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3999 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4000 {0, AARCH64_CC_C}, /* CC, C == 0. */
4001 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4002 {0, AARCH64_CC_N}, /* PL, N == 0. */
4003 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4004 {0, AARCH64_CC_V}, /* VC, V == 0. */
4005 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4006 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4007 {0, AARCH64_CC_V}, /* GE, N == V. */
4008 {AARCH64_CC_V, 0}, /* LT, N != V. */
4009 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4010 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4011 {0, 0}, /* AL, Any. */
4012 {0, 0}, /* NV, Any. */
4015 int
4016 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4018 switch (mode)
4020 case CC_DNEmode:
4021 return NE;
4023 case CC_DEQmode:
4024 return EQ;
4026 case CC_DLEmode:
4027 return LE;
4029 case CC_DGTmode:
4030 return GT;
4032 case CC_DLTmode:
4033 return LT;
4035 case CC_DGEmode:
4036 return GE;
4038 case CC_DLEUmode:
4039 return LEU;
4041 case CC_DGTUmode:
4042 return GTU;
4044 case CC_DLTUmode:
4045 return LTU;
4047 case CC_DGEUmode:
4048 return GEU;
4050 default:
4051 gcc_unreachable ();
4056 void
4057 aarch64_print_operand (FILE *f, rtx x, char code)
4059 switch (code)
4061 /* An integer or symbol address without a preceding # sign. */
4062 case 'c':
4063 switch (GET_CODE (x))
4065 case CONST_INT:
4066 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4067 break;
4069 case SYMBOL_REF:
4070 output_addr_const (f, x);
4071 break;
4073 case CONST:
4074 if (GET_CODE (XEXP (x, 0)) == PLUS
4075 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4077 output_addr_const (f, x);
4078 break;
4080 /* Fall through. */
4082 default:
4083 output_operand_lossage ("Unsupported operand for code '%c'", code);
4085 break;
4087 case 'e':
4088 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4090 int n;
4092 if (!CONST_INT_P (x)
4093 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4095 output_operand_lossage ("invalid operand for '%%%c'", code);
4096 return;
4099 switch (n)
4101 case 3:
4102 fputc ('b', f);
4103 break;
4104 case 4:
4105 fputc ('h', f);
4106 break;
4107 case 5:
4108 fputc ('w', f);
4109 break;
4110 default:
4111 output_operand_lossage ("invalid operand for '%%%c'", code);
4112 return;
4115 break;
4117 case 'p':
4119 int n;
4121 /* Print N such that 2^N == X. */
4122 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4124 output_operand_lossage ("invalid operand for '%%%c'", code);
4125 return;
4128 asm_fprintf (f, "%d", n);
4130 break;
4132 case 'P':
4133 /* Print the number of non-zero bits in X (a const_int). */
4134 if (!CONST_INT_P (x))
4136 output_operand_lossage ("invalid operand for '%%%c'", code);
4137 return;
4140 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4141 break;
4143 case 'H':
4144 /* Print the higher numbered register of a pair (TImode) of regs. */
4145 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4147 output_operand_lossage ("invalid operand for '%%%c'", code);
4148 return;
4151 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4152 break;
4154 case 'm':
4156 int cond_code;
4157 /* Print a condition (eq, ne, etc). */
4159 /* CONST_TRUE_RTX means always -- that's the default. */
4160 if (x == const_true_rtx)
4161 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4169 cond_code = aarch64_get_condition_code (x);
4170 gcc_assert (cond_code >= 0);
4171 fputs (aarch64_condition_codes[cond_code], f);
4173 break;
4175 case 'M':
4177 int cond_code;
4178 /* Print the inverse of a condition (eq <-> ne, etc). */
4180 /* CONST_TRUE_RTX means never -- that's the default. */
4181 if (x == const_true_rtx)
4183 fputs ("nv", f);
4184 return;
4187 if (!COMPARISON_P (x))
4189 output_operand_lossage ("invalid operand for '%%%c'", code);
4190 return;
4192 cond_code = aarch64_get_condition_code (x);
4193 gcc_assert (cond_code >= 0);
4194 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4195 (cond_code)], f);
4197 break;
4199 case 'b':
4200 case 'h':
4201 case 's':
4202 case 'd':
4203 case 'q':
4204 /* Print a scalar FP/SIMD register name. */
4205 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4207 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4208 return;
4210 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4211 break;
4213 case 'S':
4214 case 'T':
4215 case 'U':
4216 case 'V':
4217 /* Print the first FP/SIMD register name in a list. */
4218 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4220 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4221 return;
4223 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4224 break;
4226 case 'R':
4227 /* Print a scalar FP/SIMD register name + 1. */
4228 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4230 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4231 return;
4233 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4234 break;
4236 case 'X':
4237 /* Print bottom 16 bits of integer constant in hex. */
4238 if (!CONST_INT_P (x))
4240 output_operand_lossage ("invalid operand for '%%%c'", code);
4241 return;
4243 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4244 break;
4246 case 'w':
4247 case 'x':
4248 /* Print a general register name or the zero register (32-bit or
4249 64-bit). */
4250 if (x == const0_rtx
4251 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4253 asm_fprintf (f, "%czr", code);
4254 break;
4257 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4259 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4260 break;
4263 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4265 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4266 break;
4269 /* Fall through */
4271 case 0:
4272 /* Print a normal operand.  If it's a general register, then we
4273 assume DImode. */
4274 if (x == NULL)
4276 output_operand_lossage ("missing operand");
4277 return;
4280 switch (GET_CODE (x))
4282 case REG:
4283 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4284 break;
4286 case MEM:
4287 aarch64_memory_reference_mode = GET_MODE (x);
4288 output_address (XEXP (x, 0));
4289 break;
4291 case LABEL_REF:
4292 case SYMBOL_REF:
4293 output_addr_const (asm_out_file, x);
4294 break;
4296 case CONST_INT:
4297 asm_fprintf (f, "%wd", INTVAL (x));
4298 break;
4300 case CONST_VECTOR:
4301 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4303 gcc_assert (
4304 aarch64_const_vec_all_same_in_range_p (x,
4305 HOST_WIDE_INT_MIN,
4306 HOST_WIDE_INT_MAX));
4307 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4309 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4311 fputc ('0', f);
4313 else
4314 gcc_unreachable ();
4315 break;
4317 case CONST_DOUBLE:
4318 /* CONST_DOUBLE can represent a double-width integer.
4319 In this case, the mode of x is VOIDmode. */
4320 if (GET_MODE (x) == VOIDmode)
4321 ; /* Do Nothing. */
4322 else if (aarch64_float_const_zero_rtx_p (x))
4324 fputc ('0', f);
4325 break;
4327 else if (aarch64_float_const_representable_p (x))
4329 #define buf_size 20
4330 char float_buf[buf_size] = {'\0'};
4331 REAL_VALUE_TYPE r;
4332 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4333 real_to_decimal_for_mode (float_buf, &r,
4334 buf_size, buf_size,
4335 1, GET_MODE (x));
4336 asm_fprintf (asm_out_file, "%s", float_buf);
4337 break;
4338 #undef buf_size
4340 output_operand_lossage ("invalid constant");
4341 return;
4342 default:
4343 output_operand_lossage ("invalid operand");
4344 return;
4346 break;
4348 case 'A':
4349 if (GET_CODE (x) == HIGH)
4350 x = XEXP (x, 0);
4352 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4354 case SYMBOL_SMALL_GOT:
4355 asm_fprintf (asm_out_file, ":got:");
4356 break;
4358 case SYMBOL_SMALL_TLSGD:
4359 asm_fprintf (asm_out_file, ":tlsgd:");
4360 break;
4362 case SYMBOL_SMALL_TLSDESC:
4363 asm_fprintf (asm_out_file, ":tlsdesc:");
4364 break;
4366 case SYMBOL_SMALL_GOTTPREL:
4367 asm_fprintf (asm_out_file, ":gottprel:");
4368 break;
4370 case SYMBOL_SMALL_TPREL:
4371 asm_fprintf (asm_out_file, ":tprel:");
4372 break;
4374 case SYMBOL_TINY_GOT:
4375 gcc_unreachable ();
4376 break;
4378 default:
4379 break;
4381 output_addr_const (asm_out_file, x);
4382 break;
4384 case 'L':
4385 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4387 case SYMBOL_SMALL_GOT:
4388 asm_fprintf (asm_out_file, ":lo12:");
4389 break;
4391 case SYMBOL_SMALL_TLSGD:
4392 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4393 break;
4395 case SYMBOL_SMALL_TLSDESC:
4396 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4397 break;
4399 case SYMBOL_SMALL_GOTTPREL:
4400 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4401 break;
4403 case SYMBOL_SMALL_TPREL:
4404 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4405 break;
4407 case SYMBOL_TINY_GOT:
4408 asm_fprintf (asm_out_file, ":got:");
4409 break;
4411 default:
4412 break;
4414 output_addr_const (asm_out_file, x);
4415 break;
4417 case 'G':
4419 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4421 case SYMBOL_SMALL_TPREL:
4422 asm_fprintf (asm_out_file, ":tprel_hi12:");
4423 break;
4424 default:
4425 break;
4427 output_addr_const (asm_out_file, x);
4428 break;
4430 case 'K':
4432 int cond_code;
4433 /* Print nzcv. */
4435 if (!COMPARISON_P (x))
4437 output_operand_lossage ("invalid operand for '%%%c'", code);
4438 return;
4441 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4442 gcc_assert (cond_code >= 0);
4443 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4445 break;
4447 case 'k':
4449 int cond_code;
4450 /* Print nzcv. */
4452 if (!COMPARISON_P (x))
4454 output_operand_lossage ("invalid operand for '%%%c'", code);
4455 return;
4458 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4459 gcc_assert (cond_code >= 0);
4460 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4462 break;
4464 default:
4465 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4466 return;
4470 void
4471 aarch64_print_operand_address (FILE *f, rtx x)
4473 struct aarch64_address_info addr;
4475 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4476 MEM, true))
4477 switch (addr.type)
4479 case ADDRESS_REG_IMM:
4480 if (addr.offset == const0_rtx)
4481 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4482 else
4483 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4484 INTVAL (addr.offset));
4485 return;
4487 case ADDRESS_REG_REG:
4488 if (addr.shift == 0)
4489 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4490 reg_names [REGNO (addr.offset)]);
4491 else
4492 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4493 reg_names [REGNO (addr.offset)], addr.shift);
4494 return;
4496 case ADDRESS_REG_UXTW:
4497 if (addr.shift == 0)
4498 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4499 REGNO (addr.offset) - R0_REGNUM);
4500 else
4501 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4502 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4503 return;
4505 case ADDRESS_REG_SXTW:
4506 if (addr.shift == 0)
4507 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4508 REGNO (addr.offset) - R0_REGNUM);
4509 else
4510 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4511 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4512 return;
4514 case ADDRESS_REG_WB:
4515 switch (GET_CODE (x))
4517 case PRE_INC:
4518 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4519 GET_MODE_SIZE (aarch64_memory_reference_mode));
4520 return;
4521 case POST_INC:
4522 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4523 GET_MODE_SIZE (aarch64_memory_reference_mode));
4524 return;
4525 case PRE_DEC:
4526 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4527 GET_MODE_SIZE (aarch64_memory_reference_mode));
4528 return;
4529 case POST_DEC:
4530 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4531 GET_MODE_SIZE (aarch64_memory_reference_mode));
4532 return;
4533 case PRE_MODIFY:
4534 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4535 INTVAL (addr.offset));
4536 return;
4537 case POST_MODIFY:
4538 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4539 INTVAL (addr.offset));
4540 return;
4541 default:
4542 break;
4544 break;
4546 case ADDRESS_LO_SUM:
4547 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4548 output_addr_const (f, addr.offset);
4549 asm_fprintf (f, "]");
4550 return;
4552 case ADDRESS_SYMBOLIC:
4553 break;
4556 output_addr_const (f, x);
4559 bool
4560 aarch64_label_mentioned_p (rtx x)
4562 const char *fmt;
4563 int i;
4565 if (GET_CODE (x) == LABEL_REF)
4566 return true;
4568 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4569 referencing instruction, but they are constant offsets, not
4570 symbols. */
4571 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4572 return false;
4574 fmt = GET_RTX_FORMAT (GET_CODE (x));
4575 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4577 if (fmt[i] == 'E')
4579 int j;
4581 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4582 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4583 return true;
4585 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4586 return true;
4589 return false;
4592 /* Implement REGNO_REG_CLASS. */
4594 enum reg_class
4595 aarch64_regno_regclass (unsigned regno)
4597 if (GP_REGNUM_P (regno))
4598 return GENERAL_REGS;
4600 if (regno == SP_REGNUM)
4601 return STACK_REG;
4603 if (regno == FRAME_POINTER_REGNUM
4604 || regno == ARG_POINTER_REGNUM)
4605 return POINTER_REGS;
4607 if (FP_REGNUM_P (regno))
4608 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4610 return NO_REGS;
4613 static rtx
4614 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4616 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4617 where mask is selected by alignment and size of the offset.
4618 We try to pick as large a range for the offset as possible to
4619 maximize the chance of a CSE. However, for aligned addresses
4620 we limit the range to 4k so that structures with different sized
4621 elements are likely to use the same base. */
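/* As an illustration of the scheme above (values chosen for the
   example): an aligned DImode access at (plus X (const_int 0x2068))
   is rewritten below as Y = X + 0x2000 followed by (plus Y 0x68);
   the 0x68 residual fits the scaled unsigned 12-bit LDR/STR offset,
   and other accesses near X are likely to CSE the same Y.  */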
4623 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4625 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4626 HOST_WIDE_INT base_offset;
4628 /* Does it look like we'll need a load/store-pair operation? */
4629 if (GET_MODE_SIZE (mode) > 16
4630 || mode == TImode)
4631 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4632 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4633 /* For offsets that aren't a multiple of the access size, the limit is
4634 -256...255. */
4635 else if (offset & (GET_MODE_SIZE (mode) - 1))
4636 base_offset = (offset + 0x100) & ~0x1ff;
4637 else
4638 base_offset = offset & ~0xfff;
4640 if (base_offset == 0)
4641 return x;
4643 offset -= base_offset;
4644 rtx base_reg = gen_reg_rtx (Pmode);
4645 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4646 NULL_RTX);
4647 emit_move_insn (base_reg, val);
4648 x = plus_constant (Pmode, base_reg, offset);
4651 return x;
4654 /* Try a machine-dependent way of reloading an illegitimate address
4655 operand. If we find one, push the reload and return the new rtx. */
4658 aarch64_legitimize_reload_address (rtx *x_p,
4659 machine_mode mode,
4660 int opnum, int type,
4661 int ind_levels ATTRIBUTE_UNUSED)
4663 rtx x = *x_p;
4665 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4666 if (aarch64_vect_struct_mode_p (mode)
4667 && GET_CODE (x) == PLUS
4668 && REG_P (XEXP (x, 0))
4669 && CONST_INT_P (XEXP (x, 1)))
4671 rtx orig_rtx = x;
4672 x = copy_rtx (x);
4673 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4674 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4675 opnum, (enum reload_type) type);
4676 return x;
4679 /* We must recognize output that we have already generated ourselves. */
4680 if (GET_CODE (x) == PLUS
4681 && GET_CODE (XEXP (x, 0)) == PLUS
4682 && REG_P (XEXP (XEXP (x, 0), 0))
4683 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4684 && CONST_INT_P (XEXP (x, 1)))
4686 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4687 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4688 opnum, (enum reload_type) type);
4689 return x;
4692 /* We wish to handle large displacements off a base register by splitting
4693 the addend across an add and the mem insn. This can cut the number of
4694 extra insns needed from 3 to 1.  It is only useful for a load/store of a
4695 single register with a 12-bit offset field. */
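/* As an example of the splitting below (illustrative values): for an
   SImode access at (plus (reg) (const_int 0x3458)) the addend is split
   into high = 0x3000 and low = 0x458.  The high part is a valid
   shifted 12-bit immediate, so it is folded into the reloaded base
   with a single ADD while the LDR/STR keeps an immediate offset of
   #0x458, instead of materializing the whole constant separately.  */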
4696 if (GET_CODE (x) == PLUS
4697 && REG_P (XEXP (x, 0))
4698 && CONST_INT_P (XEXP (x, 1))
4699 && HARD_REGISTER_P (XEXP (x, 0))
4700 && mode != TImode
4701 && mode != TFmode
4702 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4704 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4705 HOST_WIDE_INT low = val & 0xfff;
4706 HOST_WIDE_INT high = val - low;
4707 HOST_WIDE_INT offs;
4708 rtx cst;
4709 machine_mode xmode = GET_MODE (x);
4711 /* In ILP32, xmode can be either DImode or SImode. */
4712 gcc_assert (xmode == DImode || xmode == SImode);
4714 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4715 BLKmode alignment. */
4716 if (GET_MODE_SIZE (mode) == 0)
4717 return NULL_RTX;
4719 offs = low % GET_MODE_SIZE (mode);
4721 /* Align misaligned offset by adjusting high part to compensate. */
4722 if (offs != 0)
4724 if (aarch64_uimm12_shift (high + offs))
4726 /* Align down. */
4727 low = low - offs;
4728 high = high + offs;
4730 else
4732 /* Align up. */
4733 offs = GET_MODE_SIZE (mode) - offs;
4734 low = low + offs;
4735 high = high + (low & 0x1000) - offs;
4736 low &= 0xfff;
4740 /* Check for overflow. */
4741 if (high + low != val)
4742 return NULL_RTX;
4744 cst = GEN_INT (high);
4745 if (!aarch64_uimm12_shift (high))
4746 cst = force_const_mem (xmode, cst);
4748 /* Reload high part into base reg, leaving the low part
4749 in the mem instruction.
4750 Note that replacing this gen_rtx_PLUS with plus_constant is
4751 wrong in this case because we rely on the
4752 (plus (plus reg c1) c2) structure being preserved so that
4753 XEXP (*p, 0) in push_reload below uses the correct term. */
4754 x = gen_rtx_PLUS (xmode,
4755 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4756 GEN_INT (low));
4758 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4759 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4760 opnum, (enum reload_type) type);
4761 return x;
4764 return NULL_RTX;
4768 static reg_class_t
4769 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4770 reg_class_t rclass,
4771 machine_mode mode,
4772 secondary_reload_info *sri)
4774 /* Without the TARGET_SIMD instructions we cannot move a Q register
4775 to a Q register directly. We need a scratch. */
4776 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4777 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4778 && reg_class_subset_p (rclass, FP_REGS))
4780 if (mode == TFmode)
4781 sri->icode = CODE_FOR_aarch64_reload_movtf;
4782 else if (mode == TImode)
4783 sri->icode = CODE_FOR_aarch64_reload_movti;
4784 return NO_REGS;
4787 /* A TFmode or TImode memory access should be handled via FP_REGS
4788 because AArch64 has richer addressing modes for LDR/STR instructions
4789 than LDP/STP instructions. */
4790 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4791 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4792 return FP_REGS;
4794 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4795 return GENERAL_REGS;
4797 return NO_REGS;
4800 static bool
4801 aarch64_can_eliminate (const int from, const int to)
4803 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4804 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4806 if (frame_pointer_needed)
4808 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4809 return true;
4810 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4811 return false;
4812 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4813 && !cfun->calls_alloca)
4814 return true;
4815 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4816 return true;
4818 return false;
4820 else
4822 /* If we decided that we didn't need a leaf frame pointer but then used
4823 LR in the function, then we'll want a frame pointer after all, so
4824 prevent this elimination to ensure a frame pointer is used. */
4825 if (to == STACK_POINTER_REGNUM
4826 && flag_omit_leaf_frame_pointer
4827 && df_regs_ever_live_p (LR_REGNUM))
4828 return false;
4831 return true;
4834 HOST_WIDE_INT
4835 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4837 aarch64_layout_frame ();
4839 if (to == HARD_FRAME_POINTER_REGNUM)
4841 if (from == ARG_POINTER_REGNUM)
4842 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4844 if (from == FRAME_POINTER_REGNUM)
4845 return (cfun->machine->frame.hard_fp_offset
4846 - cfun->machine->frame.saved_varargs_size);
4849 if (to == STACK_POINTER_REGNUM)
4851 if (from == FRAME_POINTER_REGNUM)
4852 return (cfun->machine->frame.frame_size
4853 - cfun->machine->frame.saved_varargs_size);
4856 return cfun->machine->frame.frame_size;
4859 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4860 previous frame. */
4863 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4865 if (count != 0)
4866 return const0_rtx;
4867 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
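/* Sketch of the trampoline emitted below (LP64 case; ILP32 uses 32-bit
   loads and 4-byte slots, with the displacements adjusted to match):

     ldr  IP1, .+16
     ldr  STATIC_CHAIN, .+20
     br   IP1
     <4 bytes of zero padding>
     <target function address>   -- filled in by aarch64_trampoline_init
     <static chain value>        -- filled in by aarch64_trampoline_init

   With the usual AArch64 register assignments IP1 is x17 and the static
   chain register is x18, though the code below relies only on the
   IP1_REGNUM and STATIC_CHAIN_REGNUM macros.  */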
4871 static void
4872 aarch64_asm_trampoline_template (FILE *f)
4874 if (TARGET_ILP32)
4876 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4877 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4879 else
4881 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4882 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4884 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4885 assemble_aligned_integer (4, const0_rtx);
4886 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4887 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4890 static void
4891 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4893 rtx fnaddr, mem, a_tramp;
4894 const int tramp_code_sz = 16;
4896 /* Don't need to copy the trailing D-words, we fill those in below. */
4897 emit_block_move (m_tramp, assemble_trampoline_template (),
4898 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4899 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4900 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4901 if (GET_MODE (fnaddr) != ptr_mode)
4902 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4903 emit_move_insn (mem, fnaddr);
4905 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4906 emit_move_insn (mem, chain_value);
4908 /* XXX We should really define a "clear_cache" pattern and use
4909 gen_clear_cache(). */
4910 a_tramp = XEXP (m_tramp, 0);
4911 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4912 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4913 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4914 ptr_mode);
4917 static unsigned char
4918 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4920 switch (regclass)
4922 case CALLER_SAVE_REGS:
4923 case POINTER_REGS:
4924 case GENERAL_REGS:
4925 case ALL_REGS:
4926 case FP_REGS:
4927 case FP_LO_REGS:
4928 return
4929 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4930 (GET_MODE_SIZE (mode) + 7) / 8;
4931 case STACK_REG:
4932 return 1;
4934 case NO_REGS:
4935 return 0;
4937 default:
4938 break;
4940 gcc_unreachable ();
4943 static reg_class_t
4944 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4946 if (regclass == POINTER_REGS)
4947 return GENERAL_REGS;
4949 if (regclass == STACK_REG)
4951 if (REG_P(x)
4952 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4953 return regclass;
4955 return NO_REGS;
4958 /* If it's an integer immediate that MOVI can't handle, then
4959 FP_REGS is not an option, so we return NO_REGS instead. */
4960 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4961 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4962 return NO_REGS;
4964 /* Register elimination can result in a request for
4965 SP+constant->FP_REGS.  We cannot support such operations, which
4966 use SP as the source and an FP_REG as the destination, so reject
4967 such requests right now. */
4968 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4970 rtx lhs = XEXP (x, 0);
4972 /* Look through a possible SUBREG introduced by ILP32. */
4973 if (GET_CODE (lhs) == SUBREG)
4974 lhs = SUBREG_REG (lhs);
4976 gcc_assert (REG_P (lhs));
4977 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4978 POINTER_REGS));
4979 return NO_REGS;
4982 return regclass;
4985 void
4986 aarch64_asm_output_labelref (FILE* f, const char *name)
4988 asm_fprintf (f, "%U%s", name);
4991 static void
4992 aarch64_elf_asm_constructor (rtx symbol, int priority)
4994 if (priority == DEFAULT_INIT_PRIORITY)
4995 default_ctor_section_asm_out_constructor (symbol, priority);
4996 else
4998 section *s;
4999 char buf[18];
5000 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5001 s = get_section (buf, SECTION_WRITE, NULL);
5002 switch_to_section (s);
5003 assemble_align (POINTER_SIZE);
5004 assemble_aligned_integer (POINTER_BYTES, symbol);
5008 static void
5009 aarch64_elf_asm_destructor (rtx symbol, int priority)
5011 if (priority == DEFAULT_INIT_PRIORITY)
5012 default_dtor_section_asm_out_destructor (symbol, priority);
5013 else
5015 section *s;
5016 char buf[18];
5017 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5018 s = get_section (buf, SECTION_WRITE, NULL);
5019 switch_to_section (s);
5020 assemble_align (POINTER_SIZE);
5021 assemble_aligned_integer (POINTER_BYTES, symbol);
5025 const char*
5026 aarch64_output_casesi (rtx *operands)
5028 char buf[100];
5029 char label[100];
5030 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5031 int index;
5032 static const char *const patterns[4][2] =
5035 "ldrb\t%w3, [%0,%w1,uxtw]",
5036 "add\t%3, %4, %w3, sxtb #2"
5039 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5040 "add\t%3, %4, %w3, sxth #2"
5043 "ldr\t%w3, [%0,%w1,uxtw #2]",
5044 "add\t%3, %4, %w3, sxtw #2"
5046 /* We assume that DImode is only generated when not optimizing and
5047 that we don't really need 64-bit address offsets. That would
5048 imply an object file with 8GB of code in a single function! */
5050 "ldr\t%w3, [%0,%w1,uxtw #2]",
5051 "add\t%3, %4, %w3, sxtw #2"
5055 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5057 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5059 gcc_assert (index >= 0 && index <= 3);
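/* For a QImode dispatch table, the sequence emitted below is roughly
   (register numbers purely illustrative):

     ldrb  w3, [x0, w1, uxtw]
     adr   x4, .Lrtx<N>
     add   x3, x4, w3, sxtb #2
     br    x3
   .Lrtx<N>:

   where operand 0 is the table base, operand 1 the index, and operands
   3 and 4 are scratch registers.  */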
5061 /* Need to implement table size reduction, by changing the code below. */
5062 output_asm_insn (patterns[index][0], operands);
5063 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5064 snprintf (buf, sizeof (buf),
5065 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5066 output_asm_insn (buf, operands);
5067 output_asm_insn (patterns[index][1], operands);
5068 output_asm_insn ("br\t%3", operands);
5069 assemble_label (asm_out_file, label);
5070 return "";
5074 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5075 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5076 operator. */
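/* For instance, a shift of 2 with mask 0x3fc (0xff << 2) returns 8,
   i.e. the operand is usable as a "uxtb #2" extended-register operand;
   any mask that is not a shifted 0xff/0xffff/0xffffffff returns 0.  */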
5079 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5081 if (shift >= 0 && shift <= 3)
5083 int size;
5084 for (size = 8; size <= 32; size *= 2)
5086 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5087 if (mask == bits << shift)
5088 return size;
5091 return 0;
5094 static bool
5095 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5096 const_rtx x ATTRIBUTE_UNUSED)
5098 /* We can't use blocks for constants when we're using a per-function
5099 constant pool. */
5100 return false;
5103 static section *
5104 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5105 rtx x ATTRIBUTE_UNUSED,
5106 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5108 /* Force all constant pool entries into the current function section. */
5109 return function_section (current_function_decl);
5113 /* Costs. */
5115 /* Helper function for rtx cost calculation. Strip a shift expression
5116 from X. Returns the inner operand if successful, or the original
5117 expression on failure. */
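/* E.g. both (ashift (reg R) (const_int 3)) and its multiply form
   (mult (reg R) (const_int 8)) strip to (reg R); a shift by a register
   amount is returned unchanged.  */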
5118 static rtx
5119 aarch64_strip_shift (rtx x)
5121 rtx op = x;
5123 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5124 we can convert both to ROR during final output. */
5125 if ((GET_CODE (op) == ASHIFT
5126 || GET_CODE (op) == ASHIFTRT
5127 || GET_CODE (op) == LSHIFTRT
5128 || GET_CODE (op) == ROTATERT
5129 || GET_CODE (op) == ROTATE)
5130 && CONST_INT_P (XEXP (op, 1)))
5131 return XEXP (op, 0);
5133 if (GET_CODE (op) == MULT
5134 && CONST_INT_P (XEXP (op, 1))
5135 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5136 return XEXP (op, 0);
5138 return x;
5141 /* Helper function for rtx cost calculation. Strip an extend
5142 expression from X. Returns the inner operand if successful, or the
5143 original expression on failure. We deal with a number of possible
5144 canonicalization variations here. */
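/* E.g. (zero_extend (reg R)) strips to (reg R), as does the
   AND-with-immediate form (and (mult (reg R) (const_int 4))
   (const_int 0x3fc)) which combine may produce for a zero-extended,
   scaled index.  */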
5145 static rtx
5146 aarch64_strip_extend (rtx x)
5148 rtx op = x;
5150 /* Zero and sign extraction of a widened value. */
5151 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5152 && XEXP (op, 2) == const0_rtx
5153 && GET_CODE (XEXP (op, 0)) == MULT
5154 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5155 XEXP (op, 1)))
5156 return XEXP (XEXP (op, 0), 0);
5158 /* It can also be represented (for zero-extend) as an AND with an
5159 immediate. */
5160 if (GET_CODE (op) == AND
5161 && GET_CODE (XEXP (op, 0)) == MULT
5162 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5163 && CONST_INT_P (XEXP (op, 1))
5164 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5165 INTVAL (XEXP (op, 1))) != 0)
5166 return XEXP (XEXP (op, 0), 0);
5168 /* Now handle extended register, as this may also have an optional
5169 left shift by 1..4. */
5170 if (GET_CODE (op) == ASHIFT
5171 && CONST_INT_P (XEXP (op, 1))
5172 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5173 op = XEXP (op, 0);
5175 if (GET_CODE (op) == ZERO_EXTEND
5176 || GET_CODE (op) == SIGN_EXTEND)
5177 op = XEXP (op, 0);
5179 if (op != x)
5180 return op;
5182 return x;
5185 /* Return true iff CODE is a shift supported in combination
5186 with arithmetic instructions. */
5188 static bool
5189 aarch64_shift_p (enum rtx_code code)
5191 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5194 /* Helper function for rtx cost calculation. Calculate the cost of
5195 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5196 Return the calculated cost of the expression, recursing manually into
5197 operands where needed. */
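/* For instance, when costing (plus (mult (reg R) (const_int 4)) (reg S))
   the MULT reaches this function with OUTER == PLUS; since the multiply
   is really a shift by 2, it is costed as an arith + shift-by-immediate
   operation (an ADD with a shifted register) rather than as a real
   multiply.  */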
5199 static int
5200 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5202 rtx op0, op1;
5203 const struct cpu_cost_table *extra_cost
5204 = aarch64_tune_params->insn_extra_cost;
5205 int cost = 0;
5206 bool compound_p = (outer == PLUS || outer == MINUS);
5207 machine_mode mode = GET_MODE (x);
5209 gcc_checking_assert (code == MULT);
5211 op0 = XEXP (x, 0);
5212 op1 = XEXP (x, 1);
5214 if (VECTOR_MODE_P (mode))
5215 mode = GET_MODE_INNER (mode);
5217 /* Integer multiply/fma. */
5218 if (GET_MODE_CLASS (mode) == MODE_INT)
5220 /* The multiply will be canonicalized as a shift, cost it as such. */
5221 if (aarch64_shift_p (GET_CODE (x))
5222 || (CONST_INT_P (op1)
5223 && exact_log2 (INTVAL (op1)) > 0))
5225 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5226 || GET_CODE (op0) == SIGN_EXTEND;
5227 if (speed)
5229 if (compound_p)
5231 if (REG_P (op1))
5232 /* ARITH + shift-by-register. */
5233 cost += extra_cost->alu.arith_shift_reg;
5234 else if (is_extend)
5235 /* ARITH + extended register. We don't have a cost field
5236 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5237 cost += extra_cost->alu.extend_arith;
5238 else
5239 /* ARITH + shift-by-immediate. */
5240 cost += extra_cost->alu.arith_shift;
5242 else
5243 /* LSL (immediate). */
5244 cost += extra_cost->alu.shift;
5247 /* Strip extends as we will have costed them in the case above. */
5248 if (is_extend)
5249 op0 = aarch64_strip_extend (op0);
5251 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5253 return cost;
5256 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5257 compound and let the below cases handle it. After all, MNEG is a
5258 special-case alias of MSUB. */
5259 if (GET_CODE (op0) == NEG)
5261 op0 = XEXP (op0, 0);
5262 compound_p = true;
5265 /* Integer multiplies or FMAs have zero/sign extending variants. */
5266 if ((GET_CODE (op0) == ZERO_EXTEND
5267 && GET_CODE (op1) == ZERO_EXTEND)
5268 || (GET_CODE (op0) == SIGN_EXTEND
5269 && GET_CODE (op1) == SIGN_EXTEND))
5271 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5272 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5274 if (speed)
5276 if (compound_p)
5277 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5278 cost += extra_cost->mult[0].extend_add;
5279 else
5280 /* MUL/SMULL/UMULL. */
5281 cost += extra_cost->mult[0].extend;
5284 return cost;
5287 /* This is either an integer multiply or a MADD. In both cases
5288 we want to recurse and cost the operands. */
5289 cost += rtx_cost (op0, MULT, 0, speed)
5290 + rtx_cost (op1, MULT, 1, speed);
5292 if (speed)
5294 if (compound_p)
5295 /* MADD/MSUB. */
5296 cost += extra_cost->mult[mode == DImode].add;
5297 else
5298 /* MUL. */
5299 cost += extra_cost->mult[mode == DImode].simple;
5302 return cost;
5304 else
5306 if (speed)
5308 /* Floating-point FMA/FMUL can also support negations of the
5309 operands. */
5310 if (GET_CODE (op0) == NEG)
5311 op0 = XEXP (op0, 0);
5312 if (GET_CODE (op1) == NEG)
5313 op1 = XEXP (op1, 0);
5315 if (compound_p)
5316 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5317 cost += extra_cost->fp[mode == DFmode].fma;
5318 else
5319 /* FMUL/FNMUL. */
5320 cost += extra_cost->fp[mode == DFmode].mult;
5323 cost += rtx_cost (op0, MULT, 0, speed)
5324 + rtx_cost (op1, MULT, 1, speed);
5325 return cost;
5329 static int
5330 aarch64_address_cost (rtx x,
5331 machine_mode mode,
5332 addr_space_t as ATTRIBUTE_UNUSED,
5333 bool speed)
5335 enum rtx_code c = GET_CODE (x);
5336 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5337 struct aarch64_address_info info;
5338 int cost = 0;
5339 info.shift = 0;
5341 if (!aarch64_classify_address (&info, x, mode, c, false))
5343 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5345 /* This is a CONST or SYMBOL ref which will be split
5346 in a different way depending on the code model in use.
5347 Cost it through the generic infrastructure. */
5348 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5349 /* Divide through by the cost of one instruction to
5350 bring it to the same units as the address costs. */
5351 cost_symbol_ref /= COSTS_N_INSNS (1);
5352 /* The cost is then the cost of preparing the address,
5353 followed by an immediate (possibly 0) offset. */
5354 return cost_symbol_ref + addr_cost->imm_offset;
5356 else
5358 /* This is most likely a jump table from a case
5359 statement. */
5360 return addr_cost->register_offset;
5364 switch (info.type)
5366 case ADDRESS_LO_SUM:
5367 case ADDRESS_SYMBOLIC:
5368 case ADDRESS_REG_IMM:
5369 cost += addr_cost->imm_offset;
5370 break;
5372 case ADDRESS_REG_WB:
5373 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5374 cost += addr_cost->pre_modify;
5375 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5376 cost += addr_cost->post_modify;
5377 else
5378 gcc_unreachable ();
5380 break;
5382 case ADDRESS_REG_REG:
5383 cost += addr_cost->register_offset;
5384 break;
5386 case ADDRESS_REG_UXTW:
5387 case ADDRESS_REG_SXTW:
5388 cost += addr_cost->register_extend;
5389 break;
5391 default:
5392 gcc_unreachable ();
5396 if (info.shift > 0)
5398 /* For the sake of calculating the cost of the shifted register
5399 component, we can treat same sized modes in the same way. */
5400 switch (GET_MODE_BITSIZE (mode))
5402 case 16:
5403 cost += addr_cost->addr_scale_costs.hi;
5404 break;
5406 case 32:
5407 cost += addr_cost->addr_scale_costs.si;
5408 break;
5410 case 64:
5411 cost += addr_cost->addr_scale_costs.di;
5412 break;
5414 /* We can't tell, or this is a 128-bit vector. */
5415 default:
5416 cost += addr_cost->addr_scale_costs.ti;
5417 break;
5421 return cost;
5424 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
5425 optimizing for speed.  If PREDICTABLE_P is true then the branch is
5426 expected to be well predicted. */
5429 aarch64_branch_cost (bool speed_p, bool predictable_p)
5431 /* Use the unpredictable cost only when optimizing for speed and the branch is not predictable. */
5432 const struct cpu_branch_cost *branch_costs =
5433 aarch64_tune_params->branch_costs;
5435 if (!speed_p || predictable_p)
5436 return branch_costs->predictable;
5437 else
5438 return branch_costs->unpredictable;
5441 /* Return true if the RTX X in mode MODE is a zero or sign extract
5442 usable in an ADD or SUB (extended register) instruction. */
5443 static bool
5444 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5446 /* Catch add with a sign extract.
5447 This is add_<optab><mode>_multp2. */
5448 if (GET_CODE (x) == SIGN_EXTRACT
5449 || GET_CODE (x) == ZERO_EXTRACT)
5451 rtx op0 = XEXP (x, 0);
5452 rtx op1 = XEXP (x, 1);
5453 rtx op2 = XEXP (x, 2);
5455 if (GET_CODE (op0) == MULT
5456 && CONST_INT_P (op1)
5457 && op2 == const0_rtx
5458 && CONST_INT_P (XEXP (op0, 1))
5459 && aarch64_is_extend_from_extract (mode,
5460 XEXP (op0, 1),
5461 op1))
5463 return true;
5467 return false;
5470 static bool
5471 aarch64_frint_unspec_p (unsigned int u)
5473 switch (u)
5475 case UNSPEC_FRINTZ:
5476 case UNSPEC_FRINTP:
5477 case UNSPEC_FRINTM:
5478 case UNSPEC_FRINTA:
5479 case UNSPEC_FRINTN:
5480 case UNSPEC_FRINTX:
5481 case UNSPEC_FRINTI:
5482 return true;
5484 default:
5485 return false;
5489 /* Return true iff X is an rtx that will match an extr instruction
5490 i.e. as described in the *extr<mode>5_insn family of patterns.
5491 OP0 and OP1 will be set to the operands of the shifts involved
5492 on success and will be NULL_RTX otherwise. */
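/* For example, in DImode (ior (ashift (reg A) (const_int 48))
   (lshiftrt (reg B) (const_int 16))) matches: the shift amounts sum to
   64, so *RES_OP0 is set to A, *RES_OP1 to B, and the whole expression
   can be emitted as a single EXTR with an immediate of 16.  */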
5494 static bool
5495 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5497 rtx op0, op1;
5498 machine_mode mode = GET_MODE (x);
5500 *res_op0 = NULL_RTX;
5501 *res_op1 = NULL_RTX;
5503 if (GET_CODE (x) != IOR)
5504 return false;
5506 op0 = XEXP (x, 0);
5507 op1 = XEXP (x, 1);
5509 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5510 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5512 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5513 if (GET_CODE (op1) == ASHIFT)
5514 std::swap (op0, op1);
5516 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5517 return false;
5519 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5520 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5522 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5523 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5525 *res_op0 = XEXP (op0, 0);
5526 *res_op1 = XEXP (op1, 0);
5527 return true;
5531 return false;
5534 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5535 storing it in *COST. Result is true if the total cost of the operation
5536 has now been calculated. */
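/* For example, a branch such as
     (set (pc) (if_then_else (ne (reg R) (const_int 0))
                             (label_ref L) (pc)))
   is recognized below as a CBNZ-style test against zero, so only the
   cost of (reg R) is added rather than the cost of a separate
   compare.  */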
5537 static bool
5538 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5540 rtx inner;
5541 rtx comparator;
5542 enum rtx_code cmpcode;
5544 if (COMPARISON_P (op0))
5546 inner = XEXP (op0, 0);
5547 comparator = XEXP (op0, 1);
5548 cmpcode = GET_CODE (op0);
5550 else
5552 inner = op0;
5553 comparator = const0_rtx;
5554 cmpcode = NE;
5557 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5559 /* Conditional branch. */
5560 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5561 return true;
5562 else
5564 if (cmpcode == NE || cmpcode == EQ)
5566 if (comparator == const0_rtx)
5568 /* TBZ/TBNZ/CBZ/CBNZ. */
5569 if (GET_CODE (inner) == ZERO_EXTRACT)
5570 /* TBZ/TBNZ. */
5571 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5572 0, speed);
5573 else
5574 /* CBZ/CBNZ. */
5575 *cost += rtx_cost (inner, cmpcode, 0, speed);
5577 return true;
5580 else if (cmpcode == LT || cmpcode == GE)
5582 /* TBZ/TBNZ. */
5583 if (comparator == const0_rtx)
5584 return true;
5588 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5590 /* It's a conditional operation based on the status flags,
5591 so it must be some flavor of CSEL. */
5593 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5594 if (GET_CODE (op1) == NEG
5595 || GET_CODE (op1) == NOT
5596 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5597 op1 = XEXP (op1, 0);
5599 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5600 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5601 return true;
5604 /* We don't know what this is, cost all operands. */
5605 return false;
5608 /* Calculate the cost of calculating X, storing it in *COST. Result
5609 is true if the total cost of the operation has now been calculated. */
5610 static bool
5611 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5612 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5614 rtx op0, op1, op2;
5615 const struct cpu_cost_table *extra_cost
5616 = aarch64_tune_params->insn_extra_cost;
5617 machine_mode mode = GET_MODE (x);
5619 /* By default, assume that everything has equivalent cost to the
5620 cheapest instruction. Any additional costs are applied as a delta
5621 above this default. */
5622 *cost = COSTS_N_INSNS (1);
5624 /* TODO: The cost infrastructure currently does not handle
5625 vector operations. Assume that all vector operations
5626 are equally expensive. */
5627 if (VECTOR_MODE_P (mode))
5629 if (speed)
5630 *cost += extra_cost->vect.alu;
5631 return true;
5634 switch (code)
5636 case SET:
5637 /* The cost depends entirely on the operands to SET. */
5638 *cost = 0;
5639 op0 = SET_DEST (x);
5640 op1 = SET_SRC (x);
5642 switch (GET_CODE (op0))
5644 case MEM:
5645 if (speed)
5647 rtx address = XEXP (op0, 0);
5648 if (GET_MODE_CLASS (mode) == MODE_INT)
5649 *cost += extra_cost->ldst.store;
5650 else if (mode == SFmode)
5651 *cost += extra_cost->ldst.storef;
5652 else if (mode == DFmode)
5653 *cost += extra_cost->ldst.stored;
5655 *cost +=
5656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5657 0, speed));
5660 *cost += rtx_cost (op1, SET, 1, speed);
5661 return true;
5663 case SUBREG:
5664 if (! REG_P (SUBREG_REG (op0)))
5665 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5667 /* Fall through. */
5668 case REG:
5669 /* const0_rtx is in general free, but we will use an
5670 instruction to set a register to 0. */
5671 if (REG_P (op1) || op1 == const0_rtx)
5673 /* The cost is 1 per register copied. */
5674 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5675 / UNITS_PER_WORD;
5676 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5678 else
5679 /* Cost is just the cost of the RHS of the set. */
5680 *cost += rtx_cost (op1, SET, 1, speed);
5681 return true;
5683 case ZERO_EXTRACT:
5684 case SIGN_EXTRACT:
5685 /* Bit-field insertion. Strip any redundant widening of
5686 the RHS to meet the width of the target. */
5687 if (GET_CODE (op1) == SUBREG)
5688 op1 = SUBREG_REG (op1);
5689 if ((GET_CODE (op1) == ZERO_EXTEND
5690 || GET_CODE (op1) == SIGN_EXTEND)
5691 && CONST_INT_P (XEXP (op0, 1))
5692 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5693 >= INTVAL (XEXP (op0, 1))))
5694 op1 = XEXP (op1, 0);
5696 if (CONST_INT_P (op1))
5698 /* MOV immediate is assumed to always be cheap. */
5699 *cost = COSTS_N_INSNS (1);
5701 else
5703 /* BFM. */
5704 if (speed)
5705 *cost += extra_cost->alu.bfi;
5706 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5709 return true;
5711 default:
5712 /* We can't make sense of this, assume default cost. */
5713 *cost = COSTS_N_INSNS (1);
5714 return false;
5716 return false;
5718 case CONST_INT:
5719 /* If an instruction can incorporate a constant within the
5720 instruction, the instruction's expression avoids calling
5721 rtx_cost() on the constant. If rtx_cost() is called on a
5722 constant, then it is usually because the constant must be
5723 moved into a register by one or more instructions.
5725 The exception is constant 0, which can be expressed
5726 as XZR/WZR and is therefore free. The exception to this is
5727 if we have (set (reg) (const0_rtx)) in which case we must cost
5728 the move. However, we can catch that when we cost the SET, so
5729 we don't need to consider that here. */
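/* For example, const0_rtx is costed as 0, while a constant such as
   0x12345678 is typically built with a MOV plus a MOVK and is therefore
   costed as COSTS_N_INSNS (2).  */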
5730 if (x == const0_rtx)
5731 *cost = 0;
5732 else
5734 /* To an approximation, the cost of building any other constant is
5735 proportional to the number of instructions required to build
5736 that constant.  This is true whether we are compiling for
5737 SPEED or otherwise. */
5738 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5739 (NULL_RTX, x, false, mode));
5741 return true;
5743 case CONST_DOUBLE:
5744 if (speed)
5746 /* mov[df,sf]_aarch64. */
5747 if (aarch64_float_const_representable_p (x))
5748 /* FMOV (scalar immediate). */
5749 *cost += extra_cost->fp[mode == DFmode].fpconst;
5750 else if (!aarch64_float_const_zero_rtx_p (x))
5752 /* This will be a load from memory. */
5753 if (mode == DFmode)
5754 *cost += extra_cost->ldst.loadd;
5755 else
5756 *cost += extra_cost->ldst.loadf;
5758 else
5759 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5760 or MOV v0.s[0], wzr - neither of which is modeled by the
5761 cost tables. Just use the default cost. */
5766 return true;
5768 case MEM:
5769 if (speed)
5771 /* For loads we want the base cost of a load, plus an
5772 approximation for the additional cost of the addressing
5773 mode. */
5774 rtx address = XEXP (x, 0);
5775 if (GET_MODE_CLASS (mode) == MODE_INT)
5776 *cost += extra_cost->ldst.load;
5777 else if (mode == SFmode)
5778 *cost += extra_cost->ldst.loadf;
5779 else if (mode == DFmode)
5780 *cost += extra_cost->ldst.loadd;
5782 *cost +=
5783 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5784 0, speed));
5787 return true;
5789 case NEG:
5790 op0 = XEXP (x, 0);
5792 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5794 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5795 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5797 /* CSETM. */
5798 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5799 return true;
5802 /* Cost this as SUB wzr, X. */
5803 op0 = CONST0_RTX (GET_MODE (x));
5804 op1 = XEXP (x, 0);
5805 goto cost_minus;
5808 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5810 /* Support (neg(fma...)) as a single instruction only if
5811 sign of zeros is unimportant. This matches the decision
5812 making in aarch64.md. */
5813 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5815 /* FNMADD. */
5816 *cost = rtx_cost (op0, NEG, 0, speed);
5817 return true;
5819 if (speed)
5820 /* FNEG. */
5821 *cost += extra_cost->fp[mode == DFmode].neg;
5822 return false;
5825 return false;
5827 case CLRSB:
5828 case CLZ:
5829 if (speed)
5830 *cost += extra_cost->alu.clz;
5832 return false;
5834 case COMPARE:
5835 op0 = XEXP (x, 0);
5836 op1 = XEXP (x, 1);
5838 if (op1 == const0_rtx
5839 && GET_CODE (op0) == AND)
5841 x = op0;
5842 goto cost_logic;
5845 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5847 /* TODO: A write to the CC flags possibly costs extra, this
5848 needs encoding in the cost tables. */
5850 /* CC_ZESWPmode supports zero extend for free. */
5851 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5852 op0 = XEXP (op0, 0);
5854 /* ANDS. */
5855 if (GET_CODE (op0) == AND)
5857 x = op0;
5858 goto cost_logic;
5861 if (GET_CODE (op0) == PLUS)
5863 /* ADDS (and CMN alias). */
5864 x = op0;
5865 goto cost_plus;
5868 if (GET_CODE (op0) == MINUS)
5870 /* SUBS. */
5871 x = op0;
5872 goto cost_minus;
5875 if (GET_CODE (op1) == NEG)
5877 /* CMN. */
5878 if (speed)
5879 *cost += extra_cost->alu.arith;
5881 *cost += rtx_cost (op0, COMPARE, 0, speed);
5882 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5883 return true;
5886 /* CMP.
5888 Compare can freely swap the order of operands, and
5889 canonicalization puts the more complex operation first.
5890 But the integer MINUS logic expects the shift/extend
5891 operation in op1. */
5892 if (! (REG_P (op0)
5893 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5895 op0 = XEXP (x, 1);
5896 op1 = XEXP (x, 0);
5898 goto cost_minus;
5901 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5903 /* FCMP. */
5904 if (speed)
5905 *cost += extra_cost->fp[mode == DFmode].compare;
5907 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5909 *cost += rtx_cost (op0, COMPARE, 0, speed);
5910 /* FCMP supports constant 0.0 for no extra cost. */
5911 return true;
5913 return false;
5916 return false;
5918 case MINUS:
5920 op0 = XEXP (x, 0);
5921 op1 = XEXP (x, 1);
5923 cost_minus:
5924 *cost += rtx_cost (op0, MINUS, 0, speed);
5926 /* Detect valid immediates. */
5927 if ((GET_MODE_CLASS (mode) == MODE_INT
5928 || (GET_MODE_CLASS (mode) == MODE_CC
5929 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5930 && CONST_INT_P (op1)
5931 && aarch64_uimm12_shift (INTVAL (op1)))
5933 if (speed)
5934 /* SUB(S) (immediate). */
5935 *cost += extra_cost->alu.arith;
5936 return true;
5939 /* Look for SUB (extended register). */
5940 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5942 if (speed)
5943 *cost += extra_cost->alu.extend_arith;
5945 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5946 (enum rtx_code) GET_CODE (op1),
5947 0, speed);
5948 return true;
5951 rtx new_op1 = aarch64_strip_extend (op1);
5953 /* Cost this as an FMA-alike operation. */
5954 if ((GET_CODE (new_op1) == MULT
5955 || aarch64_shift_p (GET_CODE (new_op1)))
5956 && code != COMPARE)
5958 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5959 (enum rtx_code) code,
5960 speed);
5961 return true;
5964 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5966 if (speed)
5968 if (GET_MODE_CLASS (mode) == MODE_INT)
5969 /* SUB(S). */
5970 *cost += extra_cost->alu.arith;
5971 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5972 /* FSUB. */
5973 *cost += extra_cost->fp[mode == DFmode].addsub;
5975 return true;
5978 case PLUS:
5980 rtx new_op0;
5982 op0 = XEXP (x, 0);
5983 op1 = XEXP (x, 1);
5985 cost_plus:
5986 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5987 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5989 /* CSINC. */
5990 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5991 *cost += rtx_cost (op1, PLUS, 1, speed);
5992 return true;
5995 if (GET_MODE_CLASS (mode) == MODE_INT
5996 && CONST_INT_P (op1)
5997 && aarch64_uimm12_shift (INTVAL (op1)))
5999 *cost += rtx_cost (op0, PLUS, 0, speed);
6001 if (speed)
6002 /* ADD (immediate). */
6003 *cost += extra_cost->alu.arith;
6004 return true;
6007 *cost += rtx_cost (op1, PLUS, 1, speed);
6009 /* Look for ADD (extended register). */
6010 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6012 if (speed)
6013 *cost += extra_cost->alu.extend_arith;
6015 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6016 (enum rtx_code) GET_CODE (op0),
6017 0, speed);
6018 return true;
6021 /* Strip any extend, leave shifts behind as we will
6022 cost them through mult_cost. */
6023 new_op0 = aarch64_strip_extend (op0);
6025 if (GET_CODE (new_op0) == MULT
6026 || aarch64_shift_p (GET_CODE (new_op0)))
6028 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6029 speed);
6030 return true;
6033 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6035 if (speed)
6037 if (GET_MODE_CLASS (mode) == MODE_INT)
6038 /* ADD. */
6039 *cost += extra_cost->alu.arith;
6040 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6041 /* FADD. */
6042 *cost += extra_cost->fp[mode == DFmode].addsub;
6044 return true;
6047 case BSWAP:
6048 *cost = COSTS_N_INSNS (1);
6050 if (speed)
6051 *cost += extra_cost->alu.rev;
6053 return false;
6055 case IOR:
6056 if (aarch_rev16_p (x))
6058 *cost = COSTS_N_INSNS (1);
6060 if (speed)
6061 *cost += extra_cost->alu.rev;
6063 return true;
6066 if (aarch64_extr_rtx_p (x, &op0, &op1))
6068 *cost += rtx_cost (op0, IOR, 0, speed)
6069 + rtx_cost (op1, IOR, 1, speed);
6070 if (speed)
6071 *cost += extra_cost->alu.shift;
6073 return true;
6075 /* Fall through. */
6076 case XOR:
6077 case AND:
6078 cost_logic:
6079 op0 = XEXP (x, 0);
6080 op1 = XEXP (x, 1);
6082 if (code == AND
6083 && GET_CODE (op0) == MULT
6084 && CONST_INT_P (XEXP (op0, 1))
6085 && CONST_INT_P (op1)
6086 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6087 INTVAL (op1)) != 0)
6089 /* This is a UBFM/SBFM. */
6090 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6091 if (speed)
6092 *cost += extra_cost->alu.bfx;
6093 return true;
6096 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6098 /* We possibly get the immediate for free, this is not
6099 modelled. */
6100 if (CONST_INT_P (op1)
6101 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6103 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6105 if (speed)
6106 *cost += extra_cost->alu.logical;
6108 return true;
6110 else
6112 rtx new_op0 = op0;
6114 /* Handle ORN, EON, or BIC. */
6115 if (GET_CODE (op0) == NOT)
6116 op0 = XEXP (op0, 0);
6118 new_op0 = aarch64_strip_shift (op0);
6120 /* If we had a shift on op0 then this is a logical-shift-
6121 by-register/immediate operation. Otherwise, this is just
6122 a logical operation. */
6123 if (speed)
6125 if (new_op0 != op0)
6127 /* Shift by immediate. */
6128 if (CONST_INT_P (XEXP (op0, 1)))
6129 *cost += extra_cost->alu.log_shift;
6130 else
6131 *cost += extra_cost->alu.log_shift_reg;
6133 else
6134 *cost += extra_cost->alu.logical;
6137 /* In both cases we want to cost both operands. */
6138 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6139 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6141 return true;
6144 return false;
6146 case NOT:
6147 x = XEXP (x, 0);
6148 op0 = aarch64_strip_shift (x);
6150 /* MVN-shifted-reg. */
6151 if (op0 != x)
6153 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6155 if (speed)
6156 *cost += extra_cost->alu.log_shift;
6158 return true;
6160 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6161 Handle the second form here taking care that 'a' in the above can
6162 be a shift. */
6163 else if (GET_CODE (op0) == XOR)
6165 rtx newop0 = XEXP (op0, 0);
6166 rtx newop1 = XEXP (op0, 1);
6167 rtx op0_stripped = aarch64_strip_shift (newop0);
6169 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6170 + rtx_cost (op0_stripped, XOR, 0, speed);
6172 if (speed)
6174 if (op0_stripped != newop0)
6175 *cost += extra_cost->alu.log_shift;
6176 else
6177 *cost += extra_cost->alu.logical;
6180 return true;
6182 /* MVN. */
6183 if (speed)
6184 *cost += extra_cost->alu.logical;
6186 return false;
6188 case ZERO_EXTEND:
6190 op0 = XEXP (x, 0);
6191 /* If a value is written in SI mode, then zero extended to DI
6192 mode, the operation will in general be free as a write to
6193 a 'w' register implicitly zeroes the upper bits of an 'x'
6194 register. However, if this is
6196 (set (reg) (zero_extend (reg)))
6198 we must cost the explicit register move. */
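/* E.g. (set (reg:DI X) (zero_extend:DI (plus:SI ...))) costs only the
   SImode PLUS, because writing the 'w' view of X zeroes the upper half
   anyway, whereas (set (reg:DI X) (zero_extend:DI (reg:SI Y))) still
   needs an explicit register move and is costed accordingly.  */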
6199 if (mode == DImode
6200 && GET_MODE (op0) == SImode
6201 && outer == SET)
6203 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6205 if (!op_cost && speed)
6206 /* MOV. */
6207 *cost += extra_cost->alu.extend;
6208 else
6209 /* Free, the cost is that of the SI mode operation. */
6210 *cost = op_cost;
6212 return true;
6214 else if (MEM_P (XEXP (x, 0)))
6216 /* All loads can zero extend to any size for free. */
6217 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6218 return true;
6221 /* UXTB/UXTH. */
6222 if (speed)
6223 *cost += extra_cost->alu.extend;
6225 return false;
6227 case SIGN_EXTEND:
6228 if (MEM_P (XEXP (x, 0)))
6230 /* LDRSH. */
6231 if (speed)
6233 rtx address = XEXP (XEXP (x, 0), 0);
6234 *cost += extra_cost->ldst.load_sign_extend;
6236 *cost +=
6237 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6238 0, speed));
6240 return true;
6243 if (speed)
6244 *cost += extra_cost->alu.extend;
6245 return false;
6247 case ASHIFT:
6248 op0 = XEXP (x, 0);
6249 op1 = XEXP (x, 1);
6251 if (CONST_INT_P (op1))
6253 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
6254 aliases. */
6255 if (speed)
6256 *cost += extra_cost->alu.shift;
6258 /* We can incorporate zero/sign extend for free. */
6259 if (GET_CODE (op0) == ZERO_EXTEND
6260 || GET_CODE (op0) == SIGN_EXTEND)
6261 op0 = XEXP (op0, 0);
6263 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6264 return true;
6266 else
6268 /* LSLV. */
6269 if (speed)
6270 *cost += extra_cost->alu.shift_reg;
6272 return false; /* All arguments need to be in registers. */
6275 case ROTATE:
6276 case ROTATERT:
6277 case LSHIFTRT:
6278 case ASHIFTRT:
6279 op0 = XEXP (x, 0);
6280 op1 = XEXP (x, 1);
6282 if (CONST_INT_P (op1))
6284 /* ASR (immediate) and friends. */
6285 if (speed)
6286 *cost += extra_cost->alu.shift;
6288 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6289 return true;
6291 else
6294 /* ASR (register) and friends. */
6295 if (speed)
6296 *cost += extra_cost->alu.shift_reg;
6298 return false; /* All arguments need to be in registers. */
6301 case SYMBOL_REF:
6303 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6305 /* LDR. */
6306 if (speed)
6307 *cost += extra_cost->ldst.load;
6309 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6310 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6312 /* ADRP, followed by ADD. */
6313 *cost += COSTS_N_INSNS (1);
6314 if (speed)
6315 *cost += 2 * extra_cost->alu.arith;
6317 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6318 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6320 /* ADR. */
6321 if (speed)
6322 *cost += extra_cost->alu.arith;
6325 if (flag_pic)
6327 /* One extra load instruction, after accessing the GOT. */
6328 *cost += COSTS_N_INSNS (1);
6329 if (speed)
6330 *cost += extra_cost->ldst.load;
6332 return true;
6334 case HIGH:
6335 case LO_SUM:
6336 /* ADRP/ADD (immediate). */
6337 if (speed)
6338 *cost += extra_cost->alu.arith;
6339 return true;
6341 case ZERO_EXTRACT:
6342 case SIGN_EXTRACT:
6343 /* UBFX/SBFX. */
6344 if (speed)
6345 *cost += extra_cost->alu.bfx;
6347 /* We can trust that the immediates used will be correct (there
6348 are no by-register forms), so we need only cost op0. */
6349 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6350 return true;
6352 case MULT:
6353 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6354 /* aarch64_rtx_mult_cost always handles recursion to its
6355 operands. */
6356 return true;
6358 case MOD:
6359 case UMOD:
6360 if (speed)
6362 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6363 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6364 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6365 else if (GET_MODE (x) == DFmode)
6366 *cost += (extra_cost->fp[1].mult
6367 + extra_cost->fp[1].div);
6368 else if (GET_MODE (x) == SFmode)
6369 *cost += (extra_cost->fp[0].mult
6370 + extra_cost->fp[0].div);
6372 return false; /* All arguments need to be in registers. */
6374 case DIV:
6375 case UDIV:
6376 case SQRT:
6377 if (speed)
6379 if (GET_MODE_CLASS (mode) == MODE_INT)
6380 /* There is no integer SQRT, so only DIV and UDIV can get
6381 here. */
6382 *cost += extra_cost->mult[mode == DImode].idiv;
6383 else
6384 *cost += extra_cost->fp[mode == DFmode].div;
6386 return false; /* All arguments need to be in registers. */
6388 case IF_THEN_ELSE:
6389 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6390 XEXP (x, 2), cost, speed);
6392 case EQ:
6393 case NE:
6394 case GT:
6395 case GTU:
6396 case LT:
6397 case LTU:
6398 case GE:
6399 case GEU:
6400 case LE:
6401 case LEU:
6403 return false; /* All arguments must be in registers. */
6405 case FMA:
6406 op0 = XEXP (x, 0);
6407 op1 = XEXP (x, 1);
6408 op2 = XEXP (x, 2);
6410 if (speed)
6411 *cost += extra_cost->fp[mode == DFmode].fma;
6413 /* FMSUB, FNMADD, and FNMSUB are free. */
6414 if (GET_CODE (op0) == NEG)
6415 op0 = XEXP (op0, 0);
6417 if (GET_CODE (op2) == NEG)
6418 op2 = XEXP (op2, 0);
6420 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6421 and the by-element operand as operand 0. */
6422 if (GET_CODE (op1) == NEG)
6423 op1 = XEXP (op1, 0);
6425 /* Catch vector-by-element operations. The by-element operand can
6426 either be (vec_duplicate (vec_select (x))) or just
6427 (vec_select (x)), depending on whether we are multiplying by
6428 a vector or a scalar.
6430 Canonicalization is not very good in these cases: FMA4 will put the
6431 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6432 if (GET_CODE (op0) == VEC_DUPLICATE)
6433 op0 = XEXP (op0, 0);
6434 else if (GET_CODE (op1) == VEC_DUPLICATE)
6435 op1 = XEXP (op1, 0);
6437 if (GET_CODE (op0) == VEC_SELECT)
6438 op0 = XEXP (op0, 0);
6439 else if (GET_CODE (op1) == VEC_SELECT)
6440 op1 = XEXP (op1, 0);
6442 /* If the remaining parameters are not registers,
6443 get the cost to put them into registers. */
6444 *cost += rtx_cost (op0, FMA, 0, speed);
6445 *cost += rtx_cost (op1, FMA, 1, speed);
6446 *cost += rtx_cost (op2, FMA, 2, speed);
6447 return true;
6449 case FLOAT:
6450 case UNSIGNED_FLOAT:
6451 if (speed)
6452 *cost += extra_cost->fp[mode == DFmode].fromint;
6453 return false;
6455 case FLOAT_EXTEND:
6456 if (speed)
6457 *cost += extra_cost->fp[mode == DFmode].widen;
6458 return false;
6460 case FLOAT_TRUNCATE:
6461 if (speed)
6462 *cost += extra_cost->fp[mode == DFmode].narrow;
6463 return false;
6465 case FIX:
6466 case UNSIGNED_FIX:
6467 x = XEXP (x, 0);
6468 /* Strip the rounding part. They will all be implemented
6469 by the fcvt* family of instructions anyway. */
6470 if (GET_CODE (x) == UNSPEC)
6472 unsigned int uns_code = XINT (x, 1);
6474 if (uns_code == UNSPEC_FRINTA
6475 || uns_code == UNSPEC_FRINTM
6476 || uns_code == UNSPEC_FRINTN
6477 || uns_code == UNSPEC_FRINTP
6478 || uns_code == UNSPEC_FRINTZ)
6479 x = XVECEXP (x, 0, 0);
6482 if (speed)
6483 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6485 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6486 return true;
6488 case ABS:
6489 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6491 op0 = XEXP (x, 0);
6493 /* FABD, which is analogous to FADD. */
6494 if (GET_CODE (op0) == MINUS)
6496 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6497 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6498 if (speed)
6499 *cost += extra_cost->fp[mode == DFmode].addsub;
6501 return true;
6503 /* Simple FABS is analogous to FNEG. */
6504 if (speed)
6505 *cost += extra_cost->fp[mode == DFmode].neg;
6507 else
6509 /* Integer ABS will either be split to
6510 two arithmetic instructions, or will be an ABS
6511 (scalar), which we don't model. */
6512 *cost = COSTS_N_INSNS (2);
6513 if (speed)
6514 *cost += 2 * extra_cost->alu.arith;
6516 return false;
6518 case SMAX:
6519 case SMIN:
6520 if (speed)
6522 /* FMAXNM/FMINNM/FMAX/FMIN.
6523 TODO: This may not be accurate for all implementations, but
6524 we do not model this in the cost tables. */
6525 *cost += extra_cost->fp[mode == DFmode].addsub;
6527 return false;
6529 case UNSPEC:
6530 /* The floating point round to integer frint* instructions. */
6531 if (aarch64_frint_unspec_p (XINT (x, 1)))
6533 if (speed)
6534 *cost += extra_cost->fp[mode == DFmode].roundint;
6536 return false;
6539 if (XINT (x, 1) == UNSPEC_RBIT)
6541 if (speed)
6542 *cost += extra_cost->alu.rev;
6544 return false;
6546 break;
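/* Matched piece by piece below is the <su>muldi3_highpart RTL:
     (truncate:DI
       (lshiftrt:TI (mult:TI (ANY_EXTEND:TI (reg:DI))
                             (ANY_EXTEND:TI (reg:DI)))
                    (const_int 64)))
   i.e. the high 64 bits of a widening 64x64->128-bit multiply, which is
   a single UMULH/SMULH instruction. */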
6548 case TRUNCATE:
6550 /* Decompose <su>muldi3_highpart. */
6551 if (/* (truncate:DI */
6552 mode == DImode
6553 /* (lshiftrt:TI */
6554 && GET_MODE (XEXP (x, 0)) == TImode
6555 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6556 /* (mult:TI */
6557 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6558 /* (ANY_EXTEND:TI (reg:DI))
6559 (ANY_EXTEND:TI (reg:DI))) */
6560 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6561 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6562 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6563 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6564 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6565 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6566 /* (const_int 64) */
6567 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6568 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6570 /* UMULH/SMULH. */
6571 if (speed)
6572 *cost += extra_cost->mult[mode == DImode].extend;
6573 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6574 MULT, 0, speed);
6575 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6576 MULT, 1, speed);
6577 return true;
6580 /* Fall through. */
6581 default:
6582 break;
6585 if (dump_file && (dump_flags & TDF_DETAILS))
6586 fprintf (dump_file,
6587 "\nFailed to cost RTX. Assuming default cost.\n");
6589 return true;
6592 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6593 calculated for X. This cost is stored in *COST. Returns true
6594 if the total cost of X was calculated. */
6595 static bool
6596 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6597 int param, int *cost, bool speed)
6599 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6601 if (dump_file && (dump_flags & TDF_DETAILS))
6603 print_rtl_single (dump_file, x);
6604 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6605 speed ? "Hot" : "Cold",
6606 *cost, result ? "final" : "partial");
6609 return result;
6612 static int
6613 aarch64_register_move_cost (machine_mode mode,
6614 reg_class_t from_i, reg_class_t to_i)
6616 enum reg_class from = (enum reg_class) from_i;
6617 enum reg_class to = (enum reg_class) to_i;
6618 const struct cpu_regmove_cost *regmove_cost
6619 = aarch64_tune_params->regmove_cost;
6621 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6622 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6623 to = GENERAL_REGS;
6625 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6626 from = GENERAL_REGS;
6628 /* Moving between GPR and stack cost is the same as GP2GP. */
6629 if ((from == GENERAL_REGS && to == STACK_REG)
6630 || (to == GENERAL_REGS && from == STACK_REG))
6631 return regmove_cost->GP2GP;
6633 /* To/From the stack register, we move via the gprs. */
6634 if (to == STACK_REG || from == STACK_REG)
6635 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6636 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6638 if (GET_MODE_SIZE (mode) == 16)
6640 /* 128-bit operations on general registers require 2 instructions. */
6641 if (from == GENERAL_REGS && to == GENERAL_REGS)
6642 return regmove_cost->GP2GP * 2;
6643 else if (from == GENERAL_REGS)
6644 return regmove_cost->GP2FP * 2;
6645 else if (to == GENERAL_REGS)
6646 return regmove_cost->FP2GP * 2;
6648 /* When AdvSIMD instructions are disabled it is not possible to move
6649 a 128-bit value directly between Q registers. This is handled in
6650 secondary reload. A general register is used as a scratch to move
6651 the upper DI value and the lower DI value is moved directly,
6652 hence the cost is the sum of three moves. */
6653 if (! TARGET_SIMD)
6654 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6656 return regmove_cost->FP2FP;
6659 if (from == GENERAL_REGS && to == GENERAL_REGS)
6660 return regmove_cost->GP2GP;
6661 else if (from == GENERAL_REGS)
6662 return regmove_cost->GP2FP;
6663 else if (to == GENERAL_REGS)
6664 return regmove_cost->FP2GP;
6666 return regmove_cost->FP2FP;
6669 static int
6670 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6671 reg_class_t rclass ATTRIBUTE_UNUSED,
6672 bool in ATTRIBUTE_UNUSED)
6674 return aarch64_tune_params->memmov_cost;
6677 /* Return the number of instructions that can be issued per cycle. */
6678 static int
6679 aarch64_sched_issue_rate (void)
6681 return aarch64_tune_params->issue_rate;
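/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD.  Use the
   per-core issue rate as the lookahead depth, but disable lookahead for
   single-issue cores and when scheduling for fusion. */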
6684 static int
6685 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6687 int issue_rate = aarch64_sched_issue_rate ();
6689 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6692 /* Vectorizer cost model target hooks. */
6694 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6695 static int
6696 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6697 tree vectype,
6698 int misalign ATTRIBUTE_UNUSED)
6700 unsigned elements;
6702 switch (type_of_cost)
6704 case scalar_stmt:
6705 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6707 case scalar_load:
6708 return aarch64_tune_params->vec_costs->scalar_load_cost;
6710 case scalar_store:
6711 return aarch64_tune_params->vec_costs->scalar_store_cost;
6713 case vector_stmt:
6714 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6716 case vector_load:
6717 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6719 case vector_store:
6720 return aarch64_tune_params->vec_costs->vec_store_cost;
6722 case vec_to_scalar:
6723 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6725 case scalar_to_vec:
6726 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6728 case unaligned_load:
6729 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6731 case unaligned_store:
6732 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6734 case cond_branch_taken:
6735 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6737 case cond_branch_not_taken:
6738 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6740 case vec_perm:
6741 case vec_promote_demote:
6742 return aarch64_tune_params->vec_costs->vec_stmt_cost;
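/* Rough heuristic for building a vector from scalars: cost one statement
   per pair of elements plus one, e.g. 4 / 2 + 1 = 3 for a four-element
   vector. */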
6744 case vec_construct:
6745 elements = TYPE_VECTOR_SUBPARTS (vectype);
6746 return elements / 2 + 1;
6748 default:
6749 gcc_unreachable ();
6753 /* Implement targetm.vectorize.add_stmt_cost. */
6754 static unsigned
6755 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6756 struct _stmt_vec_info *stmt_info, int misalign,
6757 enum vect_cost_model_location where)
6759 unsigned *cost = (unsigned *) data;
6760 unsigned retval = 0;
6762 if (flag_vect_cost_model)
6764 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6765 int stmt_cost =
6766 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6768 /* Statements in an inner loop relative to the loop being
6769 vectorized are weighted more heavily. The value here is
6770 a function (linear for now) of the loop nest level. */
6771 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6773 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6774 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6775 unsigned nest_level = loop_depth (loop);
6777 count *= nest_level;
6780 retval = (unsigned) (count * stmt_cost);
6781 cost[where] += retval;
6784 return retval;
6787 static void initialize_aarch64_code_model (void);
6789 /* Parse the architecture extension string. */
6791 static void
6792 aarch64_parse_extension (char *str)
6794 /* The extension string is parsed left to right. */
6795 const struct aarch64_option_extension *opt = NULL;
6797 /* Flag to say whether we are adding or removing an extension. */
6798 int adding_ext = -1;
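/* Example: given the string "+crypto+nofp", the first iteration below
   enables the "crypto" feature flags and the second disables the "fp"
   flags; each iteration consumes one '+'-prefixed modifier. */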
6800 while (str != NULL && *str != 0)
6802 char *ext;
6803 size_t len;
6805 str++;
6806 ext = strchr (str, '+');
6808 if (ext != NULL)
6809 len = ext - str;
6810 else
6811 len = strlen (str);
6813 if (len >= 2 && strncmp (str, "no", 2) == 0)
6815 adding_ext = 0;
6816 len -= 2;
6817 str += 2;
6819 else if (len > 0)
6820 adding_ext = 1;
6822 if (len == 0)
6824 error ("missing feature modifier after %qs", adding_ext ? "+"
6825 : "+no");
6826 return;
6829 /* Scan over the extensions table trying to find an exact match. */
6830 for (opt = all_extensions; opt->name != NULL; opt++)
6832 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6834 /* Add or remove the extension. */
6835 if (adding_ext)
6836 aarch64_isa_flags |= opt->flags_on;
6837 else
6838 aarch64_isa_flags &= ~(opt->flags_off);
6839 break;
6843 if (opt->name == NULL)
6845 /* Extension not found in list. */
6846 error ("unknown feature modifier %qs", str);
6847 return;
6850 str = ext;
6853 return;
6856 /* Parse the ARCH string. */
6858 static void
6859 aarch64_parse_arch (void)
6861 char *ext;
6862 const struct processor *arch;
6863 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6864 size_t len;
6866 strcpy (str, aarch64_arch_string);
6868 ext = strchr (str, '+');
6870 if (ext != NULL)
6871 len = ext - str;
6872 else
6873 len = strlen (str);
6875 if (len == 0)
6877 error ("missing arch name in -march=%qs", str);
6878 return;
6881 /* Loop through the list of supported ARCHs to find a match. */
6882 for (arch = all_architectures; arch->name != NULL; arch++)
6884 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6886 selected_arch = arch;
6887 aarch64_isa_flags = selected_arch->flags;
6889 if (!selected_cpu)
6890 selected_cpu = &all_cores[selected_arch->core];
6892 if (ext != NULL)
6894 /* ARCH string contains at least one extension. */
6895 aarch64_parse_extension (ext);
6898 if (strcmp (selected_arch->arch, selected_cpu->arch))
6900 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6901 selected_cpu->name, selected_arch->name);
6904 return;
6908 /* ARCH name not found in list. */
6909 error ("unknown value %qs for -march", str);
6910 return;
6913 /* Parse the CPU string. */
6915 static void
6916 aarch64_parse_cpu (void)
6918 char *ext;
6919 const struct processor *cpu;
6920 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6921 size_t len;
6923 strcpy (str, aarch64_cpu_string);
6925 ext = strchr (str, '+');
6927 if (ext != NULL)
6928 len = ext - str;
6929 else
6930 len = strlen (str);
6932 if (len == 0)
6934 error ("missing cpu name in -mcpu=%qs", str);
6935 return;
6938 /* Loop through the list of supported CPUs to find a match. */
6939 for (cpu = all_cores; cpu->name != NULL; cpu++)
6941 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6943 selected_cpu = cpu;
6944 aarch64_isa_flags = selected_cpu->flags;
6946 if (ext != NULL)
6948 /* CPU string contains at least one extension. */
6949 aarch64_parse_extension (ext);
6952 return;
6956 /* CPU name not found in list. */
6957 error ("unknown value %qs for -mcpu", str);
6958 return;
6961 /* Parse the TUNE string. */
6963 static void
6964 aarch64_parse_tune (void)
6966 const struct processor *cpu;
6967 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6968 strcpy (str, aarch64_tune_string);
6970 /* Loop through the list of supported CPUs to find a match. */
6971 for (cpu = all_cores; cpu->name != NULL; cpu++)
6973 if (strcmp (cpu->name, str) == 0)
6975 selected_tune = cpu;
6976 return;
6980 /* CPU name not found in list. */
6981 error ("unknown value %qs for -mtune", str);
6982 return;
6986 /* Implement TARGET_OPTION_OVERRIDE. */
6988 static void
6989 aarch64_override_options (void)
6991 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6992 If either of -march or -mtune is given, they override their
6993 respective component of -mcpu.
6995 So, first parse AARCH64_CPU_STRING, then the others; be careful
6996 with -march because, if -mcpu is not present on the command line,
6997 -march must set a sensible default CPU. */
6998 if (aarch64_cpu_string)
7000 aarch64_parse_cpu ();
7003 if (aarch64_arch_string)
7005 aarch64_parse_arch ();
7008 if (aarch64_tune_string)
7010 aarch64_parse_tune ();
7013 #ifndef HAVE_AS_MABI_OPTION
7014 /* The compiler may have been configured with 2.23.* binutils, which does
7015 not have support for ILP32. */
7016 if (TARGET_ILP32)
7017 error ("Assembler does not support -mabi=ilp32");
7018 #endif
7020 initialize_aarch64_code_model ();
7022 aarch64_build_bitmask_table ();
7024 /* This target defaults to strict volatile bitfields. */
7025 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7026 flag_strict_volatile_bitfields = 1;
7028 /* If the user did not specify a processor, choose the default
7029 one for them. This will be the CPU set during configuration using
7030 --with-cpu, otherwise it is "generic". */
7031 if (!selected_cpu)
7033 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7034 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
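/* TARGET_CPU_DEFAULT packs the index of the default core in its low six
   bits and the default ISA flags in the bits above them, hence the mask
   and shift used above. */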
7037 gcc_assert (selected_cpu);
7039 if (!selected_tune)
7040 selected_tune = selected_cpu;
7042 aarch64_tune_flags = selected_tune->flags;
7043 aarch64_tune = selected_tune->core;
7044 aarch64_tune_params = selected_tune->tune;
7045 aarch64_architecture_version = selected_cpu->architecture_version;
7047 if (aarch64_fix_a53_err835769 == 2)
7049 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7050 aarch64_fix_a53_err835769 = 1;
7051 #else
7052 aarch64_fix_a53_err835769 = 0;
7053 #endif
7056 /* If not optimizing for size, set the default
7057 alignment to what the target wants. */
7058 if (!optimize_size)
7060 if (align_loops <= 0)
7061 align_loops = aarch64_tune_params->loop_align;
7062 if (align_jumps <= 0)
7063 align_jumps = aarch64_tune_params->jump_align;
7064 if (align_functions <= 0)
7065 align_functions = aarch64_tune_params->function_align;
7068 if (AARCH64_TUNE_FMA_STEERING)
7069 aarch64_register_fma_steering ();
7071 aarch64_override_options_after_change ();
7074 /* Implement targetm.override_options_after_change. */
7076 static void
7077 aarch64_override_options_after_change (void)
7079 if (flag_omit_frame_pointer)
7080 flag_omit_leaf_frame_pointer = false;
7081 else if (flag_omit_leaf_frame_pointer)
7082 flag_omit_frame_pointer = true;
7085 static struct machine_function *
7086 aarch64_init_machine_status (void)
7088 struct machine_function *machine;
7089 machine = ggc_cleared_alloc<machine_function> ();
7090 return machine;
7093 void
7094 aarch64_init_expanders (void)
7096 init_machine_status = aarch64_init_machine_status;
7099 /* Initialize aarch64_cmodel from the requested code model, adjusting for -fpic/-fPIC and rejecting unsupported combinations. */
7100 static void
7101 initialize_aarch64_code_model (void)
7103 if (flag_pic)
7105 switch (aarch64_cmodel_var)
7107 case AARCH64_CMODEL_TINY:
7108 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7109 break;
7110 case AARCH64_CMODEL_SMALL:
7111 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7112 break;
7113 case AARCH64_CMODEL_LARGE:
7114 sorry ("code model %qs with -f%s", "large",
7115 flag_pic > 1 ? "PIC" : "pic");
7116 default:
7117 gcc_unreachable ();
7120 else
7121 aarch64_cmodel = aarch64_cmodel_var;
7124 /* Return true if SYMBOL_REF X binds locally. */
7126 static bool
7127 aarch64_symbol_binds_local_p (const_rtx x)
7129 return (SYMBOL_REF_DECL (x)
7130 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7131 : SYMBOL_REF_LOCAL_P (x));
7134 /* Return true if SYMBOL_REF X is thread local. */
7135 static bool
7136 aarch64_tls_symbol_p (rtx x)
7138 if (! TARGET_HAVE_TLS)
7139 return false;
7141 if (GET_CODE (x) != SYMBOL_REF)
7142 return false;
7144 return SYMBOL_REF_TLS_MODEL (x) != 0;
7147 /* Classify a TLS symbol into one of the TLS kinds. */
7148 enum aarch64_symbol_type
7149 aarch64_classify_tls_symbol (rtx x)
7151 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7153 switch (tls_kind)
7155 case TLS_MODEL_GLOBAL_DYNAMIC:
7156 case TLS_MODEL_LOCAL_DYNAMIC:
7157 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7159 case TLS_MODEL_INITIAL_EXEC:
7160 return SYMBOL_SMALL_GOTTPREL;
7162 case TLS_MODEL_LOCAL_EXEC:
7163 return SYMBOL_SMALL_TPREL;
7165 case TLS_MODEL_EMULATED:
7166 case TLS_MODEL_NONE:
7167 return SYMBOL_FORCE_TO_MEM;
7169 default:
7170 gcc_unreachable ();
7174 /* Return the method that should be used to access SYMBOL_REF or
7175 LABEL_REF X in context CONTEXT. */
7177 enum aarch64_symbol_type
7178 aarch64_classify_symbol (rtx x, rtx offset,
7179 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7181 if (GET_CODE (x) == LABEL_REF)
7183 switch (aarch64_cmodel)
7185 case AARCH64_CMODEL_LARGE:
7186 return SYMBOL_FORCE_TO_MEM;
7188 case AARCH64_CMODEL_TINY_PIC:
7189 case AARCH64_CMODEL_TINY:
7190 return SYMBOL_TINY_ABSOLUTE;
7192 case AARCH64_CMODEL_SMALL_PIC:
7193 case AARCH64_CMODEL_SMALL:
7194 return SYMBOL_SMALL_ABSOLUTE;
7196 default:
7197 gcc_unreachable ();
7201 if (GET_CODE (x) == SYMBOL_REF)
7203 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7204 return SYMBOL_FORCE_TO_MEM;
7206 if (aarch64_tls_symbol_p (x))
7207 return aarch64_classify_tls_symbol (x);
7209 switch (aarch64_cmodel)
7211 case AARCH64_CMODEL_TINY:
7212 /* When we retrieve a symbol + offset address, we have to make sure
7213 the offset does not cause overflow of the final address. But
7214 we have no way of knowing the address of the symbol at compile time,
7215 so we can't accurately say if the distance between the PC and
7216 symbol + offset is outside the addressable range of +/-1M in the
7217 TINY code model. So we rely on images not being greater than
7218 1M, cap the offset at 1M, and anything beyond 1M will have to
7219 be loaded using an alternative mechanism. */
7220 if (SYMBOL_REF_WEAK (x)
7221 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7222 return SYMBOL_FORCE_TO_MEM;
7223 return SYMBOL_TINY_ABSOLUTE;
7225 case AARCH64_CMODEL_SMALL:
7226 /* Same reasoning as the tiny code model, but the offset cap here is
7227 4G. */
7228 if (SYMBOL_REF_WEAK (x)
7229 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7230 HOST_WIDE_INT_C (4294967264)))
7231 return SYMBOL_FORCE_TO_MEM;
7232 return SYMBOL_SMALL_ABSOLUTE;
7234 case AARCH64_CMODEL_TINY_PIC:
7235 if (!aarch64_symbol_binds_local_p (x))
7236 return SYMBOL_TINY_GOT;
7237 return SYMBOL_TINY_ABSOLUTE;
7239 case AARCH64_CMODEL_SMALL_PIC:
7240 if (!aarch64_symbol_binds_local_p (x))
7241 return SYMBOL_SMALL_GOT;
7242 return SYMBOL_SMALL_ABSOLUTE;
7244 default:
7245 gcc_unreachable ();
7249 /* By default push everything into the constant pool. */
7250 return SYMBOL_FORCE_TO_MEM;
7253 bool
7254 aarch64_constant_address_p (rtx x)
7256 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7259 bool
7260 aarch64_legitimate_pic_operand_p (rtx x)
7262 if (GET_CODE (x) == SYMBOL_REF
7263 || (GET_CODE (x) == CONST
7264 && GET_CODE (XEXP (x, 0)) == PLUS
7265 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7266 return false;
7268 return true;
7271 /* Return true if X holds either a quarter-precision or
7272 floating-point +0.0 constant. */
7273 static bool
7274 aarch64_valid_floating_const (machine_mode mode, rtx x)
7276 if (!CONST_DOUBLE_P (x))
7277 return false;
7279 /* TODO: We could handle moving 0.0 to a TFmode register,
7280 but first we would like to refactor the movtf_aarch64
7281 pattern to be more amenable to splitting moves properly
7282 and to gating correctly on TARGET_SIMD. For now, reject all
7283 constants that are not destined for SFmode or DFmode registers. */
7284 if (!(mode == SFmode || mode == DFmode))
7285 return false;
7287 if (aarch64_float_const_zero_rtx_p (x))
7288 return true;
7289 return aarch64_float_const_representable_p (x);
7292 static bool
7293 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7295 /* Do not allow vector struct mode constants. We could support
7296 0 and -1 easily, but they need support in aarch64-simd.md. */
7297 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7298 return false;
7300 /* This could probably go away because
7301 we now decompose CONST_INTs according to expand_mov_immediate. */
7302 if ((GET_CODE (x) == CONST_VECTOR
7303 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7304 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7305 return !targetm.cannot_force_const_mem (mode, x);
7307 if (GET_CODE (x) == HIGH
7308 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7309 return true;
7311 return aarch64_constant_address_p (x);
7314 rtx
7315 aarch64_load_tp (rtx target)
7317 if (!target
7318 || GET_MODE (target) != Pmode
7319 || !register_operand (target, Pmode))
7320 target = gen_reg_rtx (Pmode);
7322 /* Can return in any reg. */
7323 emit_insn (gen_aarch64_load_tp_hard (target));
7324 return target;
7327 /* On AAPCS systems, this is the "struct __va_list". */
7328 static GTY(()) tree va_list_type;
7330 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7331 Return the type to use as __builtin_va_list.
7333 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7335 struct __va_list
7337 void *__stack;
7338 void *__gr_top;
7339 void *__vr_top;
7340 int __gr_offs;
7341 int __vr_offs;
7342 }; */
7344 static tree
7345 aarch64_build_builtin_va_list (void)
7347 tree va_list_name;
7348 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7350 /* Create the type. */
7351 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7352 /* Give it the required name. */
7353 va_list_name = build_decl (BUILTINS_LOCATION,
7354 TYPE_DECL,
7355 get_identifier ("__va_list"),
7356 va_list_type);
7357 DECL_ARTIFICIAL (va_list_name) = 1;
7358 TYPE_NAME (va_list_type) = va_list_name;
7359 TYPE_STUB_DECL (va_list_type) = va_list_name;
7361 /* Create the fields. */
7362 f_stack = build_decl (BUILTINS_LOCATION,
7363 FIELD_DECL, get_identifier ("__stack"),
7364 ptr_type_node);
7365 f_grtop = build_decl (BUILTINS_LOCATION,
7366 FIELD_DECL, get_identifier ("__gr_top"),
7367 ptr_type_node);
7368 f_vrtop = build_decl (BUILTINS_LOCATION,
7369 FIELD_DECL, get_identifier ("__vr_top"),
7370 ptr_type_node);
7371 f_groff = build_decl (BUILTINS_LOCATION,
7372 FIELD_DECL, get_identifier ("__gr_offs"),
7373 integer_type_node);
7374 f_vroff = build_decl (BUILTINS_LOCATION,
7375 FIELD_DECL, get_identifier ("__vr_offs"),
7376 integer_type_node);
7378 DECL_ARTIFICIAL (f_stack) = 1;
7379 DECL_ARTIFICIAL (f_grtop) = 1;
7380 DECL_ARTIFICIAL (f_vrtop) = 1;
7381 DECL_ARTIFICIAL (f_groff) = 1;
7382 DECL_ARTIFICIAL (f_vroff) = 1;
7384 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7385 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7386 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7387 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7388 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7390 TYPE_FIELDS (va_list_type) = f_stack;
7391 DECL_CHAIN (f_stack) = f_grtop;
7392 DECL_CHAIN (f_grtop) = f_vrtop;
7393 DECL_CHAIN (f_vrtop) = f_groff;
7394 DECL_CHAIN (f_groff) = f_vroff;
7396 /* Compute its layout. */
7397 layout_type (va_list_type);
7399 return va_list_type;
7402 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7403 static void
7404 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7406 const CUMULATIVE_ARGS *cum;
7407 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7408 tree stack, grtop, vrtop, groff, vroff;
7409 tree t;
7410 int gr_save_area_size;
7411 int vr_save_area_size;
7412 int vr_offset;
7414 cum = &crtl->args.info;
7415 gr_save_area_size
7416 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7417 vr_save_area_size
7418 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7420 if (TARGET_GENERAL_REGS_ONLY)
7422 if (cum->aapcs_nvrn > 0)
7423 sorry ("%qs and floating point or vector arguments",
7424 "-mgeneral-regs-only");
7425 vr_save_area_size = 0;
7428 f_stack = TYPE_FIELDS (va_list_type_node);
7429 f_grtop = DECL_CHAIN (f_stack);
7430 f_vrtop = DECL_CHAIN (f_grtop);
7431 f_groff = DECL_CHAIN (f_vrtop);
7432 f_vroff = DECL_CHAIN (f_groff);
7434 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7435 NULL_TREE);
7436 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7437 NULL_TREE);
7438 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7439 NULL_TREE);
7440 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7441 NULL_TREE);
7442 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7443 NULL_TREE);
7445 /* Emit code to initialize STACK, which points to the next varargs stack
7446 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7447 by named arguments. STACK is 8-byte aligned. */
7448 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7449 if (cum->aapcs_stack_size > 0)
7450 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7451 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7452 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7454 /* Emit code to initialize GRTOP, the top of the GR save area.
7455 virtual_incoming_args_rtx should have been 16 byte aligned. */
7456 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7457 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7458 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7460 /* Emit code to initialize VRTOP, the top of the VR save area.
7461 This address is gr_save_area_bytes below GRTOP, rounded
7462 down to the next 16-byte boundary. */
7463 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7464 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7465 STACK_BOUNDARY / BITS_PER_UNIT);
7467 if (vr_offset)
7468 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7469 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7470 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7472 /* Emit code to initialize GROFF, the offset from GRTOP of the
7473 next GPR argument. */
7474 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7475 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7476 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7478 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7479 of the next VR argument. */
7480 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7481 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7482 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7485 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7487 static tree
7488 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7489 gimple_seq *post_p ATTRIBUTE_UNUSED)
7491 tree addr;
7492 bool indirect_p;
7493 bool is_ha; /* is HFA or HVA. */
7494 bool dw_align; /* double-word align. */
7495 machine_mode ag_mode = VOIDmode;
7496 int nregs;
7497 machine_mode mode;
7499 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7500 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7501 HOST_WIDE_INT size, rsize, adjust, align;
7502 tree t, u, cond1, cond2;
7504 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7505 if (indirect_p)
7506 type = build_pointer_type (type);
7508 mode = TYPE_MODE (type);
7510 f_stack = TYPE_FIELDS (va_list_type_node);
7511 f_grtop = DECL_CHAIN (f_stack);
7512 f_vrtop = DECL_CHAIN (f_grtop);
7513 f_groff = DECL_CHAIN (f_vrtop);
7514 f_vroff = DECL_CHAIN (f_groff);
7516 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7517 f_stack, NULL_TREE);
7518 size = int_size_in_bytes (type);
7519 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7521 dw_align = false;
7522 adjust = 0;
7523 if (aarch64_vfp_is_call_or_return_candidate (mode,
7524 type,
7525 &ag_mode,
7526 &nregs,
7527 &is_ha))
7529 /* TYPE passed in fp/simd registers. */
7530 if (TARGET_GENERAL_REGS_ONLY)
7531 sorry ("%qs and floating point or vector arguments",
7532 "-mgeneral-regs-only");
7534 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7535 unshare_expr (valist), f_vrtop, NULL_TREE);
7536 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7537 unshare_expr (valist), f_vroff, NULL_TREE);
7539 rsize = nregs * UNITS_PER_VREG;
7541 if (is_ha)
7543 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7544 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7546 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7547 && size < UNITS_PER_VREG)
7549 adjust = UNITS_PER_VREG - size;
7552 else
7554 /* TYPE passed in general registers. */
7555 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7556 unshare_expr (valist), f_grtop, NULL_TREE);
7557 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7558 unshare_expr (valist), f_groff, NULL_TREE);
7559 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7560 nregs = rsize / UNITS_PER_WORD;
7562 if (align > 8)
7563 dw_align = true;
7565 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7566 && size < UNITS_PER_WORD)
7568 adjust = UNITS_PER_WORD - size;
7572 /* Get a local temporary for the field value. */
7573 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7575 /* Emit code to branch if off >= 0. */
7576 t = build2 (GE_EXPR, boolean_type_node, off,
7577 build_int_cst (TREE_TYPE (off), 0));
7578 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7580 if (dw_align)
7582 /* Emit: offs = (offs + 15) & -16. */
7583 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7584 build_int_cst (TREE_TYPE (off), 15));
7585 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7586 build_int_cst (TREE_TYPE (off), -16));
7587 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7589 else
7590 roundup = NULL;
7592 /* Update ap.__[g|v]r_offs */
7593 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7594 build_int_cst (TREE_TYPE (off), rsize));
7595 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7597 /* String up. */
7598 if (roundup)
7599 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7601 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7602 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7603 build_int_cst (TREE_TYPE (f_off), 0));
7604 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7606 /* String up: make sure the assignment happens before the use. */
7607 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7608 COND_EXPR_ELSE (cond1) = t;
7610 /* Prepare the trees handling the argument that is passed on the stack;
7611 the top-level node will be stored in ON_STACK. */
7612 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7613 if (align > 8)
7615 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7616 t = fold_convert (intDI_type_node, arg);
7617 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7618 build_int_cst (TREE_TYPE (t), 15));
7619 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7620 build_int_cst (TREE_TYPE (t), -16));
7621 t = fold_convert (TREE_TYPE (arg), t);
7622 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7624 else
7625 roundup = NULL;
7626 /* Advance ap.__stack */
7627 t = fold_convert (intDI_type_node, arg);
7628 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7629 build_int_cst (TREE_TYPE (t), size + 7));
7630 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7631 build_int_cst (TREE_TYPE (t), -8));
7632 t = fold_convert (TREE_TYPE (arg), t);
7633 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7634 /* String up roundup and advance. */
7635 if (roundup)
7636 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7637 /* String up with arg */
7638 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7639 /* Big-endianness related address adjustment. */
7640 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7641 && size < UNITS_PER_WORD)
7643 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7644 size_int (UNITS_PER_WORD - size));
7645 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7648 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7649 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7651 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7652 t = off;
7653 if (adjust)
7654 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7655 build_int_cst (TREE_TYPE (off), adjust));
7657 t = fold_convert (sizetype, t);
7658 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7660 if (is_ha)
7662 /* type ha; // treat as "struct {ftype field[n];}"
7663 ... [computing offs]
7664 for (i = 0; i <nregs; ++i, offs += 16)
7665 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7666 return ha; */
7667 int i;
7668 tree tmp_ha, field_t, field_ptr_t;
7670 /* Declare a local variable. */
7671 tmp_ha = create_tmp_var_raw (type, "ha");
7672 gimple_add_tmp_var (tmp_ha);
7674 /* Establish the base type. */
7675 switch (ag_mode)
7677 case SFmode:
7678 field_t = float_type_node;
7679 field_ptr_t = float_ptr_type_node;
7680 break;
7681 case DFmode:
7682 field_t = double_type_node;
7683 field_ptr_t = double_ptr_type_node;
7684 break;
7685 case TFmode:
7686 field_t = long_double_type_node;
7687 field_ptr_t = long_double_ptr_type_node;
7688 break;
7689 /* Half precision and quad precision are not fully supported yet. Enable
7690 the following code once that support is complete; the correct type node
7691 for __fp16 * still needs to be found. */
7692 #if 0
7693 case HFmode:
7694 field_t = float_type_node;
7695 field_ptr_t = float_ptr_type_node;
7696 break;
7697 #endif
7698 case V2SImode:
7699 case V4SImode:
7701 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7702 field_t = build_vector_type_for_mode (innertype, ag_mode);
7703 field_ptr_t = build_pointer_type (field_t);
7705 break;
7706 default:
7707 gcc_assert (0);
7710 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7711 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7712 addr = t;
7713 t = fold_convert (field_ptr_t, addr);
7714 t = build2 (MODIFY_EXPR, field_t,
7715 build1 (INDIRECT_REF, field_t, tmp_ha),
7716 build1 (INDIRECT_REF, field_t, t));
7718 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7719 for (i = 1; i < nregs; ++i)
7721 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7722 u = fold_convert (field_ptr_t, addr);
7723 u = build2 (MODIFY_EXPR, field_t,
7724 build2 (MEM_REF, field_t, tmp_ha,
7725 build_int_cst (field_ptr_t,
7726 (i *
7727 int_size_in_bytes (field_t)))),
7728 build1 (INDIRECT_REF, field_t, u));
7729 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7732 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7733 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7736 COND_EXPR_ELSE (cond2) = t;
7737 addr = fold_convert (build_pointer_type (type), cond1);
7738 addr = build_va_arg_indirect_ref (addr);
7740 if (indirect_p)
7741 addr = build_va_arg_indirect_ref (addr);
7743 return addr;
7746 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7748 static void
7749 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7750 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7751 int no_rtl)
7753 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7754 CUMULATIVE_ARGS local_cum;
7755 int gr_saved, vr_saved;
7757 /* The caller has advanced CUM up to, but not beyond, the last named
7758 argument. Advance a local copy of CUM past the last "real" named
7759 argument, to find out how many registers are left over. */
7760 local_cum = *cum;
7761 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7763 /* Find out how many registers we need to save. */
7764 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7765 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7767 if (TARGET_GENERAL_REGS_ONLY)
7769 if (local_cum.aapcs_nvrn > 0)
7770 sorry ("%qs and floating point or vector arguments",
7771 "-mgeneral-regs-only");
7772 vr_saved = 0;
7775 if (!no_rtl)
7777 if (gr_saved > 0)
7779 rtx ptr, mem;
7781 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7782 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7783 - gr_saved * UNITS_PER_WORD);
7784 mem = gen_frame_mem (BLKmode, ptr);
7785 set_mem_alias_set (mem, get_varargs_alias_set ());
7787 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7788 mem, gr_saved);
7790 if (vr_saved > 0)
7792 /* We can't use move_block_from_reg, because it will use
7793 the wrong mode, storing D regs only. */
7794 machine_mode mode = TImode;
7795 int off, i;
7797 /* Set OFF to the offset from virtual_incoming_args_rtx of
7798 the first vector register. The VR save area lies below
7799 the GR one, and is aligned to 16 bytes. */
7800 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7801 STACK_BOUNDARY / BITS_PER_UNIT);
7802 off -= vr_saved * UNITS_PER_VREG;
7804 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7806 rtx ptr, mem;
7808 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7809 mem = gen_frame_mem (mode, ptr);
7810 set_mem_alias_set (mem, get_varargs_alias_set ());
7811 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7812 off += UNITS_PER_VREG;
7817 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7818 any complication of having crtl->args.pretend_args_size changed. */
7819 cfun->machine->frame.saved_varargs_size
7820 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7821 STACK_BOUNDARY / BITS_PER_UNIT)
7822 + vr_saved * UNITS_PER_VREG);
7825 static void
7826 aarch64_conditional_register_usage (void)
7828 int i;
7829 if (!TARGET_FLOAT)
7831 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7833 fixed_regs[i] = 1;
7834 call_used_regs[i] = 1;
7839 /* Walk down the type tree of TYPE counting consecutive base elements.
7840 If *MODEP is VOIDmode, then set it to the first valid floating point
7841 type. If a non-floating point type is found, or if a floating point
7842 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7843 otherwise return the count in the sub-tree. */
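/* For example, 'struct { double x; double y; }' gives a count of 2 with
   *MODEP set to DFmode, while 'struct { double x; float y; }' gives -1
   because the element modes differ. */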
7844 static int
7845 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7847 machine_mode mode;
7848 HOST_WIDE_INT size;
7850 switch (TREE_CODE (type))
7852 case REAL_TYPE:
7853 mode = TYPE_MODE (type);
7854 if (mode != DFmode && mode != SFmode && mode != TFmode)
7855 return -1;
7857 if (*modep == VOIDmode)
7858 *modep = mode;
7860 if (*modep == mode)
7861 return 1;
7863 break;
7865 case COMPLEX_TYPE:
7866 mode = TYPE_MODE (TREE_TYPE (type));
7867 if (mode != DFmode && mode != SFmode && mode != TFmode)
7868 return -1;
7870 if (*modep == VOIDmode)
7871 *modep = mode;
7873 if (*modep == mode)
7874 return 2;
7876 break;
7878 case VECTOR_TYPE:
7879 /* Use V2SImode and V4SImode as representatives of all 64-bit
7880 and 128-bit vector types. */
7881 size = int_size_in_bytes (type);
7882 switch (size)
7884 case 8:
7885 mode = V2SImode;
7886 break;
7887 case 16:
7888 mode = V4SImode;
7889 break;
7890 default:
7891 return -1;
7894 if (*modep == VOIDmode)
7895 *modep = mode;
7897 /* Vector modes are considered to be opaque: two vectors are
7898 equivalent for the purposes of being homogeneous aggregates
7899 if they are the same size. */
7900 if (*modep == mode)
7901 return 1;
7903 break;
7905 case ARRAY_TYPE:
7907 int count;
7908 tree index = TYPE_DOMAIN (type);
7910 /* Can't handle incomplete types nor sizes that are not
7911 fixed. */
7912 if (!COMPLETE_TYPE_P (type)
7913 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7914 return -1;
7916 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7917 if (count == -1
7918 || !index
7919 || !TYPE_MAX_VALUE (index)
7920 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7921 || !TYPE_MIN_VALUE (index)
7922 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7923 || count < 0)
7924 return -1;
7926 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7927 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7929 /* There must be no padding. */
7930 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7931 return -1;
7933 return count;
7936 case RECORD_TYPE:
7938 int count = 0;
7939 int sub_count;
7940 tree field;
7942 /* Can't handle incomplete types nor sizes that are not
7943 fixed. */
7944 if (!COMPLETE_TYPE_P (type)
7945 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7946 return -1;
7948 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7950 if (TREE_CODE (field) != FIELD_DECL)
7951 continue;
7953 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7954 if (sub_count < 0)
7955 return -1;
7956 count += sub_count;
7959 /* There must be no padding. */
7960 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7961 return -1;
7963 return count;
7966 case UNION_TYPE:
7967 case QUAL_UNION_TYPE:
7969 /* These aren't very interesting except in a degenerate case. */
7970 int count = 0;
7971 int sub_count;
7972 tree field;
7974 /* Can't handle incomplete types nor sizes that are not
7975 fixed. */
7976 if (!COMPLETE_TYPE_P (type)
7977 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7978 return -1;
7980 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7982 if (TREE_CODE (field) != FIELD_DECL)
7983 continue;
7985 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7986 if (sub_count < 0)
7987 return -1;
7988 count = count > sub_count ? count : sub_count;
7991 /* There must be no padding. */
7992 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7993 return -1;
7995 return count;
7998 default:
7999 break;
8002 return -1;
8005 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8006 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8007 array types. The C99 floating-point complex types are also considered
8008 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8009 types, which are GCC extensions and out of the scope of AAPCS64, are
8010 treated as composite types here as well.
8012 Note that MODE itself is not sufficient in determining whether a type
8013 is such a composite type or not. This is because
8014 stor-layout.c:compute_record_mode may have already changed the MODE
8015 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8016 structure with only one field may have its MODE set to the mode of the
8017 field. Also an integer mode whose size matches the size of the
8018 RECORD_TYPE type may be used to substitute the original mode
8019 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8020 solely relied on. */
8022 static bool
8023 aarch64_composite_type_p (const_tree type,
8024 machine_mode mode)
8026 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8027 return true;
8029 if (mode == BLKmode
8030 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8031 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8032 return true;
8034 return false;
8037 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8038 type as described in AAPCS64 \S 4.1.2.
8040 See the comment above aarch64_composite_type_p for the notes on MODE. */
8042 static bool
8043 aarch64_short_vector_p (const_tree type,
8044 machine_mode mode)
8046 HOST_WIDE_INT size = -1;
8048 if (type && TREE_CODE (type) == VECTOR_TYPE)
8049 size = int_size_in_bytes (type);
8050 else if (!aarch64_composite_type_p (type, mode)
8051 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8052 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
8053 size = GET_MODE_SIZE (mode);
8055 return (size == 8 || size == 16) ? true : false;
8058 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8059 shall be passed or returned in simd/fp register(s) (providing these
8060 parameter passing registers are available).
8062 Upon successful return, *COUNT returns the number of needed registers,
8063 *BASE_MODE returns the mode of the individual register and, when IS_HA
8064 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8065 floating-point aggregate or a homogeneous short-vector aggregate. */
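/* For example, a lone 'double', a '_Complex float', and a homogeneous
   aggregate of at most HA_MAX_NUM_FLDS floating-point or short-vector
   members are all candidates. */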
8067 static bool
8068 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8069 const_tree type,
8070 machine_mode *base_mode,
8071 int *count,
8072 bool *is_ha)
8074 machine_mode new_mode = VOIDmode;
8075 bool composite_p = aarch64_composite_type_p (type, mode);
8077 if (is_ha != NULL) *is_ha = false;
8079 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8080 || aarch64_short_vector_p (type, mode))
8082 *count = 1;
8083 new_mode = mode;
8085 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8087 if (is_ha != NULL) *is_ha = true;
8088 *count = 2;
8089 new_mode = GET_MODE_INNER (mode);
8091 else if (type && composite_p)
8093 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8095 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8097 if (is_ha != NULL) *is_ha = true;
8098 *count = ag_count;
8100 else
8101 return false;
8103 else
8104 return false;
8106 *base_mode = new_mode;
8107 return true;
8110 /* Implement TARGET_STRUCT_VALUE_RTX. */
8112 static rtx
8113 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8114 int incoming ATTRIBUTE_UNUSED)
8116 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8119 /* Implements target hook vector_mode_supported_p. */
8120 static bool
8121 aarch64_vector_mode_supported_p (machine_mode mode)
8123 if (TARGET_SIMD
8124 && (mode == V4SImode || mode == V8HImode
8125 || mode == V16QImode || mode == V2DImode
8126 || mode == V2SImode || mode == V4HImode
8127 || mode == V8QImode || mode == V2SFmode
8128 || mode == V4SFmode || mode == V2DFmode
8129 || mode == V1DFmode))
8130 return true;
8132 return false;
8135 /* Return appropriate SIMD container
8136 for MODE within a vector of WIDTH bits. */
8137 static machine_mode
8138 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8140 gcc_assert (width == 64 || width == 128);
8141 if (TARGET_SIMD)
8143 if (width == 128)
8144 switch (mode)
8146 case DFmode:
8147 return V2DFmode;
8148 case SFmode:
8149 return V4SFmode;
8150 case SImode:
8151 return V4SImode;
8152 case HImode:
8153 return V8HImode;
8154 case QImode:
8155 return V16QImode;
8156 case DImode:
8157 return V2DImode;
8158 default:
8159 break;
8161 else
8162 switch (mode)
8164 case SFmode:
8165 return V2SFmode;
8166 case SImode:
8167 return V2SImode;
8168 case HImode:
8169 return V4HImode;
8170 case QImode:
8171 return V8QImode;
8172 default:
8173 break;
8176 return word_mode;
8179 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8180 static machine_mode
8181 aarch64_preferred_simd_mode (machine_mode mode)
8183 return aarch64_simd_container_mode (mode, 128);
8186 /* Return the bitmask of possible vector sizes for the vectorizer
8187 to iterate over. */
8188 static unsigned int
8189 aarch64_autovectorize_vector_sizes (void)
8191 return (16 | 8);
8194 /* Implement TARGET_MANGLE_TYPE. */
8196 static const char *
8197 aarch64_mangle_type (const_tree type)
8199 /* The AArch64 ABI documents say that "__va_list" has to be
8200 mangled as if it is in the "std" namespace. */
8201 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8202 return "St9__va_list";
8204 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8205 builtin types. */
8206 if (TYPE_NAME (type) != NULL)
8207 return aarch64_mangle_builtin_type (type);
8209 /* Use the default mangling. */
8210 return NULL;
8214 /* Return true if the rtx_insn contains a MEM RTX somewhere
8215 in it. */
8217 static bool
8218 has_memory_op (rtx_insn *mem_insn)
8220 subrtx_iterator::array_type array;
8221 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8222 if (MEM_P (*iter))
8223 return true;
8225 return false;
8228 /* Find the first rtx_insn before insn that will generate an assembly
8229 instruction. */
8231 static rtx_insn *
8232 aarch64_prev_real_insn (rtx_insn *insn)
8234 if (!insn)
8235 return NULL;
8239 insn = prev_real_insn (insn);
8241 while (insn && recog_memoized (insn) < 0);
8243 return insn;
8246 static bool
8247 is_madd_op (enum attr_type t1)
8249 unsigned int i;
8250 /* A number of these may be AArch32 only. */
8251 enum attr_type mlatypes[] = {
8252 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8253 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8254 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8257 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8259 if (t1 == mlatypes[i])
8260 return true;
8263 return false;
8266 /* Check if there is a register dependency between a load and the insn
8267 for which we hold recog_data. */
8269 static bool
8270 dep_between_memop_and_curr (rtx memop)
8272 rtx load_reg;
8273 int opno;
8275 gcc_assert (GET_CODE (memop) == SET);
8277 if (!REG_P (SET_DEST (memop)))
8278 return false;
8280 load_reg = SET_DEST (memop);
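/* Operand 0 of the insn held in recog_data is the destination of the
   multiply-accumulate, so start at operand 1: only an overlap with one
   of the input operands is a true dependence on the loaded value. */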
8281 for (opno = 1; opno < recog_data.n_operands; opno++)
8283 rtx operand = recog_data.operand[opno];
8284 if (REG_P (operand)
8285 && reg_overlap_mentioned_p (load_reg, operand))
8286 return true;
8289 return false;
8293 /* When working around the Cortex-A53 erratum 835769,
8294 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8295 instruction and has a preceding memory instruction such that a NOP
8296 should be inserted between them. */
8298 bool
8299 aarch64_madd_needs_nop (rtx_insn* insn)
8301 enum attr_type attr_type;
8302 rtx_insn *prev;
8303 rtx body;
8305 if (!aarch64_fix_a53_err835769)
8306 return false;
8308 if (recog_memoized (insn) < 0)
8309 return false;
8311 attr_type = get_attr_type (insn);
8312 if (!is_madd_op (attr_type))
8313 return false;
8315 prev = aarch64_prev_real_insn (insn);
8316 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8317 Restore recog state to INSN to avoid state corruption. */
8318 extract_constrain_insn_cached (insn);
8320 if (!prev || !has_memory_op (prev))
8321 return false;
8323 body = single_set (prev);
8325 /* If the previous insn is a memory op and there is no dependency between
8326 it and the DImode madd, emit a NOP between them. If body is NULL then we
8327 have a complex memory operation, probably a load/store pair.
8328 Be conservative for now and emit a NOP. */
8329 if (GET_MODE (recog_data.operand[0]) == DImode
8330 && (!body || !dep_between_memop_and_curr (body)))
8331 return true;
8333 return false;
8338 /* Implement FINAL_PRESCAN_INSN. */
8340 void
8341 aarch64_final_prescan_insn (rtx_insn *insn)
8343 if (aarch64_madd_needs_nop (insn))
8344 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8348 /* Return the equivalent letter for size. */
8349 static char
8350 sizetochar (int size)
8352 switch (size)
8354 case 64: return 'd';
8355 case 32: return 's';
8356 case 16: return 'h';
8357 case 8 : return 'b';
8358 default: gcc_unreachable ();
8362 /* Return true iff x is a uniform vector of floating-point
8363 constants, and the constant can be represented in
8364 quarter-precision form. Note that, as aarch64_float_const_representable_p
8365 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8366 static bool
8367 aarch64_vect_float_const_representable_p (rtx x)
8369 int i = 0;
8370 REAL_VALUE_TYPE r0, ri;
8371 rtx x0, xi;
8373 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8374 return false;
8376 x0 = CONST_VECTOR_ELT (x, 0);
8377 if (!CONST_DOUBLE_P (x0))
8378 return false;
8380 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8382 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8384 xi = CONST_VECTOR_ELT (x, i);
8385 if (!CONST_DOUBLE_P (xi))
8386 return false;
8388 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8389 if (!REAL_VALUES_EQUAL (r0, ri))
8390 return false;
8393 return aarch64_float_const_representable_p (x0);
8396 /* Return true for valid and false for invalid. */
8397 bool
8398 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8399 struct simd_immediate_info *info)
8401 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8402 matches = 1; \
8403 for (i = 0; i < idx; i += (STRIDE)) \
8404 if (!(TEST)) \
8405 matches = 0; \
8406 if (matches) \
8408 immtype = (CLASS); \
8409 elsize = (ELSIZE); \
8410 eshift = (SHIFT); \
8411 emvn = (NEG); \
8412 break; \
8415 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8416 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8417 unsigned char bytes[16];
8418 int immtype = -1, matches;
8419 unsigned int invmask = inverse ? 0xff : 0;
8420 int eshift, emvn;
8422 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8424 if (! (aarch64_simd_imm_zero_p (op, mode)
8425 || aarch64_vect_float_const_representable_p (op)))
8426 return false;
8428 if (info)
8430 info->value = CONST_VECTOR_ELT (op, 0);
8431 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8432 info->mvn = false;
8433 info->shift = 0;
8436 return true;
8439 /* Splat vector constant out into a byte vector. */
8440 for (i = 0; i < n_elts; i++)
8442 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8443 it must be laid out in the vector register in reverse order. */
8444 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8445 unsigned HOST_WIDE_INT elpart;
8446 unsigned int part, parts;
8448 if (CONST_INT_P (el))
8450 elpart = INTVAL (el);
8451 parts = 1;
8453 else if (GET_CODE (el) == CONST_DOUBLE)
8455 elpart = CONST_DOUBLE_LOW (el);
8456 parts = 2;
8458 else
8459 gcc_unreachable ();
8461 for (part = 0; part < parts; part++)
8463 unsigned int byte;
8464 for (byte = 0; byte < innersize; byte++)
8466 bytes[idx++] = (elpart & 0xff) ^ invmask;
8467 elpart >>= BITS_PER_UNIT;
8469 if (GET_CODE (el) == CONST_DOUBLE)
8470 elpart = CONST_DOUBLE_HIGH (el);
8474 /* Sanity check. */
8475 gcc_assert (idx == GET_MODE_SIZE (mode));
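/* Each CHECK below corresponds to one AdvSIMD immediate encoding: a
   single byte replicated within 8-, 16-, 32- or 64-bit elements,
   optionally shifted left by SHIFT bits and optionally inverted (NEG,
   for the MVNI forms).  Types 12-15 are the "MSL" shifted-ones variants
   and type 17 is the 64-bit form in which every byte is either 0x00 or
   0xff. */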
8479 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8480 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8482 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8483 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8485 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8486 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8488 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8489 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8491 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8493 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8495 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8496 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8498 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8499 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8501 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8502 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8504 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8505 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8507 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8509 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8511 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8512 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8514 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8515 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8517 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8518 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8520 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8521 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8523 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8525 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8526 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8528 while (0);
8530 if (immtype == -1)
8531 return false;
8533 if (info)
8535 info->element_width = elsize;
8536 info->mvn = emvn != 0;
8537 info->shift = eshift;
8539 unsigned HOST_WIDE_INT imm = 0;
8541 if (immtype >= 12 && immtype <= 15)
8542 info->msl = true;
8544 /* Un-invert bytes of recognized vector, if necessary. */
8545 if (invmask != 0)
8546 for (i = 0; i < idx; i++)
8547 bytes[i] ^= invmask;
8549 if (immtype == 17)
8551 /* FIXME: Broken on 32-bit H_W_I hosts. */
8552 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8554 for (i = 0; i < 8; i++)
8555 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8556 << (i * BITS_PER_UNIT);
8559 info->value = GEN_INT (imm);
8561 else
8563 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8564 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8566 /* Construct 'abcdefgh' because the assembler cannot handle
8567 generic constants. */
8568 if (info->mvn)
8569 imm = ~imm;
8570 imm = (imm >> info->shift) & 0xff;
8571 info->value = GEN_INT (imm);
8575 return true;
8576 #undef CHECK
8579 /* Check whether immediate shift constants are within range. */
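/* For example (illustration added): for V4SImode, an immediate left shift
   must be an all-equal vector with a value in [0, 31], while a right shift
   accepts [1, 32], matching the encoding ranges of SHL versus SSHR/USHR.  */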
8580 bool
8581 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8583 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8584 if (left)
8585 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8586 else
8587 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8590 /* Return true if X is a uniform vector where all elements
8591 are either the floating-point constant 0.0 or the
8592 integer constant 0. */
8593 bool
8594 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8596 return x == CONST0_RTX (mode);
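/* Return true if X is a valid immediate for the 64-bit scalar form of MOVI,
   i.e. every byte of the value is either 0x00 or 0xff (descriptive comment
   added; see the byte loop below).  For example, 0xff00ff00ff00ff00 is
   accepted while 0x0123456789abcdef is not.  */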
8599 bool
8600 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8602 HOST_WIDE_INT imm = INTVAL (x);
8603 int i;
8605 for (i = 0; i < 8; i++)
8607 unsigned int byte = imm & 0xff;
8608 if (byte != 0xff && byte != 0)
8609 return false;
8610 imm >>= 8;
8613 return true;
8616 bool
8617 aarch64_mov_operand_p (rtx x,
8618 enum aarch64_symbol_context context,
8619 machine_mode mode)
8621 if (GET_CODE (x) == HIGH
8622 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8623 return true;
8625 if (CONST_INT_P (x))
8626 return true;
8628 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8629 return true;
8631 return aarch64_classify_symbolic_expression (x, context)
8632 == SYMBOL_TINY_ABSOLUTE;
8635 /* Return a CONST_VECTOR of MODE in which every element is VAL. */
8637 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8639 int nunits = GET_MODE_NUNITS (mode);
8640 rtvec v = rtvec_alloc (nunits);
8641 int i;
8643 for (i = 0; i < nunits; i++)
8644 RTVEC_ELT (v, i) = GEN_INT (val);
8646 return gen_rtx_CONST_VECTOR (mode, v);
8649 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8651 bool
8652 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8654 machine_mode vmode;
8656 gcc_assert (!VECTOR_MODE_P (mode));
8657 vmode = aarch64_preferred_simd_mode (mode);
8658 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8659 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8662 /* Construct and return a PARALLEL RTX vector with elements numbering the
8663 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8664 the vector - from the perspective of the architecture. This does not
8665 line up with GCC's perspective on lane numbers, so we end up with
8666 different masks depending on our target endian-ness. The diagram
8667 below may help. We must draw the distinction when building masks
8668 which select one half of the vector. An instruction selecting
8669 architectural low-lanes for a big-endian target must be described using
8670 a mask selecting GCC high-lanes.
8672 Big-Endian Little-Endian
8674 GCC 0 1 2 3 3 2 1 0
8675 | x | x | x | x | | x | x | x | x |
8676 Architecture 3 2 1 0 3 2 1 0
8678 Low Mask: { 2, 3 } { 0, 1 }
8679 High Mask: { 0, 1 } { 2, 3 }
8683 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8685 int nunits = GET_MODE_NUNITS (mode);
8686 rtvec v = rtvec_alloc (nunits / 2);
8687 int high_base = nunits / 2;
8688 int low_base = 0;
8689 int base;
8690 rtx t1;
8691 int i;
8693 if (BYTES_BIG_ENDIAN)
8694 base = high ? low_base : high_base;
8695 else
8696 base = high ? high_base : low_base;
8698 for (i = 0; i < nunits / 2; i++)
8699 RTVEC_ELT (v, i) = GEN_INT (base + i);
8701 t1 = gen_rtx_PARALLEL (mode, v);
8702 return t1;
8705 /* Check OP for validity as a PARALLEL RTX vector with elements
8706 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8707 from the perspective of the architecture. See the diagram above
8708 aarch64_simd_vect_par_cnst_half for more details. */
8710 bool
8711 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8712 bool high)
8714 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8715 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8716 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8717 int i = 0;
8719 if (!VECTOR_MODE_P (mode))
8720 return false;
8722 if (count_op != count_ideal)
8723 return false;
8725 for (i = 0; i < count_ideal; i++)
8727 rtx elt_op = XVECEXP (op, 0, i);
8728 rtx elt_ideal = XVECEXP (ideal, 0, i);
8730 if (!CONST_INT_P (elt_op)
8731 || INTVAL (elt_ideal) != INTVAL (elt_op))
8732 return false;
8734 return true;
8737 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8738 HIGH (exclusive). */
8739 void
8740 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8741 const_tree exp)
8743 HOST_WIDE_INT lane;
8744 gcc_assert (CONST_INT_P (operand));
8745 lane = INTVAL (operand);
8747 if (lane < low || lane >= high)
8749 if (exp)
8750 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
8751 else
8752 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
8756 /* Return TRUE if OP is a valid vector addressing mode. */
8757 bool
8758 aarch64_simd_mem_operand_p (rtx op)
8760 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8761 || REG_P (XEXP (op, 0)));
8764 /* Emit a register copy from operand to operand, taking care not to
8765 early-clobber source registers in the process.
8767 COUNT is the number of components into which the copy needs to be
8768 decomposed. */
8769 void
8770 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8771 unsigned int count)
8773 unsigned int i;
8774 int rdest = REGNO (operands[0]);
8775 int rsrc = REGNO (operands[1]);
8777 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8778 || rdest < rsrc)
8779 for (i = 0; i < count; i++)
8780 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8781 gen_rtx_REG (mode, rsrc + i));
8782 else
8783 for (i = 0; i < count; i++)
8784 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8785 gen_rtx_REG (mode, rsrc + count - i - 1));
8788 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8789 one of VSTRUCT modes: OI, CI or XI. */
8791 aarch64_simd_attr_length_move (rtx_insn *insn)
8793 machine_mode mode;
8795 extract_insn_cached (insn);
8797 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8799 mode = GET_MODE (recog_data.operand[0]);
8800 switch (mode)
8802 case OImode:
8803 return 8;
8804 case CImode:
8805 return 12;
8806 case XImode:
8807 return 16;
8808 default:
8809 gcc_unreachable ();
8812 return 4;
8815 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8816 one of VSTRUCT modes: OI, CI, EI, or XI. */
8818 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8820 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8823 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8824 alignment of a vector to 128 bits. */
8825 static HOST_WIDE_INT
8826 aarch64_simd_vector_alignment (const_tree type)
8828 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8829 return MIN (align, 128);
8832 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8833 static bool
8834 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8836 if (is_packed)
8837 return false;
8839 /* We guarantee alignment for vectors up to 128-bits. */
8840 if (tree_int_cst_compare (TYPE_SIZE (type),
8841 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8842 return false;
8844 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8845 return true;
8848 /* If VALS is a vector constant that can be loaded into a register
8849 using DUP, generate instructions to do so and return an RTX to
8850 assign to the register. Otherwise return NULL_RTX. */
8851 static rtx
8852 aarch64_simd_dup_constant (rtx vals)
8854 machine_mode mode = GET_MODE (vals);
8855 machine_mode inner_mode = GET_MODE_INNER (mode);
8856 int n_elts = GET_MODE_NUNITS (mode);
8857 bool all_same = true;
8858 rtx x;
8859 int i;
8861 if (GET_CODE (vals) != CONST_VECTOR)
8862 return NULL_RTX;
8864 for (i = 1; i < n_elts; ++i)
8866 x = CONST_VECTOR_ELT (vals, i);
8867 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8868 all_same = false;
8871 if (!all_same)
8872 return NULL_RTX;
8874 /* We can load this constant by using DUP and a constant in a
8875 single ARM register. This will be cheaper than a vector
8876 load. */
8877 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8878 return gen_rtx_VEC_DUPLICATE (mode, x);
8882 /* Generate code to load VALS, which is a PARALLEL containing only
8883 constants (for vec_init) or CONST_VECTOR, efficiently into a
8884 register. Returns an RTX to copy into the register, or NULL_RTX
8885 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8886 static rtx
8887 aarch64_simd_make_constant (rtx vals)
8889 machine_mode mode = GET_MODE (vals);
8890 rtx const_dup;
8891 rtx const_vec = NULL_RTX;
8892 int n_elts = GET_MODE_NUNITS (mode);
8893 int n_const = 0;
8894 int i;
8896 if (GET_CODE (vals) == CONST_VECTOR)
8897 const_vec = vals;
8898 else if (GET_CODE (vals) == PARALLEL)
8900 /* A CONST_VECTOR must contain only CONST_INTs and
8901 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8902 Only store valid constants in a CONST_VECTOR. */
8903 for (i = 0; i < n_elts; ++i)
8905 rtx x = XVECEXP (vals, 0, i);
8906 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8907 n_const++;
8909 if (n_const == n_elts)
8910 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8912 else
8913 gcc_unreachable ();
8915 if (const_vec != NULL_RTX
8916 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8917 /* Load using MOVI/MVNI. */
8918 return const_vec;
8919 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8920 /* Loaded using DUP. */
8921 return const_dup;
8922 else if (const_vec != NULL_RTX)
8923 /* Load from constant pool. We cannot take advantage of single-cycle
8924 LD1 because we need a PC-relative addressing mode. */
8925 return const_vec;
8926 else
8927 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8928 We cannot construct an initializer. */
8929 return NULL_RTX;
8932 void
8933 aarch64_expand_vector_init (rtx target, rtx vals)
8935 machine_mode mode = GET_MODE (target);
8936 machine_mode inner_mode = GET_MODE_INNER (mode);
8937 int n_elts = GET_MODE_NUNITS (mode);
8938 int n_var = 0;
8939 rtx any_const = NULL_RTX;
8940 bool all_same = true;
8942 for (int i = 0; i < n_elts; ++i)
8944 rtx x = XVECEXP (vals, 0, i);
8945 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8946 ++n_var;
8947 else
8948 any_const = x;
8950 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8951 all_same = false;
8954 if (n_var == 0)
8956 rtx constant = aarch64_simd_make_constant (vals);
8957 if (constant != NULL_RTX)
8959 emit_move_insn (target, constant);
8960 return;
8964 /* Splat a single non-constant element if we can. */
8965 if (all_same)
8967 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8968 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8969 return;
8972 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
8973 varying fields. Hope that this is more efficient than using the stack. */
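/* For example (illustration added): initialising a V4SImode vector from
   { x, 1, 2, 3 }, where x is not a constant, first materialises the
   constant vector { 2, 1, 2, 3 } (the loop below seeds lane 0 from
   constant lane 0 ^ 2), and then overwrites lane 0 with x via the
   vec_set pattern.  */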
8974 if (n_var <= n_elts/2)
8976 rtx copy = copy_rtx (vals);
8978 /* Load constant part of vector. We really don't care what goes into the
8979 parts we will overwrite, but we're more likely to be able to load the
8980 constant efficiently if it has fewer, larger, repeating parts
8981 (see aarch64_simd_valid_immediate). */
8982 for (int i = 0; i < n_elts; i++)
8984 rtx x = XVECEXP (vals, 0, i);
8985 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8986 continue;
8987 rtx subst = any_const;
8988 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8990 /* Look in the copied vector, as more elements are const. */
8991 rtx test = XVECEXP (copy, 0, i ^ bit);
8992 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8994 subst = test;
8995 break;
8998 XVECEXP (copy, 0, i) = subst;
9000 aarch64_expand_vector_init (target, copy);
9002 /* Insert variables. */
9003 enum insn_code icode = optab_handler (vec_set_optab, mode);
9004 gcc_assert (icode != CODE_FOR_nothing);
9006 for (int i = 0; i < n_elts; i++)
9008 rtx x = XVECEXP (vals, 0, i);
9009 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9010 continue;
9011 x = copy_to_mode_reg (inner_mode, x);
9012 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9014 return;
9017 /* Construct the vector in memory one field at a time
9018 and load the whole vector. */
9019 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9020 for (int i = 0; i < n_elts; i++)
9021 emit_move_insn (adjust_address_nv (mem, inner_mode,
9022 i * GET_MODE_SIZE (inner_mode)),
9023 XVECEXP (vals, 0, i));
9024 emit_move_insn (target, mem);
9028 static unsigned HOST_WIDE_INT
9029 aarch64_shift_truncation_mask (machine_mode mode)
9031 return
9032 (aarch64_vector_mode_supported_p (mode)
9033 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9036 #ifndef TLS_SECTION_ASM_FLAG
9037 #define TLS_SECTION_ASM_FLAG 'T'
9038 #endif
9040 void
9041 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9042 tree decl ATTRIBUTE_UNUSED)
9044 char flagchars[10], *f = flagchars;
9046 /* If we have already declared this section, we can use an
9047 abbreviated form to switch back to it -- unless this section is
9048 part of a COMDAT group, in which case GAS requires the full
9049 declaration every time. */
9050 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9051 && (flags & SECTION_DECLARED))
9053 fprintf (asm_out_file, "\t.section\t%s\n", name);
9054 return;
9057 if (!(flags & SECTION_DEBUG))
9058 *f++ = 'a';
9059 if (flags & SECTION_WRITE)
9060 *f++ = 'w';
9061 if (flags & SECTION_CODE)
9062 *f++ = 'x';
9063 if (flags & SECTION_SMALL)
9064 *f++ = 's';
9065 if (flags & SECTION_MERGE)
9066 *f++ = 'M';
9067 if (flags & SECTION_STRINGS)
9068 *f++ = 'S';
9069 if (flags & SECTION_TLS)
9070 *f++ = TLS_SECTION_ASM_FLAG;
9071 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9072 *f++ = 'G';
9073 *f = '\0';
9075 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9077 if (!(flags & SECTION_NOTYPE))
9079 const char *type;
9080 const char *format;
9082 if (flags & SECTION_BSS)
9083 type = "nobits";
9084 else
9085 type = "progbits";
9087 #ifdef TYPE_OPERAND_FMT
9088 format = "," TYPE_OPERAND_FMT;
9089 #else
9090 format = ",@%s";
9091 #endif
9093 fprintf (asm_out_file, format, type);
9095 if (flags & SECTION_ENTSIZE)
9096 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9097 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9099 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9100 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9101 else
9102 fprintf (asm_out_file, ",%s,comdat",
9103 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9107 putc ('\n', asm_out_file);
9110 /* Select a format to encode pointers in exception handling data. */
9112 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9114 int type;
9115 switch (aarch64_cmodel)
9117 case AARCH64_CMODEL_TINY:
9118 case AARCH64_CMODEL_TINY_PIC:
9119 case AARCH64_CMODEL_SMALL:
9120 case AARCH64_CMODEL_SMALL_PIC:
9121 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9122 for everything. */
9123 type = DW_EH_PE_sdata4;
9124 break;
9125 default:
9126 /* No assumptions here. 8-byte relocs required. */
9127 type = DW_EH_PE_sdata8;
9128 break;
9130 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9133 /* Emit load exclusive. */
9135 static void
9136 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9137 rtx mem, rtx model_rtx)
9139 rtx (*gen) (rtx, rtx, rtx);
9141 switch (mode)
9143 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9144 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9145 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9146 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9147 default:
9148 gcc_unreachable ();
9151 emit_insn (gen (rval, mem, model_rtx));
9154 /* Emit store exclusive. */
9156 static void
9157 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9158 rtx rval, rtx mem, rtx model_rtx)
9160 rtx (*gen) (rtx, rtx, rtx, rtx);
9162 switch (mode)
9164 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9165 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9166 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9167 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9168 default:
9169 gcc_unreachable ();
9172 emit_insn (gen (bval, rval, mem, model_rtx));
9175 /* Mark the previous jump instruction as unlikely. */
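/* Note (added for illustration): REG_BR_PROB_BASE is 10000, so the
   REG_BR_PROB note attached below gives the jump a taken probability of
   99/10000, i.e. roughly 1%.  */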
9177 static void
9178 aarch64_emit_unlikely_jump (rtx insn)
9180 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9182 insn = emit_jump_insn (insn);
9183 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9186 /* Expand a compare and swap pattern. */
9188 void
9189 aarch64_expand_compare_and_swap (rtx operands[])
9191 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9192 machine_mode mode, cmp_mode;
9193 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9195 bval = operands[0];
9196 rval = operands[1];
9197 mem = operands[2];
9198 oldval = operands[3];
9199 newval = operands[4];
9200 is_weak = operands[5];
9201 mod_s = operands[6];
9202 mod_f = operands[7];
9203 mode = GET_MODE (mem);
9204 cmp_mode = mode;
9206 /* Normally the succ memory model must be stronger than fail, but in the
9207 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9208 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9210 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9211 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9212 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9214 switch (mode)
9216 case QImode:
9217 case HImode:
9218 /* For short modes, we're going to perform the comparison in SImode,
9219 so do the zero-extension now. */
9220 cmp_mode = SImode;
9221 rval = gen_reg_rtx (SImode);
9222 oldval = convert_modes (SImode, mode, oldval, true);
9223 /* Fall through. */
9225 case SImode:
9226 case DImode:
9227 /* Force the value into a register if needed. */
9228 if (!aarch64_plus_operand (oldval, mode))
9229 oldval = force_reg (cmp_mode, oldval);
9230 break;
9232 default:
9233 gcc_unreachable ();
9236 switch (mode)
9238 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9239 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9240 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9241 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9242 default:
9243 gcc_unreachable ();
9246 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9248 if (mode == QImode || mode == HImode)
9249 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9251 x = gen_rtx_REG (CCmode, CC_REGNUM);
9252 x = gen_rtx_EQ (SImode, x, const0_rtx);
9253 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9256 /* Split a compare and swap pattern. */
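/* Roughly speaking (sketch added for illustration; register names are
   arbitrary and the exact load/store-exclusive variants, e.g. LDAXR/STLXR,
   depend on the memory model), the strong form splits into an LL/SC retry
   loop:

     .L1:  ldxr   x0, [mem]
           cmp    x0, oldval
           b.ne   .L2
           stxr   w1, newval, [mem]
           cbnz   w1, .L1
     .L2:

   The weak form omits the retry branch and instead sets the condition
   flags from the store-exclusive result.  */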
9258 void
9259 aarch64_split_compare_and_swap (rtx operands[])
9261 rtx rval, mem, oldval, newval, scratch;
9262 machine_mode mode;
9263 bool is_weak;
9264 rtx_code_label *label1, *label2;
9265 rtx x, cond;
9267 rval = operands[0];
9268 mem = operands[1];
9269 oldval = operands[2];
9270 newval = operands[3];
9271 is_weak = (operands[4] != const0_rtx);
9272 scratch = operands[7];
9273 mode = GET_MODE (mem);
9275 label1 = NULL;
9276 if (!is_weak)
9278 label1 = gen_label_rtx ();
9279 emit_label (label1);
9281 label2 = gen_label_rtx ();
9283 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9285 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9286 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9287 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9288 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9289 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9291 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9293 if (!is_weak)
9295 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9296 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9297 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9298 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9300 else
9302 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9303 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9304 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9307 emit_label (label2);
9310 /* Split an atomic operation. */
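/* For example (illustrative sketch, arbitrary register names), an atomic
   fetch-and-add splits into:

     .L1:  ldxr   x0, [mem]
           add    x1, x0, value
           stxr   w2, x1, [mem]
           cbnz   w2, .L1

   with the ALU operation adjusted for CODE below: NOT becomes an AND
   followed by a NOT (implementing atomic NAND), and MINUS of a constant
   becomes PLUS of the negated constant.  */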
9312 void
9313 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9314 rtx value, rtx model_rtx, rtx cond)
9316 machine_mode mode = GET_MODE (mem);
9317 machine_mode wmode = (mode == DImode ? DImode : SImode);
9318 rtx_code_label *label;
9319 rtx x;
9321 label = gen_label_rtx ();
9322 emit_label (label);
9324 if (new_out)
9325 new_out = gen_lowpart (wmode, new_out);
9326 if (old_out)
9327 old_out = gen_lowpart (wmode, old_out);
9328 else
9329 old_out = new_out;
9330 value = simplify_gen_subreg (wmode, value, mode, 0);
9332 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9334 switch (code)
9336 case SET:
9337 new_out = value;
9338 break;
9340 case NOT:
9341 x = gen_rtx_AND (wmode, old_out, value);
9342 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9343 x = gen_rtx_NOT (wmode, new_out);
9344 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9345 break;
9347 case MINUS:
9348 if (CONST_INT_P (value))
9350 value = GEN_INT (-INTVAL (value));
9351 code = PLUS;
9353 /* Fall through. */
9355 default:
9356 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9357 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9358 break;
9361 aarch64_emit_store_exclusive (mode, cond, mem,
9362 gen_lowpart (mode, new_out), model_rtx);
9364 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9365 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9366 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9367 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9370 static void
9371 aarch64_print_extension (void)
9373 const struct aarch64_option_extension *opt = NULL;
9375 for (opt = all_extensions; opt->name != NULL; opt++)
9376 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9377 asm_fprintf (asm_out_file, "+%s", opt->name);
9379 asm_fprintf (asm_out_file, "\n");
9382 static void
9383 aarch64_start_file (void)
9385 if (selected_arch)
9387 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9388 aarch64_print_extension ();
9390 else if (selected_cpu)
9392 const char *truncated_name
9393 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9394 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9395 aarch64_print_extension ();
9397 default_file_start ();
9400 /* Target hook for c_mode_for_suffix. */
9401 static machine_mode
9402 aarch64_c_mode_for_suffix (char suffix)
9404 if (suffix == 'q')
9405 return TFmode;
9407 return VOIDmode;
9410 /* We can only represent floating point constants which will fit in
9411 "quarter-precision" values. These values are characterised by
9412 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9415 (-1)^s * (n/16) * 2^r
9417 Where:
9418 's' is the sign bit.
9419 'n' is an integer in the range 16 <= n <= 31.
9420 'r' is an integer in the range -3 <= r <= 4. */
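/* For example, 0.5 = (16/16) * 2^-1 and 31.0 = (31/16) * 2^4 are both
   representable; the representable magnitudes therefore range from 0.125
   (n = 16, r = -3) up to 31.0 (n = 31, r = 4).  (Worked example added for
   illustration.)  */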
9422 /* Return true iff X can be represented by a quarter-precision
9423 floating point immediate operand. Note, we cannot represent 0.0. */
9424 bool
9425 aarch64_float_const_representable_p (rtx x)
9427 /* This represents our current view of how many bits
9428 make up the mantissa. */
9429 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9430 int exponent;
9431 unsigned HOST_WIDE_INT mantissa, mask;
9432 REAL_VALUE_TYPE r, m;
9433 bool fail;
9435 if (!CONST_DOUBLE_P (x))
9436 return false;
9438 if (GET_MODE (x) == VOIDmode)
9439 return false;
9441 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9443 /* We cannot represent infinities, NaNs or +/-zero. We won't
9444 know if we have +zero until we analyse the mantissa, but we
9445 can reject the other invalid values. */
9446 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9447 || REAL_VALUE_MINUS_ZERO (r))
9448 return false;
9450 /* Extract exponent. */
9451 r = real_value_abs (&r);
9452 exponent = REAL_EXP (&r);
9454 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9455 highest (sign) bit, with a fixed binary point at bit point_pos.
9456 m1 holds the low part of the mantissa, m2 the high part.
9457 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9458 bits for the mantissa, this can fail (low bits will be lost). */
9459 real_ldexp (&m, &r, point_pos - exponent);
9460 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9462 /* If the low part of the mantissa has bits set we cannot represent
9463 the value. */
9464 if (w.elt (0) != 0)
9465 return false;
9466 /* We have rejected the lower HOST_WIDE_INT, so update our
9467 understanding of how many bits lie in the mantissa and
9468 look only at the high HOST_WIDE_INT. */
9469 mantissa = w.elt (1);
9470 point_pos -= HOST_BITS_PER_WIDE_INT;
9472 /* We can only represent values with a mantissa of the form 1.xxxx. */
9473 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9474 if ((mantissa & mask) != 0)
9475 return false;
9477 /* Having filtered unrepresentable values, we may now remove all
9478 but the highest 5 bits. */
9479 mantissa >>= point_pos - 5;
9481 /* We cannot represent the value 0.0, so reject it. This is handled
9482 elsewhere. */
9483 if (mantissa == 0)
9484 return false;
9486 /* Then, as bit 4 is always set, we can mask it off, leaving
9487 the mantissa in the range [0, 15]. */
9488 mantissa &= ~(1 << 4);
9489 gcc_assert (mantissa <= 15);
9491 /* GCC internally does not use IEEE754-like encoding (where normalized
9492 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9493 Our mantissa values are shifted 4 places to the left relative to
9494 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9495 by 5 places to correct for GCC's representation. */
9496 exponent = 5 - exponent;
9498 return (exponent >= 0 && exponent <= 7);
9501 char*
9502 aarch64_output_simd_mov_immediate (rtx const_vector,
9503 machine_mode mode,
9504 unsigned width)
9506 bool is_valid;
9507 static char templ[40];
9508 const char *mnemonic;
9509 const char *shift_op;
9510 unsigned int lane_count = 0;
9511 char element_char;
9513 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9515 /* This will return true to show const_vector is legal for use as either
9516 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9517 also update INFO to show how the immediate should be generated. */
9518 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9519 gcc_assert (is_valid);
9521 element_char = sizetochar (info.element_width);
9522 lane_count = width / info.element_width;
9524 mode = GET_MODE_INNER (mode);
9525 if (mode == SFmode || mode == DFmode)
9527 gcc_assert (info.shift == 0 && ! info.mvn);
9528 if (aarch64_float_const_zero_rtx_p (info.value))
9529 info.value = GEN_INT (0);
9530 else
9532 #define buf_size 20
9533 REAL_VALUE_TYPE r;
9534 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9535 char float_buf[buf_size] = {'\0'};
9536 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9537 #undef buf_size
9539 if (lane_count == 1)
9540 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9541 else
9542 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9543 lane_count, element_char, float_buf);
9544 return templ;
9548 mnemonic = info.mvn ? "mvni" : "movi";
9549 shift_op = info.msl ? "msl" : "lsl";
9551 if (lane_count == 1)
9552 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9553 mnemonic, UINTVAL (info.value));
9554 else if (info.shift)
9555 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9556 ", %s %d", mnemonic, lane_count, element_char,
9557 UINTVAL (info.value), shift_op, info.shift);
9558 else
9559 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9560 mnemonic, lane_count, element_char, UINTVAL (info.value));
9561 return templ;
9564 char*
9565 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9566 machine_mode mode)
9568 machine_mode vmode;
9570 gcc_assert (!VECTOR_MODE_P (mode));
9571 vmode = aarch64_simd_container_mode (mode, 64);
9572 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9573 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9576 /* Split operands into moves from op[1] + op[2] into op[0]. */
9578 void
9579 aarch64_split_combinev16qi (rtx operands[3])
9581 unsigned int dest = REGNO (operands[0]);
9582 unsigned int src1 = REGNO (operands[1]);
9583 unsigned int src2 = REGNO (operands[2]);
9584 machine_mode halfmode = GET_MODE (operands[1]);
9585 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9586 rtx destlo, desthi;
9588 gcc_assert (halfmode == V16QImode);
9590 if (src1 == dest && src2 == dest + halfregs)
9592 /* No-op move. Can't split to nothing; emit something. */
9593 emit_note (NOTE_INSN_DELETED);
9594 return;
9597 /* Preserve register attributes for variable tracking. */
9598 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9599 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9600 GET_MODE_SIZE (halfmode));
9602 /* Special case of reversed high/low parts. */
9603 if (reg_overlap_mentioned_p (operands[2], destlo)
9604 && reg_overlap_mentioned_p (operands[1], desthi))
9606 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9607 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9608 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9610 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9612 /* Try to avoid unnecessary moves if part of the result
9613 is in the right place already. */
9614 if (src1 != dest)
9615 emit_move_insn (destlo, operands[1]);
9616 if (src2 != dest + halfregs)
9617 emit_move_insn (desthi, operands[2]);
9619 else
9621 if (src2 != dest + halfregs)
9622 emit_move_insn (desthi, operands[2]);
9623 if (src1 != dest)
9624 emit_move_insn (destlo, operands[1]);
9628 /* vec_perm support. */
9630 #define MAX_VECT_LEN 16
9632 struct expand_vec_perm_d
9634 rtx target, op0, op1;
9635 unsigned char perm[MAX_VECT_LEN];
9636 machine_mode vmode;
9637 unsigned char nelt;
9638 bool one_vector_p;
9639 bool testing_p;
9642 /* Generate a variable permutation. */
9644 static void
9645 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9647 machine_mode vmode = GET_MODE (target);
9648 bool one_vector_p = rtx_equal_p (op0, op1);
9650 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9651 gcc_checking_assert (GET_MODE (op0) == vmode);
9652 gcc_checking_assert (GET_MODE (op1) == vmode);
9653 gcc_checking_assert (GET_MODE (sel) == vmode);
9654 gcc_checking_assert (TARGET_SIMD);
9656 if (one_vector_p)
9658 if (vmode == V8QImode)
9660 /* Expand the argument to a V16QI mode by duplicating it. */
9661 rtx pair = gen_reg_rtx (V16QImode);
9662 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9663 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9665 else
9667 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9670 else
9672 rtx pair;
9674 if (vmode == V8QImode)
9676 pair = gen_reg_rtx (V16QImode);
9677 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9678 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9680 else
9682 pair = gen_reg_rtx (OImode);
9683 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9684 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9689 void
9690 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9692 machine_mode vmode = GET_MODE (target);
9693 unsigned int nelt = GET_MODE_NUNITS (vmode);
9694 bool one_vector_p = rtx_equal_p (op0, op1);
9695 rtx mask;
9697 /* The TBL instruction does not use a modulo index, so we must take care
9698 of that ourselves. */
9699 mask = aarch64_simd_gen_const_vector_dup (vmode,
9700 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9701 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9703 /* For big-endian, we also need to reverse the index within the vector
9704 (but not which vector). */
9705 if (BYTES_BIG_ENDIAN)
9707 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9708 if (!one_vector_p)
9709 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9710 sel = expand_simple_binop (vmode, XOR, sel, mask,
9711 NULL, 0, OPTAB_LIB_WIDEN);
9713 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9716 /* Recognize patterns suitable for the TRN instructions. */
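/* For example (little-endian, illustration added): for V4SImode the
   two-operand selectors { 0, 4, 2, 6 } and { 1, 5, 3, 7 } are matched
   here and map to TRN1 and TRN2 respectively.  */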
9717 static bool
9718 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9720 unsigned int i, odd, mask, nelt = d->nelt;
9721 rtx out, in0, in1, x;
9722 rtx (*gen) (rtx, rtx, rtx);
9723 machine_mode vmode = d->vmode;
9725 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9726 return false;
9728 /* Note that these are little-endian tests.
9729 We correct for big-endian later. */
9730 if (d->perm[0] == 0)
9731 odd = 0;
9732 else if (d->perm[0] == 1)
9733 odd = 1;
9734 else
9735 return false;
9736 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9738 for (i = 0; i < nelt; i += 2)
9740 if (d->perm[i] != i + odd)
9741 return false;
9742 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9743 return false;
9746 /* Success! */
9747 if (d->testing_p)
9748 return true;
9750 in0 = d->op0;
9751 in1 = d->op1;
9752 if (BYTES_BIG_ENDIAN)
9754 x = in0, in0 = in1, in1 = x;
9755 odd = !odd;
9757 out = d->target;
9759 if (odd)
9761 switch (vmode)
9763 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9764 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9765 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9766 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9767 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9768 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9769 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9770 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9771 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9772 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9773 default:
9774 return false;
9777 else
9779 switch (vmode)
9781 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9782 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9783 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9784 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9785 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9786 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9787 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9788 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9789 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9790 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9791 default:
9792 return false;
9796 emit_insn (gen (out, in0, in1));
9797 return true;
9800 /* Recognize patterns suitable for the UZP instructions. */
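/* For example (little-endian, illustration added): for V4SImode the
   two-operand selectors { 0, 2, 4, 6 } and { 1, 3, 5, 7 } map to UZP1
   and UZP2 respectively.  */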
9801 static bool
9802 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9804 unsigned int i, odd, mask, nelt = d->nelt;
9805 rtx out, in0, in1, x;
9806 rtx (*gen) (rtx, rtx, rtx);
9807 machine_mode vmode = d->vmode;
9809 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9810 return false;
9812 /* Note that these are little-endian tests.
9813 We correct for big-endian later. */
9814 if (d->perm[0] == 0)
9815 odd = 0;
9816 else if (d->perm[0] == 1)
9817 odd = 1;
9818 else
9819 return false;
9820 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9822 for (i = 0; i < nelt; i++)
9824 unsigned elt = (i * 2 + odd) & mask;
9825 if (d->perm[i] != elt)
9826 return false;
9829 /* Success! */
9830 if (d->testing_p)
9831 return true;
9833 in0 = d->op0;
9834 in1 = d->op1;
9835 if (BYTES_BIG_ENDIAN)
9837 x = in0, in0 = in1, in1 = x;
9838 odd = !odd;
9840 out = d->target;
9842 if (odd)
9844 switch (vmode)
9846 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9847 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9848 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9849 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9850 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9851 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9852 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9853 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9854 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9855 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9856 default:
9857 return false;
9860 else
9862 switch (vmode)
9864 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9865 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9866 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9867 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9868 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9869 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9870 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9871 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9872 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9873 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9874 default:
9875 return false;
9879 emit_insn (gen (out, in0, in1));
9880 return true;
9883 /* Recognize patterns suitable for the ZIP instructions. */
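/* For example (little-endian, illustration added): for V4SImode the
   two-operand selectors { 0, 4, 1, 5 } and { 2, 6, 3, 7 } map to ZIP1
   and ZIP2 respectively.  */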
9884 static bool
9885 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9887 unsigned int i, high, mask, nelt = d->nelt;
9888 rtx out, in0, in1, x;
9889 rtx (*gen) (rtx, rtx, rtx);
9890 machine_mode vmode = d->vmode;
9892 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9893 return false;
9895 /* Note that these are little-endian tests.
9896 We correct for big-endian later. */
9897 high = nelt / 2;
9898 if (d->perm[0] == high)
9899 /* Do Nothing. */
9901 else if (d->perm[0] == 0)
9902 high = 0;
9903 else
9904 return false;
9905 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9907 for (i = 0; i < nelt / 2; i++)
9909 unsigned elt = (i + high) & mask;
9910 if (d->perm[i * 2] != elt)
9911 return false;
9912 elt = (elt + nelt) & mask;
9913 if (d->perm[i * 2 + 1] != elt)
9914 return false;
9917 /* Success! */
9918 if (d->testing_p)
9919 return true;
9921 in0 = d->op0;
9922 in1 = d->op1;
9923 if (BYTES_BIG_ENDIAN)
9925 x = in0, in0 = in1, in1 = x;
9926 high = !high;
9928 out = d->target;
9930 if (high)
9932 switch (vmode)
9934 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9935 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9936 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9937 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9938 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9939 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9940 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9941 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9942 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9943 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9944 default:
9945 return false;
9948 else
9950 switch (vmode)
9952 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9953 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9954 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9955 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9956 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9957 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9958 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9959 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9960 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9961 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9962 default:
9963 return false;
9967 emit_insn (gen (out, in0, in1));
9968 return true;
9971 /* Recognize patterns for the EXT insn. */
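/* For example (illustration added): the two-operand V4SImode selector
   { 1, 2, 3, 4 } is an EXT by one element, i.e. a four-byte extract of
   the concatenation of the two input vectors.  */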
9973 static bool
9974 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9976 unsigned int i, nelt = d->nelt;
9977 rtx (*gen) (rtx, rtx, rtx, rtx);
9978 rtx offset;
9980 unsigned int location = d->perm[0]; /* Always < nelt. */
9982 /* Check if the extracted indices are increasing by one. */
9983 for (i = 1; i < nelt; i++)
9985 unsigned int required = location + i;
9986 if (d->one_vector_p)
9988 /* We'll pass the same vector in twice, so allow indices to wrap. */
9989 required &= (nelt - 1);
9991 if (d->perm[i] != required)
9992 return false;
9995 switch (d->vmode)
9997 case V16QImode: gen = gen_aarch64_extv16qi; break;
9998 case V8QImode: gen = gen_aarch64_extv8qi; break;
9999 case V4HImode: gen = gen_aarch64_extv4hi; break;
10000 case V8HImode: gen = gen_aarch64_extv8hi; break;
10001 case V2SImode: gen = gen_aarch64_extv2si; break;
10002 case V4SImode: gen = gen_aarch64_extv4si; break;
10003 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10004 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10005 case V2DImode: gen = gen_aarch64_extv2di; break;
10006 case V2DFmode: gen = gen_aarch64_extv2df; break;
10007 default:
10008 return false;
10011 /* Success! */
10012 if (d->testing_p)
10013 return true;
10015 /* The case where (location == 0) is a no-op for both big- and little-endian,
10016 and is removed by the mid-end at optimization levels -O1 and higher. */
10018 if (BYTES_BIG_ENDIAN && (location != 0))
10020 /* After setup, we want the high elements of the first vector (stored
10021 at the LSB end of the register), and the low elements of the second
10022 vector (stored at the MSB end of the register). So swap. */
10023 std::swap (d->op0, d->op1);
10024 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10025 location = nelt - location;
10028 offset = GEN_INT (location);
10029 emit_insn (gen (d->target, d->op0, d->op1, offset));
10030 return true;
10033 /* Recognize patterns for the REV insns. */
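/* For example (little-endian, illustration added): the single-operand
   V4SImode selector { 1, 0, 3, 2 } has diff == 1 and maps to REV64 on
   .4s, swapping the two 32-bit halves of each 64-bit chunk.  */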
10035 static bool
10036 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10038 unsigned int i, j, diff, nelt = d->nelt;
10039 rtx (*gen) (rtx, rtx);
10041 if (!d->one_vector_p)
10042 return false;
10044 diff = d->perm[0];
10045 switch (diff)
10047 case 7:
10048 switch (d->vmode)
10050 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10051 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10052 default:
10053 return false;
10055 break;
10056 case 3:
10057 switch (d->vmode)
10059 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10060 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10061 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10062 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10063 default:
10064 return false;
10066 break;
10067 case 1:
10068 switch (d->vmode)
10070 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10071 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10072 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10073 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10074 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10075 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10076 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10077 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10078 default:
10079 return false;
10081 break;
10082 default:
10083 return false;
10086 for (i = 0; i < nelt ; i += diff + 1)
10087 for (j = 0; j <= diff; j += 1)
10089 /* This is guaranteed to be true as the value of diff
10090 is 7, 3 or 1 and we should have enough elements in the
10091 queue to generate this. Getting a vector mask with a
10092 value of diff other than these values implies that
10093 something is wrong by the time we get here. */
10094 gcc_assert (i + j < nelt);
10095 if (d->perm[i + j] != i + diff - j)
10096 return false;
10099 /* Success! */
10100 if (d->testing_p)
10101 return true;
10103 emit_insn (gen (d->target, d->op0));
10104 return true;
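/* Recognize permutations in which every selector element is the same,
   which map to a lane-indexed DUP (descriptive comment added).  For
   example, the V4SImode selector { 3, 3, 3, 3 } becomes a DUP of lane 3
   of the first operand.  */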
10107 static bool
10108 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10110 rtx (*gen) (rtx, rtx, rtx);
10111 rtx out = d->target;
10112 rtx in0;
10113 machine_mode vmode = d->vmode;
10114 unsigned int i, elt, nelt = d->nelt;
10115 rtx lane;
10117 elt = d->perm[0];
10118 for (i = 1; i < nelt; i++)
10120 if (elt != d->perm[i])
10121 return false;
10124 /* The generic preparation in aarch64_expand_vec_perm_const_1
10125 swaps the operand order and the permute indices if it finds
10126 d->perm[0] to be in the second operand. Thus, we can always
10127 use d->op0 and need not do any extra arithmetic to get the
10128 correct lane number. */
10129 in0 = d->op0;
10130 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10132 switch (vmode)
10134 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10135 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10136 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10137 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10138 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10139 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10140 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10141 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10142 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10143 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10144 default:
10145 return false;
10148 emit_insn (gen (out, in0, lane));
10149 return true;
10152 static bool
10153 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10155 rtx rperm[MAX_VECT_LEN], sel;
10156 machine_mode vmode = d->vmode;
10157 unsigned int i, nelt = d->nelt;
10159 if (d->testing_p)
10160 return true;
10162 /* Generic code will try constant permutation twice: once with the
10163 original mode and again with the elements lowered to QImode.
10164 So wait and don't do the selector expansion ourselves. */
10165 if (vmode != V8QImode && vmode != V16QImode)
10166 return false;
10168 for (i = 0; i < nelt; ++i)
10170 int nunits = GET_MODE_NUNITS (vmode);
10172 /* If big-endian and two vectors we end up with a weird mixed-endian
10173 mode on NEON. Reverse the index within each word but not the word
10174 itself. */
10175 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10176 : d->perm[i]);
10178 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10179 sel = force_reg (vmode, sel);
10181 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10182 return true;
10185 static bool
10186 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10188 /* The pattern matching functions above are written to look for a small
10189 number to begin the sequence (0, 1, N/2). If we begin with an index
10190 from the second operand, we can swap the operands. */
10191 if (d->perm[0] >= d->nelt)
10193 unsigned i, nelt = d->nelt;
10195 gcc_assert (nelt == (nelt & -nelt));
10196 for (i = 0; i < nelt; ++i)
10197 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10199 std::swap (d->op0, d->op1);
10202 if (TARGET_SIMD)
10204 if (aarch64_evpc_rev (d))
10205 return true;
10206 else if (aarch64_evpc_ext (d))
10207 return true;
10208 else if (aarch64_evpc_dup (d))
10209 return true;
10210 else if (aarch64_evpc_zip (d))
10211 return true;
10212 else if (aarch64_evpc_uzp (d))
10213 return true;
10214 else if (aarch64_evpc_trn (d))
10215 return true;
10216 return aarch64_evpc_tbl (d);
10218 return false;
10221 /* Expand a vec_perm_const pattern. */
10223 bool
10224 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10226 struct expand_vec_perm_d d;
10227 int i, nelt, which;
10229 d.target = target;
10230 d.op0 = op0;
10231 d.op1 = op1;
10233 d.vmode = GET_MODE (target);
10234 gcc_assert (VECTOR_MODE_P (d.vmode));
10235 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10236 d.testing_p = false;
10238 for (i = which = 0; i < nelt; ++i)
10240 rtx e = XVECEXP (sel, 0, i);
10241 int ei = INTVAL (e) & (2 * nelt - 1);
10242 which |= (ei < nelt ? 1 : 2);
10243 d.perm[i] = ei;
10246 switch (which)
10248 default:
10249 gcc_unreachable ();
10251 case 3:
10252 d.one_vector_p = false;
10253 if (!rtx_equal_p (op0, op1))
10254 break;
10256 /* The elements of PERM do not suggest that only the first operand
10257 is used, but both operands are identical. Allow easier matching
10258 of the permutation by folding the permutation into the single
10259 input vector. */
10260 /* Fall Through. */
10261 case 2:
10262 for (i = 0; i < nelt; ++i)
10263 d.perm[i] &= nelt - 1;
10264 d.op0 = op1;
10265 d.one_vector_p = true;
10266 break;
10268 case 1:
10269 d.op1 = op0;
10270 d.one_vector_p = true;
10271 break;
10274 return aarch64_expand_vec_perm_const_1 (&d);
10277 static bool
10278 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10279 const unsigned char *sel)
10281 struct expand_vec_perm_d d;
10282 unsigned int i, nelt, which;
10283 bool ret;
10285 d.vmode = vmode;
10286 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10287 d.testing_p = true;
10288 memcpy (d.perm, sel, nelt);
10290 /* Calculate whether all elements are in one vector. */
10291 for (i = which = 0; i < nelt; ++i)
10293 unsigned char e = d.perm[i];
10294 gcc_assert (e < 2 * nelt);
10295 which |= (e < nelt ? 1 : 2);
10298 /* If all elements are from the second vector, reindex as if from the
10299 first vector. */
10300 if (which == 2)
10301 for (i = 0; i < nelt; ++i)
10302 d.perm[i] -= nelt;
10304 /* Check whether the mask can be applied to a single vector. */
10305 d.one_vector_p = (which != 3);
10307 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10308 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10309 if (!d.one_vector_p)
10310 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10312 start_sequence ();
10313 ret = aarch64_expand_vec_perm_const_1 (&d);
10314 end_sequence ();
10316 return ret;
10320 aarch64_reverse_mask (enum machine_mode mode)
10322 /* We have to reverse each vector because we don't have
10323 a permuted load that can reverse-load according to ABI rules. */
10324 rtx mask;
10325 rtvec v = rtvec_alloc (16);
10326 int i, j;
10327 int nunits = GET_MODE_NUNITS (mode);
10328 int usize = GET_MODE_UNIT_SIZE (mode);
10330 gcc_assert (BYTES_BIG_ENDIAN);
10331 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10333 for (i = 0; i < nunits; i++)
10334 for (j = 0; j < usize; j++)
10335 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10336 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10337 return force_reg (V16QImode, mask);
10340 /* Implement MODES_TIEABLE_P. */
10342 bool
10343 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10345 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10346 return true;
10348 /* We specifically want to allow elements of "structure" modes to
10349 be tieable to the structure. This more general condition allows
10350 other rarer situations too. */
10351 if (TARGET_SIMD
10352 && aarch64_vector_mode_p (mode1)
10353 && aarch64_vector_mode_p (mode2))
10354 return true;
10356 return false;
10359 /* Return a new RTX holding the result of moving POINTER forward by
10360 AMOUNT bytes. */
10362 static rtx
10363 aarch64_move_pointer (rtx pointer, int amount)
10365 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10367 return adjust_automodify_address (pointer, GET_MODE (pointer),
10368 next, amount);
10371 /* Return a new RTX holding the result of moving POINTER forward by the
10372 size of the mode it points to. */
10374 static rtx
10375 aarch64_progress_pointer (rtx pointer)
10377 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10379 return aarch64_move_pointer (pointer, amount);
10382 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10383 MODE bytes. */
10385 static void
10386 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10387 machine_mode mode)
10389 rtx reg = gen_reg_rtx (mode);
10391 /* "Cast" the pointers to the correct mode. */
10392 *src = adjust_address (*src, mode, 0);
10393 *dst = adjust_address (*dst, mode, 0);
10394 /* Emit the memcpy. */
10395 emit_move_insn (reg, *src);
10396 emit_move_insn (*dst, reg);
10397 /* Move the pointers forward. */
10398 *src = aarch64_progress_pointer (*src);
10399 *dst = aarch64_progress_pointer (*dst);
10402 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10403 we succeed, otherwise return false. */
10405 bool
10406 aarch64_expand_movmem (rtx *operands)
10408 unsigned int n;
10409 rtx dst = operands[0];
10410 rtx src = operands[1];
10411 rtx base;
10412 bool speed_p = !optimize_function_for_size_p (cfun);
10414 /* When optimizing for size, give a better estimate of the length of a
10415 memcpy call, but use the default otherwise. */
10416 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10418 /* We can't do anything smart if the amount to copy is not constant. */
10419 if (!CONST_INT_P (operands[2]))
10420 return false;
10422 n = UINTVAL (operands[2]);
10424 /* Try to keep the number of instructions low. For cases below 16 bytes we
10425 need to make at most two moves. For cases above 16 bytes it will be one
10426 move for each 16-byte chunk, then at most two additional moves. */
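/* For example (illustration added): a 7-byte copy becomes two overlapping
   4-byte (SImode) copies, at offsets 0 and 3; a 35-byte copy becomes two
   16-byte (TImode) copies followed by a 4-byte copy at offset 31 that
   overlaps the previously copied bytes.  */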
10427 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10428 return false;
10430 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10431 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10433 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10434 src = adjust_automodify_address (src, VOIDmode, base, 0);
10436 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10437 1-byte chunk. */
10438 if (n < 4)
10440 if (n >= 2)
10442 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10443 n -= 2;
10446 if (n == 1)
10447 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10449 return true;
10452 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10453 4-byte chunk, partially overlapping with the previously copied chunk. */
10454 if (n < 8)
10456 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10457 n -= 4;
10458 if (n > 0)
10460 int move = n - 4;
10462 src = aarch64_move_pointer (src, move);
10463 dst = aarch64_move_pointer (dst, move);
10464 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10466 return true;
10469 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10470 them, then (if applicable) an 8-byte chunk. */
10471 while (n >= 8)
10473 if (n / 16)
10475 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10476 n -= 16;
10478 else
10480 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10481 n -= 8;
10485 /* Finish the final bytes of the copy. We can always do this in one
10486 instruction. We either copy the exact amount we need, or partially
10487 overlap with the previous chunk we copied and copy 8 bytes.
10488 if (n == 0)
10489 return true;
10490 else if (n == 1)
10491 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10492 else if (n == 2)
10493 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10494 else if (n == 4)
10495 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10496 else
10498 if (n == 3)
10500 src = aarch64_move_pointer (src, -1);
10501 dst = aarch64_move_pointer (dst, -1);
10502 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10504 else
10506 int move = n - 8;
10508 src = aarch64_move_pointer (src, move);
10509 dst = aarch64_move_pointer (dst, move);
10510 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10514 return true;
10517 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10519 static unsigned HOST_WIDE_INT
10520 aarch64_asan_shadow_offset (void)
10522 return (HOST_WIDE_INT_1 << 36);
10525 static bool
10526 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10527 unsigned int align,
10528 enum by_pieces_operation op,
10529 bool speed_p)
10531 /* STORE_BY_PIECES can be used when copying a constant string, but
10532 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10533 For now we always fail this and let the move_by_pieces code copy
10534 the string from read-only memory. */
10535 if (op == STORE_BY_PIECES)
10536 return false;
10538 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
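/* Concretely, building a 64-bit constant chunk in a register can take up to
   four MOV/MOVK instructions before the STR (the 5 insns above), whereas
   copying the same chunk from its read-only image is just an LDR/STR pair.  */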
10541 static enum machine_mode
10542 aarch64_code_to_ccmode (enum rtx_code code)
10544 switch (code)
10546 case NE:
10547 return CC_DNEmode;
10549 case EQ:
10550 return CC_DEQmode;
10552 case LE:
10553 return CC_DLEmode;
10555 case LT:
10556 return CC_DLTmode;
10558 case GE:
10559 return CC_DGEmode;
10561 case GT:
10562 return CC_DGTmode;
10564 case LEU:
10565 return CC_DLEUmode;
10567 case LTU:
10568 return CC_DLTUmode;
10570 case GEU:
10571 return CC_DGEUmode;
10573 case GTU:
10574 return CC_DGTUmode;
10576 default:
10577 return CCmode;
10581 static rtx
10582 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10583 int code, tree treeop0, tree treeop1)
10585 enum machine_mode op_mode, cmp_mode, cc_mode;
10586 rtx op0, op1, cmp, target;
10587 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10588 enum insn_code icode;
10589 struct expand_operand ops[4];
10591 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10592 if (cc_mode == CCmode)
10593 return NULL_RTX;
10595 start_sequence ();
10596 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10598 op_mode = GET_MODE (op0);
10599 if (op_mode == VOIDmode)
10600 op_mode = GET_MODE (op1);
10602 switch (op_mode)
10604 case QImode:
10605 case HImode:
10606 case SImode:
10607 cmp_mode = SImode;
10608 icode = CODE_FOR_cmpsi;
10609 break;
10611 case DImode:
10612 cmp_mode = DImode;
10613 icode = CODE_FOR_cmpdi;
10614 break;
10616 default:
10617 end_sequence ();
10618 return NULL_RTX;
10621 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10622 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10623 if (!op0 || !op1)
10625 end_sequence ();
10626 return NULL_RTX;
10628 *prep_seq = get_insns ();
10629 end_sequence ();
10631 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10632 target = gen_rtx_REG (CCmode, CC_REGNUM);
10634 create_output_operand (&ops[0], target, CCmode);
10635 create_fixed_operand (&ops[1], cmp);
10636 create_fixed_operand (&ops[2], op0);
10637 create_fixed_operand (&ops[3], op1);
10639 start_sequence ();
10640 if (!maybe_expand_insn (icode, 4, ops))
10642 end_sequence ();
10643 return NULL_RTX;
10645 *gen_seq = get_insns ();
10646 end_sequence ();
10648 return gen_rtx_REG (cc_mode, CC_REGNUM);
10651 static rtx
10652 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10653 tree treeop0, tree treeop1, int bit_code)
10655 rtx op0, op1, cmp0, cmp1, target;
10656 enum machine_mode op_mode, cmp_mode, cc_mode;
10657 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10658 enum insn_code icode = CODE_FOR_ccmp_andsi;
10659 struct expand_operand ops[6];
10661 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10662 if (cc_mode == CCmode)
10663 return NULL_RTX;
10665 push_to_sequence ((rtx_insn*) *prep_seq);
10666 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10668 op_mode = GET_MODE (op0);
10669 if (op_mode == VOIDmode)
10670 op_mode = GET_MODE (op1);
10672 switch (op_mode)
10674 case QImode:
10675 case HImode:
10676 case SImode:
10677 cmp_mode = SImode;
10678 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10679 : CODE_FOR_ccmp_iorsi;
10680 break;
10682 case DImode:
10683 cmp_mode = DImode;
10684 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10685 : CODE_FOR_ccmp_iordi;
10686 break;
10688 default:
10689 end_sequence ();
10690 return NULL_RTX;
10693 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10694 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10695 if (!op0 || !op1)
10697 end_sequence ();
10698 return NULL_RTX;
10700 *prep_seq = get_insns ();
10701 end_sequence ();
10703 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10704 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10705 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10707 create_fixed_operand (&ops[0], prev);
10708 create_fixed_operand (&ops[1], target);
10709 create_fixed_operand (&ops[2], op0);
10710 create_fixed_operand (&ops[3], op1);
10711 create_fixed_operand (&ops[4], cmp0);
10712 create_fixed_operand (&ops[5], cmp1);
10714 push_to_sequence ((rtx_insn*) *gen_seq);
10715 if (!maybe_expand_insn (icode, 6, ops))
10717 end_sequence ();
10718 return NULL_RTX;
10721 *gen_seq = get_insns ();
10722 end_sequence ();
10724 return target;
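/* As an illustrative example (register numbers, label and the NZCV
   immediate are not meant to be exact), a condition such as
   "a == 17 && b > 5" can be expanded through the two hooks above into:

       cmp     w0, #17
       ccmp    w1, #5, #4, eq
       b.gt    .Ltaken

   rather than computing each comparison into a register and combining the
   results with a separate logical instruction.  */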
10727 #undef TARGET_GEN_CCMP_FIRST
10728 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10730 #undef TARGET_GEN_CCMP_NEXT
10731 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10733 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target
10734 supports instruction fusion of some sort. */
10736 static bool
10737 aarch64_macro_fusion_p (void)
10739 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10743 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10744 should be kept together during scheduling. */
10746 static bool
10747 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10749 rtx set_dest;
10750 rtx prev_set = single_set (prev);
10751 rtx curr_set = single_set (curr);
10752 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
10753 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10755 if (!aarch64_macro_fusion_p ())
10756 return false;
10758 if (simple_sets_p
10759 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10761 /* We are trying to match:
10762 prev (mov) == (set (reg r0) (const_int imm16))
10763 curr (movk) == (set (zero_extract (reg r0)
10764 (const_int 16)
10765 (const_int 16))
10766 (const_int imm16_1)) */
10768 set_dest = SET_DEST (curr_set);
10770 if (GET_CODE (set_dest) == ZERO_EXTRACT
10771 && CONST_INT_P (SET_SRC (curr_set))
10772 && CONST_INT_P (SET_SRC (prev_set))
10773 && CONST_INT_P (XEXP (set_dest, 2))
10774 && INTVAL (XEXP (set_dest, 2)) == 16
10775 && REG_P (XEXP (set_dest, 0))
10776 && REG_P (SET_DEST (prev_set))
10777 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10779 return true;
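    /* In assembly terms this matches a pair such as (values illustrative):
           mov   x0, #0x1234
           movk  x0, #0x5678, lsl #16
       which cores with AARCH64_FUSE_MOV_MOVK set want to keep adjacent.  */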
10783 if (simple_sets_p
10784 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10787 /* We're trying to match:
10788 prev (adrp) == (set (reg r1)
10789 (high (symbol_ref ("SYM"))))
10790 curr (add) == (set (reg r0)
10791 (lo_sum (reg r1)
10792 (symbol_ref ("SYM"))))
10793 Note that r0 need not necessarily be the same as r1, especially
10794 during pre-regalloc scheduling. */
10796 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10797 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10799 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10800 && REG_P (XEXP (SET_SRC (curr_set), 0))
10801 && REGNO (XEXP (SET_SRC (curr_set), 0))
10802 == REGNO (SET_DEST (prev_set))
10803 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10804 XEXP (SET_SRC (curr_set), 1)))
10805 return true;
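    /* The corresponding assembly pair is the usual address-materialisation
       idiom (symbol and registers illustrative):
           adrp  x1, SYM
           add   x0, x1, :lo12:SYM  */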
10809 if (simple_sets_p
10810 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10813 /* We're trying to match:
10814 prev (movk) == (set (zero_extract (reg r0)
10815 (const_int 16)
10816 (const_int 32))
10817 (const_int imm16_1))
10818 curr (movk) == (set (zero_extract (reg r0)
10819 (const_int 16)
10820 (const_int 48))
10821 (const_int imm16_2)) */
10823 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10824 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10825 && REG_P (XEXP (SET_DEST (prev_set), 0))
10826 && REG_P (XEXP (SET_DEST (curr_set), 0))
10827 && REGNO (XEXP (SET_DEST (prev_set), 0))
10828 == REGNO (XEXP (SET_DEST (curr_set), 0))
10829 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10830 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10831 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10832 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10833 && CONST_INT_P (SET_SRC (prev_set))
10834 && CONST_INT_P (SET_SRC (curr_set)))
10835 return true;
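    /* In assembly this is the tail of a 64-bit immediate build-up, e.g.
       (values illustrative):
           movk  x0, #0xdead, lsl #32
           movk  x0, #0xbeef, lsl #48  */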
10838 if (simple_sets_p
10839 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10841 /* We're trying to match:
10842 prev (adrp) == (set (reg r0)
10843 (high (symbol_ref ("SYM"))))
10844 curr (ldr) == (set (reg r1)
10845 (mem (lo_sum (reg r0)
10846 (symbol_ref ("SYM")))))
10848 curr (ldr) == (set (reg r1)
10849 (zero_extend (mem
10850 (lo_sum (reg r0)
10851 (symbol_ref ("SYM")))))) */
10852 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10853 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10855 rtx curr_src = SET_SRC (curr_set);
10857 if (GET_CODE (curr_src) == ZERO_EXTEND)
10858 curr_src = XEXP (curr_src, 0);
10860 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10861 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10862 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10863 == REGNO (SET_DEST (prev_set))
10864 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10865 XEXP (SET_SRC (prev_set), 0)))
10866 return true;
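    /* The matched assembly pair looks like (symbol and registers
       illustrative):
           adrp  x0, SYM
           ldr   w1, [x0, #:lo12:SYM]
       with the zero_extend form covering loads such as ldrb/ldrh.  */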
10870 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10871 && any_condjump_p (curr))
10873 enum attr_type prev_type = get_attr_type (prev);
10875 /* FIXME: this misses some instructions which ThunderX considers simple
10876 arithmetic. Simple shifts are also missed here. */
10877 if (prev_type == TYPE_ALUS_SREG
10878 || prev_type == TYPE_ALUS_IMM
10879 || prev_type == TYPE_LOGICS_REG
10880 || prev_type == TYPE_LOGICS_IMM)
10881 return true;
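    /* For instance a flag-setting ALU instruction immediately followed by
       the conditional branch that consumes the flags (illustrative):
           subs  w0, w0, #1
           b.ne  .Lloop  */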
10884 return false;
10887 /* If MEM's address is in the form of [base+offset], extract the two
10888 parts into BASE and OFFSET; otherwise return false after clearing
10889 BASE and OFFSET. */
10891 bool
10892 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10894 rtx addr;
10896 gcc_assert (MEM_P (mem));
10898 addr = XEXP (mem, 0);
10900 if (REG_P (addr))
10902 *base = addr;
10903 *offset = const0_rtx;
10904 return true;
10907 if (GET_CODE (addr) == PLUS
10908 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10910 *base = XEXP (addr, 0);
10911 *offset = XEXP (addr, 1);
10912 return true;
10915 *base = NULL_RTX;
10916 *offset = NULL_RTX;
10918 return false;
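/* For instance, a MEM whose address is (plus (reg x2) (const_int 8)) yields
   *BASE == the x2 reg and *OFFSET == (const_int 8); a bare (reg x2) address
   yields an offset of const0_rtx.  */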
10921 /* Types for scheduling fusion. */
10922 enum sched_fusion_type
10924 SCHED_FUSION_NONE = 0,
10925 SCHED_FUSION_LD_SIGN_EXTEND,
10926 SCHED_FUSION_LD_ZERO_EXTEND,
10927 SCHED_FUSION_LD,
10928 SCHED_FUSION_ST,
10929 SCHED_FUSION_NUM
10932 /* If INSN is a load or store whose address is in the form of
10933 [base+offset], extract the two parts into BASE and OFFSET. Return
10934 the scheduling fusion type of INSN. */
10936 static enum sched_fusion_type
10937 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10939 rtx x, dest, src;
10940 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10942 gcc_assert (INSN_P (insn));
10943 x = PATTERN (insn);
10944 if (GET_CODE (x) != SET)
10945 return SCHED_FUSION_NONE;
10947 src = SET_SRC (x);
10948 dest = SET_DEST (x);
10950 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10951 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10952 return SCHED_FUSION_NONE;
10954 if (GET_CODE (src) == SIGN_EXTEND)
10956 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10957 src = XEXP (src, 0);
10958 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10959 return SCHED_FUSION_NONE;
10961 else if (GET_CODE (src) == ZERO_EXTEND)
10963 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10964 src = XEXP (src, 0);
10965 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10966 return SCHED_FUSION_NONE;
10969 if (GET_CODE (src) == MEM && REG_P (dest))
10970 extract_base_offset_in_addr (src, base, offset);
10971 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10973 fusion = SCHED_FUSION_ST;
10974 extract_base_offset_in_addr (dest, base, offset);
10976 else
10977 return SCHED_FUSION_NONE;
10979 if (*base == NULL_RTX || *offset == NULL_RTX)
10980 fusion = SCHED_FUSION_NONE;
10982 return fusion;
10985 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10987 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10988 and PRI are only calculated for these instructions. For other instructions,
10989 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10990 types of instruction fusion can be added by returning different priorities.
10992 It's important that irrelevant instructions get the largest FUSION_PRI. */
10994 static void
10995 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10996 int *fusion_pri, int *pri)
10998 int tmp, off_val;
10999 rtx base, offset;
11000 enum sched_fusion_type fusion;
11002 gcc_assert (INSN_P (insn));
11004 tmp = max_pri - 1;
11005 fusion = fusion_load_store (insn, &base, &offset);
11006 if (fusion == SCHED_FUSION_NONE)
11008 *pri = tmp;
11009 *fusion_pri = tmp;
11010 return;
11013 /* Set FUSION_PRI according to fusion type and base register. */
11014 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11016 /* Calculate PRI. */
11017 tmp /= 2;
11019 /* INSN with smaller offset goes first. */
11020 off_val = (int)(INTVAL (offset));
11021 if (off_val >= 0)
11022 tmp -= (off_val & 0xfffff);
11023 else
11024 tmp += ((- off_val) & 0xfffff);
11026 *pri = tmp;
11027 return;
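/* Rough worked example: two SImode loads from [x1, #4] and [x1, #8] get the
   same FUSION_PRI (max_pri - 1 - SCHED_FUSION_LD * FIRST_PSEUDO_REGISTER
   - REGNO (x1)), while their PRI values differ only by the offsets 4 and 8,
   so the scheduler keeps them adjacent and in increasing address order.
   Exact values depend on MAX_PRI and the register numbering.  */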
11030 /* Given OPERANDS of consecutive load/store, check if we can merge
11031 them into ldp/stp. LOAD is true if they are load instructions.
11032 MODE is the mode of memory operands. */
11034 bool
11035 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11036 enum machine_mode mode)
11038 HOST_WIDE_INT offval_1, offval_2, msize;
11039 enum reg_class rclass_1, rclass_2;
11040 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11042 if (load)
11044 mem_1 = operands[1];
11045 mem_2 = operands[3];
11046 reg_1 = operands[0];
11047 reg_2 = operands[2];
11048 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11049 if (REGNO (reg_1) == REGNO (reg_2))
11050 return false;
11052 else
11054 mem_1 = operands[0];
11055 mem_2 = operands[2];
11056 reg_1 = operands[1];
11057 reg_2 = operands[3];
11060 /* The mems cannot be volatile. */
11061 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11062 return false;
11064 /* Check if the addresses are in the form of [base+offset]. */
11065 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11066 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11067 return false;
11068 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11069 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11070 return false;
11072 /* Check if the bases are the same. */
11073 if (!rtx_equal_p (base_1, base_2))
11074 return false;
11076 offval_1 = INTVAL (offset_1);
11077 offval_2 = INTVAL (offset_2);
11078 msize = GET_MODE_SIZE (mode);
11079 /* Check if the offsets are consecutive. */
11080 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11081 return false;
11083 /* Check if the addresses are clobbered by load. */
11084 if (load)
11086 if (reg_mentioned_p (reg_1, mem_1))
11087 return false;
11089 /* In increasing order, the last load can clobber the address. */
11090 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11091 return false;
11094 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11095 rclass_1 = FP_REGS;
11096 else
11097 rclass_1 = GENERAL_REGS;
11099 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11100 rclass_2 = FP_REGS;
11101 else
11102 rclass_2 = GENERAL_REGS;
11104 /* Check if the registers are of the same class. */
11105 if (rclass_1 != rclass_2)
11106 return false;
11108 return true;
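/* For example (registers and offsets illustrative), the DImode pair

       ldr  x0, [x3, #16]
       ldr  x1, [x3, #24]

   passes the checks above (same base, consecutive 8-byte offsets, distinct
   destinations of the same register class) and can be emitted as
   "ldp x0, x1, [x3, #16]".  */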
11111 /* Given OPERANDS of consecutive load/store, check if we can merge
11112 them into ldp/stp by adjusting the offset. LOAD is true if they
11113 are load instructions. MODE is the mode of memory operands.
11115 For example, given the following consecutive stores:
11117 str w1, [xb, 0x100]
11118 str w1, [xb, 0x104]
11119 str w1, [xb, 0x108]
11120 str w1, [xb, 0x10c]
11122 Though the offsets are out of the range supported by stp, we can
11123 still pair them after adjusting the offset, like:
11125 add scratch, xb, 0x100
11126 stp w1, w1, [scratch]
11127 stp w1, w1, [scratch, 0x8]
11129 The peephole patterns detecting this opportunity should guarantee
11130 the scratch register is available. */
11132 bool
11133 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11134 enum machine_mode mode)
11136 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11137 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11138 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11139 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11141 if (load)
11143 reg_1 = operands[0];
11144 mem_1 = operands[1];
11145 reg_2 = operands[2];
11146 mem_2 = operands[3];
11147 reg_3 = operands[4];
11148 mem_3 = operands[5];
11149 reg_4 = operands[6];
11150 mem_4 = operands[7];
11151 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11152 && REG_P (reg_3) && REG_P (reg_4));
11153 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11154 return false;
11156 else
11158 mem_1 = operands[0];
11159 reg_1 = operands[1];
11160 mem_2 = operands[2];
11161 reg_2 = operands[3];
11162 mem_3 = operands[4];
11163 reg_3 = operands[5];
11164 mem_4 = operands[6];
11165 reg_4 = operands[7];
11167 /* Skip if the memory operand is by itself valid for ldp/stp. */
11168 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11169 return false;
11171 /* The mems cannot be volatile. */
11172 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11173 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11174 return false;
11176 /* Check if the addresses are in the form of [base+offset]. */
11177 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11178 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11179 return false;
11180 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11181 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11182 return false;
11183 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11184 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11185 return false;
11186 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11187 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11188 return false;
11190 /* Check if the bases are the same. */
11191 if (!rtx_equal_p (base_1, base_2)
11192 || !rtx_equal_p (base_2, base_3)
11193 || !rtx_equal_p (base_3, base_4))
11194 return false;
11196 offval_1 = INTVAL (offset_1);
11197 offval_2 = INTVAL (offset_2);
11198 offval_3 = INTVAL (offset_3);
11199 offval_4 = INTVAL (offset_4);
11200 msize = GET_MODE_SIZE (mode);
11201 /* Check if the offsets are consecutive. */
11202 if ((offval_1 != (offval_2 + msize)
11203 || offval_1 != (offval_3 + msize * 2)
11204 || offval_1 != (offval_4 + msize * 3))
11205 && (offval_4 != (offval_3 + msize)
11206 || offval_4 != (offval_2 + msize * 2)
11207 || offval_4 != (offval_1 + msize * 3)))
11208 return false;
11210 /* Check if the addresses are clobbered by load. */
11211 if (load)
11213 if (reg_mentioned_p (reg_1, mem_1)
11214 || reg_mentioned_p (reg_2, mem_2)
11215 || reg_mentioned_p (reg_3, mem_3))
11216 return false;
11218 /* In increasing order, the last load can clobber the address. */
11219 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11220 return false;
11223 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11224 rclass_1 = FP_REGS;
11225 else
11226 rclass_1 = GENERAL_REGS;
11228 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11229 rclass_2 = FP_REGS;
11230 else
11231 rclass_2 = GENERAL_REGS;
11233 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11234 rclass_3 = FP_REGS;
11235 else
11236 rclass_3 = GENERAL_REGS;
11238 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11239 rclass_4 = FP_REGS;
11240 else
11241 rclass_4 = GENERAL_REGS;
11243 /* Check if the registers are of the same class. */
11244 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11245 return false;
11247 return true;
11250 /* Given OPERANDS of consecutive load/store, this function pairs them
11251 into ldp/stp after adjusting the offset. It depends on the fact
11252 that addresses of load/store instructions are in increasing order.
11253 MODE is the mode of the memory operands. CODE is the rtl operator
11254 which should be applied to all memory operands; it is SIGN_EXTEND,
11255 ZERO_EXTEND or UNKNOWN. */
11257 bool
11258 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11259 enum machine_mode mode, RTX_CODE code)
11261 rtx base, offset, t1, t2;
11262 rtx mem_1, mem_2, mem_3, mem_4;
11263 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11265 if (load)
11267 mem_1 = operands[1];
11268 mem_2 = operands[3];
11269 mem_3 = operands[5];
11270 mem_4 = operands[7];
11272 else
11274 mem_1 = operands[0];
11275 mem_2 = operands[2];
11276 mem_3 = operands[4];
11277 mem_4 = operands[6];
11278 gcc_assert (code == UNKNOWN);
11281 extract_base_offset_in_addr (mem_1, &base, &offset);
11282 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11284 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11285 msize = GET_MODE_SIZE (mode);
11286 stp_off_limit = msize * 0x40;
11287 off_val = INTVAL (offset);
11288 abs_off = (off_val < 0) ? -off_val : off_val;
11289 new_off = abs_off % stp_off_limit;
11290 adj_off = abs_off - new_off;
11292 /* Further adjust to make sure all offsets are OK. */
11293 if ((new_off + msize * 2) >= stp_off_limit)
11295 adj_off += stp_off_limit;
11296 new_off -= stp_off_limit;
11299 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11300 if (adj_off >= 0x1000)
11301 return false;
11303 if (off_val < 0)
11305 adj_off = -adj_off;
11306 new_off = -new_off;
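  /* Worked example (illustrative): for SImode, msize == 4 and stp_off_limit
     == 0x100.  A first offset of 0x104 gives abs_off == 0x104, new_off == 0x4
     and adj_off == 0x100, so the scratch base is set to base + 0x100 and the
     four accesses use offsets 0x4, 0x8, 0xc and 0x10, all within stp range.  */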
11309 /* Create new memory references. */
11310 mem_1 = change_address (mem_1, VOIDmode,
11311 plus_constant (DImode, operands[8], new_off));
11313 /* Check if the adjusted address is OK for ldp/stp. */
11314 if (!aarch64_mem_pair_operand (mem_1, mode))
11315 return false;
11317 msize = GET_MODE_SIZE (mode);
11318 mem_2 = change_address (mem_2, VOIDmode,
11319 plus_constant (DImode,
11320 operands[8],
11321 new_off + msize));
11322 mem_3 = change_address (mem_3, VOIDmode,
11323 plus_constant (DImode,
11324 operands[8],
11325 new_off + msize * 2));
11326 mem_4 = change_address (mem_4, VOIDmode,
11327 plus_constant (DImode,
11328 operands[8],
11329 new_off + msize * 3));
11331 if (code == ZERO_EXTEND)
11333 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11334 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11335 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11336 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11338 else if (code == SIGN_EXTEND)
11340 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11341 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11342 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11343 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11346 if (load)
11348 operands[1] = mem_1;
11349 operands[3] = mem_2;
11350 operands[5] = mem_3;
11351 operands[7] = mem_4;
11353 else
11355 operands[0] = mem_1;
11356 operands[2] = mem_2;
11357 operands[4] = mem_3;
11358 operands[6] = mem_4;
11361 /* Emit adjusting instruction. */
11362 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11363 plus_constant (DImode, base, adj_off)));
11364 /* Emit ldp/stp instructions. */
11365 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11366 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11367 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11368 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11369 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11370 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11371 return true;
11374 #undef TARGET_ADDRESS_COST
11375 #define TARGET_ADDRESS_COST aarch64_address_cost
11377 /* This hook determines whether unnamed bitfields affect the alignment
11378 of the containing structure. The hook returns true if the structure
11379 should inherit the alignment requirements of an unnamed bitfield's
11380 type. */
11381 #undef TARGET_ALIGN_ANON_BITFIELD
11382 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11384 #undef TARGET_ASM_ALIGNED_DI_OP
11385 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11387 #undef TARGET_ASM_ALIGNED_HI_OP
11388 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11390 #undef TARGET_ASM_ALIGNED_SI_OP
11391 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11393 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11394 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11395 hook_bool_const_tree_hwi_hwi_const_tree_true
11397 #undef TARGET_ASM_FILE_START
11398 #define TARGET_ASM_FILE_START aarch64_start_file
11400 #undef TARGET_ASM_OUTPUT_MI_THUNK
11401 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11403 #undef TARGET_ASM_SELECT_RTX_SECTION
11404 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11406 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11407 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11409 #undef TARGET_BUILD_BUILTIN_VA_LIST
11410 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11412 #undef TARGET_CALLEE_COPIES
11413 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11415 #undef TARGET_CAN_ELIMINATE
11416 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11418 #undef TARGET_CANNOT_FORCE_CONST_MEM
11419 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11421 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11422 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11424 /* Only the least significant bit is used for initialization guard
11425 variables. */
11426 #undef TARGET_CXX_GUARD_MASK_BIT
11427 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11429 #undef TARGET_C_MODE_FOR_SUFFIX
11430 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11432 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11433 #undef TARGET_DEFAULT_TARGET_FLAGS
11434 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11435 #endif
11437 #undef TARGET_CLASS_MAX_NREGS
11438 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11440 #undef TARGET_BUILTIN_DECL
11441 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11443 #undef TARGET_EXPAND_BUILTIN
11444 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11446 #undef TARGET_EXPAND_BUILTIN_VA_START
11447 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11449 #undef TARGET_FOLD_BUILTIN
11450 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11452 #undef TARGET_FUNCTION_ARG
11453 #define TARGET_FUNCTION_ARG aarch64_function_arg
11455 #undef TARGET_FUNCTION_ARG_ADVANCE
11456 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11458 #undef TARGET_FUNCTION_ARG_BOUNDARY
11459 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11461 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11462 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11464 #undef TARGET_FUNCTION_VALUE
11465 #define TARGET_FUNCTION_VALUE aarch64_function_value
11467 #undef TARGET_FUNCTION_VALUE_REGNO_P
11468 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11470 #undef TARGET_FRAME_POINTER_REQUIRED
11471 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11473 #undef TARGET_GIMPLE_FOLD_BUILTIN
11474 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11476 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11477 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11479 #undef TARGET_INIT_BUILTINS
11480 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11482 #undef TARGET_LEGITIMATE_ADDRESS_P
11483 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11485 #undef TARGET_LEGITIMATE_CONSTANT_P
11486 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11488 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11489 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11491 #undef TARGET_LRA_P
11492 #define TARGET_LRA_P hook_bool_void_true
11494 #undef TARGET_MANGLE_TYPE
11495 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11497 #undef TARGET_MEMORY_MOVE_COST
11498 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11500 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11501 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11503 #undef TARGET_MUST_PASS_IN_STACK
11504 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11506 /* This target hook should return true if accesses to volatile bitfields
11507 should use the narrowest mode possible. It should return false if these
11508 accesses should use the bitfield container type. */
11509 #undef TARGET_NARROW_VOLATILE_BITFIELD
11510 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11512 #undef TARGET_OPTION_OVERRIDE
11513 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11515 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11516 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11517 aarch64_override_options_after_change
11519 #undef TARGET_PASS_BY_REFERENCE
11520 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11522 #undef TARGET_PREFERRED_RELOAD_CLASS
11523 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11525 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11526 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11528 #undef TARGET_SECONDARY_RELOAD
11529 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11531 #undef TARGET_SHIFT_TRUNCATION_MASK
11532 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11534 #undef TARGET_SETUP_INCOMING_VARARGS
11535 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11537 #undef TARGET_STRUCT_VALUE_RTX
11538 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11540 #undef TARGET_REGISTER_MOVE_COST
11541 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11543 #undef TARGET_RETURN_IN_MEMORY
11544 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11546 #undef TARGET_RETURN_IN_MSB
11547 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11549 #undef TARGET_RTX_COSTS
11550 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11552 #undef TARGET_SCHED_ISSUE_RATE
11553 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11555 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11556 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11557 aarch64_sched_first_cycle_multipass_dfa_lookahead
11559 #undef TARGET_TRAMPOLINE_INIT
11560 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11562 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11563 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11565 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11566 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11568 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11569 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11571 #undef TARGET_VECTORIZE_ADD_STMT_COST
11572 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11574 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11575 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11576 aarch64_builtin_vectorization_cost
11578 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11579 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11581 #undef TARGET_VECTORIZE_BUILTINS
11582 #define TARGET_VECTORIZE_BUILTINS
11584 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11585 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11586 aarch64_builtin_vectorized_function
11588 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11589 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11590 aarch64_autovectorize_vector_sizes
11592 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11593 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11594 aarch64_atomic_assign_expand_fenv
11596 /* Section anchor support. */
11598 #undef TARGET_MIN_ANCHOR_OFFSET
11599 #define TARGET_MIN_ANCHOR_OFFSET -256
11601 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11602 byte offset; we can do much more for larger data types, but have no way
11603 to determine the size of the access. We assume accesses are aligned. */
11604 #undef TARGET_MAX_ANCHOR_OFFSET
11605 #define TARGET_MAX_ANCHOR_OFFSET 4095
11607 #undef TARGET_VECTOR_ALIGNMENT
11608 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11610 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11611 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11612 aarch64_simd_vector_alignment_reachable
11614 /* vec_perm support. */
11616 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11617 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11618 aarch64_vectorize_vec_perm_const_ok
11621 #undef TARGET_FIXED_CONDITION_CODE_REGS
11622 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11624 #undef TARGET_FLAGS_REGNUM
11625 #define TARGET_FLAGS_REGNUM CC_REGNUM
11627 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11628 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11630 #undef TARGET_ASAN_SHADOW_OFFSET
11631 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11633 #undef TARGET_LEGITIMIZE_ADDRESS
11634 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11636 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11637 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11638 aarch64_use_by_pieces_infrastructure_p
11640 #undef TARGET_CAN_USE_DOLOOP_P
11641 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11643 #undef TARGET_SCHED_MACRO_FUSION_P
11644 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11646 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11647 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11649 #undef TARGET_SCHED_FUSION_PRIORITY
11650 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11652 struct gcc_target targetm = TARGET_INITIALIZER;
11654 #include "gt-aarch64.h"