gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
99 /* Defined for convenience. */
100 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
102 /* Classifies an address.
104 ADDRESS_REG_IMM
105 A simple base register plus immediate offset.
107 ADDRESS_REG_WB
108 A base register indexed by immediate offset with writeback.
110 ADDRESS_REG_REG
111 A base register indexed by (optionally scaled) register.
113 ADDRESS_REG_UXTW
114 A base register indexed by (optionally scaled) zero-extended register.
116 ADDRESS_REG_SXTW
117 A base register indexed by (optionally scaled) sign-extended register.
119 ADDRESS_LO_SUM
120 A LO_SUM rtx with a base register and "LO12" symbol relocation.
122 ADDRESS_SYMBOLIC:
123 A constant symbolic address, in pc-relative literal pool. */
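 /* For illustration (examples only, not taken from the original sources),
    typical assembly forms for each class are:
      ADDRESS_REG_IMM    ldr x0, [x1, #16]
      ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
      ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
      ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #2]
      ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #2]
      ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
      ADDRESS_SYMBOLIC   ldr x0, .Lconst   (PC-relative literal load)  */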
125 enum aarch64_address_type {
126 ADDRESS_REG_IMM,
127 ADDRESS_REG_WB,
128 ADDRESS_REG_REG,
129 ADDRESS_REG_UXTW,
130 ADDRESS_REG_SXTW,
131 ADDRESS_LO_SUM,
132 ADDRESS_SYMBOLIC
135 struct aarch64_address_info {
136 enum aarch64_address_type type;
137 rtx base;
138 rtx offset;
139 int shift;
140 enum aarch64_symbol_type symbol_type;
143 struct simd_immediate_info
145 rtx value;
146 int shift;
147 int element_width;
148 bool mvn;
149 bool msl;
152 /* The current code model. */
153 enum aarch64_code_model aarch64_cmodel;
155 #ifdef HAVE_AS_TLS
156 #undef TARGET_HAVE_TLS
157 #define TARGET_HAVE_TLS 1
158 #endif
160 static bool aarch64_composite_type_p (const_tree, machine_mode);
161 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
162 const_tree,
163 machine_mode *, int *,
164 bool *);
165 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
166 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_override_options_after_change (void);
168 static bool aarch64_vector_mode_supported_p (machine_mode);
169 static unsigned bit_count (unsigned HOST_WIDE_INT);
170 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
171 const unsigned char *sel);
172 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
174 /* Major revision number of the ARM Architecture implemented by the target. */
175 unsigned aarch64_architecture_version;
177 /* The processor for which instructions should be scheduled. */
178 enum aarch64_processor aarch64_tune = cortexa53;
180 /* The current tuning set. */
181 const struct tune_params *aarch64_tune_params;
183 /* Mask to specify which instructions we are allowed to generate. */
184 unsigned long aarch64_isa_flags = 0;
186 /* Mask to specify which instruction scheduling options should be used. */
187 unsigned long aarch64_tune_flags = 0;
189 /* Tuning parameters. */
191 #if HAVE_DESIGNATED_INITIALIZERS
192 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
193 #else
194 #define NAMED_PARAM(NAME, VAL) (VAL)
195 #endif
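 /* A minimal expansion example (illustrative):
      NAMED_PARAM (issue_rate, 2)
    becomes ".issue_rate = (2)" when HAVE_DESIGNATED_INITIALIZERS is set and
    just "(2)" otherwise, which is why the entries in the tables below must
    stay in field declaration order for the fallback case.  */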
197 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
198 __extension__
199 #endif
201 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
202 __extension__
203 #endif
204 static const struct cpu_addrcost_table generic_addrcost_table =
206 #if HAVE_DESIGNATED_INITIALIZERS
207 .addr_scale_costs =
208 #endif
210 NAMED_PARAM (hi, 0),
211 NAMED_PARAM (si, 0),
212 NAMED_PARAM (di, 0),
213 NAMED_PARAM (ti, 0),
215 NAMED_PARAM (pre_modify, 0),
216 NAMED_PARAM (post_modify, 0),
217 NAMED_PARAM (register_offset, 0),
218 NAMED_PARAM (register_extend, 0),
219 NAMED_PARAM (imm_offset, 0)
222 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
223 __extension__
224 #endif
225 static const struct cpu_addrcost_table cortexa57_addrcost_table =
227 #if HAVE_DESIGNATED_INITIALIZERS
228 .addr_scale_costs =
229 #endif
231 NAMED_PARAM (hi, 1),
232 NAMED_PARAM (si, 0),
233 NAMED_PARAM (di, 0),
234 NAMED_PARAM (ti, 1),
236 NAMED_PARAM (pre_modify, 0),
237 NAMED_PARAM (post_modify, 0),
238 NAMED_PARAM (register_offset, 0),
239 NAMED_PARAM (register_extend, 0),
240 NAMED_PARAM (imm_offset, 0),
243 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
244 __extension__
245 #endif
246 static const struct cpu_addrcost_table xgene1_addrcost_table =
248 #if HAVE_DESIGNATED_INITIALIZERS
249 .addr_scale_costs =
250 #endif
252 NAMED_PARAM (hi, 1),
253 NAMED_PARAM (si, 0),
254 NAMED_PARAM (di, 0),
255 NAMED_PARAM (ti, 1),
257 NAMED_PARAM (pre_modify, 1),
258 NAMED_PARAM (post_modify, 0),
259 NAMED_PARAM (register_offset, 0),
260 NAMED_PARAM (register_extend, 1),
261 NAMED_PARAM (imm_offset, 0),
264 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
265 __extension__
266 #endif
267 static const struct cpu_regmove_cost generic_regmove_cost =
269 NAMED_PARAM (GP2GP, 1),
270 /* Avoid the use of slow int<->fp moves for spilling by setting
271 their cost higher than memmov_cost. */
272 NAMED_PARAM (GP2FP, 5),
273 NAMED_PARAM (FP2GP, 5),
274 NAMED_PARAM (FP2FP, 2)
277 static const struct cpu_regmove_cost cortexa57_regmove_cost =
279 NAMED_PARAM (GP2GP, 1),
280 /* Avoid the use of slow int<->fp moves for spilling by setting
281 their cost higher than memmov_cost. */
282 NAMED_PARAM (GP2FP, 5),
283 NAMED_PARAM (FP2GP, 5),
284 NAMED_PARAM (FP2FP, 2)
287 static const struct cpu_regmove_cost cortexa53_regmove_cost =
289 NAMED_PARAM (GP2GP, 1),
290 /* Avoid the use of slow int<->fp moves for spilling by setting
291 their cost higher than memmov_cost. */
292 NAMED_PARAM (GP2FP, 5),
293 NAMED_PARAM (FP2GP, 5),
294 NAMED_PARAM (FP2FP, 2)
297 static const struct cpu_regmove_cost thunderx_regmove_cost =
299 NAMED_PARAM (GP2GP, 2),
300 NAMED_PARAM (GP2FP, 2),
301 NAMED_PARAM (FP2GP, 6),
302 NAMED_PARAM (FP2FP, 4)
305 static const struct cpu_regmove_cost xgene1_regmove_cost =
307 NAMED_PARAM (GP2GP, 1),
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 NAMED_PARAM (GP2FP, 8),
311 NAMED_PARAM (FP2GP, 8),
312 NAMED_PARAM (FP2FP, 2)
315 /* Generic costs for vector insn classes. */
316 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
317 __extension__
318 #endif
319 static const struct cpu_vector_cost generic_vector_cost =
321 NAMED_PARAM (scalar_stmt_cost, 1),
322 NAMED_PARAM (scalar_load_cost, 1),
323 NAMED_PARAM (scalar_store_cost, 1),
324 NAMED_PARAM (vec_stmt_cost, 1),
325 NAMED_PARAM (vec_to_scalar_cost, 1),
326 NAMED_PARAM (scalar_to_vec_cost, 1),
327 NAMED_PARAM (vec_align_load_cost, 1),
328 NAMED_PARAM (vec_unalign_load_cost, 1),
329 NAMED_PARAM (vec_unalign_store_cost, 1),
330 NAMED_PARAM (vec_store_cost, 1),
331 NAMED_PARAM (cond_taken_branch_cost, 3),
332 NAMED_PARAM (cond_not_taken_branch_cost, 1)
335 /* Generic costs for vector insn classes. */
336 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
337 __extension__
338 #endif
339 static const struct cpu_vector_cost cortexa57_vector_cost =
341 NAMED_PARAM (scalar_stmt_cost, 1),
342 NAMED_PARAM (scalar_load_cost, 4),
343 NAMED_PARAM (scalar_store_cost, 1),
344 NAMED_PARAM (vec_stmt_cost, 3),
345 NAMED_PARAM (vec_to_scalar_cost, 8),
346 NAMED_PARAM (scalar_to_vec_cost, 8),
347 NAMED_PARAM (vec_align_load_cost, 5),
348 NAMED_PARAM (vec_unalign_load_cost, 5),
349 NAMED_PARAM (vec_unalign_store_cost, 1),
350 NAMED_PARAM (vec_store_cost, 1),
351 NAMED_PARAM (cond_taken_branch_cost, 1),
352 NAMED_PARAM (cond_not_taken_branch_cost, 1)
355 /* Generic costs for vector insn classes. */
356 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
357 __extension__
358 #endif
359 static const struct cpu_vector_cost xgene1_vector_cost =
361 NAMED_PARAM (scalar_stmt_cost, 1),
362 NAMED_PARAM (scalar_load_cost, 5),
363 NAMED_PARAM (scalar_store_cost, 1),
364 NAMED_PARAM (vec_stmt_cost, 2),
365 NAMED_PARAM (vec_to_scalar_cost, 4),
366 NAMED_PARAM (scalar_to_vec_cost, 4),
367 NAMED_PARAM (vec_align_load_cost, 10),
368 NAMED_PARAM (vec_unalign_load_cost, 10),
369 NAMED_PARAM (vec_unalign_store_cost, 2),
370 NAMED_PARAM (vec_store_cost, 2),
371 NAMED_PARAM (cond_taken_branch_cost, 2),
372 NAMED_PARAM (cond_not_taken_branch_cost, 1)
375 #define AARCH64_FUSE_NOTHING (0)
376 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
377 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
378 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
379 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
380 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
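 /* Illustrative instruction pairs (examples only) that these fusion flags
    are meant to keep adjacent for the scheduler:
      AARCH64_FUSE_MOV_MOVK    mov  x0, #0x1234      + movk x0, #0x5678, lsl 16
      AARCH64_FUSE_ADRP_ADD    adrp x0, sym          + add  x0, x0, :lo12:sym
      AARCH64_FUSE_MOVK_MOVK   movk x0, #1, lsl 32   + movk x0, #2, lsl 48
      AARCH64_FUSE_ADRP_LDR    adrp x0, sym          + ldr  x1, [x0, :lo12:sym]
      AARCH64_FUSE_CMP_BRANCH  cmp  x0, #0           + b.ne .Llabel  */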
382 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
383 __extension__
384 #endif
385 static const struct tune_params generic_tunings =
387 &cortexa57_extra_costs,
388 &generic_addrcost_table,
389 &generic_regmove_cost,
390 &generic_vector_cost,
391 NAMED_PARAM (memmov_cost, 4),
392 NAMED_PARAM (issue_rate, 2),
393 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING),
394 8, /* function_align. */
395 8, /* jump_align. */
396 4, /* loop_align. */
397 2, /* int_reassoc_width. */
398 4, /* fp_reassoc_width. */
399 1 /* vec_reassoc_width. */
402 static const struct tune_params cortexa53_tunings =
404 &cortexa53_extra_costs,
405 &generic_addrcost_table,
406 &cortexa53_regmove_cost,
407 &generic_vector_cost,
408 NAMED_PARAM (memmov_cost, 4),
409 NAMED_PARAM (issue_rate, 2),
410 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
411 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR)),
412 8, /* function_align. */
413 8, /* jump_align. */
414 4, /* loop_align. */
415 2, /* int_reassoc_width. */
416 4, /* fp_reassoc_width. */
417 1 /* vec_reassoc_width. */
420 static const struct tune_params cortexa57_tunings =
422 &cortexa57_extra_costs,
423 &cortexa57_addrcost_table,
424 &cortexa57_regmove_cost,
425 &cortexa57_vector_cost,
426 NAMED_PARAM (memmov_cost, 4),
427 NAMED_PARAM (issue_rate, 3),
428 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK)),
429 16, /* function_align. */
430 8, /* jump_align. */
431 4, /* loop_align. */
432 2, /* int_reassoc_width. */
433 4, /* fp_reassoc_width. */
434 1 /* vec_reassoc_width. */
437 static const struct tune_params thunderx_tunings =
439 &thunderx_extra_costs,
440 &generic_addrcost_table,
441 &thunderx_regmove_cost,
442 &generic_vector_cost,
443 NAMED_PARAM (memmov_cost, 6),
444 NAMED_PARAM (issue_rate, 2),
445 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH),
446 8, /* function_align. */
447 8, /* jump_align. */
448 8, /* loop_align. */
449 2, /* int_reassoc_width. */
450 4, /* fp_reassoc_width. */
451 1 /* vec_reassoc_width. */
454 static const struct tune_params xgene1_tunings =
456 &xgene1_extra_costs,
457 &xgene1_addrcost_table,
458 &xgene1_regmove_cost,
459 &xgene1_vector_cost,
460 NAMED_PARAM (memmov_cost, 6),
461 NAMED_PARAM (issue_rate, 4),
462 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING),
463 16, /* function_align. */
464 8, /* jump_align. */
465 16, /* loop_align. */
466 2, /* int_reassoc_width. */
467 4, /* fp_reassoc_width. */
468 1 /* vec_reassoc_width. */
471 /* A processor implementing AArch64. */
472 struct processor
474 const char *const name;
475 enum aarch64_processor core;
476 const char *arch;
477 unsigned architecture_version;
478 const unsigned long flags;
479 const struct tune_params *const tune;
482 /* Processor cores implementing AArch64. */
483 static const struct processor all_cores[] =
485 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
486 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
487 #include "aarch64-cores.def"
488 #undef AARCH64_CORE
489 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
490 {NULL, aarch64_none, NULL, 0, 0, NULL}
493 /* Architectures implementing AArch64. */
494 static const struct processor all_architectures[] =
496 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
497 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
498 #include "aarch64-arches.def"
499 #undef AARCH64_ARCH
500 {NULL, aarch64_none, NULL, 0, 0, NULL}
503 /* Target specification.  These are populated as command-line arguments
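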
504 are processed, or NULL if not specified. */
505 static const struct processor *selected_arch;
506 static const struct processor *selected_cpu;
507 static const struct processor *selected_tune;
509 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
511 /* An ISA extension in the co-processor and main instruction set space. */
512 struct aarch64_option_extension
514 const char *const name;
515 const unsigned long flags_on;
516 const unsigned long flags_off;
519 /* ISA extensions in AArch64. */
520 static const struct aarch64_option_extension all_extensions[] =
522 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
523 {NAME, FLAGS_ON, FLAGS_OFF},
524 #include "aarch64-option-extensions.def"
525 #undef AARCH64_OPT_EXTENSION
526 {NULL, 0, 0}
529 /* Used to track the size of an address when generating a pre/post
530 increment address. */
531 static machine_mode aarch64_memory_reference_mode;
533 /* Used to force GTY into this file. */
534 static GTY(()) int gty_dummy;
536 /* A table of valid AArch64 "bitmask immediate" values for
537 logical instructions. */
539 #define AARCH64_NUM_BITMASKS 5334
540 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
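 /* For reference (examples only): a bitmask immediate is a rotated run of
    consecutive ones replicated across the register width, e.g.
      0x00ff00ff00ff00ff   (8 ones in each 16-bit element)
      0x0000ffff0000ffff   (16 ones in each 32-bit element)
    whereas a value such as 0x0000000000012345 has no such encoding and
    must be built with MOV/MOVK instead.  */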
542 typedef enum aarch64_cond_code
544 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
545 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
546 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
548 aarch64_cc;
550 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
552 /* The condition codes of the processor, and the inverse function. */
553 static const char * const aarch64_condition_codes[] =
555 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
556 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
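 /* Purely illustrative sketch, not used elsewhere in this file: the
    assembly name of the inverse of condition CODE, relying on the pairwise
    layout of the table above, e.g. "ne" for AARCH64_EQ and "lt" for
    AARCH64_GE.  */
 static const char *aarch64_inverse_condition_name (aarch64_cc)
   ATTRIBUTE_UNUSED;

 static const char *
 aarch64_inverse_condition_name (aarch64_cc code)
 {
   return aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE (code)];
 }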
559 static unsigned int
560 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
562 return 2;
565 static int
566 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
567 enum machine_mode mode)
569 if (VECTOR_MODE_P (mode))
570 return aarch64_tune_params->vec_reassoc_width;
571 if (INTEGRAL_MODE_P (mode))
572 return aarch64_tune_params->int_reassoc_width;
573 if (FLOAT_MODE_P (mode))
574 return aarch64_tune_params->fp_reassoc_width;
575 return 1;
578 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
579 unsigned
580 aarch64_dbx_register_number (unsigned regno)
582 if (GP_REGNUM_P (regno))
583 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
584 else if (regno == SP_REGNUM)
585 return AARCH64_DWARF_SP;
586 else if (FP_REGNUM_P (regno))
587 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
589 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
590 equivalent DWARF register. */
591 return DWARF_FRAME_REGISTERS;
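 /* For example (following the AArch64 DWARF register numbering): x0 maps
    to 0, x29 to 29, sp to 31 and v0 to 64, while a register with no DWARF
    equivalent (such as the condition flags) yields DWARF_FRAME_REGISTERS.  */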
594 /* Return TRUE if MODE is any of the large INT modes. */
595 static bool
596 aarch64_vect_struct_mode_p (machine_mode mode)
598 return mode == OImode || mode == CImode || mode == XImode;
601 /* Return TRUE if MODE is any of the vector modes. */
602 static bool
603 aarch64_vector_mode_p (machine_mode mode)
605 return aarch64_vector_mode_supported_p (mode)
606 || aarch64_vect_struct_mode_p (mode);
609 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
610 static bool
611 aarch64_array_mode_supported_p (machine_mode mode,
612 unsigned HOST_WIDE_INT nelems)
614 if (TARGET_SIMD
615 && AARCH64_VALID_SIMD_QREG_MODE (mode)
616 && (nelems >= 2 && nelems <= 4))
617 return true;
619 return false;
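 /* For instance (illustrative): with SIMD enabled, an array of two, three
    or four V4SImode vectors (the layout behind arm_neon.h types such as
    int32x4x3_t) is accepted here, so such arrays get dedicated opaque
    modes and can use the structure load/store patterns.  */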
622 /* Implement HARD_REGNO_NREGS. */
625 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
627 switch (aarch64_regno_regclass (regno))
629 case FP_REGS:
630 case FP_LO_REGS:
631 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
632 default:
633 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
635 gcc_unreachable ();
638 /* Implement HARD_REGNO_MODE_OK. */
641 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
643 if (GET_MODE_CLASS (mode) == MODE_CC)
644 return regno == CC_REGNUM;
646 if (regno == SP_REGNUM)
647 /* The purpose of comparing with ptr_mode is to support the
648 global register variable associated with the stack pointer
649 register via the syntax of asm ("wsp") in ILP32. */
650 return mode == Pmode || mode == ptr_mode;
652 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
653 return mode == Pmode;
655 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
656 return 1;
658 if (FP_REGNUM_P (regno))
660 if (aarch64_vect_struct_mode_p (mode))
661 return
662 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
663 else
664 return 1;
667 return 0;
670 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
671 machine_mode
672 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
673 machine_mode mode)
675 /* Handle modes that fit within single registers. */
676 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
678 if (GET_MODE_SIZE (mode) >= 4)
679 return mode;
680 else
681 return SImode;
683 /* Fall back to generic for multi-reg and very large modes. */
684 else
685 return choose_hard_reg_mode (regno, nregs, false);
688 /* Return true if calls to DECL should be treated as
689 long-calls (i.e. called via a register).  */
690 static bool
691 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
693 return false;
696 /* Return true if calls to symbol-ref SYM should be treated as
697 long-calls (i.e. called via a register).  */
698 bool
699 aarch64_is_long_call_p (rtx sym)
701 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
704 /* Return true if the offsets to a zero/sign-extract operation
705 represent an expression that matches an extend operation. The
706 operands represent the parameters from
708 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
709 bool
710 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
711 rtx extract_imm)
713 HOST_WIDE_INT mult_val, extract_val;
715 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
716 return false;
718 mult_val = INTVAL (mult_imm);
719 extract_val = INTVAL (extract_imm);
721 if (extract_val > 8
722 && extract_val < GET_MODE_BITSIZE (mode)
723 && exact_log2 (extract_val & ~7) > 0
724 && (extract_val & 7) <= 4
725 && mult_val == (1 << (extract_val & 7)))
726 return true;
728 return false;
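 /* A concrete case (illustrative): in DImode, MULT_IMM == 4 together with
    EXTRACT_IMM == 34 passes the checks above (34 & ~7 is a power of two,
    34 & 7 == 2, and 4 == 1 << 2), i.e. a 32-bit value that is extended and
    then shifted left by 2, as in "add x0, x1, w2, sxtw #2".  */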
731 /* Emit an insn that's a simple single-set. Both the operands must be
732 known to be valid. */
733 inline static rtx
734 emit_set_insn (rtx x, rtx y)
736 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
739 /* X and Y are two things to compare using CODE. Emit the compare insn and
740 return the rtx for register 0 in the proper mode. */
742 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
744 machine_mode mode = SELECT_CC_MODE (code, x, y);
745 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
747 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
748 return cc_reg;
751 /* Build the SYMBOL_REF for __tls_get_addr. */
753 static GTY(()) rtx tls_get_addr_libfunc;
756 aarch64_tls_get_addr (void)
758 if (!tls_get_addr_libfunc)
759 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
760 return tls_get_addr_libfunc;
763 /* Return the TLS model to use for ADDR. */
765 static enum tls_model
766 tls_symbolic_operand_type (rtx addr)
768 enum tls_model tls_kind = TLS_MODEL_NONE;
769 rtx sym, addend;
771 if (GET_CODE (addr) == CONST)
773 split_const (addr, &sym, &addend);
774 if (GET_CODE (sym) == SYMBOL_REF)
775 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
777 else if (GET_CODE (addr) == SYMBOL_REF)
778 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
780 return tls_kind;
783 /* We allow LO_SUM rtxes in our legitimate addresses so that
784 combine can take care of combining addresses where necessary,
785 but for generation purposes we generate the address as
786 follows:
787 RTL Absolute
788 tmp = hi (symbol_ref); adrp x1, foo
789 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
792 PIC TLS
793 adrp x1, :got:foo adrp tmp, :tlsgd:foo
794 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
795 bl __tls_get_addr
798 Load TLS symbol, depending on TLS mechanism and TLS access model.
800 Global Dynamic - Traditional TLS:
801 adrp tmp, :tlsgd:imm
802 add dest, tmp, #:tlsgd_lo12:imm
803 bl __tls_get_addr
805 Global Dynamic - TLS Descriptors:
806 adrp dest, :tlsdesc:imm
807 ldr tmp, [dest, #:tlsdesc_lo12:imm]
808 add dest, dest, #:tlsdesc_lo12:imm
809 blr tmp
810 mrs tp, tpidr_el0
811 add dest, dest, tp
813 Initial Exec:
814 mrs tp, tpidr_el0
815 adrp tmp, :gottprel:imm
816 ldr dest, [tmp, #:gottprel_lo12:imm]
817 add dest, dest, tp
819 Local Exec:
820 mrs tp, tpidr_el0
821 add t0, tp, #:tprel_hi12:imm
822 add t0, #:tprel_lo12_nc:imm
825 static void
826 aarch64_load_symref_appropriately (rtx dest, rtx imm,
827 enum aarch64_symbol_type type)
829 switch (type)
831 case SYMBOL_SMALL_ABSOLUTE:
833 /* In ILP32, the mode of dest can be either SImode or DImode. */
834 rtx tmp_reg = dest;
835 machine_mode mode = GET_MODE (dest);
837 gcc_assert (mode == Pmode || mode == ptr_mode);
839 if (can_create_pseudo_p ())
840 tmp_reg = gen_reg_rtx (mode);
842 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
843 emit_insn (gen_add_losym (dest, tmp_reg, imm));
844 return;
847 case SYMBOL_TINY_ABSOLUTE:
848 emit_insn (gen_rtx_SET (Pmode, dest, imm));
849 return;
851 case SYMBOL_SMALL_GOT:
853 /* In ILP32, the mode of dest can be either SImode or DImode,
854 while the got entry is always of SImode size. The mode of
855 dest depends on how dest is used: if dest is assigned to a
856 pointer (e.g. in memory), it has SImode; it may have
857 DImode if dest is dereferenced to access memory.
858 This is why we have to handle three different ldr_got_small
859 patterns here (two patterns for ILP32). */
860 rtx tmp_reg = dest;
861 machine_mode mode = GET_MODE (dest);
863 if (can_create_pseudo_p ())
864 tmp_reg = gen_reg_rtx (mode);
866 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
867 if (mode == ptr_mode)
869 if (mode == DImode)
870 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
871 else
872 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
874 else
876 gcc_assert (mode == Pmode);
877 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
880 return;
883 case SYMBOL_SMALL_TLSGD:
885 rtx_insn *insns;
886 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
888 start_sequence ();
889 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
890 insns = get_insns ();
891 end_sequence ();
893 RTL_CONST_CALL_P (insns) = 1;
894 emit_libcall_block (insns, dest, result, imm);
895 return;
898 case SYMBOL_SMALL_TLSDESC:
900 machine_mode mode = GET_MODE (dest);
901 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
902 rtx tp;
904 gcc_assert (mode == Pmode || mode == ptr_mode);
906 /* In ILP32, the got entry is always of SImode size. Unlike
907 small GOT, the dest is fixed at reg 0. */
908 if (TARGET_ILP32)
909 emit_insn (gen_tlsdesc_small_si (imm));
910 else
911 emit_insn (gen_tlsdesc_small_di (imm));
912 tp = aarch64_load_tp (NULL);
914 if (mode != Pmode)
915 tp = gen_lowpart (mode, tp);
917 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
918 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
919 return;
922 case SYMBOL_SMALL_GOTTPREL:
924 /* In ILP32, the mode of dest can be either SImode or DImode,
925 while the got entry is always of SImode size. The mode of
926 dest depends on how dest is used: if dest is assigned to a
927 pointer (e.g. in memory), it has SImode; it may have
928 DImode if dest is dereferenced to access memory.
929 This is why we have to handle three different tlsie_small
930 patterns here (two patterns for ILP32). */
931 machine_mode mode = GET_MODE (dest);
932 rtx tmp_reg = gen_reg_rtx (mode);
933 rtx tp = aarch64_load_tp (NULL);
935 if (mode == ptr_mode)
937 if (mode == DImode)
938 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
939 else
941 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
942 tp = gen_lowpart (mode, tp);
945 else
947 gcc_assert (mode == Pmode);
948 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
951 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
952 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
953 return;
956 case SYMBOL_SMALL_TPREL:
958 rtx tp = aarch64_load_tp (NULL);
959 emit_insn (gen_tlsle_small (dest, tp, imm));
960 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
961 return;
964 case SYMBOL_TINY_GOT:
965 emit_insn (gen_ldr_got_tiny (dest, imm));
966 return;
968 default:
969 gcc_unreachable ();
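 /* For instance (illustrative): a local-exec access such as

      static __thread int counter;
      int bump (void) { return ++counter; }

    is expected to use the Local Exec sequence described in the comment
    above this function: an mrs read of tpidr_el0 followed by the
    tprel_hi12/tprel_lo12_nc additions that form the address of "counter".  */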
973 /* Emit a move from SRC to DEST. Assume that the move expanders can
974 handle all moves if !can_create_pseudo_p (). The distinction is
975 important because, unlike emit_move_insn, the move expanders know
976 how to force Pmode objects into the constant pool even when the
977 constant pool address is not itself legitimate. */
978 static rtx
979 aarch64_emit_move (rtx dest, rtx src)
981 return (can_create_pseudo_p ()
982 ? emit_move_insn (dest, src)
983 : emit_move_insn_1 (dest, src));
986 /* Split a 128-bit move operation into two 64-bit move operations,
987 taking care to handle partial overlap of register to register
988 copies. Special cases are needed when moving between GP regs and
989 FP regs. SRC can be a register, constant or memory; DST a register
990 or memory. If either operand is memory it must not have any side
991 effects. */
992 void
993 aarch64_split_128bit_move (rtx dst, rtx src)
995 rtx dst_lo, dst_hi;
996 rtx src_lo, src_hi;
998 machine_mode mode = GET_MODE (dst);
1000 gcc_assert (mode == TImode || mode == TFmode);
1001 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1002 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1004 if (REG_P (dst) && REG_P (src))
1006 int src_regno = REGNO (src);
1007 int dst_regno = REGNO (dst);
1009 /* Handle FP <-> GP regs. */
1010 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1012 src_lo = gen_lowpart (word_mode, src);
1013 src_hi = gen_highpart (word_mode, src);
1015 if (mode == TImode)
1017 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1018 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1020 else
1022 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1023 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1025 return;
1027 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1029 dst_lo = gen_lowpart (word_mode, dst);
1030 dst_hi = gen_highpart (word_mode, dst);
1032 if (mode == TImode)
1034 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1035 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1037 else
1039 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1040 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1042 return;
1046 dst_lo = gen_lowpart (word_mode, dst);
1047 dst_hi = gen_highpart (word_mode, dst);
1048 src_lo = gen_lowpart (word_mode, src);
1049 src_hi = gen_highpart_mode (word_mode, mode, src);
1051 /* At most one pairing may overlap. */
1052 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1054 aarch64_emit_move (dst_hi, src_hi);
1055 aarch64_emit_move (dst_lo, src_lo);
1057 else
1059 aarch64_emit_move (dst_lo, src_lo);
1060 aarch64_emit_move (dst_hi, src_hi);
1064 bool
1065 aarch64_split_128bit_move_p (rtx dst, rtx src)
1067 return (! REG_P (src)
1068 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
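 /* For example (illustrative): a TImode copy between two FP/SIMD registers
    is not split (it can remain a single 128-bit register move), whereas a
    TImode copy whose source is memory, a constant or a GP register pair is
    split into two 64-bit moves by aarch64_split_128bit_move.  */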
1071 /* Split a complex SIMD combine. */
1073 void
1074 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1076 machine_mode src_mode = GET_MODE (src1);
1077 machine_mode dst_mode = GET_MODE (dst);
1079 gcc_assert (VECTOR_MODE_P (dst_mode));
1081 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1083 rtx (*gen) (rtx, rtx, rtx);
1085 switch (src_mode)
1087 case V8QImode:
1088 gen = gen_aarch64_simd_combinev8qi;
1089 break;
1090 case V4HImode:
1091 gen = gen_aarch64_simd_combinev4hi;
1092 break;
1093 case V2SImode:
1094 gen = gen_aarch64_simd_combinev2si;
1095 break;
1096 case V2SFmode:
1097 gen = gen_aarch64_simd_combinev2sf;
1098 break;
1099 case DImode:
1100 gen = gen_aarch64_simd_combinedi;
1101 break;
1102 case DFmode:
1103 gen = gen_aarch64_simd_combinedf;
1104 break;
1105 default:
1106 gcc_unreachable ();
1109 emit_insn (gen (dst, src1, src2));
1110 return;
1114 /* Split a complex SIMD move. */
1116 void
1117 aarch64_split_simd_move (rtx dst, rtx src)
1119 machine_mode src_mode = GET_MODE (src);
1120 machine_mode dst_mode = GET_MODE (dst);
1122 gcc_assert (VECTOR_MODE_P (dst_mode));
1124 if (REG_P (dst) && REG_P (src))
1126 rtx (*gen) (rtx, rtx);
1128 gcc_assert (VECTOR_MODE_P (src_mode));
1130 switch (src_mode)
1132 case V16QImode:
1133 gen = gen_aarch64_split_simd_movv16qi;
1134 break;
1135 case V8HImode:
1136 gen = gen_aarch64_split_simd_movv8hi;
1137 break;
1138 case V4SImode:
1139 gen = gen_aarch64_split_simd_movv4si;
1140 break;
1141 case V2DImode:
1142 gen = gen_aarch64_split_simd_movv2di;
1143 break;
1144 case V4SFmode:
1145 gen = gen_aarch64_split_simd_movv4sf;
1146 break;
1147 case V2DFmode:
1148 gen = gen_aarch64_split_simd_movv2df;
1149 break;
1150 default:
1151 gcc_unreachable ();
1154 emit_insn (gen (dst, src));
1155 return;
1159 static rtx
1160 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1162 if (can_create_pseudo_p ())
1163 return force_reg (mode, value);
1164 else
1166 x = aarch64_emit_move (x, value);
1167 return x;
1172 static rtx
1173 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1175 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1177 rtx high;
1178 /* Load the full offset into a register. This
1179 might be improvable in the future. */
1180 high = GEN_INT (offset);
1181 offset = 0;
1182 high = aarch64_force_temporary (mode, temp, high);
1183 reg = aarch64_force_temporary (mode, temp,
1184 gen_rtx_PLUS (mode, high, reg));
1186 return plus_constant (mode, reg, offset);
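 /* For example (illustrative): an offset of 4095, or of 4095 << 12, can be
    encoded directly as an add immediate, so only the plus_constant result
    is needed; an offset such as 0x123456 cannot, so it is first loaded
    into TEMP and then added to REG as a register operand.  */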
1189 static int
1190 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1191 machine_mode mode)
1193 unsigned HOST_WIDE_INT mask;
1194 int i;
1195 bool first;
1196 unsigned HOST_WIDE_INT val;
1197 bool subtargets;
1198 rtx subtarget;
1199 int one_match, zero_match, first_not_ffff_match;
1200 int num_insns = 0;
1202 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1204 if (generate)
1205 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1206 num_insns++;
1207 return num_insns;
1210 if (mode == SImode)
1212 /* We know we can't do this in 1 insn, and we must be able to do it
1213 in two; so don't mess around looking for sequences that don't buy
1214 us anything. */
1215 if (generate)
1217 emit_insn (gen_rtx_SET (VOIDmode, dest,
1218 GEN_INT (INTVAL (imm) & 0xffff)));
1219 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1220 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1222 num_insns += 2;
1223 return num_insns;
1226 /* Remaining cases are all for DImode. */
1228 val = INTVAL (imm);
1229 subtargets = optimize && can_create_pseudo_p ();
1231 one_match = 0;
1232 zero_match = 0;
1233 mask = 0xffff;
1234 first_not_ffff_match = -1;
1236 for (i = 0; i < 64; i += 16, mask <<= 16)
1238 if ((val & mask) == mask)
1239 one_match++;
1240 else
1242 if (first_not_ffff_match < 0)
1243 first_not_ffff_match = i;
1244 if ((val & mask) == 0)
1245 zero_match++;
1249 if (one_match == 2)
1251 /* Set one of the quarters and then insert back into result. */
1252 mask = 0xffffll << first_not_ffff_match;
1253 if (generate)
1255 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1256 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1257 GEN_INT ((val >> first_not_ffff_match)
1258 & 0xffff)));
1260 num_insns += 2;
1261 return num_insns;
1264 if (zero_match == 2)
1265 goto simple_sequence;
1267 mask = 0x0ffff0000UL;
1268 for (i = 16; i < 64; i += 16, mask <<= 16)
1270 HOST_WIDE_INT comp = mask & ~(mask - 1);
1272 if (aarch64_uimm12_shift (val - (val & mask)))
1274 if (generate)
1276 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1277 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1278 GEN_INT (val & mask)));
1279 emit_insn (gen_adddi3 (dest, subtarget,
1280 GEN_INT (val - (val & mask))));
1282 num_insns += 2;
1283 return num_insns;
1285 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1287 if (generate)
1289 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1290 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1291 GEN_INT ((val + comp) & mask)));
1292 emit_insn (gen_adddi3 (dest, subtarget,
1293 GEN_INT (val - ((val + comp) & mask))));
1295 num_insns += 2;
1296 return num_insns;
1298 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1300 if (generate)
1302 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1303 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1304 GEN_INT ((val - comp) | ~mask)));
1305 emit_insn (gen_adddi3 (dest, subtarget,
1306 GEN_INT (val - ((val - comp) | ~mask))));
1308 num_insns += 2;
1309 return num_insns;
1311 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1313 if (generate)
1315 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1316 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1317 GEN_INT (val | ~mask)));
1318 emit_insn (gen_adddi3 (dest, subtarget,
1319 GEN_INT (val - (val | ~mask))));
1321 num_insns += 2;
1322 return num_insns;
1326 /* See if we can do it by arithmetically combining two
1327 immediates. */
1328 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1330 int j;
1331 mask = 0xffff;
1333 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1334 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1336 if (generate)
1338 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1339 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1340 GEN_INT (aarch64_bitmasks[i])));
1341 emit_insn (gen_adddi3 (dest, subtarget,
1342 GEN_INT (val - aarch64_bitmasks[i])));
1344 num_insns += 2;
1345 return num_insns;
1348 for (j = 0; j < 64; j += 16, mask <<= 16)
1350 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1352 if (generate)
1354 emit_insn (gen_rtx_SET (VOIDmode, dest,
1355 GEN_INT (aarch64_bitmasks[i])));
1356 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1357 GEN_INT ((val >> j) & 0xffff)));
1359 num_insns += 2;
1360 return num_insns;
1365 /* See if we can do it by logically combining two immediates. */
1366 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1368 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1370 int j;
1372 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1373 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1375 if (generate)
1377 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1378 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1379 GEN_INT (aarch64_bitmasks[i])));
1380 emit_insn (gen_iordi3 (dest, subtarget,
1381 GEN_INT (aarch64_bitmasks[j])));
1383 num_insns += 2;
1384 return num_insns;
1387 else if ((val & aarch64_bitmasks[i]) == val)
1389 int j;
1391 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1392 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1394 if (generate)
1396 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1397 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1398 GEN_INT (aarch64_bitmasks[j])));
1399 emit_insn (gen_anddi3 (dest, subtarget,
1400 GEN_INT (aarch64_bitmasks[i])));
1402 num_insns += 2;
1403 return num_insns;
1408 if (one_match > zero_match)
1410 /* Set either first three quarters or all but the third. */
1411 mask = 0xffffll << (16 - first_not_ffff_match);
1412 if (generate)
1413 emit_insn (gen_rtx_SET (VOIDmode, dest,
1414 GEN_INT (val | mask | 0xffffffff00000000ull)));
1415 num_insns ++;
1417 /* Now insert other two quarters. */
1418 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1419 i < 64; i += 16, mask <<= 16)
1421 if ((val & mask) != mask)
1423 if (generate)
1424 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1425 GEN_INT ((val >> i) & 0xffff)));
1426 num_insns ++;
1429 return num_insns;
1432 simple_sequence:
1433 first = true;
1434 mask = 0xffff;
1435 for (i = 0; i < 64; i += 16, mask <<= 16)
1437 if ((val & mask) != 0)
1439 if (first)
1441 if (generate)
1442 emit_insn (gen_rtx_SET (VOIDmode, dest,
1443 GEN_INT (val & mask)));
1444 num_insns ++;
1445 first = false;
1447 else
1449 if (generate)
1450 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1451 GEN_INT ((val >> i) & 0xffff)));
1452 num_insns ++;
1457 return num_insns;
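 /* A worked example (illustrative): for the DImode constant
    0x0000123400005678 two of the four 16-bit quarters are zero, so the
    code above falls through to the simple sequence and returns 2:
      mov  x0, #0x5678
      movk x0, #0x1234, lsl 32  */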
1461 void
1462 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1464 machine_mode mode = GET_MODE (dest);
1466 gcc_assert (mode == SImode || mode == DImode);
1468 /* Check on what type of symbol it is. */
1469 if (GET_CODE (imm) == SYMBOL_REF
1470 || GET_CODE (imm) == LABEL_REF
1471 || GET_CODE (imm) == CONST)
1473 rtx mem, base, offset;
1474 enum aarch64_symbol_type sty;
1476 /* If we have (const (plus symbol offset)), separate out the offset
1477 before we start classifying the symbol. */
1478 split_const (imm, &base, &offset);
1480 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1481 switch (sty)
1483 case SYMBOL_FORCE_TO_MEM:
1484 if (offset != const0_rtx
1485 && targetm.cannot_force_const_mem (mode, imm))
1487 gcc_assert (can_create_pseudo_p ());
1488 base = aarch64_force_temporary (mode, dest, base);
1489 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1490 aarch64_emit_move (dest, base);
1491 return;
1493 mem = force_const_mem (ptr_mode, imm);
1494 gcc_assert (mem);
1495 if (mode != ptr_mode)
1496 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1497 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1498 return;
1500 case SYMBOL_SMALL_TLSGD:
1501 case SYMBOL_SMALL_TLSDESC:
1502 case SYMBOL_SMALL_GOTTPREL:
1503 case SYMBOL_SMALL_GOT:
1504 case SYMBOL_TINY_GOT:
1505 if (offset != const0_rtx)
1507 gcc_assert(can_create_pseudo_p ());
1508 base = aarch64_force_temporary (mode, dest, base);
1509 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1510 aarch64_emit_move (dest, base);
1511 return;
1513 /* FALLTHRU */
1515 case SYMBOL_SMALL_TPREL:
1516 case SYMBOL_SMALL_ABSOLUTE:
1517 case SYMBOL_TINY_ABSOLUTE:
1518 aarch64_load_symref_appropriately (dest, imm, sty);
1519 return;
1521 default:
1522 gcc_unreachable ();
1526 if (!CONST_INT_P (imm))
1528 if (GET_CODE (imm) == HIGH)
1529 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1530 else
1532 rtx mem = force_const_mem (mode, imm);
1533 gcc_assert (mem);
1534 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1537 return;
1540 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1543 static bool
1544 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1545 tree exp ATTRIBUTE_UNUSED)
1547 /* Currently, always true. */
1548 return true;
1551 /* Implement TARGET_PASS_BY_REFERENCE. */
1553 static bool
1554 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1555 machine_mode mode,
1556 const_tree type,
1557 bool named ATTRIBUTE_UNUSED)
1559 HOST_WIDE_INT size;
1560 machine_mode dummymode;
1561 int nregs;
1563 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1564 size = (mode == BLKmode && type)
1565 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1567 /* Aggregates are passed by reference based on their size. */
1568 if (type && AGGREGATE_TYPE_P (type))
1570 size = int_size_in_bytes (type);
1573 /* Variable sized arguments are always returned by reference. */
1574 if (size < 0)
1575 return true;
1577 /* Can this be a candidate to be passed in fp/simd register(s)? */
1578 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1579 &dummymode, &nregs,
1580 NULL))
1581 return false;
1583 /* Arguments which are variable sized or larger than 2 registers are
1584 passed by reference unless they are a homogeneous floating-point
1585 aggregate. */
1586 return size > 2 * UNITS_PER_WORD;
1589 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1590 static bool
1591 aarch64_return_in_msb (const_tree valtype)
1593 machine_mode dummy_mode;
1594 int dummy_int;
1596 /* Never happens in little-endian mode. */
1597 if (!BYTES_BIG_ENDIAN)
1598 return false;
1600 /* Only composite types smaller than or equal to 16 bytes can
1601 be potentially returned in registers. */
1602 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1603 || int_size_in_bytes (valtype) <= 0
1604 || int_size_in_bytes (valtype) > 16)
1605 return false;
1607 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1608 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1609 is always passed/returned in the least significant bits of fp/simd
1610 register(s). */
1611 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1612 &dummy_mode, &dummy_int, NULL))
1613 return false;
1615 return true;
1618 /* Implement TARGET_FUNCTION_VALUE.
1619 Define how to find the value returned by a function. */
1621 static rtx
1622 aarch64_function_value (const_tree type, const_tree func,
1623 bool outgoing ATTRIBUTE_UNUSED)
1625 machine_mode mode;
1626 int unsignedp;
1627 int count;
1628 machine_mode ag_mode;
1630 mode = TYPE_MODE (type);
1631 if (INTEGRAL_TYPE_P (type))
1632 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1634 if (aarch64_return_in_msb (type))
1636 HOST_WIDE_INT size = int_size_in_bytes (type);
1638 if (size % UNITS_PER_WORD != 0)
1640 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1641 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1645 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1646 &ag_mode, &count, NULL))
1648 if (!aarch64_composite_type_p (type, mode))
1650 gcc_assert (count == 1 && mode == ag_mode);
1651 return gen_rtx_REG (mode, V0_REGNUM);
1653 else
1655 int i;
1656 rtx par;
1658 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1659 for (i = 0; i < count; i++)
1661 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1662 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1663 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1664 XVECEXP (par, 0, i) = tmp;
1666 return par;
1669 else
1670 return gen_rtx_REG (mode, R0_REGNUM);
1673 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1674 Return true if REGNO is the number of a hard register in which the values
1675 of called function may come back. */
1677 static bool
1678 aarch64_function_value_regno_p (const unsigned int regno)
1680 /* Maximum of 16 bytes can be returned in the general registers. Examples
1681 of 16-byte return values are: 128-bit integers and 16-byte small
1682 structures (excluding homogeneous floating-point aggregates). */
1683 if (regno == R0_REGNUM || regno == R1_REGNUM)
1684 return true;
1686 /* Up to four fp/simd registers can return a function value, e.g. a
1687 homogeneous floating-point aggregate having four members. */
1688 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1689 return !TARGET_GENERAL_REGS_ONLY;
1691 return false;
1694 /* Implement TARGET_RETURN_IN_MEMORY.
1696 If the type T of the result of a function is such that
1697 void func (T arg)
1698 would require that arg be passed as a value in a register (or set of
1699 registers) according to the parameter passing rules, then the result
1700 is returned in the same registers as would be used for such an
1701 argument. */
1703 static bool
1704 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1706 HOST_WIDE_INT size;
1707 machine_mode ag_mode;
1708 int count;
1710 if (!AGGREGATE_TYPE_P (type)
1711 && TREE_CODE (type) != COMPLEX_TYPE
1712 && TREE_CODE (type) != VECTOR_TYPE)
1713 /* Simple scalar types are always returned in registers.  */
1714 return false;
1716 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1717 type,
1718 &ag_mode,
1719 &count,
1720 NULL))
1721 return false;
1723 /* Types larger than 2 registers are returned in memory.  */
1724 size = int_size_in_bytes (type);
1725 return (size < 0 || size > 2 * UNITS_PER_WORD);
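 /* For example (illustrative): a struct of four doubles is a homogeneous
    floating-point aggregate and so is returned in v0-v3 rather than in
    memory, while a 24-byte struct of three pointers exceeds two GP
    registers and is returned in memory.  */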
1728 static bool
1729 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1730 const_tree type, int *nregs)
1732 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1733 return aarch64_vfp_is_call_or_return_candidate (mode,
1734 type,
1735 &pcum->aapcs_vfp_rmode,
1736 nregs,
1737 NULL);
1740 /* Given MODE and TYPE of a function argument, return the alignment in
1741 bits. The idea is to suppress any stronger alignment requested by
1742 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1743 This is a helper function for local use only. */
1745 static unsigned int
1746 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1748 unsigned int alignment;
1750 if (type)
1752 if (!integer_zerop (TYPE_SIZE (type)))
1754 if (TYPE_MODE (type) == mode)
1755 alignment = TYPE_ALIGN (type);
1756 else
1757 alignment = GET_MODE_ALIGNMENT (mode);
1759 else
1760 alignment = 0;
1762 else
1763 alignment = GET_MODE_ALIGNMENT (mode);
1765 return alignment;
1768 /* Layout a function argument according to the AAPCS64 rules. The rule
1769 numbers refer to the rule numbers in the AAPCS64. */
1771 static void
1772 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1773 const_tree type,
1774 bool named ATTRIBUTE_UNUSED)
1776 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1777 int ncrn, nvrn, nregs;
1778 bool allocate_ncrn, allocate_nvrn;
1779 HOST_WIDE_INT size;
1781 /* We need to do this once per argument. */
1782 if (pcum->aapcs_arg_processed)
1783 return;
1785 pcum->aapcs_arg_processed = true;
1787 /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
1788 size
1789 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1790 UNITS_PER_WORD);
1792 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1793 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1794 mode,
1795 type,
1796 &nregs);
1798 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1799 The following code thus handles passing by SIMD/FP registers first. */
1801 nvrn = pcum->aapcs_nvrn;
1803 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1804 and homogeneous short-vector aggregates (HVA).  */
1805 if (allocate_nvrn)
1807 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1809 pcum->aapcs_nextnvrn = nvrn + nregs;
1810 if (!aarch64_composite_type_p (type, mode))
1812 gcc_assert (nregs == 1);
1813 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1815 else
1817 rtx par;
1818 int i;
1819 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1820 for (i = 0; i < nregs; i++)
1822 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1823 V0_REGNUM + nvrn + i);
1824 tmp = gen_rtx_EXPR_LIST
1825 (VOIDmode, tmp,
1826 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1827 XVECEXP (par, 0, i) = tmp;
1829 pcum->aapcs_reg = par;
1831 return;
1833 else
1835 /* C.3 NSRN is set to 8. */
1836 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1837 goto on_stack;
1841 ncrn = pcum->aapcs_ncrn;
1842 nregs = size / UNITS_PER_WORD;
1844 /* C6 - C9, though the sign and zero extension semantics are
1845 handled elsewhere.  This is the case where the argument fits
1846 entirely in general registers.  */
1847 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1849 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1851 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1853 /* C.8 if the argument has an alignment of 16 then the NGRN is
1854 rounded up to the next even number. */
1855 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1857 ++ncrn;
1858 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1860 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1861 A reg is still generated for it, but the caller should be smart
1862 enough not to use it. */
1863 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1865 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1867 else
1869 rtx par;
1870 int i;
1872 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1873 for (i = 0; i < nregs; i++)
1875 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1876 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1877 GEN_INT (i * UNITS_PER_WORD));
1878 XVECEXP (par, 0, i) = tmp;
1880 pcum->aapcs_reg = par;
1883 pcum->aapcs_nextncrn = ncrn + nregs;
1884 return;
1887 /* C.11 */
1888 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1890 /* The argument is passed on stack; record the needed number of words for
1891 this argument and align the total size if necessary. */
1892 on_stack:
1893 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1894 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1895 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1896 16 / UNITS_PER_WORD);
1897 return;
1900 /* Implement TARGET_FUNCTION_ARG. */
1902 static rtx
1903 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1904 const_tree type, bool named)
1906 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1907 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1909 if (mode == VOIDmode)
1910 return NULL_RTX;
1912 aarch64_layout_arg (pcum_v, mode, type, named);
1913 return pcum->aapcs_reg;
1916 void
1917 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1918 const_tree fntype ATTRIBUTE_UNUSED,
1919 rtx libname ATTRIBUTE_UNUSED,
1920 const_tree fndecl ATTRIBUTE_UNUSED,
1921 unsigned n_named ATTRIBUTE_UNUSED)
1923 pcum->aapcs_ncrn = 0;
1924 pcum->aapcs_nvrn = 0;
1925 pcum->aapcs_nextncrn = 0;
1926 pcum->aapcs_nextnvrn = 0;
1927 pcum->pcs_variant = ARM_PCS_AAPCS64;
1928 pcum->aapcs_reg = NULL_RTX;
1929 pcum->aapcs_arg_processed = false;
1930 pcum->aapcs_stack_words = 0;
1931 pcum->aapcs_stack_size = 0;
1933 return;
1936 static void
1937 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1938 machine_mode mode,
1939 const_tree type,
1940 bool named)
1942 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1943 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1945 aarch64_layout_arg (pcum_v, mode, type, named);
1946 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1947 != (pcum->aapcs_stack_words != 0));
1948 pcum->aapcs_arg_processed = false;
1949 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1950 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1951 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1952 pcum->aapcs_stack_words = 0;
1953 pcum->aapcs_reg = NULL_RTX;
1957 bool
1958 aarch64_function_arg_regno_p (unsigned regno)
1960 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1961 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1964 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1965 PARM_BOUNDARY bits of alignment, but will be given anything up
1966 to STACK_BOUNDARY bits if the type requires it. This makes sure
1967 that both before and after the layout of each argument, the Next
1968 Stacked Argument Address (NSAA) will have a minimum alignment of
1969 8 bytes. */
1971 static unsigned int
1972 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1974 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1976 if (alignment < PARM_BOUNDARY)
1977 alignment = PARM_BOUNDARY;
1978 if (alignment > STACK_BOUNDARY)
1979 alignment = STACK_BOUNDARY;
1980 return alignment;
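 /* For example (illustrative): a char argument is still given
    PARM_BOUNDARY bits of alignment, a naturally 16-byte-aligned type such
    as a TImode integer gets STACK_BOUNDARY, and any stricter alignment
    request is capped at STACK_BOUNDARY.  */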
1983 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1985 Return true if an argument passed on the stack should be padded upwards,
1986 i.e. if the least-significant byte of the stack slot has useful data.
1988 Small aggregate types are placed at the lowest memory address.
1990 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1992 bool
1993 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1995 /* On little-endian targets, the least significant byte of every stack
1996 argument is passed at the lowest byte address of the stack slot. */
1997 if (!BYTES_BIG_ENDIAN)
1998 return true;
2000 /* Otherwise, integral, floating-point and pointer types are padded downward:
2001 the least significant byte of a stack argument is passed at the highest
2002 byte address of the stack slot. */
2003 if (type
2004 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2005 || POINTER_TYPE_P (type))
2006 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2007 return false;
2009 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2010 return true;
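 /* For instance (illustrative): on big-endian, a short passed on the stack
    is padded downward, so its bytes occupy the highest addresses of the
    8-byte slot, whereas a 3-byte struct is padded upward and starts at the
    slot's lowest address; on little-endian everything is padded upward.  */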
2013 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2015 It specifies padding for the last (may also be the only)
2016 element of a block move between registers and memory. If
2017 assuming the block is in memory, padding upward means that
2018 the last element is padded after its most significant byte,
2019 while in downward padding the last element is padded at
2020 its least significant byte side.
2022 Small aggregates and small complex types are always padded
2023 upwards.
2025 We don't need to worry about homogeneous floating-point or
2026 short-vector aggregates; their move is not affected by the
2027 padding direction determined here. Regardless of endianness,
2028 each element of such an aggregate is put in the least
2029 significant bits of a fp/simd register.
2031 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2032 register has useful data, and return the opposite if the most
2033 significant byte does. */
2035 bool
2036 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2037 bool first ATTRIBUTE_UNUSED)
2040 /* Small composite types are always padded upward. */
2041 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2043 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2044 : GET_MODE_SIZE (mode));
2045 if (size < 2 * UNITS_PER_WORD)
2046 return true;
2049 /* Otherwise, use the default padding. */
2050 return !BYTES_BIG_ENDIAN;
2053 static machine_mode
2054 aarch64_libgcc_cmp_return_mode (void)
2056 return SImode;
2059 static bool
2060 aarch64_frame_pointer_required (void)
2062 /* In aarch64_override_options_after_change
2063 flag_omit_leaf_frame_pointer turns off the frame pointer by
2064 default. Turn it back on now if we've not got a leaf
2065 function. */
2066 if (flag_omit_leaf_frame_pointer
2067 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2068 return true;
2070 return false;
2073 /* Mark the registers that need to be saved by the callee and calculate
2074 the size of the callee-saved registers area and frame record (both FP
2075 and LR may be omitted). */
2076 static void
2077 aarch64_layout_frame (void)
2079 HOST_WIDE_INT offset = 0;
2080 int regno;
2082 if (reload_completed && cfun->machine->frame.laid_out)
2083 return;
2085 #define SLOT_NOT_REQUIRED (-2)
2086 #define SLOT_REQUIRED (-1)
2088 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2089 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2091 /* First mark all the registers that really need to be saved... */
2092 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2093 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2095 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2096 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2098 /* ... that includes the eh data registers (if needed)... */
2099 if (crtl->calls_eh_return)
2100 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2101 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2102 = SLOT_REQUIRED;
2104 /* ... and any callee saved register that dataflow says is live. */
2105 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2106 if (df_regs_ever_live_p (regno)
2107 && (regno == R30_REGNUM
2108 || !call_used_regs[regno]))
2109 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2111 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2112 if (df_regs_ever_live_p (regno)
2113 && !call_used_regs[regno])
2114 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2116 if (frame_pointer_needed)
2118 /* FP and LR are placed in the frame record. */
2119 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2120 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2121 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2122 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2123 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2124 offset += 2 * UNITS_PER_WORD;
2127 /* Now assign stack slots for them. */
2128 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2129 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2131 cfun->machine->frame.reg_offset[regno] = offset;
2132 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2133 cfun->machine->frame.wb_candidate1 = regno;
2134 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2135 cfun->machine->frame.wb_candidate2 = regno;
2136 offset += UNITS_PER_WORD;
2139 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2140 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2142 cfun->machine->frame.reg_offset[regno] = offset;
2143 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2144 cfun->machine->frame.wb_candidate1 = regno;
2145 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2146 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2147 cfun->machine->frame.wb_candidate2 = regno;
2148 offset += UNITS_PER_WORD;
2151 cfun->machine->frame.padding0 =
2152 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2153 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2155 cfun->machine->frame.saved_regs_size = offset;
2157 cfun->machine->frame.hard_fp_offset
2158 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2159 + get_frame_size ()
2160 + cfun->machine->frame.saved_regs_size,
2161 STACK_BOUNDARY / BITS_PER_UNIT);
2163 cfun->machine->frame.frame_size
2164 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2165 + crtl->outgoing_args_size,
2166 STACK_BOUNDARY / BITS_PER_UNIT);
2168 cfun->machine->frame.laid_out = true;
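/* A worked example of the layout above (hypothetical function, sizes
   assumed): frame pointer needed, x19, x20 and d8 live, 24 bytes of
   locals, no varargs save area and no outgoing arguments.  FP and LR
   take slots 0 and 8, x19/x20/d8 take 16/24/32, so the raw save area
   is 40 bytes and padding0 rounds it up to saved_regs_size = 48.
   hard_fp_offset = ROUND_UP (0 + 24 + 48, 16) = 80 and, with no
   outgoing arguments, frame_size = 80 as well.  wb_candidate1 and
   wb_candidate2 are x29 and x30.  */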
2171 static bool
2172 aarch64_register_saved_on_entry (int regno)
2174 return cfun->machine->frame.reg_offset[regno] >= 0;
2177 static unsigned
2178 aarch64_next_callee_save (unsigned regno, unsigned limit)
2180 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2181 regno ++;
2182 return regno;
2185 static void
2186 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2187 HOST_WIDE_INT adjustment)
2189 rtx base_rtx = stack_pointer_rtx;
2190 rtx insn, reg, mem;
2192 reg = gen_rtx_REG (mode, regno);
2193 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2194 plus_constant (Pmode, base_rtx, -adjustment));
2195 mem = gen_rtx_MEM (mode, mem);
2197 insn = emit_move_insn (mem, reg);
2198 RTX_FRAME_RELATED_P (insn) = 1;
2201 static rtx
2202 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2203 HOST_WIDE_INT adjustment)
2205 switch (mode)
2207 case DImode:
2208 return gen_storewb_pairdi_di (base, base, reg, reg2,
2209 GEN_INT (-adjustment),
2210 GEN_INT (UNITS_PER_WORD - adjustment));
2211 case DFmode:
2212 return gen_storewb_pairdf_di (base, base, reg, reg2,
2213 GEN_INT (-adjustment),
2214 GEN_INT (UNITS_PER_WORD - adjustment));
2215 default:
2216 gcc_unreachable ();
2220 static void
2221 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2222 unsigned regno2, HOST_WIDE_INT adjustment)
2224 rtx_insn *insn;
2225 rtx reg1 = gen_rtx_REG (mode, regno1);
2226 rtx reg2 = gen_rtx_REG (mode, regno2);
2228 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2229 reg2, adjustment));
2230 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2231 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2232 RTX_FRAME_RELATED_P (insn) = 1;
2235 static rtx
2236 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2237 HOST_WIDE_INT adjustment)
2239 switch (mode)
2241 case DImode:
2242 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2243 GEN_INT (UNITS_PER_WORD));
2244 case DFmode:
2245 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2246 GEN_INT (UNITS_PER_WORD));
2247 default:
2248 gcc_unreachable ();
2252 static rtx
2253 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2254 rtx reg2)
2256 switch (mode)
2258 case DImode:
2259 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2261 case DFmode:
2262 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2264 default:
2265 gcc_unreachable ();
2269 static rtx
2270 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2271 rtx mem2)
2273 switch (mode)
2275 case DImode:
2276 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2278 case DFmode:
2279 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2281 default:
2282 gcc_unreachable ();
2287 static void
2288 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2289 unsigned start, unsigned limit, bool skip_wb)
2291 rtx_insn *insn;
2292 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2293 ? gen_frame_mem : gen_rtx_MEM);
2294 unsigned regno;
2295 unsigned regno2;
2297 for (regno = aarch64_next_callee_save (start, limit);
2298 regno <= limit;
2299 regno = aarch64_next_callee_save (regno + 1, limit))
2301 rtx reg, mem;
2302 HOST_WIDE_INT offset;
2304 if (skip_wb
2305 && (regno == cfun->machine->frame.wb_candidate1
2306 || regno == cfun->machine->frame.wb_candidate2))
2307 continue;
2309 reg = gen_rtx_REG (mode, regno);
2310 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2311 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2312 offset));
2314 regno2 = aarch64_next_callee_save (regno + 1, limit);
2316 if (regno2 <= limit
2317 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2318 == cfun->machine->frame.reg_offset[regno2]))
2321 rtx reg2 = gen_rtx_REG (mode, regno2);
2322 rtx mem2;
2324 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2325 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2326 offset));
2327 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2328 reg2));
2330 /* The first part of a frame-related parallel insn is
2331 always assumed to be relevant to the frame
2332 calculations; subsequent parts are only
2333 frame-related if explicitly marked. */
2334 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2335 regno = regno2;
2337 else
2338 insn = emit_move_insn (mem, reg);
2340 RTX_FRAME_RELATED_P (insn) = 1;
2344 static void
2345 aarch64_restore_callee_saves (machine_mode mode,
2346 HOST_WIDE_INT start_offset, unsigned start,
2347 unsigned limit, bool skip_wb, rtx *cfi_ops)
2349 rtx base_rtx = stack_pointer_rtx;
2350 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2351 ? gen_frame_mem : gen_rtx_MEM);
2352 unsigned regno;
2353 unsigned regno2;
2354 HOST_WIDE_INT offset;
2356 for (regno = aarch64_next_callee_save (start, limit);
2357 regno <= limit;
2358 regno = aarch64_next_callee_save (regno + 1, limit))
2360 rtx reg, mem;
2362 if (skip_wb
2363 && (regno == cfun->machine->frame.wb_candidate1
2364 || regno == cfun->machine->frame.wb_candidate2))
2365 continue;
2367 reg = gen_rtx_REG (mode, regno);
2368 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2369 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2371 regno2 = aarch64_next_callee_save (regno + 1, limit);
2373 if (regno2 <= limit
2374 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2375 == cfun->machine->frame.reg_offset[regno2]))
2377 rtx reg2 = gen_rtx_REG (mode, regno2);
2378 rtx mem2;
2380 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2381 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2382 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2384 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2385 regno = regno2;
2387 else
2388 emit_move_insn (reg, mem);
2389 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2393 /* AArch64 stack frames generated by this compiler look like:
2395 +-------------------------------+
2397 | incoming stack arguments |
2399 +-------------------------------+
2400 | | <-- incoming stack pointer (aligned)
2401 | callee-allocated save area |
2402 | for register varargs |
2404 +-------------------------------+
2405 | local variables | <-- frame_pointer_rtx
2407 +-------------------------------+
2408 | padding0 | \
2409 +-------------------------------+ |
2410 | callee-saved registers | | frame.saved_regs_size
2411 +-------------------------------+ |
2412 | LR' | |
2413 +-------------------------------+ |
2414 | FP' | / <- hard_frame_pointer_rtx (aligned)
2415 +-------------------------------+
2416 | dynamic allocation |
2417 +-------------------------------+
2418 | padding |
2419 +-------------------------------+
2420 | outgoing stack arguments | <-- arg_pointer
2422 +-------------------------------+
2423 | | <-- stack_pointer_rtx (aligned)
2425 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2426 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2427 unchanged. */
2429 /* Generate the prologue instructions for entry into a function.
2430 Establish the stack frame by decreasing the stack pointer with a
2431 properly calculated size and, if necessary, create a frame record
2432 filled with the values of LR and the previous frame pointer. The
2433 current FP is also set up if it is in use. */
2435 void
2436 aarch64_expand_prologue (void)
2438 /* sub sp, sp, #<frame_size>
2439 stp {fp, lr}, [sp, #<frame_size> - 16]
2440 add fp, sp, #<frame_size> - hardfp_offset
2441 stp {cs_reg}, [fp, #-16] etc.
2443 sub sp, sp, <final_adjustment_if_any>
2445 HOST_WIDE_INT frame_size, offset;
2446 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2447 HOST_WIDE_INT hard_fp_offset;
2448 rtx_insn *insn;
2450 aarch64_layout_frame ();
2452 offset = frame_size = cfun->machine->frame.frame_size;
2453 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2454 fp_offset = frame_size - hard_fp_offset;
2456 if (flag_stack_usage_info)
2457 current_function_static_stack_size = frame_size;
2459 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2460 if (offset >= 512)
2462 /* When the frame is large, the stack pointer is decreased first to
2463 step over the callee-allocated save area for register varargs,
2464 the local variable area and/or the callee-saved register area.
2465 This allows the pre-index write-back store pair instructions to
2466 be used to set up the stack frame efficiently. */
2468 offset = hard_fp_offset;
2469 if (offset >= 512)
2470 offset = cfun->machine->frame.saved_regs_size;
2472 frame_size -= (offset + crtl->outgoing_args_size);
2473 fp_offset = 0;
2475 if (frame_size >= 0x1000000)
2477 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2478 emit_move_insn (op0, GEN_INT (-frame_size));
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2481 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2482 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2483 plus_constant (Pmode, stack_pointer_rtx,
2484 -frame_size)));
2485 RTX_FRAME_RELATED_P (insn) = 1;
2487 else if (frame_size > 0)
2489 int hi_ofs = frame_size & 0xfff000;
2490 int lo_ofs = frame_size & 0x000fff;
2492 if (hi_ofs)
2494 insn = emit_insn (gen_add2_insn
2495 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2496 RTX_FRAME_RELATED_P (insn) = 1;
2498 if (lo_ofs)
2500 insn = emit_insn (gen_add2_insn
2501 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2502 RTX_FRAME_RELATED_P (insn) = 1;
2506 else
2507 frame_size = -1;
2509 if (offset > 0)
2511 bool skip_wb = false;
2513 if (frame_pointer_needed)
2515 skip_wb = true;
2517 if (fp_offset)
2519 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2520 GEN_INT (-offset)));
2521 RTX_FRAME_RELATED_P (insn) = 1;
2523 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2524 R30_REGNUM, false);
2526 else
2527 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2529 /* Set up frame pointer to point to the location of the
2530 previous frame pointer on the stack. */
2531 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2532 stack_pointer_rtx,
2533 GEN_INT (fp_offset)));
2534 RTX_FRAME_RELATED_P (insn) = 1;
2535 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2537 else
2539 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2540 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2542 if (fp_offset
2543 || reg1 == FIRST_PSEUDO_REGISTER
2544 || (reg2 == FIRST_PSEUDO_REGISTER
2545 && offset >= 256))
2547 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2548 GEN_INT (-offset)));
2549 RTX_FRAME_RELATED_P (insn) = 1;
2551 else
2553 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2555 skip_wb = true;
2557 if (reg2 == FIRST_PSEUDO_REGISTER)
2558 aarch64_pushwb_single_reg (mode1, reg1, offset);
2559 else
2560 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2564 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2565 skip_wb);
2566 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2567 skip_wb);
2570 /* When offset >= 512, emit:
2571 sub sp, sp, #<outgoing_args_size> */
2572 if (frame_size > -1)
2574 if (crtl->outgoing_args_size > 0)
2576 insn = emit_insn (gen_add2_insn
2577 (stack_pointer_rtx,
2578 GEN_INT (- crtl->outgoing_args_size)));
2579 RTX_FRAME_RELATED_P (insn) = 1;
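/* A worked example of the initial adjustment above (frame size assumed):
   for a remaining adjustment of 0x12345 bytes the decrease is split into
   sub sp, sp, #0x12000 followed by sub sp, sp, #0x345, since each 12-bit
   chunk fits an add/sub immediate.  An adjustment of 0x1000000 bytes or
   more is instead materialized in IP0 and applied in a single add.  */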
2584 /* Return TRUE if we can use a simple_return insn.
2586 This function checks whether the function's stack frame is empty, which
2587 means that no restore actions are needed. The pro_and_epilogue pass uses
2588 this to check whether the shrink-wrapping optimization is feasible. */
2590 bool
2591 aarch64_use_return_insn_p (void)
2593 if (!reload_completed)
2594 return false;
2596 if (crtl->profile)
2597 return false;
2599 aarch64_layout_frame ();
2601 return cfun->machine->frame.frame_size == 0;
2604 /* Generate the epilogue instructions for returning from a function. */
2605 void
2606 aarch64_expand_epilogue (bool for_sibcall)
2608 HOST_WIDE_INT frame_size, offset;
2609 HOST_WIDE_INT fp_offset;
2610 HOST_WIDE_INT hard_fp_offset;
2611 rtx_insn *insn;
2612 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2613 bool need_barrier_p = (get_frame_size () != 0
2614 || cfun->machine->frame.saved_varargs_size);
2616 aarch64_layout_frame ();
2618 offset = frame_size = cfun->machine->frame.frame_size;
2619 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2620 fp_offset = frame_size - hard_fp_offset;
2622 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2623 if (offset >= 512)
2625 offset = hard_fp_offset;
2626 if (offset >= 512)
2627 offset = cfun->machine->frame.saved_regs_size;
2629 frame_size -= (offset + crtl->outgoing_args_size);
2630 fp_offset = 0;
2631 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2633 insn = emit_insn (gen_add2_insn
2634 (stack_pointer_rtx,
2635 GEN_INT (crtl->outgoing_args_size)));
2636 RTX_FRAME_RELATED_P (insn) = 1;
2639 else
2640 frame_size = -1;
2642 /* If there were outgoing arguments or we've done dynamic stack
2643 allocation, then restore the stack pointer from the frame
2644 pointer. This takes at most one insn and is more efficient than
2645 using GCC's internal mechanism. */
2646 if (frame_pointer_needed
2647 && (crtl->outgoing_args_size || cfun->calls_alloca))
2649 if (cfun->calls_alloca)
2650 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2652 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2653 hard_frame_pointer_rtx,
2654 GEN_INT (0)));
2655 offset = offset - fp_offset;
2658 if (offset > 0)
2660 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2661 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2662 bool skip_wb = true;
2663 rtx cfi_ops = NULL;
2665 if (frame_pointer_needed)
2666 fp_offset = 0;
2667 else if (fp_offset
2668 || reg1 == FIRST_PSEUDO_REGISTER
2669 || (reg2 == FIRST_PSEUDO_REGISTER
2670 && offset >= 256))
2671 skip_wb = false;
2673 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2674 skip_wb, &cfi_ops);
2675 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2676 skip_wb, &cfi_ops);
2678 if (need_barrier_p)
2679 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2681 if (skip_wb)
2683 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2684 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2686 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2687 if (reg2 == FIRST_PSEUDO_REGISTER)
2689 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2690 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2691 mem = gen_rtx_MEM (mode1, mem);
2692 insn = emit_move_insn (rreg1, mem);
2694 else
2696 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2698 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2699 insn = emit_insn (aarch64_gen_loadwb_pair
2700 (mode1, stack_pointer_rtx, rreg1,
2701 rreg2, offset));
2704 else
2706 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2707 GEN_INT (offset)));
2710 /* Reset the CFA to be SP + FRAME_SIZE. */
2711 rtx new_cfa = stack_pointer_rtx;
2712 if (frame_size > 0)
2713 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2714 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2715 REG_NOTES (insn) = cfi_ops;
2716 RTX_FRAME_RELATED_P (insn) = 1;
2719 if (frame_size > 0)
2721 if (need_barrier_p)
2722 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2724 if (frame_size >= 0x1000000)
2726 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2727 emit_move_insn (op0, GEN_INT (frame_size));
2728 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2730 else
2732 int hi_ofs = frame_size & 0xfff000;
2733 int lo_ofs = frame_size & 0x000fff;
2735 if (hi_ofs && lo_ofs)
2737 insn = emit_insn (gen_add2_insn
2738 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2739 RTX_FRAME_RELATED_P (insn) = 1;
2740 frame_size = lo_ofs;
2742 insn = emit_insn (gen_add2_insn
2743 (stack_pointer_rtx, GEN_INT (frame_size)));
2746 /* Reset the CFA to be SP + 0. */
2747 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2748 RTX_FRAME_RELATED_P (insn) = 1;
2751 /* Stack adjustment for exception handler. */
2752 if (crtl->calls_eh_return)
2754 /* We need to unwind the stack by the offset computed by
2755 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2756 to be SP; letting the CFA move during this adjustment
2757 is just as correct as retaining the CFA from the body
2758 of the function. Therefore, do nothing special. */
2759 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2762 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2763 if (!for_sibcall)
2764 emit_jump_insn (ret_rtx);
2767 /* Return the place to copy the exception unwinding return address to.
2768 This will probably be a stack slot, but could (in theory) be the
2769 return register. */
2771 aarch64_final_eh_return_addr (void)
2773 HOST_WIDE_INT fp_offset;
2775 aarch64_layout_frame ();
2777 fp_offset = cfun->machine->frame.frame_size
2778 - cfun->machine->frame.hard_fp_offset;
2780 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2781 return gen_rtx_REG (DImode, LR_REGNUM);
2783 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2784 result in a store to save LR introduced by builtin_eh_return () being
2785 incorrectly deleted because the alias is not detected.
2786 So, in the calculation of the address to copy the exception unwinding
2787 return address to, we distinguish two cases.
2788 If FP is needed and fp_offset is 0, then SP = FP and hence we return
2789 an SP-relative location, since all the addresses are SP-relative
2790 in this case. This prevents the store from being optimized away.
2791 If fp_offset is not 0, then the addresses will be FP-relative and
2792 therefore we return an FP-relative location. */
2794 if (frame_pointer_needed)
2796 if (fp_offset)
2797 return gen_frame_mem (DImode,
2798 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2799 else
2800 return gen_frame_mem (DImode,
2801 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2804 /* If FP is not needed, we calculate the location of LR, which would be
2805 at the top of the saved registers block. */
2807 return gen_frame_mem (DImode,
2808 plus_constant (Pmode,
2809 stack_pointer_rtx,
2810 fp_offset
2811 + cfun->machine->frame.saved_regs_size
2812 - 2 * UNITS_PER_WORD));
2815 /* Possibly output code to build up a constant in a register. For
2816 the benefit of the costs infrastructure, returns the number of
2817 instructions that would be emitted. GENERATE controls whether the
2818 code is actually emitted. */
2820 static int
2821 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2823 int insns = 0;
2825 if (aarch64_bitmask_imm (val, DImode))
2827 if (generate)
2828 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2829 insns = 1;
2831 else
2833 int i;
2834 int ncount = 0;
2835 int zcount = 0;
2836 HOST_WIDE_INT valp = val >> 16;
2837 HOST_WIDE_INT valm;
2838 HOST_WIDE_INT tval;
2840 for (i = 16; i < 64; i += 16)
2842 valm = (valp & 0xffff);
2844 if (valm != 0)
2845 ++ zcount;
2847 if (valm != 0xffff)
2848 ++ ncount;
2850 valp >>= 16;
2853 /* zcount contains the number of additional MOVK instructions
2854 required if the constant is built up with an initial MOVZ instruction,
2855 while ncount is the number of MOVK instructions required if starting
2856 with a MOVN instruction. Choose the sequence that yields the fewer
2857 instructions, preferring MOVZ when the two counts are equal. */
2859 if (ncount < zcount)
2861 if (generate)
2862 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2863 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2864 tval = 0xffff;
2865 insns++;
2867 else
2869 if (generate)
2870 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2871 GEN_INT (val & 0xffff));
2872 tval = 0;
2873 insns++;
2876 val >>= 16;
2878 for (i = 16; i < 64; i += 16)
2880 if ((val & 0xffff) != tval)
2882 if (generate)
2883 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2884 GEN_INT (i),
2885 GEN_INT (val & 0xffff)));
2886 insns++;
2888 val >>= 16;
2891 return insns;
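/* The counting logic above can be hard to follow out of context.  The
   following standalone sketch (hypothetical helper, not used by the
   backend; plain unsigned long long instead of HOST_WIDE_INT) computes
   the same MOVZ-versus-MOVN instruction count estimate for a 64-bit
   constant, ignoring the bitmask-immediate shortcut.  */

static int
example_movz_movn_insn_count (unsigned long long val)
{
  int zcount = 0;  /* MOVKs needed after an initial MOVZ.  */
  int ncount = 0;  /* MOVKs needed after an initial MOVN.  */
  int i;

  /* The first instruction always materializes the low 16 bits, so only
     the upper three 16-bit chunks can require an extra MOVK.  */
  for (i = 16; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;
      if (chunk != 0)
	zcount++;  /* Differs from the all-zeros fill of MOVZ.  */
      if (chunk != 0xffff)
	ncount++;  /* Differs from the all-ones fill of MOVN.  */
    }

  /* One MOVZ or MOVN, plus the cheaper count of MOVKs, preferring the
     MOVZ sequence on a tie, as above.  */
  return 1 + (ncount < zcount ? ncount : zcount);
}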
2894 static void
2895 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2897 HOST_WIDE_INT mdelta = delta;
2898 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2899 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2901 if (mdelta < 0)
2902 mdelta = -mdelta;
2904 if (mdelta >= 4096 * 4096)
2906 (void) aarch64_build_constant (scratchreg, delta, true);
2907 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2909 else if (mdelta > 0)
2911 if (mdelta >= 4096)
2913 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2914 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2915 if (delta < 0)
2916 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2917 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2918 else
2919 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2920 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2922 if (mdelta % 4096 != 0)
2924 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2925 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2926 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
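/* For example (delta assumed), adding 20000 to x0 via this routine
   splits the constant as 20000 = 4 * 4096 + 3616 and is emitted
   roughly as mov scratch, #4; add x0, x0, scratch, lsl #12;
   add x0, x0, #3616, while a delta of 4096 * 4096 or more is first
   built into the scratch register with aarch64_build_constant.  */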
2931 /* Output code to add DELTA to the first argument, and then jump
2932 to FUNCTION. Used for C++ multiple inheritance. */
2933 static void
2934 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2935 HOST_WIDE_INT delta,
2936 HOST_WIDE_INT vcall_offset,
2937 tree function)
2939 /* The this pointer is always in x0. Note that this differs from
2940 ARM, where the this pointer may be bumped to r1 if r0 is required
2941 to return a pointer to an aggregate. On AArch64 a result value
2942 pointer will be in x8. */
2943 int this_regno = R0_REGNUM;
2944 rtx this_rtx, temp0, temp1, addr, funexp;
2945 rtx_insn *insn;
2947 reload_completed = 1;
2948 emit_note (NOTE_INSN_PROLOGUE_END);
2950 if (vcall_offset == 0)
2951 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2952 else
2954 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2956 this_rtx = gen_rtx_REG (Pmode, this_regno);
2957 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2958 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2960 addr = this_rtx;
2961 if (delta != 0)
2963 if (delta >= -256 && delta < 256)
2964 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2965 plus_constant (Pmode, this_rtx, delta));
2966 else
2967 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2970 if (Pmode == ptr_mode)
2971 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2972 else
2973 aarch64_emit_move (temp0,
2974 gen_rtx_ZERO_EXTEND (Pmode,
2975 gen_rtx_MEM (ptr_mode, addr)));
2977 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2978 addr = plus_constant (Pmode, temp0, vcall_offset);
2979 else
2981 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2982 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2985 if (Pmode == ptr_mode)
2986 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2987 else
2988 aarch64_emit_move (temp1,
2989 gen_rtx_SIGN_EXTEND (Pmode,
2990 gen_rtx_MEM (ptr_mode, addr)));
2992 emit_insn (gen_add2_insn (this_rtx, temp1));
2995 /* Generate a tail call to the target function. */
2996 if (!TREE_USED (function))
2998 assemble_external (function);
2999 TREE_USED (function) = 1;
3001 funexp = XEXP (DECL_RTL (function), 0);
3002 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3003 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3004 SIBLING_CALL_P (insn) = 1;
3006 insn = get_insns ();
3007 shorten_branches (insn);
3008 final_start_function (insn, file, 1);
3009 final (insn, file, 1);
3010 final_end_function ();
3012 /* Stop pretending to be a post-reload pass. */
3013 reload_completed = 0;
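/* For instance (values assumed), a thunk with delta == 8 and
   vcall_offset == 0 comes out as the two-instruction sequence
   add x0, x0, #8; b <function>: the this pointer is bumped in place
   and the jump to FUNCTION is emitted as a sibling call.  */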
3016 static bool
3017 aarch64_tls_referenced_p (rtx x)
3019 if (!TARGET_HAVE_TLS)
3020 return false;
3021 subrtx_iterator::array_type array;
3022 FOR_EACH_SUBRTX (iter, array, x, ALL)
3024 const_rtx x = *iter;
3025 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3026 return true;
3027 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3028 TLS offsets, not real symbol references. */
3029 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3030 iter.skip_subrtxes ();
3032 return false;
3036 static int
3037 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3039 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3040 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3042 if (*imm1 < *imm2)
3043 return -1;
3044 if (*imm1 > *imm2)
3045 return +1;
3046 return 0;
3050 static void
3051 aarch64_build_bitmask_table (void)
3053 unsigned HOST_WIDE_INT mask, imm;
3054 unsigned int log_e, e, s, r;
3055 unsigned int nimms = 0;
3057 for (log_e = 1; log_e <= 6; log_e++)
3059 e = 1 << log_e;
3060 if (e == 64)
3061 mask = ~(HOST_WIDE_INT) 0;
3062 else
3063 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3064 for (s = 1; s < e; s++)
3066 for (r = 0; r < e; r++)
3068 /* Set S consecutive bits to 1 (S < 64). */
3069 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3070 /* Rotate right by R. */
3071 if (r != 0)
3072 imm = ((imm >> r) | (imm << (e - r))) & mask;
3073 /* Replicate the constant depending on SIMD size. */
3074 switch (log_e) {
3075 case 1: imm |= (imm << 2);
3076 case 2: imm |= (imm << 4);
3077 case 3: imm |= (imm << 8);
3078 case 4: imm |= (imm << 16);
3079 case 5: imm |= (imm << 32);
3080 case 6:
3081 break;
3082 default:
3083 gcc_unreachable ();
3085 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3086 aarch64_bitmasks[nimms++] = imm;
3091 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3092 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3093 aarch64_bitmasks_cmp);
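/* To make the encoding rule above concrete: a value is a valid bitmask
   immediate iff it is the replication, across the whole register, of an
   element of size e in {2, 4, 8, 16, 32, 64} bits that is a rotated run
   of s consecutive ones with 1 <= s < e.  The following standalone
   sketch (hypothetical helper, not used by the backend) re-states that
   rule directly by brute force rather than via the sorted table.  */

static int
example_bitmask_imm_p (unsigned long long val)
{
  unsigned int e, s, r, i;

  for (e = 2; e <= 64; e *= 2)
    {
      unsigned long long mask
	= (e == 64) ? ~0ULL : (1ULL << e) - 1;

      for (s = 1; s < e; s++)
	for (r = 0; r < e; r++)
	  {
	    /* s consecutive ones, rotated right by r within the
	       element, then replicated across 64 bits.  */
	    unsigned long long elt = (1ULL << s) - 1;
	    unsigned long long rep = 0;

	    if (r != 0)
	      elt = ((elt >> r) | (elt << (e - r))) & mask;
	    for (i = 0; i < 64; i += e)
	      rep |= elt << i;

	    if (rep == val)
	      return 1;
	  }
    }
  return 0;
}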
3097 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3098 a left shift of 0 or 12 bits. */
3099 bool
3100 aarch64_uimm12_shift (HOST_WIDE_INT val)
3102 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3103 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3108 /* Return true if val is an immediate that can be loaded into a
3109 register by a MOVZ instruction. */
3110 static bool
3111 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3113 if (GET_MODE_SIZE (mode) > 4)
3115 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3116 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3117 return 1;
3119 else
3121 /* Ignore sign extension. */
3122 val &= (HOST_WIDE_INT) 0xffffffff;
3124 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3125 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
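/* In other words, a value is MOVZ-able when all of its set bits fall in
   a single 16-bit chunk aligned on a 16-bit boundary.  A standalone
   sketch of the 64-bit case (hypothetical helper, not used by the
   backend):  */

static int
example_single_movz_chunk_p (unsigned long long val)
{
  int shift;

  /* Accept values whose bits outside one aligned 16-bit field are all
     zero; MOVN-able values are handled by applying the same test to
     ~val, as aarch64_move_imm does below.  */
  for (shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffULL << shift)) == 0)
      return 1;
  return 0;
}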
3129 /* Return true if val is a valid bitmask immediate. */
3130 bool
3131 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3133 if (GET_MODE_SIZE (mode) < 8)
3135 /* Replicate bit pattern. */
3136 val &= (HOST_WIDE_INT) 0xffffffff;
3137 val |= val << 32;
3139 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3140 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3144 /* Return true if val is an immediate that can be loaded into a
3145 register in a single instruction. */
3146 bool
3147 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3149 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3150 return 1;
3151 return aarch64_bitmask_imm (val, mode);
3154 static bool
3155 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3157 rtx base, offset;
3159 if (GET_CODE (x) == HIGH)
3160 return true;
3162 split_const (x, &base, &offset);
3163 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3165 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3166 != SYMBOL_FORCE_TO_MEM)
3167 return true;
3168 else
3169 /* Avoid generating a 64-bit relocation in ILP32; leave it
3170 to aarch64_expand_mov_immediate to handle properly. */
3171 return mode != ptr_mode;
3174 return aarch64_tls_referenced_p (x);
3177 /* Return true if register REGNO is a valid index register.
3178 STRICT_P is true if REG_OK_STRICT is in effect. */
3180 bool
3181 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3183 if (!HARD_REGISTER_NUM_P (regno))
3185 if (!strict_p)
3186 return true;
3188 if (!reg_renumber)
3189 return false;
3191 regno = reg_renumber[regno];
3193 return GP_REGNUM_P (regno);
3196 /* Return true if register REGNO is a valid base register for mode MODE.
3197 STRICT_P is true if REG_OK_STRICT is in effect. */
3199 bool
3200 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3202 if (!HARD_REGISTER_NUM_P (regno))
3204 if (!strict_p)
3205 return true;
3207 if (!reg_renumber)
3208 return false;
3210 regno = reg_renumber[regno];
3213 /* The fake registers will be eliminated to either the stack or
3214 hard frame pointer, both of which are usually valid base registers.
3215 Reload deals with the cases where the eliminated form isn't valid. */
3216 return (GP_REGNUM_P (regno)
3217 || regno == SP_REGNUM
3218 || regno == FRAME_POINTER_REGNUM
3219 || regno == ARG_POINTER_REGNUM);
3222 /* Return true if X is a valid base register for mode MODE.
3223 STRICT_P is true if REG_OK_STRICT is in effect. */
3225 static bool
3226 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3228 if (!strict_p && GET_CODE (x) == SUBREG)
3229 x = SUBREG_REG (x);
3231 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3234 /* Return true if address offset is a valid index. If it is, fill in INFO
3235 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3237 static bool
3238 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3239 machine_mode mode, bool strict_p)
3241 enum aarch64_address_type type;
3242 rtx index;
3243 int shift;
3245 /* (reg:P) */
3246 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3247 && GET_MODE (x) == Pmode)
3249 type = ADDRESS_REG_REG;
3250 index = x;
3251 shift = 0;
3253 /* (sign_extend:DI (reg:SI)) */
3254 else if ((GET_CODE (x) == SIGN_EXTEND
3255 || GET_CODE (x) == ZERO_EXTEND)
3256 && GET_MODE (x) == DImode
3257 && GET_MODE (XEXP (x, 0)) == SImode)
3259 type = (GET_CODE (x) == SIGN_EXTEND)
3260 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3261 index = XEXP (x, 0);
3262 shift = 0;
3264 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3265 else if (GET_CODE (x) == MULT
3266 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3267 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3268 && GET_MODE (XEXP (x, 0)) == DImode
3269 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3270 && CONST_INT_P (XEXP (x, 1)))
3272 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3273 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3274 index = XEXP (XEXP (x, 0), 0);
3275 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3277 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3278 else if (GET_CODE (x) == ASHIFT
3279 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3280 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3281 && GET_MODE (XEXP (x, 0)) == DImode
3282 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3283 && CONST_INT_P (XEXP (x, 1)))
3285 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3286 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3287 index = XEXP (XEXP (x, 0), 0);
3288 shift = INTVAL (XEXP (x, 1));
3290 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3291 else if ((GET_CODE (x) == SIGN_EXTRACT
3292 || GET_CODE (x) == ZERO_EXTRACT)
3293 && GET_MODE (x) == DImode
3294 && GET_CODE (XEXP (x, 0)) == MULT
3295 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3296 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3298 type = (GET_CODE (x) == SIGN_EXTRACT)
3299 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3300 index = XEXP (XEXP (x, 0), 0);
3301 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3302 if (INTVAL (XEXP (x, 1)) != 32 + shift
3303 || INTVAL (XEXP (x, 2)) != 0)
3304 shift = -1;
3306 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3307 (const_int 0xffffffff<<shift)) */
3308 else if (GET_CODE (x) == AND
3309 && GET_MODE (x) == DImode
3310 && GET_CODE (XEXP (x, 0)) == MULT
3311 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3312 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3313 && CONST_INT_P (XEXP (x, 1)))
3315 type = ADDRESS_REG_UXTW;
3316 index = XEXP (XEXP (x, 0), 0);
3317 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3318 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3319 shift = -1;
3321 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3322 else if ((GET_CODE (x) == SIGN_EXTRACT
3323 || GET_CODE (x) == ZERO_EXTRACT)
3324 && GET_MODE (x) == DImode
3325 && GET_CODE (XEXP (x, 0)) == ASHIFT
3326 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3327 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3329 type = (GET_CODE (x) == SIGN_EXTRACT)
3330 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3331 index = XEXP (XEXP (x, 0), 0);
3332 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3333 if (INTVAL (XEXP (x, 1)) != 32 + shift
3334 || INTVAL (XEXP (x, 2)) != 0)
3335 shift = -1;
3337 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3338 (const_int 0xffffffff<<shift)) */
3339 else if (GET_CODE (x) == AND
3340 && GET_MODE (x) == DImode
3341 && GET_CODE (XEXP (x, 0)) == ASHIFT
3342 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3343 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3344 && CONST_INT_P (XEXP (x, 1)))
3346 type = ADDRESS_REG_UXTW;
3347 index = XEXP (XEXP (x, 0), 0);
3348 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3349 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3350 shift = -1;
3352 /* (mult:P (reg:P) (const_int scale)) */
3353 else if (GET_CODE (x) == MULT
3354 && GET_MODE (x) == Pmode
3355 && GET_MODE (XEXP (x, 0)) == Pmode
3356 && CONST_INT_P (XEXP (x, 1)))
3358 type = ADDRESS_REG_REG;
3359 index = XEXP (x, 0);
3360 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3362 /* (ashift:P (reg:P) (const_int shift)) */
3363 else if (GET_CODE (x) == ASHIFT
3364 && GET_MODE (x) == Pmode
3365 && GET_MODE (XEXP (x, 0)) == Pmode
3366 && CONST_INT_P (XEXP (x, 1)))
3368 type = ADDRESS_REG_REG;
3369 index = XEXP (x, 0);
3370 shift = INTVAL (XEXP (x, 1));
3372 else
3373 return false;
3375 if (GET_CODE (index) == SUBREG)
3376 index = SUBREG_REG (index);
3378 if ((shift == 0 ||
3379 (shift > 0 && shift <= 3
3380 && (1 << shift) == GET_MODE_SIZE (mode)))
3381 && REG_P (index)
3382 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3384 info->type = type;
3385 info->offset = index;
3386 info->shift = shift;
3387 return true;
3390 return false;
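/* By way of example (register numbers assumed), the forms accepted
   above correspond to addresses such as [x0, x1] (ADDRESS_REG_REG,
   shift 0), [x0, x1, lsl #3] for an 8-byte access (ADDRESS_REG_REG,
   shift 3), [x0, w1, sxtw #2] for a 4-byte access (ADDRESS_REG_SXTW)
   and [x0, w1, uxtw] (ADDRESS_REG_UXTW); a non-zero shift is only
   accepted when 1 << shift equals the access size.  */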
3393 bool
3394 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3396 return (offset >= -64 * GET_MODE_SIZE (mode)
3397 && offset < 64 * GET_MODE_SIZE (mode)
3398 && offset % GET_MODE_SIZE (mode) == 0);
3401 static inline bool
3402 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3403 HOST_WIDE_INT offset)
3405 return offset >= -256 && offset < 256;
3408 static inline bool
3409 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3411 return (offset >= 0
3412 && offset < 4096 * GET_MODE_SIZE (mode)
3413 && offset % GET_MODE_SIZE (mode) == 0);
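/* For an 8-byte access (DImode, size assumed) these three predicates
   accept, respectively: multiples of 8 in [-512, 504] (the ldp/stp
   form), any offset in [-256, 255] (the unscaled ldur/stur form), and
   multiples of 8 in [0, 32760] (the scaled unsigned ldr/str form).  */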
3416 /* Return true if X is a valid address for machine mode MODE. If it is,
3417 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3418 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3420 static bool
3421 aarch64_classify_address (struct aarch64_address_info *info,
3422 rtx x, machine_mode mode,
3423 RTX_CODE outer_code, bool strict_p)
3425 enum rtx_code code = GET_CODE (x);
3426 rtx op0, op1;
3428 /* On BE, we use load/store pair for all large int mode load/stores. */
3429 bool load_store_pair_p = (outer_code == PARALLEL
3430 || (BYTES_BIG_ENDIAN
3431 && aarch64_vect_struct_mode_p (mode)));
3433 bool allow_reg_index_p =
3434 !load_store_pair_p
3435 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3436 && !aarch64_vect_struct_mode_p (mode);
3438 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3439 REG addressing. */
3440 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3441 && (code != POST_INC && code != REG))
3442 return false;
3444 switch (code)
3446 case REG:
3447 case SUBREG:
3448 info->type = ADDRESS_REG_IMM;
3449 info->base = x;
3450 info->offset = const0_rtx;
3451 return aarch64_base_register_rtx_p (x, strict_p);
3453 case PLUS:
3454 op0 = XEXP (x, 0);
3455 op1 = XEXP (x, 1);
3457 if (! strict_p
3458 && REG_P (op0)
3459 && (op0 == virtual_stack_vars_rtx
3460 || op0 == frame_pointer_rtx
3461 || op0 == arg_pointer_rtx)
3462 && CONST_INT_P (op1))
3464 info->type = ADDRESS_REG_IMM;
3465 info->base = op0;
3466 info->offset = op1;
3468 return true;
3471 if (GET_MODE_SIZE (mode) != 0
3472 && CONST_INT_P (op1)
3473 && aarch64_base_register_rtx_p (op0, strict_p))
3475 HOST_WIDE_INT offset = INTVAL (op1);
3477 info->type = ADDRESS_REG_IMM;
3478 info->base = op0;
3479 info->offset = op1;
3481 /* TImode and TFmode values are allowed in both pairs of X
3482 registers and individual Q registers. The available
3483 address modes are:
3484 X,X: 7-bit signed scaled offset
3485 Q: 9-bit signed offset
3486 We conservatively require an offset representable in both modes.
3488 if (mode == TImode || mode == TFmode)
3489 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3490 && offset_9bit_signed_unscaled_p (mode, offset));
3492 /* A 7-bit offset check because OImode will be emitted as an ldp/stp
3493 instruction (only big endian will get here).
3494 For ldp/stp instructions, the offset is scaled by the size of a
3495 single element of the pair. */
3496 if (mode == OImode)
3497 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3499 /* Three 9/12-bit offset checks because CImode will be emitted as three
3500 ldr/str instructions (only big endian will get here). */
3501 if (mode == CImode)
3502 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3503 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3504 || offset_12bit_unsigned_scaled_p (V16QImode,
3505 offset + 32)));
3507 /* Two 7-bit offset checks because XImode will be emitted as two ldp/stp
3508 instructions (only big endian will get here). */
3509 if (mode == XImode)
3510 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3511 && aarch64_offset_7bit_signed_scaled_p (TImode,
3512 offset + 32));
3514 if (load_store_pair_p)
3515 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3516 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3517 else
3518 return (offset_9bit_signed_unscaled_p (mode, offset)
3519 || offset_12bit_unsigned_scaled_p (mode, offset));
3522 if (allow_reg_index_p)
3524 /* Look for base + (scaled/extended) index register. */
3525 if (aarch64_base_register_rtx_p (op0, strict_p)
3526 && aarch64_classify_index (info, op1, mode, strict_p))
3528 info->base = op0;
3529 return true;
3531 if (aarch64_base_register_rtx_p (op1, strict_p)
3532 && aarch64_classify_index (info, op0, mode, strict_p))
3534 info->base = op1;
3535 return true;
3539 return false;
3541 case POST_INC:
3542 case POST_DEC:
3543 case PRE_INC:
3544 case PRE_DEC:
3545 info->type = ADDRESS_REG_WB;
3546 info->base = XEXP (x, 0);
3547 info->offset = NULL_RTX;
3548 return aarch64_base_register_rtx_p (info->base, strict_p);
3550 case POST_MODIFY:
3551 case PRE_MODIFY:
3552 info->type = ADDRESS_REG_WB;
3553 info->base = XEXP (x, 0);
3554 if (GET_CODE (XEXP (x, 1)) == PLUS
3555 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3556 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3557 && aarch64_base_register_rtx_p (info->base, strict_p))
3559 HOST_WIDE_INT offset;
3560 info->offset = XEXP (XEXP (x, 1), 1);
3561 offset = INTVAL (info->offset);
3563 /* TImode and TFmode values are allowed in both pairs of X
3564 registers and individual Q registers. The available
3565 address modes are:
3566 X,X: 7-bit signed scaled offset
3567 Q: 9-bit signed offset
3568 We conservatively require an offset representable in both modes.
3570 if (mode == TImode || mode == TFmode)
3571 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3572 && offset_9bit_signed_unscaled_p (mode, offset));
3574 if (load_store_pair_p)
3575 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3576 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3577 else
3578 return offset_9bit_signed_unscaled_p (mode, offset);
3580 return false;
3582 case CONST:
3583 case SYMBOL_REF:
3584 case LABEL_REF:
3585 /* load literal: pc-relative constant pool entry. Only supported
3586 for SI mode or larger. */
3587 info->type = ADDRESS_SYMBOLIC;
3589 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3591 rtx sym, addend;
3593 split_const (x, &sym, &addend);
3594 return (GET_CODE (sym) == LABEL_REF
3595 || (GET_CODE (sym) == SYMBOL_REF
3596 && CONSTANT_POOL_ADDRESS_P (sym)));
3598 return false;
3600 case LO_SUM:
3601 info->type = ADDRESS_LO_SUM;
3602 info->base = XEXP (x, 0);
3603 info->offset = XEXP (x, 1);
3604 if (allow_reg_index_p
3605 && aarch64_base_register_rtx_p (info->base, strict_p))
3607 rtx sym, offs;
3608 split_const (info->offset, &sym, &offs);
3609 if (GET_CODE (sym) == SYMBOL_REF
3610 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3611 == SYMBOL_SMALL_ABSOLUTE))
3613 /* The symbol and offset must be aligned to the access size. */
3614 unsigned int align;
3615 unsigned int ref_size;
3617 if (CONSTANT_POOL_ADDRESS_P (sym))
3618 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3619 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3621 tree exp = SYMBOL_REF_DECL (sym);
3622 align = TYPE_ALIGN (TREE_TYPE (exp));
3623 align = CONSTANT_ALIGNMENT (exp, align);
3625 else if (SYMBOL_REF_DECL (sym))
3626 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3627 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3628 && SYMBOL_REF_BLOCK (sym) != NULL)
3629 align = SYMBOL_REF_BLOCK (sym)->alignment;
3630 else
3631 align = BITS_PER_UNIT;
3633 ref_size = GET_MODE_SIZE (mode);
3634 if (ref_size == 0)
3635 ref_size = GET_MODE_SIZE (DImode);
3637 return ((INTVAL (offs) & (ref_size - 1)) == 0
3638 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3641 return false;
3643 default:
3644 return false;
3648 bool
3649 aarch64_symbolic_address_p (rtx x)
3651 rtx offset;
3653 split_const (x, &x, &offset);
3654 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3657 /* Classify the base of symbolic expression X, given that X appears in
3658 context CONTEXT. */
3660 enum aarch64_symbol_type
3661 aarch64_classify_symbolic_expression (rtx x,
3662 enum aarch64_symbol_context context)
3664 rtx offset;
3666 split_const (x, &x, &offset);
3667 return aarch64_classify_symbol (x, offset, context);
3671 /* Return TRUE if X is a legitimate address for accessing memory in
3672 mode MODE. */
3673 static bool
3674 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3676 struct aarch64_address_info addr;
3678 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3681 /* Return TRUE if X is a legitimate address for accessing memory in
3682 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3683 pair operation. */
3684 bool
3685 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3686 RTX_CODE outer_code, bool strict_p)
3688 struct aarch64_address_info addr;
3690 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3693 /* Return TRUE if rtx X is the immediate constant 0.0. */
3694 bool
3695 aarch64_float_const_zero_rtx_p (rtx x)
3697 REAL_VALUE_TYPE r;
3699 if (GET_MODE (x) == VOIDmode)
3700 return false;
3702 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3703 if (REAL_VALUE_MINUS_ZERO (r))
3704 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3705 return REAL_VALUES_EQUAL (r, dconst0);
3708 /* Return the fixed registers used for condition codes. */
3710 static bool
3711 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3713 *p1 = CC_REGNUM;
3714 *p2 = INVALID_REGNUM;
3715 return true;
3718 /* Emit call insn with PAT and do aarch64-specific handling. */
3720 void
3721 aarch64_emit_call_insn (rtx pat)
3723 rtx insn = emit_call_insn (pat);
3725 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3726 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3727 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3730 machine_mode
3731 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3733 /* Floating-point compares that need not signal on a quiet NaN (equality
3734 and the unordered forms) return CCFPmode; LT, LE, GT and GE return CCFPEmode. */
3735 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3737 switch (code)
3739 case EQ:
3740 case NE:
3741 case UNORDERED:
3742 case ORDERED:
3743 case UNLT:
3744 case UNLE:
3745 case UNGT:
3746 case UNGE:
3747 case UNEQ:
3748 case LTGT:
3749 return CCFPmode;
3751 case LT:
3752 case LE:
3753 case GT:
3754 case GE:
3755 return CCFPEmode;
3757 default:
3758 gcc_unreachable ();
3762 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3763 && y == const0_rtx
3764 && (code == EQ || code == NE || code == LT || code == GE)
3765 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3766 || GET_CODE (x) == NEG))
3767 return CC_NZmode;
3769 /* A compare with a shifted operand. Because of canonicalization,
3770 the comparison will have to be swapped when we emit the assembly
3771 code. */
3772 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3773 && (REG_P (y) || GET_CODE (y) == SUBREG)
3774 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3775 || GET_CODE (x) == LSHIFTRT
3776 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3777 return CC_SWPmode;
3779 /* Similarly for a negated operand, but we can only do this for
3780 equalities. */
3781 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3782 && (REG_P (y) || GET_CODE (y) == SUBREG)
3783 && (code == EQ || code == NE)
3784 && GET_CODE (x) == NEG)
3785 return CC_Zmode;
3787 /* A compare of a mode narrower than SI mode against zero can be done
3788 by extending the value in the comparison. */
3789 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3790 && y == const0_rtx)
3791 /* Only use sign-extension if we really need it. */
3792 return ((code == GT || code == GE || code == LE || code == LT)
3793 ? CC_SESWPmode : CC_ZESWPmode);
3795 /* For everything else, return CCmode. */
3796 return CCmode;
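/* As an example of the swapped case (operands assumed): for a
   comparison such as (gt (ashift:DI x (const_int 3)) y), the shifted
   operand must go into the second position of the hardware compare, so
   the comparison is emitted with the operands swapped (cmp y, x, lsl 3)
   under CC_SWPmode, and the GT test is evaluated as LT, exactly as
   mapped in aarch64_get_condition_code_1 below.  */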
3799 static int
3800 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3803 aarch64_get_condition_code (rtx x)
3805 machine_mode mode = GET_MODE (XEXP (x, 0));
3806 enum rtx_code comp_code = GET_CODE (x);
3808 if (GET_MODE_CLASS (mode) != MODE_CC)
3809 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3810 return aarch64_get_condition_code_1 (mode, comp_code);
3813 static int
3814 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3816 int ne = -1, eq = -1;
3817 switch (mode)
3819 case CCFPmode:
3820 case CCFPEmode:
3821 switch (comp_code)
3823 case GE: return AARCH64_GE;
3824 case GT: return AARCH64_GT;
3825 case LE: return AARCH64_LS;
3826 case LT: return AARCH64_MI;
3827 case NE: return AARCH64_NE;
3828 case EQ: return AARCH64_EQ;
3829 case ORDERED: return AARCH64_VC;
3830 case UNORDERED: return AARCH64_VS;
3831 case UNLT: return AARCH64_LT;
3832 case UNLE: return AARCH64_LE;
3833 case UNGT: return AARCH64_HI;
3834 case UNGE: return AARCH64_PL;
3835 default: return -1;
3837 break;
3839 case CC_DNEmode:
3840 ne = AARCH64_NE;
3841 eq = AARCH64_EQ;
3842 break;
3844 case CC_DEQmode:
3845 ne = AARCH64_EQ;
3846 eq = AARCH64_NE;
3847 break;
3849 case CC_DGEmode:
3850 ne = AARCH64_GE;
3851 eq = AARCH64_LT;
3852 break;
3854 case CC_DLTmode:
3855 ne = AARCH64_LT;
3856 eq = AARCH64_GE;
3857 break;
3859 case CC_DGTmode:
3860 ne = AARCH64_GT;
3861 eq = AARCH64_LE;
3862 break;
3864 case CC_DLEmode:
3865 ne = AARCH64_LE;
3866 eq = AARCH64_GT;
3867 break;
3869 case CC_DGEUmode:
3870 ne = AARCH64_CS;
3871 eq = AARCH64_CC;
3872 break;
3874 case CC_DLTUmode:
3875 ne = AARCH64_CC;
3876 eq = AARCH64_CS;
3877 break;
3879 case CC_DGTUmode:
3880 ne = AARCH64_HI;
3881 eq = AARCH64_LS;
3882 break;
3884 case CC_DLEUmode:
3885 ne = AARCH64_LS;
3886 eq = AARCH64_HI;
3887 break;
3889 case CCmode:
3890 switch (comp_code)
3892 case NE: return AARCH64_NE;
3893 case EQ: return AARCH64_EQ;
3894 case GE: return AARCH64_GE;
3895 case GT: return AARCH64_GT;
3896 case LE: return AARCH64_LE;
3897 case LT: return AARCH64_LT;
3898 case GEU: return AARCH64_CS;
3899 case GTU: return AARCH64_HI;
3900 case LEU: return AARCH64_LS;
3901 case LTU: return AARCH64_CC;
3902 default: return -1;
3904 break;
3906 case CC_SWPmode:
3907 case CC_ZESWPmode:
3908 case CC_SESWPmode:
3909 switch (comp_code)
3911 case NE: return AARCH64_NE;
3912 case EQ: return AARCH64_EQ;
3913 case GE: return AARCH64_LE;
3914 case GT: return AARCH64_LT;
3915 case LE: return AARCH64_GE;
3916 case LT: return AARCH64_GT;
3917 case GEU: return AARCH64_LS;
3918 case GTU: return AARCH64_CC;
3919 case LEU: return AARCH64_CS;
3920 case LTU: return AARCH64_HI;
3921 default: return -1;
3923 break;
3925 case CC_NZmode:
3926 switch (comp_code)
3928 case NE: return AARCH64_NE;
3929 case EQ: return AARCH64_EQ;
3930 case GE: return AARCH64_PL;
3931 case LT: return AARCH64_MI;
3932 default: return -1;
3934 break;
3936 case CC_Zmode:
3937 switch (comp_code)
3939 case NE: return AARCH64_NE;
3940 case EQ: return AARCH64_EQ;
3941 default: return -1;
3943 break;
3945 default:
3946 return -1;
3947 break;
3950 if (comp_code == NE)
3951 return ne;
3953 if (comp_code == EQ)
3954 return eq;
3956 return -1;
3959 bool
3960 aarch64_const_vec_all_same_in_range_p (rtx x,
3961 HOST_WIDE_INT minval,
3962 HOST_WIDE_INT maxval)
3964 HOST_WIDE_INT firstval;
3965 int count, i;
3967 if (GET_CODE (x) != CONST_VECTOR
3968 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3969 return false;
3971 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3972 if (firstval < minval || firstval > maxval)
3973 return false;
3975 count = CONST_VECTOR_NUNITS (x);
3976 for (i = 1; i < count; i++)
3977 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3978 return false;
3980 return true;
3983 bool
3984 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3986 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3989 static unsigned
3990 bit_count (unsigned HOST_WIDE_INT value)
3992 unsigned count = 0;
3994 while (value)
3996 count++;
3997 value &= value - 1;
4000 return count;
4003 /* N Z C V. */
4004 #define AARCH64_CC_V 1
4005 #define AARCH64_CC_C (1 << 1)
4006 #define AARCH64_CC_Z (1 << 2)
4007 #define AARCH64_CC_N (1 << 3)
4009 /* N Z C V flags for ccmp. The first value is used when the combining
4010 operation is AND; the second when it is IOR. Indexed by AARCH64_COND_CODE. */
4011 static const int aarch64_nzcv_codes[][2] =
4013 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4014 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4015 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4016 {0, AARCH64_CC_C}, /* CC, C == 0. */
4017 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4018 {0, AARCH64_CC_N}, /* PL, N == 0. */
4019 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4020 {0, AARCH64_CC_V}, /* VC, V == 0. */
4021 {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0. */
4022 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4023 {0, AARCH64_CC_V}, /* GE, N == V. */
4024 {AARCH64_CC_V, 0}, /* LT, N != V. */
4025 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4026 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4027 {0, 0}, /* AL, Any. */
4028 {0, 0}, /* NV, Any. */
4032 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4034 switch (mode)
4036 case CC_DNEmode:
4037 return NE;
4039 case CC_DEQmode:
4040 return EQ;
4042 case CC_DLEmode:
4043 return LE;
4045 case CC_DGTmode:
4046 return GT;
4048 case CC_DLTmode:
4049 return LT;
4051 case CC_DGEmode:
4052 return GE;
4054 case CC_DLEUmode:
4055 return LEU;
4057 case CC_DGTUmode:
4058 return GTU;
4060 case CC_DLTUmode:
4061 return LTU;
4063 case CC_DGEUmode:
4064 return GEU;
4066 default:
4067 gcc_unreachable ();
4072 void
4073 aarch64_print_operand (FILE *f, rtx x, char code)
4075 switch (code)
4077 /* An integer or symbol address without a preceding # sign. */
4078 case 'c':
4079 switch (GET_CODE (x))
4081 case CONST_INT:
4082 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4083 break;
4085 case SYMBOL_REF:
4086 output_addr_const (f, x);
4087 break;
4089 case CONST:
4090 if (GET_CODE (XEXP (x, 0)) == PLUS
4091 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4093 output_addr_const (f, x);
4094 break;
4096 /* Fall through. */
4098 default:
4099 output_operand_lossage ("Unsupported operand for code '%c'", code);
4101 break;
4103 case 'e':
4104 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4106 int n;
4108 if (!CONST_INT_P (x)
4109 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4111 output_operand_lossage ("invalid operand for '%%%c'", code);
4112 return;
4115 switch (n)
4117 case 3:
4118 fputc ('b', f);
4119 break;
4120 case 4:
4121 fputc ('h', f);
4122 break;
4123 case 5:
4124 fputc ('w', f);
4125 break;
4126 default:
4127 output_operand_lossage ("invalid operand for '%%%c'", code);
4128 return;
4131 break;
4133 case 'p':
4135 int n;
4137 /* Print N such that 2^N == X. */
4138 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4140 output_operand_lossage ("invalid operand for '%%%c'", code);
4141 return;
4144 asm_fprintf (f, "%d", n);
4146 break;
4148 case 'P':
4149 /* Print the number of non-zero bits in X (a const_int). */
4150 if (!CONST_INT_P (x))
4152 output_operand_lossage ("invalid operand for '%%%c'", code);
4153 return;
4156 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4157 break;
4159 case 'H':
4160 /* Print the higher numbered register of a pair (TImode) of regs. */
4161 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4163 output_operand_lossage ("invalid operand for '%%%c'", code);
4164 return;
4167 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4168 break;
4170 case 'm':
4172 int cond_code;
4173 /* Print a condition (eq, ne, etc). */
4175 /* CONST_TRUE_RTX means always -- that's the default. */
4176 if (x == const_true_rtx)
4177 return;
4179 if (!COMPARISON_P (x))
4181 output_operand_lossage ("invalid operand for '%%%c'", code);
4182 return;
4185 cond_code = aarch64_get_condition_code (x);
4186 gcc_assert (cond_code >= 0);
4187 fputs (aarch64_condition_codes[cond_code], f);
4189 break;
4191 case 'M':
4193 int cond_code;
4194 /* Print the inverse of a condition (eq <-> ne, etc). */
4196 /* CONST_TRUE_RTX means never -- that's the default. */
4197 if (x == const_true_rtx)
4199 fputs ("nv", f);
4200 return;
4203 if (!COMPARISON_P (x))
4205 output_operand_lossage ("invalid operand for '%%%c'", code);
4206 return;
4208 cond_code = aarch64_get_condition_code (x);
4209 gcc_assert (cond_code >= 0);
4210 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4211 (cond_code)], f);
4213 break;
4215 case 'b':
4216 case 'h':
4217 case 's':
4218 case 'd':
4219 case 'q':
4220 /* Print a scalar FP/SIMD register name. */
4221 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4223 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4224 return;
4226 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4227 break;
4229 case 'S':
4230 case 'T':
4231 case 'U':
4232 case 'V':
4233 /* Print the first FP/SIMD register name in a list. */
4234 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4236 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4237 return;
4239 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4240 break;
4242 case 'R':
4243 /* Print a scalar FP/SIMD register name + 1. */
4244 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4246 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4247 return;
4249 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4250 break;
4252 case 'X':
4253 /* Print bottom 16 bits of integer constant in hex. */
4254 if (!CONST_INT_P (x))
4256 output_operand_lossage ("invalid operand for '%%%c'", code);
4257 return;
4259 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4260 break;
4262 case 'w':
4263 case 'x':
4264 /* Print a general register name or the zero register (32-bit or
4265 64-bit). */
4266 if (x == const0_rtx
4267 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4269 asm_fprintf (f, "%czr", code);
4270 break;
4273 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4275 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4276 break;
4279 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4281 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4282 break;
4285 /* Fall through */
4287 case 0:
4288 /* Print a normal operand, if it's a general register, then we
4289 assume DImode. */
4290 if (x == NULL)
4292 output_operand_lossage ("missing operand");
4293 return;
4296 switch (GET_CODE (x))
4298 case REG:
4299 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4300 break;
4302 case MEM:
4303 aarch64_memory_reference_mode = GET_MODE (x);
4304 output_address (XEXP (x, 0));
4305 break;
4307 case LABEL_REF:
4308 case SYMBOL_REF:
4309 output_addr_const (asm_out_file, x);
4310 break;
4312 case CONST_INT:
4313 asm_fprintf (f, "%wd", INTVAL (x));
4314 break;
4316 case CONST_VECTOR:
4317 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4319 gcc_assert (
4320 aarch64_const_vec_all_same_in_range_p (x,
4321 HOST_WIDE_INT_MIN,
4322 HOST_WIDE_INT_MAX));
4323 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4325 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4327 fputc ('0', f);
4329 else
4330 gcc_unreachable ();
4331 break;
4333 case CONST_DOUBLE:
4334 /* CONST_DOUBLE can represent a double-width integer.
4335 In this case, the mode of x is VOIDmode. */
4336 if (GET_MODE (x) == VOIDmode)
4337 ; /* Do Nothing. */
4338 else if (aarch64_float_const_zero_rtx_p (x))
4340 fputc ('0', f);
4341 break;
4343 else if (aarch64_float_const_representable_p (x))
4345 #define buf_size 20
4346 char float_buf[buf_size] = {'\0'};
4347 REAL_VALUE_TYPE r;
4348 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4349 real_to_decimal_for_mode (float_buf, &r,
4350 buf_size, buf_size,
4351 1, GET_MODE (x));
4352 asm_fprintf (asm_out_file, "%s", float_buf);
4353 break;
4354 #undef buf_size
4356 output_operand_lossage ("invalid constant");
4357 return;
4358 default:
4359 output_operand_lossage ("invalid operand");
4360 return;
4362 break;
4364 case 'A':
4365 if (GET_CODE (x) == HIGH)
4366 x = XEXP (x, 0);
4368 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4370 case SYMBOL_SMALL_GOT:
4371 asm_fprintf (asm_out_file, ":got:");
4372 break;
4374 case SYMBOL_SMALL_TLSGD:
4375 asm_fprintf (asm_out_file, ":tlsgd:");
4376 break;
4378 case SYMBOL_SMALL_TLSDESC:
4379 asm_fprintf (asm_out_file, ":tlsdesc:");
4380 break;
4382 case SYMBOL_SMALL_GOTTPREL:
4383 asm_fprintf (asm_out_file, ":gottprel:");
4384 break;
4386 case SYMBOL_SMALL_TPREL:
4387 asm_fprintf (asm_out_file, ":tprel:");
4388 break;
4390 case SYMBOL_TINY_GOT:
4391 gcc_unreachable ();
4392 break;
4394 default:
4395 break;
4397 output_addr_const (asm_out_file, x);
4398 break;
4400 case 'L':
4401 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4403 case SYMBOL_SMALL_GOT:
4404 asm_fprintf (asm_out_file, ":lo12:");
4405 break;
4407 case SYMBOL_SMALL_TLSGD:
4408 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4409 break;
4411 case SYMBOL_SMALL_TLSDESC:
4412 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4413 break;
4415 case SYMBOL_SMALL_GOTTPREL:
4416 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4417 break;
4419 case SYMBOL_SMALL_TPREL:
4420 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4421 break;
4423 case SYMBOL_TINY_GOT:
4424 asm_fprintf (asm_out_file, ":got:");
4425 break;
4427 default:
4428 break;
4430 output_addr_const (asm_out_file, x);
4431 break;
4433 case 'G':
4435 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4437 case SYMBOL_SMALL_TPREL:
4438 asm_fprintf (asm_out_file, ":tprel_hi12:");
4439 break;
4440 default:
4441 break;
4443 output_addr_const (asm_out_file, x);
4444 break;
4446 case 'K':
4448 int cond_code;
4449 /* Print nzcv. */
4451 if (!COMPARISON_P (x))
4453 output_operand_lossage ("invalid operand for '%%%c'", code);
4454 return;
4457 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4458 gcc_assert (cond_code >= 0);
4459 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4461 break;
4463 case 'k':
4465 int cond_code;
4466 /* Print nzcv. */
4468 if (!COMPARISON_P (x))
4470 output_operand_lossage ("invalid operand for '%%%c'", code);
4471 return;
4474 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4475 gcc_assert (cond_code >= 0);
4476 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4478 break;
4480 default:
4481 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4482 return;
4486 void
4487 aarch64_print_operand_address (FILE *f, rtx x)
4489 struct aarch64_address_info addr;
4491 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4492 MEM, true))
4493 switch (addr.type)
4495 case ADDRESS_REG_IMM:
4496 if (addr.offset == const0_rtx)
4497 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4498 else
4499 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4500 INTVAL (addr.offset));
4501 return;
4503 case ADDRESS_REG_REG:
4504 if (addr.shift == 0)
4505 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4506 reg_names [REGNO (addr.offset)]);
4507 else
4508 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4509 reg_names [REGNO (addr.offset)], addr.shift);
4510 return;
4512 case ADDRESS_REG_UXTW:
4513 if (addr.shift == 0)
4514 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4515 REGNO (addr.offset) - R0_REGNUM);
4516 else
4517 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4518 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4519 return;
4521 case ADDRESS_REG_SXTW:
4522 if (addr.shift == 0)
4523 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4524 REGNO (addr.offset) - R0_REGNUM);
4525 else
4526 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4527 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4528 return;
4530 case ADDRESS_REG_WB:
4531 switch (GET_CODE (x))
4533 case PRE_INC:
4534 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4535 GET_MODE_SIZE (aarch64_memory_reference_mode));
4536 return;
4537 case POST_INC:
4538 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4539 GET_MODE_SIZE (aarch64_memory_reference_mode));
4540 return;
4541 case PRE_DEC:
4542 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4543 GET_MODE_SIZE (aarch64_memory_reference_mode));
4544 return;
4545 case POST_DEC:
4546 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4547 GET_MODE_SIZE (aarch64_memory_reference_mode));
4548 return;
4549 case PRE_MODIFY:
4550 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4551 INTVAL (addr.offset));
4552 return;
4553 case POST_MODIFY:
4554 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4555 INTVAL (addr.offset));
4556 return;
4557 default:
4558 break;
4560 break;
4562 case ADDRESS_LO_SUM:
4563 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4564 output_addr_const (f, addr.offset);
4565 asm_fprintf (f, "]");
4566 return;
4568 case ADDRESS_SYMBOLIC:
4569 break;
4572 output_addr_const (f, x);
4575 bool
4576 aarch64_label_mentioned_p (rtx x)
4578 const char *fmt;
4579 int i;
4581 if (GET_CODE (x) == LABEL_REF)
4582 return true;
4584 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4585 referencing instruction, but they are constant offsets, not
4586 symbols. */
4587 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4588 return false;
4590 fmt = GET_RTX_FORMAT (GET_CODE (x));
4591 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4593 if (fmt[i] == 'E')
4595 int j;
4597 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4598 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4599 return 1;
4601 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4602 return 1;
4605 return 0;
4608 /* Implement REGNO_REG_CLASS. */
4610 enum reg_class
4611 aarch64_regno_regclass (unsigned regno)
4613 if (GP_REGNUM_P (regno))
4614 return GENERAL_REGS;
4616 if (regno == SP_REGNUM)
4617 return STACK_REG;
4619 if (regno == FRAME_POINTER_REGNUM
4620 || regno == ARG_POINTER_REGNUM)
4621 return POINTER_REGS;
4623 if (FP_REGNUM_P (regno))
4624 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4626 return NO_REGS;
4629 static rtx
4630 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4632 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4633 where mask is selected by alignment and size of the offset.
4634 We try to pick as large a range for the offset as possible to
4635 maximize the chance of a CSE. However, for aligned addresses
4636 we limit the range to 4k so that structures with different sized
4637 elements are likely to use the same base. */
4639 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4641 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4642 HOST_WIDE_INT base_offset;
4644 /* Does it look like we'll need a load/store-pair operation? */
4645 if (GET_MODE_SIZE (mode) > 16
4646 || mode == TImode)
4647 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4648 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4649 /* For offsets that aren't a multiple of the access size, the limit is
4650 -256...255. */
4651 else if (offset & (GET_MODE_SIZE (mode) - 1))
4652 base_offset = (offset + 0x100) & ~0x1ff;
4653 else
4654 base_offset = offset & ~0xfff;
4656 if (base_offset == 0)
4657 return x;
4659 offset -= base_offset;
4660 rtx base_reg = gen_reg_rtx (Pmode);
4661 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4662 NULL_RTX);
4663 emit_move_insn (base_reg, val);
4664 x = plus_constant (Pmode, base_reg, offset);
4667 return x;
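/* For illustration only: a minimal sketch of the second and third branches
   of the split above, redone with plain C types (the example_* name is
   hypothetical and not used by the port).  For a 4-byte access at
   x + 0x12344 (the offset is a multiple of the access size) the anchor
   becomes x + 0x12000 with a residual offset of 0x344; for x + 0x12345
   (misaligned) the anchor becomes x + 0x12400 with a residual of -0xbb,
   inside the -256...255 unscaled range.  */

static long long
example_anchor_offset (long long offset, int access_size)
{
  if (offset & (access_size - 1))
    /* Misaligned offset: centre a 512-byte window around it.  */
    return (offset + 0x100) & ~0x1ffll;

  /* Aligned offset: keep only the low 12 bits in the access itself.  */
  return offset & ~0xfffll;
}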
4670 /* Try a machine-dependent way of reloading an illegitimate address
4671 operand. If we find one, push the reload and return the new rtx. */
4674 aarch64_legitimize_reload_address (rtx *x_p,
4675 machine_mode mode,
4676 int opnum, int type,
4677 int ind_levels ATTRIBUTE_UNUSED)
4679 rtx x = *x_p;
4681 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4682 if (aarch64_vect_struct_mode_p (mode)
4683 && GET_CODE (x) == PLUS
4684 && REG_P (XEXP (x, 0))
4685 && CONST_INT_P (XEXP (x, 1)))
4687 rtx orig_rtx = x;
4688 x = copy_rtx (x);
4689 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4690 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4691 opnum, (enum reload_type) type);
4692 return x;
4695 /* We must recognize output that we have already generated ourselves. */
4696 if (GET_CODE (x) == PLUS
4697 && GET_CODE (XEXP (x, 0)) == PLUS
4698 && REG_P (XEXP (XEXP (x, 0), 0))
4699 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4700 && CONST_INT_P (XEXP (x, 1)))
4702 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4703 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4704 opnum, (enum reload_type) type);
4705 return x;
4708 /* We wish to handle large displacements off a base register by splitting
4709 the addend across an add and the mem insn. This can cut the number of
4710 extra insns needed from 3 to 1. It is only useful for load/store of a
4711 single register with a 12-bit offset field. */
4712 if (GET_CODE (x) == PLUS
4713 && REG_P (XEXP (x, 0))
4714 && CONST_INT_P (XEXP (x, 1))
4715 && HARD_REGISTER_P (XEXP (x, 0))
4716 && mode != TImode
4717 && mode != TFmode
4718 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4720 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4721 HOST_WIDE_INT low = val & 0xfff;
4722 HOST_WIDE_INT high = val - low;
4723 HOST_WIDE_INT offs;
4724 rtx cst;
4725 machine_mode xmode = GET_MODE (x);
4727 /* In ILP32, xmode can be either DImode or SImode. */
4728 gcc_assert (xmode == DImode || xmode == SImode);
4730 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4731 BLKmode alignment. */
4732 if (GET_MODE_SIZE (mode) == 0)
4733 return NULL_RTX;
4735 offs = low % GET_MODE_SIZE (mode);
4737 /* Align misaligned offset by adjusting high part to compensate. */
4738 if (offs != 0)
4740 if (aarch64_uimm12_shift (high + offs))
4742 /* Align down. */
4743 low = low - offs;
4744 high = high + offs;
4746 else
4748 /* Align up. */
4749 offs = GET_MODE_SIZE (mode) - offs;
4750 low = low + offs;
4751 high = high + (low & 0x1000) - offs;
4752 low &= 0xfff;
4756 /* Check for overflow. */
4757 if (high + low != val)
4758 return NULL_RTX;
4760 cst = GEN_INT (high);
4761 if (!aarch64_uimm12_shift (high))
4762 cst = force_const_mem (xmode, cst);
4764 /* Reload high part into base reg, leaving the low part
4765 in the mem instruction.
4766 Note that replacing this gen_rtx_PLUS with plus_constant is
4767 wrong in this case because we rely on the
4768 (plus (plus reg c1) c2) structure being preserved so that
4769 XEXP (*p, 0) in push_reload below uses the correct term. */
4770 x = gen_rtx_PLUS (xmode,
4771 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4772 GEN_INT (low));
4774 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4775 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4776 opnum, (enum reload_type) type);
4777 return x;
4780 return NULL_RTX;
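/* For illustration only: a minimal sketch of the low/high split above,
   redone with plain C types (the example_* names are hypothetical and not
   used by the port).  For a DImode access at base + 0x3458 the addend
   splits into high == 0x3000, a 12-bit immediate shifted left by 12 that
   a single ADD can move into the scratch base register, and low == 0x458,
   which stays in the load/store's scaled 12-bit offset field.  */

struct example_addend_split { long long high, low; };

static struct example_addend_split
example_split_addend (long long val)
{
  struct example_addend_split s;
  s.low = val & 0xfff;		/* 0x3458 -> 0x458.  */
  s.high = val - s.low;		/* 0x3458 -> 0x3000.  */
  return s;
}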
4784 static reg_class_t
4785 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4786 reg_class_t rclass,
4787 machine_mode mode,
4788 secondary_reload_info *sri)
4790 /* Without the TARGET_SIMD instructions we cannot move a Q register
4791 to a Q register directly. We need a scratch. */
4792 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4793 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4794 && reg_class_subset_p (rclass, FP_REGS))
4796 if (mode == TFmode)
4797 sri->icode = CODE_FOR_aarch64_reload_movtf;
4798 else if (mode == TImode)
4799 sri->icode = CODE_FOR_aarch64_reload_movti;
4800 return NO_REGS;
4803 /* A TFmode or TImode memory access should be handled via an FP_REG
4804 because AArch64 has richer addressing modes for LDR/STR instructions
4805 than LDP/STP instructions. */
4806 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4807 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4808 return FP_REGS;
4810 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4811 return GENERAL_REGS;
4813 return NO_REGS;
4816 static bool
4817 aarch64_can_eliminate (const int from, const int to)
4819 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4820 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4822 if (frame_pointer_needed)
4824 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4825 return true;
4826 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4827 return false;
4828 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4829 && !cfun->calls_alloca)
4830 return true;
4831 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4832 return true;
4834 return false;
4836 else
4838 /* If we decided that we didn't need a leaf frame pointer but then used
4839 LR in the function, then we'll want a frame pointer after all, so
4840 prevent this elimination to ensure a frame pointer is used. */
4841 if (to == STACK_POINTER_REGNUM
4842 && flag_omit_leaf_frame_pointer
4843 && df_regs_ever_live_p (LR_REGNUM))
4844 return false;
4847 return true;
4850 HOST_WIDE_INT
4851 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4853 aarch64_layout_frame ();
4855 if (to == HARD_FRAME_POINTER_REGNUM)
4857 if (from == ARG_POINTER_REGNUM)
4858 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4860 if (from == FRAME_POINTER_REGNUM)
4861 return (cfun->machine->frame.hard_fp_offset
4862 - cfun->machine->frame.saved_varargs_size);
4865 if (to == STACK_POINTER_REGNUM)
4867 if (from == FRAME_POINTER_REGNUM)
4868 return (cfun->machine->frame.frame_size
4869 - cfun->machine->frame.saved_varargs_size);
4872 return cfun->machine->frame.frame_size;
4875 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4876 previous frame. */
4879 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4881 if (count != 0)
4882 return const0_rtx;
4883 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4887 static void
4888 aarch64_asm_trampoline_template (FILE *f)
4890 if (TARGET_ILP32)
4892 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4893 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4895 else
4897 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4898 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4900 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4901 assemble_aligned_integer (4, const0_rtx);
4902 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4903 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4906 static void
4907 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4909 rtx fnaddr, mem, a_tramp;
4910 const int tramp_code_sz = 16;
4912 /* Don't need to copy the trailing D-words, we fill those in below. */
4913 emit_block_move (m_tramp, assemble_trampoline_template (),
4914 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4915 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4916 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4917 if (GET_MODE (fnaddr) != ptr_mode)
4918 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4919 emit_move_insn (mem, fnaddr);
4921 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4922 emit_move_insn (mem, chain_value);
4924 /* XXX We should really define a "clear_cache" pattern and use
4925 gen_clear_cache(). */
4926 a_tramp = XEXP (m_tramp, 0);
4927 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4928 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4929 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4930 ptr_mode);
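/* For illustration only: the trampoline built by the two functions above,
   assuming the LP64 ABI (POINTER_BYTES == 8).  The first 16 bytes are the
   code template (a PC-relative load into IP1, a PC-relative load into the
   static chain register, a branch through IP1, and a padding word);
   aarch64_trampoline_init then stores the target function's address at
   offset 16 and the static chain value at offset 24, which are exactly
   the slots the two loads read.  The struct below is a hypothetical
   sketch of that layout, not a type used by the port.  */

struct example_lp64_trampoline
{
  unsigned int code[4];		/* 16 bytes of template code.  */
  unsigned long long fnaddr;	/* Written at offset 16.  */
  unsigned long long chain;	/* Written at offset 24.  */
};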
4933 static unsigned char
4934 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4936 switch (regclass)
4938 case CALLER_SAVE_REGS:
4939 case POINTER_REGS:
4940 case GENERAL_REGS:
4941 case ALL_REGS:
4942 case FP_REGS:
4943 case FP_LO_REGS:
4944 return
4945 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4946 (GET_MODE_SIZE (mode) + 7) / 8;
4947 case STACK_REG:
4948 return 1;
4950 case NO_REGS:
4951 return 0;
4953 default:
4954 break;
4956 gcc_unreachable ();
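/* For illustration only: the register-count arithmetic used above for the
   general and FP register classes, with plain C types (the example_* name
   is hypothetical).  A 16-byte vector value is counted as one 128-bit
   register ((16 + 15) / 16 == 1), a 16-byte non-vector value such as a
   TImode quantity is counted as two 64-bit registers ((16 + 7) / 8 == 2),
   and an 8-byte value as one either way.  */

static unsigned char
example_class_nregs (int mode_size, int is_vector_mode)
{
  return is_vector_mode ? (mode_size + 15) / 16 : (mode_size + 7) / 8;
}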
4959 static reg_class_t
4960 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4962 if (regclass == POINTER_REGS)
4963 return GENERAL_REGS;
4965 if (regclass == STACK_REG)
4967 if (REG_P(x)
4968 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4969 return regclass;
4971 return NO_REGS;
4974 /* If it's an integer immediate that MOVI can't handle, then
4975 FP_REGS is not an option, so we return NO_REGS instead. */
4976 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4977 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4978 return NO_REGS;
4980 /* Register elimination can result in a request for
4981 SP+constant->FP_REGS. We cannot support such operations, which
4982 use SP as source and an FP_REG as destination, so reject them
4983 right away. */
4984 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4986 rtx lhs = XEXP (x, 0);
4988 /* Look through a possible SUBREG introduced by ILP32. */
4989 if (GET_CODE (lhs) == SUBREG)
4990 lhs = SUBREG_REG (lhs);
4992 gcc_assert (REG_P (lhs));
4993 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4994 POINTER_REGS));
4995 return NO_REGS;
4998 return regclass;
5001 void
5002 aarch64_asm_output_labelref (FILE* f, const char *name)
5004 asm_fprintf (f, "%U%s", name);
5007 static void
5008 aarch64_elf_asm_constructor (rtx symbol, int priority)
5010 if (priority == DEFAULT_INIT_PRIORITY)
5011 default_ctor_section_asm_out_constructor (symbol, priority);
5012 else
5014 section *s;
5015 char buf[18];
5016 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5017 s = get_section (buf, SECTION_WRITE, NULL);
5018 switch_to_section (s);
5019 assemble_align (POINTER_SIZE);
5020 assemble_aligned_integer (POINTER_BYTES, symbol);
5024 static void
5025 aarch64_elf_asm_destructor (rtx symbol, int priority)
5027 if (priority == DEFAULT_INIT_PRIORITY)
5028 default_dtor_section_asm_out_destructor (symbol, priority);
5029 else
5031 section *s;
5032 char buf[18];
5033 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5034 s = get_section (buf, SECTION_WRITE, NULL);
5035 switch_to_section (s);
5036 assemble_align (POINTER_SIZE);
5037 assemble_aligned_integer (POINTER_BYTES, symbol);
5041 const char*
5042 aarch64_output_casesi (rtx *operands)
5044 char buf[100];
5045 char label[100];
5046 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5047 int index;
5048 static const char *const patterns[4][2] =
5051 "ldrb\t%w3, [%0,%w1,uxtw]",
5052 "add\t%3, %4, %w3, sxtb #2"
5055 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5056 "add\t%3, %4, %w3, sxth #2"
5059 "ldr\t%w3, [%0,%w1,uxtw #2]",
5060 "add\t%3, %4, %w3, sxtw #2"
5062 /* We assume that DImode is only generated when not optimizing and
5063 that we don't really need 64-bit address offsets. That would
5064 imply an object file with 8GB of code in a single function! */
5066 "ldr\t%w3, [%0,%w1,uxtw #2]",
5067 "add\t%3, %4, %w3, sxtw #2"
5071 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5073 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5075 gcc_assert (index >= 0 && index <= 3);
5077 /* Need to implement table size reduction, by changing the code below. */
5078 output_asm_insn (patterns[index][0], operands);
5079 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5080 snprintf (buf, sizeof (buf),
5081 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5082 output_asm_insn (buf, operands);
5083 output_asm_insn (patterns[index][1], operands);
5084 output_asm_insn ("br\t%3", operands);
5085 assemble_label (asm_out_file, label);
5086 return "";
5090 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5091 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5092 operator. */
5095 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5097 if (shift >= 0 && shift <= 3)
5099 int size;
5100 for (size = 8; size <= 32; size *= 2)
5102 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5103 if (mask == bits << shift)
5104 return size;
5107 return 0;
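/* Worked examples for the helper above, for illustration only: a shift of
   2 with mask 0x3fc (0xff << 2) describes a zero-extended byte scaled by
   4 and yields 8 (a UXTB operand); shift 0 with mask 0xffff yields 16
   (UXTH); shift 3 with mask 0x7fffffff8 (0xffffffff << 3) yields 32
   (UXTW); any other shift/mask combination yields 0, i.e. not a valid
   extend operand.  The caller below is hypothetical.  */

static int
example_uxt_size_usage (void)
{
  return aarch64_uxt_size (2, 0x3fc);	/* Evaluates to 8.  */
}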
5110 static bool
5111 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5112 const_rtx x ATTRIBUTE_UNUSED)
5114 /* We can't use blocks for constants when we're using a per-function
5115 constant pool. */
5116 return false;
5119 static section *
5120 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5121 rtx x ATTRIBUTE_UNUSED,
5122 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5124 /* Force all constant pool entries into the current function section. */
5125 return function_section (current_function_decl);
5129 /* Costs. */
5131 /* Helper function for rtx cost calculation. Strip a shift expression
5132 from X. Returns the inner operand if successful, or the original
5133 expression on failure. */
5134 static rtx
5135 aarch64_strip_shift (rtx x)
5137 rtx op = x;
5139 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5140 we can convert both to ROR during final output. */
5141 if ((GET_CODE (op) == ASHIFT
5142 || GET_CODE (op) == ASHIFTRT
5143 || GET_CODE (op) == LSHIFTRT
5144 || GET_CODE (op) == ROTATERT
5145 || GET_CODE (op) == ROTATE)
5146 && CONST_INT_P (XEXP (op, 1)))
5147 return XEXP (op, 0);
5149 if (GET_CODE (op) == MULT
5150 && CONST_INT_P (XEXP (op, 1))
5151 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5152 return XEXP (op, 0);
5154 return x;
5157 /* Helper function for rtx cost calculation. Strip an extend
5158 expression from X. Returns the inner operand if successful, or the
5159 original expression on failure. We deal with a number of possible
5160 canonicalization variations here. */
5161 static rtx
5162 aarch64_strip_extend (rtx x)
5164 rtx op = x;
5166 /* Zero and sign extraction of a widened value. */
5167 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5168 && XEXP (op, 2) == const0_rtx
5169 && GET_CODE (XEXP (op, 0)) == MULT
5170 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5171 XEXP (op, 1)))
5172 return XEXP (XEXP (op, 0), 0);
5174 /* It can also be represented (for zero-extend) as an AND with an
5175 immediate. */
5176 if (GET_CODE (op) == AND
5177 && GET_CODE (XEXP (op, 0)) == MULT
5178 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5179 && CONST_INT_P (XEXP (op, 1))
5180 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5181 INTVAL (XEXP (op, 1))) != 0)
5182 return XEXP (XEXP (op, 0), 0);
5184 /* Now handle extended register, as this may also have an optional
5185 left shift by 1..4. */
5186 if (GET_CODE (op) == ASHIFT
5187 && CONST_INT_P (XEXP (op, 1))
5188 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5189 op = XEXP (op, 0);
5191 if (GET_CODE (op) == ZERO_EXTEND
5192 || GET_CODE (op) == SIGN_EXTEND)
5193 op = XEXP (op, 0);
5195 if (op != x)
5196 return op;
5198 return x;
5201 /* Helper function for rtx cost calculation. Calculate the cost of
5202 a MULT, which may be part of a multiply-accumulate rtx. Return
5203 the calculated cost of the expression, recursing manually in to
5204 operands where needed. */
5206 static int
5207 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5209 rtx op0, op1;
5210 const struct cpu_cost_table *extra_cost
5211 = aarch64_tune_params->insn_extra_cost;
5212 int cost = 0;
5213 bool maybe_fma = (outer == PLUS || outer == MINUS);
5214 machine_mode mode = GET_MODE (x);
5216 gcc_checking_assert (code == MULT);
5218 op0 = XEXP (x, 0);
5219 op1 = XEXP (x, 1);
5221 if (VECTOR_MODE_P (mode))
5222 mode = GET_MODE_INNER (mode);
5224 /* Integer multiply/fma. */
5225 if (GET_MODE_CLASS (mode) == MODE_INT)
5227 /* The multiply will be canonicalized as a shift, cost it as such. */
5228 if (CONST_INT_P (op1)
5229 && exact_log2 (INTVAL (op1)) > 0)
5231 if (speed)
5233 if (maybe_fma)
5234 /* ADD (shifted register). */
5235 cost += extra_cost->alu.arith_shift;
5236 else
5237 /* LSL (immediate). */
5238 cost += extra_cost->alu.shift;
5241 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5243 return cost;
5246 /* Integer multiplies or FMAs have zero/sign extending variants. */
5247 if ((GET_CODE (op0) == ZERO_EXTEND
5248 && GET_CODE (op1) == ZERO_EXTEND)
5249 || (GET_CODE (op0) == SIGN_EXTEND
5250 && GET_CODE (op1) == SIGN_EXTEND))
5252 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5253 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5255 if (speed)
5257 if (maybe_fma)
5258 /* MADD/SMADDL/UMADDL. */
5259 cost += extra_cost->mult[0].extend_add;
5260 else
5261 /* MUL/SMULL/UMULL. */
5262 cost += extra_cost->mult[0].extend;
5265 return cost;
5268 /* This is either an integer multiply or an FMA. In both cases
5269 we want to recurse and cost the operands. */
5270 cost += rtx_cost (op0, MULT, 0, speed)
5271 + rtx_cost (op1, MULT, 1, speed);
5273 if (speed)
5275 if (maybe_fma)
5276 /* MADD. */
5277 cost += extra_cost->mult[mode == DImode].add;
5278 else
5279 /* MUL. */
5280 cost += extra_cost->mult[mode == DImode].simple;
5283 return cost;
5285 else
5287 if (speed)
5289 /* Floating-point FMA/FMUL can also support negations of the
5290 operands. */
5291 if (GET_CODE (op0) == NEG)
5292 op0 = XEXP (op0, 0);
5293 if (GET_CODE (op1) == NEG)
5294 op1 = XEXP (op1, 0);
5296 if (maybe_fma)
5297 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5298 cost += extra_cost->fp[mode == DFmode].fma;
5299 else
5300 /* FMUL/FNMUL. */
5301 cost += extra_cost->fp[mode == DFmode].mult;
5304 cost += rtx_cost (op0, MULT, 0, speed)
5305 + rtx_cost (op1, MULT, 1, speed);
5306 return cost;
5310 static int
5311 aarch64_address_cost (rtx x,
5312 machine_mode mode,
5313 addr_space_t as ATTRIBUTE_UNUSED,
5314 bool speed)
5316 enum rtx_code c = GET_CODE (x);
5317 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5318 struct aarch64_address_info info;
5319 int cost = 0;
5320 info.shift = 0;
5322 if (!aarch64_classify_address (&info, x, mode, c, false))
5324 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5326 /* This is a CONST or SYMBOL ref which will be split
5327 in a different way depending on the code model in use.
5328 Cost it through the generic infrastructure. */
5329 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5330 /* Divide through by the cost of one instruction to
5331 bring it to the same units as the address costs. */
5332 cost_symbol_ref /= COSTS_N_INSNS (1);
5333 /* The cost is then the cost of preparing the address,
5334 followed by an immediate (possibly 0) offset. */
5335 return cost_symbol_ref + addr_cost->imm_offset;
5337 else
5339 /* This is most likely a jump table from a case
5340 statement. */
5341 return addr_cost->register_offset;
5345 switch (info.type)
5347 case ADDRESS_LO_SUM:
5348 case ADDRESS_SYMBOLIC:
5349 case ADDRESS_REG_IMM:
5350 cost += addr_cost->imm_offset;
5351 break;
5353 case ADDRESS_REG_WB:
5354 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5355 cost += addr_cost->pre_modify;
5356 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5357 cost += addr_cost->post_modify;
5358 else
5359 gcc_unreachable ();
5361 break;
5363 case ADDRESS_REG_REG:
5364 cost += addr_cost->register_offset;
5365 break;
5367 case ADDRESS_REG_UXTW:
5368 case ADDRESS_REG_SXTW:
5369 cost += addr_cost->register_extend;
5370 break;
5372 default:
5373 gcc_unreachable ();
5377 if (info.shift > 0)
5379 /* For the sake of calculating the cost of the shifted register
5380 component, we can treat same sized modes in the same way. */
5381 switch (GET_MODE_BITSIZE (mode))
5383 case 16:
5384 cost += addr_cost->addr_scale_costs.hi;
5385 break;
5387 case 32:
5388 cost += addr_cost->addr_scale_costs.si;
5389 break;
5391 case 64:
5392 cost += addr_cost->addr_scale_costs.di;
5393 break;
5395 /* We can't tell, or this is a 128-bit vector. */
5396 default:
5397 cost += addr_cost->addr_scale_costs.ti;
5398 break;
5402 return cost;
5405 /* Return true if the RTX X in mode MODE is a zero or sign extract
5406 usable in an ADD or SUB (extended register) instruction. */
5407 static bool
5408 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5410 /* Catch add with a sign extract.
5411 This is add_<optab><mode>_multp2. */
5412 if (GET_CODE (x) == SIGN_EXTRACT
5413 || GET_CODE (x) == ZERO_EXTRACT)
5415 rtx op0 = XEXP (x, 0);
5416 rtx op1 = XEXP (x, 1);
5417 rtx op2 = XEXP (x, 2);
5419 if (GET_CODE (op0) == MULT
5420 && CONST_INT_P (op1)
5421 && op2 == const0_rtx
5422 && CONST_INT_P (XEXP (op0, 1))
5423 && aarch64_is_extend_from_extract (mode,
5424 XEXP (op0, 1),
5425 op1))
5427 return true;
5431 return false;
5434 static bool
5435 aarch64_frint_unspec_p (unsigned int u)
5437 switch (u)
5439 case UNSPEC_FRINTZ:
5440 case UNSPEC_FRINTP:
5441 case UNSPEC_FRINTM:
5442 case UNSPEC_FRINTA:
5443 case UNSPEC_FRINTN:
5444 case UNSPEC_FRINTX:
5445 case UNSPEC_FRINTI:
5446 return true;
5448 default:
5449 return false;
5453 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5454 storing it in *COST. Result is true if the total cost of the operation
5455 has now been calculated. */
5456 static bool
5457 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5459 rtx inner;
5460 rtx comparator;
5461 enum rtx_code cmpcode;
5463 if (COMPARISON_P (op0))
5465 inner = XEXP (op0, 0);
5466 comparator = XEXP (op0, 1);
5467 cmpcode = GET_CODE (op0);
5469 else
5471 inner = op0;
5472 comparator = const0_rtx;
5473 cmpcode = NE;
5476 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5478 /* Conditional branch. */
5479 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5480 return true;
5481 else
5483 if (cmpcode == NE || cmpcode == EQ)
5485 if (comparator == const0_rtx)
5487 /* TBZ/TBNZ/CBZ/CBNZ. */
5488 if (GET_CODE (inner) == ZERO_EXTRACT)
5489 /* TBZ/TBNZ. */
5490 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5491 0, speed);
5492 else
5493 /* CBZ/CBNZ. */
5494 *cost += rtx_cost (inner, cmpcode, 0, speed);
5496 return true;
5499 else if (cmpcode == LT || cmpcode == GE)
5501 /* TBZ/TBNZ. */
5502 if (comparator == const0_rtx)
5503 return true;
5507 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5509 /* It's a conditional operation based on the status flags,
5510 so it must be some flavor of CSEL. */
5512 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5513 if (GET_CODE (op1) == NEG
5514 || GET_CODE (op1) == NOT
5515 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5516 op1 = XEXP (op1, 0);
5518 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5519 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5520 return true;
5523 /* We don't know what this is, cost all operands. */
5524 return false;
5527 /* Calculate the cost of calculating X, storing it in *COST. Result
5528 is true if the total cost of the operation has now been calculated. */
5529 static bool
5530 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5531 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5533 rtx op0, op1, op2;
5534 const struct cpu_cost_table *extra_cost
5535 = aarch64_tune_params->insn_extra_cost;
5536 machine_mode mode = GET_MODE (x);
5538 /* By default, assume that everything has equivalent cost to the
5539 cheapest instruction. Any additional costs are applied as a delta
5540 above this default. */
5541 *cost = COSTS_N_INSNS (1);
5543 /* TODO: The cost infrastructure currently does not handle
5544 vector operations. Assume that all vector operations
5545 are equally expensive. */
5546 if (VECTOR_MODE_P (mode))
5548 if (speed)
5549 *cost += extra_cost->vect.alu;
5550 return true;
5553 switch (code)
5555 case SET:
5556 /* The cost depends entirely on the operands to SET. */
5557 *cost = 0;
5558 op0 = SET_DEST (x);
5559 op1 = SET_SRC (x);
5561 switch (GET_CODE (op0))
5563 case MEM:
5564 if (speed)
5566 rtx address = XEXP (op0, 0);
5567 if (GET_MODE_CLASS (mode) == MODE_INT)
5568 *cost += extra_cost->ldst.store;
5569 else if (mode == SFmode)
5570 *cost += extra_cost->ldst.storef;
5571 else if (mode == DFmode)
5572 *cost += extra_cost->ldst.stored;
5574 *cost +=
5575 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5576 0, speed));
5579 *cost += rtx_cost (op1, SET, 1, speed);
5580 return true;
5582 case SUBREG:
5583 if (! REG_P (SUBREG_REG (op0)))
5584 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5586 /* Fall through. */
5587 case REG:
5588 /* const0_rtx is in general free, but we will use an
5589 instruction to set a register to 0. */
5590 if (REG_P (op1) || op1 == const0_rtx)
5592 /* The cost is 1 per register copied. */
5593 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5594 / UNITS_PER_WORD;
5595 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5597 else
5598 /* Cost is just the cost of the RHS of the set. */
5599 *cost += rtx_cost (op1, SET, 1, speed);
5600 return true;
5602 case ZERO_EXTRACT:
5603 case SIGN_EXTRACT:
5604 /* Bit-field insertion. Strip any redundant widening of
5605 the RHS to meet the width of the target. */
5606 if (GET_CODE (op1) == SUBREG)
5607 op1 = SUBREG_REG (op1);
5608 if ((GET_CODE (op1) == ZERO_EXTEND
5609 || GET_CODE (op1) == SIGN_EXTEND)
5610 && CONST_INT_P (XEXP (op0, 1))
5611 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5612 >= INTVAL (XEXP (op0, 1))))
5613 op1 = XEXP (op1, 0);
5615 if (CONST_INT_P (op1))
5617 /* MOV immediate is assumed to always be cheap. */
5618 *cost = COSTS_N_INSNS (1);
5620 else
5622 /* BFM. */
5623 if (speed)
5624 *cost += extra_cost->alu.bfi;
5625 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5628 return true;
5630 default:
5631 /* We can't make sense of this, assume default cost. */
5632 *cost = COSTS_N_INSNS (1);
5633 return false;
5635 return false;
5637 case CONST_INT:
5638 /* If an instruction can incorporate a constant within the
5639 instruction, the instruction's expression avoids calling
5640 rtx_cost() on the constant. If rtx_cost() is called on a
5641 constant, then it is usually because the constant must be
5642 moved into a register by one or more instructions.
5644 The exception is constant 0, which can be expressed
5645 as XZR/WZR and is therefore free. The exception to this is
5646 if we have (set (reg) (const0_rtx)) in which case we must cost
5647 the move. However, we can catch that when we cost the SET, so
5648 we don't need to consider that here. */
5649 if (x == const0_rtx)
5650 *cost = 0;
5651 else
5653 /* To an approximation, building any other constant is
5654 proportionally expensive to the number of instructions
5655 required to build that constant. This is true whether we
5656 are compiling for SPEED or otherwise. */
5657 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5658 (NULL_RTX, x, false, mode));
5660 return true;
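/* Worked example, for illustration only: a constant such as 42 fits a
   single MOVZ and is costed as one instruction, while a 64-bit constant
   like 0x123456789abcdef0, whose four 16-bit chunks are all non-zero and
   distinct, typically needs a MOVZ plus three MOVKs and is costed as
   four.  */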
5662 case CONST_DOUBLE:
5663 if (speed)
5665 /* mov[df,sf]_aarch64. */
5666 if (aarch64_float_const_representable_p (x))
5667 /* FMOV (scalar immediate). */
5668 *cost += extra_cost->fp[mode == DFmode].fpconst;
5669 else if (!aarch64_float_const_zero_rtx_p (x))
5671 /* This will be a load from memory. */
5672 if (mode == DFmode)
5673 *cost += extra_cost->ldst.loadd;
5674 else
5675 *cost += extra_cost->ldst.loadf;
5677 else
5678 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5679 or MOV v0.s[0], wzr - neither of which is modeled by the
5680 cost tables. Just use the default cost. */
5685 return true;
5687 case MEM:
5688 if (speed)
5690 /* For loads we want the base cost of a load, plus an
5691 approximation for the additional cost of the addressing
5692 mode. */
5693 rtx address = XEXP (x, 0);
5694 if (GET_MODE_CLASS (mode) == MODE_INT)
5695 *cost += extra_cost->ldst.load;
5696 else if (mode == SFmode)
5697 *cost += extra_cost->ldst.loadf;
5698 else if (mode == DFmode)
5699 *cost += extra_cost->ldst.loadd;
5701 *cost +=
5702 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5703 0, speed));
5706 return true;
5708 case NEG:
5709 op0 = XEXP (x, 0);
5711 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5713 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5714 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5716 /* CSETM. */
5717 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5718 return true;
5721 /* Cost this as SUB wzr, X. */
5722 op0 = CONST0_RTX (GET_MODE (x));
5723 op1 = XEXP (x, 0);
5724 goto cost_minus;
5727 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5729 /* Support (neg(fma...)) as a single instruction only if
5730 sign of zeros is unimportant. This matches the decision
5731 making in aarch64.md. */
5732 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5734 /* FNMADD. */
5735 *cost = rtx_cost (op0, NEG, 0, speed);
5736 return true;
5738 if (speed)
5739 /* FNEG. */
5740 *cost += extra_cost->fp[mode == DFmode].neg;
5741 return false;
5744 return false;
5746 case CLRSB:
5747 case CLZ:
5748 if (speed)
5749 *cost += extra_cost->alu.clz;
5751 return false;
5753 case COMPARE:
5754 op0 = XEXP (x, 0);
5755 op1 = XEXP (x, 1);
5757 if (op1 == const0_rtx
5758 && GET_CODE (op0) == AND)
5760 x = op0;
5761 goto cost_logic;
5764 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5766 /* TODO: A write to the CC flags possibly costs extra, this
5767 needs encoding in the cost tables. */
5769 /* CC_ZESWPmode supports zero extend for free. */
5770 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5771 op0 = XEXP (op0, 0);
5773 /* ANDS. */
5774 if (GET_CODE (op0) == AND)
5776 x = op0;
5777 goto cost_logic;
5780 if (GET_CODE (op0) == PLUS)
5782 /* ADDS (and CMN alias). */
5783 x = op0;
5784 goto cost_plus;
5787 if (GET_CODE (op0) == MINUS)
5789 /* SUBS. */
5790 x = op0;
5791 goto cost_minus;
5794 if (GET_CODE (op1) == NEG)
5796 /* CMN. */
5797 if (speed)
5798 *cost += extra_cost->alu.arith;
5800 *cost += rtx_cost (op0, COMPARE, 0, speed);
5801 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5802 return true;
5805 /* CMP.
5807 Compare can freely swap the order of operands, and
5808 canonicalization puts the more complex operation first.
5809 But the integer MINUS logic expects the shift/extend
5810 operation in op1. */
5811 if (! (REG_P (op0)
5812 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5814 op0 = XEXP (x, 1);
5815 op1 = XEXP (x, 0);
5817 goto cost_minus;
5820 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5822 /* FCMP. */
5823 if (speed)
5824 *cost += extra_cost->fp[mode == DFmode].compare;
5826 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5828 /* FCMP supports constant 0.0 for no extra cost. */
5829 return true;
5831 return false;
5834 return false;
5836 case MINUS:
5838 op0 = XEXP (x, 0);
5839 op1 = XEXP (x, 1);
5841 cost_minus:
5842 /* Detect valid immediates. */
5843 if ((GET_MODE_CLASS (mode) == MODE_INT
5844 || (GET_MODE_CLASS (mode) == MODE_CC
5845 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5846 && CONST_INT_P (op1)
5847 && aarch64_uimm12_shift (INTVAL (op1)))
5849 *cost += rtx_cost (op0, MINUS, 0, speed);
5851 if (speed)
5852 /* SUB(S) (immediate). */
5853 *cost += extra_cost->alu.arith;
5854 return true;
5858 /* Look for SUB (extended register). */
5859 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5861 if (speed)
5862 *cost += extra_cost->alu.arith_shift;
5864 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5865 (enum rtx_code) GET_CODE (op1),
5866 0, speed);
5867 return true;
5870 rtx new_op1 = aarch64_strip_extend (op1);
5872 /* Cost this as an FMA-alike operation. */
5873 if ((GET_CODE (new_op1) == MULT
5874 || GET_CODE (new_op1) == ASHIFT)
5875 && code != COMPARE)
5877 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5878 (enum rtx_code) code,
5879 speed);
5880 *cost += rtx_cost (op0, MINUS, 0, speed);
5881 return true;
5884 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5886 if (speed)
5888 if (GET_MODE_CLASS (mode) == MODE_INT)
5889 /* SUB(S). */
5890 *cost += extra_cost->alu.arith;
5891 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5892 /* FSUB. */
5893 *cost += extra_cost->fp[mode == DFmode].addsub;
5895 return true;
5898 case PLUS:
5900 rtx new_op0;
5902 op0 = XEXP (x, 0);
5903 op1 = XEXP (x, 1);
5905 cost_plus:
5906 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5907 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5909 /* CSINC. */
5910 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5911 *cost += rtx_cost (op1, PLUS, 1, speed);
5912 return true;
5915 if (GET_MODE_CLASS (mode) == MODE_INT
5916 && CONST_INT_P (op1)
5917 && aarch64_uimm12_shift (INTVAL (op1)))
5919 *cost += rtx_cost (op0, PLUS, 0, speed);
5921 if (speed)
5922 /* ADD (immediate). */
5923 *cost += extra_cost->alu.arith;
5924 return true;
5927 /* Look for ADD (extended register). */
5928 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5930 if (speed)
5931 *cost += extra_cost->alu.arith_shift;
5933 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5934 (enum rtx_code) GET_CODE (op0),
5935 0, speed);
5936 return true;
5939 /* Strip any extend, leave shifts behind as we will
5940 cost them through mult_cost. */
5941 new_op0 = aarch64_strip_extend (op0);
5943 if (GET_CODE (new_op0) == MULT
5944 || GET_CODE (new_op0) == ASHIFT)
5946 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5947 speed);
5948 *cost += rtx_cost (op1, PLUS, 1, speed);
5949 return true;
5952 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5953 + rtx_cost (op1, PLUS, 1, speed));
5955 if (speed)
5957 if (GET_MODE_CLASS (mode) == MODE_INT)
5958 /* ADD. */
5959 *cost += extra_cost->alu.arith;
5960 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5961 /* FADD. */
5962 *cost += extra_cost->fp[mode == DFmode].addsub;
5964 return true;
5967 case BSWAP:
5968 *cost = COSTS_N_INSNS (1);
5970 if (speed)
5971 *cost += extra_cost->alu.rev;
5973 return false;
5975 case IOR:
5976 if (aarch_rev16_p (x))
5978 *cost = COSTS_N_INSNS (1);
5980 if (speed)
5981 *cost += extra_cost->alu.rev;
5983 return true;
5985 /* Fall through. */
5986 case XOR:
5987 case AND:
5988 cost_logic:
5989 op0 = XEXP (x, 0);
5990 op1 = XEXP (x, 1);
5992 if (code == AND
5993 && GET_CODE (op0) == MULT
5994 && CONST_INT_P (XEXP (op0, 1))
5995 && CONST_INT_P (op1)
5996 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5997 INTVAL (op1)) != 0)
5999 /* This is a UBFM/SBFM. */
6000 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6001 if (speed)
6002 *cost += extra_cost->alu.bfx;
6003 return true;
6006 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6008 /* We possibly get the immediate for free; this is not
6009 modelled. */
6010 if (CONST_INT_P (op1)
6011 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6013 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6015 if (speed)
6016 *cost += extra_cost->alu.logical;
6018 return true;
6020 else
6022 rtx new_op0 = op0;
6024 /* Handle ORN, EON, or BIC. */
6025 if (GET_CODE (op0) == NOT)
6026 op0 = XEXP (op0, 0);
6028 new_op0 = aarch64_strip_shift (op0);
6030 /* If we had a shift on op0 then this is a logical-shift-
6031 by-register/immediate operation. Otherwise, this is just
6032 a logical operation. */
6033 if (speed)
6035 if (new_op0 != op0)
6037 /* Shift by immediate. */
6038 if (CONST_INT_P (XEXP (op0, 1)))
6039 *cost += extra_cost->alu.log_shift;
6040 else
6041 *cost += extra_cost->alu.log_shift_reg;
6043 else
6044 *cost += extra_cost->alu.logical;
6047 /* In both cases we want to cost both operands. */
6048 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6049 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6051 return true;
6054 return false;
6056 case NOT:
6057 /* MVN. */
6058 if (speed)
6059 *cost += extra_cost->alu.logical;
6061 /* The logical instruction could have the shifted register form,
6062 but the cost is the same if the shift is processed as a separate
6063 instruction, so we don't bother with it here. */
6064 return false;
6066 case ZERO_EXTEND:
6068 op0 = XEXP (x, 0);
6069 /* If a value is written in SI mode, then zero extended to DI
6070 mode, the operation will in general be free as a write to
6071 a 'w' register implicitly zeroes the upper bits of an 'x'
6072 register. However, if this is
6074 (set (reg) (zero_extend (reg)))
6076 we must cost the explicit register move. */
6077 if (mode == DImode
6078 && GET_MODE (op0) == SImode
6079 && outer == SET)
6081 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6083 if (!op_cost && speed)
6084 /* MOV. */
6085 *cost += extra_cost->alu.extend;
6086 else
6087 /* Free, the cost is that of the SI mode operation. */
6088 *cost = op_cost;
6090 return true;
6092 else if (MEM_P (XEXP (x, 0)))
6094 /* All loads can zero extend to any size for free. */
6095 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6096 return true;
6099 /* UXTB/UXTH. */
6100 if (speed)
6101 *cost += extra_cost->alu.extend;
6103 return false;
6105 case SIGN_EXTEND:
6106 if (MEM_P (XEXP (x, 0)))
6108 /* LDRSH. */
6109 if (speed)
6111 rtx address = XEXP (XEXP (x, 0), 0);
6112 *cost += extra_cost->ldst.load_sign_extend;
6114 *cost +=
6115 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6116 0, speed));
6118 return true;
6121 if (speed)
6122 *cost += extra_cost->alu.extend;
6123 return false;
6125 case ASHIFT:
6126 op0 = XEXP (x, 0);
6127 op1 = XEXP (x, 1);
6129 if (CONST_INT_P (op1))
6131 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6132 aliases. */
6133 if (speed)
6134 *cost += extra_cost->alu.shift;
6136 /* We can incorporate zero/sign extend for free. */
6137 if (GET_CODE (op0) == ZERO_EXTEND
6138 || GET_CODE (op0) == SIGN_EXTEND)
6139 op0 = XEXP (op0, 0);
6141 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6142 return true;
6144 else
6146 /* LSLV. */
6147 if (speed)
6148 *cost += extra_cost->alu.shift_reg;
6150 return false; /* All arguments need to be in registers. */
6153 case ROTATE:
6154 case ROTATERT:
6155 case LSHIFTRT:
6156 case ASHIFTRT:
6157 op0 = XEXP (x, 0);
6158 op1 = XEXP (x, 1);
6160 if (CONST_INT_P (op1))
6162 /* ASR (immediate) and friends. */
6163 if (speed)
6164 *cost += extra_cost->alu.shift;
6166 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6167 return true;
6169 else
6172 /* ASR (register) and friends. */
6173 if (speed)
6174 *cost += extra_cost->alu.shift_reg;
6176 return false; /* All arguments need to be in registers. */
6179 case SYMBOL_REF:
6181 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6183 /* LDR. */
6184 if (speed)
6185 *cost += extra_cost->ldst.load;
6187 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6188 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6190 /* ADRP, followed by ADD. */
6191 *cost += COSTS_N_INSNS (1);
6192 if (speed)
6193 *cost += 2 * extra_cost->alu.arith;
6195 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6196 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6198 /* ADR. */
6199 if (speed)
6200 *cost += extra_cost->alu.arith;
6203 if (flag_pic)
6205 /* One extra load instruction, after accessing the GOT. */
6206 *cost += COSTS_N_INSNS (1);
6207 if (speed)
6208 *cost += extra_cost->ldst.load;
6210 return true;
6212 case HIGH:
6213 case LO_SUM:
6214 /* ADRP/ADD (immediate). */
6215 if (speed)
6216 *cost += extra_cost->alu.arith;
6217 return true;
6219 case ZERO_EXTRACT:
6220 case SIGN_EXTRACT:
6221 /* UBFX/SBFX. */
6222 if (speed)
6223 *cost += extra_cost->alu.bfx;
6225 /* We can trust that the immediates used will be correct (there
6226 are no by-register forms), so we need only cost op0. */
6227 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6228 return true;
6230 case MULT:
6231 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6232 /* aarch64_rtx_mult_cost always handles recursion to its
6233 operands. */
6234 return true;
6236 case MOD:
6237 case UMOD:
6238 if (speed)
6240 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6241 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6242 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6243 else if (GET_MODE (x) == DFmode)
6244 *cost += (extra_cost->fp[1].mult
6245 + extra_cost->fp[1].div);
6246 else if (GET_MODE (x) == SFmode)
6247 *cost += (extra_cost->fp[0].mult
6248 + extra_cost->fp[0].div);
6250 return false; /* All arguments need to be in registers. */
6252 case DIV:
6253 case UDIV:
6254 case SQRT:
6255 if (speed)
6257 if (GET_MODE_CLASS (mode) == MODE_INT)
6258 /* There is no integer SQRT, so only DIV and UDIV can get
6259 here. */
6260 *cost += extra_cost->mult[mode == DImode].idiv;
6261 else
6262 *cost += extra_cost->fp[mode == DFmode].div;
6264 return false; /* All arguments need to be in registers. */
6266 case IF_THEN_ELSE:
6267 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6268 XEXP (x, 2), cost, speed);
6270 case EQ:
6271 case NE:
6272 case GT:
6273 case GTU:
6274 case LT:
6275 case LTU:
6276 case GE:
6277 case GEU:
6278 case LE:
6279 case LEU:
6281 return false; /* All arguments must be in registers. */
6283 case FMA:
6284 op0 = XEXP (x, 0);
6285 op1 = XEXP (x, 1);
6286 op2 = XEXP (x, 2);
6288 if (speed)
6289 *cost += extra_cost->fp[mode == DFmode].fma;
6291 /* FMSUB, FNMADD, and FNMSUB are free. */
6292 if (GET_CODE (op0) == NEG)
6293 op0 = XEXP (op0, 0);
6295 if (GET_CODE (op2) == NEG)
6296 op2 = XEXP (op2, 0);
6298 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6299 and the by-element operand as operand 0. */
6300 if (GET_CODE (op1) == NEG)
6301 op1 = XEXP (op1, 0);
6303 /* Catch vector-by-element operations. The by-element operand can
6304 either be (vec_duplicate (vec_select (x))) or just
6305 (vec_select (x)), depending on whether we are multiplying by
6306 a vector or a scalar.
6308 Canonicalization is not very good in these cases: FMA4 will put the
6309 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6310 if (GET_CODE (op0) == VEC_DUPLICATE)
6311 op0 = XEXP (op0, 0);
6312 else if (GET_CODE (op1) == VEC_DUPLICATE)
6313 op1 = XEXP (op1, 0);
6315 if (GET_CODE (op0) == VEC_SELECT)
6316 op0 = XEXP (op0, 0);
6317 else if (GET_CODE (op1) == VEC_SELECT)
6318 op1 = XEXP (op1, 0);
6320 /* If the remaining parameters are not registers,
6321 get the cost to put them into registers. */
6322 *cost += rtx_cost (op0, FMA, 0, speed);
6323 *cost += rtx_cost (op1, FMA, 1, speed);
6324 *cost += rtx_cost (op2, FMA, 2, speed);
6325 return true;
6327 case FLOAT_EXTEND:
6328 if (speed)
6329 *cost += extra_cost->fp[mode == DFmode].widen;
6330 return false;
6332 case FLOAT_TRUNCATE:
6333 if (speed)
6334 *cost += extra_cost->fp[mode == DFmode].narrow;
6335 return false;
6337 case FIX:
6338 case UNSIGNED_FIX:
6339 x = XEXP (x, 0);
6340 /* Strip the rounding part. They will all be implemented
6341 by the fcvt* family of instructions anyway. */
6342 if (GET_CODE (x) == UNSPEC)
6344 unsigned int uns_code = XINT (x, 1);
6346 if (uns_code == UNSPEC_FRINTA
6347 || uns_code == UNSPEC_FRINTM
6348 || uns_code == UNSPEC_FRINTN
6349 || uns_code == UNSPEC_FRINTP
6350 || uns_code == UNSPEC_FRINTZ)
6351 x = XVECEXP (x, 0, 0);
6354 if (speed)
6355 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6357 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6358 return true;
6360 case ABS:
6361 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6363 /* FABS and FNEG are analogous. */
6364 if (speed)
6365 *cost += extra_cost->fp[mode == DFmode].neg;
6367 else
6369 /* Integer ABS will either be split into
6370 two arithmetic instructions, or will be an ABS
6371 (scalar), which we don't model. */
6372 *cost = COSTS_N_INSNS (2);
6373 if (speed)
6374 *cost += 2 * extra_cost->alu.arith;
6376 return false;
6378 case SMAX:
6379 case SMIN:
6380 if (speed)
6382 /* FMAXNM/FMINNM/FMAX/FMIN.
6383 TODO: This may not be accurate for all implementations, but
6384 we do not model this in the cost tables. */
6385 *cost += extra_cost->fp[mode == DFmode].addsub;
6387 return false;
6389 case UNSPEC:
6390 /* The floating point round to integer frint* instructions. */
6391 if (aarch64_frint_unspec_p (XINT (x, 1)))
6393 if (speed)
6394 *cost += extra_cost->fp[mode == DFmode].roundint;
6396 return false;
6399 if (XINT (x, 1) == UNSPEC_RBIT)
6401 if (speed)
6402 *cost += extra_cost->alu.rev;
6404 return false;
6406 break;
6408 case TRUNCATE:
6410 /* Decompose <su>muldi3_highpart. */
6411 if (/* (truncate:DI */
6412 mode == DImode
6413 /* (lshiftrt:TI */
6414 && GET_MODE (XEXP (x, 0)) == TImode
6415 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6416 /* (mult:TI */
6417 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6418 /* (ANY_EXTEND:TI (reg:DI))
6419 (ANY_EXTEND:TI (reg:DI))) */
6420 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6421 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6422 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6423 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6424 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6425 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6426 /* (const_int 64) */
6427 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6428 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6430 /* UMULH/SMULH. */
6431 if (speed)
6432 *cost += extra_cost->mult[mode == DImode].extend;
6433 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6434 MULT, 0, speed);
6435 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6436 MULT, 1, speed);
6437 return true;
6440 /* Fall through. */
6441 default:
6442 break;
6445 if (dump_file && (dump_flags & TDF_DETAILS))
6446 fprintf (dump_file,
6447 "\nFailed to cost RTX. Assuming default cost.\n");
6449 return true;
6452 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
6453 calculated for X. This cost is stored in *COST. Returns true
6454 if the total cost of X was calculated. */
6455 static bool
6456 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6457 int param, int *cost, bool speed)
6459 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6461 if (dump_file && (dump_flags & TDF_DETAILS))
6463 print_rtl_single (dump_file, x);
6464 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6465 speed ? "Hot" : "Cold",
6466 *cost, result ? "final" : "partial");
6469 return result;
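/* Illustrative example (editorial addition, not in the original source):
   with RTL dumps enabled (-fdump-rtl-<pass>-details) the wrapper above
   prints the RTX being costed followed by a line such as

       Hot cost: 8 (final)

   where "Hot"/"Cold" mirrors the SPEED argument and "final"/"partial"
   says whether aarch64_rtx_costs costed the whole expression.  The
   value 8 is only an example.  */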
6472 static int
6473 aarch64_register_move_cost (machine_mode mode,
6474 reg_class_t from_i, reg_class_t to_i)
6476 enum reg_class from = (enum reg_class) from_i;
6477 enum reg_class to = (enum reg_class) to_i;
6478 const struct cpu_regmove_cost *regmove_cost
6479 = aarch64_tune_params->regmove_cost;
6481 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6482 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6483 to = GENERAL_REGS;
6485 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6486 from = GENERAL_REGS;
6488 /* Moving between GPRs and the stack costs the same as GP2GP. */
6489 if ((from == GENERAL_REGS && to == STACK_REG)
6490 || (to == GENERAL_REGS && from == STACK_REG))
6491 return regmove_cost->GP2GP;
6493 /* To/From the stack register, we move via the gprs. */
6494 if (to == STACK_REG || from == STACK_REG)
6495 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6496 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6498 if (GET_MODE_SIZE (mode) == 16)
6500 /* 128-bit operations on general registers require 2 instructions. */
6501 if (from == GENERAL_REGS && to == GENERAL_REGS)
6502 return regmove_cost->GP2GP * 2;
6503 else if (from == GENERAL_REGS)
6504 return regmove_cost->GP2FP * 2;
6505 else if (to == GENERAL_REGS)
6506 return regmove_cost->FP2GP * 2;
6508 /* When AdvSIMD instructions are disabled it is not possible to move
6509 a 128-bit value directly between Q registers. This is handled in
6510 secondary reload. A general register is used as a scratch to move
6511 the upper DI value and the lower DI value is moved directly,
6512 hence the cost is the sum of three moves. */
6513 if (! TARGET_SIMD)
6514 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6516 return regmove_cost->FP2FP;
6519 if (from == GENERAL_REGS && to == GENERAL_REGS)
6520 return regmove_cost->GP2GP;
6521 else if (from == GENERAL_REGS)
6522 return regmove_cost->GP2FP;
6523 else if (to == GENERAL_REGS)
6524 return regmove_cost->FP2GP;
6526 return regmove_cost->FP2FP;
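/* Illustrative example (editorial addition, not in the original source):
   with hypothetical tuning costs GP2GP = 1, GP2FP = 5, FP2GP = 5 and
   FP2FP = 2, a TImode (16-byte) move would be costed by the hook above as

     GENERAL_REGS -> GENERAL_REGS             1 * 2 = 2
     GENERAL_REGS -> FP_REGS                  5 * 2 = 10
     FP_REGS -> FP_REGS, TARGET_SIMD          2
     FP_REGS -> FP_REGS, !TARGET_SIMD         5 + 5 + 2 = 12
       (the upper 64 bits bounce through a general register)

   The numbers are invented for illustration; real values come from the
   selected cpu_regmove_cost table.  */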
6529 static int
6530 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6531 reg_class_t rclass ATTRIBUTE_UNUSED,
6532 bool in ATTRIBUTE_UNUSED)
6534 return aarch64_tune_params->memmov_cost;
6537 /* Return the number of instructions that can be issued per cycle. */
6538 static int
6539 aarch64_sched_issue_rate (void)
6541 return aarch64_tune_params->issue_rate;
6544 static int
6545 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6547 int issue_rate = aarch64_sched_issue_rate ();
6549 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6552 /* Vectorizer cost model target hooks. */
6554 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6555 static int
6556 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6557 tree vectype,
6558 int misalign ATTRIBUTE_UNUSED)
6560 unsigned elements;
6562 switch (type_of_cost)
6564 case scalar_stmt:
6565 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6567 case scalar_load:
6568 return aarch64_tune_params->vec_costs->scalar_load_cost;
6570 case scalar_store:
6571 return aarch64_tune_params->vec_costs->scalar_store_cost;
6573 case vector_stmt:
6574 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6576 case vector_load:
6577 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6579 case vector_store:
6580 return aarch64_tune_params->vec_costs->vec_store_cost;
6582 case vec_to_scalar:
6583 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6585 case scalar_to_vec:
6586 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6588 case unaligned_load:
6589 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6591 case unaligned_store:
6592 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6594 case cond_branch_taken:
6595 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6597 case cond_branch_not_taken:
6598 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6600 case vec_perm:
6601 case vec_promote_demote:
6602 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6604 case vec_construct:
6605 elements = TYPE_VECTOR_SUBPARTS (vectype);
6606 return elements / 2 + 1;
6608 default:
6609 gcc_unreachable ();
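/* Illustrative example (editorial addition, not in the original source):
   vec_construct is the one case above whose cost is derived from the
   number of lanes rather than from the tuning tables.  A V4SF constructor
   (4 elements) is costed as 4 / 2 + 1 = 3, a V16QI constructor
   (16 elements) as 16 / 2 + 1 = 9.  */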
6613 /* Implement targetm.vectorize.add_stmt_cost. */
6614 static unsigned
6615 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6616 struct _stmt_vec_info *stmt_info, int misalign,
6617 enum vect_cost_model_location where)
6619 unsigned *cost = (unsigned *) data;
6620 unsigned retval = 0;
6622 if (flag_vect_cost_model)
6624 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6625 int stmt_cost =
6626 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6628 /* Statements in an inner loop relative to the loop being
6629 vectorized are weighted more heavily. The value here is
6630 a function (linear for now) of the loop nest level. */
6631 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6633 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6634 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6635 unsigned nest_level = loop_depth (loop);
6637 count *= nest_level;
6640 retval = (unsigned) (count * stmt_cost);
6641 cost[where] += retval;
6644 return retval;
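/* Illustrative example (editorial addition, not in the original source):
   if a vector statement with a base cost of 1 and COUNT of 1 sits in an
   inner loop at loop_depth 2, the hook above records 1 * 2 * 1 = 2 in the
   vect_body bucket, so statements in deeper loop nests weigh
   proportionally more.  */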
6647 static void initialize_aarch64_code_model (void);
6649 /* Parse the architecture extension string. */
6651 static void
6652 aarch64_parse_extension (char *str)
6654 /* The extension string is parsed left to right. */
6655 const struct aarch64_option_extension *opt = NULL;
6657 /* Flag to say whether we are adding or removing an extension. */
6658 int adding_ext = -1;
6660 while (str != NULL && *str != 0)
6662 char *ext;
6663 size_t len;
6665 str++;
6666 ext = strchr (str, '+');
6668 if (ext != NULL)
6669 len = ext - str;
6670 else
6671 len = strlen (str);
6673 if (len >= 2 && strncmp (str, "no", 2) == 0)
6675 adding_ext = 0;
6676 len -= 2;
6677 str += 2;
6679 else if (len > 0)
6680 adding_ext = 1;
6682 if (len == 0)
6684 error ("missing feature modifier after %qs", adding_ext ? "+"
6685 : "+no");
6686 return;
6689 /* Scan over the extensions table trying to find an exact match. */
6690 for (opt = all_extensions; opt->name != NULL; opt++)
6692 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6694 /* Add or remove the extension. */
6695 if (adding_ext)
6696 aarch64_isa_flags |= opt->flags_on;
6697 else
6698 aarch64_isa_flags &= ~(opt->flags_off);
6699 break;
6703 if (opt->name == NULL)
6705 /* Extension not found in list. */
6706 error ("unknown feature modifier %qs", str);
6707 return;
6710 str = ext;
6713 return;
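/* Illustrative example (editorial addition, not in the original source):
   for "-march=armv8-a+crc+nocrypto", aarch64_parse_arch below matches
   "armv8-a" and hands "+crc+nocrypto" to this function, which in effect
   performs

     aarch64_isa_flags |= <flags_on for "crc">;
     aarch64_isa_flags &= ~<flags_off for "crypto">;

   The valid modifier names and their flag masks come from
   aarch64-option-extensions.def; "crc" and "crypto" are used here purely
   as examples.  */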
6716 /* Parse the ARCH string. */
6718 static void
6719 aarch64_parse_arch (void)
6721 char *ext;
6722 const struct processor *arch;
6723 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6724 size_t len;
6726 strcpy (str, aarch64_arch_string);
6728 ext = strchr (str, '+');
6730 if (ext != NULL)
6731 len = ext - str;
6732 else
6733 len = strlen (str);
6735 if (len == 0)
6737 error ("missing arch name in -march=%qs", str);
6738 return;
6741 /* Loop through the list of supported ARCHs to find a match. */
6742 for (arch = all_architectures; arch->name != NULL; arch++)
6744 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6746 selected_arch = arch;
6747 aarch64_isa_flags = selected_arch->flags;
6749 if (!selected_cpu)
6750 selected_cpu = &all_cores[selected_arch->core];
6752 if (ext != NULL)
6754 /* ARCH string contains at least one extension. */
6755 aarch64_parse_extension (ext);
6758 if (strcmp (selected_arch->arch, selected_cpu->arch))
6760 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6761 selected_cpu->name, selected_arch->name);
6764 return;
6768 /* ARCH name not found in list. */
6769 error ("unknown value %qs for -march", str);
6770 return;
6773 /* Parse the CPU string. */
6775 static void
6776 aarch64_parse_cpu (void)
6778 char *ext;
6779 const struct processor *cpu;
6780 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6781 size_t len;
6783 strcpy (str, aarch64_cpu_string);
6785 ext = strchr (str, '+');
6787 if (ext != NULL)
6788 len = ext - str;
6789 else
6790 len = strlen (str);
6792 if (len == 0)
6794 error ("missing cpu name in -mcpu=%qs", str);
6795 return;
6798 /* Loop through the list of supported CPUs to find a match. */
6799 for (cpu = all_cores; cpu->name != NULL; cpu++)
6801 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6803 selected_cpu = cpu;
6804 aarch64_isa_flags = selected_cpu->flags;
6806 if (ext != NULL)
6808 /* CPU string contains at least one extension. */
6809 aarch64_parse_extension (ext);
6812 return;
6816 /* CPU name not found in list. */
6817 error ("unknown value %qs for -mcpu", str);
6818 return;
6821 /* Parse the TUNE string. */
6823 static void
6824 aarch64_parse_tune (void)
6826 const struct processor *cpu;
6827 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6828 strcpy (str, aarch64_tune_string);
6830 /* Loop through the list of supported CPUs to find a match. */
6831 for (cpu = all_cores; cpu->name != NULL; cpu++)
6833 if (strcmp (cpu->name, str) == 0)
6835 selected_tune = cpu;
6836 return;
6840 /* CPU name not found in list. */
6841 error ("unknown value %qs for -mtune", str);
6842 return;
6846 /* Implement TARGET_OPTION_OVERRIDE. */
6848 static void
6849 aarch64_override_options (void)
6851 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6852 If either of -march or -mtune is given, they override their
6853 respective component of -mcpu.
6855 So, first parse AARCH64_CPU_STRING, then the others; be careful
6856 with -march since, if -mcpu is not present on the command line,
6857 -march must set a sensible default CPU. */
6858 if (aarch64_cpu_string)
6860 aarch64_parse_cpu ();
6863 if (aarch64_arch_string)
6865 aarch64_parse_arch ();
6868 if (aarch64_tune_string)
6870 aarch64_parse_tune ();
6873 #ifndef HAVE_AS_MABI_OPTION
6874 /* The compiler may have been configured with 2.23.* binutils, which does
6875 not have support for ILP32. */
6876 if (TARGET_ILP32)
6877 error ("Assembler does not support -mabi=ilp32");
6878 #endif
6880 initialize_aarch64_code_model ();
6882 aarch64_build_bitmask_table ();
6884 /* This target defaults to strict volatile bitfields. */
6885 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6886 flag_strict_volatile_bitfields = 1;
6888 /* If the user did not specify a processor, choose the default
6889 one for them. This will be the CPU set during configuration using
6890 --with-cpu, otherwise it is "generic". */
6891 if (!selected_cpu)
6893 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6894 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6897 gcc_assert (selected_cpu);
6899 if (!selected_tune)
6900 selected_tune = selected_cpu;
6902 aarch64_tune_flags = selected_tune->flags;
6903 aarch64_tune = selected_tune->core;
6904 aarch64_tune_params = selected_tune->tune;
6905 aarch64_architecture_version = selected_cpu->architecture_version;
6907 if (aarch64_fix_a53_err835769 == 2)
6909 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6910 aarch64_fix_a53_err835769 = 1;
6911 #else
6912 aarch64_fix_a53_err835769 = 0;
6913 #endif
6916 /* If not optimizing for size, set the default
6917 alignment to what the target wants. */
6918 if (!optimize_size)
6920 if (align_loops <= 0)
6921 align_loops = aarch64_tune_params->loop_align;
6922 if (align_jumps <= 0)
6923 align_jumps = aarch64_tune_params->jump_align;
6924 if (align_functions <= 0)
6925 align_functions = aarch64_tune_params->function_align;
6928 aarch64_override_options_after_change ();
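/* Illustrative example (editorial addition, not in the original source):
   for "-mcpu=cortex-a57 -mtune=cortex-a53" the code above first derives
   the architecture and ISA flags from cortex-a57 via aarch64_parse_cpu,
   then lets aarch64_parse_tune replace only the tuning parameters with
   the cortex-a53 ones.  An explicit -march would likewise override just
   the architecture component implied by -mcpu.  */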
6931 /* Implement targetm.override_options_after_change. */
6933 static void
6934 aarch64_override_options_after_change (void)
6936 if (flag_omit_frame_pointer)
6937 flag_omit_leaf_frame_pointer = false;
6938 else if (flag_omit_leaf_frame_pointer)
6939 flag_omit_frame_pointer = true;
6942 static struct machine_function *
6943 aarch64_init_machine_status (void)
6945 struct machine_function *machine;
6946 machine = ggc_cleared_alloc<machine_function> ();
6947 return machine;
6950 void
6951 aarch64_init_expanders (void)
6953 init_machine_status = aarch64_init_machine_status;
6956 /* A checking mechanism for the implementation of the various code models. */
6957 static void
6958 initialize_aarch64_code_model (void)
6960 if (flag_pic)
6962 switch (aarch64_cmodel_var)
6964 case AARCH64_CMODEL_TINY:
6965 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6966 break;
6967 case AARCH64_CMODEL_SMALL:
6968 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6969 break;
6970 case AARCH64_CMODEL_LARGE:
6971 sorry ("code model %qs with -f%s", "large",
6972 flag_pic > 1 ? "PIC" : "pic");
6973 default:
6974 gcc_unreachable ();
6977 else
6978 aarch64_cmodel = aarch64_cmodel_var;
6981 /* Return true if SYMBOL_REF X binds locally. */
6983 static bool
6984 aarch64_symbol_binds_local_p (const_rtx x)
6986 return (SYMBOL_REF_DECL (x)
6987 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6988 : SYMBOL_REF_LOCAL_P (x));
6991 /* Return true if SYMBOL_REF X is thread local. */
6992 static bool
6993 aarch64_tls_symbol_p (rtx x)
6995 if (! TARGET_HAVE_TLS)
6996 return false;
6998 if (GET_CODE (x) != SYMBOL_REF)
6999 return false;
7001 return SYMBOL_REF_TLS_MODEL (x) != 0;
7004 /* Classify a TLS symbol into one of the TLS kinds. */
7005 enum aarch64_symbol_type
7006 aarch64_classify_tls_symbol (rtx x)
7008 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7010 switch (tls_kind)
7012 case TLS_MODEL_GLOBAL_DYNAMIC:
7013 case TLS_MODEL_LOCAL_DYNAMIC:
7014 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7016 case TLS_MODEL_INITIAL_EXEC:
7017 return SYMBOL_SMALL_GOTTPREL;
7019 case TLS_MODEL_LOCAL_EXEC:
7020 return SYMBOL_SMALL_TPREL;
7022 case TLS_MODEL_EMULATED:
7023 case TLS_MODEL_NONE:
7024 return SYMBOL_FORCE_TO_MEM;
7026 default:
7027 gcc_unreachable ();
7031 /* Return the method that should be used to access SYMBOL_REF or
7032 LABEL_REF X in context CONTEXT. */
7034 enum aarch64_symbol_type
7035 aarch64_classify_symbol (rtx x, rtx offset,
7036 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7038 if (GET_CODE (x) == LABEL_REF)
7040 switch (aarch64_cmodel)
7042 case AARCH64_CMODEL_LARGE:
7043 return SYMBOL_FORCE_TO_MEM;
7045 case AARCH64_CMODEL_TINY_PIC:
7046 case AARCH64_CMODEL_TINY:
7047 return SYMBOL_TINY_ABSOLUTE;
7049 case AARCH64_CMODEL_SMALL_PIC:
7050 case AARCH64_CMODEL_SMALL:
7051 return SYMBOL_SMALL_ABSOLUTE;
7053 default:
7054 gcc_unreachable ();
7058 if (GET_CODE (x) == SYMBOL_REF)
7060 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7061 return SYMBOL_FORCE_TO_MEM;
7063 if (aarch64_tls_symbol_p (x))
7064 return aarch64_classify_tls_symbol (x);
7066 switch (aarch64_cmodel)
7068 case AARCH64_CMODEL_TINY:
7069 /* When we retrieve symbol + offset address, we have to make sure
7070 the offset does not cause overflow of the final address. But
7071 we have no way of knowing the address of the symbol at compile time,
7072 so we can't accurately say if the distance between the PC and
7073 symbol + offset is outside the addressable range of +/-1M in the
7074 TINY code model. So we rely on images not being greater than
7075 1M, cap the offset at 1M, and anything beyond 1M will have to
7076 be loaded using an alternative mechanism. */
7077 if (SYMBOL_REF_WEAK (x)
7078 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7079 return SYMBOL_FORCE_TO_MEM;
7080 return SYMBOL_TINY_ABSOLUTE;
7082 case AARCH64_CMODEL_SMALL:
7083 /* Same reasoning as the tiny code model, but the offset cap here is
7084 4G. */
7085 if (SYMBOL_REF_WEAK (x)
7086 || INTVAL (offset) < (HOST_WIDE_INT) -4294967263
7087 || INTVAL (offset) > (HOST_WIDE_INT) 4294967264)
7088 return SYMBOL_FORCE_TO_MEM;
7089 return SYMBOL_SMALL_ABSOLUTE;
7091 case AARCH64_CMODEL_TINY_PIC:
7092 if (!aarch64_symbol_binds_local_p (x))
7093 return SYMBOL_TINY_GOT;
7094 return SYMBOL_TINY_ABSOLUTE;
7096 case AARCH64_CMODEL_SMALL_PIC:
7097 if (!aarch64_symbol_binds_local_p (x))
7098 return SYMBOL_SMALL_GOT;
7099 return SYMBOL_SMALL_ABSOLUTE;
7101 default:
7102 gcc_unreachable ();
7106 /* By default push everything into the constant pool. */
7107 return SYMBOL_FORCE_TO_MEM;
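/* Illustrative example (editorial addition, not in the original source):
   under -mcmodel=tiny, a reference such as "array + 0x200000" (offset 2M,
   beyond the +/-1M cap discussed above) is classified as
   SYMBOL_FORCE_TO_MEM and its address is loaded from the literal pool,
   whereas "array + 16" stays SYMBOL_TINY_ABSOLUTE and can be formed with
   a single ADR.  The symbol name is hypothetical.  */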
7110 bool
7111 aarch64_constant_address_p (rtx x)
7113 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7116 bool
7117 aarch64_legitimate_pic_operand_p (rtx x)
7119 if (GET_CODE (x) == SYMBOL_REF
7120 || (GET_CODE (x) == CONST
7121 && GET_CODE (XEXP (x, 0)) == PLUS
7122 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7123 return false;
7125 return true;
7128 /* Return true if X holds either a quarter-precision or
7129 floating-point +0.0 constant. */
7130 static bool
7131 aarch64_valid_floating_const (machine_mode mode, rtx x)
7133 if (!CONST_DOUBLE_P (x))
7134 return false;
7136 /* TODO: We could handle moving 0.0 to a TFmode register,
7137 but first we would like to refactor the movtf_aarch64
7138 pattern to be more amenable to splitting moves properly and
7139 correctly gating on TARGET_SIMD. For now, reject all
7140 constants that are not destined for SFmode or DFmode registers. */
7141 if (!(mode == SFmode || mode == DFmode))
7142 return false;
7144 if (aarch64_float_const_zero_rtx_p (x))
7145 return true;
7146 return aarch64_float_const_representable_p (x);
7149 static bool
7150 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7152 /* Do not allow vector struct mode constants. We could support
7153 0 and -1 easily, but they need support in aarch64-simd.md. */
7154 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7155 return false;
7157 /* This could probably go away because
7158 we now decompose CONST_INTs according to expand_mov_immediate. */
7159 if ((GET_CODE (x) == CONST_VECTOR
7160 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7161 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7162 return !targetm.cannot_force_const_mem (mode, x);
7164 if (GET_CODE (x) == HIGH
7165 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7166 return true;
7168 return aarch64_constant_address_p (x);
7171 rtx
7172 aarch64_load_tp (rtx target)
7174 if (!target
7175 || GET_MODE (target) != Pmode
7176 || !register_operand (target, Pmode))
7177 target = gen_reg_rtx (Pmode);
7179 /* Can return in any reg. */
7180 emit_insn (gen_aarch64_load_tp_hard (target));
7181 return target;
7184 /* On AAPCS systems, this is the "struct __va_list". */
7185 static GTY(()) tree va_list_type;
7187 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7188 Return the type to use as __builtin_va_list.
7190 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7192 struct __va_list
7194 void *__stack;
7195 void *__gr_top;
7196 void *__vr_top;
7197 int __gr_offs;
7198 int __vr_offs;
7199 }; */
7201 static tree
7202 aarch64_build_builtin_va_list (void)
7204 tree va_list_name;
7205 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7207 /* Create the type. */
7208 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7209 /* Give it the required name. */
7210 va_list_name = build_decl (BUILTINS_LOCATION,
7211 TYPE_DECL,
7212 get_identifier ("__va_list"),
7213 va_list_type);
7214 DECL_ARTIFICIAL (va_list_name) = 1;
7215 TYPE_NAME (va_list_type) = va_list_name;
7216 TYPE_STUB_DECL (va_list_type) = va_list_name;
7218 /* Create the fields. */
7219 f_stack = build_decl (BUILTINS_LOCATION,
7220 FIELD_DECL, get_identifier ("__stack"),
7221 ptr_type_node);
7222 f_grtop = build_decl (BUILTINS_LOCATION,
7223 FIELD_DECL, get_identifier ("__gr_top"),
7224 ptr_type_node);
7225 f_vrtop = build_decl (BUILTINS_LOCATION,
7226 FIELD_DECL, get_identifier ("__vr_top"),
7227 ptr_type_node);
7228 f_groff = build_decl (BUILTINS_LOCATION,
7229 FIELD_DECL, get_identifier ("__gr_offs"),
7230 integer_type_node);
7231 f_vroff = build_decl (BUILTINS_LOCATION,
7232 FIELD_DECL, get_identifier ("__vr_offs"),
7233 integer_type_node);
7235 DECL_ARTIFICIAL (f_stack) = 1;
7236 DECL_ARTIFICIAL (f_grtop) = 1;
7237 DECL_ARTIFICIAL (f_vrtop) = 1;
7238 DECL_ARTIFICIAL (f_groff) = 1;
7239 DECL_ARTIFICIAL (f_vroff) = 1;
7241 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7242 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7243 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7244 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7245 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7247 TYPE_FIELDS (va_list_type) = f_stack;
7248 DECL_CHAIN (f_stack) = f_grtop;
7249 DECL_CHAIN (f_grtop) = f_vrtop;
7250 DECL_CHAIN (f_vrtop) = f_groff;
7251 DECL_CHAIN (f_groff) = f_vroff;
7253 /* Compute its layout. */
7254 layout_type (va_list_type);
7256 return va_list_type;
7259 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7260 static void
7261 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7263 const CUMULATIVE_ARGS *cum;
7264 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7265 tree stack, grtop, vrtop, groff, vroff;
7266 tree t;
7267 int gr_save_area_size;
7268 int vr_save_area_size;
7269 int vr_offset;
7271 cum = &crtl->args.info;
7272 gr_save_area_size
7273 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7274 vr_save_area_size
7275 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7277 if (TARGET_GENERAL_REGS_ONLY)
7279 if (cum->aapcs_nvrn > 0)
7280 sorry ("%qs and floating point or vector arguments",
7281 "-mgeneral-regs-only");
7282 vr_save_area_size = 0;
7285 f_stack = TYPE_FIELDS (va_list_type_node);
7286 f_grtop = DECL_CHAIN (f_stack);
7287 f_vrtop = DECL_CHAIN (f_grtop);
7288 f_groff = DECL_CHAIN (f_vrtop);
7289 f_vroff = DECL_CHAIN (f_groff);
7291 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7292 NULL_TREE);
7293 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7294 NULL_TREE);
7295 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7296 NULL_TREE);
7297 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7298 NULL_TREE);
7299 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7300 NULL_TREE);
7302 /* Emit code to initialize STACK, which points to the next varargs stack
7303 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7304 by named arguments. STACK is 8-byte aligned. */
7305 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7306 if (cum->aapcs_stack_size > 0)
7307 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7308 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7309 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7311 /* Emit code to initialize GRTOP, the top of the GR save area.
7312 virtual_incoming_args_rtx should have been 16 byte aligned. */
7313 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7314 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7315 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7317 /* Emit code to initialize VRTOP, the top of the VR save area.
7318 This address is gr_save_area_bytes below GRTOP, rounded
7319 down to the next 16-byte boundary. */
7320 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7321 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7322 STACK_BOUNDARY / BITS_PER_UNIT);
7324 if (vr_offset)
7325 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7326 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7327 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7329 /* Emit code to initialize GROFF, the offset from GRTOP of the
7330 next GPR argument. */
7331 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7332 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7333 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7335 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7336 of the next VR argument. */
7337 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7338 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7339 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
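/* Illustrative example (editorial addition, not in the original source):
   for a variadic function "int f (int n, ...)" the single named argument
   occupies x0, so the code above initializes the va_list roughly as

     __stack   = start of the incoming stack argument area
     __gr_top  = top of the x1-x7 save area
     __vr_top  = top of the q0-q7 save area
     __gr_offs = -(8 - 1) * 8  = -56
     __vr_offs = -8 * 16       = -128

   using NUM_ARG_REGS = 8 general registers of 8 bytes each and
   NUM_FP_ARG_REGS = 8 vector registers of 16 bytes each.  */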
7342 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7344 static tree
7345 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7346 gimple_seq *post_p ATTRIBUTE_UNUSED)
7348 tree addr;
7349 bool indirect_p;
7350 bool is_ha; /* is HFA or HVA. */
7351 bool dw_align; /* double-word align. */
7352 machine_mode ag_mode = VOIDmode;
7353 int nregs;
7354 machine_mode mode;
7356 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7357 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7358 HOST_WIDE_INT size, rsize, adjust, align;
7359 tree t, u, cond1, cond2;
7361 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7362 if (indirect_p)
7363 type = build_pointer_type (type);
7365 mode = TYPE_MODE (type);
7367 f_stack = TYPE_FIELDS (va_list_type_node);
7368 f_grtop = DECL_CHAIN (f_stack);
7369 f_vrtop = DECL_CHAIN (f_grtop);
7370 f_groff = DECL_CHAIN (f_vrtop);
7371 f_vroff = DECL_CHAIN (f_groff);
7373 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7374 f_stack, NULL_TREE);
7375 size = int_size_in_bytes (type);
7376 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7378 dw_align = false;
7379 adjust = 0;
7380 if (aarch64_vfp_is_call_or_return_candidate (mode,
7381 type,
7382 &ag_mode,
7383 &nregs,
7384 &is_ha))
7386 /* TYPE passed in fp/simd registers. */
7387 if (TARGET_GENERAL_REGS_ONLY)
7388 sorry ("%qs and floating point or vector arguments",
7389 "-mgeneral-regs-only");
7391 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7392 unshare_expr (valist), f_vrtop, NULL_TREE);
7393 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7394 unshare_expr (valist), f_vroff, NULL_TREE);
7396 rsize = nregs * UNITS_PER_VREG;
7398 if (is_ha)
7400 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7401 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7403 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7404 && size < UNITS_PER_VREG)
7406 adjust = UNITS_PER_VREG - size;
7409 else
7411 /* TYPE passed in general registers. */
7412 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7413 unshare_expr (valist), f_grtop, NULL_TREE);
7414 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7415 unshare_expr (valist), f_groff, NULL_TREE);
7416 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7417 nregs = rsize / UNITS_PER_WORD;
7419 if (align > 8)
7420 dw_align = true;
7422 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7423 && size < UNITS_PER_WORD)
7425 adjust = UNITS_PER_WORD - size;
7429 /* Get a local temporary for the field value. */
7430 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7432 /* Emit code to branch if off >= 0. */
7433 t = build2 (GE_EXPR, boolean_type_node, off,
7434 build_int_cst (TREE_TYPE (off), 0));
7435 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7437 if (dw_align)
7439 /* Emit: offs = (offs + 15) & -16. */
7440 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7441 build_int_cst (TREE_TYPE (off), 15));
7442 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7443 build_int_cst (TREE_TYPE (off), -16));
7444 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7446 else
7447 roundup = NULL;
7449 /* Update ap.__[g|v]r_offs */
7450 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7451 build_int_cst (TREE_TYPE (off), rsize));
7452 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7454 /* String up. */
7455 if (roundup)
7456 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7458 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7459 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7460 build_int_cst (TREE_TYPE (f_off), 0));
7461 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7463 /* String up: make sure the assignment happens before the use. */
7464 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7465 COND_EXPR_ELSE (cond1) = t;
7467 /* Prepare the trees handling the argument that is passed on the stack;
7468 the top-level node will be stored in ON_STACK. */
7469 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7470 if (align > 8)
7472 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7473 t = fold_convert (intDI_type_node, arg);
7474 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7475 build_int_cst (TREE_TYPE (t), 15));
7476 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7477 build_int_cst (TREE_TYPE (t), -16));
7478 t = fold_convert (TREE_TYPE (arg), t);
7479 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7481 else
7482 roundup = NULL;
7483 /* Advance ap.__stack */
7484 t = fold_convert (intDI_type_node, arg);
7485 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7486 build_int_cst (TREE_TYPE (t), size + 7));
7487 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7488 build_int_cst (TREE_TYPE (t), -8));
7489 t = fold_convert (TREE_TYPE (arg), t);
7490 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7491 /* String up roundup and advance. */
7492 if (roundup)
7493 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7494 /* String up with arg */
7495 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7496 /* Big-endianness related address adjustment. */
7497 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7498 && size < UNITS_PER_WORD)
7500 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7501 size_int (UNITS_PER_WORD - size));
7502 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7505 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7506 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7508 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7509 t = off;
7510 if (adjust)
7511 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7512 build_int_cst (TREE_TYPE (off), adjust));
7514 t = fold_convert (sizetype, t);
7515 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7517 if (is_ha)
7519 /* type ha; // treat as "struct {ftype field[n];}"
7520 ... [computing offs]
7521 for (i = 0; i <nregs; ++i, offs += 16)
7522 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7523 return ha; */
7524 int i;
7525 tree tmp_ha, field_t, field_ptr_t;
7527 /* Declare a local variable. */
7528 tmp_ha = create_tmp_var_raw (type, "ha");
7529 gimple_add_tmp_var (tmp_ha);
7531 /* Establish the base type. */
7532 switch (ag_mode)
7534 case SFmode:
7535 field_t = float_type_node;
7536 field_ptr_t = float_ptr_type_node;
7537 break;
7538 case DFmode:
7539 field_t = double_type_node;
7540 field_ptr_t = double_ptr_type_node;
7541 break;
7542 case TFmode:
7543 field_t = long_double_type_node;
7544 field_ptr_t = long_double_ptr_type_node;
7545 break;
7546 /* The half precision and quad precision are not fully supported yet. Enable
7547 the following code after the support is complete. Need to find the correct
7548 type node for __fp16 *. */
7549 #if 0
7550 case HFmode:
7551 field_t = float_type_node;
7552 field_ptr_t = float_ptr_type_node;
7553 break;
7554 #endif
7555 case V2SImode:
7556 case V4SImode:
7558 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7559 field_t = build_vector_type_for_mode (innertype, ag_mode);
7560 field_ptr_t = build_pointer_type (field_t);
7562 break;
7563 default:
7564 gcc_assert (0);
7567 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7568 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7569 addr = t;
7570 t = fold_convert (field_ptr_t, addr);
7571 t = build2 (MODIFY_EXPR, field_t,
7572 build1 (INDIRECT_REF, field_t, tmp_ha),
7573 build1 (INDIRECT_REF, field_t, t));
7575 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7576 for (i = 1; i < nregs; ++i)
7578 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7579 u = fold_convert (field_ptr_t, addr);
7580 u = build2 (MODIFY_EXPR, field_t,
7581 build2 (MEM_REF, field_t, tmp_ha,
7582 build_int_cst (field_ptr_t,
7583 (i *
7584 int_size_in_bytes (field_t)))),
7585 build1 (INDIRECT_REF, field_t, u));
7586 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7589 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7590 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7593 COND_EXPR_ELSE (cond2) = t;
7594 addr = fold_convert (build_pointer_type (type), cond1);
7595 addr = build_va_arg_indirect_ref (addr);
7597 if (indirect_p)
7598 addr = build_va_arg_indirect_ref (addr);
7600 return addr;
7603 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7605 static void
7606 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7607 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7608 int no_rtl)
7610 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7611 CUMULATIVE_ARGS local_cum;
7612 int gr_saved, vr_saved;
7614 /* The caller has advanced CUM up to, but not beyond, the last named
7615 argument. Advance a local copy of CUM past the last "real" named
7616 argument, to find out how many registers are left over. */
7617 local_cum = *cum;
7618 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7620 /* Find out how many registers we need to save. */
7621 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7622 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7624 if (TARGET_GENERAL_REGS_ONLY)
7626 if (local_cum.aapcs_nvrn > 0)
7627 sorry ("%qs and floating point or vector arguments",
7628 "-mgeneral-regs-only");
7629 vr_saved = 0;
7632 if (!no_rtl)
7634 if (gr_saved > 0)
7636 rtx ptr, mem;
7638 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7639 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7640 - gr_saved * UNITS_PER_WORD);
7641 mem = gen_frame_mem (BLKmode, ptr);
7642 set_mem_alias_set (mem, get_varargs_alias_set ());
7644 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7645 mem, gr_saved);
7647 if (vr_saved > 0)
7649 /* We can't use move_block_from_reg, because it will use
7650 the wrong mode, storing D regs only. */
7651 machine_mode mode = TImode;
7652 int off, i;
7654 /* Set OFF to the offset from virtual_incoming_args_rtx of
7655 the first vector register. The VR save area lies below
7656 the GR one, and is aligned to 16 bytes. */
7657 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7658 STACK_BOUNDARY / BITS_PER_UNIT);
7659 off -= vr_saved * UNITS_PER_VREG;
7661 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7663 rtx ptr, mem;
7665 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7666 mem = gen_frame_mem (mode, ptr);
7667 set_mem_alias_set (mem, get_varargs_alias_set ());
7668 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7669 off += UNITS_PER_VREG;
7674 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7675 any complication of having crtl->args.pretend_args_size changed. */
7676 cfun->machine->frame.saved_varargs_size
7677 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7678 STACK_BOUNDARY / BITS_PER_UNIT)
7679 + vr_saved * UNITS_PER_VREG);
7682 static void
7683 aarch64_conditional_register_usage (void)
7685 int i;
7686 if (!TARGET_FLOAT)
7688 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7690 fixed_regs[i] = 1;
7691 call_used_regs[i] = 1;
7696 /* Walk down the type tree of TYPE counting consecutive base elements.
7697 If *MODEP is VOIDmode, then set it to the first valid floating point
7698 type. If a non-floating point type is found, or if a floating point
7699 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7700 otherwise return the count in the sub-tree. */
7701 static int
7702 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7704 machine_mode mode;
7705 HOST_WIDE_INT size;
7707 switch (TREE_CODE (type))
7709 case REAL_TYPE:
7710 mode = TYPE_MODE (type);
7711 if (mode != DFmode && mode != SFmode && mode != TFmode)
7712 return -1;
7714 if (*modep == VOIDmode)
7715 *modep = mode;
7717 if (*modep == mode)
7718 return 1;
7720 break;
7722 case COMPLEX_TYPE:
7723 mode = TYPE_MODE (TREE_TYPE (type));
7724 if (mode != DFmode && mode != SFmode && mode != TFmode)
7725 return -1;
7727 if (*modep == VOIDmode)
7728 *modep = mode;
7730 if (*modep == mode)
7731 return 2;
7733 break;
7735 case VECTOR_TYPE:
7736 /* Use V2SImode and V4SImode as representatives of all 64-bit
7737 and 128-bit vector types. */
7738 size = int_size_in_bytes (type);
7739 switch (size)
7741 case 8:
7742 mode = V2SImode;
7743 break;
7744 case 16:
7745 mode = V4SImode;
7746 break;
7747 default:
7748 return -1;
7751 if (*modep == VOIDmode)
7752 *modep = mode;
7754 /* Vector modes are considered to be opaque: two vectors are
7755 equivalent for the purposes of being homogeneous aggregates
7756 if they are the same size. */
7757 if (*modep == mode)
7758 return 1;
7760 break;
7762 case ARRAY_TYPE:
7764 int count;
7765 tree index = TYPE_DOMAIN (type);
7767 /* Can't handle incomplete types nor sizes that are not
7768 fixed. */
7769 if (!COMPLETE_TYPE_P (type)
7770 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7771 return -1;
7773 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7774 if (count == -1
7775 || !index
7776 || !TYPE_MAX_VALUE (index)
7777 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7778 || !TYPE_MIN_VALUE (index)
7779 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7780 || count < 0)
7781 return -1;
7783 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7784 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7786 /* There must be no padding. */
7787 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7788 return -1;
7790 return count;
7793 case RECORD_TYPE:
7795 int count = 0;
7796 int sub_count;
7797 tree field;
7799 /* Can't handle incomplete types nor sizes that are not
7800 fixed. */
7801 if (!COMPLETE_TYPE_P (type)
7802 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7803 return -1;
7805 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7807 if (TREE_CODE (field) != FIELD_DECL)
7808 continue;
7810 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7811 if (sub_count < 0)
7812 return -1;
7813 count += sub_count;
7816 /* There must be no padding. */
7817 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7818 return -1;
7820 return count;
7823 case UNION_TYPE:
7824 case QUAL_UNION_TYPE:
7826 /* These aren't very interesting except in a degenerate case. */
7827 int count = 0;
7828 int sub_count;
7829 tree field;
7831 /* Can't handle incomplete types nor sizes that are not
7832 fixed. */
7833 if (!COMPLETE_TYPE_P (type)
7834 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7835 return -1;
7837 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7839 if (TREE_CODE (field) != FIELD_DECL)
7840 continue;
7842 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7843 if (sub_count < 0)
7844 return -1;
7845 count = count > sub_count ? count : sub_count;
7848 /* There must be no padding. */
7849 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7850 return -1;
7852 return count;
7855 default:
7856 break;
7859 return -1;
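/* Illustrative example (editorial addition, not in the original source):
   the walk above recognises homogeneous aggregates such as

     struct rgb    { float r, g, b; }            (count 3, base mode SFmode)
     struct coords { double xy[2]; double z; }   (count 3, base mode DFmode)

   but rejects aggregates that mix base modes:

     struct mixed  { float f; double d; }        (returns -1)

   The struct names are hypothetical; only the counting rule is being
   illustrated.  */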
7862 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7863 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7864 array types. The C99 floating-point complex types are also considered
7865 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7866 types, which are GCC extensions and out of the scope of AAPCS64, are
7867 treated as composite types here as well.
7869 Note that MODE itself is not sufficient in determining whether a type
7870 is such a composite type or not. This is because
7871 stor-layout.c:compute_record_mode may have already changed the MODE
7872 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7873 structure with only one field may have its MODE set to the mode of the
7874 field. Also an integer mode whose size matches the size of the
7875 RECORD_TYPE type may be used to substitute the original mode
7876 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7877 solely relied on. */
7879 static bool
7880 aarch64_composite_type_p (const_tree type,
7881 machine_mode mode)
7883 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7884 return true;
7886 if (mode == BLKmode
7887 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7888 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7889 return true;
7891 return false;
7894 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7895 type as described in AAPCS64 \S 4.1.2.
7897 See the comment above aarch64_composite_type_p for the notes on MODE. */
7899 static bool
7900 aarch64_short_vector_p (const_tree type,
7901 machine_mode mode)
7903 HOST_WIDE_INT size = -1;
7905 if (type && TREE_CODE (type) == VECTOR_TYPE)
7906 size = int_size_in_bytes (type);
7907 else if (!aarch64_composite_type_p (type, mode)
7908 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7909 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7910 size = GET_MODE_SIZE (mode);
7912 return (size == 8 || size == 16);
7915 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7916 shall be passed or returned in simd/fp register(s) (providing these
7917 parameter passing registers are available).
7919 Upon successful return, *COUNT returns the number of needed registers,
7920 *BASE_MODE returns the mode of the individual register, and when IS_HA
7921 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7922 floating-point aggregate or a homogeneous short-vector aggregate. */
7924 static bool
7925 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7926 const_tree type,
7927 machine_mode *base_mode,
7928 int *count,
7929 bool *is_ha)
7931 machine_mode new_mode = VOIDmode;
7932 bool composite_p = aarch64_composite_type_p (type, mode);
7934 if (is_ha != NULL) *is_ha = false;
7936 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7937 || aarch64_short_vector_p (type, mode))
7939 *count = 1;
7940 new_mode = mode;
7942 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7944 if (is_ha != NULL) *is_ha = true;
7945 *count = 2;
7946 new_mode = GET_MODE_INNER (mode);
7948 else if (type && composite_p)
7950 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7952 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7954 if (is_ha != NULL) *is_ha = true;
7955 *count = ag_count;
7957 else
7958 return false;
7960 else
7961 return false;
7963 *base_mode = new_mode;
7964 return true;
7967 /* Implement TARGET_STRUCT_VALUE_RTX. */
7969 static rtx
7970 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7971 int incoming ATTRIBUTE_UNUSED)
7973 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7976 /* Implements target hook vector_mode_supported_p. */
7977 static bool
7978 aarch64_vector_mode_supported_p (machine_mode mode)
7980 if (TARGET_SIMD
7981 && (mode == V4SImode || mode == V8HImode
7982 || mode == V16QImode || mode == V2DImode
7983 || mode == V2SImode || mode == V4HImode
7984 || mode == V8QImode || mode == V2SFmode
7985 || mode == V4SFmode || mode == V2DFmode
7986 || mode == V1DFmode))
7987 return true;
7989 return false;
7992 /* Return appropriate SIMD container
7993 for MODE within a vector of WIDTH bits. */
7994 static machine_mode
7995 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7997 gcc_assert (width == 64 || width == 128);
7998 if (TARGET_SIMD)
8000 if (width == 128)
8001 switch (mode)
8003 case DFmode:
8004 return V2DFmode;
8005 case SFmode:
8006 return V4SFmode;
8007 case SImode:
8008 return V4SImode;
8009 case HImode:
8010 return V8HImode;
8011 case QImode:
8012 return V16QImode;
8013 case DImode:
8014 return V2DImode;
8015 default:
8016 break;
8018 else
8019 switch (mode)
8021 case SFmode:
8022 return V2SFmode;
8023 case SImode:
8024 return V2SImode;
8025 case HImode:
8026 return V4HImode;
8027 case QImode:
8028 return V8QImode;
8029 default:
8030 break;
8033 return word_mode;
8036 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8037 static machine_mode
8038 aarch64_preferred_simd_mode (machine_mode mode)
8040 return aarch64_simd_container_mode (mode, 128);
8043 /* Return the bitmask of possible vector sizes for the vectorizer
8044 to iterate over. */
8045 static unsigned int
8046 aarch64_autovectorize_vector_sizes (void)
8048 return (16 | 8);
8051 /* Implement TARGET_MANGLE_TYPE. */
8053 static const char *
8054 aarch64_mangle_type (const_tree type)
8056 /* The AArch64 ABI documents say that "__va_list" has to be
8057 mangled as if it is in the "std" namespace. */
8058 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8059 return "St9__va_list";
8061 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8062 builtin types. */
8063 if (TYPE_NAME (type) != NULL)
8064 return aarch64_mangle_builtin_type (type);
8066 /* Use the default mangling. */
8067 return NULL;
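/* Illustrative example (editorial addition, not in the original source):
   because of the rule above, a C++ function "void f (__builtin_va_list)"
   is mangled as "_Z1fSt9__va_list", exactly as if va_list were a class in
   namespace std, which keeps C++ ABIs for AArch64 compatible across
   implementations.  */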
8071 /* Return true if the rtx_insn contains a MEM RTX somewhere
8072 in it. */
8074 static bool
8075 has_memory_op (rtx_insn *mem_insn)
8077 subrtx_iterator::array_type array;
8078 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8079 if (MEM_P (*iter))
8080 return true;
8082 return false;
8085 /* Find the first rtx_insn before insn that will generate an assembly
8086 instruction. */
8088 static rtx_insn *
8089 aarch64_prev_real_insn (rtx_insn *insn)
8091 if (!insn)
8092 return NULL;
8094 do
8096 insn = prev_real_insn (insn);
8098 while (insn && recog_memoized (insn) < 0);
8100 return insn;
8103 static bool
8104 is_madd_op (enum attr_type t1)
8106 unsigned int i;
8107 /* A number of these may be AArch32 only. */
8108 enum attr_type mlatypes[] = {
8109 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8110 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8111 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8114 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8116 if (t1 == mlatypes[i])
8117 return true;
8120 return false;
8123 /* Check if there is a register dependency between a load and the insn
8124 for which we hold recog_data. */
8126 static bool
8127 dep_between_memop_and_curr (rtx memop)
8129 rtx load_reg;
8130 int opno;
8132 gcc_assert (GET_CODE (memop) == SET);
8134 if (!REG_P (SET_DEST (memop)))
8135 return false;
8137 load_reg = SET_DEST (memop);
8138 for (opno = 1; opno < recog_data.n_operands; opno++)
8140 rtx operand = recog_data.operand[opno];
8141 if (REG_P (operand)
8142 && reg_overlap_mentioned_p (load_reg, operand))
8143 return true;
8146 return false;
8150 /* When working around the Cortex-A53 erratum 835769,
8151 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8152 instruction and has a preceding memory instruction such that a NOP
8153 should be inserted between them. */
8155 bool
8156 aarch64_madd_needs_nop (rtx_insn* insn)
8158 enum attr_type attr_type;
8159 rtx_insn *prev;
8160 rtx body;
8162 if (!aarch64_fix_a53_err835769)
8163 return false;
8165 if (recog_memoized (insn) < 0)
8166 return false;
8168 attr_type = get_attr_type (insn);
8169 if (!is_madd_op (attr_type))
8170 return false;
8172 prev = aarch64_prev_real_insn (insn);
8173 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8174 Restore recog state to INSN to avoid state corruption. */
8175 extract_constrain_insn_cached (insn);
8177 if (!prev || !has_memory_op (prev))
8178 return false;
8180 body = single_set (prev);
8182 /* If the previous insn is a memory op and there is no dependency between
8183 it and the DImode madd, emit a NOP between them. If body is NULL then we
8184 have a complex memory operation, probably a load/store pair.
8185 Be conservative for now and emit a NOP. */
8186 if (GET_MODE (recog_data.operand[0]) == DImode
8187 && (!body || !dep_between_memop_and_curr (body)))
8188 return true;
8190 return false;
8195 /* Implement FINAL_PRESCAN_INSN. */
8197 void
8198 aarch64_final_prescan_insn (rtx_insn *insn)
8200 if (aarch64_madd_needs_nop (insn))
8201 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
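/* Illustrative example (editorial addition, not in the original source):
   with -mfix-cortex-a53-835769 the prescan above turns a sequence like

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   into

     ldr  x1, [x2]
     nop // between mem op and mult-accumulate
     madd x0, x3, x4, x5

   so the 64-bit multiply-accumulate no longer directly follows a memory
   operation, the pattern erratum 835769 is sensitive to.  The register
   choices are arbitrary.  */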
8205 /* Return the equivalent letter for size. */
8206 static char
8207 sizetochar (int size)
8209 switch (size)
8211 case 64: return 'd';
8212 case 32: return 's';
8213 case 16: return 'h';
8214 case 8 : return 'b';
8215 default: gcc_unreachable ();
8219 /* Return true iff x is a uniform vector of floating-point
8220 constants, and the constant can be represented in
8221 quarter-precision form. Note, as aarch64_float_const_representable
8222 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8223 static bool
8224 aarch64_vect_float_const_representable_p (rtx x)
8226 int i = 0;
8227 REAL_VALUE_TYPE r0, ri;
8228 rtx x0, xi;
8230 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8231 return false;
8233 x0 = CONST_VECTOR_ELT (x, 0);
8234 if (!CONST_DOUBLE_P (x0))
8235 return false;
8237 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8239 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8241 xi = CONST_VECTOR_ELT (x, i);
8242 if (!CONST_DOUBLE_P (xi))
8243 return false;
8245 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8246 if (!REAL_VALUES_EQUAL (r0, ri))
8247 return false;
8250 return aarch64_float_const_representable_p (x0);
8253 /* Return true for valid and false for invalid. */
8254 bool
8255 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8256 struct simd_immediate_info *info)
8258 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8259 matches = 1; \
8260 for (i = 0; i < idx; i += (STRIDE)) \
8261 if (!(TEST)) \
8262 matches = 0; \
8263 if (matches) \
8265 immtype = (CLASS); \
8266 elsize = (ELSIZE); \
8267 eshift = (SHIFT); \
8268 emvn = (NEG); \
8269 break; \
8272 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8273 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8274 unsigned char bytes[16];
8275 int immtype = -1, matches;
8276 unsigned int invmask = inverse ? 0xff : 0;
8277 int eshift, emvn;
8279 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8281 if (! (aarch64_simd_imm_zero_p (op, mode)
8282 || aarch64_vect_float_const_representable_p (op)))
8283 return false;
8285 if (info)
8287 info->value = CONST_VECTOR_ELT (op, 0);
8288 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8289 info->mvn = false;
8290 info->shift = 0;
8293 return true;
8296 /* Splat vector constant out into a byte vector. */
8297 for (i = 0; i < n_elts; i++)
8299 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8300 it must be laid out in the vector register in reverse order. */
8301 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8302 unsigned HOST_WIDE_INT elpart;
8303 unsigned int part, parts;
8305 if (CONST_INT_P (el))
8307 elpart = INTVAL (el);
8308 parts = 1;
8310 else if (GET_CODE (el) == CONST_DOUBLE)
8312 elpart = CONST_DOUBLE_LOW (el);
8313 parts = 2;
8315 else
8316 gcc_unreachable ();
8318 for (part = 0; part < parts; part++)
8320 unsigned int byte;
8321 for (byte = 0; byte < innersize; byte++)
8323 bytes[idx++] = (elpart & 0xff) ^ invmask;
8324 elpart >>= BITS_PER_UNIT;
8326 if (GET_CODE (el) == CONST_DOUBLE)
8327 elpart = CONST_DOUBLE_HIGH (el);
8331 /* Sanity check. */
8332 gcc_assert (idx == GET_MODE_SIZE (mode));
8334 do
8336 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8337 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8339 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8340 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8342 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8343 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8345 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8346 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8348 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8350 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8352 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8353 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8355 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8356 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8358 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8359 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8361 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8362 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8364 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8366 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8368 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8369 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8371 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8372 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8374 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8375 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8377 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8378 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8380 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8382 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8383 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8385 while (0);
8387 if (immtype == -1)
8388 return false;
8390 if (info)
8392 info->element_width = elsize;
8393 info->mvn = emvn != 0;
8394 info->shift = eshift;
8396 unsigned HOST_WIDE_INT imm = 0;
8398 if (immtype >= 12 && immtype <= 15)
8399 info->msl = true;
8401 /* Un-invert bytes of recognized vector, if necessary. */
8402 if (invmask != 0)
8403 for (i = 0; i < idx; i++)
8404 bytes[i] ^= invmask;
8406 if (immtype == 17)
8408 /* FIXME: Broken on 32-bit H_W_I hosts. */
8409 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8411 for (i = 0; i < 8; i++)
8412 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8413 << (i * BITS_PER_UNIT);
8416 info->value = GEN_INT (imm);
8418 else
8420 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8421 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8423 /* Construct 'abcdefgh' because the assembler cannot handle
8424 generic constants. */
8425 if (info->mvn)
8426 imm = ~imm;
8427 imm = (imm >> info->shift) & 0xff;
8428 info->value = GEN_INT (imm);
8432 return true;
8433 #undef CHECK
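/* Illustrative example (editorial addition, not in the original source):
   a V4SImode constant whose lanes are all 0x00ff0000 splits into the
   per-lane byte pattern { 00, 00, ff, 00 }, which matches the elsize-32,
   shift-16 CHECK case above; the resulting simd_immediate_info describes
   an operand that the backend can emit as something like

     movi  v0.4s, #0xff, lsl #16

   The register number is arbitrary; only the immediate encoding is being
   illustrated.  */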
8436 /* Check if immediate shift constants are within range. */
8437 bool
8438 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8440 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8441 if (left)
8442 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8443 else
8444 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8447 /* Return true if X is a uniform vector where all elements
8448 are either the floating-point constant 0.0 or the
8449 integer constant 0. */
8450 bool
8451 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8453 return x == CONST0_RTX (mode);
8456 bool
8457 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8459 HOST_WIDE_INT imm = INTVAL (x);
8460 int i;
8462 for (i = 0; i < 8; i++)
8464 unsigned int byte = imm & 0xff;
8465 if (byte != 0xff && byte != 0)
8466 return false;
8467 imm >>= 8;
8470 return true;
8473 bool
8474 aarch64_mov_operand_p (rtx x,
8475 enum aarch64_symbol_context context,
8476 machine_mode mode)
8478 if (GET_CODE (x) == HIGH
8479 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8480 return true;
8482 if (CONST_INT_P (x))
8483 return true;
8485 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8486 return true;
8488 return aarch64_classify_symbolic_expression (x, context)
8489 == SYMBOL_TINY_ABSOLUTE;
8492 /* Return a CONST_VECTOR with every element set to VAL. */
8493 rtx
8494 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8496 int nunits = GET_MODE_NUNITS (mode);
8497 rtvec v = rtvec_alloc (nunits);
8498 int i;
8500 for (i = 0; i < nunits; i++)
8501 RTVEC_ELT (v, i) = GEN_INT (val);
8503 return gen_rtx_CONST_VECTOR (mode, v);
8506 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8508 bool
8509 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8511 machine_mode vmode;
8513 gcc_assert (!VECTOR_MODE_P (mode));
8514 vmode = aarch64_preferred_simd_mode (mode);
8515 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8516 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8519 /* Construct and return a PARALLEL RTX vector with elements numbering the
8520 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8521 the vector - from the perspective of the architecture. This does not
8522 line up with GCC's perspective on lane numbers, so we end up with
8523 different masks depending on our target endian-ness. The diagram
8524 below may help. We must draw the distinction when building masks
8525 which select one half of the vector. An instruction selecting
8526 architectural low-lanes for a big-endian target must be described using
8527 a mask selecting GCC high-lanes.
8529 Big-Endian Little-Endian
8531 GCC 0 1 2 3 3 2 1 0
8532 | x | x | x | x | | x | x | x | x |
8533 Architecture 3 2 1 0 3 2 1 0
8535 Low Mask: { 2, 3 } { 0, 1 }
8536 High Mask: { 0, 1 } { 2, 3 }
8537 */
8539 rtx
8540 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8542 int nunits = GET_MODE_NUNITS (mode);
8543 rtvec v = rtvec_alloc (nunits / 2);
8544 int high_base = nunits / 2;
8545 int low_base = 0;
8546 int base;
8547 rtx t1;
8548 int i;
8550 if (BYTES_BIG_ENDIAN)
8551 base = high ? low_base : high_base;
8552 else
8553 base = high ? high_base : low_base;
8555 for (i = 0; i < nunits / 2; i++)
8556 RTVEC_ELT (v, i) = GEN_INT (base + i);
8558 t1 = gen_rtx_PARALLEL (mode, v);
8559 return t1;
8562 /* Check OP for validity as a PARALLEL RTX vector with elements
8563 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8564 from the perspective of the architecture. See the diagram above
8565 aarch64_simd_vect_par_cnst_half for more details. */
8567 bool
8568 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8569 bool high)
8571 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8572 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8573 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8574 int i = 0;
8576 if (!VECTOR_MODE_P (mode))
8577 return false;
8579 if (count_op != count_ideal)
8580 return false;
8582 for (i = 0; i < count_ideal; i++)
8584 rtx elt_op = XVECEXP (op, 0, i);
8585 rtx elt_ideal = XVECEXP (ideal, 0, i);
8587 if (!CONST_INT_P (elt_op)
8588 || INTVAL (elt_ideal) != INTVAL (elt_op))
8589 return false;
8591 return true;
8594 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8595 HIGH (exclusive). */
8596 void
8597 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8598 const_tree exp)
8600 HOST_WIDE_INT lane;
8601 gcc_assert (CONST_INT_P (operand));
8602 lane = INTVAL (operand);
8604 if (lane < low || lane >= high)
8606 if (exp)
8607 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8608 else
8609 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8613 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
8614 registers). */
8615 void
8616 aarch64_simd_emit_pair_result_insn (machine_mode mode,
8617 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8618 rtx op1)
8620 rtx mem = gen_rtx_MEM (mode, destaddr);
8621 rtx tmp1 = gen_reg_rtx (mode);
8622 rtx tmp2 = gen_reg_rtx (mode);
8624 emit_insn (intfn (tmp1, op1, tmp2));
8626 emit_move_insn (mem, tmp1);
8627 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8628 emit_move_insn (mem, tmp2);
8631 /* Return TRUE if OP is a valid vector addressing mode. */
8632 bool
8633 aarch64_simd_mem_operand_p (rtx op)
8635 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8636 || REG_P (XEXP (op, 0)));
8639 /* Emit a register copy from operand to operand, taking care not to
8640 early-clobber source registers in the process.
8642 COUNT is the number of components into which the copy needs to be
8643 decomposed. */
8644 void
8645 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8646 unsigned int count)
8648 unsigned int i;
8649 int rdest = REGNO (operands[0]);
8650 int rsrc = REGNO (operands[1]);
8652 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8653 || rdest < rsrc)
8654 for (i = 0; i < count; i++)
8655 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8656 gen_rtx_REG (mode, rsrc + i));
8657 else
8658 for (i = 0; i < count; i++)
8659 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8660 gen_rtx_REG (mode, rsrc + count - i - 1));
8663 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8664 one of the VSTRUCT modes: OI, CI or XI. */
8665 int
8666 aarch64_simd_attr_length_move (rtx_insn *insn)
8668 machine_mode mode;
8670 extract_insn_cached (insn);
8672 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8674 mode = GET_MODE (recog_data.operand[0]);
8675 switch (mode)
8677 case OImode:
8678 return 8;
8679 case CImode:
8680 return 12;
8681 case XImode:
8682 return 16;
8683 default:
8684 gcc_unreachable ();
8687 return 4;
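/* aarch64_simd_emit_reg_reg_move above splits a register-to-register
   VSTRUCT move into one 4-byte move per vector register, so the lengths
   are 8, 12 and 16 for the 2, 3 and 4 registers of OImode, CImode and
   XImode.  Any other alternative (a load or store of the whole structure)
   is a single instruction, hence the default of 4.  */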
8690 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8691 alignment of a vector to 128 bits. */
8692 static HOST_WIDE_INT
8693 aarch64_simd_vector_alignment (const_tree type)
8695 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8696 return MIN (align, 128);
8699 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8700 static bool
8701 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8703 if (is_packed)
8704 return false;
8706 /* We guarantee alignment for vectors up to 128-bits. */
8707 if (tree_int_cst_compare (TYPE_SIZE (type),
8708 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8709 return false;
8711 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8712 return true;
8715 /* If VALS is a vector constant that can be loaded into a register
8716 using DUP, generate instructions to do so and return an RTX to
8717 assign to the register. Otherwise return NULL_RTX. */
8718 static rtx
8719 aarch64_simd_dup_constant (rtx vals)
8721 machine_mode mode = GET_MODE (vals);
8722 machine_mode inner_mode = GET_MODE_INNER (mode);
8723 int n_elts = GET_MODE_NUNITS (mode);
8724 bool all_same = true;
8725 rtx x;
8726 int i;
8728 if (GET_CODE (vals) != CONST_VECTOR)
8729 return NULL_RTX;
8731 for (i = 1; i < n_elts; ++i)
8733 x = CONST_VECTOR_ELT (vals, i);
8734 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8735 all_same = false;
8738 if (!all_same)
8739 return NULL_RTX;
8741 /* We can load this constant by using DUP and a constant in a
8742 single ARM register. This will be cheaper than a vector
8743 load. */
8744 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8745 return gen_rtx_VEC_DUPLICATE (mode, x);
8749 /* Generate code to load VALS, which is a PARALLEL containing only
8750 constants (for vec_init) or CONST_VECTOR, efficiently into a
8751 register. Returns an RTX to copy into the register, or NULL_RTX
8752 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8753 static rtx
8754 aarch64_simd_make_constant (rtx vals)
8756 machine_mode mode = GET_MODE (vals);
8757 rtx const_dup;
8758 rtx const_vec = NULL_RTX;
8759 int n_elts = GET_MODE_NUNITS (mode);
8760 int n_const = 0;
8761 int i;
8763 if (GET_CODE (vals) == CONST_VECTOR)
8764 const_vec = vals;
8765 else if (GET_CODE (vals) == PARALLEL)
8767 /* A CONST_VECTOR must contain only CONST_INTs and
8768 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8769 Only store valid constants in a CONST_VECTOR. */
8770 for (i = 0; i < n_elts; ++i)
8772 rtx x = XVECEXP (vals, 0, i);
8773 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8774 n_const++;
8776 if (n_const == n_elts)
8777 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8779 else
8780 gcc_unreachable ();
8782 if (const_vec != NULL_RTX
8783 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8784 /* Load using MOVI/MVNI. */
8785 return const_vec;
8786 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8787 /* Loaded using DUP. */
8788 return const_dup;
8789 else if (const_vec != NULL_RTX)
8790 /* Load from constant pool. We cannot take advantage of single-cycle
8791 LD1 because we need a PC-relative addressing mode. */
8792 return const_vec;
8793 else
8794 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8795 We cannot construct an initializer. */
8796 return NULL_RTX;
8799 void
8800 aarch64_expand_vector_init (rtx target, rtx vals)
8802 machine_mode mode = GET_MODE (target);
8803 machine_mode inner_mode = GET_MODE_INNER (mode);
8804 int n_elts = GET_MODE_NUNITS (mode);
8805 int n_var = 0, one_var = -1;
8806 bool all_same = true;
8807 rtx x, mem;
8808 int i;
8810 x = XVECEXP (vals, 0, 0);
8811 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8812 n_var = 1, one_var = 0;
8814 for (i = 1; i < n_elts; ++i)
8816 x = XVECEXP (vals, 0, i);
8817 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8818 ++n_var, one_var = i;
8820 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8821 all_same = false;
8824 if (n_var == 0)
8826 rtx constant = aarch64_simd_make_constant (vals);
8827 if (constant != NULL_RTX)
8829 emit_move_insn (target, constant);
8830 return;
8834 /* Splat a single non-constant element if we can. */
8835 if (all_same)
8837 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8838 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8839 return;
8842 /* One field is non-constant. Load constant then overwrite varying
8843 field. This is more efficient than using the stack. */
8844 if (n_var == 1)
8846 rtx copy = copy_rtx (vals);
8847 rtx index = GEN_INT (one_var);
8848 enum insn_code icode;
8850 /* Load constant part of vector, substitute neighboring value for
8851 varying element. */
8852 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8853 aarch64_expand_vector_init (target, copy);
8855 /* Insert variable. */
8856 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8857 icode = optab_handler (vec_set_optab, mode);
8858 gcc_assert (icode != CODE_FOR_nothing);
8859 emit_insn (GEN_FCN (icode) (target, x, index));
8860 return;
8863 /* Construct the vector in memory one field at a time
8864 and load the whole vector. */
8865 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8866 for (i = 0; i < n_elts; i++)
8867 emit_move_insn (adjust_address_nv (mem, inner_mode,
8868 i * GET_MODE_SIZE (inner_mode)),
8869 XVECEXP (vals, 0, i));
8870 emit_move_insn (target, mem);
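/* Worked example: initializing a V4SI vector from { x, 1, 2, 3 } where
   only x is variable takes the n_var == 1 path above: the constant
   vector { 1, 1, 2, 3 } is built first (lane 0 temporarily borrows its
   neighbour's value), then x is inserted into lane 0 via the vec_set
   pattern.  A fully variable initializer falls back to the stack
   temporary loop at the end.  */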
8874 static unsigned HOST_WIDE_INT
8875 aarch64_shift_truncation_mask (machine_mode mode)
8877 return
8878 (aarch64_vector_mode_supported_p (mode)
8879 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8882 #ifndef TLS_SECTION_ASM_FLAG
8883 #define TLS_SECTION_ASM_FLAG 'T'
8884 #endif
8886 void
8887 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8888 tree decl ATTRIBUTE_UNUSED)
8890 char flagchars[10], *f = flagchars;
8892 /* If we have already declared this section, we can use an
8893 abbreviated form to switch back to it -- unless this section is
8894 part of a COMDAT group, in which case GAS requires the full
8895 declaration every time. */
8896 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8897 && (flags & SECTION_DECLARED))
8899 fprintf (asm_out_file, "\t.section\t%s\n", name);
8900 return;
8903 if (!(flags & SECTION_DEBUG))
8904 *f++ = 'a';
8905 if (flags & SECTION_WRITE)
8906 *f++ = 'w';
8907 if (flags & SECTION_CODE)
8908 *f++ = 'x';
8909 if (flags & SECTION_SMALL)
8910 *f++ = 's';
8911 if (flags & SECTION_MERGE)
8912 *f++ = 'M';
8913 if (flags & SECTION_STRINGS)
8914 *f++ = 'S';
8915 if (flags & SECTION_TLS)
8916 *f++ = TLS_SECTION_ASM_FLAG;
8917 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8918 *f++ = 'G';
8919 *f = '\0';
8921 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8923 if (!(flags & SECTION_NOTYPE))
8925 const char *type;
8926 const char *format;
8928 if (flags & SECTION_BSS)
8929 type = "nobits";
8930 else
8931 type = "progbits";
8933 #ifdef TYPE_OPERAND_FMT
8934 format = "," TYPE_OPERAND_FMT;
8935 #else
8936 format = ",@%s";
8937 #endif
8939 fprintf (asm_out_file, format, type);
8941 if (flags & SECTION_ENTSIZE)
8942 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8943 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8945 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8946 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8947 else
8948 fprintf (asm_out_file, ",%s,comdat",
8949 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8953 putc ('\n', asm_out_file);
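/* For a hypothetical COMDAT text section this would emit roughly:
   .section .text._Z3foov,"axG",@progbits,_Z3foov,comdat
   where "axG" comes from the SECTION_* flag bits handled above and the
   group name is taken from DECL_COMDAT_GROUP.  */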
8956 /* Select a format to encode pointers in exception handling data. */
8957 int
8958 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8960 int type;
8961 switch (aarch64_cmodel)
8963 case AARCH64_CMODEL_TINY:
8964 case AARCH64_CMODEL_TINY_PIC:
8965 case AARCH64_CMODEL_SMALL:
8966 case AARCH64_CMODEL_SMALL_PIC:
8967 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8968 for everything. */
8969 type = DW_EH_PE_sdata4;
8970 break;
8971 default:
8972 /* No assumptions here. 8-byte relocs required. */
8973 type = DW_EH_PE_sdata8;
8974 break;
8976 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8979 /* Emit load exclusive. */
8981 static void
8982 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8983 rtx mem, rtx model_rtx)
8985 rtx (*gen) (rtx, rtx, rtx);
8987 switch (mode)
8989 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8990 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8991 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8992 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8993 default:
8994 gcc_unreachable ();
8997 emit_insn (gen (rval, mem, model_rtx));
9000 /* Emit store exclusive. */
9002 static void
9003 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9004 rtx rval, rtx mem, rtx model_rtx)
9006 rtx (*gen) (rtx, rtx, rtx, rtx);
9008 switch (mode)
9010 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9011 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9012 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9013 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9014 default:
9015 gcc_unreachable ();
9018 emit_insn (gen (bval, rval, mem, model_rtx));
9021 /* Emit the jump pattern INSN and mark it as very unlikely to be taken. */
9023 static void
9024 aarch64_emit_unlikely_jump (rtx insn)
9026 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9028 insn = emit_jump_insn (insn);
9029 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9032 /* Expand a compare and swap pattern. */
9034 void
9035 aarch64_expand_compare_and_swap (rtx operands[])
9037 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9038 machine_mode mode, cmp_mode;
9039 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9041 bval = operands[0];
9042 rval = operands[1];
9043 mem = operands[2];
9044 oldval = operands[3];
9045 newval = operands[4];
9046 is_weak = operands[5];
9047 mod_s = operands[6];
9048 mod_f = operands[7];
9049 mode = GET_MODE (mem);
9050 cmp_mode = mode;
9052 /* Normally the succ memory model must be stronger than fail, but in the
9053 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9054 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9056 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9057 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9058 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9060 switch (mode)
9062 case QImode:
9063 case HImode:
9064 /* For short modes, we're going to perform the comparison in SImode,
9065 so do the zero-extension now. */
9066 cmp_mode = SImode;
9067 rval = gen_reg_rtx (SImode);
9068 oldval = convert_modes (SImode, mode, oldval, true);
9069 /* Fall through. */
9071 case SImode:
9072 case DImode:
9073 /* Force the value into a register if needed. */
9074 if (!aarch64_plus_operand (oldval, mode))
9075 oldval = force_reg (cmp_mode, oldval);
9076 break;
9078 default:
9079 gcc_unreachable ();
9082 switch (mode)
9084 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9085 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9086 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9087 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9088 default:
9089 gcc_unreachable ();
9092 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9094 if (mode == QImode || mode == HImode)
9095 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9097 x = gen_rtx_REG (CCmode, CC_REGNUM);
9098 x = gen_rtx_EQ (SImode, x, const0_rtx);
9099 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9102 /* Split a compare and swap pattern. */
9104 void
9105 aarch64_split_compare_and_swap (rtx operands[])
9107 rtx rval, mem, oldval, newval, scratch;
9108 machine_mode mode;
9109 bool is_weak;
9110 rtx_code_label *label1, *label2;
9111 rtx x, cond;
9113 rval = operands[0];
9114 mem = operands[1];
9115 oldval = operands[2];
9116 newval = operands[3];
9117 is_weak = (operands[4] != const0_rtx);
9118 scratch = operands[7];
9119 mode = GET_MODE (mem);
9121 label1 = NULL;
9122 if (!is_weak)
9124 label1 = gen_label_rtx ();
9125 emit_label (label1);
9127 label2 = gen_label_rtx ();
9129 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9131 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9132 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9133 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9134 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9135 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9137 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9139 if (!is_weak)
9141 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9142 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9143 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9144 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9146 else
9148 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9149 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9150 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9153 emit_label (label2);
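/* The split above corresponds, roughly, to the usual load/store-exclusive
   retry loop (strong variant shown; registers, labels and the exact
   acquire/release mnemonics depend on the operands and memory model):
       .Lretry:  ldxr  x0, [mem]
                 cmp   x0, oldval
                 b.ne  .Ldone
                 stxr  w1, newval, [mem]
                 cbnz  w1, .Lretry
       .Ldone:
   The weak form drops label1 and the backwards branch and instead leaves
   the store-exclusive status in the condition flags.  */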
9156 /* Split an atomic operation. */
9158 void
9159 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9160 rtx value, rtx model_rtx, rtx cond)
9162 machine_mode mode = GET_MODE (mem);
9163 machine_mode wmode = (mode == DImode ? DImode : SImode);
9164 rtx_code_label *label;
9165 rtx x;
9167 label = gen_label_rtx ();
9168 emit_label (label);
9170 if (new_out)
9171 new_out = gen_lowpart (wmode, new_out);
9172 if (old_out)
9173 old_out = gen_lowpart (wmode, old_out);
9174 else
9175 old_out = new_out;
9176 value = simplify_gen_subreg (wmode, value, mode, 0);
9178 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9180 switch (code)
9182 case SET:
9183 new_out = value;
9184 break;
9186 case NOT:
9187 x = gen_rtx_AND (wmode, old_out, value);
9188 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9189 x = gen_rtx_NOT (wmode, new_out);
9190 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9191 break;
9193 case MINUS:
9194 if (CONST_INT_P (value))
9196 value = GEN_INT (-INTVAL (value));
9197 code = PLUS;
9199 /* Fall through. */
9201 default:
9202 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9203 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9204 break;
9207 aarch64_emit_store_exclusive (mode, cond, mem,
9208 gen_lowpart (mode, new_out), model_rtx);
9210 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9211 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9212 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9213 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9216 static void
9217 aarch64_print_extension (void)
9219 const struct aarch64_option_extension *opt = NULL;
9221 for (opt = all_extensions; opt->name != NULL; opt++)
9222 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9223 asm_fprintf (asm_out_file, "+%s", opt->name);
9225 asm_fprintf (asm_out_file, "\n");
9228 static void
9229 aarch64_start_file (void)
9231 if (selected_arch)
9233 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9234 aarch64_print_extension ();
9236 else if (selected_cpu)
9238 const char *truncated_name
9239 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9240 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9241 aarch64_print_extension ();
9243 default_file_start ();
9246 /* Target hook for c_mode_for_suffix. */
9247 static machine_mode
9248 aarch64_c_mode_for_suffix (char suffix)
9250 if (suffix == 'q')
9251 return TFmode;
9253 return VOIDmode;
9256 /* We can only represent floating point constants which will fit in
9257 "quarter-precision" values. These values are characterised by
9258 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9261 (-1)^s * (n/16) * 2^r
9263 Where:
9264 's' is the sign bit.
9265 'n' is an integer in the range 16 <= n <= 31.
9266 'r' is an integer in the range -3 <= r <= 4. */
9268 /* Return true iff X can be represented by a quarter-precision
9269 floating point immediate operand. Note, we cannot represent 0.0. */
9270 bool
9271 aarch64_float_const_representable_p (rtx x)
9273 /* This represents our current view of how many bits
9274 make up the mantissa. */
9275 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9276 int exponent;
9277 unsigned HOST_WIDE_INT mantissa, mask;
9278 REAL_VALUE_TYPE r, m;
9279 bool fail;
9281 if (!CONST_DOUBLE_P (x))
9282 return false;
9284 if (GET_MODE (x) == VOIDmode)
9285 return false;
9287 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9289 /* We cannot represent infinities, NaNs or +/-zero. We won't
9290 know if we have +zero until we analyse the mantissa, but we
9291 can reject the other invalid values. */
9292 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9293 || REAL_VALUE_MINUS_ZERO (r))
9294 return false;
9296 /* Extract exponent. */
9297 r = real_value_abs (&r);
9298 exponent = REAL_EXP (&r);
9300 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9301 highest (sign) bit, with a fixed binary point at bit point_pos.
9302 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9303 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9304 bits for the mantissa, this can fail (low bits will be lost). */
9305 real_ldexp (&m, &r, point_pos - exponent);
9306 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9308 /* If the low part of the mantissa has bits set we cannot represent
9309 the value. */
9310 if (w.elt (0) != 0)
9311 return false;
9312 /* We have rejected the lower HOST_WIDE_INT, so update our
9313 understanding of how many bits lie in the mantissa and
9314 look only at the high HOST_WIDE_INT. */
9315 mantissa = w.elt (1);
9316 point_pos -= HOST_BITS_PER_WIDE_INT;
9318 /* We can only represent values with a mantissa of the form 1.xxxx. */
9319 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9320 if ((mantissa & mask) != 0)
9321 return false;
9323 /* Having filtered unrepresentable values, we may now remove all
9324 but the highest 5 bits. */
9325 mantissa >>= point_pos - 5;
9327 /* We cannot represent the value 0.0, so reject it. This is handled
9328 elsewhere. */
9329 if (mantissa == 0)
9330 return false;
9332 /* Then, as bit 4 is always set, we can mask it off, leaving
9333 the mantissa in the range [0, 15]. */
9334 mantissa &= ~(1 << 4);
9335 gcc_assert (mantissa <= 15);
9337 /* GCC internally does not use IEEE754-like encoding (where normalized
9338 significands lie in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
9339 Our mantissa values are shifted 4 places to the left relative to
9340 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9341 by 5 places to correct for GCC's representation. */
9342 exponent = 5 - exponent;
9344 return (exponent >= 0 && exponent <= 7);
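/* A minimal standalone sketch of the same predicate, assuming plain IEEE
   double arithmetic instead of GCC's REAL_VALUE_TYPE machinery: it simply
   enumerates the immediate space (-1)^s * (n/16) * 2^r described above.
   The function name is illustrative only.  */
#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_representable (double x)
{
  if (x == 0.0 || !isfinite (x))
    return false;               /* 0.0, infinities and NaNs are rejected.  */
  for (int n = 16; n <= 31; n++)
    for (int r = -3; r <= 4; r++)
      if (fabs (x) == (n / 16.0) * ldexp (1.0, r))
        return true;            /* Sign is free, so compare magnitudes.  */
  return false;
}
/* E.g. 1.5 is representable (n = 24, r = 0); the representable magnitudes
   run from 0.125 (n = 16, r = -3) up to 31.0 (n = 31, r = 4), so 0.1 and
   32.0 are rejected.  */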
9347 char*
9348 aarch64_output_simd_mov_immediate (rtx const_vector,
9349 machine_mode mode,
9350 unsigned width)
9352 bool is_valid;
9353 static char templ[40];
9354 const char *mnemonic;
9355 const char *shift_op;
9356 unsigned int lane_count = 0;
9357 char element_char;
9359 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9361 /* This will return true to show const_vector is legal for use as either
9362 an AdvSIMD MOVI immediate or, implicitly, an MVNI immediate. It will
9363 also update INFO to show how the immediate should be generated. */
9364 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9365 gcc_assert (is_valid);
9367 element_char = sizetochar (info.element_width);
9368 lane_count = width / info.element_width;
9370 mode = GET_MODE_INNER (mode);
9371 if (mode == SFmode || mode == DFmode)
9373 gcc_assert (info.shift == 0 && ! info.mvn);
9374 if (aarch64_float_const_zero_rtx_p (info.value))
9375 info.value = GEN_INT (0);
9376 else
9378 #define buf_size 20
9379 REAL_VALUE_TYPE r;
9380 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9381 char float_buf[buf_size] = {'\0'};
9382 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9383 #undef buf_size
9385 if (lane_count == 1)
9386 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9387 else
9388 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9389 lane_count, element_char, float_buf);
9390 return templ;
9394 mnemonic = info.mvn ? "mvni" : "movi";
9395 shift_op = info.msl ? "msl" : "lsl";
9397 if (lane_count == 1)
9398 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9399 mnemonic, UINTVAL (info.value));
9400 else if (info.shift)
9401 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9402 ", %s %d", mnemonic, lane_count, element_char,
9403 UINTVAL (info.value), shift_op, info.shift);
9404 else
9405 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9406 mnemonic, lane_count, element_char, UINTVAL (info.value));
9407 return templ;
9410 char*
9411 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9412 machine_mode mode)
9414 machine_mode vmode;
9416 gcc_assert (!VECTOR_MODE_P (mode));
9417 vmode = aarch64_simd_container_mode (mode, 64);
9418 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9419 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9422 /* Split operands into moves from op[1] + op[2] into op[0]. */
9424 void
9425 aarch64_split_combinev16qi (rtx operands[3])
9427 unsigned int dest = REGNO (operands[0]);
9428 unsigned int src1 = REGNO (operands[1]);
9429 unsigned int src2 = REGNO (operands[2]);
9430 machine_mode halfmode = GET_MODE (operands[1]);
9431 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9432 rtx destlo, desthi;
9434 gcc_assert (halfmode == V16QImode);
9436 if (src1 == dest && src2 == dest + halfregs)
9438 /* No-op move. Can't split to nothing; emit something. */
9439 emit_note (NOTE_INSN_DELETED);
9440 return;
9443 /* Preserve register attributes for variable tracking. */
9444 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9445 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9446 GET_MODE_SIZE (halfmode));
9448 /* Special case of reversed high/low parts. */
9449 if (reg_overlap_mentioned_p (operands[2], destlo)
9450 && reg_overlap_mentioned_p (operands[1], desthi))
9452 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9453 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9454 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9456 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9458 /* Try to avoid unnecessary moves if part of the result
9459 is in the right place already. */
9460 if (src1 != dest)
9461 emit_move_insn (destlo, operands[1]);
9462 if (src2 != dest + halfregs)
9463 emit_move_insn (desthi, operands[2]);
9465 else
9467 if (src2 != dest + halfregs)
9468 emit_move_insn (desthi, operands[2]);
9469 if (src1 != dest)
9470 emit_move_insn (destlo, operands[1]);
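/* The three EORs in the reversed case above are the classic XOR swap:
   they exchange the contents of the two source registers in place, which
   is exactly what is needed when each half of the destination overlaps
   the "wrong" source, without requiring a scratch register.  */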
9474 /* vec_perm support. */
9476 #define MAX_VECT_LEN 16
9478 struct expand_vec_perm_d
9480 rtx target, op0, op1;
9481 unsigned char perm[MAX_VECT_LEN];
9482 machine_mode vmode;
9483 unsigned char nelt;
9484 bool one_vector_p;
9485 bool testing_p;
9488 /* Generate a variable permutation. */
9490 static void
9491 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9493 machine_mode vmode = GET_MODE (target);
9494 bool one_vector_p = rtx_equal_p (op0, op1);
9496 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9497 gcc_checking_assert (GET_MODE (op0) == vmode);
9498 gcc_checking_assert (GET_MODE (op1) == vmode);
9499 gcc_checking_assert (GET_MODE (sel) == vmode);
9500 gcc_checking_assert (TARGET_SIMD);
9502 if (one_vector_p)
9504 if (vmode == V8QImode)
9506 /* Expand the argument to a V16QI mode by duplicating it. */
9507 rtx pair = gen_reg_rtx (V16QImode);
9508 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9509 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9511 else
9513 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9516 else
9518 rtx pair;
9520 if (vmode == V8QImode)
9522 pair = gen_reg_rtx (V16QImode);
9523 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9524 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9526 else
9528 pair = gen_reg_rtx (OImode);
9529 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9530 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9535 void
9536 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9538 machine_mode vmode = GET_MODE (target);
9539 unsigned int nelt = GET_MODE_NUNITS (vmode);
9540 bool one_vector_p = rtx_equal_p (op0, op1);
9541 rtx mask;
9543 /* The TBL instruction does not use a modulo index, so we must take care
9544 of that ourselves. */
9545 mask = aarch64_simd_gen_const_vector_dup (vmode,
9546 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9547 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9549 /* For big-endian, we also need to reverse the index within the vector
9550 (but not which vector). */
9551 if (BYTES_BIG_ENDIAN)
9553 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9554 if (!one_vector_p)
9555 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9556 sel = expand_simple_binop (vmode, XOR, sel, mask,
9557 NULL, 0, OPTAB_LIB_WIDEN);
9559 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
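/* Worked example (V8QI, two distinct inputs): a selector element of 19 is
   first reduced to the 16-entry table by the AND (19 & 15 == 3); on
   big-endian the extra XOR with 7 then flips the index within its
   containing vector (3 ^ 7 == 4) while leaving the choice of vector
   unchanged.  */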
9562 /* Recognize patterns suitable for the TRN instructions. */
9563 static bool
9564 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9566 unsigned int i, odd, mask, nelt = d->nelt;
9567 rtx out, in0, in1, x;
9568 rtx (*gen) (rtx, rtx, rtx);
9569 machine_mode vmode = d->vmode;
9571 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9572 return false;
9574 /* Note that these are little-endian tests.
9575 We correct for big-endian later. */
9576 if (d->perm[0] == 0)
9577 odd = 0;
9578 else if (d->perm[0] == 1)
9579 odd = 1;
9580 else
9581 return false;
9582 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9584 for (i = 0; i < nelt; i += 2)
9586 if (d->perm[i] != i + odd)
9587 return false;
9588 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9589 return false;
9592 /* Success! */
9593 if (d->testing_p)
9594 return true;
9596 in0 = d->op0;
9597 in1 = d->op1;
9598 if (BYTES_BIG_ENDIAN)
9600 x = in0, in0 = in1, in1 = x;
9601 odd = !odd;
9603 out = d->target;
9605 if (odd)
9607 switch (vmode)
9609 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9610 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9611 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9612 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9613 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9614 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9615 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9616 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9617 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9618 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9619 default:
9620 return false;
9623 else
9625 switch (vmode)
9627 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9628 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9629 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9630 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9631 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9632 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9633 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9634 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9635 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9636 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9637 default:
9638 return false;
9642 emit_insn (gen (out, in0, in1));
9643 return true;
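/* For example, on V4SI the selectors accepted above are { 0, 4, 2, 6 }
   (odd == 0, TRN1) and { 1, 5, 3, 7 } (odd == 1, TRN2): the even or odd
   numbered lanes of the two inputs, interleaved.  */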
9646 /* Recognize patterns suitable for the UZP instructions. */
9647 static bool
9648 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9650 unsigned int i, odd, mask, nelt = d->nelt;
9651 rtx out, in0, in1, x;
9652 rtx (*gen) (rtx, rtx, rtx);
9653 machine_mode vmode = d->vmode;
9655 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9656 return false;
9658 /* Note that these are little-endian tests.
9659 We correct for big-endian later. */
9660 if (d->perm[0] == 0)
9661 odd = 0;
9662 else if (d->perm[0] == 1)
9663 odd = 1;
9664 else
9665 return false;
9666 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9668 for (i = 0; i < nelt; i++)
9670 unsigned elt = (i * 2 + odd) & mask;
9671 if (d->perm[i] != elt)
9672 return false;
9675 /* Success! */
9676 if (d->testing_p)
9677 return true;
9679 in0 = d->op0;
9680 in1 = d->op1;
9681 if (BYTES_BIG_ENDIAN)
9683 x = in0, in0 = in1, in1 = x;
9684 odd = !odd;
9686 out = d->target;
9688 if (odd)
9690 switch (vmode)
9692 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9693 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9694 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9695 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9696 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9697 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9698 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9699 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9700 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9701 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9702 default:
9703 return false;
9706 else
9708 switch (vmode)
9710 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9711 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9712 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9713 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9714 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9715 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9716 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9717 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9718 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9719 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9720 default:
9721 return false;
9725 emit_insn (gen (out, in0, in1));
9726 return true;
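/* For example, on V4SI the selectors accepted above are { 0, 2, 4, 6 }
   (odd == 0, UZP1) and { 1, 3, 5, 7 } (odd == 1, UZP2): the even or odd
   numbered lanes of the concatenated inputs.  */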
9729 /* Recognize patterns suitable for the ZIP instructions. */
9730 static bool
9731 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9733 unsigned int i, high, mask, nelt = d->nelt;
9734 rtx out, in0, in1, x;
9735 rtx (*gen) (rtx, rtx, rtx);
9736 machine_mode vmode = d->vmode;
9738 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9739 return false;
9741 /* Note that these are little-endian tests.
9742 We correct for big-endian later. */
9743 high = nelt / 2;
9744 if (d->perm[0] == high)
9745 /* Do Nothing. */
9747 else if (d->perm[0] == 0)
9748 high = 0;
9749 else
9750 return false;
9751 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9753 for (i = 0; i < nelt / 2; i++)
9755 unsigned elt = (i + high) & mask;
9756 if (d->perm[i * 2] != elt)
9757 return false;
9758 elt = (elt + nelt) & mask;
9759 if (d->perm[i * 2 + 1] != elt)
9760 return false;
9763 /* Success! */
9764 if (d->testing_p)
9765 return true;
9767 in0 = d->op0;
9768 in1 = d->op1;
9769 if (BYTES_BIG_ENDIAN)
9771 x = in0, in0 = in1, in1 = x;
9772 high = !high;
9774 out = d->target;
9776 if (high)
9778 switch (vmode)
9780 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9781 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9782 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9783 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9784 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9785 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9786 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9787 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9788 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9789 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9790 default:
9791 return false;
9794 else
9796 switch (vmode)
9798 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9799 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9800 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9801 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9802 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9803 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9804 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9805 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9806 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9807 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9808 default:
9809 return false;
9813 emit_insn (gen (out, in0, in1));
9814 return true;
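/* For example, on V4SI the selectors accepted above are { 0, 4, 1, 5 }
   (high == 0, ZIP1) and { 2, 6, 3, 7 } (high != 0, ZIP2): the low or high
   halves of the two inputs, interleaved lane by lane.  */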
9817 /* Recognize patterns for the EXT insn. */
9819 static bool
9820 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9822 unsigned int i, nelt = d->nelt;
9823 rtx (*gen) (rtx, rtx, rtx, rtx);
9824 rtx offset;
9826 unsigned int location = d->perm[0]; /* Always < nelt. */
9828 /* Check if the extracted indices are increasing by one. */
9829 for (i = 1; i < nelt; i++)
9831 unsigned int required = location + i;
9832 if (d->one_vector_p)
9834 /* We'll pass the same vector in twice, so allow indices to wrap. */
9835 required &= (nelt - 1);
9837 if (d->perm[i] != required)
9838 return false;
9841 switch (d->vmode)
9843 case V16QImode: gen = gen_aarch64_extv16qi; break;
9844 case V8QImode: gen = gen_aarch64_extv8qi; break;
9845 case V4HImode: gen = gen_aarch64_extv4hi; break;
9846 case V8HImode: gen = gen_aarch64_extv8hi; break;
9847 case V2SImode: gen = gen_aarch64_extv2si; break;
9848 case V4SImode: gen = gen_aarch64_extv4si; break;
9849 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9850 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9851 case V2DImode: gen = gen_aarch64_extv2di; break;
9852 case V2DFmode: gen = gen_aarch64_extv2df; break;
9853 default:
9854 return false;
9857 /* Success! */
9858 if (d->testing_p)
9859 return true;
9861 /* The case where (location == 0) is a no-op for both big- and little-endian,
9862 and is removed by the mid-end at optimization levels -O1 and higher. */
9864 if (BYTES_BIG_ENDIAN && (location != 0))
9866 /* After setup, we want the high elements of the first vector (stored
9867 at the LSB end of the register), and the low elements of the second
9868 vector (stored at the MSB end of the register). So swap. */
9869 rtx temp = d->op0;
9870 d->op0 = d->op1;
9871 d->op1 = temp;
9872 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9873 location = nelt - location;
9876 offset = GEN_INT (location);
9877 emit_insn (gen (d->target, d->op0, d->op1, offset));
9878 return true;
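/* For example, with a single V4SI input the rotation { 1, 2, 3, 0 } is
   accepted with location == 1; the indices are allowed to wrap because
   the same vector is passed to EXT twice.  */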
9881 /* Recognize patterns for the REV insns. */
9883 static bool
9884 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9886 unsigned int i, j, diff, nelt = d->nelt;
9887 rtx (*gen) (rtx, rtx);
9889 if (!d->one_vector_p)
9890 return false;
9892 diff = d->perm[0];
9893 switch (diff)
9895 case 7:
9896 switch (d->vmode)
9898 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9899 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9900 default:
9901 return false;
9903 break;
9904 case 3:
9905 switch (d->vmode)
9907 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9908 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9909 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9910 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9911 default:
9912 return false;
9914 break;
9915 case 1:
9916 switch (d->vmode)
9918 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9919 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9920 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9921 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9922 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9923 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9924 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9925 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9926 default:
9927 return false;
9929 break;
9930 default:
9931 return false;
9934 for (i = 0; i < nelt ; i += diff + 1)
9935 for (j = 0; j <= diff; j += 1)
9937 /* This is guaranteed to hold: diff is one of 7, 3 or 1,
9938 so i + j always stays within the vector. A selector with
9939 any other value of diff would mean something went wrong
9940 before we got here. */
9942 gcc_assert (i + j < nelt);
9943 if (d->perm[i + j] != i + diff - j)
9944 return false;
9947 /* Success! */
9948 if (d->testing_p)
9949 return true;
9951 emit_insn (gen (d->target, d->op0));
9952 return true;
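/* For example, a V8QI selector of { 3, 2, 1, 0, 7, 6, 5, 4 } gives
   diff == 3 and is matched above as a byte-wise REV32, i.e. reversing
   the bytes within each 32-bit word.  */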
9955 static bool
9956 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9958 rtx (*gen) (rtx, rtx, rtx);
9959 rtx out = d->target;
9960 rtx in0;
9961 machine_mode vmode = d->vmode;
9962 unsigned int i, elt, nelt = d->nelt;
9963 rtx lane;
9965 elt = d->perm[0];
9966 for (i = 1; i < nelt; i++)
9968 if (elt != d->perm[i])
9969 return false;
9972 /* The generic preparation in aarch64_expand_vec_perm_const_1
9973 swaps the operand order and the permute indices if it finds
9974 d->perm[0] to be in the second operand. Thus, we can always
9975 use d->op0 and need not do any extra arithmetic to get the
9976 correct lane number. */
9977 in0 = d->op0;
9978 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9980 switch (vmode)
9982 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9983 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9984 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9985 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9986 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9987 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9988 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9989 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9990 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9991 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9992 default:
9993 return false;
9996 emit_insn (gen (out, in0, lane));
9997 return true;
10000 static bool
10001 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10003 rtx rperm[MAX_VECT_LEN], sel;
10004 machine_mode vmode = d->vmode;
10005 unsigned int i, nelt = d->nelt;
10007 if (d->testing_p)
10008 return true;
10010 /* Generic code will try constant permutation twice. Once with the
10011 original mode and again with the elements lowered to QImode.
10012 So wait and don't do the selector expansion ourselves. */
10013 if (vmode != V8QImode && vmode != V16QImode)
10014 return false;
10016 for (i = 0; i < nelt; ++i)
10018 int nunits = GET_MODE_NUNITS (vmode);
10020 /* If big-endian and two vectors we end up with a weird mixed-endian
10021 mode on NEON. Reverse the index within each word but not the word
10022 itself. */
10023 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10024 : d->perm[i]);
10026 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10027 sel = force_reg (vmode, sel);
10029 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10030 return true;
10033 static bool
10034 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10036 /* The pattern matching functions above are written to look for a small
10037 number to begin the sequence (0, 1, N/2). If we begin with an index
10038 from the second operand, we can swap the operands. */
10039 if (d->perm[0] >= d->nelt)
10041 unsigned i, nelt = d->nelt;
10042 rtx x;
10044 gcc_assert (nelt == (nelt & -nelt));
10045 for (i = 0; i < nelt; ++i)
10046 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10048 x = d->op0;
10049 d->op0 = d->op1;
10050 d->op1 = x;
10053 if (TARGET_SIMD)
10055 if (aarch64_evpc_rev (d))
10056 return true;
10057 else if (aarch64_evpc_ext (d))
10058 return true;
10059 else if (aarch64_evpc_dup (d))
10060 return true;
10061 else if (aarch64_evpc_zip (d))
10062 return true;
10063 else if (aarch64_evpc_uzp (d))
10064 return true;
10065 else if (aarch64_evpc_trn (d))
10066 return true;
10067 return aarch64_evpc_tbl (d);
10069 return false;
10072 /* Expand a vec_perm_const pattern. */
10074 bool
10075 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10077 struct expand_vec_perm_d d;
10078 int i, nelt, which;
10080 d.target = target;
10081 d.op0 = op0;
10082 d.op1 = op1;
10084 d.vmode = GET_MODE (target);
10085 gcc_assert (VECTOR_MODE_P (d.vmode));
10086 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10087 d.testing_p = false;
10089 for (i = which = 0; i < nelt; ++i)
10091 rtx e = XVECEXP (sel, 0, i);
10092 int ei = INTVAL (e) & (2 * nelt - 1);
10093 which |= (ei < nelt ? 1 : 2);
10094 d.perm[i] = ei;
10097 switch (which)
10099 default:
10100 gcc_unreachable ();
10102 case 3:
10103 d.one_vector_p = false;
10104 if (!rtx_equal_p (op0, op1))
10105 break;
10107 /* The elements of PERM do not suggest that only the first operand
10108 is used, but both operands are identical. Allow easier matching
10109 of the permutation by folding the permutation into the single
10110 input vector. */
10111 /* Fall Through. */
10112 case 2:
10113 for (i = 0; i < nelt; ++i)
10114 d.perm[i] &= nelt - 1;
10115 d.op0 = op1;
10116 d.one_vector_p = true;
10117 break;
10119 case 1:
10120 d.op1 = op0;
10121 d.one_vector_p = true;
10122 break;
10125 return aarch64_expand_vec_perm_const_1 (&d);
10128 static bool
10129 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10130 const unsigned char *sel)
10132 struct expand_vec_perm_d d;
10133 unsigned int i, nelt, which;
10134 bool ret;
10136 d.vmode = vmode;
10137 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10138 d.testing_p = true;
10139 memcpy (d.perm, sel, nelt);
10141 /* Calculate whether all elements are in one vector. */
10142 for (i = which = 0; i < nelt; ++i)
10144 unsigned char e = d.perm[i];
10145 gcc_assert (e < 2 * nelt);
10146 which |= (e < nelt ? 1 : 2);
10149 /* If all elements are from the second vector, reindex as if from the
10150 first vector. */
10151 if (which == 2)
10152 for (i = 0; i < nelt; ++i)
10153 d.perm[i] -= nelt;
10155 /* Check whether the mask can be applied to a single vector. */
10156 d.one_vector_p = (which != 3);
10158 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10159 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10160 if (!d.one_vector_p)
10161 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10163 start_sequence ();
10164 ret = aarch64_expand_vec_perm_const_1 (&d);
10165 end_sequence ();
10167 return ret;
10170 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
10171 bool
10172 aarch64_cannot_change_mode_class (machine_mode from,
10173 machine_mode to,
10174 enum reg_class rclass)
10176 /* Full-reg subregs are allowed on general regs or any class if they are
10177 the same size. */
10178 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
10179 || !reg_classes_intersect_p (FP_REGS, rclass))
10180 return false;
10182 /* Limited combinations of subregs are safe on FPREGs. Particularly,
10183 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
10184 2. Scalar to Scalar for integer modes or same size float modes.
10185 3. Vector to Vector modes.
10186 4. On little-endian only, Vector-Structure to Vector modes. */
10187 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
10189 if (aarch64_vector_mode_supported_p (from)
10190 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
10191 return false;
10193 if (GET_MODE_NUNITS (from) == 1
10194 && GET_MODE_NUNITS (to) == 1
10195 && (GET_MODE_CLASS (from) == MODE_INT
10196 || from == to))
10197 return false;
10199 if (aarch64_vector_mode_supported_p (from)
10200 && aarch64_vector_mode_supported_p (to))
10201 return false;
10203 /* Within a vector structure straddling multiple vector registers
10204 we are in a mixed-endian representation. As such, we can't
10205 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
10206 switch between vectors and vector structures cheaply. */
10207 if (!BYTES_BIG_ENDIAN)
10208 if ((aarch64_vector_mode_supported_p (from)
10209 && aarch64_vect_struct_mode_p (to))
10210 || (aarch64_vector_mode_supported_p (to)
10211 && aarch64_vect_struct_mode_p (from)))
10212 return false;
10215 return true;
10218 /* Implement MODES_TIEABLE_P. */
10220 bool
10221 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10223 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10224 return true;
10226 /* We specifically want to allow elements of "structure" modes to
10227 be tieable to the structure. This more general condition allows
10228 other rarer situations too. */
10229 if (TARGET_SIMD
10230 && aarch64_vector_mode_p (mode1)
10231 && aarch64_vector_mode_p (mode2))
10232 return true;
10234 return false;
10237 /* Return a new RTX holding the result of moving POINTER forward by
10238 AMOUNT bytes. */
10240 static rtx
10241 aarch64_move_pointer (rtx pointer, int amount)
10243 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10245 return adjust_automodify_address (pointer, GET_MODE (pointer),
10246 next, amount);
10249 /* Return a new RTX holding the result of moving POINTER forward by the
10250 size of the mode it points to. */
10252 static rtx
10253 aarch64_progress_pointer (rtx pointer)
10255 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10257 return aarch64_move_pointer (pointer, amount);
10260 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10261 MODE bytes. */
10263 static void
10264 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10265 machine_mode mode)
10267 rtx reg = gen_reg_rtx (mode);
10269 /* "Cast" the pointers to the correct mode. */
10270 *src = adjust_address (*src, mode, 0);
10271 *dst = adjust_address (*dst, mode, 0);
10272 /* Emit the memcpy. */
10273 emit_move_insn (reg, *src);
10274 emit_move_insn (*dst, reg);
10275 /* Move the pointers forward. */
10276 *src = aarch64_progress_pointer (*src);
10277 *dst = aarch64_progress_pointer (*dst);
10280 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10281 we succeed, otherwise return false. */
10283 bool
10284 aarch64_expand_movmem (rtx *operands)
10286 unsigned int n;
10287 rtx dst = operands[0];
10288 rtx src = operands[1];
10289 rtx base;
10290 bool speed_p = !optimize_function_for_size_p (cfun);
10292 /* When optimizing for size, give a better estimate of the length of a
10293 memcpy call, but use the default otherwise. */
10294 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10296 /* We can't do anything smart if the amount to copy is not constant. */
10297 if (!CONST_INT_P (operands[2]))
10298 return false;
10300 n = UINTVAL (operands[2]);
10302 /* Try to keep the number of instructions low. For cases below 16 bytes we
10303 need to make at most two moves. For cases above 16 bytes it will be one
10304 move for each 16 byte chunk, then at most two additional moves. */
10305 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10306 return false;
10308 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10309 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10311 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10312 src = adjust_automodify_address (src, VOIDmode, base, 0);
10314 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10315 1-byte chunk. */
10316 if (n < 4)
10318 if (n >= 2)
10320 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10321 n -= 2;
10324 if (n == 1)
10325 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10327 return true;
10330 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10331 4-byte chunk, partially overlapping with the previously copied chunk. */
10332 if (n < 8)
10334 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10335 n -= 4;
10336 if (n > 0)
10338 int move = n - 4;
10340 src = aarch64_move_pointer (src, move);
10341 dst = aarch64_move_pointer (dst, move);
10342 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10344 return true;
10347 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10348 them, then (if applicable) an 8-byte chunk. */
10349 while (n >= 8)
10351 if (n / 16)
10353 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10354 n -= 16;
10356 else
10358 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10359 n -= 8;
10363 /* Finish the final bytes of the copy. We can always do this in one
10364 instruction. We either copy the exact amount we need, or partially
10365 overlap with the previous chunk we copied and copy 8 bytes. */
10366 if (n == 0)
10367 return true;
10368 else if (n == 1)
10369 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10370 else if (n == 2)
10371 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10372 else if (n == 4)
10373 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10374 else
10376 if (n == 3)
10378 src = aarch64_move_pointer (src, -1);
10379 dst = aarch64_move_pointer (dst, -1);
10380 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10382 else
10384 int move = n - 8;
10386 src = aarch64_move_pointer (src, move);
10387 dst = aarch64_move_pointer (dst, move);
10388 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10392 return true;
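/* Worked example: for n == 27 the code above emits one 16-byte (TImode)
   copy and one 8-byte (DImode) copy, leaving n == 3; the tail then backs
   both pointers up by one byte and issues a single 4-byte (SImode) copy
   whose first byte overlaps data already copied, so the whole memcpy is
   three load/store pairs.  */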
10395 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10397 static unsigned HOST_WIDE_INT
10398 aarch64_asan_shadow_offset (void)
10400 return (HOST_WIDE_INT_1 << 36);
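/* With the default AddressSanitizer shadow scale of 3, this places the
   shadow byte for an address A at (A >> 3) + (1 << 36).  */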
10403 static bool
10404 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10405 unsigned int align,
10406 enum by_pieces_operation op,
10407 bool speed_p)
10409 /* STORE_BY_PIECES can be used when copying a constant string, but
10410 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10411 For now we always fail this and let the move_by_pieces code copy
10412 the string from read-only memory. */
10413 if (op == STORE_BY_PIECES)
10414 return false;
10416 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10419 static enum machine_mode
10420 aarch64_code_to_ccmode (enum rtx_code code)
10422 switch (code)
10424 case NE:
10425 return CC_DNEmode;
10427 case EQ:
10428 return CC_DEQmode;
10430 case LE:
10431 return CC_DLEmode;
10433 case LT:
10434 return CC_DLTmode;
10436 case GE:
10437 return CC_DGEmode;
10439 case GT:
10440 return CC_DGTmode;
10442 case LEU:
10443 return CC_DLEUmode;
10445 case LTU:
10446 return CC_DLTUmode;
10448 case GEU:
10449 return CC_DGEUmode;
10451 case GTU:
10452 return CC_DGTUmode;
10454 default:
10455 return CCmode;
10459 static rtx
10460 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10461 int code, tree treeop0, tree treeop1)
10463 enum machine_mode op_mode, cmp_mode, cc_mode;
10464 rtx op0, op1, cmp, target;
10465 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10466 enum insn_code icode;
10467 struct expand_operand ops[4];
10469 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10470 if (cc_mode == CCmode)
10471 return NULL_RTX;
10473 start_sequence ();
10474 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10476 op_mode = GET_MODE (op0);
10477 if (op_mode == VOIDmode)
10478 op_mode = GET_MODE (op1);
10480 switch (op_mode)
10482 case QImode:
10483 case HImode:
10484 case SImode:
10485 cmp_mode = SImode;
10486 icode = CODE_FOR_cmpsi;
10487 break;
10489 case DImode:
10490 cmp_mode = DImode;
10491 icode = CODE_FOR_cmpdi;
10492 break;
10494 default:
10495 end_sequence ();
10496 return NULL_RTX;
10499 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10500 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10501 if (!op0 || !op1)
10503 end_sequence ();
10504 return NULL_RTX;
10506 *prep_seq = get_insns ();
10507 end_sequence ();
10509 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10510 target = gen_rtx_REG (CCmode, CC_REGNUM);
10512 create_output_operand (&ops[0], target, CCmode);
10513 create_fixed_operand (&ops[1], cmp);
10514 create_fixed_operand (&ops[2], op0);
10515 create_fixed_operand (&ops[3], op1);
10517 start_sequence ();
10518 if (!maybe_expand_insn (icode, 4, ops))
10520 end_sequence ();
10521 return NULL_RTX;
10523 *gen_seq = get_insns ();
10524 end_sequence ();
10526 return gen_rtx_REG (cc_mode, CC_REGNUM);
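/* Implement TARGET_GEN_CCMP_NEXT.  Like aarch64_gen_ccmp_first, but emit a
   conditional compare that combines the previous CC result PREV with the
   comparison CMP_CODE of TREEOP0 and TREEOP1; BIT_CODE (AND or IOR) selects
   the ccmp pattern used.  Return the new CC register, or NULL_RTX on
   failure.  */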
10529 static rtx
10530 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10531 tree treeop0, tree treeop1, int bit_code)
10533 rtx op0, op1, cmp0, cmp1, target;
10534 enum machine_mode op_mode, cmp_mode, cc_mode;
10535 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10536 enum insn_code icode = CODE_FOR_ccmp_andsi;
10537 struct expand_operand ops[6];
10539 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10540 if (cc_mode == CCmode)
10541 return NULL_RTX;
10543 push_to_sequence ((rtx_insn*) *prep_seq);
10544 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10546 op_mode = GET_MODE (op0);
10547 if (op_mode == VOIDmode)
10548 op_mode = GET_MODE (op1);
10550 switch (op_mode)
10552 case QImode:
10553 case HImode:
10554 case SImode:
10555 cmp_mode = SImode;
10556 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10557 : CODE_FOR_ccmp_iorsi;
10558 break;
10560 case DImode:
10561 cmp_mode = DImode;
10562 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10563 : CODE_FOR_ccmp_iordi;
10564 break;
10566 default:
10567 end_sequence ();
10568 return NULL_RTX;
10571 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10572 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10573 if (!op0 || !op1)
10575 end_sequence ();
10576 return NULL_RTX;
10578 *prep_seq = get_insns ();
10579 end_sequence ();
10581 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10582 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10583 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10585 create_fixed_operand (&ops[0], prev);
10586 create_fixed_operand (&ops[1], target);
10587 create_fixed_operand (&ops[2], op0);
10588 create_fixed_operand (&ops[3], op1);
10589 create_fixed_operand (&ops[4], cmp0);
10590 create_fixed_operand (&ops[5], cmp1);
10592 push_to_sequence ((rtx_insn*) *gen_seq);
10593 if (!maybe_expand_insn (icode, 6, ops))
10595 end_sequence ();
10596 return NULL_RTX;
10599 *gen_seq = get_insns ();
10600 end_sequence ();
10602 return target;
10605 #undef TARGET_GEN_CCMP_FIRST
10606 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10608 #undef TARGET_GEN_CCMP_NEXT
10609 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10611 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target
10612 supports some form of instruction fusion. */
10614 static bool
10615 aarch64_macro_fusion_p (void)
10617 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10621 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10622 should be kept together during scheduling. */
10624 static bool
10625 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10627 rtx set_dest;
10628 rtx prev_set = single_set (prev);
10629 rtx curr_set = single_set (curr);
10630 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10631 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10633 if (!aarch64_macro_fusion_p ())
10634 return false;
10636 if (simple_sets_p
10637 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10639 /* We are trying to match:
10640 prev (mov) == (set (reg r0) (const_int imm16))
10641 curr (movk) == (set (zero_extract (reg r0)
10642 (const_int 16)
10643 (const_int 16))
10644 (const_int imm16_1)) */
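      /* Roughly, in assembly this is a pair such as:
	   mov	x0, #imm16
	   movk	x0, #imm16_1, lsl #16  */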
10646 set_dest = SET_DEST (curr_set);
10648 if (GET_CODE (set_dest) == ZERO_EXTRACT
10649 && CONST_INT_P (SET_SRC (curr_set))
10650 && CONST_INT_P (SET_SRC (prev_set))
10651 && CONST_INT_P (XEXP (set_dest, 2))
10652 && INTVAL (XEXP (set_dest, 2)) == 16
10653 && REG_P (XEXP (set_dest, 0))
10654 && REG_P (SET_DEST (prev_set))
10655 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10657 return true;
10661 if (simple_sets_p
10662 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10665 /* We're trying to match:
10666 prev (adrp) == (set (reg r1)
10667 (high (symbol_ref ("SYM"))))
10668 curr (add) == (set (reg r0)
10669 (lo_sum (reg r1)
10670 (symbol_ref ("SYM"))))
10671 Note that r0 need not necessarily be the same as r1, especially
10672 during pre-regalloc scheduling. */
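      /* Roughly, in assembly this is a pair such as:
	   adrp	x1, SYM
	   add	x0, x1, :lo12:SYM  */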
10674 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10675 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10677 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10678 && REG_P (XEXP (SET_SRC (curr_set), 0))
10679 && REGNO (XEXP (SET_SRC (curr_set), 0))
10680 == REGNO (SET_DEST (prev_set))
10681 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10682 XEXP (SET_SRC (curr_set), 1)))
10683 return true;
10687 if (simple_sets_p
10688 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10691 /* We're trying to match:
10692 prev (movk) == (set (zero_extract (reg r0)
10693 (const_int 16)
10694 (const_int 32))
10695 (const_int imm16_1))
10696 curr (movk) == (set (zero_extract (reg r0)
10697 (const_int 16)
10698 (const_int 48))
10699 (const_int imm16_2)) */
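      /* Roughly, in assembly this is a pair such as:
	   movk	x0, #imm16_1, lsl #32
	   movk	x0, #imm16_2, lsl #48  */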
10701 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10702 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10703 && REG_P (XEXP (SET_DEST (prev_set), 0))
10704 && REG_P (XEXP (SET_DEST (curr_set), 0))
10705 && REGNO (XEXP (SET_DEST (prev_set), 0))
10706 == REGNO (XEXP (SET_DEST (curr_set), 0))
10707 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10708 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10709 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10710 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10711 && CONST_INT_P (SET_SRC (prev_set))
10712 && CONST_INT_P (SET_SRC (curr_set)))
10713 return true;
10716 if (simple_sets_p
10717 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10719 /* We're trying to match:
10720 prev (adrp) == (set (reg r0)
10721 (high (symbol_ref ("SYM"))))
10722 curr (ldr) == (set (reg r1)
10723 (mem (lo_sum (reg r0)
10724 (symbol_ref ("SYM")))))
10726 curr (ldr) == (set (reg r1)
10727 (zero_extend (mem
10728 (lo_sum (reg r0)
10729 (symbol_ref ("SYM")))))) */
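      /* Roughly, in assembly this is a pair such as:
	   adrp	x0, SYM
	   ldr	x1, [x0, #:lo12:SYM]  */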
10730 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10731 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10733 rtx curr_src = SET_SRC (curr_set);
10735 if (GET_CODE (curr_src) == ZERO_EXTEND)
10736 curr_src = XEXP (curr_src, 0);
10738 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10739 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10740 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10741 == REGNO (SET_DEST (prev_set))
10742 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10743 XEXP (SET_SRC (prev_set), 0)))
10744 return true;
10748 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10749 && any_condjump_p (curr))
10751 enum attr_type prev_type = get_attr_type (prev);
10753 /* FIXME: this misses some instructions that ThunderX considers simple
10754 arithmetic; in particular, simple shifts are missed here. */
10755 if (prev_type == TYPE_ALUS_SREG
10756 || prev_type == TYPE_ALUS_IMM
10757 || prev_type == TYPE_LOGICS_REG
10758 || prev_type == TYPE_LOGICS_IMM)
10759 return true;
10762 return false;
10765 /* If MEM is in the form of [base+offset], extract the two parts of the
10766 address and set BASE and OFFSET accordingly; otherwise return false
10767 after clearing BASE and OFFSET. */
10769 bool
10770 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10772 rtx addr;
10774 gcc_assert (MEM_P (mem));
10776 addr = XEXP (mem, 0);
10778 if (REG_P (addr))
10780 *base = addr;
10781 *offset = const0_rtx;
10782 return true;
10785 if (GET_CODE (addr) == PLUS
10786 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10788 *base = XEXP (addr, 0);
10789 *offset = XEXP (addr, 1);
10790 return true;
10793 *base = NULL_RTX;
10794 *offset = NULL_RTX;
10796 return false;
10799 /* Types for scheduling fusion. */
10800 enum sched_fusion_type
10802 SCHED_FUSION_NONE = 0,
10803 SCHED_FUSION_LD_SIGN_EXTEND,
10804 SCHED_FUSION_LD_ZERO_EXTEND,
10805 SCHED_FUSION_LD,
10806 SCHED_FUSION_ST,
10807 SCHED_FUSION_NUM
10810 /* If INSN is a load or store whose address is in the form [base+offset],
10811 extract the two parts into BASE and OFFSET. Return the scheduling
10812 fusion type of this INSN. */
10814 static enum sched_fusion_type
10815 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10817 rtx x, dest, src;
10818 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10820 gcc_assert (INSN_P (insn));
10821 x = PATTERN (insn);
10822 if (GET_CODE (x) != SET)
10823 return SCHED_FUSION_NONE;
10825 src = SET_SRC (x);
10826 dest = SET_DEST (x);
10828 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10829 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10830 return SCHED_FUSION_NONE;
10832 if (GET_CODE (src) == SIGN_EXTEND)
10834 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10835 src = XEXP (src, 0);
10836 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10837 return SCHED_FUSION_NONE;
10839 else if (GET_CODE (src) == ZERO_EXTEND)
10841 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10842 src = XEXP (src, 0);
10843 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10844 return SCHED_FUSION_NONE;
10847 if (GET_CODE (src) == MEM && REG_P (dest))
10848 extract_base_offset_in_addr (src, base, offset);
10849 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10851 fusion = SCHED_FUSION_ST;
10852 extract_base_offset_in_addr (dest, base, offset);
10854 else
10855 return SCHED_FUSION_NONE;
10857 if (*base == NULL_RTX || *offset == NULL_RTX)
10858 fusion = SCHED_FUSION_NONE;
10860 return fusion;
10863 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10865 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10866 and PRI are only calculated for these instructions. For other instructions,
10867 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10868 types of instruction fusion can be added by returning different priorities.
10870 It's important that irrelevant instructions get the largest FUSION_PRI. */
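/* For example, the loads ldr w1, [x3, #8] and ldr w2, [x3, #4] get the same
   FUSION_PRI (same fusion type and base register), while the load with the
   smaller offset gets the larger PRI and is scheduled first, making it
   possible to pair the two into an ldp later on.  */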
10872 static void
10873 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10874 int *fusion_pri, int *pri)
10876 int tmp, off_val;
10877 rtx base, offset;
10878 enum sched_fusion_type fusion;
10880 gcc_assert (INSN_P (insn));
10882 tmp = max_pri - 1;
10883 fusion = fusion_load_store (insn, &base, &offset);
10884 if (fusion == SCHED_FUSION_NONE)
10886 *pri = tmp;
10887 *fusion_pri = tmp;
10888 return;
10891 /* Set FUSION_PRI according to fusion type and base register. */
10892 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10894 /* Calculate PRI. */
10895 tmp /= 2;
10897 /* INSN with smaller offset goes first. */
10898 off_val = (int)(INTVAL (offset));
10899 if (off_val >= 0)
10900 tmp -= (off_val & 0xfffff);
10901 else
10902 tmp += ((- off_val) & 0xfffff);
10904 *pri = tmp;
10905 return;
10908 /* Given OPERANDS of consecutive load/store, check if we can merge
10909 them into ldp/stp. LOAD is true if they are load instructions.
10910 MODE is the mode of memory operands. */
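/* For example, ldr w0, [x2] followed by ldr w1, [x2, #4] passes these checks
   (distinct destination registers of the same class, same base, consecutive
   SImode offsets, no volatile MEMs) and can be replaced by ldp w0, w1, [x2].  */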
10912 bool
10913 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10914 enum machine_mode mode)
10916 HOST_WIDE_INT offval_1, offval_2, msize;
10917 enum reg_class rclass_1, rclass_2;
10918 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10920 if (load)
10922 mem_1 = operands[1];
10923 mem_2 = operands[3];
10924 reg_1 = operands[0];
10925 reg_2 = operands[2];
10926 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10927 if (REGNO (reg_1) == REGNO (reg_2))
10928 return false;
10930 else
10932 mem_1 = operands[0];
10933 mem_2 = operands[2];
10934 reg_1 = operands[1];
10935 reg_2 = operands[3];
10938 /* The mems cannot be volatile. */
10939 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10940 return false;
10942 /* Check if the addresses are in the form of [base+offset]. */
10943 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10944 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10945 return false;
10946 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10947 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10948 return false;
10950 /* Check if the bases are the same. */
10951 if (!rtx_equal_p (base_1, base_2))
10952 return false;
10954 offval_1 = INTVAL (offset_1);
10955 offval_2 = INTVAL (offset_2);
10956 msize = GET_MODE_SIZE (mode);
10957 /* Check if the offsets are consecutive. */
10958 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10959 return false;
10961 /* Check if the addresses are clobbered by load. */
10962 if (load)
10964 if (reg_mentioned_p (reg_1, mem_1))
10965 return false;
10967 /* In increasing order, the last load can clobber the address. */
10968 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10969 return false;
10972 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10973 rclass_1 = FP_REGS;
10974 else
10975 rclass_1 = GENERAL_REGS;
10977 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10978 rclass_2 = FP_REGS;
10979 else
10980 rclass_2 = GENERAL_REGS;
10982 /* Check if the registers are of the same class. */
10983 if (rclass_1 != rclass_2)
10984 return false;
10986 return true;
10989 /* Given OPERANDS of consecutive load/store, check if we can merge
10990 them into ldp/stp by adjusting the offset. LOAD is true if they
10991 are load instructions. MODE is the mode of memory operands.
10993 Given the following consecutive stores:
10995 str w1, [xb, 0x100]
10996 str w1, [xb, 0x104]
10997 str w1, [xb, 0x108]
10998 str w1, [xb, 0x10c]
11000 Though the offsets are out of the range supported by stp, we can
11001 still pair them after adjusting the offset, like:
11003 add scratch, xb, 0x100
11004 stp w1, w1, [scratch]
11005 stp w1, w1, [scratch, 0x8]
11007 The peephole patterns detecting this opportunity should guarantee
11008 the scratch register is available. */
11010 bool
11011 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11012 enum machine_mode mode)
11014 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11015 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11016 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11017 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11019 if (load)
11021 reg_1 = operands[0];
11022 mem_1 = operands[1];
11023 reg_2 = operands[2];
11024 mem_2 = operands[3];
11025 reg_3 = operands[4];
11026 mem_3 = operands[5];
11027 reg_4 = operands[6];
11028 mem_4 = operands[7];
11029 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11030 && REG_P (reg_3) && REG_P (reg_4));
11031 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11032 return false;
11034 else
11036 mem_1 = operands[0];
11037 reg_1 = operands[1];
11038 mem_2 = operands[2];
11039 reg_2 = operands[3];
11040 mem_3 = operands[4];
11041 reg_3 = operands[5];
11042 mem_4 = operands[6];
11043 reg_4 = operands[7];
11045 /* Skip if the memory operand is by itself valid for ldp/stp. */
11046 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11047 return false;
11049 /* The mems cannot be volatile. */
11050 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11051 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11052 return false;
11054 /* Check if the addresses are in the form of [base+offset]. */
11055 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11056 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11057 return false;
11058 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11059 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11060 return false;
11061 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11062 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11063 return false;
11064 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11065 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11066 return false;
11068 /* Check if the bases are the same. */
11069 if (!rtx_equal_p (base_1, base_2)
11070 || !rtx_equal_p (base_2, base_3)
11071 || !rtx_equal_p (base_3, base_4))
11072 return false;
11074 offval_1 = INTVAL (offset_1);
11075 offval_2 = INTVAL (offset_2);
11076 offval_3 = INTVAL (offset_3);
11077 offval_4 = INTVAL (offset_4);
11078 msize = GET_MODE_SIZE (mode);
11079 /* Check if the offsets are consecutive. */
11080 if ((offval_1 != (offval_2 + msize)
11081 || offval_1 != (offval_3 + msize * 2)
11082 || offval_1 != (offval_4 + msize * 3))
11083 && (offval_4 != (offval_3 + msize)
11084 || offval_4 != (offval_2 + msize * 2)
11085 || offval_4 != (offval_1 + msize * 3)))
11086 return false;
11088 /* Check if the addresses are clobbered by load. */
11089 if (load)
11091 if (reg_mentioned_p (reg_1, mem_1)
11092 || reg_mentioned_p (reg_2, mem_2)
11093 || reg_mentioned_p (reg_3, mem_3))
11094 return false;
11096 /* In increasing order, the last load can clobber the address. */
11097 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11098 return false;
11101 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11102 rclass_1 = FP_REGS;
11103 else
11104 rclass_1 = GENERAL_REGS;
11106 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11107 rclass_2 = FP_REGS;
11108 else
11109 rclass_2 = GENERAL_REGS;
11111 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11112 rclass_3 = FP_REGS;
11113 else
11114 rclass_3 = GENERAL_REGS;
11116 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11117 rclass_4 = FP_REGS;
11118 else
11119 rclass_4 = GENERAL_REGS;
11121 /* Check if the registers are of the same class. */
11122 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11123 return false;
11125 return true;
11128 /* Given OPERANDS of consecutive load/store, this function pairs them
11129 into ldp/stp after adjusting the offset. It depends on the fact
11130 that addresses of load/store instructions are in increasing order.
11131 MODE is the mode of memory operands. CODE is the rtl operator
11132 which should be applied to all memory operands, it's SIGN_EXTEND,
11133 ZERO_EXTEND or UNKNOWN. */
11135 bool
11136 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11137 enum machine_mode mode, RTX_CODE code)
11139 rtx base, offset, t1, t2;
11140 rtx mem_1, mem_2, mem_3, mem_4;
11141 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11143 if (load)
11145 mem_1 = operands[1];
11146 mem_2 = operands[3];
11147 mem_3 = operands[5];
11148 mem_4 = operands[7];
11150 else
11152 mem_1 = operands[0];
11153 mem_2 = operands[2];
11154 mem_3 = operands[4];
11155 mem_4 = operands[6];
11156 gcc_assert (code == UNKNOWN);
11159 extract_base_offset_in_addr (mem_1, &base, &offset);
11160 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11162 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11163 msize = GET_MODE_SIZE (mode);
11164 stp_off_limit = msize * 0x40;
11165 off_val = INTVAL (offset);
11166 abs_off = (off_val < 0) ? -off_val : off_val;
11167 new_off = abs_off % stp_off_limit;
11168 adj_off = abs_off - new_off;
11170 /* Further adjust to make sure all offsets are OK. */
11171 if ((new_off + msize * 2) >= stp_off_limit)
11173 adj_off += stp_off_limit;
11174 new_off -= stp_off_limit;
11177 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11178 if (adj_off >= 0x1000)
11179 return false;
11181 if (off_val < 0)
11183 adj_off = -adj_off;
11184 new_off = -new_off;
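  /* For example, with SImode accesses (msize == 4, so stp_off_limit == 256)
     and an original offset of 0x100, this computes adj_off == 0x100 and
     new_off == 0, matching the add/stp sequence shown in the comment
     above.  */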
11187 /* Create new memory references. */
11188 mem_1 = change_address (mem_1, VOIDmode,
11189 plus_constant (DImode, operands[8], new_off));
11191 /* Check if the adjusted address is OK for ldp/stp. */
11192 if (!aarch64_mem_pair_operand (mem_1, mode))
11193 return false;
11195 msize = GET_MODE_SIZE (mode);
11196 mem_2 = change_address (mem_2, VOIDmode,
11197 plus_constant (DImode,
11198 operands[8],
11199 new_off + msize));
11200 mem_3 = change_address (mem_3, VOIDmode,
11201 plus_constant (DImode,
11202 operands[8],
11203 new_off + msize * 2));
11204 mem_4 = change_address (mem_4, VOIDmode,
11205 plus_constant (DImode,
11206 operands[8],
11207 new_off + msize * 3));
11209 if (code == ZERO_EXTEND)
11211 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11212 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11213 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11214 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11216 else if (code == SIGN_EXTEND)
11218 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11219 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11220 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11221 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11224 if (load)
11226 operands[1] = mem_1;
11227 operands[3] = mem_2;
11228 operands[5] = mem_3;
11229 operands[7] = mem_4;
11231 else
11233 operands[0] = mem_1;
11234 operands[2] = mem_2;
11235 operands[4] = mem_3;
11236 operands[6] = mem_4;
11239 /* Emit adjusting instruction. */
11240 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11241 plus_constant (DImode, base, adj_off)));
11242 /* Emit ldp/stp instructions. */
11243 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11244 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11245 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11246 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11247 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11248 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11249 return true;
11252 #undef TARGET_ADDRESS_COST
11253 #define TARGET_ADDRESS_COST aarch64_address_cost
11255 /* This hook determines whether unnamed bitfields affect the alignment
11256 of the containing structure. The hook returns true if the structure
11257 should inherit the alignment requirements of an unnamed bitfield's
11258 type. */
11259 #undef TARGET_ALIGN_ANON_BITFIELD
11260 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11262 #undef TARGET_ASM_ALIGNED_DI_OP
11263 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11265 #undef TARGET_ASM_ALIGNED_HI_OP
11266 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11268 #undef TARGET_ASM_ALIGNED_SI_OP
11269 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11271 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11272 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11273 hook_bool_const_tree_hwi_hwi_const_tree_true
11275 #undef TARGET_ASM_FILE_START
11276 #define TARGET_ASM_FILE_START aarch64_start_file
11278 #undef TARGET_ASM_OUTPUT_MI_THUNK
11279 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11281 #undef TARGET_ASM_SELECT_RTX_SECTION
11282 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11284 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11285 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11287 #undef TARGET_BUILD_BUILTIN_VA_LIST
11288 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11290 #undef TARGET_CALLEE_COPIES
11291 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11293 #undef TARGET_CAN_ELIMINATE
11294 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11296 #undef TARGET_CANNOT_FORCE_CONST_MEM
11297 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11299 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11300 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11302 /* Only the least significant bit is used for initialization guard
11303 variables. */
11304 #undef TARGET_CXX_GUARD_MASK_BIT
11305 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11307 #undef TARGET_C_MODE_FOR_SUFFIX
11308 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11310 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11311 #undef TARGET_DEFAULT_TARGET_FLAGS
11312 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11313 #endif
11315 #undef TARGET_CLASS_MAX_NREGS
11316 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11318 #undef TARGET_BUILTIN_DECL
11319 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11321 #undef TARGET_EXPAND_BUILTIN
11322 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11324 #undef TARGET_EXPAND_BUILTIN_VA_START
11325 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11327 #undef TARGET_FOLD_BUILTIN
11328 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11330 #undef TARGET_FUNCTION_ARG
11331 #define TARGET_FUNCTION_ARG aarch64_function_arg
11333 #undef TARGET_FUNCTION_ARG_ADVANCE
11334 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11336 #undef TARGET_FUNCTION_ARG_BOUNDARY
11337 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11339 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11340 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11342 #undef TARGET_FUNCTION_VALUE
11343 #define TARGET_FUNCTION_VALUE aarch64_function_value
11345 #undef TARGET_FUNCTION_VALUE_REGNO_P
11346 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11348 #undef TARGET_FRAME_POINTER_REQUIRED
11349 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11351 #undef TARGET_GIMPLE_FOLD_BUILTIN
11352 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11354 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11355 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11357 #undef TARGET_INIT_BUILTINS
11358 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11360 #undef TARGET_LEGITIMATE_ADDRESS_P
11361 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11363 #undef TARGET_LEGITIMATE_CONSTANT_P
11364 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11366 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11367 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11369 #undef TARGET_LRA_P
11370 #define TARGET_LRA_P hook_bool_void_true
11372 #undef TARGET_MANGLE_TYPE
11373 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11375 #undef TARGET_MEMORY_MOVE_COST
11376 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11378 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11379 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11381 #undef TARGET_MUST_PASS_IN_STACK
11382 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11384 /* This target hook should return true if accesses to volatile bitfields
11385 should use the narrowest mode possible. It should return false if these
11386 accesses should use the bitfield container type. */
11387 #undef TARGET_NARROW_VOLATILE_BITFIELD
11388 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11390 #undef TARGET_OPTION_OVERRIDE
11391 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11393 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11394 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11395 aarch64_override_options_after_change
11397 #undef TARGET_PASS_BY_REFERENCE
11398 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11400 #undef TARGET_PREFERRED_RELOAD_CLASS
11401 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11403 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11404 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11406 #undef TARGET_SECONDARY_RELOAD
11407 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11409 #undef TARGET_SHIFT_TRUNCATION_MASK
11410 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11412 #undef TARGET_SETUP_INCOMING_VARARGS
11413 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11415 #undef TARGET_STRUCT_VALUE_RTX
11416 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11418 #undef TARGET_REGISTER_MOVE_COST
11419 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11421 #undef TARGET_RETURN_IN_MEMORY
11422 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11424 #undef TARGET_RETURN_IN_MSB
11425 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11427 #undef TARGET_RTX_COSTS
11428 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11430 #undef TARGET_SCHED_ISSUE_RATE
11431 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11433 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11434 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11435 aarch64_sched_first_cycle_multipass_dfa_lookahead
11437 #undef TARGET_TRAMPOLINE_INIT
11438 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11440 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11441 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11443 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11444 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11446 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11447 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11449 #undef TARGET_VECTORIZE_ADD_STMT_COST
11450 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11452 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11453 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11454 aarch64_builtin_vectorization_cost
11456 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11457 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11459 #undef TARGET_VECTORIZE_BUILTINS
11460 #define TARGET_VECTORIZE_BUILTINS
11462 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11463 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11464 aarch64_builtin_vectorized_function
11466 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11467 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11468 aarch64_autovectorize_vector_sizes
11470 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11471 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11472 aarch64_atomic_assign_expand_fenv
11474 /* Section anchor support. */
11476 #undef TARGET_MIN_ANCHOR_OFFSET
11477 #define TARGET_MIN_ANCHOR_OFFSET -256
11479 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11480 byte offset; we can do much more for larger data types, but have no way
11481 to determine the size of the access. We assume accesses are aligned. */
11482 #undef TARGET_MAX_ANCHOR_OFFSET
11483 #define TARGET_MAX_ANCHOR_OFFSET 4095
11485 #undef TARGET_VECTOR_ALIGNMENT
11486 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11488 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11489 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11490 aarch64_simd_vector_alignment_reachable
11492 /* vec_perm support. */
11494 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11495 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11496 aarch64_vectorize_vec_perm_const_ok
11499 #undef TARGET_FIXED_CONDITION_CODE_REGS
11500 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11502 #undef TARGET_FLAGS_REGNUM
11503 #define TARGET_FLAGS_REGNUM CC_REGNUM
11505 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11506 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11508 #undef TARGET_ASAN_SHADOW_OFFSET
11509 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11511 #undef TARGET_LEGITIMIZE_ADDRESS
11512 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11514 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11515 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11516 aarch64_use_by_pieces_infrastructure_p
11518 #undef TARGET_CAN_USE_DOLOOP_P
11519 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11521 #undef TARGET_SCHED_MACRO_FUSION_P
11522 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11524 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11525 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11527 #undef TARGET_SCHED_FUSION_PRIORITY
11528 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11530 struct gcc_target targetm = TARGET_INITIALIZER;
11532 #include "gt-aarch64.h"