[AArch64] Remove -mlra/-mno-lra option for Aarch64
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blobfdb0116048a355d920a66fb8896f58ddc4ddb947
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
98 /* Defined for convenience. */
99 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
101 /* Classifies an address.
103 ADDRESS_REG_IMM
104 A simple base register plus immediate offset.
106 ADDRESS_REG_WB
107 A base register indexed by immediate offset with writeback.
109 ADDRESS_REG_REG
110 A base register indexed by (optionally scaled) register.
112 ADDRESS_REG_UXTW
113 A base register indexed by (optionally scaled) zero-extended register.
115 ADDRESS_REG_SXTW
116 A base register indexed by (optionally scaled) sign-extended register.
118 ADDRESS_LO_SUM
119 A LO_SUM rtx with a base register and "LO12" symbol relocation.
121 ADDRESS_SYMBOLIC:
122 A constant symbolic address, in pc-relative literal pool. */
124 enum aarch64_address_type {
125 ADDRESS_REG_IMM,
126 ADDRESS_REG_WB,
127 ADDRESS_REG_REG,
128 ADDRESS_REG_UXTW,
129 ADDRESS_REG_SXTW,
130 ADDRESS_LO_SUM,
131 ADDRESS_SYMBOLIC
134 struct aarch64_address_info {
135 enum aarch64_address_type type;
136 rtx base;
137 rtx offset;
138 int shift;
139 enum aarch64_symbol_type symbol_type;
142 struct simd_immediate_info
144 rtx value;
145 int shift;
146 int element_width;
147 bool mvn;
148 bool msl;
151 /* The current code model. */
152 enum aarch64_code_model aarch64_cmodel;
154 #ifdef HAVE_AS_TLS
155 #undef TARGET_HAVE_TLS
156 #define TARGET_HAVE_TLS 1
157 #endif
159 static bool aarch64_composite_type_p (const_tree, machine_mode);
160 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
161 const_tree,
162 machine_mode *, int *,
163 bool *);
164 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
165 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
166 static void aarch64_override_options_after_change (void);
167 static bool aarch64_vector_mode_supported_p (machine_mode);
168 static unsigned bit_count (unsigned HOST_WIDE_INT);
169 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
170 const unsigned char *sel);
171 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
179 /* The current tuning set. */
180 const struct tune_params *aarch64_tune_params;
182 /* Mask to specify which instructions we are allowed to generate. */
183 unsigned long aarch64_isa_flags = 0;
185 /* Mask to specify which instruction scheduling options should be used. */
186 unsigned long aarch64_tune_flags = 0;
188 /* Tuning parameters. */
190 #if HAVE_DESIGNATED_INITIALIZERS
191 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
192 #else
193 #define NAMED_PARAM(NAME, VAL) (VAL)
194 #endif
196 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
197 __extension__
198 #endif
200 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
201 __extension__
202 #endif
203 static const struct cpu_addrcost_table generic_addrcost_table =
205 #if HAVE_DESIGNATED_INITIALIZERS
206 .addr_scale_costs =
207 #endif
209 NAMED_PARAM (hi, 0),
210 NAMED_PARAM (si, 0),
211 NAMED_PARAM (di, 0),
212 NAMED_PARAM (ti, 0),
214 NAMED_PARAM (pre_modify, 0),
215 NAMED_PARAM (post_modify, 0),
216 NAMED_PARAM (register_offset, 0),
217 NAMED_PARAM (register_extend, 0),
218 NAMED_PARAM (imm_offset, 0)
221 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
222 __extension__
223 #endif
224 static const struct cpu_addrcost_table cortexa57_addrcost_table =
226 #if HAVE_DESIGNATED_INITIALIZERS
227 .addr_scale_costs =
228 #endif
230 NAMED_PARAM (hi, 1),
231 NAMED_PARAM (si, 0),
232 NAMED_PARAM (di, 0),
233 NAMED_PARAM (ti, 1),
235 NAMED_PARAM (pre_modify, 0),
236 NAMED_PARAM (post_modify, 0),
237 NAMED_PARAM (register_offset, 0),
238 NAMED_PARAM (register_extend, 0),
239 NAMED_PARAM (imm_offset, 0),
242 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
243 __extension__
244 #endif
245 static const struct cpu_addrcost_table xgene1_addrcost_table =
247 #if HAVE_DESIGNATED_INITIALIZERS
248 .addr_scale_costs =
249 #endif
251 NAMED_PARAM (hi, 1),
252 NAMED_PARAM (si, 0),
253 NAMED_PARAM (di, 0),
254 NAMED_PARAM (ti, 1),
256 NAMED_PARAM (pre_modify, 1),
257 NAMED_PARAM (post_modify, 0),
258 NAMED_PARAM (register_offset, 0),
259 NAMED_PARAM (register_extend, 1),
260 NAMED_PARAM (imm_offset, 0),
263 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
264 __extension__
265 #endif
266 static const struct cpu_regmove_cost generic_regmove_cost =
268 NAMED_PARAM (GP2GP, 1),
269 /* Avoid the use of slow int<->fp moves for spilling by setting
270 their cost higher than memmov_cost. */
271 NAMED_PARAM (GP2FP, 5),
272 NAMED_PARAM (FP2GP, 5),
273 NAMED_PARAM (FP2FP, 2)
276 static const struct cpu_regmove_cost cortexa57_regmove_cost =
278 NAMED_PARAM (GP2GP, 1),
279 /* Avoid the use of slow int<->fp moves for spilling by setting
280 their cost higher than memmov_cost. */
281 NAMED_PARAM (GP2FP, 5),
282 NAMED_PARAM (FP2GP, 5),
283 NAMED_PARAM (FP2FP, 2)
286 static const struct cpu_regmove_cost cortexa53_regmove_cost =
288 NAMED_PARAM (GP2GP, 1),
289 /* Avoid the use of slow int<->fp moves for spilling by setting
290 their cost higher than memmov_cost. */
291 NAMED_PARAM (GP2FP, 5),
292 NAMED_PARAM (FP2GP, 5),
293 NAMED_PARAM (FP2FP, 2)
296 static const struct cpu_regmove_cost thunderx_regmove_cost =
298 NAMED_PARAM (GP2GP, 2),
299 NAMED_PARAM (GP2FP, 2),
300 NAMED_PARAM (FP2GP, 6),
301 NAMED_PARAM (FP2FP, 4)
304 static const struct cpu_regmove_cost xgene1_regmove_cost =
306 NAMED_PARAM (GP2GP, 1),
307 /* Avoid the use of slow int<->fp moves for spilling by setting
308 their cost higher than memmov_cost. */
309 NAMED_PARAM (GP2FP, 8),
310 NAMED_PARAM (FP2GP, 8),
311 NAMED_PARAM (FP2FP, 2)
314 /* Generic costs for vector insn classes. */
315 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
316 __extension__
317 #endif
318 static const struct cpu_vector_cost generic_vector_cost =
320 NAMED_PARAM (scalar_stmt_cost, 1),
321 NAMED_PARAM (scalar_load_cost, 1),
322 NAMED_PARAM (scalar_store_cost, 1),
323 NAMED_PARAM (vec_stmt_cost, 1),
324 NAMED_PARAM (vec_to_scalar_cost, 1),
325 NAMED_PARAM (scalar_to_vec_cost, 1),
326 NAMED_PARAM (vec_align_load_cost, 1),
327 NAMED_PARAM (vec_unalign_load_cost, 1),
328 NAMED_PARAM (vec_unalign_store_cost, 1),
329 NAMED_PARAM (vec_store_cost, 1),
330 NAMED_PARAM (cond_taken_branch_cost, 3),
331 NAMED_PARAM (cond_not_taken_branch_cost, 1)
334 /* Generic costs for vector insn classes. */
335 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
336 __extension__
337 #endif
338 static const struct cpu_vector_cost cortexa57_vector_cost =
340 NAMED_PARAM (scalar_stmt_cost, 1),
341 NAMED_PARAM (scalar_load_cost, 4),
342 NAMED_PARAM (scalar_store_cost, 1),
343 NAMED_PARAM (vec_stmt_cost, 3),
344 NAMED_PARAM (vec_to_scalar_cost, 8),
345 NAMED_PARAM (scalar_to_vec_cost, 8),
346 NAMED_PARAM (vec_align_load_cost, 5),
347 NAMED_PARAM (vec_unalign_load_cost, 5),
348 NAMED_PARAM (vec_unalign_store_cost, 1),
349 NAMED_PARAM (vec_store_cost, 1),
350 NAMED_PARAM (cond_taken_branch_cost, 1),
351 NAMED_PARAM (cond_not_taken_branch_cost, 1)
354 /* Generic costs for vector insn classes. */
355 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
356 __extension__
357 #endif
358 static const struct cpu_vector_cost xgene1_vector_cost =
360 NAMED_PARAM (scalar_stmt_cost, 1),
361 NAMED_PARAM (scalar_load_cost, 5),
362 NAMED_PARAM (scalar_store_cost, 1),
363 NAMED_PARAM (vec_stmt_cost, 2),
364 NAMED_PARAM (vec_to_scalar_cost, 4),
365 NAMED_PARAM (scalar_to_vec_cost, 4),
366 NAMED_PARAM (vec_align_load_cost, 10),
367 NAMED_PARAM (vec_unalign_load_cost, 10),
368 NAMED_PARAM (vec_unalign_store_cost, 2),
369 NAMED_PARAM (vec_store_cost, 2),
370 NAMED_PARAM (cond_taken_branch_cost, 2),
371 NAMED_PARAM (cond_not_taken_branch_cost, 1)
374 #define AARCH64_FUSE_NOTHING (0)
375 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
376 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
377 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
378 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
379 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
381 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
382 __extension__
383 #endif
384 static const struct tune_params generic_tunings =
386 &cortexa57_extra_costs,
387 &generic_addrcost_table,
388 &generic_regmove_cost,
389 &generic_vector_cost,
390 NAMED_PARAM (memmov_cost, 4),
391 NAMED_PARAM (issue_rate, 2),
392 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING),
393 8, /* function_align. */
394 8, /* jump_align. */
395 4, /* loop_align. */
396 2, /* int_reassoc_width. */
397 4, /* fp_reassoc_width. */
398 1 /* vec_reassoc_width. */
401 static const struct tune_params cortexa53_tunings =
403 &cortexa53_extra_costs,
404 &generic_addrcost_table,
405 &cortexa53_regmove_cost,
406 &generic_vector_cost,
407 NAMED_PARAM (memmov_cost, 4),
408 NAMED_PARAM (issue_rate, 2),
409 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
410 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR)),
411 8, /* function_align. */
412 8, /* jump_align. */
413 4, /* loop_align. */
414 2, /* int_reassoc_width. */
415 4, /* fp_reassoc_width. */
416 1 /* vec_reassoc_width. */
419 static const struct tune_params cortexa57_tunings =
421 &cortexa57_extra_costs,
422 &cortexa57_addrcost_table,
423 &cortexa57_regmove_cost,
424 &cortexa57_vector_cost,
425 NAMED_PARAM (memmov_cost, 4),
426 NAMED_PARAM (issue_rate, 3),
427 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK)),
428 16, /* function_align. */
429 8, /* jump_align. */
430 4, /* loop_align. */
431 2, /* int_reassoc_width. */
432 4, /* fp_reassoc_width. */
433 1 /* vec_reassoc_width. */
436 static const struct tune_params thunderx_tunings =
438 &thunderx_extra_costs,
439 &generic_addrcost_table,
440 &thunderx_regmove_cost,
441 &generic_vector_cost,
442 NAMED_PARAM (memmov_cost, 6),
443 NAMED_PARAM (issue_rate, 2),
444 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH),
445 8, /* function_align. */
446 8, /* jump_align. */
447 8, /* loop_align. */
448 2, /* int_reassoc_width. */
449 4, /* fp_reassoc_width. */
450 1 /* vec_reassoc_width. */
453 static const struct tune_params xgene1_tunings =
455 &xgene1_extra_costs,
456 &xgene1_addrcost_table,
457 &xgene1_regmove_cost,
458 &xgene1_vector_cost,
459 NAMED_PARAM (memmov_cost, 6),
460 NAMED_PARAM (issue_rate, 4),
461 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING),
462 16, /* function_align. */
463 8, /* jump_align. */
464 16, /* loop_align. */
465 2, /* int_reassoc_width. */
466 4, /* fp_reassoc_width. */
467 1 /* vec_reassoc_width. */
470 /* A processor implementing AArch64. */
471 struct processor
473 const char *const name;
474 enum aarch64_processor core;
475 const char *arch;
476 unsigned architecture_version;
477 const unsigned long flags;
478 const struct tune_params *const tune;
481 /* Processor cores implementing AArch64. */
482 static const struct processor all_cores[] =
484 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
485 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
486 #include "aarch64-cores.def"
487 #undef AARCH64_CORE
488 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
489 {NULL, aarch64_none, NULL, 0, 0, NULL}
492 /* Architectures implementing AArch64. */
493 static const struct processor all_architectures[] =
495 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
496 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
497 #include "aarch64-arches.def"
498 #undef AARCH64_ARCH
499 {NULL, aarch64_none, NULL, 0, 0, NULL}
502 /* Target specification. These are populated as commandline arguments
503 are processed, or NULL if not specified. */
504 static const struct processor *selected_arch;
505 static const struct processor *selected_cpu;
506 static const struct processor *selected_tune;
508 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
510 /* An ISA extension in the co-processor and main instruction set space. */
511 struct aarch64_option_extension
513 const char *const name;
514 const unsigned long flags_on;
515 const unsigned long flags_off;
518 /* ISA extensions in AArch64. */
519 static const struct aarch64_option_extension all_extensions[] =
521 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
522 {NAME, FLAGS_ON, FLAGS_OFF},
523 #include "aarch64-option-extensions.def"
524 #undef AARCH64_OPT_EXTENSION
525 {NULL, 0, 0}
528 /* Used to track the size of an address when generating a pre/post
529 increment address. */
530 static machine_mode aarch64_memory_reference_mode;
532 /* Used to force GTY into this file. */
533 static GTY(()) int gty_dummy;
535 /* A table of valid AArch64 "bitmask immediate" values for
536 logical instructions. */
538 #define AARCH64_NUM_BITMASKS 5334
539 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
541 typedef enum aarch64_cond_code
543 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
544 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
545 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
547 aarch64_cc;
549 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
551 /* The condition codes of the processor, and the inverse function. */
552 static const char * const aarch64_condition_codes[] =
554 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
555 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
558 static unsigned int
559 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
561 return 2;
564 static int
565 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
566 enum machine_mode mode)
568 if (VECTOR_MODE_P (mode))
569 return aarch64_tune_params->vec_reassoc_width;
570 if (INTEGRAL_MODE_P (mode))
571 return aarch64_tune_params->int_reassoc_width;
572 if (FLOAT_MODE_P (mode))
573 return aarch64_tune_params->fp_reassoc_width;
574 return 1;
577 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
578 unsigned
579 aarch64_dbx_register_number (unsigned regno)
581 if (GP_REGNUM_P (regno))
582 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
583 else if (regno == SP_REGNUM)
584 return AARCH64_DWARF_SP;
585 else if (FP_REGNUM_P (regno))
586 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
588 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
589 equivalent DWARF register. */
590 return DWARF_FRAME_REGISTERS;
593 /* Return TRUE if MODE is any of the large INT modes. */
594 static bool
595 aarch64_vect_struct_mode_p (machine_mode mode)
597 return mode == OImode || mode == CImode || mode == XImode;
600 /* Return TRUE if MODE is any of the vector modes. */
601 static bool
602 aarch64_vector_mode_p (machine_mode mode)
604 return aarch64_vector_mode_supported_p (mode)
605 || aarch64_vect_struct_mode_p (mode);
608 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
609 static bool
610 aarch64_array_mode_supported_p (machine_mode mode,
611 unsigned HOST_WIDE_INT nelems)
613 if (TARGET_SIMD
614 && AARCH64_VALID_SIMD_QREG_MODE (mode)
615 && (nelems >= 2 && nelems <= 4))
616 return true;
618 return false;
621 /* Implement HARD_REGNO_NREGS. */
624 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
626 switch (aarch64_regno_regclass (regno))
628 case FP_REGS:
629 case FP_LO_REGS:
630 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
631 default:
632 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
634 gcc_unreachable ();
637 /* Implement HARD_REGNO_MODE_OK. */
640 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
642 if (GET_MODE_CLASS (mode) == MODE_CC)
643 return regno == CC_REGNUM;
645 if (regno == SP_REGNUM)
646 /* The purpose of comparing with ptr_mode is to support the
647 global register variable associated with the stack pointer
648 register via the syntax of asm ("wsp") in ILP32. */
649 return mode == Pmode || mode == ptr_mode;
651 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
652 return mode == Pmode;
654 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
655 return 1;
657 if (FP_REGNUM_P (regno))
659 if (aarch64_vect_struct_mode_p (mode))
660 return
661 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
662 else
663 return 1;
666 return 0;
669 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
670 machine_mode
671 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
672 machine_mode mode)
674 /* Handle modes that fit within single registers. */
675 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
677 if (GET_MODE_SIZE (mode) >= 4)
678 return mode;
679 else
680 return SImode;
682 /* Fall back to generic for multi-reg and very large modes. */
683 else
684 return choose_hard_reg_mode (regno, nregs, false);
687 /* Return true if calls to DECL should be treated as
688 long-calls (ie called via a register). */
689 static bool
690 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
692 return false;
695 /* Return true if calls to symbol-ref SYM should be treated as
696 long-calls (ie called via a register). */
697 bool
698 aarch64_is_long_call_p (rtx sym)
700 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
703 /* Return true if the offsets to a zero/sign-extract operation
704 represent an expression that matches an extend operation. The
705 operands represent the paramters from
707 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
708 bool
709 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
710 rtx extract_imm)
712 HOST_WIDE_INT mult_val, extract_val;
714 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
715 return false;
717 mult_val = INTVAL (mult_imm);
718 extract_val = INTVAL (extract_imm);
720 if (extract_val > 8
721 && extract_val < GET_MODE_BITSIZE (mode)
722 && exact_log2 (extract_val & ~7) > 0
723 && (extract_val & 7) <= 4
724 && mult_val == (1 << (extract_val & 7)))
725 return true;
727 return false;
730 /* Emit an insn that's a simple single-set. Both the operands must be
731 known to be valid. */
732 inline static rtx
733 emit_set_insn (rtx x, rtx y)
735 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
738 /* X and Y are two things to compare using CODE. Emit the compare insn and
739 return the rtx for register 0 in the proper mode. */
741 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
743 machine_mode mode = SELECT_CC_MODE (code, x, y);
744 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
746 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
747 return cc_reg;
750 /* Build the SYMBOL_REF for __tls_get_addr. */
752 static GTY(()) rtx tls_get_addr_libfunc;
755 aarch64_tls_get_addr (void)
757 if (!tls_get_addr_libfunc)
758 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
759 return tls_get_addr_libfunc;
762 /* Return the TLS model to use for ADDR. */
764 static enum tls_model
765 tls_symbolic_operand_type (rtx addr)
767 enum tls_model tls_kind = TLS_MODEL_NONE;
768 rtx sym, addend;
770 if (GET_CODE (addr) == CONST)
772 split_const (addr, &sym, &addend);
773 if (GET_CODE (sym) == SYMBOL_REF)
774 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
776 else if (GET_CODE (addr) == SYMBOL_REF)
777 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
779 return tls_kind;
782 /* We'll allow lo_sum's in addresses in our legitimate addresses
783 so that combine would take care of combining addresses where
784 necessary, but for generation purposes, we'll generate the address
785 as :
786 RTL Absolute
787 tmp = hi (symbol_ref); adrp x1, foo
788 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
791 PIC TLS
792 adrp x1, :got:foo adrp tmp, :tlsgd:foo
793 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
794 bl __tls_get_addr
797 Load TLS symbol, depending on TLS mechanism and TLS access model.
799 Global Dynamic - Traditional TLS:
800 adrp tmp, :tlsgd:imm
801 add dest, tmp, #:tlsgd_lo12:imm
802 bl __tls_get_addr
804 Global Dynamic - TLS Descriptors:
805 adrp dest, :tlsdesc:imm
806 ldr tmp, [dest, #:tlsdesc_lo12:imm]
807 add dest, dest, #:tlsdesc_lo12:imm
808 blr tmp
809 mrs tp, tpidr_el0
810 add dest, dest, tp
812 Initial Exec:
813 mrs tp, tpidr_el0
814 adrp tmp, :gottprel:imm
815 ldr dest, [tmp, #:gottprel_lo12:imm]
816 add dest, dest, tp
818 Local Exec:
819 mrs tp, tpidr_el0
820 add t0, tp, #:tprel_hi12:imm
821 add t0, #:tprel_lo12_nc:imm
824 static void
825 aarch64_load_symref_appropriately (rtx dest, rtx imm,
826 enum aarch64_symbol_type type)
828 switch (type)
830 case SYMBOL_SMALL_ABSOLUTE:
832 /* In ILP32, the mode of dest can be either SImode or DImode. */
833 rtx tmp_reg = dest;
834 machine_mode mode = GET_MODE (dest);
836 gcc_assert (mode == Pmode || mode == ptr_mode);
838 if (can_create_pseudo_p ())
839 tmp_reg = gen_reg_rtx (mode);
841 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
842 emit_insn (gen_add_losym (dest, tmp_reg, imm));
843 return;
846 case SYMBOL_TINY_ABSOLUTE:
847 emit_insn (gen_rtx_SET (Pmode, dest, imm));
848 return;
850 case SYMBOL_SMALL_GOT:
852 /* In ILP32, the mode of dest can be either SImode or DImode,
853 while the got entry is always of SImode size. The mode of
854 dest depends on how dest is used: if dest is assigned to a
855 pointer (e.g. in the memory), it has SImode; it may have
856 DImode if dest is dereferenced to access the memeory.
857 This is why we have to handle three different ldr_got_small
858 patterns here (two patterns for ILP32). */
859 rtx tmp_reg = dest;
860 machine_mode mode = GET_MODE (dest);
862 if (can_create_pseudo_p ())
863 tmp_reg = gen_reg_rtx (mode);
865 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
866 if (mode == ptr_mode)
868 if (mode == DImode)
869 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
870 else
871 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
873 else
875 gcc_assert (mode == Pmode);
876 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
879 return;
882 case SYMBOL_SMALL_TLSGD:
884 rtx_insn *insns;
885 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
887 start_sequence ();
888 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
889 insns = get_insns ();
890 end_sequence ();
892 RTL_CONST_CALL_P (insns) = 1;
893 emit_libcall_block (insns, dest, result, imm);
894 return;
897 case SYMBOL_SMALL_TLSDESC:
899 machine_mode mode = GET_MODE (dest);
900 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
901 rtx tp;
903 gcc_assert (mode == Pmode || mode == ptr_mode);
905 /* In ILP32, the got entry is always of SImode size. Unlike
906 small GOT, the dest is fixed at reg 0. */
907 if (TARGET_ILP32)
908 emit_insn (gen_tlsdesc_small_si (imm));
909 else
910 emit_insn (gen_tlsdesc_small_di (imm));
911 tp = aarch64_load_tp (NULL);
913 if (mode != Pmode)
914 tp = gen_lowpart (mode, tp);
916 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
917 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
918 return;
921 case SYMBOL_SMALL_GOTTPREL:
923 /* In ILP32, the mode of dest can be either SImode or DImode,
924 while the got entry is always of SImode size. The mode of
925 dest depends on how dest is used: if dest is assigned to a
926 pointer (e.g. in the memory), it has SImode; it may have
927 DImode if dest is dereferenced to access the memeory.
928 This is why we have to handle three different tlsie_small
929 patterns here (two patterns for ILP32). */
930 machine_mode mode = GET_MODE (dest);
931 rtx tmp_reg = gen_reg_rtx (mode);
932 rtx tp = aarch64_load_tp (NULL);
934 if (mode == ptr_mode)
936 if (mode == DImode)
937 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
938 else
940 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
941 tp = gen_lowpart (mode, tp);
944 else
946 gcc_assert (mode == Pmode);
947 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
950 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
951 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
952 return;
955 case SYMBOL_SMALL_TPREL:
957 rtx tp = aarch64_load_tp (NULL);
958 emit_insn (gen_tlsle_small (dest, tp, imm));
959 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
960 return;
963 case SYMBOL_TINY_GOT:
964 emit_insn (gen_ldr_got_tiny (dest, imm));
965 return;
967 default:
968 gcc_unreachable ();
972 /* Emit a move from SRC to DEST. Assume that the move expanders can
973 handle all moves if !can_create_pseudo_p (). The distinction is
974 important because, unlike emit_move_insn, the move expanders know
975 how to force Pmode objects into the constant pool even when the
976 constant pool address is not itself legitimate. */
977 static rtx
978 aarch64_emit_move (rtx dest, rtx src)
980 return (can_create_pseudo_p ()
981 ? emit_move_insn (dest, src)
982 : emit_move_insn_1 (dest, src));
985 /* Split a 128-bit move operation into two 64-bit move operations,
986 taking care to handle partial overlap of register to register
987 copies. Special cases are needed when moving between GP regs and
988 FP regs. SRC can be a register, constant or memory; DST a register
989 or memory. If either operand is memory it must not have any side
990 effects. */
991 void
992 aarch64_split_128bit_move (rtx dst, rtx src)
994 rtx dst_lo, dst_hi;
995 rtx src_lo, src_hi;
997 machine_mode mode = GET_MODE (dst);
999 gcc_assert (mode == TImode || mode == TFmode);
1000 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1001 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1003 if (REG_P (dst) && REG_P (src))
1005 int src_regno = REGNO (src);
1006 int dst_regno = REGNO (dst);
1008 /* Handle FP <-> GP regs. */
1009 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1011 src_lo = gen_lowpart (word_mode, src);
1012 src_hi = gen_highpart (word_mode, src);
1014 if (mode == TImode)
1016 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1017 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1019 else
1021 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1022 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1024 return;
1026 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1028 dst_lo = gen_lowpart (word_mode, dst);
1029 dst_hi = gen_highpart (word_mode, dst);
1031 if (mode == TImode)
1033 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1034 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1036 else
1038 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1039 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1041 return;
1045 dst_lo = gen_lowpart (word_mode, dst);
1046 dst_hi = gen_highpart (word_mode, dst);
1047 src_lo = gen_lowpart (word_mode, src);
1048 src_hi = gen_highpart_mode (word_mode, mode, src);
1050 /* At most one pairing may overlap. */
1051 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1053 aarch64_emit_move (dst_hi, src_hi);
1054 aarch64_emit_move (dst_lo, src_lo);
1056 else
1058 aarch64_emit_move (dst_lo, src_lo);
1059 aarch64_emit_move (dst_hi, src_hi);
1063 bool
1064 aarch64_split_128bit_move_p (rtx dst, rtx src)
1066 return (! REG_P (src)
1067 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1070 /* Split a complex SIMD combine. */
1072 void
1073 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1075 machine_mode src_mode = GET_MODE (src1);
1076 machine_mode dst_mode = GET_MODE (dst);
1078 gcc_assert (VECTOR_MODE_P (dst_mode));
1080 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1082 rtx (*gen) (rtx, rtx, rtx);
1084 switch (src_mode)
1086 case V8QImode:
1087 gen = gen_aarch64_simd_combinev8qi;
1088 break;
1089 case V4HImode:
1090 gen = gen_aarch64_simd_combinev4hi;
1091 break;
1092 case V2SImode:
1093 gen = gen_aarch64_simd_combinev2si;
1094 break;
1095 case V2SFmode:
1096 gen = gen_aarch64_simd_combinev2sf;
1097 break;
1098 case DImode:
1099 gen = gen_aarch64_simd_combinedi;
1100 break;
1101 case DFmode:
1102 gen = gen_aarch64_simd_combinedf;
1103 break;
1104 default:
1105 gcc_unreachable ();
1108 emit_insn (gen (dst, src1, src2));
1109 return;
1113 /* Split a complex SIMD move. */
1115 void
1116 aarch64_split_simd_move (rtx dst, rtx src)
1118 machine_mode src_mode = GET_MODE (src);
1119 machine_mode dst_mode = GET_MODE (dst);
1121 gcc_assert (VECTOR_MODE_P (dst_mode));
1123 if (REG_P (dst) && REG_P (src))
1125 rtx (*gen) (rtx, rtx);
1127 gcc_assert (VECTOR_MODE_P (src_mode));
1129 switch (src_mode)
1131 case V16QImode:
1132 gen = gen_aarch64_split_simd_movv16qi;
1133 break;
1134 case V8HImode:
1135 gen = gen_aarch64_split_simd_movv8hi;
1136 break;
1137 case V4SImode:
1138 gen = gen_aarch64_split_simd_movv4si;
1139 break;
1140 case V2DImode:
1141 gen = gen_aarch64_split_simd_movv2di;
1142 break;
1143 case V4SFmode:
1144 gen = gen_aarch64_split_simd_movv4sf;
1145 break;
1146 case V2DFmode:
1147 gen = gen_aarch64_split_simd_movv2df;
1148 break;
1149 default:
1150 gcc_unreachable ();
1153 emit_insn (gen (dst, src));
1154 return;
1158 static rtx
1159 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1161 if (can_create_pseudo_p ())
1162 return force_reg (mode, value);
1163 else
1165 x = aarch64_emit_move (x, value);
1166 return x;
1171 static rtx
1172 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1174 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1176 rtx high;
1177 /* Load the full offset into a register. This
1178 might be improvable in the future. */
1179 high = GEN_INT (offset);
1180 offset = 0;
1181 high = aarch64_force_temporary (mode, temp, high);
1182 reg = aarch64_force_temporary (mode, temp,
1183 gen_rtx_PLUS (mode, high, reg));
1185 return plus_constant (mode, reg, offset);
1188 static int
1189 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1190 machine_mode mode)
1192 unsigned HOST_WIDE_INT mask;
1193 int i;
1194 bool first;
1195 unsigned HOST_WIDE_INT val;
1196 bool subtargets;
1197 rtx subtarget;
1198 int one_match, zero_match, first_not_ffff_match;
1199 int num_insns = 0;
1201 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1203 if (generate)
1204 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1205 num_insns++;
1206 return num_insns;
1209 if (mode == SImode)
1211 /* We know we can't do this in 1 insn, and we must be able to do it
1212 in two; so don't mess around looking for sequences that don't buy
1213 us anything. */
1214 if (generate)
1216 emit_insn (gen_rtx_SET (VOIDmode, dest,
1217 GEN_INT (INTVAL (imm) & 0xffff)));
1218 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1219 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1221 num_insns += 2;
1222 return num_insns;
1225 /* Remaining cases are all for DImode. */
1227 val = INTVAL (imm);
1228 subtargets = optimize && can_create_pseudo_p ();
1230 one_match = 0;
1231 zero_match = 0;
1232 mask = 0xffff;
1233 first_not_ffff_match = -1;
1235 for (i = 0; i < 64; i += 16, mask <<= 16)
1237 if ((val & mask) == mask)
1238 one_match++;
1239 else
1241 if (first_not_ffff_match < 0)
1242 first_not_ffff_match = i;
1243 if ((val & mask) == 0)
1244 zero_match++;
1248 if (one_match == 2)
1250 /* Set one of the quarters and then insert back into result. */
1251 mask = 0xffffll << first_not_ffff_match;
1252 if (generate)
1254 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1255 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1256 GEN_INT ((val >> first_not_ffff_match)
1257 & 0xffff)));
1259 num_insns += 2;
1260 return num_insns;
1263 if (zero_match == 2)
1264 goto simple_sequence;
1266 mask = 0x0ffff0000UL;
1267 for (i = 16; i < 64; i += 16, mask <<= 16)
1269 HOST_WIDE_INT comp = mask & ~(mask - 1);
1271 if (aarch64_uimm12_shift (val - (val & mask)))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val & mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val & mask))));
1281 num_insns += 2;
1282 return num_insns;
1284 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1286 if (generate)
1288 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1289 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1290 GEN_INT ((val + comp) & mask)));
1291 emit_insn (gen_adddi3 (dest, subtarget,
1292 GEN_INT (val - ((val + comp) & mask))));
1294 num_insns += 2;
1295 return num_insns;
1297 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1299 if (generate)
1301 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1302 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1303 GEN_INT ((val - comp) | ~mask)));
1304 emit_insn (gen_adddi3 (dest, subtarget,
1305 GEN_INT (val - ((val - comp) | ~mask))));
1307 num_insns += 2;
1308 return num_insns;
1310 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1312 if (generate)
1314 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1315 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1316 GEN_INT (val | ~mask)));
1317 emit_insn (gen_adddi3 (dest, subtarget,
1318 GEN_INT (val - (val | ~mask))));
1320 num_insns += 2;
1321 return num_insns;
1325 /* See if we can do it by arithmetically combining two
1326 immediates. */
1327 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1329 int j;
1330 mask = 0xffff;
1332 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1333 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_adddi3 (dest, subtarget,
1341 GEN_INT (val - aarch64_bitmasks[i])));
1343 num_insns += 2;
1344 return num_insns;
1347 for (j = 0; j < 64; j += 16, mask <<= 16)
1349 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1351 if (generate)
1353 emit_insn (gen_rtx_SET (VOIDmode, dest,
1354 GEN_INT (aarch64_bitmasks[i])));
1355 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1356 GEN_INT ((val >> j) & 0xffff)));
1358 num_insns += 2;
1359 return num_insns;
1364 /* See if we can do it by logically combining two immediates. */
1365 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1367 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1369 int j;
1371 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1372 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1374 if (generate)
1376 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1377 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1378 GEN_INT (aarch64_bitmasks[i])));
1379 emit_insn (gen_iordi3 (dest, subtarget,
1380 GEN_INT (aarch64_bitmasks[j])));
1382 num_insns += 2;
1383 return num_insns;
1386 else if ((val & aarch64_bitmasks[i]) == val)
1388 int j;
1390 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1391 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1393 if (generate)
1395 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1396 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1397 GEN_INT (aarch64_bitmasks[j])));
1398 emit_insn (gen_anddi3 (dest, subtarget,
1399 GEN_INT (aarch64_bitmasks[i])));
1401 num_insns += 2;
1402 return num_insns;
1407 if (one_match > zero_match)
1409 /* Set either first three quarters or all but the third. */
1410 mask = 0xffffll << (16 - first_not_ffff_match);
1411 if (generate)
1412 emit_insn (gen_rtx_SET (VOIDmode, dest,
1413 GEN_INT (val | mask | 0xffffffff00000000ull)));
1414 num_insns ++;
1416 /* Now insert other two quarters. */
1417 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1418 i < 64; i += 16, mask <<= 16)
1420 if ((val & mask) != mask)
1422 if (generate)
1423 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1424 GEN_INT ((val >> i) & 0xffff)));
1425 num_insns ++;
1428 return num_insns;
1431 simple_sequence:
1432 first = true;
1433 mask = 0xffff;
1434 for (i = 0; i < 64; i += 16, mask <<= 16)
1436 if ((val & mask) != 0)
1438 if (first)
1440 if (generate)
1441 emit_insn (gen_rtx_SET (VOIDmode, dest,
1442 GEN_INT (val & mask)));
1443 num_insns ++;
1444 first = false;
1446 else
1448 if (generate)
1449 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1450 GEN_INT ((val >> i) & 0xffff)));
1451 num_insns ++;
1456 return num_insns;
1460 void
1461 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1463 machine_mode mode = GET_MODE (dest);
1465 gcc_assert (mode == SImode || mode == DImode);
1467 /* Check on what type of symbol it is. */
1468 if (GET_CODE (imm) == SYMBOL_REF
1469 || GET_CODE (imm) == LABEL_REF
1470 || GET_CODE (imm) == CONST)
1472 rtx mem, base, offset;
1473 enum aarch64_symbol_type sty;
1475 /* If we have (const (plus symbol offset)), separate out the offset
1476 before we start classifying the symbol. */
1477 split_const (imm, &base, &offset);
1479 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1480 switch (sty)
1482 case SYMBOL_FORCE_TO_MEM:
1483 if (offset != const0_rtx
1484 && targetm.cannot_force_const_mem (mode, imm))
1486 gcc_assert (can_create_pseudo_p ());
1487 base = aarch64_force_temporary (mode, dest, base);
1488 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1489 aarch64_emit_move (dest, base);
1490 return;
1492 mem = force_const_mem (ptr_mode, imm);
1493 gcc_assert (mem);
1494 if (mode != ptr_mode)
1495 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1496 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1499 case SYMBOL_SMALL_TLSGD:
1500 case SYMBOL_SMALL_TLSDESC:
1501 case SYMBOL_SMALL_GOTTPREL:
1502 case SYMBOL_SMALL_GOT:
1503 case SYMBOL_TINY_GOT:
1504 if (offset != const0_rtx)
1506 gcc_assert(can_create_pseudo_p ());
1507 base = aarch64_force_temporary (mode, dest, base);
1508 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1509 aarch64_emit_move (dest, base);
1510 return;
1512 /* FALLTHRU */
1514 case SYMBOL_SMALL_TPREL:
1515 case SYMBOL_SMALL_ABSOLUTE:
1516 case SYMBOL_TINY_ABSOLUTE:
1517 aarch64_load_symref_appropriately (dest, imm, sty);
1518 return;
1520 default:
1521 gcc_unreachable ();
1525 if (!CONST_INT_P (imm))
1527 if (GET_CODE (imm) == HIGH)
1528 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1529 else
1531 rtx mem = force_const_mem (mode, imm);
1532 gcc_assert (mem);
1533 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1536 return;
1539 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1542 static bool
1543 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1544 tree exp ATTRIBUTE_UNUSED)
1546 /* Currently, always true. */
1547 return true;
1550 /* Implement TARGET_PASS_BY_REFERENCE. */
1552 static bool
1553 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1554 machine_mode mode,
1555 const_tree type,
1556 bool named ATTRIBUTE_UNUSED)
1558 HOST_WIDE_INT size;
1559 machine_mode dummymode;
1560 int nregs;
1562 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1563 size = (mode == BLKmode && type)
1564 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1566 /* Aggregates are passed by reference based on their size. */
1567 if (type && AGGREGATE_TYPE_P (type))
1569 size = int_size_in_bytes (type);
1572 /* Variable sized arguments are always returned by reference. */
1573 if (size < 0)
1574 return true;
1576 /* Can this be a candidate to be passed in fp/simd register(s)? */
1577 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1578 &dummymode, &nregs,
1579 NULL))
1580 return false;
1582 /* Arguments which are variable sized or larger than 2 registers are
1583 passed by reference unless they are a homogenous floating point
1584 aggregate. */
1585 return size > 2 * UNITS_PER_WORD;
1588 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1589 static bool
1590 aarch64_return_in_msb (const_tree valtype)
1592 machine_mode dummy_mode;
1593 int dummy_int;
1595 /* Never happens in little-endian mode. */
1596 if (!BYTES_BIG_ENDIAN)
1597 return false;
1599 /* Only composite types smaller than or equal to 16 bytes can
1600 be potentially returned in registers. */
1601 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1602 || int_size_in_bytes (valtype) <= 0
1603 || int_size_in_bytes (valtype) > 16)
1604 return false;
1606 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1607 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1608 is always passed/returned in the least significant bits of fp/simd
1609 register(s). */
1610 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1611 &dummy_mode, &dummy_int, NULL))
1612 return false;
1614 return true;
1617 /* Implement TARGET_FUNCTION_VALUE.
1618 Define how to find the value returned by a function. */
1620 static rtx
1621 aarch64_function_value (const_tree type, const_tree func,
1622 bool outgoing ATTRIBUTE_UNUSED)
1624 machine_mode mode;
1625 int unsignedp;
1626 int count;
1627 machine_mode ag_mode;
1629 mode = TYPE_MODE (type);
1630 if (INTEGRAL_TYPE_P (type))
1631 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1633 if (aarch64_return_in_msb (type))
1635 HOST_WIDE_INT size = int_size_in_bytes (type);
1637 if (size % UNITS_PER_WORD != 0)
1639 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1640 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1644 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1645 &ag_mode, &count, NULL))
1647 if (!aarch64_composite_type_p (type, mode))
1649 gcc_assert (count == 1 && mode == ag_mode);
1650 return gen_rtx_REG (mode, V0_REGNUM);
1652 else
1654 int i;
1655 rtx par;
1657 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1658 for (i = 0; i < count; i++)
1660 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1661 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1662 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1663 XVECEXP (par, 0, i) = tmp;
1665 return par;
1668 else
1669 return gen_rtx_REG (mode, R0_REGNUM);
1672 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1673 Return true if REGNO is the number of a hard register in which the values
1674 of called function may come back. */
1676 static bool
1677 aarch64_function_value_regno_p (const unsigned int regno)
1679 /* Maximum of 16 bytes can be returned in the general registers. Examples
1680 of 16-byte return values are: 128-bit integers and 16-byte small
1681 structures (excluding homogeneous floating-point aggregates). */
1682 if (regno == R0_REGNUM || regno == R1_REGNUM)
1683 return true;
1685 /* Up to four fp/simd registers can return a function value, e.g. a
1686 homogeneous floating-point aggregate having four members. */
1687 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1688 return !TARGET_GENERAL_REGS_ONLY;
1690 return false;
1693 /* Implement TARGET_RETURN_IN_MEMORY.
1695 If the type T of the result of a function is such that
1696 void func (T arg)
1697 would require that arg be passed as a value in a register (or set of
1698 registers) according to the parameter passing rules, then the result
1699 is returned in the same registers as would be used for such an
1700 argument. */
1702 static bool
1703 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1705 HOST_WIDE_INT size;
1706 machine_mode ag_mode;
1707 int count;
1709 if (!AGGREGATE_TYPE_P (type)
1710 && TREE_CODE (type) != COMPLEX_TYPE
1711 && TREE_CODE (type) != VECTOR_TYPE)
1712 /* Simple scalar types always returned in registers. */
1713 return false;
1715 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1716 type,
1717 &ag_mode,
1718 &count,
1719 NULL))
1720 return false;
1722 /* Types larger than 2 registers returned in memory. */
1723 size = int_size_in_bytes (type);
1724 return (size < 0 || size > 2 * UNITS_PER_WORD);
1727 static bool
1728 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1729 const_tree type, int *nregs)
1731 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1732 return aarch64_vfp_is_call_or_return_candidate (mode,
1733 type,
1734 &pcum->aapcs_vfp_rmode,
1735 nregs,
1736 NULL);
1739 /* Given MODE and TYPE of a function argument, return the alignment in
1740 bits. The idea is to suppress any stronger alignment requested by
1741 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1742 This is a helper function for local use only. */
1744 static unsigned int
1745 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1747 unsigned int alignment;
1749 if (type)
1751 if (!integer_zerop (TYPE_SIZE (type)))
1753 if (TYPE_MODE (type) == mode)
1754 alignment = TYPE_ALIGN (type);
1755 else
1756 alignment = GET_MODE_ALIGNMENT (mode);
1758 else
1759 alignment = 0;
1761 else
1762 alignment = GET_MODE_ALIGNMENT (mode);
1764 return alignment;
1767 /* Layout a function argument according to the AAPCS64 rules. The rule
1768 numbers refer to the rule numbers in the AAPCS64. */
1770 static void
1771 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1772 const_tree type,
1773 bool named ATTRIBUTE_UNUSED)
1775 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1776 int ncrn, nvrn, nregs;
1777 bool allocate_ncrn, allocate_nvrn;
1778 HOST_WIDE_INT size;
1780 /* We need to do this once per argument. */
1781 if (pcum->aapcs_arg_processed)
1782 return;
1784 pcum->aapcs_arg_processed = true;
1786 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1787 size
1788 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1789 UNITS_PER_WORD);
1791 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1792 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1793 mode,
1794 type,
1795 &nregs);
1797 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1798 The following code thus handles passing by SIMD/FP registers first. */
1800 nvrn = pcum->aapcs_nvrn;
1802 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1803 and homogenous short-vector aggregates (HVA). */
1804 if (allocate_nvrn)
1806 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1808 pcum->aapcs_nextnvrn = nvrn + nregs;
1809 if (!aarch64_composite_type_p (type, mode))
1811 gcc_assert (nregs == 1);
1812 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1814 else
1816 rtx par;
1817 int i;
1818 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1819 for (i = 0; i < nregs; i++)
1821 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1822 V0_REGNUM + nvrn + i);
1823 tmp = gen_rtx_EXPR_LIST
1824 (VOIDmode, tmp,
1825 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1826 XVECEXP (par, 0, i) = tmp;
1828 pcum->aapcs_reg = par;
1830 return;
1832 else
1834 /* C.3 NSRN is set to 8. */
1835 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1836 goto on_stack;
1840 ncrn = pcum->aapcs_ncrn;
1841 nregs = size / UNITS_PER_WORD;
1843 /* C6 - C9. though the sign and zero extension semantics are
1844 handled elsewhere. This is the case where the argument fits
1845 entirely general registers. */
1846 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1848 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1850 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1852 /* C.8 if the argument has an alignment of 16 then the NGRN is
1853 rounded up to the next even number. */
1854 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1856 ++ncrn;
1857 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1859 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1860 A reg is still generated for it, but the caller should be smart
1861 enough not to use it. */
1862 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1864 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1866 else
1868 rtx par;
1869 int i;
1871 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1872 for (i = 0; i < nregs; i++)
1874 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1875 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1876 GEN_INT (i * UNITS_PER_WORD));
1877 XVECEXP (par, 0, i) = tmp;
1879 pcum->aapcs_reg = par;
1882 pcum->aapcs_nextncrn = ncrn + nregs;
1883 return;
1886 /* C.11 */
1887 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1889 /* The argument is passed on stack; record the needed number of words for
1890 this argument and align the total size if necessary. */
1891 on_stack:
1892 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1893 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1894 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1895 16 / UNITS_PER_WORD);
1896 return;
1899 /* Implement TARGET_FUNCTION_ARG. */
1901 static rtx
1902 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1903 const_tree type, bool named)
1905 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1906 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1908 if (mode == VOIDmode)
1909 return NULL_RTX;
1911 aarch64_layout_arg (pcum_v, mode, type, named);
1912 return pcum->aapcs_reg;
1915 void
1916 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1917 const_tree fntype ATTRIBUTE_UNUSED,
1918 rtx libname ATTRIBUTE_UNUSED,
1919 const_tree fndecl ATTRIBUTE_UNUSED,
1920 unsigned n_named ATTRIBUTE_UNUSED)
1922 pcum->aapcs_ncrn = 0;
1923 pcum->aapcs_nvrn = 0;
1924 pcum->aapcs_nextncrn = 0;
1925 pcum->aapcs_nextnvrn = 0;
1926 pcum->pcs_variant = ARM_PCS_AAPCS64;
1927 pcum->aapcs_reg = NULL_RTX;
1928 pcum->aapcs_arg_processed = false;
1929 pcum->aapcs_stack_words = 0;
1930 pcum->aapcs_stack_size = 0;
1932 return;
1935 static void
1936 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1937 machine_mode mode,
1938 const_tree type,
1939 bool named)
1941 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1942 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1944 aarch64_layout_arg (pcum_v, mode, type, named);
1945 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1946 != (pcum->aapcs_stack_words != 0));
1947 pcum->aapcs_arg_processed = false;
1948 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1949 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1950 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1951 pcum->aapcs_stack_words = 0;
1952 pcum->aapcs_reg = NULL_RTX;
1956 bool
1957 aarch64_function_arg_regno_p (unsigned regno)
1959 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1960 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1963 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1964 PARM_BOUNDARY bits of alignment, but will be given anything up
1965 to STACK_BOUNDARY bits if the type requires it. This makes sure
1966 that both before and after the layout of each argument, the Next
1967 Stacked Argument Address (NSAA) will have a minimum alignment of
1968 8 bytes. */
1970 static unsigned int
1971 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1973 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1975 if (alignment < PARM_BOUNDARY)
1976 alignment = PARM_BOUNDARY;
1977 if (alignment > STACK_BOUNDARY)
1978 alignment = STACK_BOUNDARY;
1979 return alignment;
1982 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1984 Return true if an argument passed on the stack should be padded upwards,
1985 i.e. if the least-significant byte of the stack slot has useful data.
1987 Small aggregate types are placed in the lowest memory address.
1989 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1991 bool
1992 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1994 /* On little-endian targets, the least significant byte of every stack
1995 argument is passed at the lowest byte address of the stack slot. */
1996 if (!BYTES_BIG_ENDIAN)
1997 return true;
1999 /* Otherwise, integral, floating-point and pointer types are padded downward:
2000 the least significant byte of a stack argument is passed at the highest
2001 byte address of the stack slot. */
2002 if (type
2003 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2004 || POINTER_TYPE_P (type))
2005 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2006 return false;
2008 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2009 return true;
2012 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2014 It specifies padding for the last (may also be the only)
2015 element of a block move between registers and memory. If
2016 assuming the block is in the memory, padding upward means that
2017 the last element is padded after its highest significant byte,
2018 while in downward padding, the last element is padded at the
2019 its least significant byte side.
2021 Small aggregates and small complex types are always padded
2022 upwards.
2024 We don't need to worry about homogeneous floating-point or
2025 short-vector aggregates; their move is not affected by the
2026 padding direction determined here. Regardless of endianness,
2027 each element of such an aggregate is put in the least
2028 significant bits of a fp/simd register.
2030 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2031 register has useful data, and return the opposite if the most
2032 significant byte does. */
2034 bool
2035 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2036 bool first ATTRIBUTE_UNUSED)
2039 /* Small composite types are always padded upward. */
2040 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2042 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2043 : GET_MODE_SIZE (mode));
2044 if (size < 2 * UNITS_PER_WORD)
2045 return true;
2048 /* Otherwise, use the default padding. */
2049 return !BYTES_BIG_ENDIAN;
2052 static machine_mode
2053 aarch64_libgcc_cmp_return_mode (void)
2055 return SImode;
2058 static bool
2059 aarch64_frame_pointer_required (void)
2061 /* In aarch64_override_options_after_change
2062 flag_omit_leaf_frame_pointer turns off the frame pointer by
2063 default. Turn it back on now if we've not got a leaf
2064 function. */
2065 if (flag_omit_leaf_frame_pointer
2066 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2067 return true;
2069 return false;
2072 /* Mark the registers that need to be saved by the callee and calculate
2073 the size of the callee-saved registers area and frame record (both FP
2074 and LR may be omitted). */
2075 static void
2076 aarch64_layout_frame (void)
2078 HOST_WIDE_INT offset = 0;
2079 int regno;
2081 if (reload_completed && cfun->machine->frame.laid_out)
2082 return;
2084 #define SLOT_NOT_REQUIRED (-2)
2085 #define SLOT_REQUIRED (-1)
2087 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2088 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2090 /* First mark all the registers that really need to be saved... */
2091 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2092 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2094 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2095 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2097 /* ... that includes the eh data registers (if needed)... */
2098 if (crtl->calls_eh_return)
2099 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2100 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2101 = SLOT_REQUIRED;
2103 /* ... and any callee saved register that dataflow says is live. */
2104 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2105 if (df_regs_ever_live_p (regno)
2106 && (regno == R30_REGNUM
2107 || !call_used_regs[regno]))
2108 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2110 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2111 if (df_regs_ever_live_p (regno)
2112 && !call_used_regs[regno])
2113 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2115 if (frame_pointer_needed)
2117 /* FP and LR are placed in the linkage record. */
2118 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2119 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2120 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2121 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2122 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2123 offset += 2 * UNITS_PER_WORD;
2126 /* Now assign stack slots for them. */
2127 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2128 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2130 cfun->machine->frame.reg_offset[regno] = offset;
2131 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2132 cfun->machine->frame.wb_candidate1 = regno;
2133 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2134 cfun->machine->frame.wb_candidate2 = regno;
2135 offset += UNITS_PER_WORD;
2138 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2139 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2141 cfun->machine->frame.reg_offset[regno] = offset;
2142 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2143 cfun->machine->frame.wb_candidate1 = regno;
2144 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2145 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2146 cfun->machine->frame.wb_candidate2 = regno;
2147 offset += UNITS_PER_WORD;
2150 cfun->machine->frame.padding0 =
2151 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2152 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2154 cfun->machine->frame.saved_regs_size = offset;
2156 cfun->machine->frame.hard_fp_offset
2157 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2158 + get_frame_size ()
2159 + cfun->machine->frame.saved_regs_size,
2160 STACK_BOUNDARY / BITS_PER_UNIT);
2162 cfun->machine->frame.frame_size
2163 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2164 + crtl->outgoing_args_size,
2165 STACK_BOUNDARY / BITS_PER_UNIT);
2167 cfun->machine->frame.laid_out = true;
2170 static bool
2171 aarch64_register_saved_on_entry (int regno)
2173 return cfun->machine->frame.reg_offset[regno] >= 0;
2176 static unsigned
2177 aarch64_next_callee_save (unsigned regno, unsigned limit)
2179 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2180 regno ++;
2181 return regno;
2184 static void
2185 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2186 HOST_WIDE_INT adjustment)
2188 rtx base_rtx = stack_pointer_rtx;
2189 rtx insn, reg, mem;
2191 reg = gen_rtx_REG (mode, regno);
2192 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2193 plus_constant (Pmode, base_rtx, -adjustment));
2194 mem = gen_rtx_MEM (mode, mem);
2196 insn = emit_move_insn (mem, reg);
2197 RTX_FRAME_RELATED_P (insn) = 1;
2200 static rtx
2201 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2202 HOST_WIDE_INT adjustment)
2204 switch (mode)
2206 case DImode:
2207 return gen_storewb_pairdi_di (base, base, reg, reg2,
2208 GEN_INT (-adjustment),
2209 GEN_INT (UNITS_PER_WORD - adjustment));
2210 case DFmode:
2211 return gen_storewb_pairdf_di (base, base, reg, reg2,
2212 GEN_INT (-adjustment),
2213 GEN_INT (UNITS_PER_WORD - adjustment));
2214 default:
2215 gcc_unreachable ();
2219 static void
2220 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2221 unsigned regno2, HOST_WIDE_INT adjustment)
2223 rtx_insn *insn;
2224 rtx reg1 = gen_rtx_REG (mode, regno1);
2225 rtx reg2 = gen_rtx_REG (mode, regno2);
2227 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2228 reg2, adjustment));
2229 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2230 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2231 RTX_FRAME_RELATED_P (insn) = 1;
2234 static rtx
2235 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2236 HOST_WIDE_INT adjustment)
2238 switch (mode)
2240 case DImode:
2241 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2242 GEN_INT (UNITS_PER_WORD));
2243 case DFmode:
2244 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2245 GEN_INT (UNITS_PER_WORD));
2246 default:
2247 gcc_unreachable ();
2251 static rtx
2252 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2253 rtx reg2)
2255 switch (mode)
2257 case DImode:
2258 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2260 case DFmode:
2261 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2263 default:
2264 gcc_unreachable ();
2268 static rtx
2269 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2270 rtx mem2)
2272 switch (mode)
2274 case DImode:
2275 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2277 case DFmode:
2278 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2280 default:
2281 gcc_unreachable ();
2286 static void
2287 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2288 unsigned start, unsigned limit, bool skip_wb)
2290 rtx_insn *insn;
2291 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2292 ? gen_frame_mem : gen_rtx_MEM);
2293 unsigned regno;
2294 unsigned regno2;
2296 for (regno = aarch64_next_callee_save (start, limit);
2297 regno <= limit;
2298 regno = aarch64_next_callee_save (regno + 1, limit))
2300 rtx reg, mem;
2301 HOST_WIDE_INT offset;
2303 if (skip_wb
2304 && (regno == cfun->machine->frame.wb_candidate1
2305 || regno == cfun->machine->frame.wb_candidate2))
2306 continue;
2308 reg = gen_rtx_REG (mode, regno);
2309 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2310 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2311 offset));
2313 regno2 = aarch64_next_callee_save (regno + 1, limit);
2315 if (regno2 <= limit
2316 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2317 == cfun->machine->frame.reg_offset[regno2]))
2320 rtx reg2 = gen_rtx_REG (mode, regno2);
2321 rtx mem2;
2323 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2324 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2325 offset));
2326 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2327 reg2));
2329 /* The first part of a frame-related parallel insn is
2330 always assumed to be relevant to the frame
2331 calculations; subsequent parts, are only
2332 frame-related if explicitly marked. */
2333 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2334 regno = regno2;
2336 else
2337 insn = emit_move_insn (mem, reg);
2339 RTX_FRAME_RELATED_P (insn) = 1;
2343 static void
2344 aarch64_restore_callee_saves (machine_mode mode,
2345 HOST_WIDE_INT start_offset, unsigned start,
2346 unsigned limit, bool skip_wb, rtx *cfi_ops)
2348 rtx base_rtx = stack_pointer_rtx;
2349 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2350 ? gen_frame_mem : gen_rtx_MEM);
2351 unsigned regno;
2352 unsigned regno2;
2353 HOST_WIDE_INT offset;
2355 for (regno = aarch64_next_callee_save (start, limit);
2356 regno <= limit;
2357 regno = aarch64_next_callee_save (regno + 1, limit))
2359 rtx reg, mem;
2361 if (skip_wb
2362 && (regno == cfun->machine->frame.wb_candidate1
2363 || regno == cfun->machine->frame.wb_candidate2))
2364 continue;
2366 reg = gen_rtx_REG (mode, regno);
2367 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2368 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2370 regno2 = aarch64_next_callee_save (regno + 1, limit);
2372 if (regno2 <= limit
2373 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2374 == cfun->machine->frame.reg_offset[regno2]))
2376 rtx reg2 = gen_rtx_REG (mode, regno2);
2377 rtx mem2;
2379 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2380 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2381 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2383 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2384 regno = regno2;
2386 else
2387 emit_move_insn (reg, mem);
2388 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2392 /* AArch64 stack frames generated by this compiler look like:
2394 +-------------------------------+
2396 | incoming stack arguments |
2398 +-------------------------------+
2399 | | <-- incoming stack pointer (aligned)
2400 | callee-allocated save area |
2401 | for register varargs |
2403 +-------------------------------+
2404 | local variables | <-- frame_pointer_rtx
2406 +-------------------------------+
2407 | padding0 | \
2408 +-------------------------------+ |
2409 | callee-saved registers | | frame.saved_regs_size
2410 +-------------------------------+ |
2411 | LR' | |
2412 +-------------------------------+ |
2413 | FP' | / <- hard_frame_pointer_rtx (aligned)
2414 +-------------------------------+
2415 | dynamic allocation |
2416 +-------------------------------+
2417 | padding |
2418 +-------------------------------+
2419 | outgoing stack arguments | <-- arg_pointer
2421 +-------------------------------+
2422 | | <-- stack_pointer_rtx (aligned)
2424 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2425 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2426 unchanged. */
2428 /* Generate the prologue instructions for entry into a function.
2429 Establish the stack frame by decreasing the stack pointer with a
2430 properly calculated size and, if necessary, create a frame record
2431 filled with the values of LR and previous frame pointer. The
2432 current FP is also set up if it is in use. */
2434 void
2435 aarch64_expand_prologue (void)
2437 /* sub sp, sp, #<frame_size>
2438 stp {fp, lr}, [sp, #<frame_size> - 16]
2439 add fp, sp, #<frame_size> - hardfp_offset
2440 stp {cs_reg}, [fp, #-16] etc.
2442 sub sp, sp, <final_adjustment_if_any>
2444 HOST_WIDE_INT frame_size, offset;
2445 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2446 HOST_WIDE_INT hard_fp_offset;
2447 rtx_insn *insn;
2449 aarch64_layout_frame ();
2451 offset = frame_size = cfun->machine->frame.frame_size;
2452 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2453 fp_offset = frame_size - hard_fp_offset;
2455 if (flag_stack_usage_info)
2456 current_function_static_stack_size = frame_size;
2458 /* Store pairs and load pairs have a range only -512 to 504. */
2459 if (offset >= 512)
2461 /* When the frame has a large size, an initial decrease is done on
2462 the stack pointer to jump over the callee-allocated save area for
2463 register varargs, the local variable area and/or the callee-saved
2464 register area. This will allow the pre-index write-back
2465 store pair instructions to be used for setting up the stack frame
2466 efficiently. */
2467 offset = hard_fp_offset;
2468 if (offset >= 512)
2469 offset = cfun->machine->frame.saved_regs_size;
2471 frame_size -= (offset + crtl->outgoing_args_size);
2472 fp_offset = 0;
2474 if (frame_size >= 0x1000000)
2476 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2477 emit_move_insn (op0, GEN_INT (-frame_size));
2478 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2480 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2481 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2482 plus_constant (Pmode, stack_pointer_rtx,
2483 -frame_size)));
2484 RTX_FRAME_RELATED_P (insn) = 1;
2486 else if (frame_size > 0)
2488 int hi_ofs = frame_size & 0xfff000;
2489 int lo_ofs = frame_size & 0x000fff;
2491 if (hi_ofs)
2493 insn = emit_insn (gen_add2_insn
2494 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2495 RTX_FRAME_RELATED_P (insn) = 1;
2497 if (lo_ofs)
2499 insn = emit_insn (gen_add2_insn
2500 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2501 RTX_FRAME_RELATED_P (insn) = 1;
2505 else
2506 frame_size = -1;
2508 if (offset > 0)
2510 bool skip_wb = false;
2512 if (frame_pointer_needed)
2514 skip_wb = true;
2516 if (fp_offset)
2518 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2519 GEN_INT (-offset)));
2520 RTX_FRAME_RELATED_P (insn) = 1;
2522 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2523 R30_REGNUM, false);
2525 else
2526 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2528 /* Set up frame pointer to point to the location of the
2529 previous frame pointer on the stack. */
2530 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2531 stack_pointer_rtx,
2532 GEN_INT (fp_offset)));
2533 RTX_FRAME_RELATED_P (insn) = 1;
2534 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2536 else
2538 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2539 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2541 if (fp_offset
2542 || reg1 == FIRST_PSEUDO_REGISTER
2543 || (reg2 == FIRST_PSEUDO_REGISTER
2544 && offset >= 256))
2546 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2547 GEN_INT (-offset)));
2548 RTX_FRAME_RELATED_P (insn) = 1;
2550 else
2552 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2554 skip_wb = true;
2556 if (reg2 == FIRST_PSEUDO_REGISTER)
2557 aarch64_pushwb_single_reg (mode1, reg1, offset);
2558 else
2559 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2563 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2564 skip_wb);
2565 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2566 skip_wb);
2569 /* when offset >= 512,
2570 sub sp, sp, #<outgoing_args_size> */
2571 if (frame_size > -1)
2573 if (crtl->outgoing_args_size > 0)
2575 insn = emit_insn (gen_add2_insn
2576 (stack_pointer_rtx,
2577 GEN_INT (- crtl->outgoing_args_size)));
2578 RTX_FRAME_RELATED_P (insn) = 1;
2583 /* Return TRUE if we can use a simple_return insn.
2585 This function checks whether the callee saved stack is empty, which
2586 means no restore actions are need. The pro_and_epilogue will use
2587 this to check whether shrink-wrapping opt is feasible. */
2589 bool
2590 aarch64_use_return_insn_p (void)
2592 if (!reload_completed)
2593 return false;
2595 if (crtl->profile)
2596 return false;
2598 aarch64_layout_frame ();
2600 return cfun->machine->frame.frame_size == 0;
2603 /* Generate the epilogue instructions for returning from a function. */
2604 void
2605 aarch64_expand_epilogue (bool for_sibcall)
2607 HOST_WIDE_INT frame_size, offset;
2608 HOST_WIDE_INT fp_offset;
2609 HOST_WIDE_INT hard_fp_offset;
2610 rtx_insn *insn;
2611 /* We need to add memory barrier to prevent read from deallocated stack. */
2612 bool need_barrier_p = (get_frame_size () != 0
2613 || cfun->machine->frame.saved_varargs_size);
2615 aarch64_layout_frame ();
2617 offset = frame_size = cfun->machine->frame.frame_size;
2618 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2619 fp_offset = frame_size - hard_fp_offset;
2621 /* Store pairs and load pairs have a range only -512 to 504. */
2622 if (offset >= 512)
2624 offset = hard_fp_offset;
2625 if (offset >= 512)
2626 offset = cfun->machine->frame.saved_regs_size;
2628 frame_size -= (offset + crtl->outgoing_args_size);
2629 fp_offset = 0;
2630 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2632 insn = emit_insn (gen_add2_insn
2633 (stack_pointer_rtx,
2634 GEN_INT (crtl->outgoing_args_size)));
2635 RTX_FRAME_RELATED_P (insn) = 1;
2638 else
2639 frame_size = -1;
2641 /* If there were outgoing arguments or we've done dynamic stack
2642 allocation, then restore the stack pointer from the frame
2643 pointer. This is at most one insn and more efficient than using
2644 GCC's internal mechanism. */
2645 if (frame_pointer_needed
2646 && (crtl->outgoing_args_size || cfun->calls_alloca))
2648 if (cfun->calls_alloca)
2649 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2651 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2652 hard_frame_pointer_rtx,
2653 GEN_INT (0)));
2654 offset = offset - fp_offset;
2657 if (offset > 0)
2659 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2660 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2661 bool skip_wb = true;
2662 rtx cfi_ops = NULL;
2664 if (frame_pointer_needed)
2665 fp_offset = 0;
2666 else if (fp_offset
2667 || reg1 == FIRST_PSEUDO_REGISTER
2668 || (reg2 == FIRST_PSEUDO_REGISTER
2669 && offset >= 256))
2670 skip_wb = false;
2672 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2673 skip_wb, &cfi_ops);
2674 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2675 skip_wb, &cfi_ops);
2677 if (need_barrier_p)
2678 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2680 if (skip_wb)
2682 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2683 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2685 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2686 if (reg2 == FIRST_PSEUDO_REGISTER)
2688 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2689 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2690 mem = gen_rtx_MEM (mode1, mem);
2691 insn = emit_move_insn (rreg1, mem);
2693 else
2695 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2697 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2698 insn = emit_insn (aarch64_gen_loadwb_pair
2699 (mode1, stack_pointer_rtx, rreg1,
2700 rreg2, offset));
2703 else
2705 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2706 GEN_INT (offset)));
2709 /* Reset the CFA to be SP + FRAME_SIZE. */
2710 rtx new_cfa = stack_pointer_rtx;
2711 if (frame_size > 0)
2712 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2713 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2714 REG_NOTES (insn) = cfi_ops;
2715 RTX_FRAME_RELATED_P (insn) = 1;
2718 if (frame_size > 0)
2720 if (need_barrier_p)
2721 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2723 if (frame_size >= 0x1000000)
2725 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2726 emit_move_insn (op0, GEN_INT (frame_size));
2727 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2729 else
2731 int hi_ofs = frame_size & 0xfff000;
2732 int lo_ofs = frame_size & 0x000fff;
2734 if (hi_ofs && lo_ofs)
2736 insn = emit_insn (gen_add2_insn
2737 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2738 RTX_FRAME_RELATED_P (insn) = 1;
2739 frame_size = lo_ofs;
2741 insn = emit_insn (gen_add2_insn
2742 (stack_pointer_rtx, GEN_INT (frame_size)));
2745 /* Reset the CFA to be SP + 0. */
2746 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2747 RTX_FRAME_RELATED_P (insn) = 1;
2750 /* Stack adjustment for exception handler. */
2751 if (crtl->calls_eh_return)
2753 /* We need to unwind the stack by the offset computed by
2754 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2755 to be SP; letting the CFA move during this adjustment
2756 is just as correct as retaining the CFA from the body
2757 of the function. Therefore, do nothing special. */
2758 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2761 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2762 if (!for_sibcall)
2763 emit_jump_insn (ret_rtx);
2766 /* Return the place to copy the exception unwinding return address to.
2767 This will probably be a stack slot, but could (in theory be the
2768 return register). */
2770 aarch64_final_eh_return_addr (void)
2772 HOST_WIDE_INT fp_offset;
2774 aarch64_layout_frame ();
2776 fp_offset = cfun->machine->frame.frame_size
2777 - cfun->machine->frame.hard_fp_offset;
2779 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2780 return gen_rtx_REG (DImode, LR_REGNUM);
2782 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2783 result in a store to save LR introduced by builtin_eh_return () being
2784 incorrectly deleted because the alias is not detected.
2785 So in the calculation of the address to copy the exception unwinding
2786 return address to, we note 2 cases.
2787 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2788 we return a SP-relative location since all the addresses are SP-relative
2789 in this case. This prevents the store from being optimized away.
2790 If the fp_offset is not 0, then the addresses will be FP-relative and
2791 therefore we return a FP-relative location. */
2793 if (frame_pointer_needed)
2795 if (fp_offset)
2796 return gen_frame_mem (DImode,
2797 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2798 else
2799 return gen_frame_mem (DImode,
2800 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2803 /* If FP is not needed, we calculate the location of LR, which would be
2804 at the top of the saved registers block. */
2806 return gen_frame_mem (DImode,
2807 plus_constant (Pmode,
2808 stack_pointer_rtx,
2809 fp_offset
2810 + cfun->machine->frame.saved_regs_size
2811 - 2 * UNITS_PER_WORD));
2814 /* Possibly output code to build up a constant in a register. For
2815 the benefit of the costs infrastructure, returns the number of
2816 instructions which would be emitted. GENERATE inhibits or
2817 enables code generation. */
2819 static int
2820 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2822 int insns = 0;
2824 if (aarch64_bitmask_imm (val, DImode))
2826 if (generate)
2827 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2828 insns = 1;
2830 else
2832 int i;
2833 int ncount = 0;
2834 int zcount = 0;
2835 HOST_WIDE_INT valp = val >> 16;
2836 HOST_WIDE_INT valm;
2837 HOST_WIDE_INT tval;
2839 for (i = 16; i < 64; i += 16)
2841 valm = (valp & 0xffff);
2843 if (valm != 0)
2844 ++ zcount;
2846 if (valm != 0xffff)
2847 ++ ncount;
2849 valp >>= 16;
2852 /* zcount contains the number of additional MOVK instructions
2853 required if the constant is built up with an initial MOVZ instruction,
2854 while ncount is the number of MOVK instructions required if starting
2855 with a MOVN instruction. Choose the sequence that yields the fewest
2856 number of instructions, preferring MOVZ instructions when they are both
2857 the same. */
2858 if (ncount < zcount)
2860 if (generate)
2861 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2862 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2863 tval = 0xffff;
2864 insns++;
2866 else
2868 if (generate)
2869 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2870 GEN_INT (val & 0xffff));
2871 tval = 0;
2872 insns++;
2875 val >>= 16;
2877 for (i = 16; i < 64; i += 16)
2879 if ((val & 0xffff) != tval)
2881 if (generate)
2882 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2883 GEN_INT (i),
2884 GEN_INT (val & 0xffff)));
2885 insns++;
2887 val >>= 16;
2890 return insns;
2893 static void
2894 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2896 HOST_WIDE_INT mdelta = delta;
2897 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2898 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2900 if (mdelta < 0)
2901 mdelta = -mdelta;
2903 if (mdelta >= 4096 * 4096)
2905 (void) aarch64_build_constant (scratchreg, delta, true);
2906 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2908 else if (mdelta > 0)
2910 if (mdelta >= 4096)
2912 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2913 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2914 if (delta < 0)
2915 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2916 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2917 else
2918 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2919 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2921 if (mdelta % 4096 != 0)
2923 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2924 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2925 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2930 /* Output code to add DELTA to the first argument, and then jump
2931 to FUNCTION. Used for C++ multiple inheritance. */
2932 static void
2933 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2934 HOST_WIDE_INT delta,
2935 HOST_WIDE_INT vcall_offset,
2936 tree function)
2938 /* The this pointer is always in x0. Note that this differs from
2939 Arm where the this pointer maybe bumped to r1 if r0 is required
2940 to return a pointer to an aggregate. On AArch64 a result value
2941 pointer will be in x8. */
2942 int this_regno = R0_REGNUM;
2943 rtx this_rtx, temp0, temp1, addr, funexp;
2944 rtx_insn *insn;
2946 reload_completed = 1;
2947 emit_note (NOTE_INSN_PROLOGUE_END);
2949 if (vcall_offset == 0)
2950 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2951 else
2953 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2955 this_rtx = gen_rtx_REG (Pmode, this_regno);
2956 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2957 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2959 addr = this_rtx;
2960 if (delta != 0)
2962 if (delta >= -256 && delta < 256)
2963 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2964 plus_constant (Pmode, this_rtx, delta));
2965 else
2966 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2969 if (Pmode == ptr_mode)
2970 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2971 else
2972 aarch64_emit_move (temp0,
2973 gen_rtx_ZERO_EXTEND (Pmode,
2974 gen_rtx_MEM (ptr_mode, addr)));
2976 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2977 addr = plus_constant (Pmode, temp0, vcall_offset);
2978 else
2980 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2981 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2984 if (Pmode == ptr_mode)
2985 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2986 else
2987 aarch64_emit_move (temp1,
2988 gen_rtx_SIGN_EXTEND (Pmode,
2989 gen_rtx_MEM (ptr_mode, addr)));
2991 emit_insn (gen_add2_insn (this_rtx, temp1));
2994 /* Generate a tail call to the target function. */
2995 if (!TREE_USED (function))
2997 assemble_external (function);
2998 TREE_USED (function) = 1;
3000 funexp = XEXP (DECL_RTL (function), 0);
3001 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3002 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3003 SIBLING_CALL_P (insn) = 1;
3005 insn = get_insns ();
3006 shorten_branches (insn);
3007 final_start_function (insn, file, 1);
3008 final (insn, file, 1);
3009 final_end_function ();
3011 /* Stop pretending to be a post-reload pass. */
3012 reload_completed = 0;
3015 static bool
3016 aarch64_tls_referenced_p (rtx x)
3018 if (!TARGET_HAVE_TLS)
3019 return false;
3020 subrtx_iterator::array_type array;
3021 FOR_EACH_SUBRTX (iter, array, x, ALL)
3023 const_rtx x = *iter;
3024 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3025 return true;
3026 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3027 TLS offsets, not real symbol references. */
3028 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3029 iter.skip_subrtxes ();
3031 return false;
3035 static int
3036 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3038 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3039 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3041 if (*imm1 < *imm2)
3042 return -1;
3043 if (*imm1 > *imm2)
3044 return +1;
3045 return 0;
3049 static void
3050 aarch64_build_bitmask_table (void)
3052 unsigned HOST_WIDE_INT mask, imm;
3053 unsigned int log_e, e, s, r;
3054 unsigned int nimms = 0;
3056 for (log_e = 1; log_e <= 6; log_e++)
3058 e = 1 << log_e;
3059 if (e == 64)
3060 mask = ~(HOST_WIDE_INT) 0;
3061 else
3062 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3063 for (s = 1; s < e; s++)
3065 for (r = 0; r < e; r++)
3067 /* set s consecutive bits to 1 (s < 64) */
3068 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3069 /* rotate right by r */
3070 if (r != 0)
3071 imm = ((imm >> r) | (imm << (e - r))) & mask;
3072 /* replicate the constant depending on SIMD size */
3073 switch (log_e) {
3074 case 1: imm |= (imm << 2);
3075 case 2: imm |= (imm << 4);
3076 case 3: imm |= (imm << 8);
3077 case 4: imm |= (imm << 16);
3078 case 5: imm |= (imm << 32);
3079 case 6:
3080 break;
3081 default:
3082 gcc_unreachable ();
3084 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3085 aarch64_bitmasks[nimms++] = imm;
3090 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3091 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3092 aarch64_bitmasks_cmp);
3096 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3097 a left shift of 0 or 12 bits. */
3098 bool
3099 aarch64_uimm12_shift (HOST_WIDE_INT val)
3101 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3102 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3107 /* Return true if val is an immediate that can be loaded into a
3108 register by a MOVZ instruction. */
3109 static bool
3110 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3112 if (GET_MODE_SIZE (mode) > 4)
3114 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3115 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3116 return 1;
3118 else
3120 /* Ignore sign extension. */
3121 val &= (HOST_WIDE_INT) 0xffffffff;
3123 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3124 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3128 /* Return true if val is a valid bitmask immediate. */
3129 bool
3130 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3132 if (GET_MODE_SIZE (mode) < 8)
3134 /* Replicate bit pattern. */
3135 val &= (HOST_WIDE_INT) 0xffffffff;
3136 val |= val << 32;
3138 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3139 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3143 /* Return true if val is an immediate that can be loaded into a
3144 register in a single instruction. */
3145 bool
3146 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3148 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3149 return 1;
3150 return aarch64_bitmask_imm (val, mode);
3153 static bool
3154 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3156 rtx base, offset;
3158 if (GET_CODE (x) == HIGH)
3159 return true;
3161 split_const (x, &base, &offset);
3162 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3164 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3165 != SYMBOL_FORCE_TO_MEM)
3166 return true;
3167 else
3168 /* Avoid generating a 64-bit relocation in ILP32; leave
3169 to aarch64_expand_mov_immediate to handle it properly. */
3170 return mode != ptr_mode;
3173 return aarch64_tls_referenced_p (x);
3176 /* Return true if register REGNO is a valid index register.
3177 STRICT_P is true if REG_OK_STRICT is in effect. */
3179 bool
3180 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3182 if (!HARD_REGISTER_NUM_P (regno))
3184 if (!strict_p)
3185 return true;
3187 if (!reg_renumber)
3188 return false;
3190 regno = reg_renumber[regno];
3192 return GP_REGNUM_P (regno);
3195 /* Return true if register REGNO is a valid base register for mode MODE.
3196 STRICT_P is true if REG_OK_STRICT is in effect. */
3198 bool
3199 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3201 if (!HARD_REGISTER_NUM_P (regno))
3203 if (!strict_p)
3204 return true;
3206 if (!reg_renumber)
3207 return false;
3209 regno = reg_renumber[regno];
3212 /* The fake registers will be eliminated to either the stack or
3213 hard frame pointer, both of which are usually valid base registers.
3214 Reload deals with the cases where the eliminated form isn't valid. */
3215 return (GP_REGNUM_P (regno)
3216 || regno == SP_REGNUM
3217 || regno == FRAME_POINTER_REGNUM
3218 || regno == ARG_POINTER_REGNUM);
3221 /* Return true if X is a valid base register for mode MODE.
3222 STRICT_P is true if REG_OK_STRICT is in effect. */
3224 static bool
3225 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3227 if (!strict_p && GET_CODE (x) == SUBREG)
3228 x = SUBREG_REG (x);
3230 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3233 /* Return true if address offset is a valid index. If it is, fill in INFO
3234 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3236 static bool
3237 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3238 machine_mode mode, bool strict_p)
3240 enum aarch64_address_type type;
3241 rtx index;
3242 int shift;
3244 /* (reg:P) */
3245 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3246 && GET_MODE (x) == Pmode)
3248 type = ADDRESS_REG_REG;
3249 index = x;
3250 shift = 0;
3252 /* (sign_extend:DI (reg:SI)) */
3253 else if ((GET_CODE (x) == SIGN_EXTEND
3254 || GET_CODE (x) == ZERO_EXTEND)
3255 && GET_MODE (x) == DImode
3256 && GET_MODE (XEXP (x, 0)) == SImode)
3258 type = (GET_CODE (x) == SIGN_EXTEND)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (x, 0);
3261 shift = 0;
3263 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3264 else if (GET_CODE (x) == MULT
3265 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3266 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3267 && GET_MODE (XEXP (x, 0)) == DImode
3268 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3269 && CONST_INT_P (XEXP (x, 1)))
3271 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3272 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3273 index = XEXP (XEXP (x, 0), 0);
3274 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3276 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3277 else if (GET_CODE (x) == ASHIFT
3278 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3279 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3280 && GET_MODE (XEXP (x, 0)) == DImode
3281 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3282 && CONST_INT_P (XEXP (x, 1)))
3284 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3285 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3286 index = XEXP (XEXP (x, 0), 0);
3287 shift = INTVAL (XEXP (x, 1));
3289 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3290 else if ((GET_CODE (x) == SIGN_EXTRACT
3291 || GET_CODE (x) == ZERO_EXTRACT)
3292 && GET_MODE (x) == DImode
3293 && GET_CODE (XEXP (x, 0)) == MULT
3294 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3295 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3297 type = (GET_CODE (x) == SIGN_EXTRACT)
3298 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3299 index = XEXP (XEXP (x, 0), 0);
3300 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3301 if (INTVAL (XEXP (x, 1)) != 32 + shift
3302 || INTVAL (XEXP (x, 2)) != 0)
3303 shift = -1;
3305 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3306 (const_int 0xffffffff<<shift)) */
3307 else if (GET_CODE (x) == AND
3308 && GET_MODE (x) == DImode
3309 && GET_CODE (XEXP (x, 0)) == MULT
3310 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3311 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3312 && CONST_INT_P (XEXP (x, 1)))
3314 type = ADDRESS_REG_UXTW;
3315 index = XEXP (XEXP (x, 0), 0);
3316 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3317 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3318 shift = -1;
3320 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3321 else if ((GET_CODE (x) == SIGN_EXTRACT
3322 || GET_CODE (x) == ZERO_EXTRACT)
3323 && GET_MODE (x) == DImode
3324 && GET_CODE (XEXP (x, 0)) == ASHIFT
3325 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3326 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3328 type = (GET_CODE (x) == SIGN_EXTRACT)
3329 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3330 index = XEXP (XEXP (x, 0), 0);
3331 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3332 if (INTVAL (XEXP (x, 1)) != 32 + shift
3333 || INTVAL (XEXP (x, 2)) != 0)
3334 shift = -1;
3336 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3337 (const_int 0xffffffff<<shift)) */
3338 else if (GET_CODE (x) == AND
3339 && GET_MODE (x) == DImode
3340 && GET_CODE (XEXP (x, 0)) == ASHIFT
3341 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3342 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3343 && CONST_INT_P (XEXP (x, 1)))
3345 type = ADDRESS_REG_UXTW;
3346 index = XEXP (XEXP (x, 0), 0);
3347 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3348 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3349 shift = -1;
3351 /* (mult:P (reg:P) (const_int scale)) */
3352 else if (GET_CODE (x) == MULT
3353 && GET_MODE (x) == Pmode
3354 && GET_MODE (XEXP (x, 0)) == Pmode
3355 && CONST_INT_P (XEXP (x, 1)))
3357 type = ADDRESS_REG_REG;
3358 index = XEXP (x, 0);
3359 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3361 /* (ashift:P (reg:P) (const_int shift)) */
3362 else if (GET_CODE (x) == ASHIFT
3363 && GET_MODE (x) == Pmode
3364 && GET_MODE (XEXP (x, 0)) == Pmode
3365 && CONST_INT_P (XEXP (x, 1)))
3367 type = ADDRESS_REG_REG;
3368 index = XEXP (x, 0);
3369 shift = INTVAL (XEXP (x, 1));
3371 else
3372 return false;
3374 if (GET_CODE (index) == SUBREG)
3375 index = SUBREG_REG (index);
3377 if ((shift == 0 ||
3378 (shift > 0 && shift <= 3
3379 && (1 << shift) == GET_MODE_SIZE (mode)))
3380 && REG_P (index)
3381 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3383 info->type = type;
3384 info->offset = index;
3385 info->shift = shift;
3386 return true;
3389 return false;
3392 bool
3393 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3395 return (offset >= -64 * GET_MODE_SIZE (mode)
3396 && offset < 64 * GET_MODE_SIZE (mode)
3397 && offset % GET_MODE_SIZE (mode) == 0);
3400 static inline bool
3401 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3402 HOST_WIDE_INT offset)
3404 return offset >= -256 && offset < 256;
3407 static inline bool
3408 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3410 return (offset >= 0
3411 && offset < 4096 * GET_MODE_SIZE (mode)
3412 && offset % GET_MODE_SIZE (mode) == 0);
3415 /* Return true if X is a valid address for machine mode MODE. If it is,
3416 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3417 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3419 static bool
3420 aarch64_classify_address (struct aarch64_address_info *info,
3421 rtx x, machine_mode mode,
3422 RTX_CODE outer_code, bool strict_p)
3424 enum rtx_code code = GET_CODE (x);
3425 rtx op0, op1;
3426 bool allow_reg_index_p =
3427 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3428 || aarch64_vector_mode_supported_p (mode));
3429 /* Don't support anything other than POST_INC or REG addressing for
3430 AdvSIMD. */
3431 if (aarch64_vect_struct_mode_p (mode)
3432 && (code != POST_INC && code != REG))
3433 return false;
3435 switch (code)
3437 case REG:
3438 case SUBREG:
3439 info->type = ADDRESS_REG_IMM;
3440 info->base = x;
3441 info->offset = const0_rtx;
3442 return aarch64_base_register_rtx_p (x, strict_p);
3444 case PLUS:
3445 op0 = XEXP (x, 0);
3446 op1 = XEXP (x, 1);
3448 if (! strict_p
3449 && REG_P (op0)
3450 && (op0 == virtual_stack_vars_rtx
3451 || op0 == frame_pointer_rtx
3452 || op0 == arg_pointer_rtx)
3453 && CONST_INT_P (op1))
3455 info->type = ADDRESS_REG_IMM;
3456 info->base = op0;
3457 info->offset = op1;
3459 return true;
3462 if (GET_MODE_SIZE (mode) != 0
3463 && CONST_INT_P (op1)
3464 && aarch64_base_register_rtx_p (op0, strict_p))
3466 HOST_WIDE_INT offset = INTVAL (op1);
3468 info->type = ADDRESS_REG_IMM;
3469 info->base = op0;
3470 info->offset = op1;
3472 /* TImode and TFmode values are allowed in both pairs of X
3473 registers and individual Q registers. The available
3474 address modes are:
3475 X,X: 7-bit signed scaled offset
3476 Q: 9-bit signed offset
3477 We conservatively require an offset representable in either mode.
3479 if (mode == TImode || mode == TFmode)
3480 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3481 && offset_9bit_signed_unscaled_p (mode, offset));
3483 if (outer_code == PARALLEL)
3484 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3485 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3486 else
3487 return (offset_9bit_signed_unscaled_p (mode, offset)
3488 || offset_12bit_unsigned_scaled_p (mode, offset));
3491 if (allow_reg_index_p)
3493 /* Look for base + (scaled/extended) index register. */
3494 if (aarch64_base_register_rtx_p (op0, strict_p)
3495 && aarch64_classify_index (info, op1, mode, strict_p))
3497 info->base = op0;
3498 return true;
3500 if (aarch64_base_register_rtx_p (op1, strict_p)
3501 && aarch64_classify_index (info, op0, mode, strict_p))
3503 info->base = op1;
3504 return true;
3508 return false;
3510 case POST_INC:
3511 case POST_DEC:
3512 case PRE_INC:
3513 case PRE_DEC:
3514 info->type = ADDRESS_REG_WB;
3515 info->base = XEXP (x, 0);
3516 info->offset = NULL_RTX;
3517 return aarch64_base_register_rtx_p (info->base, strict_p);
3519 case POST_MODIFY:
3520 case PRE_MODIFY:
3521 info->type = ADDRESS_REG_WB;
3522 info->base = XEXP (x, 0);
3523 if (GET_CODE (XEXP (x, 1)) == PLUS
3524 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3525 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3526 && aarch64_base_register_rtx_p (info->base, strict_p))
3528 HOST_WIDE_INT offset;
3529 info->offset = XEXP (XEXP (x, 1), 1);
3530 offset = INTVAL (info->offset);
3532 /* TImode and TFmode values are allowed in both pairs of X
3533 registers and individual Q registers. The available
3534 address modes are:
3535 X,X: 7-bit signed scaled offset
3536 Q: 9-bit signed offset
3537 We conservatively require an offset representable in either mode.
3539 if (mode == TImode || mode == TFmode)
3540 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3541 && offset_9bit_signed_unscaled_p (mode, offset));
3543 if (outer_code == PARALLEL)
3544 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3545 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3546 else
3547 return offset_9bit_signed_unscaled_p (mode, offset);
3549 return false;
3551 case CONST:
3552 case SYMBOL_REF:
3553 case LABEL_REF:
3554 /* load literal: pc-relative constant pool entry. Only supported
3555 for SI mode or larger. */
3556 info->type = ADDRESS_SYMBOLIC;
3557 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3559 rtx sym, addend;
3561 split_const (x, &sym, &addend);
3562 return (GET_CODE (sym) == LABEL_REF
3563 || (GET_CODE (sym) == SYMBOL_REF
3564 && CONSTANT_POOL_ADDRESS_P (sym)));
3566 return false;
3568 case LO_SUM:
3569 info->type = ADDRESS_LO_SUM;
3570 info->base = XEXP (x, 0);
3571 info->offset = XEXP (x, 1);
3572 if (allow_reg_index_p
3573 && aarch64_base_register_rtx_p (info->base, strict_p))
3575 rtx sym, offs;
3576 split_const (info->offset, &sym, &offs);
3577 if (GET_CODE (sym) == SYMBOL_REF
3578 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3579 == SYMBOL_SMALL_ABSOLUTE))
3581 /* The symbol and offset must be aligned to the access size. */
3582 unsigned int align;
3583 unsigned int ref_size;
3585 if (CONSTANT_POOL_ADDRESS_P (sym))
3586 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3587 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3589 tree exp = SYMBOL_REF_DECL (sym);
3590 align = TYPE_ALIGN (TREE_TYPE (exp));
3591 align = CONSTANT_ALIGNMENT (exp, align);
3593 else if (SYMBOL_REF_DECL (sym))
3594 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3595 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3596 && SYMBOL_REF_BLOCK (sym) != NULL)
3597 align = SYMBOL_REF_BLOCK (sym)->alignment;
3598 else
3599 align = BITS_PER_UNIT;
3601 ref_size = GET_MODE_SIZE (mode);
3602 if (ref_size == 0)
3603 ref_size = GET_MODE_SIZE (DImode);
3605 return ((INTVAL (offs) & (ref_size - 1)) == 0
3606 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3609 return false;
3611 default:
3612 return false;
3616 bool
3617 aarch64_symbolic_address_p (rtx x)
3619 rtx offset;
3621 split_const (x, &x, &offset);
3622 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3625 /* Classify the base of symbolic expression X, given that X appears in
3626 context CONTEXT. */
3628 enum aarch64_symbol_type
3629 aarch64_classify_symbolic_expression (rtx x,
3630 enum aarch64_symbol_context context)
3632 rtx offset;
3634 split_const (x, &x, &offset);
3635 return aarch64_classify_symbol (x, offset, context);
3639 /* Return TRUE if X is a legitimate address for accessing memory in
3640 mode MODE. */
3641 static bool
3642 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3644 struct aarch64_address_info addr;
3646 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3649 /* Return TRUE if X is a legitimate address for accessing memory in
3650 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3651 pair operation. */
3652 bool
3653 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3654 RTX_CODE outer_code, bool strict_p)
3656 struct aarch64_address_info addr;
3658 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3661 /* Return TRUE if rtx X is immediate constant 0.0 */
3662 bool
3663 aarch64_float_const_zero_rtx_p (rtx x)
3665 REAL_VALUE_TYPE r;
3667 if (GET_MODE (x) == VOIDmode)
3668 return false;
3670 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3671 if (REAL_VALUE_MINUS_ZERO (r))
3672 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3673 return REAL_VALUES_EQUAL (r, dconst0);
3676 /* Return the fixed registers used for condition codes. */
3678 static bool
3679 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3681 *p1 = CC_REGNUM;
3682 *p2 = INVALID_REGNUM;
3683 return true;
3686 /* Emit call insn with PAT and do aarch64-specific handling. */
3688 void
3689 aarch64_emit_call_insn (rtx pat)
3691 rtx insn = emit_call_insn (pat);
3693 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3694 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3695 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3698 machine_mode
3699 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3701 /* All floating point compares return CCFP if it is an equality
3702 comparison, and CCFPE otherwise. */
3703 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3705 switch (code)
3707 case EQ:
3708 case NE:
3709 case UNORDERED:
3710 case ORDERED:
3711 case UNLT:
3712 case UNLE:
3713 case UNGT:
3714 case UNGE:
3715 case UNEQ:
3716 case LTGT:
3717 return CCFPmode;
3719 case LT:
3720 case LE:
3721 case GT:
3722 case GE:
3723 return CCFPEmode;
3725 default:
3726 gcc_unreachable ();
3730 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3731 && y == const0_rtx
3732 && (code == EQ || code == NE || code == LT || code == GE)
3733 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3734 || GET_CODE (x) == NEG))
3735 return CC_NZmode;
3737 /* A compare with a shifted operand. Because of canonicalization,
3738 the comparison will have to be swapped when we emit the assembly
3739 code. */
3740 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3741 && (REG_P (y) || GET_CODE (y) == SUBREG)
3742 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3743 || GET_CODE (x) == LSHIFTRT
3744 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3745 return CC_SWPmode;
3747 /* Similarly for a negated operand, but we can only do this for
3748 equalities. */
3749 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3750 && (REG_P (y) || GET_CODE (y) == SUBREG)
3751 && (code == EQ || code == NE)
3752 && GET_CODE (x) == NEG)
3753 return CC_Zmode;
3755 /* A compare of a mode narrower than SI mode against zero can be done
3756 by extending the value in the comparison. */
3757 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3758 && y == const0_rtx)
3759 /* Only use sign-extension if we really need it. */
3760 return ((code == GT || code == GE || code == LE || code == LT)
3761 ? CC_SESWPmode : CC_ZESWPmode);
3763 /* For everything else, return CCmode. */
3764 return CCmode;
3767 static int
3768 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3771 aarch64_get_condition_code (rtx x)
3773 machine_mode mode = GET_MODE (XEXP (x, 0));
3774 enum rtx_code comp_code = GET_CODE (x);
3776 if (GET_MODE_CLASS (mode) != MODE_CC)
3777 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3778 return aarch64_get_condition_code_1 (mode, comp_code);
3781 static int
3782 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3784 int ne = -1, eq = -1;
3785 switch (mode)
3787 case CCFPmode:
3788 case CCFPEmode:
3789 switch (comp_code)
3791 case GE: return AARCH64_GE;
3792 case GT: return AARCH64_GT;
3793 case LE: return AARCH64_LS;
3794 case LT: return AARCH64_MI;
3795 case NE: return AARCH64_NE;
3796 case EQ: return AARCH64_EQ;
3797 case ORDERED: return AARCH64_VC;
3798 case UNORDERED: return AARCH64_VS;
3799 case UNLT: return AARCH64_LT;
3800 case UNLE: return AARCH64_LE;
3801 case UNGT: return AARCH64_HI;
3802 case UNGE: return AARCH64_PL;
3803 default: return -1;
3805 break;
3807 case CC_DNEmode:
3808 ne = AARCH64_NE;
3809 eq = AARCH64_EQ;
3810 break;
3812 case CC_DEQmode:
3813 ne = AARCH64_EQ;
3814 eq = AARCH64_NE;
3815 break;
3817 case CC_DGEmode:
3818 ne = AARCH64_GE;
3819 eq = AARCH64_LT;
3820 break;
3822 case CC_DLTmode:
3823 ne = AARCH64_LT;
3824 eq = AARCH64_GE;
3825 break;
3827 case CC_DGTmode:
3828 ne = AARCH64_GT;
3829 eq = AARCH64_LE;
3830 break;
3832 case CC_DLEmode:
3833 ne = AARCH64_LE;
3834 eq = AARCH64_GT;
3835 break;
3837 case CC_DGEUmode:
3838 ne = AARCH64_CS;
3839 eq = AARCH64_CC;
3840 break;
3842 case CC_DLTUmode:
3843 ne = AARCH64_CC;
3844 eq = AARCH64_CS;
3845 break;
3847 case CC_DGTUmode:
3848 ne = AARCH64_HI;
3849 eq = AARCH64_LS;
3850 break;
3852 case CC_DLEUmode:
3853 ne = AARCH64_LS;
3854 eq = AARCH64_HI;
3855 break;
3857 case CCmode:
3858 switch (comp_code)
3860 case NE: return AARCH64_NE;
3861 case EQ: return AARCH64_EQ;
3862 case GE: return AARCH64_GE;
3863 case GT: return AARCH64_GT;
3864 case LE: return AARCH64_LE;
3865 case LT: return AARCH64_LT;
3866 case GEU: return AARCH64_CS;
3867 case GTU: return AARCH64_HI;
3868 case LEU: return AARCH64_LS;
3869 case LTU: return AARCH64_CC;
3870 default: return -1;
3872 break;
3874 case CC_SWPmode:
3875 case CC_ZESWPmode:
3876 case CC_SESWPmode:
3877 switch (comp_code)
3879 case NE: return AARCH64_NE;
3880 case EQ: return AARCH64_EQ;
3881 case GE: return AARCH64_LE;
3882 case GT: return AARCH64_LT;
3883 case LE: return AARCH64_GE;
3884 case LT: return AARCH64_GT;
3885 case GEU: return AARCH64_LS;
3886 case GTU: return AARCH64_CC;
3887 case LEU: return AARCH64_CS;
3888 case LTU: return AARCH64_HI;
3889 default: return -1;
3891 break;
3893 case CC_NZmode:
3894 switch (comp_code)
3896 case NE: return AARCH64_NE;
3897 case EQ: return AARCH64_EQ;
3898 case GE: return AARCH64_PL;
3899 case LT: return AARCH64_MI;
3900 default: return -1;
3902 break;
3904 case CC_Zmode:
3905 switch (comp_code)
3907 case NE: return AARCH64_NE;
3908 case EQ: return AARCH64_EQ;
3909 default: return -1;
3911 break;
3913 default:
3914 return -1;
3915 break;
3918 if (comp_code == NE)
3919 return ne;
3921 if (comp_code == EQ)
3922 return eq;
3924 return -1;
3927 bool
3928 aarch64_const_vec_all_same_in_range_p (rtx x,
3929 HOST_WIDE_INT minval,
3930 HOST_WIDE_INT maxval)
3932 HOST_WIDE_INT firstval;
3933 int count, i;
3935 if (GET_CODE (x) != CONST_VECTOR
3936 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3937 return false;
3939 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3940 if (firstval < minval || firstval > maxval)
3941 return false;
3943 count = CONST_VECTOR_NUNITS (x);
3944 for (i = 1; i < count; i++)
3945 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3946 return false;
3948 return true;
3951 bool
3952 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3954 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3957 static unsigned
3958 bit_count (unsigned HOST_WIDE_INT value)
3960 unsigned count = 0;
3962 while (value)
3964 count++;
3965 value &= value - 1;
3968 return count;
3971 /* N Z C V. */
3972 #define AARCH64_CC_V 1
3973 #define AARCH64_CC_C (1 << 1)
3974 #define AARCH64_CC_Z (1 << 2)
3975 #define AARCH64_CC_N (1 << 3)
3977 /* N Z C V flags for ccmp. The first code is for AND op and the other
3978 is for IOR op. Indexed by AARCH64_COND_CODE. */
3979 static const int aarch64_nzcv_codes[][2] =
3981 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3982 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3983 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3984 {0, AARCH64_CC_C}, /* CC, C == 0. */
3985 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3986 {0, AARCH64_CC_N}, /* PL, N == 0. */
3987 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3988 {0, AARCH64_CC_V}, /* VC, V == 0. */
3989 {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0. */
3990 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3991 {0, AARCH64_CC_V}, /* GE, N == V. */
3992 {AARCH64_CC_V, 0}, /* LT, N != V. */
3993 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3994 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3995 {0, 0}, /* AL, Any. */
3996 {0, 0}, /* NV, Any. */
4000 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4002 switch (mode)
4004 case CC_DNEmode:
4005 return NE;
4007 case CC_DEQmode:
4008 return EQ;
4010 case CC_DLEmode:
4011 return LE;
4013 case CC_DGTmode:
4014 return GT;
4016 case CC_DLTmode:
4017 return LT;
4019 case CC_DGEmode:
4020 return GE;
4022 case CC_DLEUmode:
4023 return LEU;
4025 case CC_DGTUmode:
4026 return GTU;
4028 case CC_DLTUmode:
4029 return LTU;
4031 case CC_DGEUmode:
4032 return GEU;
4034 default:
4035 gcc_unreachable ();
4040 void
4041 aarch64_print_operand (FILE *f, rtx x, char code)
4043 switch (code)
4045 /* An integer or symbol address without a preceding # sign. */
4046 case 'c':
4047 switch (GET_CODE (x))
4049 case CONST_INT:
4050 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4051 break;
4053 case SYMBOL_REF:
4054 output_addr_const (f, x);
4055 break;
4057 case CONST:
4058 if (GET_CODE (XEXP (x, 0)) == PLUS
4059 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4061 output_addr_const (f, x);
4062 break;
4064 /* Fall through. */
4066 default:
4067 output_operand_lossage ("Unsupported operand for code '%c'", code);
4069 break;
4071 case 'e':
4072 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4074 int n;
4076 if (!CONST_INT_P (x)
4077 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4079 output_operand_lossage ("invalid operand for '%%%c'", code);
4080 return;
4083 switch (n)
4085 case 3:
4086 fputc ('b', f);
4087 break;
4088 case 4:
4089 fputc ('h', f);
4090 break;
4091 case 5:
4092 fputc ('w', f);
4093 break;
4094 default:
4095 output_operand_lossage ("invalid operand for '%%%c'", code);
4096 return;
4099 break;
4101 case 'p':
4103 int n;
4105 /* Print N such that 2^N == X. */
4106 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4108 output_operand_lossage ("invalid operand for '%%%c'", code);
4109 return;
4112 asm_fprintf (f, "%d", n);
4114 break;
4116 case 'P':
4117 /* Print the number of non-zero bits in X (a const_int). */
4118 if (!CONST_INT_P (x))
4120 output_operand_lossage ("invalid operand for '%%%c'", code);
4121 return;
4124 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4125 break;
4127 case 'H':
4128 /* Print the higher numbered register of a pair (TImode) of regs. */
4129 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4131 output_operand_lossage ("invalid operand for '%%%c'", code);
4132 return;
4135 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4136 break;
4138 case 'm':
4140 int cond_code;
4141 /* Print a condition (eq, ne, etc). */
4143 /* CONST_TRUE_RTX means always -- that's the default. */
4144 if (x == const_true_rtx)
4145 return;
4147 if (!COMPARISON_P (x))
4149 output_operand_lossage ("invalid operand for '%%%c'", code);
4150 return;
4153 cond_code = aarch64_get_condition_code (x);
4154 gcc_assert (cond_code >= 0);
4155 fputs (aarch64_condition_codes[cond_code], f);
4157 break;
4159 case 'M':
4161 int cond_code;
4162 /* Print the inverse of a condition (eq <-> ne, etc). */
4164 /* CONST_TRUE_RTX means never -- that's the default. */
4165 if (x == const_true_rtx)
4167 fputs ("nv", f);
4168 return;
4171 if (!COMPARISON_P (x))
4173 output_operand_lossage ("invalid operand for '%%%c'", code);
4174 return;
4176 cond_code = aarch64_get_condition_code (x);
4177 gcc_assert (cond_code >= 0);
4178 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4179 (cond_code)], f);
4181 break;
4183 case 'b':
4184 case 'h':
4185 case 's':
4186 case 'd':
4187 case 'q':
4188 /* Print a scalar FP/SIMD register name. */
4189 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4191 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4192 return;
4194 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4195 break;
4197 case 'S':
4198 case 'T':
4199 case 'U':
4200 case 'V':
4201 /* Print the first FP/SIMD register name in a list. */
4202 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4204 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4205 return;
4207 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4208 break;
4210 case 'X':
4211 /* Print bottom 16 bits of integer constant in hex. */
4212 if (!CONST_INT_P (x))
4214 output_operand_lossage ("invalid operand for '%%%c'", code);
4215 return;
4217 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4218 break;
4220 case 'w':
4221 case 'x':
4222 /* Print a general register name or the zero register (32-bit or
4223 64-bit). */
4224 if (x == const0_rtx
4225 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4227 asm_fprintf (f, "%czr", code);
4228 break;
4231 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4233 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4234 break;
4237 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4239 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4240 break;
4243 /* Fall through */
4245 case 0:
4246 /* Print a normal operand, if it's a general register, then we
4247 assume DImode. */
4248 if (x == NULL)
4250 output_operand_lossage ("missing operand");
4251 return;
4254 switch (GET_CODE (x))
4256 case REG:
4257 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4258 break;
4260 case MEM:
4261 aarch64_memory_reference_mode = GET_MODE (x);
4262 output_address (XEXP (x, 0));
4263 break;
4265 case LABEL_REF:
4266 case SYMBOL_REF:
4267 output_addr_const (asm_out_file, x);
4268 break;
4270 case CONST_INT:
4271 asm_fprintf (f, "%wd", INTVAL (x));
4272 break;
4274 case CONST_VECTOR:
4275 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4277 gcc_assert (
4278 aarch64_const_vec_all_same_in_range_p (x,
4279 HOST_WIDE_INT_MIN,
4280 HOST_WIDE_INT_MAX));
4281 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4283 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4285 fputc ('0', f);
4287 else
4288 gcc_unreachable ();
4289 break;
4291 case CONST_DOUBLE:
4292 /* CONST_DOUBLE can represent a double-width integer.
4293 In this case, the mode of x is VOIDmode. */
4294 if (GET_MODE (x) == VOIDmode)
4295 ; /* Do Nothing. */
4296 else if (aarch64_float_const_zero_rtx_p (x))
4298 fputc ('0', f);
4299 break;
4301 else if (aarch64_float_const_representable_p (x))
4303 #define buf_size 20
4304 char float_buf[buf_size] = {'\0'};
4305 REAL_VALUE_TYPE r;
4306 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4307 real_to_decimal_for_mode (float_buf, &r,
4308 buf_size, buf_size,
4309 1, GET_MODE (x));
4310 asm_fprintf (asm_out_file, "%s", float_buf);
4311 break;
4312 #undef buf_size
4314 output_operand_lossage ("invalid constant");
4315 return;
4316 default:
4317 output_operand_lossage ("invalid operand");
4318 return;
4320 break;
4322 case 'A':
4323 if (GET_CODE (x) == HIGH)
4324 x = XEXP (x, 0);
4326 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4328 case SYMBOL_SMALL_GOT:
4329 asm_fprintf (asm_out_file, ":got:");
4330 break;
4332 case SYMBOL_SMALL_TLSGD:
4333 asm_fprintf (asm_out_file, ":tlsgd:");
4334 break;
4336 case SYMBOL_SMALL_TLSDESC:
4337 asm_fprintf (asm_out_file, ":tlsdesc:");
4338 break;
4340 case SYMBOL_SMALL_GOTTPREL:
4341 asm_fprintf (asm_out_file, ":gottprel:");
4342 break;
4344 case SYMBOL_SMALL_TPREL:
4345 asm_fprintf (asm_out_file, ":tprel:");
4346 break;
4348 case SYMBOL_TINY_GOT:
4349 gcc_unreachable ();
4350 break;
4352 default:
4353 break;
4355 output_addr_const (asm_out_file, x);
4356 break;
4358 case 'L':
4359 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4361 case SYMBOL_SMALL_GOT:
4362 asm_fprintf (asm_out_file, ":lo12:");
4363 break;
4365 case SYMBOL_SMALL_TLSGD:
4366 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4367 break;
4369 case SYMBOL_SMALL_TLSDESC:
4370 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4371 break;
4373 case SYMBOL_SMALL_GOTTPREL:
4374 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4375 break;
4377 case SYMBOL_SMALL_TPREL:
4378 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4379 break;
4381 case SYMBOL_TINY_GOT:
4382 asm_fprintf (asm_out_file, ":got:");
4383 break;
4385 default:
4386 break;
4388 output_addr_const (asm_out_file, x);
4389 break;
4391 case 'G':
4393 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4395 case SYMBOL_SMALL_TPREL:
4396 asm_fprintf (asm_out_file, ":tprel_hi12:");
4397 break;
4398 default:
4399 break;
4401 output_addr_const (asm_out_file, x);
4402 break;
4404 case 'K':
4406 int cond_code;
4407 /* Print nzcv. */
4409 if (!COMPARISON_P (x))
4411 output_operand_lossage ("invalid operand for '%%%c'", code);
4412 return;
4415 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4416 gcc_assert (cond_code >= 0);
4417 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4419 break;
4421 case 'k':
4423 int cond_code;
4424 /* Print nzcv. */
4426 if (!COMPARISON_P (x))
4428 output_operand_lossage ("invalid operand for '%%%c'", code);
4429 return;
4432 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4433 gcc_assert (cond_code >= 0);
4434 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4436 break;
4438 default:
4439 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4440 return;
4444 void
4445 aarch64_print_operand_address (FILE *f, rtx x)
4447 struct aarch64_address_info addr;
4449 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4450 MEM, true))
4451 switch (addr.type)
4453 case ADDRESS_REG_IMM:
4454 if (addr.offset == const0_rtx)
4455 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4456 else
4457 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4458 INTVAL (addr.offset));
4459 return;
4461 case ADDRESS_REG_REG:
4462 if (addr.shift == 0)
4463 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4464 reg_names [REGNO (addr.offset)]);
4465 else
4466 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4467 reg_names [REGNO (addr.offset)], addr.shift);
4468 return;
4470 case ADDRESS_REG_UXTW:
4471 if (addr.shift == 0)
4472 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4473 REGNO (addr.offset) - R0_REGNUM);
4474 else
4475 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4476 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4477 return;
4479 case ADDRESS_REG_SXTW:
4480 if (addr.shift == 0)
4481 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4482 REGNO (addr.offset) - R0_REGNUM);
4483 else
4484 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4485 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4486 return;
4488 case ADDRESS_REG_WB:
4489 switch (GET_CODE (x))
4491 case PRE_INC:
4492 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4493 GET_MODE_SIZE (aarch64_memory_reference_mode));
4494 return;
4495 case POST_INC:
4496 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4497 GET_MODE_SIZE (aarch64_memory_reference_mode));
4498 return;
4499 case PRE_DEC:
4500 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4501 GET_MODE_SIZE (aarch64_memory_reference_mode));
4502 return;
4503 case POST_DEC:
4504 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4505 GET_MODE_SIZE (aarch64_memory_reference_mode));
4506 return;
4507 case PRE_MODIFY:
4508 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4509 INTVAL (addr.offset));
4510 return;
4511 case POST_MODIFY:
4512 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4513 INTVAL (addr.offset));
4514 return;
4515 default:
4516 break;
4518 break;
4520 case ADDRESS_LO_SUM:
4521 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4522 output_addr_const (f, addr.offset);
4523 asm_fprintf (f, "]");
4524 return;
4526 case ADDRESS_SYMBOLIC:
4527 break;
4530 output_addr_const (f, x);
4533 bool
4534 aarch64_label_mentioned_p (rtx x)
4536 const char *fmt;
4537 int i;
4539 if (GET_CODE (x) == LABEL_REF)
4540 return true;
4542 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4543 referencing instruction, but they are constant offsets, not
4544 symbols. */
4545 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4546 return false;
4548 fmt = GET_RTX_FORMAT (GET_CODE (x));
4549 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4551 if (fmt[i] == 'E')
4553 int j;
4555 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4556 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4557 return 1;
4559 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4560 return 1;
4563 return 0;
4566 /* Implement REGNO_REG_CLASS. */
4568 enum reg_class
4569 aarch64_regno_regclass (unsigned regno)
4571 if (GP_REGNUM_P (regno))
4572 return GENERAL_REGS;
4574 if (regno == SP_REGNUM)
4575 return STACK_REG;
4577 if (regno == FRAME_POINTER_REGNUM
4578 || regno == ARG_POINTER_REGNUM)
4579 return POINTER_REGS;
4581 if (FP_REGNUM_P (regno))
4582 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4584 return NO_REGS;
4587 static rtx
4588 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4590 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4591 where mask is selected by alignment and size of the offset.
4592 We try to pick as large a range for the offset as possible to
4593 maximize the chance of a CSE. However, for aligned addresses
4594 we limit the range to 4k so that structures with different sized
4595 elements are likely to use the same base. */
4597 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4599 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4600 HOST_WIDE_INT base_offset;
4602 /* Does it look like we'll need a load/store-pair operation? */
4603 if (GET_MODE_SIZE (mode) > 16
4604 || mode == TImode)
4605 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4606 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4607 /* For offsets aren't a multiple of the access size, the limit is
4608 -256...255. */
4609 else if (offset & (GET_MODE_SIZE (mode) - 1))
4610 base_offset = (offset + 0x100) & ~0x1ff;
4611 else
4612 base_offset = offset & ~0xfff;
4614 if (base_offset == 0)
4615 return x;
4617 offset -= base_offset;
4618 rtx base_reg = gen_reg_rtx (Pmode);
4619 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4620 NULL_RTX);
4621 emit_move_insn (base_reg, val);
4622 x = plus_constant (Pmode, base_reg, offset);
4625 return x;
4628 /* Try a machine-dependent way of reloading an illegitimate address
4629 operand. If we find one, push the reload and return the new rtx. */
4632 aarch64_legitimize_reload_address (rtx *x_p,
4633 machine_mode mode,
4634 int opnum, int type,
4635 int ind_levels ATTRIBUTE_UNUSED)
4637 rtx x = *x_p;
4639 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4640 if (aarch64_vect_struct_mode_p (mode)
4641 && GET_CODE (x) == PLUS
4642 && REG_P (XEXP (x, 0))
4643 && CONST_INT_P (XEXP (x, 1)))
4645 rtx orig_rtx = x;
4646 x = copy_rtx (x);
4647 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4648 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4649 opnum, (enum reload_type) type);
4650 return x;
4653 /* We must recognize output that we have already generated ourselves. */
4654 if (GET_CODE (x) == PLUS
4655 && GET_CODE (XEXP (x, 0)) == PLUS
4656 && REG_P (XEXP (XEXP (x, 0), 0))
4657 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4658 && CONST_INT_P (XEXP (x, 1)))
4660 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4661 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4662 opnum, (enum reload_type) type);
4663 return x;
4666 /* We wish to handle large displacements off a base register by splitting
4667 the addend across an add and the mem insn. This can cut the number of
4668 extra insns needed from 3 to 1. It is only useful for load/store of a
4669 single register with 12 bit offset field. */
4670 if (GET_CODE (x) == PLUS
4671 && REG_P (XEXP (x, 0))
4672 && CONST_INT_P (XEXP (x, 1))
4673 && HARD_REGISTER_P (XEXP (x, 0))
4674 && mode != TImode
4675 && mode != TFmode
4676 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4678 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4679 HOST_WIDE_INT low = val & 0xfff;
4680 HOST_WIDE_INT high = val - low;
4681 HOST_WIDE_INT offs;
4682 rtx cst;
4683 machine_mode xmode = GET_MODE (x);
4685 /* In ILP32, xmode can be either DImode or SImode. */
4686 gcc_assert (xmode == DImode || xmode == SImode);
4688 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4689 BLKmode alignment. */
4690 if (GET_MODE_SIZE (mode) == 0)
4691 return NULL_RTX;
4693 offs = low % GET_MODE_SIZE (mode);
4695 /* Align misaligned offset by adjusting high part to compensate. */
4696 if (offs != 0)
4698 if (aarch64_uimm12_shift (high + offs))
4700 /* Align down. */
4701 low = low - offs;
4702 high = high + offs;
4704 else
4706 /* Align up. */
4707 offs = GET_MODE_SIZE (mode) - offs;
4708 low = low + offs;
4709 high = high + (low & 0x1000) - offs;
4710 low &= 0xfff;
4714 /* Check for overflow. */
4715 if (high + low != val)
4716 return NULL_RTX;
4718 cst = GEN_INT (high);
4719 if (!aarch64_uimm12_shift (high))
4720 cst = force_const_mem (xmode, cst);
4722 /* Reload high part into base reg, leaving the low part
4723 in the mem instruction.
4724 Note that replacing this gen_rtx_PLUS with plus_constant is
4725 wrong in this case because we rely on the
4726 (plus (plus reg c1) c2) structure being preserved so that
4727 XEXP (*p, 0) in push_reload below uses the correct term. */
4728 x = gen_rtx_PLUS (xmode,
4729 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4730 GEN_INT (low));
4732 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4733 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4734 opnum, (enum reload_type) type);
4735 return x;
4738 return NULL_RTX;
4742 static reg_class_t
4743 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4744 reg_class_t rclass,
4745 machine_mode mode,
4746 secondary_reload_info *sri)
4748 /* Without the TARGET_SIMD instructions we cannot move a Q register
4749 to a Q register directly. We need a scratch. */
4750 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4751 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4752 && reg_class_subset_p (rclass, FP_REGS))
4754 if (mode == TFmode)
4755 sri->icode = CODE_FOR_aarch64_reload_movtf;
4756 else if (mode == TImode)
4757 sri->icode = CODE_FOR_aarch64_reload_movti;
4758 return NO_REGS;
4761 /* A TFmode or TImode memory access should be handled via an FP_REGS
4762 because AArch64 has richer addressing modes for LDR/STR instructions
4763 than LDP/STP instructions. */
4764 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4765 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4766 return FP_REGS;
4768 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4769 return GENERAL_REGS;
4771 return NO_REGS;
4774 static bool
4775 aarch64_can_eliminate (const int from, const int to)
4777 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4778 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4780 if (frame_pointer_needed)
4782 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4783 return true;
4784 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4785 return false;
4786 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4787 && !cfun->calls_alloca)
4788 return true;
4789 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4790 return true;
4792 return false;
4794 else
4796 /* If we decided that we didn't need a leaf frame pointer but then used
4797 LR in the function, then we'll want a frame pointer after all, so
4798 prevent this elimination to ensure a frame pointer is used. */
4799 if (to == STACK_POINTER_REGNUM
4800 && flag_omit_leaf_frame_pointer
4801 && df_regs_ever_live_p (LR_REGNUM))
4802 return false;
4805 return true;
4808 HOST_WIDE_INT
4809 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4811 aarch64_layout_frame ();
4813 if (to == HARD_FRAME_POINTER_REGNUM)
4815 if (from == ARG_POINTER_REGNUM)
4816 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4818 if (from == FRAME_POINTER_REGNUM)
4819 return (cfun->machine->frame.hard_fp_offset
4820 - cfun->machine->frame.saved_varargs_size);
4823 if (to == STACK_POINTER_REGNUM)
4825 if (from == FRAME_POINTER_REGNUM)
4826 return (cfun->machine->frame.frame_size
4827 - cfun->machine->frame.saved_varargs_size);
4830 return cfun->machine->frame.frame_size;
4833 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4834 previous frame. */
4837 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4839 if (count != 0)
4840 return const0_rtx;
4841 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4845 static void
4846 aarch64_asm_trampoline_template (FILE *f)
4848 if (TARGET_ILP32)
4850 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4851 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4853 else
4855 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4856 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4858 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4859 assemble_aligned_integer (4, const0_rtx);
4860 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4861 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4864 static void
4865 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4867 rtx fnaddr, mem, a_tramp;
4868 const int tramp_code_sz = 16;
4870 /* Don't need to copy the trailing D-words, we fill those in below. */
4871 emit_block_move (m_tramp, assemble_trampoline_template (),
4872 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4873 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4874 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4875 if (GET_MODE (fnaddr) != ptr_mode)
4876 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4877 emit_move_insn (mem, fnaddr);
4879 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4880 emit_move_insn (mem, chain_value);
4882 /* XXX We should really define a "clear_cache" pattern and use
4883 gen_clear_cache(). */
4884 a_tramp = XEXP (m_tramp, 0);
4885 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4886 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4887 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4888 ptr_mode);
4891 static unsigned char
4892 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4894 switch (regclass)
4896 case CALLER_SAVE_REGS:
4897 case POINTER_REGS:
4898 case GENERAL_REGS:
4899 case ALL_REGS:
4900 case FP_REGS:
4901 case FP_LO_REGS:
4902 return
4903 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4904 (GET_MODE_SIZE (mode) + 7) / 8;
4905 case STACK_REG:
4906 return 1;
4908 case NO_REGS:
4909 return 0;
4911 default:
4912 break;
4914 gcc_unreachable ();
4917 static reg_class_t
4918 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4920 if (regclass == POINTER_REGS)
4921 return GENERAL_REGS;
4923 if (regclass == STACK_REG)
4925 if (REG_P(x)
4926 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4927 return regclass;
4929 return NO_REGS;
4932 /* If it's an integer immediate that MOVI can't handle, then
4933 FP_REGS is not an option, so we return NO_REGS instead. */
4934 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4935 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4936 return NO_REGS;
4938 /* Register eliminiation can result in a request for
4939 SP+constant->FP_REGS. We cannot support such operations which
4940 use SP as source and an FP_REG as destination, so reject out
4941 right now. */
4942 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4944 rtx lhs = XEXP (x, 0);
4946 /* Look through a possible SUBREG introduced by ILP32. */
4947 if (GET_CODE (lhs) == SUBREG)
4948 lhs = SUBREG_REG (lhs);
4950 gcc_assert (REG_P (lhs));
4951 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4952 POINTER_REGS));
4953 return NO_REGS;
4956 return regclass;
4959 void
4960 aarch64_asm_output_labelref (FILE* f, const char *name)
4962 asm_fprintf (f, "%U%s", name);
4965 static void
4966 aarch64_elf_asm_constructor (rtx symbol, int priority)
4968 if (priority == DEFAULT_INIT_PRIORITY)
4969 default_ctor_section_asm_out_constructor (symbol, priority);
4970 else
4972 section *s;
4973 char buf[18];
4974 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4975 s = get_section (buf, SECTION_WRITE, NULL);
4976 switch_to_section (s);
4977 assemble_align (POINTER_SIZE);
4978 assemble_aligned_integer (POINTER_BYTES, symbol);
4982 static void
4983 aarch64_elf_asm_destructor (rtx symbol, int priority)
4985 if (priority == DEFAULT_INIT_PRIORITY)
4986 default_dtor_section_asm_out_destructor (symbol, priority);
4987 else
4989 section *s;
4990 char buf[18];
4991 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4992 s = get_section (buf, SECTION_WRITE, NULL);
4993 switch_to_section (s);
4994 assemble_align (POINTER_SIZE);
4995 assemble_aligned_integer (POINTER_BYTES, symbol);
4999 const char*
5000 aarch64_output_casesi (rtx *operands)
5002 char buf[100];
5003 char label[100];
5004 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5005 int index;
5006 static const char *const patterns[4][2] =
5009 "ldrb\t%w3, [%0,%w1,uxtw]",
5010 "add\t%3, %4, %w3, sxtb #2"
5013 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5014 "add\t%3, %4, %w3, sxth #2"
5017 "ldr\t%w3, [%0,%w1,uxtw #2]",
5018 "add\t%3, %4, %w3, sxtw #2"
5020 /* We assume that DImode is only generated when not optimizing and
5021 that we don't really need 64-bit address offsets. That would
5022 imply an object file with 8GB of code in a single function! */
5024 "ldr\t%w3, [%0,%w1,uxtw #2]",
5025 "add\t%3, %4, %w3, sxtw #2"
5029 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5031 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5033 gcc_assert (index >= 0 && index <= 3);
5035 /* Need to implement table size reduction, by chaning the code below. */
5036 output_asm_insn (patterns[index][0], operands);
5037 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5038 snprintf (buf, sizeof (buf),
5039 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5040 output_asm_insn (buf, operands);
5041 output_asm_insn (patterns[index][1], operands);
5042 output_asm_insn ("br\t%3", operands);
5043 assemble_label (asm_out_file, label);
5044 return "";
5048 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5049 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5050 operator. */
5053 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5055 if (shift >= 0 && shift <= 3)
5057 int size;
5058 for (size = 8; size <= 32; size *= 2)
5060 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5061 if (mask == bits << shift)
5062 return size;
5065 return 0;
5068 static bool
5069 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5070 const_rtx x ATTRIBUTE_UNUSED)
5072 /* We can't use blocks for constants when we're using a per-function
5073 constant pool. */
5074 return false;
5077 static section *
5078 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5079 rtx x ATTRIBUTE_UNUSED,
5080 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5082 /* Force all constant pool entries into the current function section. */
5083 return function_section (current_function_decl);
5087 /* Costs. */
5089 /* Helper function for rtx cost calculation. Strip a shift expression
5090 from X. Returns the inner operand if successful, or the original
5091 expression on failure. */
5092 static rtx
5093 aarch64_strip_shift (rtx x)
5095 rtx op = x;
5097 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5098 we can convert both to ROR during final output. */
5099 if ((GET_CODE (op) == ASHIFT
5100 || GET_CODE (op) == ASHIFTRT
5101 || GET_CODE (op) == LSHIFTRT
5102 || GET_CODE (op) == ROTATERT
5103 || GET_CODE (op) == ROTATE)
5104 && CONST_INT_P (XEXP (op, 1)))
5105 return XEXP (op, 0);
5107 if (GET_CODE (op) == MULT
5108 && CONST_INT_P (XEXP (op, 1))
5109 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5110 return XEXP (op, 0);
5112 return x;
5115 /* Helper function for rtx cost calculation. Strip an extend
5116 expression from X. Returns the inner operand if successful, or the
5117 original expression on failure. We deal with a number of possible
5118 canonicalization variations here. */
5119 static rtx
5120 aarch64_strip_extend (rtx x)
5122 rtx op = x;
5124 /* Zero and sign extraction of a widened value. */
5125 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5126 && XEXP (op, 2) == const0_rtx
5127 && GET_CODE (XEXP (op, 0)) == MULT
5128 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5129 XEXP (op, 1)))
5130 return XEXP (XEXP (op, 0), 0);
5132 /* It can also be represented (for zero-extend) as an AND with an
5133 immediate. */
5134 if (GET_CODE (op) == AND
5135 && GET_CODE (XEXP (op, 0)) == MULT
5136 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5137 && CONST_INT_P (XEXP (op, 1))
5138 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5139 INTVAL (XEXP (op, 1))) != 0)
5140 return XEXP (XEXP (op, 0), 0);
5142 /* Now handle extended register, as this may also have an optional
5143 left shift by 1..4. */
5144 if (GET_CODE (op) == ASHIFT
5145 && CONST_INT_P (XEXP (op, 1))
5146 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5147 op = XEXP (op, 0);
5149 if (GET_CODE (op) == ZERO_EXTEND
5150 || GET_CODE (op) == SIGN_EXTEND)
5151 op = XEXP (op, 0);
5153 if (op != x)
5154 return op;
5156 return x;
5159 /* Helper function for rtx cost calculation. Calculate the cost of
5160 a MULT, which may be part of a multiply-accumulate rtx. Return
5161 the calculated cost of the expression, recursing manually in to
5162 operands where needed. */
5164 static int
5165 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5167 rtx op0, op1;
5168 const struct cpu_cost_table *extra_cost
5169 = aarch64_tune_params->insn_extra_cost;
5170 int cost = 0;
5171 bool maybe_fma = (outer == PLUS || outer == MINUS);
5172 machine_mode mode = GET_MODE (x);
5174 gcc_checking_assert (code == MULT);
5176 op0 = XEXP (x, 0);
5177 op1 = XEXP (x, 1);
5179 if (VECTOR_MODE_P (mode))
5180 mode = GET_MODE_INNER (mode);
5182 /* Integer multiply/fma. */
5183 if (GET_MODE_CLASS (mode) == MODE_INT)
5185 /* The multiply will be canonicalized as a shift, cost it as such. */
5186 if (CONST_INT_P (op1)
5187 && exact_log2 (INTVAL (op1)) > 0)
5189 if (speed)
5191 if (maybe_fma)
5192 /* ADD (shifted register). */
5193 cost += extra_cost->alu.arith_shift;
5194 else
5195 /* LSL (immediate). */
5196 cost += extra_cost->alu.shift;
5199 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5201 return cost;
5204 /* Integer multiplies or FMAs have zero/sign extending variants. */
5205 if ((GET_CODE (op0) == ZERO_EXTEND
5206 && GET_CODE (op1) == ZERO_EXTEND)
5207 || (GET_CODE (op0) == SIGN_EXTEND
5208 && GET_CODE (op1) == SIGN_EXTEND))
5210 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5211 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5213 if (speed)
5215 if (maybe_fma)
5216 /* MADD/SMADDL/UMADDL. */
5217 cost += extra_cost->mult[0].extend_add;
5218 else
5219 /* MUL/SMULL/UMULL. */
5220 cost += extra_cost->mult[0].extend;
5223 return cost;
5226 /* This is either an integer multiply or an FMA. In both cases
5227 we want to recurse and cost the operands. */
5228 cost += rtx_cost (op0, MULT, 0, speed)
5229 + rtx_cost (op1, MULT, 1, speed);
5231 if (speed)
5233 if (maybe_fma)
5234 /* MADD. */
5235 cost += extra_cost->mult[mode == DImode].add;
5236 else
5237 /* MUL. */
5238 cost += extra_cost->mult[mode == DImode].simple;
5241 return cost;
5243 else
5245 if (speed)
5247 /* Floating-point FMA/FMUL can also support negations of the
5248 operands. */
5249 if (GET_CODE (op0) == NEG)
5250 op0 = XEXP (op0, 0);
5251 if (GET_CODE (op1) == NEG)
5252 op1 = XEXP (op1, 0);
5254 if (maybe_fma)
5255 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5256 cost += extra_cost->fp[mode == DFmode].fma;
5257 else
5258 /* FMUL/FNMUL. */
5259 cost += extra_cost->fp[mode == DFmode].mult;
5262 cost += rtx_cost (op0, MULT, 0, speed)
5263 + rtx_cost (op1, MULT, 1, speed);
5264 return cost;
5268 static int
5269 aarch64_address_cost (rtx x,
5270 machine_mode mode,
5271 addr_space_t as ATTRIBUTE_UNUSED,
5272 bool speed)
5274 enum rtx_code c = GET_CODE (x);
5275 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5276 struct aarch64_address_info info;
5277 int cost = 0;
5278 info.shift = 0;
5280 if (!aarch64_classify_address (&info, x, mode, c, false))
5282 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5284 /* This is a CONST or SYMBOL ref which will be split
5285 in a different way depending on the code model in use.
5286 Cost it through the generic infrastructure. */
5287 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5288 /* Divide through by the cost of one instruction to
5289 bring it to the same units as the address costs. */
5290 cost_symbol_ref /= COSTS_N_INSNS (1);
5291 /* The cost is then the cost of preparing the address,
5292 followed by an immediate (possibly 0) offset. */
5293 return cost_symbol_ref + addr_cost->imm_offset;
5295 else
5297 /* This is most likely a jump table from a case
5298 statement. */
5299 return addr_cost->register_offset;
5303 switch (info.type)
5305 case ADDRESS_LO_SUM:
5306 case ADDRESS_SYMBOLIC:
5307 case ADDRESS_REG_IMM:
5308 cost += addr_cost->imm_offset;
5309 break;
5311 case ADDRESS_REG_WB:
5312 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5313 cost += addr_cost->pre_modify;
5314 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5315 cost += addr_cost->post_modify;
5316 else
5317 gcc_unreachable ();
5319 break;
5321 case ADDRESS_REG_REG:
5322 cost += addr_cost->register_offset;
5323 break;
5325 case ADDRESS_REG_UXTW:
5326 case ADDRESS_REG_SXTW:
5327 cost += addr_cost->register_extend;
5328 break;
5330 default:
5331 gcc_unreachable ();
5335 if (info.shift > 0)
5337 /* For the sake of calculating the cost of the shifted register
5338 component, we can treat same sized modes in the same way. */
5339 switch (GET_MODE_BITSIZE (mode))
5341 case 16:
5342 cost += addr_cost->addr_scale_costs.hi;
5343 break;
5345 case 32:
5346 cost += addr_cost->addr_scale_costs.si;
5347 break;
5349 case 64:
5350 cost += addr_cost->addr_scale_costs.di;
5351 break;
5353 /* We can't tell, or this is a 128-bit vector. */
5354 default:
5355 cost += addr_cost->addr_scale_costs.ti;
5356 break;
5360 return cost;
5363 /* Return true if the RTX X in mode MODE is a zero or sign extract
5364 usable in an ADD or SUB (extended register) instruction. */
5365 static bool
5366 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5368 /* Catch add with a sign extract.
5369 This is add_<optab><mode>_multp2. */
5370 if (GET_CODE (x) == SIGN_EXTRACT
5371 || GET_CODE (x) == ZERO_EXTRACT)
5373 rtx op0 = XEXP (x, 0);
5374 rtx op1 = XEXP (x, 1);
5375 rtx op2 = XEXP (x, 2);
5377 if (GET_CODE (op0) == MULT
5378 && CONST_INT_P (op1)
5379 && op2 == const0_rtx
5380 && CONST_INT_P (XEXP (op0, 1))
5381 && aarch64_is_extend_from_extract (mode,
5382 XEXP (op0, 1),
5383 op1))
5385 return true;
5389 return false;
5392 static bool
5393 aarch64_frint_unspec_p (unsigned int u)
5395 switch (u)
5397 case UNSPEC_FRINTZ:
5398 case UNSPEC_FRINTP:
5399 case UNSPEC_FRINTM:
5400 case UNSPEC_FRINTA:
5401 case UNSPEC_FRINTN:
5402 case UNSPEC_FRINTX:
5403 case UNSPEC_FRINTI:
5404 return true;
5406 default:
5407 return false;
5411 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5412 storing it in *COST. Result is true if the total cost of the operation
5413 has now been calculated. */
5414 static bool
5415 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5417 rtx inner;
5418 rtx comparator;
5419 enum rtx_code cmpcode;
5421 if (COMPARISON_P (op0))
5423 inner = XEXP (op0, 0);
5424 comparator = XEXP (op0, 1);
5425 cmpcode = GET_CODE (op0);
5427 else
5429 inner = op0;
5430 comparator = const0_rtx;
5431 cmpcode = NE;
5434 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5436 /* Conditional branch. */
5437 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5438 return true;
5439 else
5441 if (cmpcode == NE || cmpcode == EQ)
5443 if (comparator == const0_rtx)
5445 /* TBZ/TBNZ/CBZ/CBNZ. */
5446 if (GET_CODE (inner) == ZERO_EXTRACT)
5447 /* TBZ/TBNZ. */
5448 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5449 0, speed);
5450 else
5451 /* CBZ/CBNZ. */
5452 *cost += rtx_cost (inner, cmpcode, 0, speed);
5454 return true;
5457 else if (cmpcode == LT || cmpcode == GE)
5459 /* TBZ/TBNZ. */
5460 if (comparator == const0_rtx)
5461 return true;
5465 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5467 /* It's a conditional operation based on the status flags,
5468 so it must be some flavor of CSEL. */
5470 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5471 if (GET_CODE (op1) == NEG
5472 || GET_CODE (op1) == NOT
5473 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5474 op1 = XEXP (op1, 0);
5476 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5477 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5478 return true;
5481 /* We don't know what this is, cost all operands. */
5482 return false;
5485 /* Calculate the cost of calculating X, storing it in *COST. Result
5486 is true if the total cost of the operation has now been calculated. */
5487 static bool
5488 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5489 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5491 rtx op0, op1, op2;
5492 const struct cpu_cost_table *extra_cost
5493 = aarch64_tune_params->insn_extra_cost;
5494 machine_mode mode = GET_MODE (x);
5496 /* By default, assume that everything has equivalent cost to the
5497 cheapest instruction. Any additional costs are applied as a delta
5498 above this default. */
5499 *cost = COSTS_N_INSNS (1);
5501 /* TODO: The cost infrastructure currently does not handle
5502 vector operations. Assume that all vector operations
5503 are equally expensive. */
5504 if (VECTOR_MODE_P (mode))
5506 if (speed)
5507 *cost += extra_cost->vect.alu;
5508 return true;
5511 switch (code)
5513 case SET:
5514 /* The cost depends entirely on the operands to SET. */
5515 *cost = 0;
5516 op0 = SET_DEST (x);
5517 op1 = SET_SRC (x);
5519 switch (GET_CODE (op0))
5521 case MEM:
5522 if (speed)
5524 rtx address = XEXP (op0, 0);
5525 if (GET_MODE_CLASS (mode) == MODE_INT)
5526 *cost += extra_cost->ldst.store;
5527 else if (mode == SFmode)
5528 *cost += extra_cost->ldst.storef;
5529 else if (mode == DFmode)
5530 *cost += extra_cost->ldst.stored;
5532 *cost +=
5533 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5534 0, speed));
5537 *cost += rtx_cost (op1, SET, 1, speed);
5538 return true;
5540 case SUBREG:
5541 if (! REG_P (SUBREG_REG (op0)))
5542 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5544 /* Fall through. */
5545 case REG:
5546 /* const0_rtx is in general free, but we will use an
5547 instruction to set a register to 0. */
5548 if (REG_P (op1) || op1 == const0_rtx)
5550 /* The cost is 1 per register copied. */
5551 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5552 / UNITS_PER_WORD;
5553 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5555 else
5556 /* Cost is just the cost of the RHS of the set. */
5557 *cost += rtx_cost (op1, SET, 1, speed);
5558 return true;
5560 case ZERO_EXTRACT:
5561 case SIGN_EXTRACT:
5562 /* Bit-field insertion. Strip any redundant widening of
5563 the RHS to meet the width of the target. */
5564 if (GET_CODE (op1) == SUBREG)
5565 op1 = SUBREG_REG (op1);
5566 if ((GET_CODE (op1) == ZERO_EXTEND
5567 || GET_CODE (op1) == SIGN_EXTEND)
5568 && CONST_INT_P (XEXP (op0, 1))
5569 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5570 >= INTVAL (XEXP (op0, 1))))
5571 op1 = XEXP (op1, 0);
5573 if (CONST_INT_P (op1))
5575 /* MOV immediate is assumed to always be cheap. */
5576 *cost = COSTS_N_INSNS (1);
5578 else
5580 /* BFM. */
5581 if (speed)
5582 *cost += extra_cost->alu.bfi;
5583 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5586 return true;
5588 default:
5589 /* We can't make sense of this, assume default cost. */
5590 *cost = COSTS_N_INSNS (1);
5591 return false;
5593 return false;
5595 case CONST_INT:
5596 /* If an instruction can incorporate a constant within the
5597 instruction, the instruction's expression avoids calling
5598 rtx_cost() on the constant. If rtx_cost() is called on a
5599 constant, then it is usually because the constant must be
5600 moved into a register by one or more instructions.
5602 The exception is constant 0, which can be expressed
5603 as XZR/WZR and is therefore free. The exception to this is
5604 if we have (set (reg) (const0_rtx)) in which case we must cost
5605 the move. However, we can catch that when we cost the SET, so
5606 we don't need to consider that here. */
5607 if (x == const0_rtx)
5608 *cost = 0;
5609 else
5611 /* To an approximation, building any other constant is
5612 proportionally expensive to the number of instructions
5613 required to build that constant. This is true whether we
5614 are compiling for SPEED or otherwise. */
5615 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5616 (NULL_RTX, x, false, mode));
5618 return true;
5620 case CONST_DOUBLE:
5621 if (speed)
5623 /* mov[df,sf]_aarch64. */
5624 if (aarch64_float_const_representable_p (x))
5625 /* FMOV (scalar immediate). */
5626 *cost += extra_cost->fp[mode == DFmode].fpconst;
5627 else if (!aarch64_float_const_zero_rtx_p (x))
5629 /* This will be a load from memory. */
5630 if (mode == DFmode)
5631 *cost += extra_cost->ldst.loadd;
5632 else
5633 *cost += extra_cost->ldst.loadf;
5635 else
5636 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5637 or MOV v0.s[0], wzr - neither of which are modeled by the
5638 cost tables. Just use the default cost. */
5643 return true;
5645 case MEM:
5646 if (speed)
5648 /* For loads we want the base cost of a load, plus an
5649 approximation for the additional cost of the addressing
5650 mode. */
5651 rtx address = XEXP (x, 0);
5652 if (GET_MODE_CLASS (mode) == MODE_INT)
5653 *cost += extra_cost->ldst.load;
5654 else if (mode == SFmode)
5655 *cost += extra_cost->ldst.loadf;
5656 else if (mode == DFmode)
5657 *cost += extra_cost->ldst.loadd;
5659 *cost +=
5660 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5661 0, speed));
5664 return true;
5666 case NEG:
5667 op0 = XEXP (x, 0);
5669 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5671 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5672 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5674 /* CSETM. */
5675 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5676 return true;
5679 /* Cost this as SUB wzr, X. */
5680 op0 = CONST0_RTX (GET_MODE (x));
5681 op1 = XEXP (x, 0);
5682 goto cost_minus;
5685 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5687 /* Support (neg(fma...)) as a single instruction only if
5688 sign of zeros is unimportant. This matches the decision
5689 making in aarch64.md. */
5690 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5692 /* FNMADD. */
5693 *cost = rtx_cost (op0, NEG, 0, speed);
5694 return true;
5696 if (speed)
5697 /* FNEG. */
5698 *cost += extra_cost->fp[mode == DFmode].neg;
5699 return false;
5702 return false;
5704 case CLRSB:
5705 case CLZ:
5706 if (speed)
5707 *cost += extra_cost->alu.clz;
5709 return false;
5711 case COMPARE:
5712 op0 = XEXP (x, 0);
5713 op1 = XEXP (x, 1);
5715 if (op1 == const0_rtx
5716 && GET_CODE (op0) == AND)
5718 x = op0;
5719 goto cost_logic;
5722 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5724 /* TODO: A write to the CC flags possibly costs extra, this
5725 needs encoding in the cost tables. */
5727 /* CC_ZESWPmode supports zero extend for free. */
5728 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5729 op0 = XEXP (op0, 0);
5731 /* ANDS. */
5732 if (GET_CODE (op0) == AND)
5734 x = op0;
5735 goto cost_logic;
5738 if (GET_CODE (op0) == PLUS)
5740 /* ADDS (and CMN alias). */
5741 x = op0;
5742 goto cost_plus;
5745 if (GET_CODE (op0) == MINUS)
5747 /* SUBS. */
5748 x = op0;
5749 goto cost_minus;
5752 if (GET_CODE (op1) == NEG)
5754 /* CMN. */
5755 if (speed)
5756 *cost += extra_cost->alu.arith;
5758 *cost += rtx_cost (op0, COMPARE, 0, speed);
5759 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5760 return true;
5763 /* CMP.
5765 Compare can freely swap the order of operands, and
5766 canonicalization puts the more complex operation first.
5767 But the integer MINUS logic expects the shift/extend
5768 operation in op1. */
5769 if (! (REG_P (op0)
5770 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5772 op0 = XEXP (x, 1);
5773 op1 = XEXP (x, 0);
5775 goto cost_minus;
5778 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5780 /* FCMP. */
5781 if (speed)
5782 *cost += extra_cost->fp[mode == DFmode].compare;
5784 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5786 /* FCMP supports constant 0.0 for no extra cost. */
5787 return true;
5789 return false;
5792 return false;
5794 case MINUS:
5796 op0 = XEXP (x, 0);
5797 op1 = XEXP (x, 1);
5799 cost_minus:
5800 /* Detect valid immediates. */
5801 if ((GET_MODE_CLASS (mode) == MODE_INT
5802 || (GET_MODE_CLASS (mode) == MODE_CC
5803 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5804 && CONST_INT_P (op1)
5805 && aarch64_uimm12_shift (INTVAL (op1)))
5807 *cost += rtx_cost (op0, MINUS, 0, speed);
5809 if (speed)
5810 /* SUB(S) (immediate). */
5811 *cost += extra_cost->alu.arith;
5812 return true;
5816 /* Look for SUB (extended register). */
5817 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5819 if (speed)
5820 *cost += extra_cost->alu.arith_shift;
5822 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5823 (enum rtx_code) GET_CODE (op1),
5824 0, speed);
5825 return true;
5828 rtx new_op1 = aarch64_strip_extend (op1);
5830 /* Cost this as an FMA-alike operation. */
5831 if ((GET_CODE (new_op1) == MULT
5832 || GET_CODE (new_op1) == ASHIFT)
5833 && code != COMPARE)
5835 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5836 (enum rtx_code) code,
5837 speed);
5838 *cost += rtx_cost (op0, MINUS, 0, speed);
5839 return true;
5842 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5844 if (speed)
5846 if (GET_MODE_CLASS (mode) == MODE_INT)
5847 /* SUB(S). */
5848 *cost += extra_cost->alu.arith;
5849 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5850 /* FSUB. */
5851 *cost += extra_cost->fp[mode == DFmode].addsub;
5853 return true;
5856 case PLUS:
5858 rtx new_op0;
5860 op0 = XEXP (x, 0);
5861 op1 = XEXP (x, 1);
5863 cost_plus:
5864 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5865 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5867 /* CSINC. */
5868 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5869 *cost += rtx_cost (op1, PLUS, 1, speed);
5870 return true;
5873 if (GET_MODE_CLASS (mode) == MODE_INT
5874 && CONST_INT_P (op1)
5875 && aarch64_uimm12_shift (INTVAL (op1)))
5877 *cost += rtx_cost (op0, PLUS, 0, speed);
5879 if (speed)
5880 /* ADD (immediate). */
5881 *cost += extra_cost->alu.arith;
5882 return true;
5885 /* Look for ADD (extended register). */
5886 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5888 if (speed)
5889 *cost += extra_cost->alu.arith_shift;
5891 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5892 (enum rtx_code) GET_CODE (op0),
5893 0, speed);
5894 return true;
5897 /* Strip any extend, leave shifts behind as we will
5898 cost them through mult_cost. */
5899 new_op0 = aarch64_strip_extend (op0);
5901 if (GET_CODE (new_op0) == MULT
5902 || GET_CODE (new_op0) == ASHIFT)
5904 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5905 speed);
5906 *cost += rtx_cost (op1, PLUS, 1, speed);
5907 return true;
5910 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5911 + rtx_cost (op1, PLUS, 1, speed));
5913 if (speed)
5915 if (GET_MODE_CLASS (mode) == MODE_INT)
5916 /* ADD. */
5917 *cost += extra_cost->alu.arith;
5918 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5919 /* FADD. */
5920 *cost += extra_cost->fp[mode == DFmode].addsub;
5922 return true;
5925 case BSWAP:
5926 *cost = COSTS_N_INSNS (1);
5928 if (speed)
5929 *cost += extra_cost->alu.rev;
5931 return false;
5933 case IOR:
5934 if (aarch_rev16_p (x))
5936 *cost = COSTS_N_INSNS (1);
5938 if (speed)
5939 *cost += extra_cost->alu.rev;
5941 return true;
5943 /* Fall through. */
5944 case XOR:
5945 case AND:
5946 cost_logic:
5947 op0 = XEXP (x, 0);
5948 op1 = XEXP (x, 1);
5950 if (code == AND
5951 && GET_CODE (op0) == MULT
5952 && CONST_INT_P (XEXP (op0, 1))
5953 && CONST_INT_P (op1)
5954 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5955 INTVAL (op1)) != 0)
5957 /* This is a UBFM/SBFM. */
5958 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5959 if (speed)
5960 *cost += extra_cost->alu.bfx;
5961 return true;
5964 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5966 /* We possibly get the immediate for free, this is not
5967 modelled. */
5968 if (CONST_INT_P (op1)
5969 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5971 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5973 if (speed)
5974 *cost += extra_cost->alu.logical;
5976 return true;
5978 else
5980 rtx new_op0 = op0;
5982 /* Handle ORN, EON, or BIC. */
5983 if (GET_CODE (op0) == NOT)
5984 op0 = XEXP (op0, 0);
5986 new_op0 = aarch64_strip_shift (op0);
5988 /* If we had a shift on op0 then this is a logical-shift-
5989 by-register/immediate operation. Otherwise, this is just
5990 a logical operation. */
5991 if (speed)
5993 if (new_op0 != op0)
5995 /* Shift by immediate. */
5996 if (CONST_INT_P (XEXP (op0, 1)))
5997 *cost += extra_cost->alu.log_shift;
5998 else
5999 *cost += extra_cost->alu.log_shift_reg;
6001 else
6002 *cost += extra_cost->alu.logical;
6005 /* In both cases we want to cost both operands. */
6006 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6007 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6009 return true;
6012 return false;
6014 case NOT:
6015 /* MVN. */
6016 if (speed)
6017 *cost += extra_cost->alu.logical;
6019 /* The logical instruction could have the shifted register form,
6020 but the cost is the same if the shift is processed as a separate
6021 instruction, so we don't bother with it here. */
6022 return false;
6024 case ZERO_EXTEND:
6026 op0 = XEXP (x, 0);
6027 /* If a value is written in SI mode, then zero extended to DI
6028 mode, the operation will in general be free as a write to
6029 a 'w' register implicitly zeroes the upper bits of an 'x'
6030 register. However, if this is
6032 (set (reg) (zero_extend (reg)))
6034 we must cost the explicit register move. */
6035 if (mode == DImode
6036 && GET_MODE (op0) == SImode
6037 && outer == SET)
6039 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6041 if (!op_cost && speed)
6042 /* MOV. */
6043 *cost += extra_cost->alu.extend;
6044 else
6045 /* Free, the cost is that of the SI mode operation. */
6046 *cost = op_cost;
6048 return true;
6050 else if (MEM_P (XEXP (x, 0)))
6052 /* All loads can zero extend to any size for free. */
6053 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6054 return true;
6057 /* UXTB/UXTH. */
6058 if (speed)
6059 *cost += extra_cost->alu.extend;
6061 return false;
6063 case SIGN_EXTEND:
6064 if (MEM_P (XEXP (x, 0)))
6066 /* LDRSH. */
6067 if (speed)
6069 rtx address = XEXP (XEXP (x, 0), 0);
6070 *cost += extra_cost->ldst.load_sign_extend;
6072 *cost +=
6073 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6074 0, speed));
6076 return true;
6079 if (speed)
6080 *cost += extra_cost->alu.extend;
6081 return false;
6083 case ASHIFT:
6084 op0 = XEXP (x, 0);
6085 op1 = XEXP (x, 1);
6087 if (CONST_INT_P (op1))
6089 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
6090 aliases. */
6091 if (speed)
6092 *cost += extra_cost->alu.shift;
6094 /* We can incorporate zero/sign extend for free. */
6095 if (GET_CODE (op0) == ZERO_EXTEND
6096 || GET_CODE (op0) == SIGN_EXTEND)
6097 op0 = XEXP (op0, 0);
6099 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6100 return true;
6102 else
6104 /* LSLV. */
6105 if (speed)
6106 *cost += extra_cost->alu.shift_reg;
6108 return false; /* All arguments need to be in registers. */
6111 case ROTATE:
6112 case ROTATERT:
6113 case LSHIFTRT:
6114 case ASHIFTRT:
6115 op0 = XEXP (x, 0);
6116 op1 = XEXP (x, 1);
6118 if (CONST_INT_P (op1))
6120 /* ASR (immediate) and friends. */
6121 if (speed)
6122 *cost += extra_cost->alu.shift;
6124 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6125 return true;
6127 else
6130 /* ASR (register) and friends. */
6131 if (speed)
6132 *cost += extra_cost->alu.shift_reg;
6134 return false; /* All arguments need to be in registers. */
6137 case SYMBOL_REF:
6139 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6141 /* LDR. */
6142 if (speed)
6143 *cost += extra_cost->ldst.load;
6145 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6146 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6148 /* ADRP, followed by ADD. */
6149 *cost += COSTS_N_INSNS (1);
6150 if (speed)
6151 *cost += 2 * extra_cost->alu.arith;
6153 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6154 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6156 /* ADR. */
6157 if (speed)
6158 *cost += extra_cost->alu.arith;
6161 if (flag_pic)
6163 /* One extra load instruction, after accessing the GOT. */
6164 *cost += COSTS_N_INSNS (1);
6165 if (speed)
6166 *cost += extra_cost->ldst.load;
6168 return true;
6170 case HIGH:
6171 case LO_SUM:
6172 /* ADRP/ADD (immediate). */
6173 if (speed)
6174 *cost += extra_cost->alu.arith;
6175 return true;
6177 case ZERO_EXTRACT:
6178 case SIGN_EXTRACT:
6179 /* UBFX/SBFX. */
6180 if (speed)
6181 *cost += extra_cost->alu.bfx;
6183 /* We can trust that the immediates used will be correct (there
6184 are no by-register forms), so we need only cost op0. */
6185 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6186 return true;
6188 case MULT:
6189 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6190 /* aarch64_rtx_mult_cost always handles recursion to its
6191 operands. */
6192 return true;
6194 case MOD:
6195 case UMOD:
6196 if (speed)
6198 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6199 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6200 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6201 else if (GET_MODE (x) == DFmode)
6202 *cost += (extra_cost->fp[1].mult
6203 + extra_cost->fp[1].div);
6204 else if (GET_MODE (x) == SFmode)
6205 *cost += (extra_cost->fp[0].mult
6206 + extra_cost->fp[0].div);
6208 return false; /* All arguments need to be in registers. */
6210 case DIV:
6211 case UDIV:
6212 case SQRT:
6213 if (speed)
6215 if (GET_MODE_CLASS (mode) == MODE_INT)
6216 /* There is no integer SQRT, so only DIV and UDIV can get
6217 here. */
6218 *cost += extra_cost->mult[mode == DImode].idiv;
6219 else
6220 *cost += extra_cost->fp[mode == DFmode].div;
6222 return false; /* All arguments need to be in registers. */
6224 case IF_THEN_ELSE:
6225 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6226 XEXP (x, 2), cost, speed);
6228 case EQ:
6229 case NE:
6230 case GT:
6231 case GTU:
6232 case LT:
6233 case LTU:
6234 case GE:
6235 case GEU:
6236 case LE:
6237 case LEU:
6239 return false; /* All arguments must be in registers. */
6241 case FMA:
6242 op0 = XEXP (x, 0);
6243 op1 = XEXP (x, 1);
6244 op2 = XEXP (x, 2);
6246 if (speed)
6247 *cost += extra_cost->fp[mode == DFmode].fma;
6249 /* FMSUB, FNMADD, and FNMSUB are free. */
6250 if (GET_CODE (op0) == NEG)
6251 op0 = XEXP (op0, 0);
6253 if (GET_CODE (op2) == NEG)
6254 op2 = XEXP (op2, 0);
6256 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6257 and the by-element operand as operand 0. */
6258 if (GET_CODE (op1) == NEG)
6259 op1 = XEXP (op1, 0);
6261 /* Catch vector-by-element operations. The by-element operand can
6262 either be (vec_duplicate (vec_select (x))) or just
6263 (vec_select (x)), depending on whether we are multiplying by
6264 a vector or a scalar.
6266 Canonicalization is not very good in these cases, FMA4 will put the
6267 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6268 if (GET_CODE (op0) == VEC_DUPLICATE)
6269 op0 = XEXP (op0, 0);
6270 else if (GET_CODE (op1) == VEC_DUPLICATE)
6271 op1 = XEXP (op1, 0);
6273 if (GET_CODE (op0) == VEC_SELECT)
6274 op0 = XEXP (op0, 0);
6275 else if (GET_CODE (op1) == VEC_SELECT)
6276 op1 = XEXP (op1, 0);
6278 /* If the remaining parameters are not registers,
6279 get the cost to put them into registers. */
6280 *cost += rtx_cost (op0, FMA, 0, speed);
6281 *cost += rtx_cost (op1, FMA, 1, speed);
6282 *cost += rtx_cost (op2, FMA, 2, speed);
6283 return true;
6285 case FLOAT_EXTEND:
6286 if (speed)
6287 *cost += extra_cost->fp[mode == DFmode].widen;
6288 return false;
6290 case FLOAT_TRUNCATE:
6291 if (speed)
6292 *cost += extra_cost->fp[mode == DFmode].narrow;
6293 return false;
6295 case FIX:
6296 case UNSIGNED_FIX:
6297 x = XEXP (x, 0);
6298 /* Strip the rounding part. They will all be implemented
6299 by the fcvt* family of instructions anyway. */
6300 if (GET_CODE (x) == UNSPEC)
6302 unsigned int uns_code = XINT (x, 1);
6304 if (uns_code == UNSPEC_FRINTA
6305 || uns_code == UNSPEC_FRINTM
6306 || uns_code == UNSPEC_FRINTN
6307 || uns_code == UNSPEC_FRINTP
6308 || uns_code == UNSPEC_FRINTZ)
6309 x = XVECEXP (x, 0, 0);
6312 if (speed)
6313 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6315 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6316 return true;
6318 case ABS:
6319 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6321 /* FABS and FNEG are analogous. */
6322 if (speed)
6323 *cost += extra_cost->fp[mode == DFmode].neg;
6325 else
6327 /* Integer ABS will either be split to
6328 two arithmetic instructions, or will be an ABS
6329 (scalar), which we don't model. */
6330 *cost = COSTS_N_INSNS (2);
6331 if (speed)
6332 *cost += 2 * extra_cost->alu.arith;
6334 return false;
6336 case SMAX:
6337 case SMIN:
6338 if (speed)
6340 /* FMAXNM/FMINNM/FMAX/FMIN.
6341 TODO: This may not be accurate for all implementations, but
6342 we do not model this in the cost tables. */
6343 *cost += extra_cost->fp[mode == DFmode].addsub;
6345 return false;
6347 case UNSPEC:
6348 /* The floating point round to integer frint* instructions. */
6349 if (aarch64_frint_unspec_p (XINT (x, 1)))
6351 if (speed)
6352 *cost += extra_cost->fp[mode == DFmode].roundint;
6354 return false;
6357 if (XINT (x, 1) == UNSPEC_RBIT)
6359 if (speed)
6360 *cost += extra_cost->alu.rev;
6362 return false;
6364 break;
6366 case TRUNCATE:
6368 /* Decompose <su>muldi3_highpart. */
6369 if (/* (truncate:DI */
6370 mode == DImode
6371 /* (lshiftrt:TI */
6372 && GET_MODE (XEXP (x, 0)) == TImode
6373 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6374 /* (mult:TI */
6375 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6376 /* (ANY_EXTEND:TI (reg:DI))
6377 (ANY_EXTEND:TI (reg:DI))) */
6378 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6379 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6380 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6381 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6382 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6383 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6384 /* (const_int 64) */
6385 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6386 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6388 /* UMULH/SMULH. */
6389 if (speed)
6390 *cost += extra_cost->mult[mode == DImode].extend;
6391 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6392 MULT, 0, speed);
6393 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6394 MULT, 1, speed);
6395 return true;
6398 /* Fall through. */
6399 default:
6400 break;
6403 if (dump_file && (dump_flags & TDF_DETAILS))
6404 fprintf (dump_file,
6405 "\nFailed to cost RTX. Assuming default cost.\n");
6407 return true;
6410 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6411 calculated for X. This cost is stored in *COST. Returns true
6412 if the total cost of X was calculated. */
6413 static bool
6414 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6415 int param, int *cost, bool speed)
6417 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6419 if (dump_file && (dump_flags & TDF_DETAILS))
6421 print_rtl_single (dump_file, x);
6422 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6423 speed ? "Hot" : "Cold",
6424 *cost, result ? "final" : "partial");
6427 return result;
6430 static int
6431 aarch64_register_move_cost (machine_mode mode,
6432 reg_class_t from_i, reg_class_t to_i)
6434 enum reg_class from = (enum reg_class) from_i;
6435 enum reg_class to = (enum reg_class) to_i;
6436 const struct cpu_regmove_cost *regmove_cost
6437 = aarch64_tune_params->regmove_cost;
6439 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6440 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6441 to = GENERAL_REGS;
6443 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6444 from = GENERAL_REGS;
6446 /* Moving between GPR and stack cost is the same as GP2GP. */
6447 if ((from == GENERAL_REGS && to == STACK_REG)
6448 || (to == GENERAL_REGS && from == STACK_REG))
6449 return regmove_cost->GP2GP;
6451 /* To/From the stack register, we move via the gprs. */
6452 if (to == STACK_REG || from == STACK_REG)
6453 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6454 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6456 if (GET_MODE_SIZE (mode) == 16)
6458 /* 128-bit operations on general registers require 2 instructions. */
6459 if (from == GENERAL_REGS && to == GENERAL_REGS)
6460 return regmove_cost->GP2GP * 2;
6461 else if (from == GENERAL_REGS)
6462 return regmove_cost->GP2FP * 2;
6463 else if (to == GENERAL_REGS)
6464 return regmove_cost->FP2GP * 2;
6466 /* When AdvSIMD instructions are disabled it is not possible to move
6467 a 128-bit value directly between Q registers. This is handled in
6468 secondary reload. A general register is used as a scratch to move
6469 the upper DI value and the lower DI value is moved directly,
6470 hence the cost is the sum of three moves. */
6471 if (! TARGET_SIMD)
6472 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6474 return regmove_cost->FP2FP;
6477 if (from == GENERAL_REGS && to == GENERAL_REGS)
6478 return regmove_cost->GP2GP;
6479 else if (from == GENERAL_REGS)
6480 return regmove_cost->GP2FP;
6481 else if (to == GENERAL_REGS)
6482 return regmove_cost->FP2GP;
6484 return regmove_cost->FP2FP;
6487 static int
6488 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6489 reg_class_t rclass ATTRIBUTE_UNUSED,
6490 bool in ATTRIBUTE_UNUSED)
6492 return aarch64_tune_params->memmov_cost;
6495 /* Return the number of instructions that can be issued per cycle. */
6496 static int
6497 aarch64_sched_issue_rate (void)
6499 return aarch64_tune_params->issue_rate;
6502 /* Vectorizer cost model target hooks. */
6504 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6505 static int
6506 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6507 tree vectype,
6508 int misalign ATTRIBUTE_UNUSED)
6510 unsigned elements;
6512 switch (type_of_cost)
6514 case scalar_stmt:
6515 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6517 case scalar_load:
6518 return aarch64_tune_params->vec_costs->scalar_load_cost;
6520 case scalar_store:
6521 return aarch64_tune_params->vec_costs->scalar_store_cost;
6523 case vector_stmt:
6524 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6526 case vector_load:
6527 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6529 case vector_store:
6530 return aarch64_tune_params->vec_costs->vec_store_cost;
6532 case vec_to_scalar:
6533 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6535 case scalar_to_vec:
6536 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6538 case unaligned_load:
6539 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6541 case unaligned_store:
6542 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6544 case cond_branch_taken:
6545 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6547 case cond_branch_not_taken:
6548 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6550 case vec_perm:
6551 case vec_promote_demote:
6552 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6554 case vec_construct:
6555 elements = TYPE_VECTOR_SUBPARTS (vectype);
6556 return elements / 2 + 1;
6558 default:
6559 gcc_unreachable ();
6563 /* Implement targetm.vectorize.add_stmt_cost. */
6564 static unsigned
6565 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6566 struct _stmt_vec_info *stmt_info, int misalign,
6567 enum vect_cost_model_location where)
6569 unsigned *cost = (unsigned *) data;
6570 unsigned retval = 0;
6572 if (flag_vect_cost_model)
6574 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6575 int stmt_cost =
6576 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6578 /* Statements in an inner loop relative to the loop being
6579 vectorized are weighted more heavily. The value here is
6580 a function (linear for now) of the loop nest level. */
6581 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6583 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6584 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6585 unsigned nest_level = loop_depth (loop);
6587 count *= nest_level;
6590 retval = (unsigned) (count * stmt_cost);
6591 cost[where] += retval;
6594 return retval;
6597 static void initialize_aarch64_code_model (void);
6599 /* Parse the architecture extension string. */
6601 static void
6602 aarch64_parse_extension (char *str)
6604 /* The extension string is parsed left to right. */
6605 const struct aarch64_option_extension *opt = NULL;
6607 /* Flag to say whether we are adding or removing an extension. */
6608 int adding_ext = -1;
6610 while (str != NULL && *str != 0)
6612 char *ext;
6613 size_t len;
6615 str++;
6616 ext = strchr (str, '+');
6618 if (ext != NULL)
6619 len = ext - str;
6620 else
6621 len = strlen (str);
6623 if (len >= 2 && strncmp (str, "no", 2) == 0)
6625 adding_ext = 0;
6626 len -= 2;
6627 str += 2;
6629 else if (len > 0)
6630 adding_ext = 1;
6632 if (len == 0)
6634 error ("missing feature modifier after %qs", adding_ext ? "+"
6635 : "+no");
6636 return;
6639 /* Scan over the extensions table trying to find an exact match. */
6640 for (opt = all_extensions; opt->name != NULL; opt++)
6642 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6644 /* Add or remove the extension. */
6645 if (adding_ext)
6646 aarch64_isa_flags |= opt->flags_on;
6647 else
6648 aarch64_isa_flags &= ~(opt->flags_off);
6649 break;
6653 if (opt->name == NULL)
6655 /* Extension not found in list. */
6656 error ("unknown feature modifier %qs", str);
6657 return;
6660 str = ext;
6663 return;
6666 /* Parse the ARCH string. */
6668 static void
6669 aarch64_parse_arch (void)
6671 char *ext;
6672 const struct processor *arch;
6673 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6674 size_t len;
6676 strcpy (str, aarch64_arch_string);
6678 ext = strchr (str, '+');
6680 if (ext != NULL)
6681 len = ext - str;
6682 else
6683 len = strlen (str);
6685 if (len == 0)
6687 error ("missing arch name in -march=%qs", str);
6688 return;
6691 /* Loop through the list of supported ARCHs to find a match. */
6692 for (arch = all_architectures; arch->name != NULL; arch++)
6694 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6696 selected_arch = arch;
6697 aarch64_isa_flags = selected_arch->flags;
6699 if (!selected_cpu)
6700 selected_cpu = &all_cores[selected_arch->core];
6702 if (ext != NULL)
6704 /* ARCH string contains at least one extension. */
6705 aarch64_parse_extension (ext);
6708 if (strcmp (selected_arch->arch, selected_cpu->arch))
6710 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6711 selected_cpu->name, selected_arch->name);
6714 return;
6718 /* ARCH name not found in list. */
6719 error ("unknown value %qs for -march", str);
6720 return;
6723 /* Parse the CPU string. */
6725 static void
6726 aarch64_parse_cpu (void)
6728 char *ext;
6729 const struct processor *cpu;
6730 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6731 size_t len;
6733 strcpy (str, aarch64_cpu_string);
6735 ext = strchr (str, '+');
6737 if (ext != NULL)
6738 len = ext - str;
6739 else
6740 len = strlen (str);
6742 if (len == 0)
6744 error ("missing cpu name in -mcpu=%qs", str);
6745 return;
6748 /* Loop through the list of supported CPUs to find a match. */
6749 for (cpu = all_cores; cpu->name != NULL; cpu++)
6751 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6753 selected_cpu = cpu;
6754 aarch64_isa_flags = selected_cpu->flags;
6756 if (ext != NULL)
6758 /* CPU string contains at least one extension. */
6759 aarch64_parse_extension (ext);
6762 return;
6766 /* CPU name not found in list. */
6767 error ("unknown value %qs for -mcpu", str);
6768 return;
6771 /* Parse the TUNE string. */
6773 static void
6774 aarch64_parse_tune (void)
6776 const struct processor *cpu;
6777 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6778 strcpy (str, aarch64_tune_string);
6780 /* Loop through the list of supported CPUs to find a match. */
6781 for (cpu = all_cores; cpu->name != NULL; cpu++)
6783 if (strcmp (cpu->name, str) == 0)
6785 selected_tune = cpu;
6786 return;
6790 /* CPU name not found in list. */
6791 error ("unknown value %qs for -mtune", str);
6792 return;
6796 /* Implement TARGET_OPTION_OVERRIDE. */
6798 static void
6799 aarch64_override_options (void)
6801 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6802 If either of -march or -mtune is given, they override their
6803 respective component of -mcpu.
6805 So, first parse AARCH64_CPU_STRING, then the others, be careful
6806 with -march as, if -mcpu is not present on the command line, march
6807 must set a sensible default CPU. */
6808 if (aarch64_cpu_string)
6810 aarch64_parse_cpu ();
6813 if (aarch64_arch_string)
6815 aarch64_parse_arch ();
6818 if (aarch64_tune_string)
6820 aarch64_parse_tune ();
6823 #ifndef HAVE_AS_MABI_OPTION
6824 /* The compiler may have been configured with 2.23.* binutils, which does
6825 not have support for ILP32. */
6826 if (TARGET_ILP32)
6827 error ("Assembler does not support -mabi=ilp32");
6828 #endif
6830 initialize_aarch64_code_model ();
6832 aarch64_build_bitmask_table ();
6834 /* This target defaults to strict volatile bitfields. */
6835 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6836 flag_strict_volatile_bitfields = 1;
6838 /* If the user did not specify a processor, choose the default
6839 one for them. This will be the CPU set during configuration using
6840 --with-cpu, otherwise it is "generic". */
6841 if (!selected_cpu)
6843 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6844 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6847 gcc_assert (selected_cpu);
6849 if (!selected_tune)
6850 selected_tune = selected_cpu;
6852 aarch64_tune_flags = selected_tune->flags;
6853 aarch64_tune = selected_tune->core;
6854 aarch64_tune_params = selected_tune->tune;
6855 aarch64_architecture_version = selected_cpu->architecture_version;
6857 if (aarch64_fix_a53_err835769 == 2)
6859 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6860 aarch64_fix_a53_err835769 = 1;
6861 #else
6862 aarch64_fix_a53_err835769 = 0;
6863 #endif
6866 /* If not opzimizing for size, set the default
6867 alignment to what the target wants */
6868 if (!optimize_size)
6870 if (align_loops <= 0)
6871 align_loops = aarch64_tune_params->loop_align;
6872 if (align_jumps <= 0)
6873 align_jumps = aarch64_tune_params->jump_align;
6874 if (align_functions <= 0)
6875 align_functions = aarch64_tune_params->function_align;
6878 aarch64_override_options_after_change ();
6881 /* Implement targetm.override_options_after_change. */
6883 static void
6884 aarch64_override_options_after_change (void)
6886 if (flag_omit_frame_pointer)
6887 flag_omit_leaf_frame_pointer = false;
6888 else if (flag_omit_leaf_frame_pointer)
6889 flag_omit_frame_pointer = true;
6892 static struct machine_function *
6893 aarch64_init_machine_status (void)
6895 struct machine_function *machine;
6896 machine = ggc_cleared_alloc<machine_function> ();
6897 return machine;
6900 void
6901 aarch64_init_expanders (void)
6903 init_machine_status = aarch64_init_machine_status;
6906 /* A checking mechanism for the implementation of the various code models. */
6907 static void
6908 initialize_aarch64_code_model (void)
6910 if (flag_pic)
6912 switch (aarch64_cmodel_var)
6914 case AARCH64_CMODEL_TINY:
6915 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6916 break;
6917 case AARCH64_CMODEL_SMALL:
6918 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6919 break;
6920 case AARCH64_CMODEL_LARGE:
6921 sorry ("code model %qs with -f%s", "large",
6922 flag_pic > 1 ? "PIC" : "pic");
6923 default:
6924 gcc_unreachable ();
6927 else
6928 aarch64_cmodel = aarch64_cmodel_var;
6931 /* Return true if SYMBOL_REF X binds locally. */
6933 static bool
6934 aarch64_symbol_binds_local_p (const_rtx x)
6936 return (SYMBOL_REF_DECL (x)
6937 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6938 : SYMBOL_REF_LOCAL_P (x));
6941 /* Return true if SYMBOL_REF X is thread local */
6942 static bool
6943 aarch64_tls_symbol_p (rtx x)
6945 if (! TARGET_HAVE_TLS)
6946 return false;
6948 if (GET_CODE (x) != SYMBOL_REF)
6949 return false;
6951 return SYMBOL_REF_TLS_MODEL (x) != 0;
6954 /* Classify a TLS symbol into one of the TLS kinds. */
6955 enum aarch64_symbol_type
6956 aarch64_classify_tls_symbol (rtx x)
6958 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6960 switch (tls_kind)
6962 case TLS_MODEL_GLOBAL_DYNAMIC:
6963 case TLS_MODEL_LOCAL_DYNAMIC:
6964 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6966 case TLS_MODEL_INITIAL_EXEC:
6967 return SYMBOL_SMALL_GOTTPREL;
6969 case TLS_MODEL_LOCAL_EXEC:
6970 return SYMBOL_SMALL_TPREL;
6972 case TLS_MODEL_EMULATED:
6973 case TLS_MODEL_NONE:
6974 return SYMBOL_FORCE_TO_MEM;
6976 default:
6977 gcc_unreachable ();
6981 /* Return the method that should be used to access SYMBOL_REF or
6982 LABEL_REF X in context CONTEXT. */
6984 enum aarch64_symbol_type
6985 aarch64_classify_symbol (rtx x, rtx offset,
6986 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6988 if (GET_CODE (x) == LABEL_REF)
6990 switch (aarch64_cmodel)
6992 case AARCH64_CMODEL_LARGE:
6993 return SYMBOL_FORCE_TO_MEM;
6995 case AARCH64_CMODEL_TINY_PIC:
6996 case AARCH64_CMODEL_TINY:
6997 return SYMBOL_TINY_ABSOLUTE;
6999 case AARCH64_CMODEL_SMALL_PIC:
7000 case AARCH64_CMODEL_SMALL:
7001 return SYMBOL_SMALL_ABSOLUTE;
7003 default:
7004 gcc_unreachable ();
7008 if (GET_CODE (x) == SYMBOL_REF)
7010 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7011 return SYMBOL_FORCE_TO_MEM;
7013 if (aarch64_tls_symbol_p (x))
7014 return aarch64_classify_tls_symbol (x);
7016 switch (aarch64_cmodel)
7018 case AARCH64_CMODEL_TINY:
7019 /* When we retreive symbol + offset address, we have to make sure
7020 the offset does not cause overflow of the final address. But
7021 we have no way of knowing the address of symbol at compile time
7022 so we can't accurately say if the distance between the PC and
7023 symbol + offset is outside the addressible range of +/-1M in the
7024 TINY code model. So we rely on images not being greater than
7025 1M and cap the offset at 1M and anything beyond 1M will have to
7026 be loaded using an alternative mechanism. */
7027 if (SYMBOL_REF_WEAK (x)
7028 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7029 return SYMBOL_FORCE_TO_MEM;
7030 return SYMBOL_TINY_ABSOLUTE;
7032 case AARCH64_CMODEL_SMALL:
7033 /* Same reasoning as the tiny code model, but the offset cap here is
7034 4G. */
7035 if (SYMBOL_REF_WEAK (x)
7036 || INTVAL (offset) < (HOST_WIDE_INT) -4294967263
7037 || INTVAL (offset) > (HOST_WIDE_INT) 4294967264)
7038 return SYMBOL_FORCE_TO_MEM;
7039 return SYMBOL_SMALL_ABSOLUTE;
7041 case AARCH64_CMODEL_TINY_PIC:
7042 if (!aarch64_symbol_binds_local_p (x))
7043 return SYMBOL_TINY_GOT;
7044 return SYMBOL_TINY_ABSOLUTE;
7046 case AARCH64_CMODEL_SMALL_PIC:
7047 if (!aarch64_symbol_binds_local_p (x))
7048 return SYMBOL_SMALL_GOT;
7049 return SYMBOL_SMALL_ABSOLUTE;
7051 default:
7052 gcc_unreachable ();
7056 /* By default push everything into the constant pool. */
7057 return SYMBOL_FORCE_TO_MEM;
7060 bool
7061 aarch64_constant_address_p (rtx x)
7063 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7066 bool
7067 aarch64_legitimate_pic_operand_p (rtx x)
7069 if (GET_CODE (x) == SYMBOL_REF
7070 || (GET_CODE (x) == CONST
7071 && GET_CODE (XEXP (x, 0)) == PLUS
7072 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7073 return false;
7075 return true;
7078 /* Return true if X holds either a quarter-precision or
7079 floating-point +0.0 constant. */
7080 static bool
7081 aarch64_valid_floating_const (machine_mode mode, rtx x)
7083 if (!CONST_DOUBLE_P (x))
7084 return false;
7086 /* TODO: We could handle moving 0.0 to a TFmode register,
7087 but first we would like to refactor the movtf_aarch64
7088 to be more amicable to split moves properly and
7089 correctly gate on TARGET_SIMD. For now - reject all
7090 constants which are not to SFmode or DFmode registers. */
7091 if (!(mode == SFmode || mode == DFmode))
7092 return false;
7094 if (aarch64_float_const_zero_rtx_p (x))
7095 return true;
7096 return aarch64_float_const_representable_p (x);
7099 static bool
7100 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7102 /* Do not allow vector struct mode constants. We could support
7103 0 and -1 easily, but they need support in aarch64-simd.md. */
7104 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7105 return false;
7107 /* This could probably go away because
7108 we now decompose CONST_INTs according to expand_mov_immediate. */
7109 if ((GET_CODE (x) == CONST_VECTOR
7110 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7111 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7112 return !targetm.cannot_force_const_mem (mode, x);
7114 if (GET_CODE (x) == HIGH
7115 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7116 return true;
7118 return aarch64_constant_address_p (x);
7122 aarch64_load_tp (rtx target)
7124 if (!target
7125 || GET_MODE (target) != Pmode
7126 || !register_operand (target, Pmode))
7127 target = gen_reg_rtx (Pmode);
7129 /* Can return in any reg. */
7130 emit_insn (gen_aarch64_load_tp_hard (target));
7131 return target;
7134 /* On AAPCS systems, this is the "struct __va_list". */
7135 static GTY(()) tree va_list_type;
7137 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7138 Return the type to use as __builtin_va_list.
7140 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7142 struct __va_list
7144 void *__stack;
7145 void *__gr_top;
7146 void *__vr_top;
7147 int __gr_offs;
7148 int __vr_offs;
7149 }; */
7151 static tree
7152 aarch64_build_builtin_va_list (void)
7154 tree va_list_name;
7155 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7157 /* Create the type. */
7158 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7159 /* Give it the required name. */
7160 va_list_name = build_decl (BUILTINS_LOCATION,
7161 TYPE_DECL,
7162 get_identifier ("__va_list"),
7163 va_list_type);
7164 DECL_ARTIFICIAL (va_list_name) = 1;
7165 TYPE_NAME (va_list_type) = va_list_name;
7166 TYPE_STUB_DECL (va_list_type) = va_list_name;
7168 /* Create the fields. */
7169 f_stack = build_decl (BUILTINS_LOCATION,
7170 FIELD_DECL, get_identifier ("__stack"),
7171 ptr_type_node);
7172 f_grtop = build_decl (BUILTINS_LOCATION,
7173 FIELD_DECL, get_identifier ("__gr_top"),
7174 ptr_type_node);
7175 f_vrtop = build_decl (BUILTINS_LOCATION,
7176 FIELD_DECL, get_identifier ("__vr_top"),
7177 ptr_type_node);
7178 f_groff = build_decl (BUILTINS_LOCATION,
7179 FIELD_DECL, get_identifier ("__gr_offs"),
7180 integer_type_node);
7181 f_vroff = build_decl (BUILTINS_LOCATION,
7182 FIELD_DECL, get_identifier ("__vr_offs"),
7183 integer_type_node);
7185 DECL_ARTIFICIAL (f_stack) = 1;
7186 DECL_ARTIFICIAL (f_grtop) = 1;
7187 DECL_ARTIFICIAL (f_vrtop) = 1;
7188 DECL_ARTIFICIAL (f_groff) = 1;
7189 DECL_ARTIFICIAL (f_vroff) = 1;
7191 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7192 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7193 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7194 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7195 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7197 TYPE_FIELDS (va_list_type) = f_stack;
7198 DECL_CHAIN (f_stack) = f_grtop;
7199 DECL_CHAIN (f_grtop) = f_vrtop;
7200 DECL_CHAIN (f_vrtop) = f_groff;
7201 DECL_CHAIN (f_groff) = f_vroff;
7203 /* Compute its layout. */
7204 layout_type (va_list_type);
7206 return va_list_type;
7209 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7210 static void
7211 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7213 const CUMULATIVE_ARGS *cum;
7214 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7215 tree stack, grtop, vrtop, groff, vroff;
7216 tree t;
7217 int gr_save_area_size;
7218 int vr_save_area_size;
7219 int vr_offset;
7221 cum = &crtl->args.info;
7222 gr_save_area_size
7223 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7224 vr_save_area_size
7225 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7227 if (TARGET_GENERAL_REGS_ONLY)
7229 if (cum->aapcs_nvrn > 0)
7230 sorry ("%qs and floating point or vector arguments",
7231 "-mgeneral-regs-only");
7232 vr_save_area_size = 0;
7235 f_stack = TYPE_FIELDS (va_list_type_node);
7236 f_grtop = DECL_CHAIN (f_stack);
7237 f_vrtop = DECL_CHAIN (f_grtop);
7238 f_groff = DECL_CHAIN (f_vrtop);
7239 f_vroff = DECL_CHAIN (f_groff);
7241 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7242 NULL_TREE);
7243 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7244 NULL_TREE);
7245 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7246 NULL_TREE);
7247 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7248 NULL_TREE);
7249 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7250 NULL_TREE);
7252 /* Emit code to initialize STACK, which points to the next varargs stack
7253 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7254 by named arguments. STACK is 8-byte aligned. */
7255 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7256 if (cum->aapcs_stack_size > 0)
7257 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7258 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7259 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7261 /* Emit code to initialize GRTOP, the top of the GR save area.
7262 virtual_incoming_args_rtx should have been 16 byte aligned. */
7263 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7264 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7265 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7267 /* Emit code to initialize VRTOP, the top of the VR save area.
7268 This address is gr_save_area_bytes below GRTOP, rounded
7269 down to the next 16-byte boundary. */
7270 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7271 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7272 STACK_BOUNDARY / BITS_PER_UNIT);
7274 if (vr_offset)
7275 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7276 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7277 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7279 /* Emit code to initialize GROFF, the offset from GRTOP of the
7280 next GPR argument. */
7281 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7282 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7283 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7285 /* Likewise emit code to initialize VROFF, the offset from FTOP
7286 of the next VR argument. */
7287 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7288 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7289 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7292 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7294 static tree
7295 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7296 gimple_seq *post_p ATTRIBUTE_UNUSED)
7298 tree addr;
7299 bool indirect_p;
7300 bool is_ha; /* is HFA or HVA. */
7301 bool dw_align; /* double-word align. */
7302 machine_mode ag_mode = VOIDmode;
7303 int nregs;
7304 machine_mode mode;
7306 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7307 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7308 HOST_WIDE_INT size, rsize, adjust, align;
7309 tree t, u, cond1, cond2;
7311 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7312 if (indirect_p)
7313 type = build_pointer_type (type);
7315 mode = TYPE_MODE (type);
7317 f_stack = TYPE_FIELDS (va_list_type_node);
7318 f_grtop = DECL_CHAIN (f_stack);
7319 f_vrtop = DECL_CHAIN (f_grtop);
7320 f_groff = DECL_CHAIN (f_vrtop);
7321 f_vroff = DECL_CHAIN (f_groff);
7323 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7324 f_stack, NULL_TREE);
7325 size = int_size_in_bytes (type);
7326 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7328 dw_align = false;
7329 adjust = 0;
7330 if (aarch64_vfp_is_call_or_return_candidate (mode,
7331 type,
7332 &ag_mode,
7333 &nregs,
7334 &is_ha))
7336 /* TYPE passed in fp/simd registers. */
7337 if (TARGET_GENERAL_REGS_ONLY)
7338 sorry ("%qs and floating point or vector arguments",
7339 "-mgeneral-regs-only");
7341 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7342 unshare_expr (valist), f_vrtop, NULL_TREE);
7343 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7344 unshare_expr (valist), f_vroff, NULL_TREE);
7346 rsize = nregs * UNITS_PER_VREG;
7348 if (is_ha)
7350 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7351 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7353 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7354 && size < UNITS_PER_VREG)
7356 adjust = UNITS_PER_VREG - size;
7359 else
7361 /* TYPE passed in general registers. */
7362 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7363 unshare_expr (valist), f_grtop, NULL_TREE);
7364 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7365 unshare_expr (valist), f_groff, NULL_TREE);
7366 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7367 nregs = rsize / UNITS_PER_WORD;
7369 if (align > 8)
7370 dw_align = true;
7372 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7373 && size < UNITS_PER_WORD)
7375 adjust = UNITS_PER_WORD - size;
7379 /* Get a local temporary for the field value. */
7380 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7382 /* Emit code to branch if off >= 0. */
7383 t = build2 (GE_EXPR, boolean_type_node, off,
7384 build_int_cst (TREE_TYPE (off), 0));
7385 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7387 if (dw_align)
7389 /* Emit: offs = (offs + 15) & -16. */
7390 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7391 build_int_cst (TREE_TYPE (off), 15));
7392 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7393 build_int_cst (TREE_TYPE (off), -16));
7394 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7396 else
7397 roundup = NULL;
7399 /* Update ap.__[g|v]r_offs */
7400 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7401 build_int_cst (TREE_TYPE (off), rsize));
7402 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7404 /* String up. */
7405 if (roundup)
7406 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7408 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7409 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7410 build_int_cst (TREE_TYPE (f_off), 0));
7411 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7413 /* String up: make sure the assignment happens before the use. */
7414 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7415 COND_EXPR_ELSE (cond1) = t;
7417 /* Prepare the trees handling the argument that is passed on the stack;
7418 the top level node will store in ON_STACK. */
7419 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7420 if (align > 8)
7422 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7423 t = fold_convert (intDI_type_node, arg);
7424 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7425 build_int_cst (TREE_TYPE (t), 15));
7426 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7427 build_int_cst (TREE_TYPE (t), -16));
7428 t = fold_convert (TREE_TYPE (arg), t);
7429 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7431 else
7432 roundup = NULL;
7433 /* Advance ap.__stack */
7434 t = fold_convert (intDI_type_node, arg);
7435 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7436 build_int_cst (TREE_TYPE (t), size + 7));
7437 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7438 build_int_cst (TREE_TYPE (t), -8));
7439 t = fold_convert (TREE_TYPE (arg), t);
7440 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7441 /* String up roundup and advance. */
7442 if (roundup)
7443 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7444 /* String up with arg */
7445 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7446 /* Big-endianness related address adjustment. */
7447 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7448 && size < UNITS_PER_WORD)
7450 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7451 size_int (UNITS_PER_WORD - size));
7452 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7455 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7456 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7458 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7459 t = off;
7460 if (adjust)
7461 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7462 build_int_cst (TREE_TYPE (off), adjust));
7464 t = fold_convert (sizetype, t);
7465 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7467 if (is_ha)
7469 /* type ha; // treat as "struct {ftype field[n];}"
7470 ... [computing offs]
7471 for (i = 0; i <nregs; ++i, offs += 16)
7472 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7473 return ha; */
7474 int i;
7475 tree tmp_ha, field_t, field_ptr_t;
7477 /* Declare a local variable. */
7478 tmp_ha = create_tmp_var_raw (type, "ha");
7479 gimple_add_tmp_var (tmp_ha);
7481 /* Establish the base type. */
7482 switch (ag_mode)
7484 case SFmode:
7485 field_t = float_type_node;
7486 field_ptr_t = float_ptr_type_node;
7487 break;
7488 case DFmode:
7489 field_t = double_type_node;
7490 field_ptr_t = double_ptr_type_node;
7491 break;
7492 case TFmode:
7493 field_t = long_double_type_node;
7494 field_ptr_t = long_double_ptr_type_node;
7495 break;
7496 /* The half precision and quad precision are not fully supported yet. Enable
7497 the following code after the support is complete. Need to find the correct
7498 type node for __fp16 *. */
7499 #if 0
7500 case HFmode:
7501 field_t = float_type_node;
7502 field_ptr_t = float_ptr_type_node;
7503 break;
7504 #endif
7505 case V2SImode:
7506 case V4SImode:
7508 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7509 field_t = build_vector_type_for_mode (innertype, ag_mode);
7510 field_ptr_t = build_pointer_type (field_t);
7512 break;
7513 default:
7514 gcc_assert (0);
7517 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7518 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7519 addr = t;
7520 t = fold_convert (field_ptr_t, addr);
7521 t = build2 (MODIFY_EXPR, field_t,
7522 build1 (INDIRECT_REF, field_t, tmp_ha),
7523 build1 (INDIRECT_REF, field_t, t));
7525 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7526 for (i = 1; i < nregs; ++i)
7528 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7529 u = fold_convert (field_ptr_t, addr);
7530 u = build2 (MODIFY_EXPR, field_t,
7531 build2 (MEM_REF, field_t, tmp_ha,
7532 build_int_cst (field_ptr_t,
7533 (i *
7534 int_size_in_bytes (field_t)))),
7535 build1 (INDIRECT_REF, field_t, u));
7536 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7539 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7540 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7543 COND_EXPR_ELSE (cond2) = t;
7544 addr = fold_convert (build_pointer_type (type), cond1);
7545 addr = build_va_arg_indirect_ref (addr);
7547 if (indirect_p)
7548 addr = build_va_arg_indirect_ref (addr);
7550 return addr;
7553 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7555 static void
7556 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7557 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7558 int no_rtl)
7560 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7561 CUMULATIVE_ARGS local_cum;
7562 int gr_saved, vr_saved;
7564 /* The caller has advanced CUM up to, but not beyond, the last named
7565 argument. Advance a local copy of CUM past the last "real" named
7566 argument, to find out how many registers are left over. */
7567 local_cum = *cum;
7568 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7570 /* Found out how many registers we need to save. */
7571 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7572 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7574 if (TARGET_GENERAL_REGS_ONLY)
7576 if (local_cum.aapcs_nvrn > 0)
7577 sorry ("%qs and floating point or vector arguments",
7578 "-mgeneral-regs-only");
7579 vr_saved = 0;
7582 if (!no_rtl)
7584 if (gr_saved > 0)
7586 rtx ptr, mem;
7588 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7589 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7590 - gr_saved * UNITS_PER_WORD);
7591 mem = gen_frame_mem (BLKmode, ptr);
7592 set_mem_alias_set (mem, get_varargs_alias_set ());
7594 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7595 mem, gr_saved);
7597 if (vr_saved > 0)
7599 /* We can't use move_block_from_reg, because it will use
7600 the wrong mode, storing D regs only. */
7601 machine_mode mode = TImode;
7602 int off, i;
7604 /* Set OFF to the offset from virtual_incoming_args_rtx of
7605 the first vector register. The VR save area lies below
7606 the GR one, and is aligned to 16 bytes. */
7607 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7608 STACK_BOUNDARY / BITS_PER_UNIT);
7609 off -= vr_saved * UNITS_PER_VREG;
7611 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7613 rtx ptr, mem;
7615 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7616 mem = gen_frame_mem (mode, ptr);
7617 set_mem_alias_set (mem, get_varargs_alias_set ());
7618 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7619 off += UNITS_PER_VREG;
7624 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7625 any complication of having crtl->args.pretend_args_size changed. */
7626 cfun->machine->frame.saved_varargs_size
7627 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7628 STACK_BOUNDARY / BITS_PER_UNIT)
7629 + vr_saved * UNITS_PER_VREG);
7632 static void
7633 aarch64_conditional_register_usage (void)
7635 int i;
7636 if (!TARGET_FLOAT)
7638 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7640 fixed_regs[i] = 1;
7641 call_used_regs[i] = 1;
7646 /* Walk down the type tree of TYPE counting consecutive base elements.
7647 If *MODEP is VOIDmode, then set it to the first valid floating point
7648 type. If a non-floating point type is found, or if a floating point
7649 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7650 otherwise return the count in the sub-tree. */
7651 static int
7652 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7654 machine_mode mode;
7655 HOST_WIDE_INT size;
7657 switch (TREE_CODE (type))
7659 case REAL_TYPE:
7660 mode = TYPE_MODE (type);
7661 if (mode != DFmode && mode != SFmode && mode != TFmode)
7662 return -1;
7664 if (*modep == VOIDmode)
7665 *modep = mode;
7667 if (*modep == mode)
7668 return 1;
7670 break;
7672 case COMPLEX_TYPE:
7673 mode = TYPE_MODE (TREE_TYPE (type));
7674 if (mode != DFmode && mode != SFmode && mode != TFmode)
7675 return -1;
7677 if (*modep == VOIDmode)
7678 *modep = mode;
7680 if (*modep == mode)
7681 return 2;
7683 break;
7685 case VECTOR_TYPE:
7686 /* Use V2SImode and V4SImode as representatives of all 64-bit
7687 and 128-bit vector types. */
7688 size = int_size_in_bytes (type);
7689 switch (size)
7691 case 8:
7692 mode = V2SImode;
7693 break;
7694 case 16:
7695 mode = V4SImode;
7696 break;
7697 default:
7698 return -1;
7701 if (*modep == VOIDmode)
7702 *modep = mode;
7704 /* Vector modes are considered to be opaque: two vectors are
7705 equivalent for the purposes of being homogeneous aggregates
7706 if they are the same size. */
7707 if (*modep == mode)
7708 return 1;
7710 break;
7712 case ARRAY_TYPE:
7714 int count;
7715 tree index = TYPE_DOMAIN (type);
7717 /* Can't handle incomplete types nor sizes that are not
7718 fixed. */
7719 if (!COMPLETE_TYPE_P (type)
7720 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7721 return -1;
7723 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7724 if (count == -1
7725 || !index
7726 || !TYPE_MAX_VALUE (index)
7727 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7728 || !TYPE_MIN_VALUE (index)
7729 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7730 || count < 0)
7731 return -1;
7733 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7734 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7736 /* There must be no padding. */
7737 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7738 return -1;
7740 return count;
7743 case RECORD_TYPE:
7745 int count = 0;
7746 int sub_count;
7747 tree field;
7749 /* Can't handle incomplete types nor sizes that are not
7750 fixed. */
7751 if (!COMPLETE_TYPE_P (type)
7752 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7753 return -1;
7755 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7757 if (TREE_CODE (field) != FIELD_DECL)
7758 continue;
7760 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7761 if (sub_count < 0)
7762 return -1;
7763 count += sub_count;
7766 /* There must be no padding. */
7767 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7768 return -1;
7770 return count;
7773 case UNION_TYPE:
7774 case QUAL_UNION_TYPE:
7776 /* These aren't very interesting except in a degenerate case. */
7777 int count = 0;
7778 int sub_count;
7779 tree field;
7781 /* Can't handle incomplete types nor sizes that are not
7782 fixed. */
7783 if (!COMPLETE_TYPE_P (type)
7784 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7785 return -1;
7787 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7789 if (TREE_CODE (field) != FIELD_DECL)
7790 continue;
7792 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7793 if (sub_count < 0)
7794 return -1;
7795 count = count > sub_count ? count : sub_count;
7798 /* There must be no padding. */
7799 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7800 return -1;
7802 return count;
7805 default:
7806 break;
7809 return -1;
7812 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7813 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7814 array types. The C99 floating-point complex types are also considered
7815 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7816 types, which are GCC extensions and out of the scope of AAPCS64, are
7817 treated as composite types here as well.
7819 Note that MODE itself is not sufficient in determining whether a type
7820 is such a composite type or not. This is because
7821 stor-layout.c:compute_record_mode may have already changed the MODE
7822 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7823 structure with only one field may have its MODE set to the mode of the
7824 field. Also an integer mode whose size matches the size of the
7825 RECORD_TYPE type may be used to substitute the original mode
7826 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7827 solely relied on. */
7829 static bool
7830 aarch64_composite_type_p (const_tree type,
7831 machine_mode mode)
7833 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7834 return true;
7836 if (mode == BLKmode
7837 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7838 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7839 return true;
7841 return false;
7844 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7845 type as described in AAPCS64 \S 4.1.2.
7847 See the comment above aarch64_composite_type_p for the notes on MODE. */
7849 static bool
7850 aarch64_short_vector_p (const_tree type,
7851 machine_mode mode)
7853 HOST_WIDE_INT size = -1;
7855 if (type && TREE_CODE (type) == VECTOR_TYPE)
7856 size = int_size_in_bytes (type);
7857 else if (!aarch64_composite_type_p (type, mode)
7858 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7859 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7860 size = GET_MODE_SIZE (mode);
7862 return (size == 8 || size == 16) ? true : false;
7865 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7866 shall be passed or returned in simd/fp register(s) (providing these
7867 parameter passing registers are available).
7869 Upon successful return, *COUNT returns the number of needed registers,
7870 *BASE_MODE returns the mode of the individual register and when IS_HAF
7871 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7872 floating-point aggregate or a homogeneous short-vector aggregate. */
7874 static bool
7875 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7876 const_tree type,
7877 machine_mode *base_mode,
7878 int *count,
7879 bool *is_ha)
7881 machine_mode new_mode = VOIDmode;
7882 bool composite_p = aarch64_composite_type_p (type, mode);
7884 if (is_ha != NULL) *is_ha = false;
7886 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7887 || aarch64_short_vector_p (type, mode))
7889 *count = 1;
7890 new_mode = mode;
7892 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7894 if (is_ha != NULL) *is_ha = true;
7895 *count = 2;
7896 new_mode = GET_MODE_INNER (mode);
7898 else if (type && composite_p)
7900 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7902 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7904 if (is_ha != NULL) *is_ha = true;
7905 *count = ag_count;
7907 else
7908 return false;
7910 else
7911 return false;
7913 *base_mode = new_mode;
7914 return true;
7917 /* Implement TARGET_STRUCT_VALUE_RTX. */
7919 static rtx
7920 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7921 int incoming ATTRIBUTE_UNUSED)
7923 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7926 /* Implements target hook vector_mode_supported_p. */
7927 static bool
7928 aarch64_vector_mode_supported_p (machine_mode mode)
7930 if (TARGET_SIMD
7931 && (mode == V4SImode || mode == V8HImode
7932 || mode == V16QImode || mode == V2DImode
7933 || mode == V2SImode || mode == V4HImode
7934 || mode == V8QImode || mode == V2SFmode
7935 || mode == V4SFmode || mode == V2DFmode
7936 || mode == V1DFmode))
7937 return true;
7939 return false;
7942 /* Return appropriate SIMD container
7943 for MODE within a vector of WIDTH bits. */
7944 static machine_mode
7945 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7947 gcc_assert (width == 64 || width == 128);
7948 if (TARGET_SIMD)
7950 if (width == 128)
7951 switch (mode)
7953 case DFmode:
7954 return V2DFmode;
7955 case SFmode:
7956 return V4SFmode;
7957 case SImode:
7958 return V4SImode;
7959 case HImode:
7960 return V8HImode;
7961 case QImode:
7962 return V16QImode;
7963 case DImode:
7964 return V2DImode;
7965 default:
7966 break;
7968 else
7969 switch (mode)
7971 case SFmode:
7972 return V2SFmode;
7973 case SImode:
7974 return V2SImode;
7975 case HImode:
7976 return V4HImode;
7977 case QImode:
7978 return V8QImode;
7979 default:
7980 break;
7983 return word_mode;
7986 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7987 static machine_mode
7988 aarch64_preferred_simd_mode (machine_mode mode)
7990 return aarch64_simd_container_mode (mode, 128);
7993 /* Return the bitmask of possible vector sizes for the vectorizer
7994 to iterate over. */
7995 static unsigned int
7996 aarch64_autovectorize_vector_sizes (void)
7998 return (16 | 8);
8001 /* Implement TARGET_MANGLE_TYPE. */
8003 static const char *
8004 aarch64_mangle_type (const_tree type)
8006 /* The AArch64 ABI documents say that "__va_list" has to be
8007 managled as if it is in the "std" namespace. */
8008 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8009 return "St9__va_list";
8011 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8012 builtin types. */
8013 if (TYPE_NAME (type) != NULL)
8014 return aarch64_mangle_builtin_type (type);
8016 /* Use the default mangling. */
8017 return NULL;
8021 /* Return true if the rtx_insn contains a MEM RTX somewhere
8022 in it. */
8024 static bool
8025 has_memory_op (rtx_insn *mem_insn)
8027 subrtx_iterator::array_type array;
8028 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8029 if (MEM_P (*iter))
8030 return true;
8032 return false;
8035 /* Find the first rtx_insn before insn that will generate an assembly
8036 instruction. */
8038 static rtx_insn *
8039 aarch64_prev_real_insn (rtx_insn *insn)
8041 if (!insn)
8042 return NULL;
8046 insn = prev_real_insn (insn);
8048 while (insn && recog_memoized (insn) < 0);
8050 return insn;
8053 static bool
8054 is_madd_op (enum attr_type t1)
8056 unsigned int i;
8057 /* A number of these may be AArch32 only. */
8058 enum attr_type mlatypes[] = {
8059 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8060 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8061 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8064 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8066 if (t1 == mlatypes[i])
8067 return true;
8070 return false;
8073 /* Check if there is a register dependency between a load and the insn
8074 for which we hold recog_data. */
8076 static bool
8077 dep_between_memop_and_curr (rtx memop)
8079 rtx load_reg;
8080 int opno;
8082 gcc_assert (GET_CODE (memop) == SET);
8084 if (!REG_P (SET_DEST (memop)))
8085 return false;
8087 load_reg = SET_DEST (memop);
8088 for (opno = 1; opno < recog_data.n_operands; opno++)
8090 rtx operand = recog_data.operand[opno];
8091 if (REG_P (operand)
8092 && reg_overlap_mentioned_p (load_reg, operand))
8093 return true;
8096 return false;
8100 /* When working around the Cortex-A53 erratum 835769,
8101 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8102 instruction and has a preceding memory instruction such that a NOP
8103 should be inserted between them. */
8105 bool
8106 aarch64_madd_needs_nop (rtx_insn* insn)
8108 enum attr_type attr_type;
8109 rtx_insn *prev;
8110 rtx body;
8112 if (!aarch64_fix_a53_err835769)
8113 return false;
8115 if (recog_memoized (insn) < 0)
8116 return false;
8118 attr_type = get_attr_type (insn);
8119 if (!is_madd_op (attr_type))
8120 return false;
8122 prev = aarch64_prev_real_insn (insn);
8123 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8124 Restore recog state to INSN to avoid state corruption. */
8125 extract_constrain_insn_cached (insn);
8127 if (!prev || !has_memory_op (prev))
8128 return false;
8130 body = single_set (prev);
8132 /* If the previous insn is a memory op and there is no dependency between
8133 it and the DImode madd, emit a NOP between them. If body is NULL then we
8134 have a complex memory operation, probably a load/store pair.
8135 Be conservative for now and emit a NOP. */
8136 if (GET_MODE (recog_data.operand[0]) == DImode
8137 && (!body || !dep_between_memop_and_curr (body)))
8138 return true;
8140 return false;
8145 /* Implement FINAL_PRESCAN_INSN. */
8147 void
8148 aarch64_final_prescan_insn (rtx_insn *insn)
8150 if (aarch64_madd_needs_nop (insn))
8151 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8155 /* Return the equivalent letter for size. */
8156 static char
8157 sizetochar (int size)
8159 switch (size)
8161 case 64: return 'd';
8162 case 32: return 's';
8163 case 16: return 'h';
8164 case 8 : return 'b';
8165 default: gcc_unreachable ();
8169 /* Return true iff x is a uniform vector of floating-point
8170 constants, and the constant can be represented in
8171 quarter-precision form. Note, as aarch64_float_const_representable
8172 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8173 static bool
8174 aarch64_vect_float_const_representable_p (rtx x)
8176 int i = 0;
8177 REAL_VALUE_TYPE r0, ri;
8178 rtx x0, xi;
8180 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8181 return false;
8183 x0 = CONST_VECTOR_ELT (x, 0);
8184 if (!CONST_DOUBLE_P (x0))
8185 return false;
8187 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8189 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8191 xi = CONST_VECTOR_ELT (x, i);
8192 if (!CONST_DOUBLE_P (xi))
8193 return false;
8195 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8196 if (!REAL_VALUES_EQUAL (r0, ri))
8197 return false;
8200 return aarch64_float_const_representable_p (x0);
8203 /* Return true for valid and false for invalid. */
8204 bool
8205 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8206 struct simd_immediate_info *info)
8208 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8209 matches = 1; \
8210 for (i = 0; i < idx; i += (STRIDE)) \
8211 if (!(TEST)) \
8212 matches = 0; \
8213 if (matches) \
8215 immtype = (CLASS); \
8216 elsize = (ELSIZE); \
8217 eshift = (SHIFT); \
8218 emvn = (NEG); \
8219 break; \
8222 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8223 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8224 unsigned char bytes[16];
8225 int immtype = -1, matches;
8226 unsigned int invmask = inverse ? 0xff : 0;
8227 int eshift, emvn;
8229 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8231 if (! (aarch64_simd_imm_zero_p (op, mode)
8232 || aarch64_vect_float_const_representable_p (op)))
8233 return false;
8235 if (info)
8237 info->value = CONST_VECTOR_ELT (op, 0);
8238 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8239 info->mvn = false;
8240 info->shift = 0;
8243 return true;
8246 /* Splat vector constant out into a byte vector. */
8247 for (i = 0; i < n_elts; i++)
8249 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8250 it must be laid out in the vector register in reverse order. */
8251 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8252 unsigned HOST_WIDE_INT elpart;
8253 unsigned int part, parts;
8255 if (CONST_INT_P (el))
8257 elpart = INTVAL (el);
8258 parts = 1;
8260 else if (GET_CODE (el) == CONST_DOUBLE)
8262 elpart = CONST_DOUBLE_LOW (el);
8263 parts = 2;
8265 else
8266 gcc_unreachable ();
8268 for (part = 0; part < parts; part++)
8270 unsigned int byte;
8271 for (byte = 0; byte < innersize; byte++)
8273 bytes[idx++] = (elpart & 0xff) ^ invmask;
8274 elpart >>= BITS_PER_UNIT;
8276 if (GET_CODE (el) == CONST_DOUBLE)
8277 elpart = CONST_DOUBLE_HIGH (el);
8281 /* Sanity check. */
8282 gcc_assert (idx == GET_MODE_SIZE (mode));
8286 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8287 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8289 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8290 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8292 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8293 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8295 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8296 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8298 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8300 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8302 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8303 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8305 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8306 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8308 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8309 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8311 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8312 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8314 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8316 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8318 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8319 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8321 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8322 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8324 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8325 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8327 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8328 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8330 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8332 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8333 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8335 while (0);
8337 if (immtype == -1)
8338 return false;
8340 if (info)
8342 info->element_width = elsize;
8343 info->mvn = emvn != 0;
8344 info->shift = eshift;
8346 unsigned HOST_WIDE_INT imm = 0;
8348 if (immtype >= 12 && immtype <= 15)
8349 info->msl = true;
8351 /* Un-invert bytes of recognized vector, if necessary. */
8352 if (invmask != 0)
8353 for (i = 0; i < idx; i++)
8354 bytes[i] ^= invmask;
8356 if (immtype == 17)
8358 /* FIXME: Broken on 32-bit H_W_I hosts. */
8359 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8361 for (i = 0; i < 8; i++)
8362 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8363 << (i * BITS_PER_UNIT);
8366 info->value = GEN_INT (imm);
8368 else
8370 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8371 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8373 /* Construct 'abcdefgh' because the assembler cannot handle
8374 generic constants. */
8375 if (info->mvn)
8376 imm = ~imm;
8377 imm = (imm >> info->shift) & 0xff;
8378 info->value = GEN_INT (imm);
8382 return true;
8383 #undef CHECK
8386 /* Check of immediate shift constants are within range. */
8387 bool
8388 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8390 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8391 if (left)
8392 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8393 else
8394 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8397 /* Return true if X is a uniform vector where all elements
8398 are either the floating-point constant 0.0 or the
8399 integer constant 0. */
8400 bool
8401 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8403 return x == CONST0_RTX (mode);
8406 bool
8407 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8409 HOST_WIDE_INT imm = INTVAL (x);
8410 int i;
8412 for (i = 0; i < 8; i++)
8414 unsigned int byte = imm & 0xff;
8415 if (byte != 0xff && byte != 0)
8416 return false;
8417 imm >>= 8;
8420 return true;
8423 bool
8424 aarch64_mov_operand_p (rtx x,
8425 enum aarch64_symbol_context context,
8426 machine_mode mode)
8428 if (GET_CODE (x) == HIGH
8429 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8430 return true;
8432 if (CONST_INT_P (x))
8433 return true;
8435 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8436 return true;
8438 return aarch64_classify_symbolic_expression (x, context)
8439 == SYMBOL_TINY_ABSOLUTE;
8442 /* Return a const_int vector of VAL. */
8444 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8446 int nunits = GET_MODE_NUNITS (mode);
8447 rtvec v = rtvec_alloc (nunits);
8448 int i;
8450 for (i=0; i < nunits; i++)
8451 RTVEC_ELT (v, i) = GEN_INT (val);
8453 return gen_rtx_CONST_VECTOR (mode, v);
8456 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8458 bool
8459 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8461 machine_mode vmode;
8463 gcc_assert (!VECTOR_MODE_P (mode));
8464 vmode = aarch64_preferred_simd_mode (mode);
8465 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8466 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8469 /* Construct and return a PARALLEL RTX vector with elements numbering the
8470 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8471 the vector - from the perspective of the architecture. This does not
8472 line up with GCC's perspective on lane numbers, so we end up with
8473 different masks depending on our target endian-ness. The diagram
8474 below may help. We must draw the distinction when building masks
8475 which select one half of the vector. An instruction selecting
8476 architectural low-lanes for a big-endian target, must be described using
8477 a mask selecting GCC high-lanes.
8479 Big-Endian Little-Endian
8481 GCC 0 1 2 3 3 2 1 0
8482 | x | x | x | x | | x | x | x | x |
8483 Architecture 3 2 1 0 3 2 1 0
8485 Low Mask: { 2, 3 } { 0, 1 }
8486 High Mask: { 0, 1 } { 2, 3 }
8490 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8492 int nunits = GET_MODE_NUNITS (mode);
8493 rtvec v = rtvec_alloc (nunits / 2);
8494 int high_base = nunits / 2;
8495 int low_base = 0;
8496 int base;
8497 rtx t1;
8498 int i;
8500 if (BYTES_BIG_ENDIAN)
8501 base = high ? low_base : high_base;
8502 else
8503 base = high ? high_base : low_base;
8505 for (i = 0; i < nunits / 2; i++)
8506 RTVEC_ELT (v, i) = GEN_INT (base + i);
8508 t1 = gen_rtx_PARALLEL (mode, v);
8509 return t1;
8512 /* Check OP for validity as a PARALLEL RTX vector with elements
8513 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8514 from the perspective of the architecture. See the diagram above
8515 aarch64_simd_vect_par_cnst_half for more details. */
8517 bool
8518 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8519 bool high)
8521 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8522 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8523 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8524 int i = 0;
8526 if (!VECTOR_MODE_P (mode))
8527 return false;
8529 if (count_op != count_ideal)
8530 return false;
8532 for (i = 0; i < count_ideal; i++)
8534 rtx elt_op = XVECEXP (op, 0, i);
8535 rtx elt_ideal = XVECEXP (ideal, 0, i);
8537 if (!CONST_INT_P (elt_op)
8538 || INTVAL (elt_ideal) != INTVAL (elt_op))
8539 return false;
8541 return true;
8544 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8545 HIGH (exclusive). */
8546 void
8547 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8548 const_tree exp)
8550 HOST_WIDE_INT lane;
8551 gcc_assert (CONST_INT_P (operand));
8552 lane = INTVAL (operand);
8554 if (lane < low || lane >= high)
8556 if (exp)
8557 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8558 else
8559 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8563 /* Emit code to place a AdvSIMD pair result in memory locations (with equal
8564 registers). */
8565 void
8566 aarch64_simd_emit_pair_result_insn (machine_mode mode,
8567 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8568 rtx op1)
8570 rtx mem = gen_rtx_MEM (mode, destaddr);
8571 rtx tmp1 = gen_reg_rtx (mode);
8572 rtx tmp2 = gen_reg_rtx (mode);
8574 emit_insn (intfn (tmp1, op1, tmp2));
8576 emit_move_insn (mem, tmp1);
8577 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8578 emit_move_insn (mem, tmp2);
8581 /* Return TRUE if OP is a valid vector addressing mode. */
8582 bool
8583 aarch64_simd_mem_operand_p (rtx op)
8585 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8586 || REG_P (XEXP (op, 0)));
8589 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8590 not to early-clobber SRC registers in the process.
8592 We assume that the operands described by SRC and DEST represent a
8593 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
8594 number of components into which the copy has been decomposed. */
8595 void
8596 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8597 rtx *src, unsigned int count)
8599 unsigned int i;
8601 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8602 || REGNO (operands[0]) < REGNO (operands[1]))
8604 for (i = 0; i < count; i++)
8606 operands[2 * i] = dest[i];
8607 operands[2 * i + 1] = src[i];
8610 else
8612 for (i = 0; i < count; i++)
8614 operands[2 * i] = dest[count - i - 1];
8615 operands[2 * i + 1] = src[count - i - 1];
8620 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8621 one of VSTRUCT modes: OI, CI or XI. */
8623 aarch64_simd_attr_length_move (rtx_insn *insn)
8625 machine_mode mode;
8627 extract_insn_cached (insn);
8629 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8631 mode = GET_MODE (recog_data.operand[0]);
8632 switch (mode)
8634 case OImode:
8635 return 8;
8636 case CImode:
8637 return 12;
8638 case XImode:
8639 return 16;
8640 default:
8641 gcc_unreachable ();
8644 return 4;
8647 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8648 alignment of a vector to 128 bits. */
8649 static HOST_WIDE_INT
8650 aarch64_simd_vector_alignment (const_tree type)
8652 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8653 return MIN (align, 128);
8656 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8657 static bool
8658 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8660 if (is_packed)
8661 return false;
8663 /* We guarantee alignment for vectors up to 128-bits. */
8664 if (tree_int_cst_compare (TYPE_SIZE (type),
8665 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8666 return false;
8668 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8669 return true;
8672 /* If VALS is a vector constant that can be loaded into a register
8673 using DUP, generate instructions to do so and return an RTX to
8674 assign to the register. Otherwise return NULL_RTX. */
8675 static rtx
8676 aarch64_simd_dup_constant (rtx vals)
8678 machine_mode mode = GET_MODE (vals);
8679 machine_mode inner_mode = GET_MODE_INNER (mode);
8680 int n_elts = GET_MODE_NUNITS (mode);
8681 bool all_same = true;
8682 rtx x;
8683 int i;
8685 if (GET_CODE (vals) != CONST_VECTOR)
8686 return NULL_RTX;
8688 for (i = 1; i < n_elts; ++i)
8690 x = CONST_VECTOR_ELT (vals, i);
8691 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8692 all_same = false;
8695 if (!all_same)
8696 return NULL_RTX;
8698 /* We can load this constant by using DUP and a constant in a
8699 single ARM register. This will be cheaper than a vector
8700 load. */
8701 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8702 return gen_rtx_VEC_DUPLICATE (mode, x);
8706 /* Generate code to load VALS, which is a PARALLEL containing only
8707 constants (for vec_init) or CONST_VECTOR, efficiently into a
8708 register. Returns an RTX to copy into the register, or NULL_RTX
8709 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8710 static rtx
8711 aarch64_simd_make_constant (rtx vals)
8713 machine_mode mode = GET_MODE (vals);
8714 rtx const_dup;
8715 rtx const_vec = NULL_RTX;
8716 int n_elts = GET_MODE_NUNITS (mode);
8717 int n_const = 0;
8718 int i;
8720 if (GET_CODE (vals) == CONST_VECTOR)
8721 const_vec = vals;
8722 else if (GET_CODE (vals) == PARALLEL)
8724 /* A CONST_VECTOR must contain only CONST_INTs and
8725 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8726 Only store valid constants in a CONST_VECTOR. */
8727 for (i = 0; i < n_elts; ++i)
8729 rtx x = XVECEXP (vals, 0, i);
8730 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8731 n_const++;
8733 if (n_const == n_elts)
8734 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8736 else
8737 gcc_unreachable ();
8739 if (const_vec != NULL_RTX
8740 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8741 /* Load using MOVI/MVNI. */
8742 return const_vec;
8743 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8744 /* Loaded using DUP. */
8745 return const_dup;
8746 else if (const_vec != NULL_RTX)
8747 /* Load from constant pool. We can not take advantage of single-cycle
8748 LD1 because we need a PC-relative addressing mode. */
8749 return const_vec;
8750 else
8751 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8752 We can not construct an initializer. */
8753 return NULL_RTX;
8756 void
8757 aarch64_expand_vector_init (rtx target, rtx vals)
8759 machine_mode mode = GET_MODE (target);
8760 machine_mode inner_mode = GET_MODE_INNER (mode);
8761 int n_elts = GET_MODE_NUNITS (mode);
8762 int n_var = 0, one_var = -1;
8763 bool all_same = true;
8764 rtx x, mem;
8765 int i;
8767 x = XVECEXP (vals, 0, 0);
8768 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8769 n_var = 1, one_var = 0;
8771 for (i = 1; i < n_elts; ++i)
8773 x = XVECEXP (vals, 0, i);
8774 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8775 ++n_var, one_var = i;
8777 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8778 all_same = false;
8781 if (n_var == 0)
8783 rtx constant = aarch64_simd_make_constant (vals);
8784 if (constant != NULL_RTX)
8786 emit_move_insn (target, constant);
8787 return;
8791 /* Splat a single non-constant element if we can. */
8792 if (all_same)
8794 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8795 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8796 return;
8799 /* One field is non-constant. Load constant then overwrite varying
8800 field. This is more efficient than using the stack. */
8801 if (n_var == 1)
8803 rtx copy = copy_rtx (vals);
8804 rtx index = GEN_INT (one_var);
8805 enum insn_code icode;
8807 /* Load constant part of vector, substitute neighboring value for
8808 varying element. */
8809 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8810 aarch64_expand_vector_init (target, copy);
8812 /* Insert variable. */
8813 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8814 icode = optab_handler (vec_set_optab, mode);
8815 gcc_assert (icode != CODE_FOR_nothing);
8816 emit_insn (GEN_FCN (icode) (target, x, index));
8817 return;
8820 /* Construct the vector in memory one field at a time
8821 and load the whole vector. */
8822 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8823 for (i = 0; i < n_elts; i++)
8824 emit_move_insn (adjust_address_nv (mem, inner_mode,
8825 i * GET_MODE_SIZE (inner_mode)),
8826 XVECEXP (vals, 0, i));
8827 emit_move_insn (target, mem);
8831 static unsigned HOST_WIDE_INT
8832 aarch64_shift_truncation_mask (machine_mode mode)
8834 return
8835 (aarch64_vector_mode_supported_p (mode)
8836 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8839 #ifndef TLS_SECTION_ASM_FLAG
8840 #define TLS_SECTION_ASM_FLAG 'T'
8841 #endif
8843 void
8844 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8845 tree decl ATTRIBUTE_UNUSED)
8847 char flagchars[10], *f = flagchars;
8849 /* If we have already declared this section, we can use an
8850 abbreviated form to switch back to it -- unless this section is
8851 part of a COMDAT groups, in which case GAS requires the full
8852 declaration every time. */
8853 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8854 && (flags & SECTION_DECLARED))
8856 fprintf (asm_out_file, "\t.section\t%s\n", name);
8857 return;
8860 if (!(flags & SECTION_DEBUG))
8861 *f++ = 'a';
8862 if (flags & SECTION_WRITE)
8863 *f++ = 'w';
8864 if (flags & SECTION_CODE)
8865 *f++ = 'x';
8866 if (flags & SECTION_SMALL)
8867 *f++ = 's';
8868 if (flags & SECTION_MERGE)
8869 *f++ = 'M';
8870 if (flags & SECTION_STRINGS)
8871 *f++ = 'S';
8872 if (flags & SECTION_TLS)
8873 *f++ = TLS_SECTION_ASM_FLAG;
8874 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8875 *f++ = 'G';
8876 *f = '\0';
8878 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8880 if (!(flags & SECTION_NOTYPE))
8882 const char *type;
8883 const char *format;
8885 if (flags & SECTION_BSS)
8886 type = "nobits";
8887 else
8888 type = "progbits";
8890 #ifdef TYPE_OPERAND_FMT
8891 format = "," TYPE_OPERAND_FMT;
8892 #else
8893 format = ",@%s";
8894 #endif
8896 fprintf (asm_out_file, format, type);
8898 if (flags & SECTION_ENTSIZE)
8899 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8900 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8902 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8903 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8904 else
8905 fprintf (asm_out_file, ",%s,comdat",
8906 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8910 putc ('\n', asm_out_file);
8913 /* Select a format to encode pointers in exception handling data. */
8915 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8917 int type;
8918 switch (aarch64_cmodel)
8920 case AARCH64_CMODEL_TINY:
8921 case AARCH64_CMODEL_TINY_PIC:
8922 case AARCH64_CMODEL_SMALL:
8923 case AARCH64_CMODEL_SMALL_PIC:
8924 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8925 for everything. */
8926 type = DW_EH_PE_sdata4;
8927 break;
8928 default:
8929 /* No assumptions here. 8-byte relocs required. */
8930 type = DW_EH_PE_sdata8;
8931 break;
8933 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8936 /* Emit load exclusive. */
8938 static void
8939 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8940 rtx mem, rtx model_rtx)
8942 rtx (*gen) (rtx, rtx, rtx);
8944 switch (mode)
8946 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8947 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8948 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8949 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8950 default:
8951 gcc_unreachable ();
8954 emit_insn (gen (rval, mem, model_rtx));
8957 /* Emit store exclusive. */
8959 static void
8960 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
8961 rtx rval, rtx mem, rtx model_rtx)
8963 rtx (*gen) (rtx, rtx, rtx, rtx);
8965 switch (mode)
8967 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8968 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8969 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8970 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8971 default:
8972 gcc_unreachable ();
8975 emit_insn (gen (bval, rval, mem, model_rtx));
8978 /* Mark the previous jump instruction as unlikely. */
8980 static void
8981 aarch64_emit_unlikely_jump (rtx insn)
8983 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8985 insn = emit_jump_insn (insn);
8986 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8989 /* Expand a compare and swap pattern. */
8991 void
8992 aarch64_expand_compare_and_swap (rtx operands[])
8994 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8995 machine_mode mode, cmp_mode;
8996 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8998 bval = operands[0];
8999 rval = operands[1];
9000 mem = operands[2];
9001 oldval = operands[3];
9002 newval = operands[4];
9003 is_weak = operands[5];
9004 mod_s = operands[6];
9005 mod_f = operands[7];
9006 mode = GET_MODE (mem);
9007 cmp_mode = mode;
9009 /* Normally the succ memory model must be stronger than fail, but in the
9010 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9011 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9013 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9014 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9015 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9017 switch (mode)
9019 case QImode:
9020 case HImode:
9021 /* For short modes, we're going to perform the comparison in SImode,
9022 so do the zero-extension now. */
9023 cmp_mode = SImode;
9024 rval = gen_reg_rtx (SImode);
9025 oldval = convert_modes (SImode, mode, oldval, true);
9026 /* Fall through. */
9028 case SImode:
9029 case DImode:
9030 /* Force the value into a register if needed. */
9031 if (!aarch64_plus_operand (oldval, mode))
9032 oldval = force_reg (cmp_mode, oldval);
9033 break;
9035 default:
9036 gcc_unreachable ();
9039 switch (mode)
9041 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9042 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9043 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9044 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9045 default:
9046 gcc_unreachable ();
9049 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9051 if (mode == QImode || mode == HImode)
9052 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9054 x = gen_rtx_REG (CCmode, CC_REGNUM);
9055 x = gen_rtx_EQ (SImode, x, const0_rtx);
9056 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9059 /* Split a compare and swap pattern. */
9061 void
9062 aarch64_split_compare_and_swap (rtx operands[])
9064 rtx rval, mem, oldval, newval, scratch;
9065 machine_mode mode;
9066 bool is_weak;
9067 rtx_code_label *label1, *label2;
9068 rtx x, cond;
9070 rval = operands[0];
9071 mem = operands[1];
9072 oldval = operands[2];
9073 newval = operands[3];
9074 is_weak = (operands[4] != const0_rtx);
9075 scratch = operands[7];
9076 mode = GET_MODE (mem);
9078 label1 = NULL;
9079 if (!is_weak)
9081 label1 = gen_label_rtx ();
9082 emit_label (label1);
9084 label2 = gen_label_rtx ();
9086 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9088 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9089 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9090 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9091 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9092 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9094 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9096 if (!is_weak)
9098 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9099 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9100 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9101 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9103 else
9105 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9106 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9107 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9110 emit_label (label2);
9113 /* Split an atomic operation. */
9115 void
9116 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9117 rtx value, rtx model_rtx, rtx cond)
9119 machine_mode mode = GET_MODE (mem);
9120 machine_mode wmode = (mode == DImode ? DImode : SImode);
9121 rtx_code_label *label;
9122 rtx x;
9124 label = gen_label_rtx ();
9125 emit_label (label);
9127 if (new_out)
9128 new_out = gen_lowpart (wmode, new_out);
9129 if (old_out)
9130 old_out = gen_lowpart (wmode, old_out);
9131 else
9132 old_out = new_out;
9133 value = simplify_gen_subreg (wmode, value, mode, 0);
9135 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9137 switch (code)
9139 case SET:
9140 new_out = value;
9141 break;
9143 case NOT:
9144 x = gen_rtx_AND (wmode, old_out, value);
9145 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9146 x = gen_rtx_NOT (wmode, new_out);
9147 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9148 break;
9150 case MINUS:
9151 if (CONST_INT_P (value))
9153 value = GEN_INT (-INTVAL (value));
9154 code = PLUS;
9156 /* Fall through. */
9158 default:
9159 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9160 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9161 break;
9164 aarch64_emit_store_exclusive (mode, cond, mem,
9165 gen_lowpart (mode, new_out), model_rtx);
9167 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9168 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9169 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9170 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9173 static void
9174 aarch64_print_extension (void)
9176 const struct aarch64_option_extension *opt = NULL;
9178 for (opt = all_extensions; opt->name != NULL; opt++)
9179 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9180 asm_fprintf (asm_out_file, "+%s", opt->name);
9182 asm_fprintf (asm_out_file, "\n");
9185 static void
9186 aarch64_start_file (void)
9188 if (selected_arch)
9190 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9191 aarch64_print_extension ();
9193 else if (selected_cpu)
9195 const char *truncated_name
9196 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9197 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9198 aarch64_print_extension ();
9200 default_file_start();
9203 /* Target hook for c_mode_for_suffix. */
9204 static machine_mode
9205 aarch64_c_mode_for_suffix (char suffix)
9207 if (suffix == 'q')
9208 return TFmode;
9210 return VOIDmode;
9213 /* We can only represent floating point constants which will fit in
9214 "quarter-precision" values. These values are characterised by
9215 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
9218 (-1)^s * (n/16) * 2^r
9220 Where:
9221 's' is the sign bit.
9222 'n' is an integer in the range 16 <= n <= 31.
9223 'r' is an integer in the range -3 <= r <= 4. */
9225 /* Return true iff X can be represented by a quarter-precision
9226 floating point immediate operand X. Note, we cannot represent 0.0. */
9227 bool
9228 aarch64_float_const_representable_p (rtx x)
9230 /* This represents our current view of how many bits
9231 make up the mantissa. */
9232 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9233 int exponent;
9234 unsigned HOST_WIDE_INT mantissa, mask;
9235 REAL_VALUE_TYPE r, m;
9236 bool fail;
9238 if (!CONST_DOUBLE_P (x))
9239 return false;
9241 if (GET_MODE (x) == VOIDmode)
9242 return false;
9244 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9246 /* We cannot represent infinities, NaNs or +/-zero. We won't
9247 know if we have +zero until we analyse the mantissa, but we
9248 can reject the other invalid values. */
9249 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9250 || REAL_VALUE_MINUS_ZERO (r))
9251 return false;
9253 /* Extract exponent. */
9254 r = real_value_abs (&r);
9255 exponent = REAL_EXP (&r);
9257 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9258 highest (sign) bit, with a fixed binary point at bit point_pos.
9259 m1 holds the low part of the mantissa, m2 the high part.
9260 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9261 bits for the mantissa, this can fail (low bits will be lost). */
9262 real_ldexp (&m, &r, point_pos - exponent);
9263 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9265 /* If the low part of the mantissa has bits set we cannot represent
9266 the value. */
9267 if (w.elt (0) != 0)
9268 return false;
9269 /* We have rejected the lower HOST_WIDE_INT, so update our
9270 understanding of how many bits lie in the mantissa and
9271 look only at the high HOST_WIDE_INT. */
9272 mantissa = w.elt (1);
9273 point_pos -= HOST_BITS_PER_WIDE_INT;
9275 /* We can only represent values with a mantissa of the form 1.xxxx. */
9276 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9277 if ((mantissa & mask) != 0)
9278 return false;
9280 /* Having filtered unrepresentable values, we may now remove all
9281 but the highest 5 bits. */
9282 mantissa >>= point_pos - 5;
9284 /* We cannot represent the value 0.0, so reject it. This is handled
9285 elsewhere. */
9286 if (mantissa == 0)
9287 return false;
9289 /* Then, as bit 4 is always set, we can mask it off, leaving
9290 the mantissa in the range [0, 15]. */
9291 mantissa &= ~(1 << 4);
9292 gcc_assert (mantissa <= 15);
9294 /* GCC internally does not use IEEE754-like encoding (where normalized
9295 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9296 Our mantissa values are shifted 4 places to the left relative to
9297 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9298 by 5 places to correct for GCC's representation. */
9299 exponent = 5 - exponent;
9301 return (exponent >= 0 && exponent <= 7);
9304 char*
9305 aarch64_output_simd_mov_immediate (rtx const_vector,
9306 machine_mode mode,
9307 unsigned width)
9309 bool is_valid;
9310 static char templ[40];
9311 const char *mnemonic;
9312 const char *shift_op;
9313 unsigned int lane_count = 0;
9314 char element_char;
9316 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9318 /* This will return true to show const_vector is legal for use as either
9319 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9320 also update INFO to show how the immediate should be generated. */
9321 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9322 gcc_assert (is_valid);
9324 element_char = sizetochar (info.element_width);
9325 lane_count = width / info.element_width;
9327 mode = GET_MODE_INNER (mode);
9328 if (mode == SFmode || mode == DFmode)
9330 gcc_assert (info.shift == 0 && ! info.mvn);
9331 if (aarch64_float_const_zero_rtx_p (info.value))
9332 info.value = GEN_INT (0);
9333 else
9335 #define buf_size 20
9336 REAL_VALUE_TYPE r;
9337 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9338 char float_buf[buf_size] = {'\0'};
9339 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9340 #undef buf_size
9342 if (lane_count == 1)
9343 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9344 else
9345 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9346 lane_count, element_char, float_buf);
9347 return templ;
9351 mnemonic = info.mvn ? "mvni" : "movi";
9352 shift_op = info.msl ? "msl" : "lsl";
9354 if (lane_count == 1)
9355 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9356 mnemonic, UINTVAL (info.value));
9357 else if (info.shift)
9358 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9359 ", %s %d", mnemonic, lane_count, element_char,
9360 UINTVAL (info.value), shift_op, info.shift);
9361 else
9362 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9363 mnemonic, lane_count, element_char, UINTVAL (info.value));
9364 return templ;
9367 char*
9368 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9369 machine_mode mode)
9371 machine_mode vmode;
9373 gcc_assert (!VECTOR_MODE_P (mode));
9374 vmode = aarch64_simd_container_mode (mode, 64);
9375 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9376 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9379 /* Split operands into moves from op[1] + op[2] into op[0]. */
9381 void
9382 aarch64_split_combinev16qi (rtx operands[3])
9384 unsigned int dest = REGNO (operands[0]);
9385 unsigned int src1 = REGNO (operands[1]);
9386 unsigned int src2 = REGNO (operands[2]);
9387 machine_mode halfmode = GET_MODE (operands[1]);
9388 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9389 rtx destlo, desthi;
9391 gcc_assert (halfmode == V16QImode);
9393 if (src1 == dest && src2 == dest + halfregs)
9395 /* No-op move. Can't split to nothing; emit something. */
9396 emit_note (NOTE_INSN_DELETED);
9397 return;
9400 /* Preserve register attributes for variable tracking. */
9401 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9402 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9403 GET_MODE_SIZE (halfmode));
9405 /* Special case of reversed high/low parts. */
9406 if (reg_overlap_mentioned_p (operands[2], destlo)
9407 && reg_overlap_mentioned_p (operands[1], desthi))
9409 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9410 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9411 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9413 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9415 /* Try to avoid unnecessary moves if part of the result
9416 is in the right place already. */
9417 if (src1 != dest)
9418 emit_move_insn (destlo, operands[1]);
9419 if (src2 != dest + halfregs)
9420 emit_move_insn (desthi, operands[2]);
9422 else
9424 if (src2 != dest + halfregs)
9425 emit_move_insn (desthi, operands[2]);
9426 if (src1 != dest)
9427 emit_move_insn (destlo, operands[1]);
9431 /* vec_perm support. */
9433 #define MAX_VECT_LEN 16
9435 struct expand_vec_perm_d
9437 rtx target, op0, op1;
9438 unsigned char perm[MAX_VECT_LEN];
9439 machine_mode vmode;
9440 unsigned char nelt;
9441 bool one_vector_p;
9442 bool testing_p;
9445 /* Generate a variable permutation. */
9447 static void
9448 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9450 machine_mode vmode = GET_MODE (target);
9451 bool one_vector_p = rtx_equal_p (op0, op1);
9453 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9454 gcc_checking_assert (GET_MODE (op0) == vmode);
9455 gcc_checking_assert (GET_MODE (op1) == vmode);
9456 gcc_checking_assert (GET_MODE (sel) == vmode);
9457 gcc_checking_assert (TARGET_SIMD);
9459 if (one_vector_p)
9461 if (vmode == V8QImode)
9463 /* Expand the argument to a V16QI mode by duplicating it. */
9464 rtx pair = gen_reg_rtx (V16QImode);
9465 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9466 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9468 else
9470 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9473 else
9475 rtx pair;
9477 if (vmode == V8QImode)
9479 pair = gen_reg_rtx (V16QImode);
9480 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9481 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9483 else
9485 pair = gen_reg_rtx (OImode);
9486 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9487 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9492 void
9493 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9495 machine_mode vmode = GET_MODE (target);
9496 unsigned int nelt = GET_MODE_NUNITS (vmode);
9497 bool one_vector_p = rtx_equal_p (op0, op1);
9498 rtx mask;
9500 /* The TBL instruction does not use a modulo index, so we must take care
9501 of that ourselves. */
9502 mask = aarch64_simd_gen_const_vector_dup (vmode,
9503 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9504 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9506 /* For big-endian, we also need to reverse the index within the vector
9507 (but not which vector). */
9508 if (BYTES_BIG_ENDIAN)
9510 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9511 if (!one_vector_p)
9512 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9513 sel = expand_simple_binop (vmode, XOR, sel, mask,
9514 NULL, 0, OPTAB_LIB_WIDEN);
9516 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9519 /* Recognize patterns suitable for the TRN instructions. */
9520 static bool
9521 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9523 unsigned int i, odd, mask, nelt = d->nelt;
9524 rtx out, in0, in1, x;
9525 rtx (*gen) (rtx, rtx, rtx);
9526 machine_mode vmode = d->vmode;
9528 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9529 return false;
9531 /* Note that these are little-endian tests.
9532 We correct for big-endian later. */
9533 if (d->perm[0] == 0)
9534 odd = 0;
9535 else if (d->perm[0] == 1)
9536 odd = 1;
9537 else
9538 return false;
9539 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9541 for (i = 0; i < nelt; i += 2)
9543 if (d->perm[i] != i + odd)
9544 return false;
9545 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9546 return false;
9549 /* Success! */
9550 if (d->testing_p)
9551 return true;
9553 in0 = d->op0;
9554 in1 = d->op1;
9555 if (BYTES_BIG_ENDIAN)
9557 x = in0, in0 = in1, in1 = x;
9558 odd = !odd;
9560 out = d->target;
9562 if (odd)
9564 switch (vmode)
9566 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9567 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9568 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9569 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9570 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9571 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9572 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9573 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9574 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9575 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9576 default:
9577 return false;
9580 else
9582 switch (vmode)
9584 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9585 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9586 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9587 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9588 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9589 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9590 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9591 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9592 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9593 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9594 default:
9595 return false;
9599 emit_insn (gen (out, in0, in1));
9600 return true;
9603 /* Recognize patterns suitable for the UZP instructions. */
9604 static bool
9605 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9607 unsigned int i, odd, mask, nelt = d->nelt;
9608 rtx out, in0, in1, x;
9609 rtx (*gen) (rtx, rtx, rtx);
9610 machine_mode vmode = d->vmode;
9612 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9613 return false;
9615 /* Note that these are little-endian tests.
9616 We correct for big-endian later. */
9617 if (d->perm[0] == 0)
9618 odd = 0;
9619 else if (d->perm[0] == 1)
9620 odd = 1;
9621 else
9622 return false;
9623 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9625 for (i = 0; i < nelt; i++)
9627 unsigned elt = (i * 2 + odd) & mask;
9628 if (d->perm[i] != elt)
9629 return false;
9632 /* Success! */
9633 if (d->testing_p)
9634 return true;
9636 in0 = d->op0;
9637 in1 = d->op1;
9638 if (BYTES_BIG_ENDIAN)
9640 x = in0, in0 = in1, in1 = x;
9641 odd = !odd;
9643 out = d->target;
9645 if (odd)
9647 switch (vmode)
9649 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9650 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9651 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9652 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9653 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9654 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9655 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9656 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9657 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9658 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9659 default:
9660 return false;
9663 else
9665 switch (vmode)
9667 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9668 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9669 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9670 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9671 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9672 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9673 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9674 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9675 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9676 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9677 default:
9678 return false;
9682 emit_insn (gen (out, in0, in1));
9683 return true;
9686 /* Recognize patterns suitable for the ZIP instructions. */
9687 static bool
9688 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9690 unsigned int i, high, mask, nelt = d->nelt;
9691 rtx out, in0, in1, x;
9692 rtx (*gen) (rtx, rtx, rtx);
9693 machine_mode vmode = d->vmode;
9695 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9696 return false;
9698 /* Note that these are little-endian tests.
9699 We correct for big-endian later. */
9700 high = nelt / 2;
9701 if (d->perm[0] == high)
9702 /* Do Nothing. */
9704 else if (d->perm[0] == 0)
9705 high = 0;
9706 else
9707 return false;
9708 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9710 for (i = 0; i < nelt / 2; i++)
9712 unsigned elt = (i + high) & mask;
9713 if (d->perm[i * 2] != elt)
9714 return false;
9715 elt = (elt + nelt) & mask;
9716 if (d->perm[i * 2 + 1] != elt)
9717 return false;
9720 /* Success! */
9721 if (d->testing_p)
9722 return true;
9724 in0 = d->op0;
9725 in1 = d->op1;
9726 if (BYTES_BIG_ENDIAN)
9728 x = in0, in0 = in1, in1 = x;
9729 high = !high;
9731 out = d->target;
9733 if (high)
9735 switch (vmode)
9737 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9738 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9739 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9740 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9741 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9742 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9743 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9744 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9745 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9746 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9747 default:
9748 return false;
9751 else
9753 switch (vmode)
9755 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9756 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9757 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9758 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9759 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9760 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9761 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9762 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9763 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9764 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9765 default:
9766 return false;
9770 emit_insn (gen (out, in0, in1));
9771 return true;
9774 /* Recognize patterns for the EXT insn. */
9776 static bool
9777 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9779 unsigned int i, nelt = d->nelt;
9780 rtx (*gen) (rtx, rtx, rtx, rtx);
9781 rtx offset;
9783 unsigned int location = d->perm[0]; /* Always < nelt. */
9785 /* Check if the extracted indices are increasing by one. */
9786 for (i = 1; i < nelt; i++)
9788 unsigned int required = location + i;
9789 if (d->one_vector_p)
9791 /* We'll pass the same vector in twice, so allow indices to wrap. */
9792 required &= (nelt - 1);
9794 if (d->perm[i] != required)
9795 return false;
9798 switch (d->vmode)
9800 case V16QImode: gen = gen_aarch64_extv16qi; break;
9801 case V8QImode: gen = gen_aarch64_extv8qi; break;
9802 case V4HImode: gen = gen_aarch64_extv4hi; break;
9803 case V8HImode: gen = gen_aarch64_extv8hi; break;
9804 case V2SImode: gen = gen_aarch64_extv2si; break;
9805 case V4SImode: gen = gen_aarch64_extv4si; break;
9806 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9807 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9808 case V2DImode: gen = gen_aarch64_extv2di; break;
9809 case V2DFmode: gen = gen_aarch64_extv2df; break;
9810 default:
9811 return false;
9814 /* Success! */
9815 if (d->testing_p)
9816 return true;
9818 /* The case where (location == 0) is a no-op for both big- and little-endian,
9819 and is removed by the mid-end at optimization levels -O1 and higher. */
9821 if (BYTES_BIG_ENDIAN && (location != 0))
9823 /* After setup, we want the high elements of the first vector (stored
9824 at the LSB end of the register), and the low elements of the second
9825 vector (stored at the MSB end of the register). So swap. */
9826 rtx temp = d->op0;
9827 d->op0 = d->op1;
9828 d->op1 = temp;
9829 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9830 location = nelt - location;
9833 offset = GEN_INT (location);
9834 emit_insn (gen (d->target, d->op0, d->op1, offset));
9835 return true;
9838 /* Recognize patterns for the REV insns. */
9840 static bool
9841 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9843 unsigned int i, j, diff, nelt = d->nelt;
9844 rtx (*gen) (rtx, rtx);
9846 if (!d->one_vector_p)
9847 return false;
9849 diff = d->perm[0];
9850 switch (diff)
9852 case 7:
9853 switch (d->vmode)
9855 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9856 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9857 default:
9858 return false;
9860 break;
9861 case 3:
9862 switch (d->vmode)
9864 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9865 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9866 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9867 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9868 default:
9869 return false;
9871 break;
9872 case 1:
9873 switch (d->vmode)
9875 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9876 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9877 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9878 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9879 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9880 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9881 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9882 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9883 default:
9884 return false;
9886 break;
9887 default:
9888 return false;
9891 for (i = 0; i < nelt ; i += diff + 1)
9892 for (j = 0; j <= diff; j += 1)
9894 /* This is guaranteed to be true as the value of diff
9895 is 7, 3, 1 and we should have enough elements in the
9896 queue to generate this. Getting a vector mask with a
9897 value of diff other than these values implies that
9898 something is wrong by the time we get here. */
9899 gcc_assert (i + j < nelt);
9900 if (d->perm[i + j] != i + diff - j)
9901 return false;
9904 /* Success! */
9905 if (d->testing_p)
9906 return true;
9908 emit_insn (gen (d->target, d->op0));
9909 return true;
9912 static bool
9913 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9915 rtx (*gen) (rtx, rtx, rtx);
9916 rtx out = d->target;
9917 rtx in0;
9918 machine_mode vmode = d->vmode;
9919 unsigned int i, elt, nelt = d->nelt;
9920 rtx lane;
9922 elt = d->perm[0];
9923 for (i = 1; i < nelt; i++)
9925 if (elt != d->perm[i])
9926 return false;
9929 /* The generic preparation in aarch64_expand_vec_perm_const_1
9930 swaps the operand order and the permute indices if it finds
9931 d->perm[0] to be in the second operand. Thus, we can always
9932 use d->op0 and need not do any extra arithmetic to get the
9933 correct lane number. */
9934 in0 = d->op0;
9935 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9937 switch (vmode)
9939 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9940 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9941 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9942 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9943 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9944 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9945 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9946 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9947 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9948 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9949 default:
9950 return false;
9953 emit_insn (gen (out, in0, lane));
9954 return true;
9957 static bool
9958 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9960 rtx rperm[MAX_VECT_LEN], sel;
9961 machine_mode vmode = d->vmode;
9962 unsigned int i, nelt = d->nelt;
9964 if (d->testing_p)
9965 return true;
9967 /* Generic code will try constant permutation twice. Once with the
9968 original mode and again with the elements lowered to QImode.
9969 So wait and don't do the selector expansion ourselves. */
9970 if (vmode != V8QImode && vmode != V16QImode)
9971 return false;
9973 for (i = 0; i < nelt; ++i)
9975 int nunits = GET_MODE_NUNITS (vmode);
9977 /* If big-endian and two vectors we end up with a weird mixed-endian
9978 mode on NEON. Reverse the index within each word but not the word
9979 itself. */
9980 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9981 : d->perm[i]);
9983 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9984 sel = force_reg (vmode, sel);
9986 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9987 return true;
9990 static bool
9991 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9993 /* The pattern matching functions above are written to look for a small
9994 number to begin the sequence (0, 1, N/2). If we begin with an index
9995 from the second operand, we can swap the operands. */
9996 if (d->perm[0] >= d->nelt)
9998 unsigned i, nelt = d->nelt;
9999 rtx x;
10001 gcc_assert (nelt == (nelt & -nelt));
10002 for (i = 0; i < nelt; ++i)
10003 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10005 x = d->op0;
10006 d->op0 = d->op1;
10007 d->op1 = x;
10010 if (TARGET_SIMD)
10012 if (aarch64_evpc_rev (d))
10013 return true;
10014 else if (aarch64_evpc_ext (d))
10015 return true;
10016 else if (aarch64_evpc_dup (d))
10017 return true;
10018 else if (aarch64_evpc_zip (d))
10019 return true;
10020 else if (aarch64_evpc_uzp (d))
10021 return true;
10022 else if (aarch64_evpc_trn (d))
10023 return true;
10024 return aarch64_evpc_tbl (d);
10026 return false;
10029 /* Expand a vec_perm_const pattern. */
10031 bool
10032 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10034 struct expand_vec_perm_d d;
10035 int i, nelt, which;
10037 d.target = target;
10038 d.op0 = op0;
10039 d.op1 = op1;
10041 d.vmode = GET_MODE (target);
10042 gcc_assert (VECTOR_MODE_P (d.vmode));
10043 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10044 d.testing_p = false;
10046 for (i = which = 0; i < nelt; ++i)
10048 rtx e = XVECEXP (sel, 0, i);
10049 int ei = INTVAL (e) & (2 * nelt - 1);
10050 which |= (ei < nelt ? 1 : 2);
10051 d.perm[i] = ei;
10054 switch (which)
10056 default:
10057 gcc_unreachable ();
10059 case 3:
10060 d.one_vector_p = false;
10061 if (!rtx_equal_p (op0, op1))
10062 break;
10064 /* The elements of PERM do not suggest that only the first operand
10065 is used, but both operands are identical. Allow easier matching
10066 of the permutation by folding the permutation into the single
10067 input vector. */
10068 /* Fall Through. */
10069 case 2:
10070 for (i = 0; i < nelt; ++i)
10071 d.perm[i] &= nelt - 1;
10072 d.op0 = op1;
10073 d.one_vector_p = true;
10074 break;
10076 case 1:
10077 d.op1 = op0;
10078 d.one_vector_p = true;
10079 break;
10082 return aarch64_expand_vec_perm_const_1 (&d);
10085 static bool
10086 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10087 const unsigned char *sel)
10089 struct expand_vec_perm_d d;
10090 unsigned int i, nelt, which;
10091 bool ret;
10093 d.vmode = vmode;
10094 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10095 d.testing_p = true;
10096 memcpy (d.perm, sel, nelt);
10098 /* Calculate whether all elements are in one vector. */
10099 for (i = which = 0; i < nelt; ++i)
10101 unsigned char e = d.perm[i];
10102 gcc_assert (e < 2 * nelt);
10103 which |= (e < nelt ? 1 : 2);
10106 /* If all elements are from the second vector, reindex as if from the
10107 first vector. */
10108 if (which == 2)
10109 for (i = 0; i < nelt; ++i)
10110 d.perm[i] -= nelt;
10112 /* Check whether the mask can be applied to a single vector. */
10113 d.one_vector_p = (which != 3);
10115 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10116 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10117 if (!d.one_vector_p)
10118 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10120 start_sequence ();
10121 ret = aarch64_expand_vec_perm_const_1 (&d);
10122 end_sequence ();
10124 return ret;
10127 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
10128 bool
10129 aarch64_cannot_change_mode_class (machine_mode from,
10130 machine_mode to,
10131 enum reg_class rclass)
10133 /* Full-reg subregs are allowed on general regs or any class if they are
10134 the same size. */
10135 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
10136 || !reg_classes_intersect_p (FP_REGS, rclass))
10137 return false;
10139 /* Limited combinations of subregs are safe on FPREGs. Particularly,
10140 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
10141 2. Scalar to Scalar for integer modes or same size float modes.
10142 3. Vector to Vector modes.
10143 4. On little-endian only, Vector-Structure to Vector modes. */
10144 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
10146 if (aarch64_vector_mode_supported_p (from)
10147 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
10148 return false;
10150 if (GET_MODE_NUNITS (from) == 1
10151 && GET_MODE_NUNITS (to) == 1
10152 && (GET_MODE_CLASS (from) == MODE_INT
10153 || from == to))
10154 return false;
10156 if (aarch64_vector_mode_supported_p (from)
10157 && aarch64_vector_mode_supported_p (to))
10158 return false;
10160 /* Within an vector structure straddling multiple vector registers
10161 we are in a mixed-endian representation. As such, we can't
10162 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
10163 switch between vectors and vector structures cheaply. */
10164 if (!BYTES_BIG_ENDIAN)
10165 if ((aarch64_vector_mode_supported_p (from)
10166 && aarch64_vect_struct_mode_p (to))
10167 || (aarch64_vector_mode_supported_p (to)
10168 && aarch64_vect_struct_mode_p (from)))
10169 return false;
10172 return true;
10175 /* Implement MODES_TIEABLE_P. */
10177 bool
10178 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10180 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10181 return true;
10183 /* We specifically want to allow elements of "structure" modes to
10184 be tieable to the structure. This more general condition allows
10185 other rarer situations too. */
10186 if (TARGET_SIMD
10187 && aarch64_vector_mode_p (mode1)
10188 && aarch64_vector_mode_p (mode2))
10189 return true;
10191 return false;
10194 /* Return a new RTX holding the result of moving POINTER forward by
10195 AMOUNT bytes. */
10197 static rtx
10198 aarch64_move_pointer (rtx pointer, int amount)
10200 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10202 return adjust_automodify_address (pointer, GET_MODE (pointer),
10203 next, amount);
10206 /* Return a new RTX holding the result of moving POINTER forward by the
10207 size of the mode it points to. */
10209 static rtx
10210 aarch64_progress_pointer (rtx pointer)
10212 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10214 return aarch64_move_pointer (pointer, amount);
10217 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10218 MODE bytes. */
10220 static void
10221 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10222 machine_mode mode)
10224 rtx reg = gen_reg_rtx (mode);
10226 /* "Cast" the pointers to the correct mode. */
10227 *src = adjust_address (*src, mode, 0);
10228 *dst = adjust_address (*dst, mode, 0);
10229 /* Emit the memcpy. */
10230 emit_move_insn (reg, *src);
10231 emit_move_insn (*dst, reg);
10232 /* Move the pointers forward. */
10233 *src = aarch64_progress_pointer (*src);
10234 *dst = aarch64_progress_pointer (*dst);
10237 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10238 we succeed, otherwise return false. */
10240 bool
10241 aarch64_expand_movmem (rtx *operands)
10243 unsigned int n;
10244 rtx dst = operands[0];
10245 rtx src = operands[1];
10246 rtx base;
10247 bool speed_p = !optimize_function_for_size_p (cfun);
10249 /* When optimizing for size, give a better estimate of the length of a
10250 memcpy call, but use the default otherwise. */
10251 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10253 /* We can't do anything smart if the amount to copy is not constant. */
10254 if (!CONST_INT_P (operands[2]))
10255 return false;
10257 n = UINTVAL (operands[2]);
10259 /* Try to keep the number of instructions low. For cases below 16 bytes we
10260 need to make at most two moves. For cases above 16 bytes it will be one
10261 move for each 16 byte chunk, then at most two additional moves. */
10262 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10263 return false;
10265 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10266 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10268 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10269 src = adjust_automodify_address (src, VOIDmode, base, 0);
10271 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10272 1-byte chunk. */
10273 if (n < 4)
10275 if (n >= 2)
10277 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10278 n -= 2;
10281 if (n == 1)
10282 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10284 return true;
10287 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10288 4-byte chunk, partially overlapping with the previously copied chunk. */
10289 if (n < 8)
10291 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10292 n -= 4;
10293 if (n > 0)
10295 int move = n - 4;
10297 src = aarch64_move_pointer (src, move);
10298 dst = aarch64_move_pointer (dst, move);
10299 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10301 return true;
10304 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10305 them, then (if applicable) an 8-byte chunk. */
10306 while (n >= 8)
10308 if (n / 16)
10310 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10311 n -= 16;
10313 else
10315 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10316 n -= 8;
10320 /* Finish the final bytes of the copy. We can always do this in one
10321 instruction. We either copy the exact amount we need, or partially
10322 overlap with the previous chunk we copied and copy 8-bytes. */
10323 if (n == 0)
10324 return true;
10325 else if (n == 1)
10326 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10327 else if (n == 2)
10328 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10329 else if (n == 4)
10330 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10331 else
10333 if (n == 3)
10335 src = aarch64_move_pointer (src, -1);
10336 dst = aarch64_move_pointer (dst, -1);
10337 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10339 else
10341 int move = n - 8;
10343 src = aarch64_move_pointer (src, move);
10344 dst = aarch64_move_pointer (dst, move);
10345 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10349 return true;
10352 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10354 static unsigned HOST_WIDE_INT
10355 aarch64_asan_shadow_offset (void)
10357 return (HOST_WIDE_INT_1 << 36);
10360 static bool
10361 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10362 unsigned int align,
10363 enum by_pieces_operation op,
10364 bool speed_p)
10366 /* STORE_BY_PIECES can be used when copying a constant string, but
10367 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10368 For now we always fail this and let the move_by_pieces code copy
10369 the string from read-only memory. */
10370 if (op == STORE_BY_PIECES)
10371 return false;
10373 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10376 static enum machine_mode
10377 aarch64_code_to_ccmode (enum rtx_code code)
10379 switch (code)
10381 case NE:
10382 return CC_DNEmode;
10384 case EQ:
10385 return CC_DEQmode;
10387 case LE:
10388 return CC_DLEmode;
10390 case LT:
10391 return CC_DLTmode;
10393 case GE:
10394 return CC_DGEmode;
10396 case GT:
10397 return CC_DGTmode;
10399 case LEU:
10400 return CC_DLEUmode;
10402 case LTU:
10403 return CC_DLTUmode;
10405 case GEU:
10406 return CC_DGEUmode;
10408 case GTU:
10409 return CC_DGTUmode;
10411 default:
10412 return CCmode;
10416 static rtx
10417 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10418 int code, tree treeop0, tree treeop1)
10420 enum machine_mode op_mode, cmp_mode, cc_mode;
10421 rtx op0, op1, cmp, target;
10422 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10423 enum insn_code icode;
10424 struct expand_operand ops[4];
10426 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10427 if (cc_mode == CCmode)
10428 return NULL_RTX;
10430 start_sequence ();
10431 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10433 op_mode = GET_MODE (op0);
10434 if (op_mode == VOIDmode)
10435 op_mode = GET_MODE (op1);
10437 switch (op_mode)
10439 case QImode:
10440 case HImode:
10441 case SImode:
10442 cmp_mode = SImode;
10443 icode = CODE_FOR_cmpsi;
10444 break;
10446 case DImode:
10447 cmp_mode = DImode;
10448 icode = CODE_FOR_cmpdi;
10449 break;
10451 default:
10452 end_sequence ();
10453 return NULL_RTX;
10456 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10457 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10458 if (!op0 || !op1)
10460 end_sequence ();
10461 return NULL_RTX;
10463 *prep_seq = get_insns ();
10464 end_sequence ();
10466 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10467 target = gen_rtx_REG (CCmode, CC_REGNUM);
10469 create_output_operand (&ops[0], target, CCmode);
10470 create_fixed_operand (&ops[1], cmp);
10471 create_fixed_operand (&ops[2], op0);
10472 create_fixed_operand (&ops[3], op1);
10474 start_sequence ();
10475 if (!maybe_expand_insn (icode, 4, ops))
10477 end_sequence ();
10478 return NULL_RTX;
10480 *gen_seq = get_insns ();
10481 end_sequence ();
10483 return gen_rtx_REG (cc_mode, CC_REGNUM);
10486 static rtx
10487 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10488 tree treeop0, tree treeop1, int bit_code)
10490 rtx op0, op1, cmp0, cmp1, target;
10491 enum machine_mode op_mode, cmp_mode, cc_mode;
10492 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10493 enum insn_code icode = CODE_FOR_ccmp_andsi;
10494 struct expand_operand ops[6];
10496 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10497 if (cc_mode == CCmode)
10498 return NULL_RTX;
10500 push_to_sequence ((rtx_insn*) *prep_seq);
10501 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10503 op_mode = GET_MODE (op0);
10504 if (op_mode == VOIDmode)
10505 op_mode = GET_MODE (op1);
10507 switch (op_mode)
10509 case QImode:
10510 case HImode:
10511 case SImode:
10512 cmp_mode = SImode;
10513 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10514 : CODE_FOR_ccmp_iorsi;
10515 break;
10517 case DImode:
10518 cmp_mode = DImode;
10519 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10520 : CODE_FOR_ccmp_iordi;
10521 break;
10523 default:
10524 end_sequence ();
10525 return NULL_RTX;
10528 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10529 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10530 if (!op0 || !op1)
10532 end_sequence ();
10533 return NULL_RTX;
10535 *prep_seq = get_insns ();
10536 end_sequence ();
10538 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10539 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10540 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10542 create_fixed_operand (&ops[0], prev);
10543 create_fixed_operand (&ops[1], target);
10544 create_fixed_operand (&ops[2], op0);
10545 create_fixed_operand (&ops[3], op1);
10546 create_fixed_operand (&ops[4], cmp0);
10547 create_fixed_operand (&ops[5], cmp1);
10549 push_to_sequence ((rtx_insn*) *gen_seq);
10550 if (!maybe_expand_insn (icode, 6, ops))
10552 end_sequence ();
10553 return NULL_RTX;
10556 *gen_seq = get_insns ();
10557 end_sequence ();
10559 return target;
10562 #undef TARGET_GEN_CCMP_FIRST
10563 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10565 #undef TARGET_GEN_CCMP_NEXT
10566 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10568 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10569 instruction fusion of some sort. */
10571 static bool
10572 aarch64_macro_fusion_p (void)
10574 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10578 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10579 should be kept together during scheduling. */
10581 static bool
10582 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10584 rtx set_dest;
10585 rtx prev_set = single_set (prev);
10586 rtx curr_set = single_set (curr);
10587 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10588 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10590 if (!aarch64_macro_fusion_p ())
10591 return false;
10593 if (simple_sets_p
10594 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10596 /* We are trying to match:
10597 prev (mov) == (set (reg r0) (const_int imm16))
10598 curr (movk) == (set (zero_extract (reg r0)
10599 (const_int 16)
10600 (const_int 16))
10601 (const_int imm16_1)) */
10603 set_dest = SET_DEST (curr_set);
10605 if (GET_CODE (set_dest) == ZERO_EXTRACT
10606 && CONST_INT_P (SET_SRC (curr_set))
10607 && CONST_INT_P (SET_SRC (prev_set))
10608 && CONST_INT_P (XEXP (set_dest, 2))
10609 && INTVAL (XEXP (set_dest, 2)) == 16
10610 && REG_P (XEXP (set_dest, 0))
10611 && REG_P (SET_DEST (prev_set))
10612 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10614 return true;
10618 if (simple_sets_p
10619 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10622 /* We're trying to match:
10623 prev (adrp) == (set (reg r1)
10624 (high (symbol_ref ("SYM"))))
10625 curr (add) == (set (reg r0)
10626 (lo_sum (reg r1)
10627 (symbol_ref ("SYM"))))
10628 Note that r0 need not necessarily be the same as r1, especially
10629 during pre-regalloc scheduling. */
10631 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10632 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10634 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10635 && REG_P (XEXP (SET_SRC (curr_set), 0))
10636 && REGNO (XEXP (SET_SRC (curr_set), 0))
10637 == REGNO (SET_DEST (prev_set))
10638 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10639 XEXP (SET_SRC (curr_set), 1)))
10640 return true;
10644 if (simple_sets_p
10645 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10648 /* We're trying to match:
10649 prev (movk) == (set (zero_extract (reg r0)
10650 (const_int 16)
10651 (const_int 32))
10652 (const_int imm16_1))
10653 curr (movk) == (set (zero_extract (reg r0)
10654 (const_int 16)
10655 (const_int 48))
10656 (const_int imm16_2)) */
10658 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10659 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10660 && REG_P (XEXP (SET_DEST (prev_set), 0))
10661 && REG_P (XEXP (SET_DEST (curr_set), 0))
10662 && REGNO (XEXP (SET_DEST (prev_set), 0))
10663 == REGNO (XEXP (SET_DEST (curr_set), 0))
10664 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10665 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10666 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10667 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10668 && CONST_INT_P (SET_SRC (prev_set))
10669 && CONST_INT_P (SET_SRC (curr_set)))
10670 return true;
10673 if (simple_sets_p
10674 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10676 /* We're trying to match:
10677 prev (adrp) == (set (reg r0)
10678 (high (symbol_ref ("SYM"))))
10679 curr (ldr) == (set (reg r1)
10680 (mem (lo_sum (reg r0)
10681 (symbol_ref ("SYM")))))
10683 curr (ldr) == (set (reg r1)
10684 (zero_extend (mem
10685 (lo_sum (reg r0)
10686 (symbol_ref ("SYM")))))) */
10687 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10688 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10690 rtx curr_src = SET_SRC (curr_set);
10692 if (GET_CODE (curr_src) == ZERO_EXTEND)
10693 curr_src = XEXP (curr_src, 0);
10695 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10696 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10697 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10698 == REGNO (SET_DEST (prev_set))
10699 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10700 XEXP (SET_SRC (prev_set), 0)))
10701 return true;
10705 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10706 && any_condjump_p (curr))
10708 enum attr_type prev_type = get_attr_type (prev);
10710 /* FIXME: this misses some which is considered simple arthematic
10711 instructions for ThunderX. Simple shifts are missed here. */
10712 if (prev_type == TYPE_ALUS_SREG
10713 || prev_type == TYPE_ALUS_IMM
10714 || prev_type == TYPE_LOGICS_REG
10715 || prev_type == TYPE_LOGICS_IMM)
10716 return true;
10719 return false;
10722 /* If MEM is in the form of [base+offset], extract the two parts
10723 of address and set to BASE and OFFSET, otherwise return false
10724 after clearing BASE and OFFSET. */
10726 bool
10727 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10729 rtx addr;
10731 gcc_assert (MEM_P (mem));
10733 addr = XEXP (mem, 0);
10735 if (REG_P (addr))
10737 *base = addr;
10738 *offset = const0_rtx;
10739 return true;
10742 if (GET_CODE (addr) == PLUS
10743 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10745 *base = XEXP (addr, 0);
10746 *offset = XEXP (addr, 1);
10747 return true;
10750 *base = NULL_RTX;
10751 *offset = NULL_RTX;
10753 return false;
10756 /* Types for scheduling fusion. */
10757 enum sched_fusion_type
10759 SCHED_FUSION_NONE = 0,
10760 SCHED_FUSION_LD_SIGN_EXTEND,
10761 SCHED_FUSION_LD_ZERO_EXTEND,
10762 SCHED_FUSION_LD,
10763 SCHED_FUSION_ST,
10764 SCHED_FUSION_NUM
10767 /* If INSN is a load or store of address in the form of [base+offset],
10768 extract the two parts and set to BASE and OFFSET. Return scheduling
10769 fusion type this INSN is. */
10771 static enum sched_fusion_type
10772 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10774 rtx x, dest, src;
10775 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10777 gcc_assert (INSN_P (insn));
10778 x = PATTERN (insn);
10779 if (GET_CODE (x) != SET)
10780 return SCHED_FUSION_NONE;
10782 src = SET_SRC (x);
10783 dest = SET_DEST (x);
10785 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10786 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10787 return SCHED_FUSION_NONE;
10789 if (GET_CODE (src) == SIGN_EXTEND)
10791 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10792 src = XEXP (src, 0);
10793 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10794 return SCHED_FUSION_NONE;
10796 else if (GET_CODE (src) == ZERO_EXTEND)
10798 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10799 src = XEXP (src, 0);
10800 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10801 return SCHED_FUSION_NONE;
10804 if (GET_CODE (src) == MEM && REG_P (dest))
10805 extract_base_offset_in_addr (src, base, offset);
10806 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10808 fusion = SCHED_FUSION_ST;
10809 extract_base_offset_in_addr (dest, base, offset);
10811 else
10812 return SCHED_FUSION_NONE;
10814 if (*base == NULL_RTX || *offset == NULL_RTX)
10815 fusion = SCHED_FUSION_NONE;
10817 return fusion;
10820 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10822 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
10823 and PRI are only calculated for these instructions. For other instruction,
10824 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10825 type instruction fusion can be added by returning different priorities.
10827 It's important that irrelevant instructions get the largest FUSION_PRI. */
10829 static void
10830 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10831 int *fusion_pri, int *pri)
10833 int tmp, off_val;
10834 rtx base, offset;
10835 enum sched_fusion_type fusion;
10837 gcc_assert (INSN_P (insn));
10839 tmp = max_pri - 1;
10840 fusion = fusion_load_store (insn, &base, &offset);
10841 if (fusion == SCHED_FUSION_NONE)
10843 *pri = tmp;
10844 *fusion_pri = tmp;
10845 return;
10848 /* Set FUSION_PRI according to fusion type and base register. */
10849 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10851 /* Calculate PRI. */
10852 tmp /= 2;
10854 /* INSN with smaller offset goes first. */
10855 off_val = (int)(INTVAL (offset));
10856 if (off_val >= 0)
10857 tmp -= (off_val & 0xfffff);
10858 else
10859 tmp += ((- off_val) & 0xfffff);
10861 *pri = tmp;
10862 return;
10865 /* Given OPERANDS of consecutive load/store, check if we can merge
10866 them into ldp/stp. LOAD is true if they are load instructions.
10867 MODE is the mode of memory operands. */
10869 bool
10870 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10871 enum machine_mode mode)
10873 HOST_WIDE_INT offval_1, offval_2, msize;
10874 enum reg_class rclass_1, rclass_2;
10875 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10877 if (load)
10879 mem_1 = operands[1];
10880 mem_2 = operands[3];
10881 reg_1 = operands[0];
10882 reg_2 = operands[2];
10883 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10884 if (REGNO (reg_1) == REGNO (reg_2))
10885 return false;
10887 else
10889 mem_1 = operands[0];
10890 mem_2 = operands[2];
10891 reg_1 = operands[1];
10892 reg_2 = operands[3];
10895 /* The mems cannot be volatile. */
10896 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10897 return false;
10899 /* Check if the addresses are in the form of [base+offset]. */
10900 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10901 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10902 return false;
10903 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10904 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10905 return false;
10907 /* Check if the bases are same. */
10908 if (!rtx_equal_p (base_1, base_2))
10909 return false;
10911 offval_1 = INTVAL (offset_1);
10912 offval_2 = INTVAL (offset_2);
10913 msize = GET_MODE_SIZE (mode);
10914 /* Check if the offsets are consecutive. */
10915 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10916 return false;
10918 /* Check if the addresses are clobbered by load. */
10919 if (load)
10921 if (reg_mentioned_p (reg_1, mem_1))
10922 return false;
10924 /* In increasing order, the last load can clobber the address. */
10925 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10926 return false;
10929 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10930 rclass_1 = FP_REGS;
10931 else
10932 rclass_1 = GENERAL_REGS;
10934 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10935 rclass_2 = FP_REGS;
10936 else
10937 rclass_2 = GENERAL_REGS;
10939 /* Check if the registers are of same class. */
10940 if (rclass_1 != rclass_2)
10941 return false;
10943 return true;
10946 /* Given OPERANDS of consecutive load/store, check if we can merge
10947 them into ldp/stp by adjusting the offset. LOAD is true if they
10948 are load instructions. MODE is the mode of memory operands.
10950 Given below consecutive stores:
10952 str w1, [xb, 0x100]
10953 str w1, [xb, 0x104]
10954 str w1, [xb, 0x108]
10955 str w1, [xb, 0x10c]
10957 Though the offsets are out of the range supported by stp, we can
10958 still pair them after adjusting the offset, like:
10960 add scratch, xb, 0x100
10961 stp w1, w1, [scratch]
10962 stp w1, w1, [scratch, 0x8]
10964 The peephole patterns detecting this opportunity should guarantee
10965 the scratch register is avaliable. */
10967 bool
10968 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
10969 enum machine_mode mode)
10971 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
10972 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
10973 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
10974 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
10976 if (load)
10978 reg_1 = operands[0];
10979 mem_1 = operands[1];
10980 reg_2 = operands[2];
10981 mem_2 = operands[3];
10982 reg_3 = operands[4];
10983 mem_3 = operands[5];
10984 reg_4 = operands[6];
10985 mem_4 = operands[7];
10986 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
10987 && REG_P (reg_3) && REG_P (reg_4));
10988 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
10989 return false;
10991 else
10993 mem_1 = operands[0];
10994 reg_1 = operands[1];
10995 mem_2 = operands[2];
10996 reg_2 = operands[3];
10997 mem_3 = operands[4];
10998 reg_3 = operands[5];
10999 mem_4 = operands[6];
11000 reg_4 = operands[7];
11002 /* Skip if memory operand is by itslef valid for ldp/stp. */
11003 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11004 return false;
11006 /* The mems cannot be volatile. */
11007 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11008 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
11009 return false;
11011 /* Check if the addresses are in the form of [base+offset]. */
11012 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11013 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11014 return false;
11015 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11016 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11017 return false;
11018 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11019 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11020 return false;
11021 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11022 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11023 return false;
11025 /* Check if the bases are same. */
11026 if (!rtx_equal_p (base_1, base_2)
11027 || !rtx_equal_p (base_2, base_3)
11028 || !rtx_equal_p (base_3, base_4))
11029 return false;
11031 offval_1 = INTVAL (offset_1);
11032 offval_2 = INTVAL (offset_2);
11033 offval_3 = INTVAL (offset_3);
11034 offval_4 = INTVAL (offset_4);
11035 msize = GET_MODE_SIZE (mode);
11036 /* Check if the offsets are consecutive. */
11037 if ((offval_1 != (offval_2 + msize)
11038 || offval_1 != (offval_3 + msize * 2)
11039 || offval_1 != (offval_4 + msize * 3))
11040 && (offval_4 != (offval_3 + msize)
11041 || offval_4 != (offval_2 + msize * 2)
11042 || offval_4 != (offval_1 + msize * 3)))
11043 return false;
11045 /* Check if the addresses are clobbered by load. */
11046 if (load)
11048 if (reg_mentioned_p (reg_1, mem_1)
11049 || reg_mentioned_p (reg_2, mem_2)
11050 || reg_mentioned_p (reg_3, mem_3))
11051 return false;
11053 /* In increasing order, the last load can clobber the address. */
11054 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11055 return false;
11058 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11059 rclass_1 = FP_REGS;
11060 else
11061 rclass_1 = GENERAL_REGS;
11063 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11064 rclass_2 = FP_REGS;
11065 else
11066 rclass_2 = GENERAL_REGS;
11068 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11069 rclass_3 = FP_REGS;
11070 else
11071 rclass_3 = GENERAL_REGS;
11073 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11074 rclass_4 = FP_REGS;
11075 else
11076 rclass_4 = GENERAL_REGS;
11078 /* Check if the registers are of same class. */
11079 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11080 return false;
11082 return true;
11085 /* Given OPERANDS of consecutive load/store, this function pairs them
11086 into ldp/stp after adjusting the offset. It depends on the fact
11087 that addresses of load/store instructions are in increasing order.
11088 MODE is the mode of memory operands. CODE is the rtl operator
11089 which should be applied to all memory operands, it's SIGN_EXTEND,
11090 ZERO_EXTEND or UNKNOWN. */
11092 bool
11093 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11094 enum machine_mode mode, RTX_CODE code)
11096 rtx base, offset, t1, t2;
11097 rtx mem_1, mem_2, mem_3, mem_4;
11098 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11100 if (load)
11102 mem_1 = operands[1];
11103 mem_2 = operands[3];
11104 mem_3 = operands[5];
11105 mem_4 = operands[7];
11107 else
11109 mem_1 = operands[0];
11110 mem_2 = operands[2];
11111 mem_3 = operands[4];
11112 mem_4 = operands[6];
11113 gcc_assert (code == UNKNOWN);
11116 extract_base_offset_in_addr (mem_1, &base, &offset);
11117 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11119 /* Adjust offset thus it can fit in ldp/stp instruction. */
11120 msize = GET_MODE_SIZE (mode);
11121 stp_off_limit = msize * 0x40;
11122 off_val = INTVAL (offset);
11123 abs_off = (off_val < 0) ? -off_val : off_val;
11124 new_off = abs_off % stp_off_limit;
11125 adj_off = abs_off - new_off;
11127 /* Further adjust to make sure all offsets are OK. */
11128 if ((new_off + msize * 2) >= stp_off_limit)
11130 adj_off += stp_off_limit;
11131 new_off -= stp_off_limit;
11134 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11135 if (adj_off >= 0x1000)
11136 return false;
11138 if (off_val < 0)
11140 adj_off = -adj_off;
11141 new_off = -new_off;
11144 /* Create new memory references. */
11145 mem_1 = change_address (mem_1, VOIDmode,
11146 plus_constant (DImode, operands[8], new_off));
11148 /* Check if the adjusted address is OK for ldp/stp. */
11149 if (!aarch64_mem_pair_operand (mem_1, mode))
11150 return false;
11152 msize = GET_MODE_SIZE (mode);
11153 mem_2 = change_address (mem_2, VOIDmode,
11154 plus_constant (DImode,
11155 operands[8],
11156 new_off + msize));
11157 mem_3 = change_address (mem_3, VOIDmode,
11158 plus_constant (DImode,
11159 operands[8],
11160 new_off + msize * 2));
11161 mem_4 = change_address (mem_4, VOIDmode,
11162 plus_constant (DImode,
11163 operands[8],
11164 new_off + msize * 3));
11166 if (code == ZERO_EXTEND)
11168 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11169 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11170 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11171 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11173 else if (code == SIGN_EXTEND)
11175 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11176 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11177 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11178 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11181 if (load)
11183 operands[1] = mem_1;
11184 operands[3] = mem_2;
11185 operands[5] = mem_3;
11186 operands[7] = mem_4;
11188 else
11190 operands[0] = mem_1;
11191 operands[2] = mem_2;
11192 operands[4] = mem_3;
11193 operands[6] = mem_4;
11196 /* Emit adjusting instruction. */
11197 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11198 plus_constant (DImode, base, adj_off)));
11199 /* Emit ldp/stp instructions. */
11200 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11201 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11202 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11203 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11204 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11205 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11206 return true;
11209 #undef TARGET_ADDRESS_COST
11210 #define TARGET_ADDRESS_COST aarch64_address_cost
11212 /* This hook will determines whether unnamed bitfields affect the alignment
11213 of the containing structure. The hook returns true if the structure
11214 should inherit the alignment requirements of an unnamed bitfield's
11215 type. */
11216 #undef TARGET_ALIGN_ANON_BITFIELD
11217 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11219 #undef TARGET_ASM_ALIGNED_DI_OP
11220 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11222 #undef TARGET_ASM_ALIGNED_HI_OP
11223 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11225 #undef TARGET_ASM_ALIGNED_SI_OP
11226 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11228 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11229 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11230 hook_bool_const_tree_hwi_hwi_const_tree_true
11232 #undef TARGET_ASM_FILE_START
11233 #define TARGET_ASM_FILE_START aarch64_start_file
11235 #undef TARGET_ASM_OUTPUT_MI_THUNK
11236 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11238 #undef TARGET_ASM_SELECT_RTX_SECTION
11239 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11241 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11242 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11244 #undef TARGET_BUILD_BUILTIN_VA_LIST
11245 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11247 #undef TARGET_CALLEE_COPIES
11248 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11250 #undef TARGET_CAN_ELIMINATE
11251 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11253 #undef TARGET_CANNOT_FORCE_CONST_MEM
11254 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11256 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11257 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11259 /* Only the least significant bit is used for initialization guard
11260 variables. */
11261 #undef TARGET_CXX_GUARD_MASK_BIT
11262 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11264 #undef TARGET_C_MODE_FOR_SUFFIX
11265 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11267 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11268 #undef TARGET_DEFAULT_TARGET_FLAGS
11269 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11270 #endif
11272 #undef TARGET_CLASS_MAX_NREGS
11273 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11275 #undef TARGET_BUILTIN_DECL
11276 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11278 #undef TARGET_EXPAND_BUILTIN
11279 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11281 #undef TARGET_EXPAND_BUILTIN_VA_START
11282 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11284 #undef TARGET_FOLD_BUILTIN
11285 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11287 #undef TARGET_FUNCTION_ARG
11288 #define TARGET_FUNCTION_ARG aarch64_function_arg
11290 #undef TARGET_FUNCTION_ARG_ADVANCE
11291 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11293 #undef TARGET_FUNCTION_ARG_BOUNDARY
11294 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11296 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11297 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11299 #undef TARGET_FUNCTION_VALUE
11300 #define TARGET_FUNCTION_VALUE aarch64_function_value
11302 #undef TARGET_FUNCTION_VALUE_REGNO_P
11303 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11305 #undef TARGET_FRAME_POINTER_REQUIRED
11306 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11308 #undef TARGET_GIMPLE_FOLD_BUILTIN
11309 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11311 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11312 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11314 #undef TARGET_INIT_BUILTINS
11315 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11317 #undef TARGET_LEGITIMATE_ADDRESS_P
11318 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11320 #undef TARGET_LEGITIMATE_CONSTANT_P
11321 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11323 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11324 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11326 #undef TARGET_LRA_P
11327 #define TARGET_LRA_P hook_bool_void_true
11329 #undef TARGET_MANGLE_TYPE
11330 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11332 #undef TARGET_MEMORY_MOVE_COST
11333 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11335 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11336 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11338 #undef TARGET_MUST_PASS_IN_STACK
11339 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11341 /* This target hook should return true if accesses to volatile bitfields
11342 should use the narrowest mode possible. It should return false if these
11343 accesses should use the bitfield container type. */
11344 #undef TARGET_NARROW_VOLATILE_BITFIELD
11345 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11347 #undef TARGET_OPTION_OVERRIDE
11348 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11350 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11351 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11352 aarch64_override_options_after_change
11354 #undef TARGET_PASS_BY_REFERENCE
11355 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11357 #undef TARGET_PREFERRED_RELOAD_CLASS
11358 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11360 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11361 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11363 #undef TARGET_SECONDARY_RELOAD
11364 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11366 #undef TARGET_SHIFT_TRUNCATION_MASK
11367 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11369 #undef TARGET_SETUP_INCOMING_VARARGS
11370 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11372 #undef TARGET_STRUCT_VALUE_RTX
11373 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11375 #undef TARGET_REGISTER_MOVE_COST
11376 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11378 #undef TARGET_RETURN_IN_MEMORY
11379 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11381 #undef TARGET_RETURN_IN_MSB
11382 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11384 #undef TARGET_RTX_COSTS
11385 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11387 #undef TARGET_SCHED_ISSUE_RATE
11388 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11390 #undef TARGET_TRAMPOLINE_INIT
11391 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11393 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11394 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11396 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11397 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11399 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11400 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11402 #undef TARGET_VECTORIZE_ADD_STMT_COST
11403 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11405 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11406 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11407 aarch64_builtin_vectorization_cost
11409 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11410 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11412 #undef TARGET_VECTORIZE_BUILTINS
11413 #define TARGET_VECTORIZE_BUILTINS
11415 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11416 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11417 aarch64_builtin_vectorized_function
11419 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11420 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11421 aarch64_autovectorize_vector_sizes
11423 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11424 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11425 aarch64_atomic_assign_expand_fenv
11427 /* Section anchor support. */
11429 #undef TARGET_MIN_ANCHOR_OFFSET
11430 #define TARGET_MIN_ANCHOR_OFFSET -256
11432 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11433 byte offset; we can do much more for larger data types, but have no way
11434 to determine the size of the access. We assume accesses are aligned. */
11435 #undef TARGET_MAX_ANCHOR_OFFSET
11436 #define TARGET_MAX_ANCHOR_OFFSET 4095
11438 #undef TARGET_VECTOR_ALIGNMENT
11439 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11441 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11442 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11443 aarch64_simd_vector_alignment_reachable
11445 /* vec_perm support. */
11447 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11448 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11449 aarch64_vectorize_vec_perm_const_ok
11452 #undef TARGET_FIXED_CONDITION_CODE_REGS
11453 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11455 #undef TARGET_FLAGS_REGNUM
11456 #define TARGET_FLAGS_REGNUM CC_REGNUM
11458 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11459 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11461 #undef TARGET_ASAN_SHADOW_OFFSET
11462 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11464 #undef TARGET_LEGITIMIZE_ADDRESS
11465 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11467 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11468 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11469 aarch64_use_by_pieces_infrastructure_p
11471 #undef TARGET_CAN_USE_DOLOOP_P
11472 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11474 #undef TARGET_SCHED_MACRO_FUSION_P
11475 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11477 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11478 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11480 #undef TARGET_SCHED_FUSION_PRIORITY
11481 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11483 struct gcc_target targetm = TARGET_INITIALIZER;
11485 #include "gt-aarch64.h"