[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "tree.h"
26 #include "gimple.h"
27 #include "rtl.h"
28 #include "df.h"
29 #include "insn-codes.h"
30 #include "insn-attr.h"
31 #include "alias.h"
32 #include "fold-const.h"
33 #include "stringpool.h"
34 #include "stor-layout.h"
35 #include "calls.h"
36 #include "varasm.h"
37 #include "regs.h"
38 #include "cfgrtl.h"
39 #include "cfganal.h"
40 #include "lcm.h"
41 #include "cfgbuild.h"
42 #include "cfgcleanup.h"
43 #include "output.h"
44 #include "flags.h"
45 #include "insn-config.h"
46 #include "expmed.h"
47 #include "dojump.h"
48 #include "explow.h"
49 #include "emit-rtl.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "toplev.h"
54 #include "target.h"
55 #include "targhooks.h"
56 #include "tm_p.h"
57 #include "recog.h"
58 #include "langhooks.h"
59 #include "diagnostic-core.h"
60 #include "internal-fn.h"
61 #include "gimple-fold.h"
62 #include "tree-eh.h"
63 #include "gimplify.h"
64 #include "optabs.h"
65 #include "dwarf2.h"
66 #include "cfgloop.h"
67 #include "tree-vectorizer.h"
68 #include "aarch64-cost-tables.h"
69 #include "dumpfile.h"
70 #include "builtins.h"
71 #include "rtl-iter.h"
72 #include "tm-constrs.h"
73 #include "sched-int.h"
74 #include "cortex-a57-fma-steering.h"
76 /* This file should be included last. */
77 #include "target-def.h"
79 /* Defined for convenience. */
80 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
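/* For illustration: with the LP64 ABI POINTER_SIZE is 64, so POINTER_BYTES
   is 8; with -mabi=ilp32 POINTER_SIZE is 32 and POINTER_BYTES is 4.  */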
82 /* Classifies an address.
84 ADDRESS_REG_IMM
85 A simple base register plus immediate offset.
87 ADDRESS_REG_WB
88 A base register indexed by immediate offset with writeback.
90 ADDRESS_REG_REG
91 A base register indexed by (optionally scaled) register.
93 ADDRESS_REG_UXTW
94 A base register indexed by (optionally scaled) zero-extended register.
96 ADDRESS_REG_SXTW
97 A base register indexed by (optionally scaled) sign-extended register.
99 ADDRESS_LO_SUM
100 A LO_SUM rtx with a base register and "LO12" symbol relocation.
102 ADDRESS_SYMBOLIC
103 A constant symbolic address, in pc-relative literal pool. */
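/* For illustration, typical assembly forms of these address classes
   (syntax is approximate) are:
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, .LC0   (pc-relative literal load)  */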
105 enum aarch64_address_type {
106 ADDRESS_REG_IMM,
107 ADDRESS_REG_WB,
108 ADDRESS_REG_REG,
109 ADDRESS_REG_UXTW,
110 ADDRESS_REG_SXTW,
111 ADDRESS_LO_SUM,
112 ADDRESS_SYMBOLIC
115 struct aarch64_address_info {
116 enum aarch64_address_type type;
117 rtx base;
118 rtx offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 struct simd_immediate_info
125 rtx value;
126 int shift;
127 int element_width;
128 bool mvn;
129 bool msl;
132 /* The current code model. */
133 enum aarch64_code_model aarch64_cmodel;
135 #ifdef HAVE_AS_TLS
136 #undef TARGET_HAVE_TLS
137 #define TARGET_HAVE_TLS 1
138 #endif
140 static bool aarch64_composite_type_p (const_tree, machine_mode);
141 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
142 const_tree,
143 machine_mode *, int *,
144 bool *);
145 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
146 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
147 static void aarch64_override_options_after_change (void);
148 static bool aarch64_vector_mode_supported_p (machine_mode);
149 static unsigned bit_count (unsigned HOST_WIDE_INT);
150 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
151 const unsigned char *sel);
152 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instructions we are allowed to generate. */
161 unsigned long aarch64_isa_flags = 0;
163 /* Mask to specify which instruction scheduling options should be used. */
164 unsigned long aarch64_tune_flags = 0;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name, y) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
183 #undef AARCH64_FUSION_PAIR
185 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name, y) \
186 { name, AARCH64_EXTRA_TUNE_##internal_name },
187 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
189 { "none", AARCH64_EXTRA_TUNE_NONE },
190 #include "aarch64-tuning-flags.def"
191 { "all", AARCH64_EXTRA_TUNE_ALL },
192 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 #undef AARCH64_EXTRA_TUNING_OPTION
196 /* Tuning parameters. */
198 static const struct cpu_addrcost_table generic_addrcost_table =
201 0, /* hi */
202 0, /* si */
203 0, /* di */
204 0, /* ti */
206 0, /* pre_modify */
207 0, /* post_modify */
208 0, /* register_offset */
209 0, /* register_extend */
210 0 /* imm_offset */
213 static const struct cpu_addrcost_table cortexa57_addrcost_table =
216 1, /* hi */
217 0, /* si */
218 0, /* di */
219 1, /* ti */
221 0, /* pre_modify */
222 0, /* post_modify */
223 0, /* register_offset */
224 0, /* register_extend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_extend */
240 0, /* imm_offset */
243 static const struct cpu_regmove_cost generic_regmove_cost =
245 1, /* GP2GP */
246 /* Avoid the use of slow int<->fp moves for spilling by setting
247 their cost higher than memmov_cost. */
248 5, /* GP2FP */
249 5, /* FP2GP */
250 2 /* FP2FP */
253 static const struct cpu_regmove_cost cortexa57_regmove_cost =
255 1, /* GP2GP */
256 /* Avoid the use of slow int<->fp moves for spilling by setting
257 their cost higher than memmov_cost. */
258 5, /* GP2FP */
259 5, /* FP2GP */
260 2 /* FP2FP */
263 static const struct cpu_regmove_cost cortexa53_regmove_cost =
265 1, /* GP2GP */
266 /* Avoid the use of slow int<->fp moves for spilling by setting
267 their cost higher than memmov_cost. */
268 5, /* GP2FP */
269 5, /* FP2GP */
270 2 /* FP2FP */
273 static const struct cpu_regmove_cost thunderx_regmove_cost =
275 2, /* GP2GP */
276 2, /* GP2FP */
277 6, /* FP2GP */
278 4 /* FP2FP */
281 static const struct cpu_regmove_cost xgene1_regmove_cost =
283 1, /* GP2GP */
284 /* Avoid the use of slow int<->fp moves for spilling by setting
285 their cost higher than memmov_cost. */
286 8, /* GP2FP */
287 8, /* FP2GP */
288 2 /* FP2FP */
291 /* Generic costs for vector insn classes. */
292 static const struct cpu_vector_cost generic_vector_cost =
294 1, /* scalar_stmt_cost */
295 1, /* scalar_load_cost */
296 1, /* scalar_store_cost */
297 1, /* vec_stmt_cost */
298 1, /* vec_to_scalar_cost */
299 1, /* scalar_to_vec_cost */
300 1, /* vec_align_load_cost */
301 1, /* vec_unalign_load_cost */
302 1, /* vec_unalign_store_cost */
303 1, /* vec_store_cost */
304 3, /* cond_taken_branch_cost */
305 1 /* cond_not_taken_branch_cost */
308 /* Cortex-A57 costs for vector insn classes. */
309 static const struct cpu_vector_cost cortexa57_vector_cost =
311 1, /* scalar_stmt_cost */
312 4, /* scalar_load_cost */
313 1, /* scalar_store_cost */
314 3, /* vec_stmt_cost */
315 8, /* vec_to_scalar_cost */
316 8, /* scalar_to_vec_cost */
317 5, /* vec_align_load_cost */
318 5, /* vec_unalign_load_cost */
319 1, /* vec_unalign_store_cost */
320 1, /* vec_store_cost */
321 1, /* cond_taken_branch_cost */
322 1 /* cond_not_taken_branch_cost */
325 /* X-Gene 1 costs for vector insn classes. */
326 static const struct cpu_vector_cost xgene1_vector_cost =
328 1, /* scalar_stmt_cost */
329 5, /* scalar_load_cost */
330 1, /* scalar_store_cost */
331 2, /* vec_stmt_cost */
332 4, /* vec_to_scalar_cost */
333 4, /* scalar_to_vec_cost */
334 10, /* vec_align_load_cost */
335 10, /* vec_unalign_load_cost */
336 2, /* vec_unalign_store_cost */
337 2, /* vec_store_cost */
338 2, /* cond_taken_branch_cost */
339 1 /* cond_not_taken_branch_cost */
342 /* Generic costs for branch instructions. */
343 static const struct cpu_branch_cost generic_branch_cost =
345 2, /* Predictable. */
346 2 /* Unpredictable. */
349 static const struct tune_params generic_tunings =
351 &cortexa57_extra_costs,
352 &generic_addrcost_table,
353 &generic_regmove_cost,
354 &generic_vector_cost,
355 &generic_branch_cost,
356 4, /* memmov_cost */
357 2, /* issue_rate */
358 AARCH64_FUSE_NOTHING, /* fusible_ops */
359 8, /* function_align. */
360 8, /* jump_align. */
361 4, /* loop_align. */
362 2, /* int_reassoc_width. */
363 4, /* fp_reassoc_width. */
364 1, /* vec_reassoc_width. */
365 2, /* min_div_recip_mul_sf. */
366 2, /* min_div_recip_mul_df. */
367 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
370 static const struct tune_params cortexa53_tunings =
372 &cortexa53_extra_costs,
373 &generic_addrcost_table,
374 &cortexa53_regmove_cost,
375 &generic_vector_cost,
376 &generic_branch_cost,
377 4, /* memmov_cost */
378 2, /* issue_rate */
379 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
381 8, /* function_align. */
382 8, /* jump_align. */
383 4, /* loop_align. */
384 2, /* int_reassoc_width. */
385 4, /* fp_reassoc_width. */
386 1, /* vec_reassoc_width. */
387 2, /* min_div_recip_mul_sf. */
388 2, /* min_div_recip_mul_df. */
389 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
392 static const struct tune_params cortexa57_tunings =
394 &cortexa57_extra_costs,
395 &cortexa57_addrcost_table,
396 &cortexa57_regmove_cost,
397 &cortexa57_vector_cost,
398 &generic_branch_cost,
399 4, /* memmov_cost */
400 3, /* issue_rate */
401 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
402 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
403 16, /* function_align. */
404 8, /* jump_align. */
405 4, /* loop_align. */
406 2, /* int_reassoc_width. */
407 4, /* fp_reassoc_width. */
408 1, /* vec_reassoc_width. */
409 2, /* min_div_recip_mul_sf. */
410 2, /* min_div_recip_mul_df. */
411 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
414 static const struct tune_params cortexa72_tunings =
416 &cortexa57_extra_costs,
417 &cortexa57_addrcost_table,
418 &cortexa57_regmove_cost,
419 &cortexa57_vector_cost,
420 &generic_branch_cost,
421 4, /* memmov_cost */
422 3, /* issue_rate */
423 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
424 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
425 16, /* function_align. */
426 8, /* jump_align. */
427 4, /* loop_align. */
428 2, /* int_reassoc_width. */
429 4, /* fp_reassoc_width. */
430 1, /* vec_reassoc_width. */
431 2, /* min_div_recip_mul_sf. */
432 2, /* min_div_recip_mul_df. */
433 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
436 static const struct tune_params thunderx_tunings =
438 &thunderx_extra_costs,
439 &generic_addrcost_table,
440 &thunderx_regmove_cost,
441 &generic_vector_cost,
442 &generic_branch_cost,
443 6, /* memmov_cost */
444 2, /* issue_rate */
445 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
446 8, /* function_align. */
447 8, /* jump_align. */
448 8, /* loop_align. */
449 2, /* int_reassoc_width. */
450 4, /* fp_reassoc_width. */
451 1, /* vec_reassoc_width. */
452 2, /* min_div_recip_mul_sf. */
453 2, /* min_div_recip_mul_df. */
454 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
457 static const struct tune_params xgene1_tunings =
459 &xgene1_extra_costs,
460 &xgene1_addrcost_table,
461 &xgene1_regmove_cost,
462 &xgene1_vector_cost,
463 &generic_branch_cost,
464 6, /* memmov_cost */
465 4, /* issue_rate */
466 AARCH64_FUSE_NOTHING, /* fusible_ops */
467 16, /* function_align. */
468 8, /* jump_align. */
469 16, /* loop_align. */
470 2, /* int_reassoc_width. */
471 4, /* fp_reassoc_width. */
472 1, /* vec_reassoc_width. */
473 2, /* min_div_recip_mul_sf. */
474 2, /* min_div_recip_mul_df. */
475 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
478 /* Support for fine-grained override of the tuning structures. */
479 struct aarch64_tuning_override_function
481 const char* name;
482 void (*parse_override)(const char*, struct tune_params*);
485 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
486 static void aarch64_parse_tune_string (const char*, struct tune_params*);
488 static const struct aarch64_tuning_override_function
489 aarch64_tuning_override_functions[] =
491 { "fuse", aarch64_parse_fuse_string },
492 { "tune", aarch64_parse_tune_string },
493 { NULL, NULL }
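/* A sketch of the intended use, assuming the documented -moverride syntax:
   the option value is made up of <name>=<string> pairs keyed by the "fuse"
   and "tune" entries above, e.g.
     -moverride=fuse=all
     -moverride=tune=none
   where the string after '=' is looked up in aarch64_fusible_pairs or
   aarch64_tuning_flags respectively.  */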
496 /* A processor implementing AArch64. */
497 struct processor
499 const char *const name;
500 enum aarch64_processor core;
501 const char *arch;
502 unsigned architecture_version;
503 const unsigned long flags;
504 const struct tune_params *const tune;
507 /* Processor cores implementing AArch64. */
508 static const struct processor all_cores[] =
510 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
511 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
512 #include "aarch64-cores.def"
513 #undef AARCH64_CORE
514 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
515 {NULL, aarch64_none, NULL, 0, 0, NULL}
518 /* Architectures implementing AArch64. */
519 static const struct processor all_architectures[] =
521 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
522 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
523 #include "aarch64-arches.def"
524 #undef AARCH64_ARCH
525 {NULL, aarch64_none, NULL, 0, 0, NULL}
528 /* Target specification. These are populated as command-line arguments
529 are processed, or NULL if not specified. */
530 static const struct processor *selected_arch;
531 static const struct processor *selected_cpu;
532 static const struct processor *selected_tune;
534 /* The current tuning set. */
535 struct tune_params aarch64_tune_params = generic_tunings;
537 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
539 /* An ISA extension in the co-processor and main instruction set space. */
540 struct aarch64_option_extension
542 const char *const name;
543 const unsigned long flags_on;
544 const unsigned long flags_off;
547 /* ISA extensions in AArch64. */
548 static const struct aarch64_option_extension all_extensions[] =
550 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
551 {NAME, FLAGS_ON, FLAGS_OFF},
552 #include "aarch64-option-extensions.def"
553 #undef AARCH64_OPT_EXTENSION
554 {NULL, 0, 0}
557 /* Used to track the size of an address when generating a pre/post
558 increment address. */
559 static machine_mode aarch64_memory_reference_mode;
561 /* A table of valid AArch64 "bitmask immediate" values for
562 logical instructions. */
564 #define AARCH64_NUM_BITMASKS 5334
565 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
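/* For illustration: a bitmask immediate is a rotated run of contiguous set
   bits replicated across the register in equal chunks of 2, 4, 8, 16, 32
   or 64 bits.  0x00ff00ff00ff00ff and 0x7fff7fff7fff7fff qualify, for
   example, while 0x0000000012345678 does not; there are 5334 distinct
   64-bit values of this form, hence the table size.  */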
567 typedef enum aarch64_cond_code
569 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
570 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
571 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
573 aarch64_cc;
575 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
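/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT:
   the codes are laid out so that each even/odd pair of values are each
   other's inverse, so flipping bit 0 inverts the condition.  */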
577 /* The condition codes of the processor, and the inverse function. */
578 static const char * const aarch64_condition_codes[] =
580 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
581 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
584 void
585 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
587 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
588 if (TARGET_GENERAL_REGS_ONLY)
589 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
590 else
591 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
594 static unsigned int
595 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
597 if (GET_MODE_UNIT_SIZE (mode) == 4)
598 return aarch64_tune_params.min_div_recip_mul_sf;
599 return aarch64_tune_params.min_div_recip_mul_df;
602 static int
603 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
604 enum machine_mode mode)
606 if (VECTOR_MODE_P (mode))
607 return aarch64_tune_params.vec_reassoc_width;
608 if (INTEGRAL_MODE_P (mode))
609 return aarch64_tune_params.int_reassoc_width;
610 if (FLOAT_MODE_P (mode))
611 return aarch64_tune_params.fp_reassoc_width;
612 return 1;
615 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
616 unsigned
617 aarch64_dbx_register_number (unsigned regno)
619 if (GP_REGNUM_P (regno))
620 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
621 else if (regno == SP_REGNUM)
622 return AARCH64_DWARF_SP;
623 else if (FP_REGNUM_P (regno))
624 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
626 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
627 equivalent DWARF register. */
628 return DWARF_FRAME_REGISTERS;
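/* For illustration, under the AArch64 DWARF register numbering this maps
   x0-x30 to 0-30, sp to 31 and v0-v31 to 64-95.  */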
631 /* Return TRUE if MODE is any of the large INT modes. */
632 static bool
633 aarch64_vect_struct_mode_p (machine_mode mode)
635 return mode == OImode || mode == CImode || mode == XImode;
638 /* Return TRUE if MODE is any of the vector modes. */
639 static bool
640 aarch64_vector_mode_p (machine_mode mode)
642 return aarch64_vector_mode_supported_p (mode)
643 || aarch64_vect_struct_mode_p (mode);
646 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
647 static bool
648 aarch64_array_mode_supported_p (machine_mode mode,
649 unsigned HOST_WIDE_INT nelems)
651 if (TARGET_SIMD
652 && AARCH64_VALID_SIMD_QREG_MODE (mode)
653 && (nelems >= 2 && nelems <= 4))
654 return true;
656 return false;
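/* For example, an array of three V4SImode vectors (as used by the ld3/st3
   intrinsics) is supported and is represented by the 48-byte CImode
   structure mode.  */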
659 /* Implement HARD_REGNO_NREGS. */
662 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
664 switch (aarch64_regno_regclass (regno))
666 case FP_REGS:
667 case FP_LO_REGS:
668 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
669 default:
670 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
672 gcc_unreachable ();
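/* For illustration: a 16-byte TFmode value occupies a single 128-bit
   vector register (UNITS_PER_VREG == 16) but two X registers
   (UNITS_PER_WORD == 8), so the answer depends on the register class
   of REGNO.  */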
675 /* Implement HARD_REGNO_MODE_OK. */
678 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
680 if (GET_MODE_CLASS (mode) == MODE_CC)
681 return regno == CC_REGNUM;
683 if (regno == SP_REGNUM)
684 /* The purpose of comparing with ptr_mode is to support the
685 global register variable associated with the stack pointer
686 register via the syntax of asm ("wsp") in ILP32. */
687 return mode == Pmode || mode == ptr_mode;
689 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
690 return mode == Pmode;
692 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
693 return 1;
695 if (FP_REGNUM_P (regno))
697 if (aarch64_vect_struct_mode_p (mode))
698 return
699 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
700 else
701 return 1;
704 return 0;
707 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
708 machine_mode
709 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
710 machine_mode mode)
712 /* Handle modes that fit within single registers. */
713 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
715 if (GET_MODE_SIZE (mode) >= 4)
716 return mode;
717 else
718 return SImode;
720 /* Fall back to generic for multi-reg and very large modes. */
721 else
722 return choose_hard_reg_mode (regno, nregs, false);
725 /* Return true if calls to DECL should be treated as
726 long-calls (ie called via a register). */
727 static bool
728 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
730 return false;
733 /* Return true if calls to symbol-ref SYM should be treated as
734 long-calls (ie called via a register). */
735 bool
736 aarch64_is_long_call_p (rtx sym)
738 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
741 /* Return true if the offsets to a zero/sign-extract operation
742 represent an expression that matches an extend operation. The
743 operands represent the parameters from
745 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
746 bool
747 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
748 rtx extract_imm)
750 HOST_WIDE_INT mult_val, extract_val;
752 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
753 return false;
755 mult_val = INTVAL (mult_imm);
756 extract_val = INTVAL (extract_imm);
758 if (extract_val > 8
759 && extract_val < GET_MODE_BITSIZE (mode)
760 && exact_log2 (extract_val & ~7) > 0
761 && (extract_val & 7) <= 4
762 && mult_val == (1 << (extract_val & 7)))
763 return true;
765 return false;
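/* For illustration, with arbitrarily chosen values: MULT_IMM == 4 and
   EXTRACT_IMM == 34 satisfy the test above, since extracting the low 34
   bits of (reg * 4) is the same as zero-extending the low 32 bits of REG
   and shifting the result left by 2, i.e. a "uxtw #2" style
   extend-and-shift operand.  */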
768 /* Emit an insn that's a simple single-set. Both the operands must be
769 known to be valid. */
770 inline static rtx
771 emit_set_insn (rtx x, rtx y)
773 return emit_insn (gen_rtx_SET (x, y));
776 /* X and Y are two things to compare using CODE. Emit the compare insn and
777 return the rtx for register 0 in the proper mode. */
779 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
781 machine_mode mode = SELECT_CC_MODE (code, x, y);
782 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
784 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
785 return cc_reg;
788 /* Build the SYMBOL_REF for __tls_get_addr. */
790 static GTY(()) rtx tls_get_addr_libfunc;
793 aarch64_tls_get_addr (void)
795 if (!tls_get_addr_libfunc)
796 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
797 return tls_get_addr_libfunc;
800 /* Return the TLS model to use for ADDR. */
802 static enum tls_model
803 tls_symbolic_operand_type (rtx addr)
805 enum tls_model tls_kind = TLS_MODEL_NONE;
806 rtx sym, addend;
808 if (GET_CODE (addr) == CONST)
810 split_const (addr, &sym, &addend);
811 if (GET_CODE (sym) == SYMBOL_REF)
812 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
814 else if (GET_CODE (addr) == SYMBOL_REF)
815 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
817 return tls_kind;
820 /* We'll allow lo_sum's in addresses in our legitimate addresses
821 so that combine can take care of combining addresses where
822 necessary, but for generation purposes, we'll generate the address
823 as:
824 RTL Absolute
825 tmp = hi (symbol_ref); adrp x1, foo
826 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
829 PIC TLS
830 adrp x1, :got:foo adrp tmp, :tlsgd:foo
831 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
832 bl __tls_get_addr
835 Load TLS symbol, depending on TLS mechanism and TLS access model.
837 Global Dynamic - Traditional TLS:
838 adrp tmp, :tlsgd:imm
839 add dest, tmp, #:tlsgd_lo12:imm
840 bl __tls_get_addr
842 Global Dynamic - TLS Descriptors:
843 adrp dest, :tlsdesc:imm
844 ldr tmp, [dest, #:tlsdesc_lo12:imm]
845 add dest, dest, #:tlsdesc_lo12:imm
846 blr tmp
847 mrs tp, tpidr_el0
848 add dest, dest, tp
850 Initial Exec:
851 mrs tp, tpidr_el0
852 adrp tmp, :gottprel:imm
853 ldr dest, [tmp, #:gottprel_lo12:imm]
854 add dest, dest, tp
856 Local Exec:
857 mrs tp, tpidr_el0
858 add t0, tp, #:tprel_hi12:imm, lsl #12
859 add t0, t0, #:tprel_lo12_nc:imm
862 static void
863 aarch64_load_symref_appropriately (rtx dest, rtx imm,
864 enum aarch64_symbol_type type)
866 switch (type)
868 case SYMBOL_SMALL_ABSOLUTE:
870 /* In ILP32, the mode of dest can be either SImode or DImode. */
871 rtx tmp_reg = dest;
872 machine_mode mode = GET_MODE (dest);
874 gcc_assert (mode == Pmode || mode == ptr_mode);
876 if (can_create_pseudo_p ())
877 tmp_reg = gen_reg_rtx (mode);
879 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
880 emit_insn (gen_add_losym (dest, tmp_reg, imm));
881 return;
884 case SYMBOL_TINY_ABSOLUTE:
885 emit_insn (gen_rtx_SET (dest, imm));
886 return;
888 case SYMBOL_SMALL_GOT_28K:
890 machine_mode mode = GET_MODE (dest);
891 rtx gp_rtx = pic_offset_table_rtx;
893 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
894 here before rtl expansion. Tree IVOPTs will generate an rtl pattern
895 to compute rtx costs, in which case pic_offset_table_rtx is not
896 initialized. In that case there is no need to generate the first
897 adrp instruction, as the final cost for global variable access is
898 one instruction. */
899 if (gp_rtx != NULL)
901 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
902 use the page base as the GOT base, the first page may be wasted; in
903 the worst case there is only 28K of space for the GOT).
905 The generated instruction sequence for accessing a global variable is:
908 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
910 Only one instruction is needed. But we must initialize
911 pic_offset_table_rtx properly. We generate an initialization insn for
912 every global access, and let CSE remove all the redundant ones.
914 The final instruction sequence will look like the following
915 when several global variables are accessed.
917 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
919 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
920 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
921 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
922 ... */
924 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
925 crtl->uses_pic_offset_table = 1;
926 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
928 if (mode != GET_MODE (gp_rtx))
929 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
932 if (mode == ptr_mode)
934 if (mode == DImode)
935 emit_insn (gen_ldr_got_small_28k_di (dest, gp_rtx, imm));
936 else
937 emit_insn (gen_ldr_got_small_28k_si (dest, gp_rtx, imm));
939 else
941 gcc_assert (mode == Pmode);
942 emit_insn (gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm));
945 return;
948 case SYMBOL_SMALL_GOT_4G:
950 /* In ILP32, the mode of dest can be either SImode or DImode,
951 while the got entry is always of SImode size. The mode of
952 dest depends on how dest is used: if dest is assigned to a
953 pointer (e.g. in the memory), it has SImode; it may have
954 DImode if dest is dereferenced to access the memory.
955 This is why we have to handle three different ldr_got_small
956 patterns here (two patterns for ILP32). */
957 rtx tmp_reg = dest;
958 machine_mode mode = GET_MODE (dest);
960 if (can_create_pseudo_p ())
961 tmp_reg = gen_reg_rtx (mode);
963 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
964 if (mode == ptr_mode)
966 if (mode == DImode)
967 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
968 else
969 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
971 else
973 gcc_assert (mode == Pmode);
974 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
977 return;
980 case SYMBOL_SMALL_TLSGD:
982 rtx_insn *insns;
983 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
985 start_sequence ();
986 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
987 insns = get_insns ();
988 end_sequence ();
990 RTL_CONST_CALL_P (insns) = 1;
991 emit_libcall_block (insns, dest, result, imm);
992 return;
995 case SYMBOL_SMALL_TLSDESC:
997 machine_mode mode = GET_MODE (dest);
998 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
999 rtx tp;
1001 gcc_assert (mode == Pmode || mode == ptr_mode);
1003 /* In ILP32, the got entry is always of SImode size. Unlike
1004 small GOT, the dest is fixed at reg 0. */
1005 if (TARGET_ILP32)
1006 emit_insn (gen_tlsdesc_small_si (imm));
1007 else
1008 emit_insn (gen_tlsdesc_small_di (imm));
1009 tp = aarch64_load_tp (NULL);
1011 if (mode != Pmode)
1012 tp = gen_lowpart (mode, tp);
1014 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1015 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1016 return;
1019 case SYMBOL_SMALL_GOTTPREL:
1021 /* In ILP32, the mode of dest can be either SImode or DImode,
1022 while the got entry is always of SImode size. The mode of
1023 dest depends on how dest is used: if dest is assigned to a
1024 pointer (e.g. in the memory), it has SImode; it may have
1025 DImode if dest is dereferenced to access the memory.
1026 This is why we have to handle three different tlsie_small
1027 patterns here (two patterns for ILP32). */
1028 machine_mode mode = GET_MODE (dest);
1029 rtx tmp_reg = gen_reg_rtx (mode);
1030 rtx tp = aarch64_load_tp (NULL);
1032 if (mode == ptr_mode)
1034 if (mode == DImode)
1035 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1036 else
1038 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1039 tp = gen_lowpart (mode, tp);
1042 else
1044 gcc_assert (mode == Pmode);
1045 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1048 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1049 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1050 return;
1053 case SYMBOL_TLSLE:
1055 rtx tp = aarch64_load_tp (NULL);
1057 if (GET_MODE (dest) != Pmode)
1058 tp = gen_lowpart (GET_MODE (dest), tp);
1060 emit_insn (gen_tlsle (dest, tp, imm));
1061 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1062 return;
1065 case SYMBOL_TINY_GOT:
1066 emit_insn (gen_ldr_got_tiny (dest, imm));
1067 return;
1069 default:
1070 gcc_unreachable ();
1074 /* Emit a move from SRC to DEST. Assume that the move expanders can
1075 handle all moves if !can_create_pseudo_p (). The distinction is
1076 important because, unlike emit_move_insn, the move expanders know
1077 how to force Pmode objects into the constant pool even when the
1078 constant pool address is not itself legitimate. */
1079 static rtx
1080 aarch64_emit_move (rtx dest, rtx src)
1082 return (can_create_pseudo_p ()
1083 ? emit_move_insn (dest, src)
1084 : emit_move_insn_1 (dest, src));
1087 /* Split a 128-bit move operation into two 64-bit move operations,
1088 taking care to handle partial overlap of register to register
1089 copies. Special cases are needed when moving between GP regs and
1090 FP regs. SRC can be a register, constant or memory; DST a register
1091 or memory. If either operand is memory it must not have any side
1092 effects. */
1093 void
1094 aarch64_split_128bit_move (rtx dst, rtx src)
1096 rtx dst_lo, dst_hi;
1097 rtx src_lo, src_hi;
1099 machine_mode mode = GET_MODE (dst);
1101 gcc_assert (mode == TImode || mode == TFmode);
1102 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1103 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1105 if (REG_P (dst) && REG_P (src))
1107 int src_regno = REGNO (src);
1108 int dst_regno = REGNO (dst);
1110 /* Handle FP <-> GP regs. */
1111 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1113 src_lo = gen_lowpart (word_mode, src);
1114 src_hi = gen_highpart (word_mode, src);
1116 if (mode == TImode)
1118 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1119 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1121 else
1123 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1124 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1126 return;
1128 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1130 dst_lo = gen_lowpart (word_mode, dst);
1131 dst_hi = gen_highpart (word_mode, dst);
1133 if (mode == TImode)
1135 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1136 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1138 else
1140 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1141 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1143 return;
1147 dst_lo = gen_lowpart (word_mode, dst);
1148 dst_hi = gen_highpart (word_mode, dst);
1149 src_lo = gen_lowpart (word_mode, src);
1150 src_hi = gen_highpart_mode (word_mode, mode, src);
1152 /* At most one pairing may overlap. */
1153 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1155 aarch64_emit_move (dst_hi, src_hi);
1156 aarch64_emit_move (dst_lo, src_lo);
1158 else
1160 aarch64_emit_move (dst_lo, src_lo);
1161 aarch64_emit_move (dst_hi, src_hi);
1165 bool
1166 aarch64_split_128bit_move_p (rtx dst, rtx src)
1168 return (! REG_P (src)
1169 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1172 /* Split a complex SIMD combine. */
1174 void
1175 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1177 machine_mode src_mode = GET_MODE (src1);
1178 machine_mode dst_mode = GET_MODE (dst);
1180 gcc_assert (VECTOR_MODE_P (dst_mode));
1182 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1184 rtx (*gen) (rtx, rtx, rtx);
1186 switch (src_mode)
1188 case V8QImode:
1189 gen = gen_aarch64_simd_combinev8qi;
1190 break;
1191 case V4HImode:
1192 gen = gen_aarch64_simd_combinev4hi;
1193 break;
1194 case V2SImode:
1195 gen = gen_aarch64_simd_combinev2si;
1196 break;
1197 case V2SFmode:
1198 gen = gen_aarch64_simd_combinev2sf;
1199 break;
1200 case DImode:
1201 gen = gen_aarch64_simd_combinedi;
1202 break;
1203 case DFmode:
1204 gen = gen_aarch64_simd_combinedf;
1205 break;
1206 default:
1207 gcc_unreachable ();
1210 emit_insn (gen (dst, src1, src2));
1211 return;
1215 /* Split a complex SIMD move. */
1217 void
1218 aarch64_split_simd_move (rtx dst, rtx src)
1220 machine_mode src_mode = GET_MODE (src);
1221 machine_mode dst_mode = GET_MODE (dst);
1223 gcc_assert (VECTOR_MODE_P (dst_mode));
1225 if (REG_P (dst) && REG_P (src))
1227 rtx (*gen) (rtx, rtx);
1229 gcc_assert (VECTOR_MODE_P (src_mode));
1231 switch (src_mode)
1233 case V16QImode:
1234 gen = gen_aarch64_split_simd_movv16qi;
1235 break;
1236 case V8HImode:
1237 gen = gen_aarch64_split_simd_movv8hi;
1238 break;
1239 case V4SImode:
1240 gen = gen_aarch64_split_simd_movv4si;
1241 break;
1242 case V2DImode:
1243 gen = gen_aarch64_split_simd_movv2di;
1244 break;
1245 case V4SFmode:
1246 gen = gen_aarch64_split_simd_movv4sf;
1247 break;
1248 case V2DFmode:
1249 gen = gen_aarch64_split_simd_movv2df;
1250 break;
1251 default:
1252 gcc_unreachable ();
1255 emit_insn (gen (dst, src));
1256 return;
1260 static rtx
1261 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1263 if (can_create_pseudo_p ())
1264 return force_reg (mode, value);
1265 else
1267 x = aarch64_emit_move (x, value);
1268 return x;
1273 static rtx
1274 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1276 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1278 rtx high;
1279 /* Load the full offset into a register. This
1280 might be improvable in the future. */
1281 high = GEN_INT (offset);
1282 offset = 0;
1283 high = aarch64_force_temporary (mode, temp, high);
1284 reg = aarch64_force_temporary (mode, temp,
1285 gen_rtx_PLUS (mode, high, reg));
1287 return plus_constant (mode, reg, offset);
1290 static int
1291 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1292 machine_mode mode)
1294 unsigned HOST_WIDE_INT mask;
1295 int i;
1296 bool first;
1297 unsigned HOST_WIDE_INT val;
1298 bool subtargets;
1299 rtx subtarget;
1300 int one_match, zero_match, first_not_ffff_match;
1301 int num_insns = 0;
1303 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1305 if (generate)
1306 emit_insn (gen_rtx_SET (dest, imm));
1307 num_insns++;
1308 return num_insns;
1311 if (mode == SImode)
1313 /* We know we can't do this in 1 insn, and we must be able to do it
1314 in two; so don't mess around looking for sequences that don't buy
1315 us anything. */
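/* For example, an SImode constant such as 0x12345678 becomes
   mov w0, #0x5678 followed by movk w0, #0x1234, lsl #16.  */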
1316 if (generate)
1318 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1319 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1320 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1322 num_insns += 2;
1323 return num_insns;
1326 /* Remaining cases are all for DImode. */
1328 val = INTVAL (imm);
1329 subtargets = optimize && can_create_pseudo_p ();
1331 one_match = 0;
1332 zero_match = 0;
1333 mask = 0xffff;
1334 first_not_ffff_match = -1;
1336 for (i = 0; i < 64; i += 16, mask <<= 16)
1338 if ((val & mask) == mask)
1339 one_match++;
1340 else
1342 if (first_not_ffff_match < 0)
1343 first_not_ffff_match = i;
1344 if ((val & mask) == 0)
1345 zero_match++;
1349 if (one_match == 2)
1351 /* Set one of the quarters and then insert back into result. */
1352 mask = 0xffffll << first_not_ffff_match;
1353 if (generate)
1355 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1356 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1357 GEN_INT ((val >> first_not_ffff_match)
1358 & 0xffff)));
1360 num_insns += 2;
1361 return num_insns;
1364 if (zero_match == 2)
1365 goto simple_sequence;
1367 mask = 0x0ffff0000UL;
1368 for (i = 16; i < 64; i += 16, mask <<= 16)
1370 HOST_WIDE_INT comp = mask & ~(mask - 1);
1372 if (aarch64_uimm12_shift (val - (val & mask)))
1374 if (generate)
1376 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1377 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1378 emit_insn (gen_adddi3 (dest, subtarget,
1379 GEN_INT (val - (val & mask))));
1381 num_insns += 2;
1382 return num_insns;
1384 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1386 if (generate)
1388 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1389 emit_insn (gen_rtx_SET (subtarget,
1390 GEN_INT ((val + comp) & mask)));
1391 emit_insn (gen_adddi3 (dest, subtarget,
1392 GEN_INT (val - ((val + comp) & mask))));
1394 num_insns += 2;
1395 return num_insns;
1397 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1399 if (generate)
1401 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1402 emit_insn (gen_rtx_SET (subtarget,
1403 GEN_INT ((val - comp) | ~mask)));
1404 emit_insn (gen_adddi3 (dest, subtarget,
1405 GEN_INT (val - ((val - comp) | ~mask))));
1407 num_insns += 2;
1408 return num_insns;
1410 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1412 if (generate)
1414 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1415 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1416 emit_insn (gen_adddi3 (dest, subtarget,
1417 GEN_INT (val - (val | ~mask))));
1419 num_insns += 2;
1420 return num_insns;
1424 /* See if we can do it by arithmetically combining two
1425 immediates. */
1426 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1428 int j;
1429 mask = 0xffff;
1431 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1432 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1434 if (generate)
1436 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1437 emit_insn (gen_rtx_SET (subtarget,
1438 GEN_INT (aarch64_bitmasks[i])));
1439 emit_insn (gen_adddi3 (dest, subtarget,
1440 GEN_INT (val - aarch64_bitmasks[i])));
1442 num_insns += 2;
1443 return num_insns;
1446 for (j = 0; j < 64; j += 16, mask <<= 16)
1448 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1450 if (generate)
1452 emit_insn (gen_rtx_SET (dest,
1453 GEN_INT (aarch64_bitmasks[i])));
1454 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1455 GEN_INT ((val >> j) & 0xffff)));
1457 num_insns += 2;
1458 return num_insns;
1463 /* See if we can do it by logically combining two immediates. */
1464 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1466 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1468 int j;
1470 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1471 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1473 if (generate)
1475 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1476 emit_insn (gen_rtx_SET (subtarget,
1477 GEN_INT (aarch64_bitmasks[i])));
1478 emit_insn (gen_iordi3 (dest, subtarget,
1479 GEN_INT (aarch64_bitmasks[j])));
1481 num_insns += 2;
1482 return num_insns;
1485 else if ((val & aarch64_bitmasks[i]) == val)
1487 int j;
1489 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1490 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1492 if (generate)
1494 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1495 emit_insn (gen_rtx_SET (subtarget,
1496 GEN_INT (aarch64_bitmasks[j])));
1497 emit_insn (gen_anddi3 (dest, subtarget,
1498 GEN_INT (aarch64_bitmasks[i])));
1500 num_insns += 2;
1501 return num_insns;
1506 if (one_match > zero_match)
1508 /* Set either first three quarters or all but the third. */
1509 mask = 0xffffll << (16 - first_not_ffff_match);
1510 if (generate)
1511 emit_insn (gen_rtx_SET (dest,
1512 GEN_INT (val | mask | 0xffffffff00000000ull)));
1513 num_insns ++;
1515 /* Now insert the other two quarters. */
1516 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1517 i < 64; i += 16, mask <<= 16)
1519 if ((val & mask) != mask)
1521 if (generate)
1522 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1523 GEN_INT ((val >> i) & 0xffff)));
1524 num_insns ++;
1527 return num_insns;
1530 simple_sequence:
1531 first = true;
1532 mask = 0xffff;
1533 for (i = 0; i < 64; i += 16, mask <<= 16)
1535 if ((val & mask) != 0)
1537 if (first)
1539 if (generate)
1540 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1541 num_insns ++;
1542 first = false;
1544 else
1546 if (generate)
1547 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1548 GEN_INT ((val >> i) & 0xffff)));
1549 num_insns ++;
1554 return num_insns;
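/* For illustration, the simple_sequence fallback above materializes an
   arbitrary DImode constant such as 0x1234567890abcdef as
     mov  x0, #0xcdef
     movk x0, #0x90ab, lsl #16
     movk x0, #0x5678, lsl #32
     movk x0, #0x1234, lsl #48
   i.e. at most four instructions, one per non-zero 16-bit quarter.  */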
1558 void
1559 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1561 machine_mode mode = GET_MODE (dest);
1563 gcc_assert (mode == SImode || mode == DImode);
1565 /* Check on what type of symbol it is. */
1566 if (GET_CODE (imm) == SYMBOL_REF
1567 || GET_CODE (imm) == LABEL_REF
1568 || GET_CODE (imm) == CONST)
1570 rtx mem, base, offset;
1571 enum aarch64_symbol_type sty;
1573 /* If we have (const (plus symbol offset)), separate out the offset
1574 before we start classifying the symbol. */
1575 split_const (imm, &base, &offset);
1577 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1578 switch (sty)
1580 case SYMBOL_FORCE_TO_MEM:
1581 if (offset != const0_rtx
1582 && targetm.cannot_force_const_mem (mode, imm))
1584 gcc_assert (can_create_pseudo_p ());
1585 base = aarch64_force_temporary (mode, dest, base);
1586 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1587 aarch64_emit_move (dest, base);
1588 return;
1590 mem = force_const_mem (ptr_mode, imm);
1591 gcc_assert (mem);
1592 if (mode != ptr_mode)
1593 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1594 emit_insn (gen_rtx_SET (dest, mem));
1595 return;
1597 case SYMBOL_SMALL_TLSGD:
1598 case SYMBOL_SMALL_TLSDESC:
1599 case SYMBOL_SMALL_GOTTPREL:
1600 case SYMBOL_SMALL_GOT_28K:
1601 case SYMBOL_SMALL_GOT_4G:
1602 case SYMBOL_TINY_GOT:
1603 if (offset != const0_rtx)
1605 gcc_assert (can_create_pseudo_p ());
1606 base = aarch64_force_temporary (mode, dest, base);
1607 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1608 aarch64_emit_move (dest, base);
1609 return;
1611 /* FALLTHRU */
1613 case SYMBOL_SMALL_ABSOLUTE:
1614 case SYMBOL_TINY_ABSOLUTE:
1615 case SYMBOL_TLSLE:
1616 aarch64_load_symref_appropriately (dest, imm, sty);
1617 return;
1619 default:
1620 gcc_unreachable ();
1624 if (!CONST_INT_P (imm))
1626 if (GET_CODE (imm) == HIGH)
1627 emit_insn (gen_rtx_SET (dest, imm));
1628 else
1630 rtx mem = force_const_mem (mode, imm);
1631 gcc_assert (mem);
1632 emit_insn (gen_rtx_SET (dest, mem));
1635 return;
1638 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1641 static bool
1642 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1643 tree exp ATTRIBUTE_UNUSED)
1645 /* Currently, always true. */
1646 return true;
1649 /* Implement TARGET_PASS_BY_REFERENCE. */
1651 static bool
1652 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1653 machine_mode mode,
1654 const_tree type,
1655 bool named ATTRIBUTE_UNUSED)
1657 HOST_WIDE_INT size;
1658 machine_mode dummymode;
1659 int nregs;
1661 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1662 size = (mode == BLKmode && type)
1663 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1665 /* Aggregates are passed by reference based on their size. */
1666 if (type && AGGREGATE_TYPE_P (type))
1668 size = int_size_in_bytes (type);
1671 /* Variable-sized arguments are always passed by reference. */
1672 if (size < 0)
1673 return true;
1675 /* Can this be a candidate to be passed in fp/simd register(s)? */
1676 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1677 &dummymode, &nregs,
1678 NULL))
1679 return false;
1681 /* Arguments which are variable sized or larger than 2 registers are
1682 passed by reference unless they are a homogenous floating point
1683 aggregate. */
1684 return size > 2 * UNITS_PER_WORD;
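/* For illustration: a plain 24-byte structure is passed by reference,
   while a 32-byte homogeneous aggregate of four doubles is not, since it
   qualifies for the SIMD/FP registers and is caught by the check above.  */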
1687 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1688 static bool
1689 aarch64_return_in_msb (const_tree valtype)
1691 machine_mode dummy_mode;
1692 int dummy_int;
1694 /* Never happens in little-endian mode. */
1695 if (!BYTES_BIG_ENDIAN)
1696 return false;
1698 /* Only composite types smaller than or equal to 16 bytes can
1699 be potentially returned in registers. */
1700 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1701 || int_size_in_bytes (valtype) <= 0
1702 || int_size_in_bytes (valtype) > 16)
1703 return false;
1705 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1706 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1707 is always passed/returned in the least significant bits of fp/simd
1708 register(s). */
1709 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1710 &dummy_mode, &dummy_int, NULL))
1711 return false;
1713 return true;
1716 /* Implement TARGET_FUNCTION_VALUE.
1717 Define how to find the value returned by a function. */
1719 static rtx
1720 aarch64_function_value (const_tree type, const_tree func,
1721 bool outgoing ATTRIBUTE_UNUSED)
1723 machine_mode mode;
1724 int unsignedp;
1725 int count;
1726 machine_mode ag_mode;
1728 mode = TYPE_MODE (type);
1729 if (INTEGRAL_TYPE_P (type))
1730 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1732 if (aarch64_return_in_msb (type))
1734 HOST_WIDE_INT size = int_size_in_bytes (type);
1736 if (size % UNITS_PER_WORD != 0)
1738 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1739 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1743 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1744 &ag_mode, &count, NULL))
1746 if (!aarch64_composite_type_p (type, mode))
1748 gcc_assert (count == 1 && mode == ag_mode);
1749 return gen_rtx_REG (mode, V0_REGNUM);
1751 else
1753 int i;
1754 rtx par;
1756 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1757 for (i = 0; i < count; i++)
1759 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1761 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1762 XVECEXP (par, 0, i) = tmp;
1764 return par;
1767 else
1768 return gen_rtx_REG (mode, R0_REGNUM);
1771 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1772 Return true if REGNO is the number of a hard register in which the values
1773 of called function may come back. */
1775 static bool
1776 aarch64_function_value_regno_p (const unsigned int regno)
1778 /* Maximum of 16 bytes can be returned in the general registers. Examples
1779 of 16-byte return values are: 128-bit integers and 16-byte small
1780 structures (excluding homogeneous floating-point aggregates). */
1781 if (regno == R0_REGNUM || regno == R1_REGNUM)
1782 return true;
1784 /* Up to four fp/simd registers can return a function value, e.g. a
1785 homogeneous floating-point aggregate having four members. */
1786 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1787 return TARGET_FLOAT;
1789 return false;
1792 /* Implement TARGET_RETURN_IN_MEMORY.
1794 If the type T of the result of a function is such that
1795 void func (T arg)
1796 would require that arg be passed as a value in a register (or set of
1797 registers) according to the parameter passing rules, then the result
1798 is returned in the same registers as would be used for such an
1799 argument. */
1801 static bool
1802 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1804 HOST_WIDE_INT size;
1805 machine_mode ag_mode;
1806 int count;
1808 if (!AGGREGATE_TYPE_P (type)
1809 && TREE_CODE (type) != COMPLEX_TYPE
1810 && TREE_CODE (type) != VECTOR_TYPE)
1811 /* Simple scalar types always returned in registers. */
1812 return false;
1814 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1815 type,
1816 &ag_mode,
1817 &count,
1818 NULL))
1819 return false;
1821 /* Types larger than 2 registers returned in memory. */
1822 size = int_size_in_bytes (type);
1823 return (size < 0 || size > 2 * UNITS_PER_WORD);
1826 static bool
1827 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1828 const_tree type, int *nregs)
1830 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1831 return aarch64_vfp_is_call_or_return_candidate (mode,
1832 type,
1833 &pcum->aapcs_vfp_rmode,
1834 nregs,
1835 NULL);
1838 /* Given MODE and TYPE of a function argument, return the alignment in
1839 bits. The idea is to suppress any stronger alignment requested by
1840 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1841 This is a helper function for local use only. */
1843 static unsigned int
1844 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1846 unsigned int alignment;
1848 if (type)
1850 if (!integer_zerop (TYPE_SIZE (type)))
1852 if (TYPE_MODE (type) == mode)
1853 alignment = TYPE_ALIGN (type);
1854 else
1855 alignment = GET_MODE_ALIGNMENT (mode);
1857 else
1858 alignment = 0;
1860 else
1861 alignment = GET_MODE_ALIGNMENT (mode);
1863 return alignment;
1866 /* Layout a function argument according to the AAPCS64 rules. The rule
1867 numbers refer to the rule numbers in the AAPCS64. */
1869 static void
1870 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1871 const_tree type,
1872 bool named ATTRIBUTE_UNUSED)
1874 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1875 int ncrn, nvrn, nregs;
1876 bool allocate_ncrn, allocate_nvrn;
1877 HOST_WIDE_INT size;
1879 /* We need to do this once per argument. */
1880 if (pcum->aapcs_arg_processed)
1881 return;
1883 pcum->aapcs_arg_processed = true;
1885 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1886 size
1887 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1888 UNITS_PER_WORD);
1890 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1891 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1892 mode,
1893 type,
1894 &nregs);
1896 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1897 The following code thus handles passing by SIMD/FP registers first. */
1899 nvrn = pcum->aapcs_nvrn;
1901 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1902 and homogeneous short-vector aggregates (HVA). */
1903 if (allocate_nvrn)
1905 if (!TARGET_FLOAT)
1906 aarch64_err_no_fpadvsimd (mode, "argument");
1908 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1910 pcum->aapcs_nextnvrn = nvrn + nregs;
1911 if (!aarch64_composite_type_p (type, mode))
1913 gcc_assert (nregs == 1);
1914 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1916 else
1918 rtx par;
1919 int i;
1920 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1921 for (i = 0; i < nregs; i++)
1923 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1924 V0_REGNUM + nvrn + i);
1925 tmp = gen_rtx_EXPR_LIST
1926 (VOIDmode, tmp,
1927 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1928 XVECEXP (par, 0, i) = tmp;
1930 pcum->aapcs_reg = par;
1932 return;
1934 else
1936 /* C.3 NSRN is set to 8. */
1937 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1938 goto on_stack;
1942 ncrn = pcum->aapcs_ncrn;
1943 nregs = size / UNITS_PER_WORD;
1945 /* C6 - C9, though the sign and zero extension semantics are
1946 handled elsewhere. This is the case where the argument fits
1947 entirely in general registers. */
1948 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1950 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1952 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1954 /* C.8 if the argument has an alignment of 16 then the NGRN is
1955 rounded up to the next even number. */
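/* For example, a 16-byte argument with 16-byte alignment arriving when
   NGRN is 1 is passed in x2 and x3, leaving x1 unused.  */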
1956 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1958 ++ncrn;
1959 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1961 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1962 A reg is still generated for it, but the caller should be smart
1963 enough not to use it. */
1964 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1966 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1968 else
1970 rtx par;
1971 int i;
1973 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1974 for (i = 0; i < nregs; i++)
1976 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1977 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1978 GEN_INT (i * UNITS_PER_WORD));
1979 XVECEXP (par, 0, i) = tmp;
1981 pcum->aapcs_reg = par;
1984 pcum->aapcs_nextncrn = ncrn + nregs;
1985 return;
1988 /* C.11 */
1989 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1991 /* The argument is passed on the stack; record the needed number of words for
1992 this argument and align the total size if necessary. */
1993 on_stack:
1994 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1995 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1996 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1997 16 / UNITS_PER_WORD);
1998 return;
2001 /* Implement TARGET_FUNCTION_ARG. */
2003 static rtx
2004 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2005 const_tree type, bool named)
2007 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2008 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2010 if (mode == VOIDmode)
2011 return NULL_RTX;
2013 aarch64_layout_arg (pcum_v, mode, type, named);
2014 return pcum->aapcs_reg;
2017 void
2018 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2019 const_tree fntype ATTRIBUTE_UNUSED,
2020 rtx libname ATTRIBUTE_UNUSED,
2021 const_tree fndecl ATTRIBUTE_UNUSED,
2022 unsigned n_named ATTRIBUTE_UNUSED)
2024 pcum->aapcs_ncrn = 0;
2025 pcum->aapcs_nvrn = 0;
2026 pcum->aapcs_nextncrn = 0;
2027 pcum->aapcs_nextnvrn = 0;
2028 pcum->pcs_variant = ARM_PCS_AAPCS64;
2029 pcum->aapcs_reg = NULL_RTX;
2030 pcum->aapcs_arg_processed = false;
2031 pcum->aapcs_stack_words = 0;
2032 pcum->aapcs_stack_size = 0;
2034 if (!TARGET_FLOAT
2035 && fndecl && TREE_PUBLIC (fndecl)
2036 && fntype && fntype != error_mark_node)
2038 const_tree type = TREE_TYPE (fntype);
2039 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2040 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2041 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2042 &mode, &nregs, NULL))
2043 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2045 return;
2048 static void
2049 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2050 machine_mode mode,
2051 const_tree type,
2052 bool named)
2054 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2055 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2057 aarch64_layout_arg (pcum_v, mode, type, named);
2058 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2059 != (pcum->aapcs_stack_words != 0));
2060 pcum->aapcs_arg_processed = false;
2061 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2062 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2063 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2064 pcum->aapcs_stack_words = 0;
2065 pcum->aapcs_reg = NULL_RTX;
2069 bool
2070 aarch64_function_arg_regno_p (unsigned regno)
2072 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2073 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2076 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2077 PARM_BOUNDARY bits of alignment, but will be given anything up
2078 to STACK_BOUNDARY bits if the type requires it. This makes sure
2079 that both before and after the layout of each argument, the Next
2080 Stacked Argument Address (NSAA) will have a minimum alignment of
2081 8 bytes. */
2083 static unsigned int
2084 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2086 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2088 if (alignment < PARM_BOUNDARY)
2089 alignment = PARM_BOUNDARY;
2090 if (alignment > STACK_BOUNDARY)
2091 alignment = STACK_BOUNDARY;
2092 return alignment;
2095 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2097 Return true if an argument passed on the stack should be padded upwards,
2098 i.e. if the least-significant byte of the stack slot has useful data.
2100 Small aggregate types are placed in the lowest memory address.
2102 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2104 bool
2105 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2107 /* On little-endian targets, the least significant byte of every stack
2108 argument is passed at the lowest byte address of the stack slot. */
2109 if (!BYTES_BIG_ENDIAN)
2110 return true;
2112 /* Otherwise, integral, floating-point and pointer types are padded downward:
2113 the least significant byte of a stack argument is passed at the highest
2114 byte address of the stack slot. */
2115 if (type
2116 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2117 || POINTER_TYPE_P (type))
2118 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2119 return false;
2121 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
2122 return true;
2125 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2127 It specifies the padding for the last (possibly the only)
2128 element of a block move between registers and memory. Viewing
2129 the block as it sits in memory, padding upward means that
2130 the last element is padded after its most significant byte,
2131 while with downward padding the last element is padded on
2132 its least significant byte side.
2134 Small aggregates and small complex types are always padded
2135 upwards.
2137 We don't need to worry about homogeneous floating-point or
2138 short-vector aggregates; their move is not affected by the
2139 padding direction determined here. Regardless of endianness,
2140 each element of such an aggregate is put in the least
2141 significant bits of an fp/simd register.
2143 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2144 register has useful data, and return the opposite if the most
2145 significant byte does. */
2147 bool
2148 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2149 bool first ATTRIBUTE_UNUSED)
2152 /* Small composite types are always padded upward. */
2153 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2155 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2156 : GET_MODE_SIZE (mode));
2157 if (size < 2 * UNITS_PER_WORD)
2158 return true;
2161 /* Otherwise, use the default padding. */
2162 return !BYTES_BIG_ENDIAN;
2165 static machine_mode
2166 aarch64_libgcc_cmp_return_mode (void)
2168 return SImode;
2171 static bool
2172 aarch64_frame_pointer_required (void)
2174 /* In aarch64_override_options_after_change
2175 flag_omit_leaf_frame_pointer turns off the frame pointer by
2176 default. Turn it back on now if we've not got a leaf
2177 function. */
2178 if (flag_omit_leaf_frame_pointer
2179 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2180 return true;
2182 return false;
2185 /* Mark the registers that need to be saved by the callee and calculate
2186 the size of the callee-saved registers area and frame record (both FP
2187 and LR may be omitted). */
2188 static void
2189 aarch64_layout_frame (void)
2191 HOST_WIDE_INT offset = 0;
2192 int regno;
2194 if (reload_completed && cfun->machine->frame.laid_out)
2195 return;
2197 #define SLOT_NOT_REQUIRED (-2)
2198 #define SLOT_REQUIRED (-1)
2200 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2201 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2203 /* First mark all the registers that really need to be saved... */
2204 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2205 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2207 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2208 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2210 /* ... that includes the eh data registers (if needed)... */
2211 if (crtl->calls_eh_return)
2212 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2213 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2214 = SLOT_REQUIRED;
2216 /* ... and any callee saved register that dataflow says is live. */
2217 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2218 if (df_regs_ever_live_p (regno)
2219 && (regno == R30_REGNUM
2220 || !call_used_regs[regno]))
2221 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2223 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2224 if (df_regs_ever_live_p (regno)
2225 && !call_used_regs[regno])
2226 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2228 if (frame_pointer_needed)
2230 /* FP and LR are placed in the linkage record. */
2231 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2232 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2233 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2234 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2235 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2236 offset += 2 * UNITS_PER_WORD;
2239 /* Now assign stack slots for them. */
2240 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2241 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2243 cfun->machine->frame.reg_offset[regno] = offset;
2244 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2245 cfun->machine->frame.wb_candidate1 = regno;
2246 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2247 cfun->machine->frame.wb_candidate2 = regno;
2248 offset += UNITS_PER_WORD;
2251 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2252 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2254 cfun->machine->frame.reg_offset[regno] = offset;
2255 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2256 cfun->machine->frame.wb_candidate1 = regno;
2257 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2258 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2259 cfun->machine->frame.wb_candidate2 = regno;
2260 offset += UNITS_PER_WORD;
2263 cfun->machine->frame.padding0 =
2264 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2265 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2267 cfun->machine->frame.saved_regs_size = offset;
2269 cfun->machine->frame.hard_fp_offset
2270 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2271 + get_frame_size ()
2272 + cfun->machine->frame.saved_regs_size,
2273 STACK_BOUNDARY / BITS_PER_UNIT);
2275 cfun->machine->frame.frame_size
2276 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2277 + crtl->outgoing_args_size,
2278 STACK_BOUNDARY / BITS_PER_UNIT);
2280 cfun->machine->frame.laid_out = true;
2283 static bool
2284 aarch64_register_saved_on_entry (int regno)
2286 return cfun->machine->frame.reg_offset[regno] >= 0;
2289 static unsigned
2290 aarch64_next_callee_save (unsigned regno, unsigned limit)
2292 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2293 regno ++;
2294 return regno;
2297 static void
2298 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2299 HOST_WIDE_INT adjustment)
2301 rtx base_rtx = stack_pointer_rtx;
2302 rtx insn, reg, mem;
2304 reg = gen_rtx_REG (mode, regno);
2305 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2306 plus_constant (Pmode, base_rtx, -adjustment));
2307 mem = gen_rtx_MEM (mode, mem);
2309 insn = emit_move_insn (mem, reg);
2310 RTX_FRAME_RELATED_P (insn) = 1;
2313 static rtx
2314 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2315 HOST_WIDE_INT adjustment)
2317 switch (mode)
2319 case DImode:
2320 return gen_storewb_pairdi_di (base, base, reg, reg2,
2321 GEN_INT (-adjustment),
2322 GEN_INT (UNITS_PER_WORD - adjustment));
2323 case DFmode:
2324 return gen_storewb_pairdf_di (base, base, reg, reg2,
2325 GEN_INT (-adjustment),
2326 GEN_INT (UNITS_PER_WORD - adjustment));
2327 default:
2328 gcc_unreachable ();
2332 static void
2333 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2334 unsigned regno2, HOST_WIDE_INT adjustment)
2336 rtx_insn *insn;
2337 rtx reg1 = gen_rtx_REG (mode, regno1);
2338 rtx reg2 = gen_rtx_REG (mode, regno2);
2340 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2341 reg2, adjustment));
2342 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2343 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2344 RTX_FRAME_RELATED_P (insn) = 1;
2347 static rtx
2348 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2349 HOST_WIDE_INT adjustment)
2351 switch (mode)
2353 case DImode:
2354 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2355 GEN_INT (UNITS_PER_WORD));
2356 case DFmode:
2357 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2358 GEN_INT (UNITS_PER_WORD));
2359 default:
2360 gcc_unreachable ();
2364 static rtx
2365 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2366 rtx reg2)
2368 switch (mode)
2370 case DImode:
2371 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2373 case DFmode:
2374 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2376 default:
2377 gcc_unreachable ();
2381 static rtx
2382 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2383 rtx mem2)
2385 switch (mode)
2387 case DImode:
2388 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2390 case DFmode:
2391 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2393 default:
2394 gcc_unreachable ();
2399 static void
2400 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2401 unsigned start, unsigned limit, bool skip_wb)
2403 rtx_insn *insn;
2404 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2405 ? gen_frame_mem : gen_rtx_MEM);
2406 unsigned regno;
2407 unsigned regno2;
2409 for (regno = aarch64_next_callee_save (start, limit);
2410 regno <= limit;
2411 regno = aarch64_next_callee_save (regno + 1, limit))
2413 rtx reg, mem;
2414 HOST_WIDE_INT offset;
2416 if (skip_wb
2417 && (regno == cfun->machine->frame.wb_candidate1
2418 || regno == cfun->machine->frame.wb_candidate2))
2419 continue;
2421 reg = gen_rtx_REG (mode, regno);
2422 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2423 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2424 offset));
2426 regno2 = aarch64_next_callee_save (regno + 1, limit);
2428 if (regno2 <= limit
2429 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2430 == cfun->machine->frame.reg_offset[regno2]))
2433 rtx reg2 = gen_rtx_REG (mode, regno2);
2434 rtx mem2;
2436 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2437 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2438 offset));
2439 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2440 reg2));
2442 /* The first part of a frame-related parallel insn is
2443 always assumed to be relevant to the frame
2444 calculations; subsequent parts are only
2445 frame-related if explicitly marked. */
2446 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2447 regno = regno2;
2449 else
2450 insn = emit_move_insn (mem, reg);
2452 RTX_FRAME_RELATED_P (insn) = 1;
2456 static void
2457 aarch64_restore_callee_saves (machine_mode mode,
2458 HOST_WIDE_INT start_offset, unsigned start,
2459 unsigned limit, bool skip_wb, rtx *cfi_ops)
2461 rtx base_rtx = stack_pointer_rtx;
2462 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2463 ? gen_frame_mem : gen_rtx_MEM);
2464 unsigned regno;
2465 unsigned regno2;
2466 HOST_WIDE_INT offset;
2468 for (regno = aarch64_next_callee_save (start, limit);
2469 regno <= limit;
2470 regno = aarch64_next_callee_save (regno + 1, limit))
2472 rtx reg, mem;
2474 if (skip_wb
2475 && (regno == cfun->machine->frame.wb_candidate1
2476 || regno == cfun->machine->frame.wb_candidate2))
2477 continue;
2479 reg = gen_rtx_REG (mode, regno);
2480 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2481 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2483 regno2 = aarch64_next_callee_save (regno + 1, limit);
2485 if (regno2 <= limit
2486 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2487 == cfun->machine->frame.reg_offset[regno2]))
2489 rtx reg2 = gen_rtx_REG (mode, regno2);
2490 rtx mem2;
2492 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2493 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2494 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2496 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2497 regno = regno2;
2499 else
2500 emit_move_insn (reg, mem);
2501 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2505 /* AArch64 stack frames generated by this compiler look like:
2507 +-------------------------------+
2509 | incoming stack arguments |
2511 +-------------------------------+
2512 | | <-- incoming stack pointer (aligned)
2513 | callee-allocated save area |
2514 | for register varargs |
2516 +-------------------------------+
2517 | local variables | <-- frame_pointer_rtx
2519 +-------------------------------+
2520 | padding0 | \
2521 +-------------------------------+ |
2522 | callee-saved registers | | frame.saved_regs_size
2523 +-------------------------------+ |
2524 | LR' | |
2525 +-------------------------------+ |
2526 | FP' | / <- hard_frame_pointer_rtx (aligned)
2527 +-------------------------------+
2528 | dynamic allocation |
2529 +-------------------------------+
2530 | padding |
2531 +-------------------------------+
2532 | outgoing stack arguments | <-- arg_pointer
2534 +-------------------------------+
2535 | | <-- stack_pointer_rtx (aligned)
2537 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2538 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2539 unchanged. */
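/* As a concrete example: with no varargs save area, 16 bytes of local
   variables, two callee-saved GPRs plus the FP/LR frame record and no
   outgoing arguments, aarch64_layout_frame computes saved_regs_size == 32,
   hard_fp_offset == 48 and frame_size == 48 (each value rounded up to the
   16-byte STACK_BOUNDARY).  */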
2541 /* Generate the prologue instructions for entry into a function.
2542 Establish the stack frame by decreasing the stack pointer with a
2543 properly calculated size and, if necessary, create a frame record
2544 filled with the values of LR and previous frame pointer. The
2545 current FP is also set up if it is in use. */
2547 void
2548 aarch64_expand_prologue (void)
2550 /* sub sp, sp, #<frame_size>
2551 stp {fp, lr}, [sp, #<frame_size> - 16]
2552 add fp, sp, #<frame_size> - hardfp_offset
2553 stp {cs_reg}, [fp, #-16] etc.
2555 sub sp, sp, <final_adjustment_if_any>
2557 HOST_WIDE_INT frame_size, offset;
2558 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2559 HOST_WIDE_INT hard_fp_offset;
2560 rtx_insn *insn;
2562 aarch64_layout_frame ();
2564 offset = frame_size = cfun->machine->frame.frame_size;
2565 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2566 fp_offset = frame_size - hard_fp_offset;
2568 if (flag_stack_usage_info)
2569 current_function_static_stack_size = frame_size;
2571 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
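/* (The ldp/stp offset is a signed 7-bit immediate scaled by the access
   size, i.e. -64*8 to 63*8 for 64-bit registers, hence -512 to 504.)  */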
2572 if (offset >= 512)
2574 /* When the frame has a large size, an initial decrease is done on
2575 the stack pointer to jump over the callee-allocated save area for
2576 register varargs, the local variable area and/or the callee-saved
2577 register area. This will allow the pre-index write-back
2578 store pair instructions to be used for setting up the stack frame
2579 efficiently. */
2580 offset = hard_fp_offset;
2581 if (offset >= 512)
2582 offset = cfun->machine->frame.saved_regs_size;
2584 frame_size -= (offset + crtl->outgoing_args_size);
2585 fp_offset = 0;
2587 if (frame_size >= 0x1000000)
2589 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2590 emit_move_insn (op0, GEN_INT (-frame_size));
2591 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2593 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2594 gen_rtx_SET (stack_pointer_rtx,
2595 plus_constant (Pmode, stack_pointer_rtx,
2596 -frame_size)));
2597 RTX_FRAME_RELATED_P (insn) = 1;
2599 else if (frame_size > 0)
2601 int hi_ofs = frame_size & 0xfff000;
2602 int lo_ofs = frame_size & 0x000fff;
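/* For example, a frame_size of 0x12345 is split into hi_ofs == 0x12000
   and lo_ofs == 0x345; each part fits the 12-bit (optionally shifted)
   immediate of a single add/sub instruction.  */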
2604 if (hi_ofs)
2606 insn = emit_insn (gen_add2_insn
2607 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2608 RTX_FRAME_RELATED_P (insn) = 1;
2610 if (lo_ofs)
2612 insn = emit_insn (gen_add2_insn
2613 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2614 RTX_FRAME_RELATED_P (insn) = 1;
2618 else
2619 frame_size = -1;
2621 if (offset > 0)
2623 bool skip_wb = false;
2625 if (frame_pointer_needed)
2627 skip_wb = true;
2629 if (fp_offset)
2631 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2632 GEN_INT (-offset)));
2633 RTX_FRAME_RELATED_P (insn) = 1;
2635 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2636 R30_REGNUM, false);
2638 else
2639 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2641 /* Set up frame pointer to point to the location of the
2642 previous frame pointer on the stack. */
2643 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2644 stack_pointer_rtx,
2645 GEN_INT (fp_offset)));
2646 RTX_FRAME_RELATED_P (insn) = 1;
2647 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2649 else
2651 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2652 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2654 if (fp_offset
2655 || reg1 == FIRST_PSEUDO_REGISTER
2656 || (reg2 == FIRST_PSEUDO_REGISTER
2657 && offset >= 256))
2659 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2660 GEN_INT (-offset)));
2661 RTX_FRAME_RELATED_P (insn) = 1;
2663 else
2665 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2667 skip_wb = true;
2669 if (reg2 == FIRST_PSEUDO_REGISTER)
2670 aarch64_pushwb_single_reg (mode1, reg1, offset);
2671 else
2672 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2676 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2677 skip_wb);
2678 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2679 skip_wb);
2682 /* When offset >= 512,
2683 sub sp, sp, #<outgoing_args_size> */
2684 if (frame_size > -1)
2686 if (crtl->outgoing_args_size > 0)
2688 insn = emit_insn (gen_add2_insn
2689 (stack_pointer_rtx,
2690 GEN_INT (- crtl->outgoing_args_size)));
2691 RTX_FRAME_RELATED_P (insn) = 1;
2696 /* Return TRUE if we can use a simple_return insn.
2698 This function checks whether the callee-saved stack is empty, which
2699 means no restore actions are needed. The pro_and_epilogue pass uses
2700 this to check whether the shrink-wrapping optimization is feasible. */
2702 bool
2703 aarch64_use_return_insn_p (void)
2705 if (!reload_completed)
2706 return false;
2708 if (crtl->profile)
2709 return false;
2711 aarch64_layout_frame ();
2713 return cfun->machine->frame.frame_size == 0;
2716 /* Generate the epilogue instructions for returning from a function. */
2717 void
2718 aarch64_expand_epilogue (bool for_sibcall)
2720 HOST_WIDE_INT frame_size, offset;
2721 HOST_WIDE_INT fp_offset;
2722 HOST_WIDE_INT hard_fp_offset;
2723 rtx_insn *insn;
2724 /* We need a memory barrier to prevent reads from the deallocated stack. */
2725 bool need_barrier_p = (get_frame_size () != 0
2726 || cfun->machine->frame.saved_varargs_size);
2728 aarch64_layout_frame ();
2730 offset = frame_size = cfun->machine->frame.frame_size;
2731 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2732 fp_offset = frame_size - hard_fp_offset;
2734 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2735 if (offset >= 512)
2737 offset = hard_fp_offset;
2738 if (offset >= 512)
2739 offset = cfun->machine->frame.saved_regs_size;
2741 frame_size -= (offset + crtl->outgoing_args_size);
2742 fp_offset = 0;
2743 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2745 insn = emit_insn (gen_add2_insn
2746 (stack_pointer_rtx,
2747 GEN_INT (crtl->outgoing_args_size)));
2748 RTX_FRAME_RELATED_P (insn) = 1;
2751 else
2752 frame_size = -1;
2754 /* If there were outgoing arguments or we've done dynamic stack
2755 allocation, then restore the stack pointer from the frame
2756 pointer. This is at most one insn and more efficient than using
2757 GCC's internal mechanism. */
2758 if (frame_pointer_needed
2759 && (crtl->outgoing_args_size || cfun->calls_alloca))
2761 if (cfun->calls_alloca)
2762 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2764 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2765 hard_frame_pointer_rtx,
2766 GEN_INT (0)));
2767 offset = offset - fp_offset;
2770 if (offset > 0)
2772 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2773 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2774 bool skip_wb = true;
2775 rtx cfi_ops = NULL;
2777 if (frame_pointer_needed)
2778 fp_offset = 0;
2779 else if (fp_offset
2780 || reg1 == FIRST_PSEUDO_REGISTER
2781 || (reg2 == FIRST_PSEUDO_REGISTER
2782 && offset >= 256))
2783 skip_wb = false;
2785 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2786 skip_wb, &cfi_ops);
2787 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2788 skip_wb, &cfi_ops);
2790 if (need_barrier_p)
2791 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2793 if (skip_wb)
2795 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2796 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2798 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2799 if (reg2 == FIRST_PSEUDO_REGISTER)
2801 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2802 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2803 mem = gen_rtx_MEM (mode1, mem);
2804 insn = emit_move_insn (rreg1, mem);
2806 else
2808 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2810 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2811 insn = emit_insn (aarch64_gen_loadwb_pair
2812 (mode1, stack_pointer_rtx, rreg1,
2813 rreg2, offset));
2816 else
2818 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2819 GEN_INT (offset)));
2822 /* Reset the CFA to be SP + FRAME_SIZE. */
2823 rtx new_cfa = stack_pointer_rtx;
2824 if (frame_size > 0)
2825 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2826 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2827 REG_NOTES (insn) = cfi_ops;
2828 RTX_FRAME_RELATED_P (insn) = 1;
2831 if (frame_size > 0)
2833 if (need_barrier_p)
2834 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2836 if (frame_size >= 0x1000000)
2838 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2839 emit_move_insn (op0, GEN_INT (frame_size));
2840 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2842 else
2844 int hi_ofs = frame_size & 0xfff000;
2845 int lo_ofs = frame_size & 0x000fff;
2847 if (hi_ofs && lo_ofs)
2849 insn = emit_insn (gen_add2_insn
2850 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2851 RTX_FRAME_RELATED_P (insn) = 1;
2852 frame_size = lo_ofs;
2854 insn = emit_insn (gen_add2_insn
2855 (stack_pointer_rtx, GEN_INT (frame_size)));
2858 /* Reset the CFA to be SP + 0. */
2859 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2860 RTX_FRAME_RELATED_P (insn) = 1;
2863 /* Stack adjustment for exception handler. */
2864 if (crtl->calls_eh_return)
2866 /* We need to unwind the stack by the offset computed by
2867 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2868 to be SP; letting the CFA move during this adjustment
2869 is just as correct as retaining the CFA from the body
2870 of the function. Therefore, do nothing special. */
2871 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2874 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2875 if (!for_sibcall)
2876 emit_jump_insn (ret_rtx);
2879 /* Return the place to copy the exception unwinding return address to.
2880 This will probably be a stack slot, but could (in theory) be the
2881 return register. */
2883 aarch64_final_eh_return_addr (void)
2885 HOST_WIDE_INT fp_offset;
2887 aarch64_layout_frame ();
2889 fp_offset = cfun->machine->frame.frame_size
2890 - cfun->machine->frame.hard_fp_offset;
2892 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2893 return gen_rtx_REG (DImode, LR_REGNUM);
2895 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2896 result in a store to save LR introduced by builtin_eh_return () being
2897 incorrectly deleted because the alias is not detected.
2898 So in the calculation of the address to copy the exception unwinding
2899 return address to, we note 2 cases.
2900 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2901 we return a SP-relative location since all the addresses are SP-relative
2902 in this case. This prevents the store from being optimized away.
2903 If the fp_offset is not 0, then the addresses will be FP-relative and
2904 therefore we return a FP-relative location. */
2906 if (frame_pointer_needed)
2908 if (fp_offset)
2909 return gen_frame_mem (DImode,
2910 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2911 else
2912 return gen_frame_mem (DImode,
2913 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2916 /* If FP is not needed, we calculate the location of LR, which would be
2917 at the top of the saved registers block. */
2919 return gen_frame_mem (DImode,
2920 plus_constant (Pmode,
2921 stack_pointer_rtx,
2922 fp_offset
2923 + cfun->machine->frame.saved_regs_size
2924 - 2 * UNITS_PER_WORD));
2927 /* Possibly output code to build up a constant in a register. For
2928 the benefit of the costs infrastructure, returns the number of
2929 instructions which would be emitted. GENERATE inhibits or
2930 enables code generation. */
2932 static int
2933 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2935 int insns = 0;
2937 if (aarch64_bitmask_imm (val, DImode))
2939 if (generate)
2940 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2941 insns = 1;
2943 else
2945 int i;
2946 int ncount = 0;
2947 int zcount = 0;
2948 HOST_WIDE_INT valp = val >> 16;
2949 HOST_WIDE_INT valm;
2950 HOST_WIDE_INT tval;
2952 for (i = 16; i < 64; i += 16)
2954 valm = (valp & 0xffff);
2956 if (valm != 0)
2957 ++ zcount;
2959 if (valm != 0xffff)
2960 ++ ncount;
2962 valp >>= 16;
2965 /* zcount contains the number of additional MOVK instructions
2966 required if the constant is built up with an initial MOVZ instruction,
2967 while ncount is the number of MOVK instructions required if starting
2968 with a MOVN instruction. Choose the sequence that yields the fewer
2969 instructions, preferring the MOVZ-based sequence when the two counts
2970 are equal. */
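/* For example, 0xffffffffffff1234 needs only the initial MOVN,
   0x1234 needs only the initial MOVZ, while 0x0001000200030004
   needs a MOVZ followed by three MOVKs.  */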
2971 if (ncount < zcount)
2973 if (generate)
2974 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2975 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2976 tval = 0xffff;
2977 insns++;
2979 else
2981 if (generate)
2982 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2983 GEN_INT (val & 0xffff));
2984 tval = 0;
2985 insns++;
2988 val >>= 16;
2990 for (i = 16; i < 64; i += 16)
2992 if ((val & 0xffff) != tval)
2994 if (generate)
2995 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2996 GEN_INT (i),
2997 GEN_INT (val & 0xffff)));
2998 insns++;
3000 val >>= 16;
3003 return insns;
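/* Add DELTA to register REGNUM, clobbering SCRATCHREG if necessary.
   Deltas of 2^24 or more are first built in SCRATCHREG with
   aarch64_build_constant and then added; smaller deltas are split into
   a multiple of 4096, added via SCRATCHREG shifted left by 12, plus a
   remainder below 4096 added as an immediate. For example, a delta of
   0x3456 is added as (3 << 12) followed by 0x456.  */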
3006 static void
3007 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3009 HOST_WIDE_INT mdelta = delta;
3010 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3011 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3013 if (mdelta < 0)
3014 mdelta = -mdelta;
3016 if (mdelta >= 4096 * 4096)
3018 (void) aarch64_build_constant (scratchreg, delta, true);
3019 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3021 else if (mdelta > 0)
3023 if (mdelta >= 4096)
3025 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3026 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3027 if (delta < 0)
3028 emit_insn (gen_rtx_SET (this_rtx,
3029 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3030 else
3031 emit_insn (gen_rtx_SET (this_rtx,
3032 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3034 if (mdelta % 4096 != 0)
3036 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3037 emit_insn (gen_rtx_SET (this_rtx,
3038 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3043 /* Output code to add DELTA to the first argument, and then jump
3044 to FUNCTION. Used for C++ multiple inheritance. */
3045 static void
3046 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3047 HOST_WIDE_INT delta,
3048 HOST_WIDE_INT vcall_offset,
3049 tree function)
3051 /* The this pointer is always in x0. Note that this differs from
3052 Arm, where the this pointer may be bumped to r1 if r0 is required
3053 to return a pointer to an aggregate. On AArch64 a result value
3054 pointer will be in x8. */
3055 int this_regno = R0_REGNUM;
3056 rtx this_rtx, temp0, temp1, addr, funexp;
3057 rtx_insn *insn;
3059 reload_completed = 1;
3060 emit_note (NOTE_INSN_PROLOGUE_END);
3062 if (vcall_offset == 0)
3063 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3064 else
3066 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3068 this_rtx = gen_rtx_REG (Pmode, this_regno);
3069 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3070 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3072 addr = this_rtx;
3073 if (delta != 0)
3075 if (delta >= -256 && delta < 256)
3076 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3077 plus_constant (Pmode, this_rtx, delta));
3078 else
3079 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3082 if (Pmode == ptr_mode)
3083 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3084 else
3085 aarch64_emit_move (temp0,
3086 gen_rtx_ZERO_EXTEND (Pmode,
3087 gen_rtx_MEM (ptr_mode, addr)));
3089 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3090 addr = plus_constant (Pmode, temp0, vcall_offset);
3091 else
3093 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3094 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3097 if (Pmode == ptr_mode)
3098 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3099 else
3100 aarch64_emit_move (temp1,
3101 gen_rtx_SIGN_EXTEND (Pmode,
3102 gen_rtx_MEM (ptr_mode, addr)));
3104 emit_insn (gen_add2_insn (this_rtx, temp1));
3107 /* Generate a tail call to the target function. */
3108 if (!TREE_USED (function))
3110 assemble_external (function);
3111 TREE_USED (function) = 1;
3113 funexp = XEXP (DECL_RTL (function), 0);
3114 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3115 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3116 SIBLING_CALL_P (insn) = 1;
3118 insn = get_insns ();
3119 shorten_branches (insn);
3120 final_start_function (insn, file, 1);
3121 final (insn, file, 1);
3122 final_end_function ();
3124 /* Stop pretending to be a post-reload pass. */
3125 reload_completed = 0;
3128 static bool
3129 aarch64_tls_referenced_p (rtx x)
3131 if (!TARGET_HAVE_TLS)
3132 return false;
3133 subrtx_iterator::array_type array;
3134 FOR_EACH_SUBRTX (iter, array, x, ALL)
3136 const_rtx x = *iter;
3137 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3138 return true;
3139 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3140 TLS offsets, not real symbol references. */
3141 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3142 iter.skip_subrtxes ();
3144 return false;
3148 static int
3149 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3151 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3152 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3154 if (*imm1 < *imm2)
3155 return -1;
3156 if (*imm1 > *imm2)
3157 return +1;
3158 return 0;
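/* Populate aarch64_bitmasks with every value representable as an AArch64
   bitmask immediate: for each element size E in {2,4,8,16,32,64}, a run
   of S consecutive ones (1 <= S < E) rotated right by R (0 <= R < E) and
   replicated across 64 bits. For example, E == 8, S == 4, R == 1 gives
   the byte 0x87 replicated to 0x8787878787878787. The table is sorted
   so that aarch64_bitmask_imm can look values up with bsearch.  */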
3162 static void
3163 aarch64_build_bitmask_table (void)
3165 unsigned HOST_WIDE_INT mask, imm;
3166 unsigned int log_e, e, s, r;
3167 unsigned int nimms = 0;
3169 for (log_e = 1; log_e <= 6; log_e++)
3171 e = 1 << log_e;
3172 if (e == 64)
3173 mask = ~(HOST_WIDE_INT) 0;
3174 else
3175 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3176 for (s = 1; s < e; s++)
3178 for (r = 0; r < e; r++)
3180 /* set s consecutive bits to 1 (s < 64) */
3181 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3182 /* rotate right by r */
3183 if (r != 0)
3184 imm = ((imm >> r) | (imm << (e - r))) & mask;
3185 /* replicate the constant depending on SIMD size */
3186 switch (log_e) {
3187 case 1: imm |= (imm << 2);
3188 case 2: imm |= (imm << 4);
3189 case 3: imm |= (imm << 8);
3190 case 4: imm |= (imm << 16);
3191 case 5: imm |= (imm << 32);
3192 case 6:
3193 break;
3194 default:
3195 gcc_unreachable ();
3197 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3198 aarch64_bitmasks[nimms++] = imm;
3203 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3204 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3205 aarch64_bitmasks_cmp);
3209 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3210 a left shift of 0 or 12 bits. */
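/* For example, 0xabc and 0xabc000 can be encoded, but 0xabcd
   and 0xabc0000 cannot.  */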
3211 bool
3212 aarch64_uimm12_shift (HOST_WIDE_INT val)
3214 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3215 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
3220 /* Return true if val is an immediate that can be loaded into a
3221 register by a MOVZ instruction. */
3222 static bool
3223 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3225 if (GET_MODE_SIZE (mode) > 4)
3227 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3228 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3229 return 1;
3231 else
3233 /* Ignore sign extension. */
3234 val &= (HOST_WIDE_INT) 0xffffffff;
3236 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3237 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3241 /* Return true if val is a valid bitmask immediate. */
3242 bool
3243 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3245 if (GET_MODE_SIZE (mode) < 8)
3247 /* Replicate bit pattern. */
3248 val &= (HOST_WIDE_INT) 0xffffffff;
3249 val |= val << 32;
3251 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3252 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3256 /* Return true if val is an immediate that can be loaded into a
3257 register in a single instruction. */
3258 bool
3259 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3261 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3262 return 1;
3263 return aarch64_bitmask_imm (val, mode);
3266 static bool
3267 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3269 rtx base, offset;
3271 if (GET_CODE (x) == HIGH)
3272 return true;
3274 split_const (x, &base, &offset);
3275 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3277 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3278 != SYMBOL_FORCE_TO_MEM)
3279 return true;
3280 else
3281 /* Avoid generating a 64-bit relocation in ILP32; leave it
3282 to aarch64_expand_mov_immediate to handle properly. */
3283 return mode != ptr_mode;
3286 return aarch64_tls_referenced_p (x);
3289 /* Return true if register REGNO is a valid index register.
3290 STRICT_P is true if REG_OK_STRICT is in effect. */
3292 bool
3293 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3295 if (!HARD_REGISTER_NUM_P (regno))
3297 if (!strict_p)
3298 return true;
3300 if (!reg_renumber)
3301 return false;
3303 regno = reg_renumber[regno];
3305 return GP_REGNUM_P (regno);
3308 /* Return true if register REGNO is a valid base register.
3309 STRICT_P is true if REG_OK_STRICT is in effect. */
3311 bool
3312 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3314 if (!HARD_REGISTER_NUM_P (regno))
3316 if (!strict_p)
3317 return true;
3319 if (!reg_renumber)
3320 return false;
3322 regno = reg_renumber[regno];
3325 /* The fake registers will be eliminated to either the stack or
3326 hard frame pointer, both of which are usually valid base registers.
3327 Reload deals with the cases where the eliminated form isn't valid. */
3328 return (GP_REGNUM_P (regno)
3329 || regno == SP_REGNUM
3330 || regno == FRAME_POINTER_REGNUM
3331 || regno == ARG_POINTER_REGNUM);
3334 /* Return true if X is a valid base register.
3335 STRICT_P is true if REG_OK_STRICT is in effect. */
3337 static bool
3338 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3340 if (!strict_p && GET_CODE (x) == SUBREG)
3341 x = SUBREG_REG (x);
3343 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3346 /* Return true if address offset is a valid index. If it is, fill in INFO
3347 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3349 static bool
3350 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3351 machine_mode mode, bool strict_p)
3353 enum aarch64_address_type type;
3354 rtx index;
3355 int shift;
3357 /* (reg:P) */
3358 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3359 && GET_MODE (x) == Pmode)
3361 type = ADDRESS_REG_REG;
3362 index = x;
3363 shift = 0;
3365 /* (sign_extend:DI (reg:SI)) */
3366 else if ((GET_CODE (x) == SIGN_EXTEND
3367 || GET_CODE (x) == ZERO_EXTEND)
3368 && GET_MODE (x) == DImode
3369 && GET_MODE (XEXP (x, 0)) == SImode)
3371 type = (GET_CODE (x) == SIGN_EXTEND)
3372 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3373 index = XEXP (x, 0);
3374 shift = 0;
3376 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3377 else if (GET_CODE (x) == MULT
3378 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3379 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3380 && GET_MODE (XEXP (x, 0)) == DImode
3381 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3382 && CONST_INT_P (XEXP (x, 1)))
3384 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3385 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3386 index = XEXP (XEXP (x, 0), 0);
3387 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3389 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3390 else if (GET_CODE (x) == ASHIFT
3391 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3392 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3393 && GET_MODE (XEXP (x, 0)) == DImode
3394 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3395 && CONST_INT_P (XEXP (x, 1)))
3397 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3398 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3399 index = XEXP (XEXP (x, 0), 0);
3400 shift = INTVAL (XEXP (x, 1));
3402 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3403 else if ((GET_CODE (x) == SIGN_EXTRACT
3404 || GET_CODE (x) == ZERO_EXTRACT)
3405 && GET_MODE (x) == DImode
3406 && GET_CODE (XEXP (x, 0)) == MULT
3407 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3408 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3410 type = (GET_CODE (x) == SIGN_EXTRACT)
3411 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3412 index = XEXP (XEXP (x, 0), 0);
3413 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3414 if (INTVAL (XEXP (x, 1)) != 32 + shift
3415 || INTVAL (XEXP (x, 2)) != 0)
3416 shift = -1;
3418 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3419 (const_int 0xffffffff<<shift)) */
3420 else if (GET_CODE (x) == AND
3421 && GET_MODE (x) == DImode
3422 && GET_CODE (XEXP (x, 0)) == MULT
3423 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3424 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3425 && CONST_INT_P (XEXP (x, 1)))
3427 type = ADDRESS_REG_UXTW;
3428 index = XEXP (XEXP (x, 0), 0);
3429 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3430 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3431 shift = -1;
3433 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3434 else if ((GET_CODE (x) == SIGN_EXTRACT
3435 || GET_CODE (x) == ZERO_EXTRACT)
3436 && GET_MODE (x) == DImode
3437 && GET_CODE (XEXP (x, 0)) == ASHIFT
3438 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3439 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3441 type = (GET_CODE (x) == SIGN_EXTRACT)
3442 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3443 index = XEXP (XEXP (x, 0), 0);
3444 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3445 if (INTVAL (XEXP (x, 1)) != 32 + shift
3446 || INTVAL (XEXP (x, 2)) != 0)
3447 shift = -1;
3449 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3450 (const_int 0xffffffff<<shift)) */
3451 else if (GET_CODE (x) == AND
3452 && GET_MODE (x) == DImode
3453 && GET_CODE (XEXP (x, 0)) == ASHIFT
3454 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3455 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3456 && CONST_INT_P (XEXP (x, 1)))
3458 type = ADDRESS_REG_UXTW;
3459 index = XEXP (XEXP (x, 0), 0);
3460 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3461 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3462 shift = -1;
3464 /* (mult:P (reg:P) (const_int scale)) */
3465 else if (GET_CODE (x) == MULT
3466 && GET_MODE (x) == Pmode
3467 && GET_MODE (XEXP (x, 0)) == Pmode
3468 && CONST_INT_P (XEXP (x, 1)))
3470 type = ADDRESS_REG_REG;
3471 index = XEXP (x, 0);
3472 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3474 /* (ashift:P (reg:P) (const_int shift)) */
3475 else if (GET_CODE (x) == ASHIFT
3476 && GET_MODE (x) == Pmode
3477 && GET_MODE (XEXP (x, 0)) == Pmode
3478 && CONST_INT_P (XEXP (x, 1)))
3480 type = ADDRESS_REG_REG;
3481 index = XEXP (x, 0);
3482 shift = INTVAL (XEXP (x, 1));
3484 else
3485 return false;
3487 if (GET_CODE (index) == SUBREG)
3488 index = SUBREG_REG (index);
3490 if ((shift == 0 ||
3491 (shift > 0 && shift <= 3
3492 && (1 << shift) == GET_MODE_SIZE (mode)))
3493 && REG_P (index)
3494 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3496 info->type = type;
3497 info->offset = index;
3498 info->shift = shift;
3499 return true;
3502 return false;
3505 bool
3506 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3508 return (offset >= -64 * GET_MODE_SIZE (mode)
3509 && offset < 64 * GET_MODE_SIZE (mode)
3510 && offset % GET_MODE_SIZE (mode) == 0);
3513 static inline bool
3514 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3515 HOST_WIDE_INT offset)
3517 return offset >= -256 && offset < 256;
3520 static inline bool
3521 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3523 return (offset >= 0
3524 && offset < 4096 * GET_MODE_SIZE (mode)
3525 && offset % GET_MODE_SIZE (mode) == 0);
3528 /* Return true if X is a valid address for machine mode MODE. If it is,
3529 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3530 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3532 static bool
3533 aarch64_classify_address (struct aarch64_address_info *info,
3534 rtx x, machine_mode mode,
3535 RTX_CODE outer_code, bool strict_p)
3537 enum rtx_code code = GET_CODE (x);
3538 rtx op0, op1;
3540 /* On BE, we use load/store pair for all large int mode load/stores. */
3541 bool load_store_pair_p = (outer_code == PARALLEL
3542 || (BYTES_BIG_ENDIAN
3543 && aarch64_vect_struct_mode_p (mode)));
3545 bool allow_reg_index_p =
3546 !load_store_pair_p
3547 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3548 && !aarch64_vect_struct_mode_p (mode);
3550 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3551 REG addressing. */
3552 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3553 && (code != POST_INC && code != REG))
3554 return false;
3556 switch (code)
3558 case REG:
3559 case SUBREG:
3560 info->type = ADDRESS_REG_IMM;
3561 info->base = x;
3562 info->offset = const0_rtx;
3563 return aarch64_base_register_rtx_p (x, strict_p);
3565 case PLUS:
3566 op0 = XEXP (x, 0);
3567 op1 = XEXP (x, 1);
3569 if (! strict_p
3570 && REG_P (op0)
3571 && (op0 == virtual_stack_vars_rtx
3572 || op0 == frame_pointer_rtx
3573 || op0 == arg_pointer_rtx)
3574 && CONST_INT_P (op1))
3576 info->type = ADDRESS_REG_IMM;
3577 info->base = op0;
3578 info->offset = op1;
3580 return true;
3583 if (GET_MODE_SIZE (mode) != 0
3584 && CONST_INT_P (op1)
3585 && aarch64_base_register_rtx_p (op0, strict_p))
3587 HOST_WIDE_INT offset = INTVAL (op1);
3589 info->type = ADDRESS_REG_IMM;
3590 info->base = op0;
3591 info->offset = op1;
3593 /* TImode and TFmode values are allowed in both pairs of X
3594 registers and individual Q registers. The available
3595 address modes are:
3596 X,X: 7-bit signed scaled offset
3597 Q: 9-bit signed offset
3598 We conservatively require an offset representable in both modes.
3600 if (mode == TImode || mode == TFmode)
3601 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3602 && offset_9bit_signed_unscaled_p (mode, offset));
3604 /* A 7-bit offset check because OImode will emit an ldp/stp
3605 instruction (only big endian will get here).
3606 For ldp/stp instructions, the offset is scaled for the size of a
3607 single element of the pair. */
3608 if (mode == OImode)
3609 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3611 /* Three 9/12-bit offset checks because CImode will emit three
3612 ldr/str instructions (only big endian will get here). */
3613 if (mode == CImode)
3614 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3615 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3616 || offset_12bit_unsigned_scaled_p (V16QImode,
3617 offset + 32)));
3619 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3620 instructions (only big endian will get here). */
3621 if (mode == XImode)
3622 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3623 && aarch64_offset_7bit_signed_scaled_p (TImode,
3624 offset + 32));
3626 if (load_store_pair_p)
3627 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3628 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3629 else
3630 return (offset_9bit_signed_unscaled_p (mode, offset)
3631 || offset_12bit_unsigned_scaled_p (mode, offset));
3634 if (allow_reg_index_p)
3636 /* Look for base + (scaled/extended) index register. */
3637 if (aarch64_base_register_rtx_p (op0, strict_p)
3638 && aarch64_classify_index (info, op1, mode, strict_p))
3640 info->base = op0;
3641 return true;
3643 if (aarch64_base_register_rtx_p (op1, strict_p)
3644 && aarch64_classify_index (info, op0, mode, strict_p))
3646 info->base = op1;
3647 return true;
3651 return false;
3653 case POST_INC:
3654 case POST_DEC:
3655 case PRE_INC:
3656 case PRE_DEC:
3657 info->type = ADDRESS_REG_WB;
3658 info->base = XEXP (x, 0);
3659 info->offset = NULL_RTX;
3660 return aarch64_base_register_rtx_p (info->base, strict_p);
3662 case POST_MODIFY:
3663 case PRE_MODIFY:
3664 info->type = ADDRESS_REG_WB;
3665 info->base = XEXP (x, 0);
3666 if (GET_CODE (XEXP (x, 1)) == PLUS
3667 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3668 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3669 && aarch64_base_register_rtx_p (info->base, strict_p))
3671 HOST_WIDE_INT offset;
3672 info->offset = XEXP (XEXP (x, 1), 1);
3673 offset = INTVAL (info->offset);
3675 /* TImode and TFmode values are allowed in both pairs of X
3676 registers and individual Q registers. The available
3677 address modes are:
3678 X,X: 7-bit signed scaled offset
3679 Q: 9-bit signed offset
3680 We conservatively require an offset representable in both modes.
3682 if (mode == TImode || mode == TFmode)
3683 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3684 && offset_9bit_signed_unscaled_p (mode, offset));
3686 if (load_store_pair_p)
3687 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3688 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3689 else
3690 return offset_9bit_signed_unscaled_p (mode, offset);
3692 return false;
3694 case CONST:
3695 case SYMBOL_REF:
3696 case LABEL_REF:
3697 /* load literal: pc-relative constant pool entry. Only supported
3698 for SI mode or larger. */
3699 info->type = ADDRESS_SYMBOLIC;
3701 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3703 rtx sym, addend;
3705 split_const (x, &sym, &addend);
3706 return (GET_CODE (sym) == LABEL_REF
3707 || (GET_CODE (sym) == SYMBOL_REF
3708 && CONSTANT_POOL_ADDRESS_P (sym)));
3710 return false;
3712 case LO_SUM:
3713 info->type = ADDRESS_LO_SUM;
3714 info->base = XEXP (x, 0);
3715 info->offset = XEXP (x, 1);
3716 if (allow_reg_index_p
3717 && aarch64_base_register_rtx_p (info->base, strict_p))
3719 rtx sym, offs;
3720 split_const (info->offset, &sym, &offs);
3721 if (GET_CODE (sym) == SYMBOL_REF
3722 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3723 == SYMBOL_SMALL_ABSOLUTE))
3725 /* The symbol and offset must be aligned to the access size. */
3726 unsigned int align;
3727 unsigned int ref_size;
3729 if (CONSTANT_POOL_ADDRESS_P (sym))
3730 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3731 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3733 tree exp = SYMBOL_REF_DECL (sym);
3734 align = TYPE_ALIGN (TREE_TYPE (exp));
3735 align = CONSTANT_ALIGNMENT (exp, align);
3737 else if (SYMBOL_REF_DECL (sym))
3738 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3739 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3740 && SYMBOL_REF_BLOCK (sym) != NULL)
3741 align = SYMBOL_REF_BLOCK (sym)->alignment;
3742 else
3743 align = BITS_PER_UNIT;
3745 ref_size = GET_MODE_SIZE (mode);
3746 if (ref_size == 0)
3747 ref_size = GET_MODE_SIZE (DImode);
3749 return ((INTVAL (offs) & (ref_size - 1)) == 0
3750 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3753 return false;
3755 default:
3756 return false;
3760 bool
3761 aarch64_symbolic_address_p (rtx x)
3763 rtx offset;
3765 split_const (x, &x, &offset);
3766 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3769 /* Classify the base of symbolic expression X, given that X appears in
3770 context CONTEXT. */
3772 enum aarch64_symbol_type
3773 aarch64_classify_symbolic_expression (rtx x,
3774 enum aarch64_symbol_context context)
3776 rtx offset;
3778 split_const (x, &x, &offset);
3779 return aarch64_classify_symbol (x, offset, context);
3783 /* Return TRUE if X is a legitimate address for accessing memory in
3784 mode MODE. */
3785 static bool
3786 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3788 struct aarch64_address_info addr;
3790 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3793 /* Return TRUE if X is a legitimate address for accessing memory in
3794 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3795 pair operation. */
3796 bool
3797 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3798 RTX_CODE outer_code, bool strict_p)
3800 struct aarch64_address_info addr;
3802 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3805 /* Return TRUE if rtx X is immediate constant 0.0 */
3806 bool
3807 aarch64_float_const_zero_rtx_p (rtx x)
3809 REAL_VALUE_TYPE r;
3811 if (GET_MODE (x) == VOIDmode)
3812 return false;
3814 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3815 if (REAL_VALUE_MINUS_ZERO (r))
3816 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3817 return REAL_VALUES_EQUAL (r, dconst0);
3820 /* Return the fixed registers used for condition codes. */
3822 static bool
3823 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3825 *p1 = CC_REGNUM;
3826 *p2 = INVALID_REGNUM;
3827 return true;
3830 /* Emit call insn with PAT and do aarch64-specific handling. */
3832 void
3833 aarch64_emit_call_insn (rtx pat)
3835 rtx insn = emit_call_insn (pat);
3837 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3838 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3839 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3842 machine_mode
3843 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3845 /* Floating point compares return CCFPE for LT, LE, GT and GE, which
3846 signal on unordered operands, and CCFP for all other comparisons. */
3847 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3849 switch (code)
3851 case EQ:
3852 case NE:
3853 case UNORDERED:
3854 case ORDERED:
3855 case UNLT:
3856 case UNLE:
3857 case UNGT:
3858 case UNGE:
3859 case UNEQ:
3860 case LTGT:
3861 return CCFPmode;
3863 case LT:
3864 case LE:
3865 case GT:
3866 case GE:
3867 return CCFPEmode;
3869 default:
3870 gcc_unreachable ();
3874 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3875 && y == const0_rtx
3876 && (code == EQ || code == NE || code == LT || code == GE)
3877 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3878 || GET_CODE (x) == NEG))
3879 return CC_NZmode;
3881 /* A compare with a shifted operand. Because of canonicalization,
3882 the comparison will have to be swapped when we emit the assembly
3883 code. */
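/* For example, (compare (ashift x 2) y) yields CC_SWPmode: the comparison
   is ultimately emitted with its operands swapped, so a GE test of the
   original comparison must be printed as LE (see the CC_SWPmode entries
   in aarch64_get_condition_code_1).  */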
3884 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3885 && (REG_P (y) || GET_CODE (y) == SUBREG)
3886 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3887 || GET_CODE (x) == LSHIFTRT
3888 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3889 return CC_SWPmode;
3891 /* Similarly for a negated operand, but we can only do this for
3892 equalities. */
3893 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3894 && (REG_P (y) || GET_CODE (y) == SUBREG)
3895 && (code == EQ || code == NE)
3896 && GET_CODE (x) == NEG)
3897 return CC_Zmode;
3899 /* A compare of a mode narrower than SI mode against zero can be done
3900 by extending the value in the comparison. */
3901 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3902 && y == const0_rtx)
3903 /* Only use sign-extension if we really need it. */
3904 return ((code == GT || code == GE || code == LE || code == LT)
3905 ? CC_SESWPmode : CC_ZESWPmode);
3907 /* For everything else, return CCmode. */
3908 return CCmode;
3911 static int
3912 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3915 aarch64_get_condition_code (rtx x)
3917 machine_mode mode = GET_MODE (XEXP (x, 0));
3918 enum rtx_code comp_code = GET_CODE (x);
3920 if (GET_MODE_CLASS (mode) != MODE_CC)
3921 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3922 return aarch64_get_condition_code_1 (mode, comp_code);
3925 static int
3926 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3928 int ne = -1, eq = -1;
3929 switch (mode)
3931 case CCFPmode:
3932 case CCFPEmode:
3933 switch (comp_code)
3935 case GE: return AARCH64_GE;
3936 case GT: return AARCH64_GT;
3937 case LE: return AARCH64_LS;
3938 case LT: return AARCH64_MI;
3939 case NE: return AARCH64_NE;
3940 case EQ: return AARCH64_EQ;
3941 case ORDERED: return AARCH64_VC;
3942 case UNORDERED: return AARCH64_VS;
3943 case UNLT: return AARCH64_LT;
3944 case UNLE: return AARCH64_LE;
3945 case UNGT: return AARCH64_HI;
3946 case UNGE: return AARCH64_PL;
3947 default: return -1;
3949 break;
3951 case CC_DNEmode:
3952 ne = AARCH64_NE;
3953 eq = AARCH64_EQ;
3954 break;
3956 case CC_DEQmode:
3957 ne = AARCH64_EQ;
3958 eq = AARCH64_NE;
3959 break;
3961 case CC_DGEmode:
3962 ne = AARCH64_GE;
3963 eq = AARCH64_LT;
3964 break;
3966 case CC_DLTmode:
3967 ne = AARCH64_LT;
3968 eq = AARCH64_GE;
3969 break;
3971 case CC_DGTmode:
3972 ne = AARCH64_GT;
3973 eq = AARCH64_LE;
3974 break;
3976 case CC_DLEmode:
3977 ne = AARCH64_LE;
3978 eq = AARCH64_GT;
3979 break;
3981 case CC_DGEUmode:
3982 ne = AARCH64_CS;
3983 eq = AARCH64_CC;
3984 break;
3986 case CC_DLTUmode:
3987 ne = AARCH64_CC;
3988 eq = AARCH64_CS;
3989 break;
3991 case CC_DGTUmode:
3992 ne = AARCH64_HI;
3993 eq = AARCH64_LS;
3994 break;
3996 case CC_DLEUmode:
3997 ne = AARCH64_LS;
3998 eq = AARCH64_HI;
3999 break;
4001 case CCmode:
4002 switch (comp_code)
4004 case NE: return AARCH64_NE;
4005 case EQ: return AARCH64_EQ;
4006 case GE: return AARCH64_GE;
4007 case GT: return AARCH64_GT;
4008 case LE: return AARCH64_LE;
4009 case LT: return AARCH64_LT;
4010 case GEU: return AARCH64_CS;
4011 case GTU: return AARCH64_HI;
4012 case LEU: return AARCH64_LS;
4013 case LTU: return AARCH64_CC;
4014 default: return -1;
4016 break;
4018 case CC_SWPmode:
4019 case CC_ZESWPmode:
4020 case CC_SESWPmode:
4021 switch (comp_code)
4023 case NE: return AARCH64_NE;
4024 case EQ: return AARCH64_EQ;
4025 case GE: return AARCH64_LE;
4026 case GT: return AARCH64_LT;
4027 case LE: return AARCH64_GE;
4028 case LT: return AARCH64_GT;
4029 case GEU: return AARCH64_LS;
4030 case GTU: return AARCH64_CC;
4031 case LEU: return AARCH64_CS;
4032 case LTU: return AARCH64_HI;
4033 default: return -1;
4035 break;
4037 case CC_NZmode:
4038 switch (comp_code)
4040 case NE: return AARCH64_NE;
4041 case EQ: return AARCH64_EQ;
4042 case GE: return AARCH64_PL;
4043 case LT: return AARCH64_MI;
4044 default: return -1;
4046 break;
4048 case CC_Zmode:
4049 switch (comp_code)
4051 case NE: return AARCH64_NE;
4052 case EQ: return AARCH64_EQ;
4053 default: return -1;
4055 break;
4057 default:
4058 return -1;
4059 break;
4062 if (comp_code == NE)
4063 return ne;
4065 if (comp_code == EQ)
4066 return eq;
4068 return -1;
4071 bool
4072 aarch64_const_vec_all_same_in_range_p (rtx x,
4073 HOST_WIDE_INT minval,
4074 HOST_WIDE_INT maxval)
4076 HOST_WIDE_INT firstval;
4077 int count, i;
4079 if (GET_CODE (x) != CONST_VECTOR
4080 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4081 return false;
4083 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4084 if (firstval < minval || firstval > maxval)
4085 return false;
4087 count = CONST_VECTOR_NUNITS (x);
4088 for (i = 1; i < count; i++)
4089 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4090 return false;
4092 return true;
4095 bool
4096 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4098 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4101 static unsigned
4102 bit_count (unsigned HOST_WIDE_INT value)
4104 unsigned count = 0;
4106 while (value)
4108 count++;
4109 value &= value - 1;
4112 return count;
4115 /* N Z C V. */
4116 #define AARCH64_CC_V 1
4117 #define AARCH64_CC_C (1 << 1)
4118 #define AARCH64_CC_Z (1 << 2)
4119 #define AARCH64_CC_N (1 << 3)
4121 /* N Z C V flags for ccmp. The first value is for the AND op and the
4122 second is for the IOR op. Indexed by AARCH64_COND_CODE. */
4123 static const int aarch64_nzcv_codes[][2] =
4125 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4126 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4127 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4128 {0, AARCH64_CC_C}, /* CC, C == 0. */
4129 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4130 {0, AARCH64_CC_N}, /* PL, N == 0. */
4131 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4132 {0, AARCH64_CC_V}, /* VC, V == 0. */
4133 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4134 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4135 {0, AARCH64_CC_V}, /* GE, N == V. */
4136 {AARCH64_CC_V, 0}, /* LT, N != V. */
4137 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4138 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4139 {0, 0}, /* AL, Any. */
4140 {0, 0}, /* NV, Any. */
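/* Illustrative note: the 'K' and 'k' output modifiers below index this
   table by the AArch64 condition code of a comparison and print column
   0 (AND) or column 1 (IOR) respectively as the CCMP nzcv immediate.
   For an EQ comparison, for example, '%K' prints AARCH64_CC_Z (4) and
   '%k' prints 0.  */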
4144 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4146 switch (mode)
4148 case CC_DNEmode:
4149 return NE;
4151 case CC_DEQmode:
4152 return EQ;
4154 case CC_DLEmode:
4155 return LE;
4157 case CC_DGTmode:
4158 return GT;
4160 case CC_DLTmode:
4161 return LT;
4163 case CC_DGEmode:
4164 return GE;
4166 case CC_DLEUmode:
4167 return LEU;
4169 case CC_DGTUmode:
4170 return GTU;
4172 case CC_DLTUmode:
4173 return LTU;
4175 case CC_DGEUmode:
4176 return GEU;
4178 default:
4179 gcc_unreachable ();
4184 void
4185 aarch64_print_operand (FILE *f, rtx x, char code)
4187 switch (code)
4189 /* An integer or symbol address without a preceding # sign. */
4190 case 'c':
4191 switch (GET_CODE (x))
4193 case CONST_INT:
4194 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4195 break;
4197 case SYMBOL_REF:
4198 output_addr_const (f, x);
4199 break;
4201 case CONST:
4202 if (GET_CODE (XEXP (x, 0)) == PLUS
4203 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4205 output_addr_const (f, x);
4206 break;
4208 /* Fall through. */
4210 default:
4211 output_operand_lossage ("Unsupported operand for code '%c'", code);
4213 break;
4215 case 'e':
4216 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4218 int n;
4220 if (!CONST_INT_P (x)
4221 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4223 output_operand_lossage ("invalid operand for '%%%c'", code);
4224 return;
4227 switch (n)
4229 case 3:
4230 fputc ('b', f);
4231 break;
4232 case 4:
4233 fputc ('h', f);
4234 break;
4235 case 5:
4236 fputc ('w', f);
4237 break;
4238 default:
4239 output_operand_lossage ("invalid operand for '%%%c'", code);
4240 return;
4243 break;
4245 case 'p':
4247 int n;
4249 /* Print N such that 2^N == X. */
4250 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4252 output_operand_lossage ("invalid operand for '%%%c'", code);
4253 return;
4256 asm_fprintf (f, "%d", n);
4258 break;
4260 case 'P':
4261 /* Print the number of non-zero bits in X (a const_int). */
4262 if (!CONST_INT_P (x))
4264 output_operand_lossage ("invalid operand for '%%%c'", code);
4265 return;
4268 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4269 break;
4271 case 'H':
4272 /* Print the higher numbered register of a pair (TImode) of regs. */
4273 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4275 output_operand_lossage ("invalid operand for '%%%c'", code);
4276 return;
4279 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4280 break;
4282 case 'm':
4284 int cond_code;
4285 /* Print a condition (eq, ne, etc). */
4287 /* CONST_TRUE_RTX means always -- that's the default. */
4288 if (x == const_true_rtx)
4289 return;
4291 if (!COMPARISON_P (x))
4293 output_operand_lossage ("invalid operand for '%%%c'", code);
4294 return;
4297 cond_code = aarch64_get_condition_code (x);
4298 gcc_assert (cond_code >= 0);
4299 fputs (aarch64_condition_codes[cond_code], f);
4301 break;
4303 case 'M':
4305 int cond_code;
4306 /* Print the inverse of a condition (eq <-> ne, etc). */
4308 /* CONST_TRUE_RTX means never -- that's the default. */
4309 if (x == const_true_rtx)
4311 fputs ("nv", f);
4312 return;
4315 if (!COMPARISON_P (x))
4317 output_operand_lossage ("invalid operand for '%%%c'", code);
4318 return;
4320 cond_code = aarch64_get_condition_code (x);
4321 gcc_assert (cond_code >= 0);
4322 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4323 (cond_code)], f);
4325 break;
4327 case 'b':
4328 case 'h':
4329 case 's':
4330 case 'd':
4331 case 'q':
4332 /* Print a scalar FP/SIMD register name. */
4333 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4335 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4336 return;
4338 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4339 break;
4341 case 'S':
4342 case 'T':
4343 case 'U':
4344 case 'V':
4345 /* Print the first FP/SIMD register name in a list. */
4346 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4348 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4349 return;
4351 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4352 break;
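/* For example, if operand X is the vector register v4, '%S' through
   '%V' print "v4", "v5", "v6" and "v7" respectively, naming the
   consecutive registers of a multi-register list.  */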
4354 case 'R':
4355 /* Print a scalar FP/SIMD register name + 1. */
4356 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4358 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4359 return;
4361 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4362 break;
4364 case 'X':
4365 /* Print bottom 16 bits of integer constant in hex. */
4366 if (!CONST_INT_P (x))
4368 output_operand_lossage ("invalid operand for '%%%c'", code);
4369 return;
4371 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4372 break;
4374 case 'w':
4375 case 'x':
4376 /* Print a general register name or the zero register (32-bit or
4377 64-bit). */
4378 if (x == const0_rtx
4379 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4381 asm_fprintf (f, "%czr", code);
4382 break;
4385 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4387 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4388 break;
4391 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4393 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4394 break;
4397 /* Fall through */
4399 case 0:
4400 /* Print a normal operand. If it's a general register, then we
4401 assume DImode. */
4402 if (x == NULL)
4404 output_operand_lossage ("missing operand");
4405 return;
4408 switch (GET_CODE (x))
4410 case REG:
4411 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4412 break;
4414 case MEM:
4415 aarch64_memory_reference_mode = GET_MODE (x);
4416 output_address (XEXP (x, 0));
4417 break;
4419 case LABEL_REF:
4420 case SYMBOL_REF:
4421 output_addr_const (asm_out_file, x);
4422 break;
4424 case CONST_INT:
4425 asm_fprintf (f, "%wd", INTVAL (x));
4426 break;
4428 case CONST_VECTOR:
4429 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4431 gcc_assert (
4432 aarch64_const_vec_all_same_in_range_p (x,
4433 HOST_WIDE_INT_MIN,
4434 HOST_WIDE_INT_MAX));
4435 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4437 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4439 fputc ('0', f);
4441 else
4442 gcc_unreachable ();
4443 break;
4445 case CONST_DOUBLE:
4446 /* CONST_DOUBLE can represent a double-width integer.
4447 In this case, the mode of x is VOIDmode. */
4448 if (GET_MODE (x) == VOIDmode)
4449 ; /* Do Nothing. */
4450 else if (aarch64_float_const_zero_rtx_p (x))
4452 fputc ('0', f);
4453 break;
4455 else if (aarch64_float_const_representable_p (x))
4457 #define buf_size 20
4458 char float_buf[buf_size] = {'\0'};
4459 REAL_VALUE_TYPE r;
4460 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4461 real_to_decimal_for_mode (float_buf, &r,
4462 buf_size, buf_size,
4463 1, GET_MODE (x));
4464 asm_fprintf (asm_out_file, "%s", float_buf);
4465 break;
4466 #undef buf_size
4468 output_operand_lossage ("invalid constant");
4469 return;
4470 default:
4471 output_operand_lossage ("invalid operand");
4472 return;
4474 break;
4476 case 'A':
4477 if (GET_CODE (x) == HIGH)
4478 x = XEXP (x, 0);
4480 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4482 case SYMBOL_SMALL_GOT_4G:
4483 asm_fprintf (asm_out_file, ":got:");
4484 break;
4486 case SYMBOL_SMALL_TLSGD:
4487 asm_fprintf (asm_out_file, ":tlsgd:");
4488 break;
4490 case SYMBOL_SMALL_TLSDESC:
4491 asm_fprintf (asm_out_file, ":tlsdesc:");
4492 break;
4494 case SYMBOL_SMALL_GOTTPREL:
4495 asm_fprintf (asm_out_file, ":gottprel:");
4496 break;
4498 case SYMBOL_TLSLE:
4499 asm_fprintf (asm_out_file, ":tprel:");
4500 break;
4502 case SYMBOL_TINY_GOT:
4503 gcc_unreachable ();
4504 break;
4506 default:
4507 break;
4509 output_addr_const (asm_out_file, x);
4510 break;
4512 case 'L':
4513 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4515 case SYMBOL_SMALL_GOT_4G:
4516 asm_fprintf (asm_out_file, ":lo12:");
4517 break;
4519 case SYMBOL_SMALL_TLSGD:
4520 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4521 break;
4523 case SYMBOL_SMALL_TLSDESC:
4524 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4525 break;
4527 case SYMBOL_SMALL_GOTTPREL:
4528 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4529 break;
4531 case SYMBOL_TLSLE:
4532 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4533 break;
4535 case SYMBOL_TINY_GOT:
4536 asm_fprintf (asm_out_file, ":got:");
4537 break;
4539 default:
4540 break;
4542 output_addr_const (asm_out_file, x);
4543 break;
4545 case 'G':
4547 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4549 case SYMBOL_TLSLE:
4550 asm_fprintf (asm_out_file, ":tprel_hi12:");
4551 break;
4552 default:
4553 break;
4555 output_addr_const (asm_out_file, x);
4556 break;
4558 case 'K':
4560 int cond_code;
4561 /* Print nzcv. */
4563 if (!COMPARISON_P (x))
4565 output_operand_lossage ("invalid operand for '%%%c'", code);
4566 return;
4569 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4570 gcc_assert (cond_code >= 0);
4571 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4573 break;
4575 case 'k':
4577 int cond_code;
4578 /* Print nzcv. */
4580 if (!COMPARISON_P (x))
4582 output_operand_lossage ("invalid operand for '%%%c'", code);
4583 return;
4586 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4587 gcc_assert (cond_code >= 0);
4588 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4590 break;
4592 default:
4593 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4594 return;
4598 void
4599 aarch64_print_operand_address (FILE *f, rtx x)
4601 struct aarch64_address_info addr;
4603 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4604 MEM, true))
4605 switch (addr.type)
4607 case ADDRESS_REG_IMM:
4608 if (addr.offset == const0_rtx)
4609 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4610 else
4611 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4612 INTVAL (addr.offset));
4613 return;
4615 case ADDRESS_REG_REG:
4616 if (addr.shift == 0)
4617 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4618 reg_names [REGNO (addr.offset)]);
4619 else
4620 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4621 reg_names [REGNO (addr.offset)], addr.shift);
4622 return;
4624 case ADDRESS_REG_UXTW:
4625 if (addr.shift == 0)
4626 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4627 REGNO (addr.offset) - R0_REGNUM);
4628 else
4629 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4630 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4631 return;
4633 case ADDRESS_REG_SXTW:
4634 if (addr.shift == 0)
4635 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4636 REGNO (addr.offset) - R0_REGNUM);
4637 else
4638 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4639 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4640 return;
4642 case ADDRESS_REG_WB:
4643 switch (GET_CODE (x))
4645 case PRE_INC:
4646 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4647 GET_MODE_SIZE (aarch64_memory_reference_mode));
4648 return;
4649 case POST_INC:
4650 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4651 GET_MODE_SIZE (aarch64_memory_reference_mode));
4652 return;
4653 case PRE_DEC:
4654 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4655 GET_MODE_SIZE (aarch64_memory_reference_mode));
4656 return;
4657 case POST_DEC:
4658 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4659 GET_MODE_SIZE (aarch64_memory_reference_mode));
4660 return;
4661 case PRE_MODIFY:
4662 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4663 INTVAL (addr.offset));
4664 return;
4665 case POST_MODIFY:
4666 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4667 INTVAL (addr.offset));
4668 return;
4669 default:
4670 break;
4672 break;
4674 case ADDRESS_LO_SUM:
4675 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4676 output_addr_const (f, addr.offset);
4677 asm_fprintf (f, "]");
4678 return;
4680 case ADDRESS_SYMBOLIC:
4681 break;
4684 output_addr_const (f, x);
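/* For illustration, the address forms printed above look like:
     ADDRESS_REG_IMM    [x0] or [x0, 16]
     ADDRESS_REG_REG    [x0, x1] or [x0, x1, lsl 3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw] or [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw] or [x0, w1, sxtw 2]
     ADDRESS_REG_WB     [x0, 16]! (pre) or [x0], 16 (post)
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
   with ADDRESS_SYMBOLIC and unclassified addresses falling back to
   output_addr_const.  (The register numbers here are just examples.)  */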
4687 bool
4688 aarch64_label_mentioned_p (rtx x)
4690 const char *fmt;
4691 int i;
4693 if (GET_CODE (x) == LABEL_REF)
4694 return true;
4696 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4697 referencing instruction, but they are constant offsets, not
4698 symbols. */
4699 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4700 return false;
4702 fmt = GET_RTX_FORMAT (GET_CODE (x));
4703 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4705 if (fmt[i] == 'E')
4707 int j;
4709 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4710 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4711 return 1;
4713 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4714 return 1;
4717 return 0;
4720 /* Implement REGNO_REG_CLASS. */
4722 enum reg_class
4723 aarch64_regno_regclass (unsigned regno)
4725 if (GP_REGNUM_P (regno))
4726 return GENERAL_REGS;
4728 if (regno == SP_REGNUM)
4729 return STACK_REG;
4731 if (regno == FRAME_POINTER_REGNUM
4732 || regno == ARG_POINTER_REGNUM)
4733 return POINTER_REGS;
4735 if (FP_REGNUM_P (regno))
4736 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4738 return NO_REGS;
4741 static rtx
4742 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4744 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4745 where mask is selected by alignment and size of the offset.
4746 We try to pick as large a range for the offset as possible to
4747 maximize the chance of a CSE. However, for aligned addresses
4748 we limit the range to 4k so that structures with different sized
4749 elements are likely to use the same base. */
4751 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4753 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4754 HOST_WIDE_INT base_offset;
4756 /* Does it look like we'll need a load/store-pair operation? */
4757 if (GET_MODE_SIZE (mode) > 16
4758 || mode == TImode)
4759 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4760 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4761 /* For offsets that aren't a multiple of the access size, the limit is
4762 -256...255. */
4763 else if (offset & (GET_MODE_SIZE (mode) - 1))
4764 base_offset = (offset + 0x100) & ~0x1ff;
4765 else
4766 base_offset = offset & ~0xfff;
4768 if (base_offset == 0)
4769 return x;
4771 offset -= base_offset;
4772 rtx base_reg = gen_reg_rtx (Pmode);
4773 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4774 NULL_RTX);
4775 emit_move_insn (base_reg, val);
4776 x = plus_constant (Pmode, base_reg, offset);
4779 return x;
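/* Worked example (an illustrative sketch): for a DImode access at
   X + 0x12340 the aligned case above picks
   base_offset = 0x12340 & ~0xfff = 0x12000, so we emit
   "base = X + 0x12000" and rewrite the address as base + 0x340.
   Keeping the residual offset within the 4k immediate range lets
   other accesses near X CSE the same base register.  */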
4782 /* Try a machine-dependent way of reloading an illegitimate address
4783 operand. If we find one, push the reload and return the new rtx. */
4786 aarch64_legitimize_reload_address (rtx *x_p,
4787 machine_mode mode,
4788 int opnum, int type,
4789 int ind_levels ATTRIBUTE_UNUSED)
4791 rtx x = *x_p;
4793 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4794 if (aarch64_vect_struct_mode_p (mode)
4795 && GET_CODE (x) == PLUS
4796 && REG_P (XEXP (x, 0))
4797 && CONST_INT_P (XEXP (x, 1)))
4799 rtx orig_rtx = x;
4800 x = copy_rtx (x);
4801 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4802 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4803 opnum, (enum reload_type) type);
4804 return x;
4807 /* We must recognize output that we have already generated ourselves. */
4808 if (GET_CODE (x) == PLUS
4809 && GET_CODE (XEXP (x, 0)) == PLUS
4810 && REG_P (XEXP (XEXP (x, 0), 0))
4811 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4812 && CONST_INT_P (XEXP (x, 1)))
4814 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4815 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4816 opnum, (enum reload_type) type);
4817 return x;
4820 /* We wish to handle large displacements off a base register by splitting
4821 the addend across an add and the mem insn. This can cut the number of
4822 extra insns needed from 3 to 1. It is only useful for load/store of a
4823 single register with 12 bit offset field. */
4824 if (GET_CODE (x) == PLUS
4825 && REG_P (XEXP (x, 0))
4826 && CONST_INT_P (XEXP (x, 1))
4827 && HARD_REGISTER_P (XEXP (x, 0))
4828 && mode != TImode
4829 && mode != TFmode
4830 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4832 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4833 HOST_WIDE_INT low = val & 0xfff;
4834 HOST_WIDE_INT high = val - low;
4835 HOST_WIDE_INT offs;
4836 rtx cst;
4837 machine_mode xmode = GET_MODE (x);
4839 /* In ILP32, xmode can be either DImode or SImode. */
4840 gcc_assert (xmode == DImode || xmode == SImode);
4842 /* Don't attempt this for BLKmode offsets, because we cannot ascertain
4843 BLKmode alignment; leave those to the generic reload code. */
4844 if (GET_MODE_SIZE (mode) == 0)
4845 return NULL_RTX;
4847 offs = low % GET_MODE_SIZE (mode);
4849 /* Align misaligned offset by adjusting high part to compensate. */
4850 if (offs != 0)
4852 if (aarch64_uimm12_shift (high + offs))
4854 /* Align down. */
4855 low = low - offs;
4856 high = high + offs;
4858 else
4860 /* Align up. */
4861 offs = GET_MODE_SIZE (mode) - offs;
4862 low = low + offs;
4863 high = high + (low & 0x1000) - offs;
4864 low &= 0xfff;
4868 /* Check for overflow. */
4869 if (high + low != val)
4870 return NULL_RTX;
4872 cst = GEN_INT (high);
4873 if (!aarch64_uimm12_shift (high))
4874 cst = force_const_mem (xmode, cst);
4876 /* Reload high part into base reg, leaving the low part
4877 in the mem instruction.
4878 Note that replacing this gen_rtx_PLUS with plus_constant is
4879 wrong in this case because we rely on the
4880 (plus (plus reg c1) c2) structure being preserved so that
4881 XEXP (*p, 0) in push_reload below uses the correct term. */
4882 x = gen_rtx_PLUS (xmode,
4883 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4884 GEN_INT (low));
4886 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4887 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4888 opnum, (enum reload_type) type);
4889 return x;
4892 return NULL_RTX;
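/* Worked example (illustrative): for an SImode access at reg + 0x1234
   the code above splits val = 0x1234 into high = 0x1000 and
   low = 0x234.  low is already a multiple of the access size, and
   high is a valid 12-bit (shifted) ADD immediate, so reload
   materializes reg + 0x1000 in a base register while the memory
   reference keeps the offset 0x234, costing one ADD rather than a
   full constant build.  */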
4896 static reg_class_t
4897 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4898 reg_class_t rclass,
4899 machine_mode mode,
4900 secondary_reload_info *sri)
4902 /* Without the TARGET_SIMD instructions we cannot move a Q register
4903 to a Q register directly. We need a scratch. */
4904 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4905 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4906 && reg_class_subset_p (rclass, FP_REGS))
4908 if (mode == TFmode)
4909 sri->icode = CODE_FOR_aarch64_reload_movtf;
4910 else if (mode == TImode)
4911 sri->icode = CODE_FOR_aarch64_reload_movti;
4912 return NO_REGS;
4915 /* A TFmode or TImode memory access should be handled via an FP_REG,
4916 because AArch64 has richer addressing modes for LDR/STR instructions
4917 than for LDP/STP instructions. */
4918 if (TARGET_FLOAT && rclass == GENERAL_REGS
4919 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4920 return FP_REGS;
4922 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4923 return GENERAL_REGS;
4925 return NO_REGS;
4928 static bool
4929 aarch64_can_eliminate (const int from, const int to)
4931 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4932 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4934 if (frame_pointer_needed)
4936 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4937 return true;
4938 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4939 return false;
4940 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4941 && !cfun->calls_alloca)
4942 return true;
4943 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4944 return true;
4946 return false;
4948 else
4950 /* If we decided that we didn't need a leaf frame pointer but then used
4951 LR in the function, then we'll want a frame pointer after all, so
4952 prevent this elimination to ensure a frame pointer is used. */
4953 if (to == STACK_POINTER_REGNUM
4954 && flag_omit_leaf_frame_pointer
4955 && df_regs_ever_live_p (LR_REGNUM))
4956 return false;
4959 return true;
4962 HOST_WIDE_INT
4963 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4965 aarch64_layout_frame ();
4967 if (to == HARD_FRAME_POINTER_REGNUM)
4969 if (from == ARG_POINTER_REGNUM)
4970 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4972 if (from == FRAME_POINTER_REGNUM)
4973 return (cfun->machine->frame.hard_fp_offset
4974 - cfun->machine->frame.saved_varargs_size);
4977 if (to == STACK_POINTER_REGNUM)
4979 if (from == FRAME_POINTER_REGNUM)
4980 return (cfun->machine->frame.frame_size
4981 - cfun->machine->frame.saved_varargs_size);
4984 return cfun->machine->frame.frame_size;
4987 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4988 previous frame. */
4991 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4993 if (count != 0)
4994 return const0_rtx;
4995 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4999 static void
5000 aarch64_asm_trampoline_template (FILE *f)
5002 if (TARGET_ILP32)
5004 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5005 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5007 else
5009 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5010 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5012 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5013 assemble_aligned_integer (4, const0_rtx);
5014 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5015 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
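/* Illustrative layout of the resulting trampoline in the LP64 case
   (offsets are bytes from the start of the trampoline):
      0 - 11   ldr IP1, .+16 ; ldr STATIC_CHAIN, .+20 ; br IP1
     12 - 15   zero padding
     16 - 23   target function address (stored by the init hook below)
     24 - 31   static chain value      (stored by the init hook below)
   so the two PC-relative loads pick up the data words that follow the
   16 bytes of code.  */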
5018 static void
5019 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5021 rtx fnaddr, mem, a_tramp;
5022 const int tramp_code_sz = 16;
5024 /* Don't need to copy the trailing D-words; we fill those in below. */
5025 emit_block_move (m_tramp, assemble_trampoline_template (),
5026 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5027 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5028 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5029 if (GET_MODE (fnaddr) != ptr_mode)
5030 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5031 emit_move_insn (mem, fnaddr);
5033 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5034 emit_move_insn (mem, chain_value);
5036 /* XXX We should really define a "clear_cache" pattern and use
5037 gen_clear_cache(). */
5038 a_tramp = XEXP (m_tramp, 0);
5039 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5040 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5041 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5042 ptr_mode);
5045 static unsigned char
5046 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5048 switch (regclass)
5050 case CALLER_SAVE_REGS:
5051 case POINTER_REGS:
5052 case GENERAL_REGS:
5053 case ALL_REGS:
5054 case FP_REGS:
5055 case FP_LO_REGS:
5056 return
5057 aarch64_vector_mode_p (mode)
5058 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5059 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5060 case STACK_REG:
5061 return 1;
5063 case NO_REGS:
5064 return 0;
5066 default:
5067 break;
5069 gcc_unreachable ();
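/* For example, with 128-bit vector registers and 64-bit general
   registers this returns 1 for V4SImode in FP_REGS, 1 for DFmode,
   and 2 for TImode in GENERAL_REGS (a register pair).  */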
5072 static reg_class_t
5073 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5075 if (regclass == POINTER_REGS)
5076 return GENERAL_REGS;
5078 if (regclass == STACK_REG)
5080 if (REG_P(x)
5081 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5082 return regclass;
5084 return NO_REGS;
5087 /* If it's an integer immediate that MOVI can't handle, then
5088 FP_REGS is not an option, so we return NO_REGS instead. */
5089 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5090 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5091 return NO_REGS;
5093 /* Register elimination can result in a request for
5094 SP+constant->FP_REGS. We cannot support such operations, which
5095 use SP as the source and an FP_REG as the destination, so reject
5096 them outright. */
5097 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5099 rtx lhs = XEXP (x, 0);
5101 /* Look through a possible SUBREG introduced by ILP32. */
5102 if (GET_CODE (lhs) == SUBREG)
5103 lhs = SUBREG_REG (lhs);
5105 gcc_assert (REG_P (lhs));
5106 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5107 POINTER_REGS));
5108 return NO_REGS;
5111 return regclass;
5114 void
5115 aarch64_asm_output_labelref (FILE* f, const char *name)
5117 asm_fprintf (f, "%U%s", name);
5120 static void
5121 aarch64_elf_asm_constructor (rtx symbol, int priority)
5123 if (priority == DEFAULT_INIT_PRIORITY)
5124 default_ctor_section_asm_out_constructor (symbol, priority);
5125 else
5127 section *s;
5128 char buf[18];
5129 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5130 s = get_section (buf, SECTION_WRITE, NULL);
5131 switch_to_section (s);
5132 assemble_align (POINTER_SIZE);
5133 assemble_aligned_integer (POINTER_BYTES, symbol);
5137 static void
5138 aarch64_elf_asm_destructor (rtx symbol, int priority)
5140 if (priority == DEFAULT_INIT_PRIORITY)
5141 default_dtor_section_asm_out_destructor (symbol, priority);
5142 else
5144 section *s;
5145 char buf[18];
5146 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5147 s = get_section (buf, SECTION_WRITE, NULL);
5148 switch_to_section (s);
5149 assemble_align (POINTER_SIZE);
5150 assemble_aligned_integer (POINTER_BYTES, symbol);
5154 const char*
5155 aarch64_output_casesi (rtx *operands)
5157 char buf[100];
5158 char label[100];
5159 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5160 int index;
5161 static const char *const patterns[4][2] =
5164 "ldrb\t%w3, [%0,%w1,uxtw]",
5165 "add\t%3, %4, %w3, sxtb #2"
5168 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5169 "add\t%3, %4, %w3, sxth #2"
5172 "ldr\t%w3, [%0,%w1,uxtw #2]",
5173 "add\t%3, %4, %w3, sxtw #2"
5175 /* We assume that DImode is only generated when not optimizing and
5176 that we don't really need 64-bit address offsets. That would
5177 imply an object file with 8GB of code in a single function! */
5179 "ldr\t%w3, [%0,%w1,uxtw #2]",
5180 "add\t%3, %4, %w3, sxtw #2"
5184 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5186 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5188 gcc_assert (index >= 0 && index <= 3);
5190 /* Need to implement table size reduction, by changing the code below. */
5191 output_asm_insn (patterns[index][0], operands);
5192 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5193 snprintf (buf, sizeof (buf),
5194 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5195 output_asm_insn (buf, operands);
5196 output_asm_insn (patterns[index][1], operands);
5197 output_asm_insn ("br\t%3", operands);
5198 assemble_label (asm_out_file, label);
5199 return "";
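/* For a byte-sized dispatch table the code emitted above looks like
   (register numbers are illustrative):
       ldrb    w3, [x0, w1, uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3
     .Lrtx<N>:
   i.e. the table entries hold label offsets scaled down by 4, which
   the "#2" extend-and-shift in the add restores before the indirect
   branch.  */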
5203 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5204 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5205 operator. */
5208 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5210 if (shift >= 0 && shift <= 3)
5212 int size;
5213 for (size = 8; size <= 32; size *= 2)
5215 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5216 if (mask == bits << shift)
5217 return size;
5220 return 0;
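/* For example, aarch64_uxt_size (0, 0xffff) returns 16 and
   aarch64_uxt_size (2, 0x3fc) returns 8 (0xff shifted left by 2);
   any mask that is not an 8-, 16- or 32-bit field starting at the
   shift amount yields 0.  */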
5223 static bool
5224 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5225 const_rtx x ATTRIBUTE_UNUSED)
5227 /* We can't use blocks for constants when we're using a per-function
5228 constant pool. */
5229 return false;
5232 static section *
5233 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5234 rtx x ATTRIBUTE_UNUSED,
5235 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5237 /* Force all constant pool entries into the current function section. */
5238 return function_section (current_function_decl);
5242 /* Costs. */
5244 /* Helper function for rtx cost calculation. Strip a shift expression
5245 from X. Returns the inner operand if successful, or the original
5246 expression on failure. */
5247 static rtx
5248 aarch64_strip_shift (rtx x)
5250 rtx op = x;
5252 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5253 we can convert both to ROR during final output. */
5254 if ((GET_CODE (op) == ASHIFT
5255 || GET_CODE (op) == ASHIFTRT
5256 || GET_CODE (op) == LSHIFTRT
5257 || GET_CODE (op) == ROTATERT
5258 || GET_CODE (op) == ROTATE)
5259 && CONST_INT_P (XEXP (op, 1)))
5260 return XEXP (op, 0);
5262 if (GET_CODE (op) == MULT
5263 && CONST_INT_P (XEXP (op, 1))
5264 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5265 return XEXP (op, 0);
5267 return x;
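/* For example, both (ashift (reg x1) (const_int 3)) and
   (mult (reg x1) (const_int 8)) strip to (reg x1), since a multiply
   by a power of two is output as a shift; any other expression is
   returned unchanged.  */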
5270 /* Helper function for rtx cost calculation. Strip an extend
5271 expression from X. Returns the inner operand if successful, or the
5272 original expression on failure. We deal with a number of possible
5273 canonicalization variations here. */
5274 static rtx
5275 aarch64_strip_extend (rtx x)
5277 rtx op = x;
5279 /* Zero and sign extraction of a widened value. */
5280 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5281 && XEXP (op, 2) == const0_rtx
5282 && GET_CODE (XEXP (op, 0)) == MULT
5283 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5284 XEXP (op, 1)))
5285 return XEXP (XEXP (op, 0), 0);
5287 /* It can also be represented (for zero-extend) as an AND with an
5288 immediate. */
5289 if (GET_CODE (op) == AND
5290 && GET_CODE (XEXP (op, 0)) == MULT
5291 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5292 && CONST_INT_P (XEXP (op, 1))
5293 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5294 INTVAL (XEXP (op, 1))) != 0)
5295 return XEXP (XEXP (op, 0), 0);
5297 /* Now handle extended register, as this may also have an optional
5298 left shift by 1..4. */
5299 if (GET_CODE (op) == ASHIFT
5300 && CONST_INT_P (XEXP (op, 1))
5301 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5302 op = XEXP (op, 0);
5304 if (GET_CODE (op) == ZERO_EXTEND
5305 || GET_CODE (op) == SIGN_EXTEND)
5306 op = XEXP (op, 0);
5308 if (op != x)
5309 return op;
5311 return x;
5314 /* Return true iff CODE is a shift supported in combination
5315 with arithmetic instructions. */
5317 static bool
5318 aarch64_shift_p (enum rtx_code code)
5320 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5323 /* Helper function for rtx cost calculation. Calculate the cost of
5324 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5325 Return the calculated cost of the expression, recursing manually in to
5326 operands where needed. */
5328 static int
5329 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5331 rtx op0, op1;
5332 const struct cpu_cost_table *extra_cost
5333 = aarch64_tune_params.insn_extra_cost;
5334 int cost = 0;
5335 bool compound_p = (outer == PLUS || outer == MINUS);
5336 machine_mode mode = GET_MODE (x);
5338 gcc_checking_assert (code == MULT);
5340 op0 = XEXP (x, 0);
5341 op1 = XEXP (x, 1);
5343 if (VECTOR_MODE_P (mode))
5344 mode = GET_MODE_INNER (mode);
5346 /* Integer multiply/fma. */
5347 if (GET_MODE_CLASS (mode) == MODE_INT)
5349 /* The multiply will be canonicalized as a shift, cost it as such. */
5350 if (aarch64_shift_p (GET_CODE (x))
5351 || (CONST_INT_P (op1)
5352 && exact_log2 (INTVAL (op1)) > 0))
5354 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5355 || GET_CODE (op0) == SIGN_EXTEND;
5356 if (speed)
5358 if (compound_p)
5360 if (REG_P (op1))
5361 /* ARITH + shift-by-register. */
5362 cost += extra_cost->alu.arith_shift_reg;
5363 else if (is_extend)
5364 /* ARITH + extended register. We don't have a cost field
5365 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5366 cost += extra_cost->alu.extend_arith;
5367 else
5368 /* ARITH + shift-by-immediate. */
5369 cost += extra_cost->alu.arith_shift;
5371 else
5372 /* LSL (immediate). */
5373 cost += extra_cost->alu.shift;
5376 /* Strip extends as we will have costed them in the case above. */
5377 if (is_extend)
5378 op0 = aarch64_strip_extend (op0);
5380 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5382 return cost;
5385 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5386 compound and let the below cases handle it. After all, MNEG is a
5387 special-case alias of MSUB. */
5388 if (GET_CODE (op0) == NEG)
5390 op0 = XEXP (op0, 0);
5391 compound_p = true;
5394 /* Integer multiplies or FMAs have zero/sign extending variants. */
5395 if ((GET_CODE (op0) == ZERO_EXTEND
5396 && GET_CODE (op1) == ZERO_EXTEND)
5397 || (GET_CODE (op0) == SIGN_EXTEND
5398 && GET_CODE (op1) == SIGN_EXTEND))
5400 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5401 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5403 if (speed)
5405 if (compound_p)
5406 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5407 cost += extra_cost->mult[0].extend_add;
5408 else
5409 /* MUL/SMULL/UMULL. */
5410 cost += extra_cost->mult[0].extend;
5413 return cost;
5416 /* This is either an integer multiply or a MADD. In both cases
5417 we want to recurse and cost the operands. */
5418 cost += rtx_cost (op0, MULT, 0, speed)
5419 + rtx_cost (op1, MULT, 1, speed);
5421 if (speed)
5423 if (compound_p)
5424 /* MADD/MSUB. */
5425 cost += extra_cost->mult[mode == DImode].add;
5426 else
5427 /* MUL. */
5428 cost += extra_cost->mult[mode == DImode].simple;
5431 return cost;
5433 else
5435 if (speed)
5437 /* Floating-point FMA/FMUL can also support negations of the
5438 operands. */
5439 if (GET_CODE (op0) == NEG)
5440 op0 = XEXP (op0, 0);
5441 if (GET_CODE (op1) == NEG)
5442 op1 = XEXP (op1, 0);
5444 if (compound_p)
5445 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5446 cost += extra_cost->fp[mode == DFmode].fma;
5447 else
5448 /* FMUL/FNMUL. */
5449 cost += extra_cost->fp[mode == DFmode].mult;
5452 cost += rtx_cost (op0, MULT, 0, speed)
5453 + rtx_cost (op1, MULT, 1, speed);
5454 return cost;
5458 static int
5459 aarch64_address_cost (rtx x,
5460 machine_mode mode,
5461 addr_space_t as ATTRIBUTE_UNUSED,
5462 bool speed)
5464 enum rtx_code c = GET_CODE (x);
5465 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5466 struct aarch64_address_info info;
5467 int cost = 0;
5468 info.shift = 0;
5470 if (!aarch64_classify_address (&info, x, mode, c, false))
5472 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5474 /* This is a CONST or SYMBOL ref which will be split
5475 in a different way depending on the code model in use.
5476 Cost it through the generic infrastructure. */
5477 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5478 /* Divide through by the cost of one instruction to
5479 bring it to the same units as the address costs. */
5480 cost_symbol_ref /= COSTS_N_INSNS (1);
5481 /* The cost is then the cost of preparing the address,
5482 followed by an immediate (possibly 0) offset. */
5483 return cost_symbol_ref + addr_cost->imm_offset;
5485 else
5487 /* This is most likely a jump table from a case
5488 statement. */
5489 return addr_cost->register_offset;
5493 switch (info.type)
5495 case ADDRESS_LO_SUM:
5496 case ADDRESS_SYMBOLIC:
5497 case ADDRESS_REG_IMM:
5498 cost += addr_cost->imm_offset;
5499 break;
5501 case ADDRESS_REG_WB:
5502 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5503 cost += addr_cost->pre_modify;
5504 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5505 cost += addr_cost->post_modify;
5506 else
5507 gcc_unreachable ();
5509 break;
5511 case ADDRESS_REG_REG:
5512 cost += addr_cost->register_offset;
5513 break;
5515 case ADDRESS_REG_UXTW:
5516 case ADDRESS_REG_SXTW:
5517 cost += addr_cost->register_extend;
5518 break;
5520 default:
5521 gcc_unreachable ();
5525 if (info.shift > 0)
5527 /* For the sake of calculating the cost of the shifted register
5528 component, we can treat same sized modes in the same way. */
5529 switch (GET_MODE_BITSIZE (mode))
5531 case 16:
5532 cost += addr_cost->addr_scale_costs.hi;
5533 break;
5535 case 32:
5536 cost += addr_cost->addr_scale_costs.si;
5537 break;
5539 case 64:
5540 cost += addr_cost->addr_scale_costs.di;
5541 break;
5543 /* We can't tell, or this is a 128-bit vector. */
5544 default:
5545 cost += addr_cost->addr_scale_costs.ti;
5546 break;
5550 return cost;
5553 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5554 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5555 to be taken. */
5558 aarch64_branch_cost (bool speed_p, bool predictable_p)
5560 /* When optimizing for speed, use the cost of unpredictable branches. */
5561 const struct cpu_branch_cost *branch_costs =
5562 aarch64_tune_params.branch_costs;
5564 if (!speed_p || predictable_p)
5565 return branch_costs->predictable;
5566 else
5567 return branch_costs->unpredictable;
5570 /* Return true if the RTX X in mode MODE is a zero or sign extract
5571 usable in an ADD or SUB (extended register) instruction. */
5572 static bool
5573 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5575 /* Catch add with a sign extract.
5576 This is add_<optab><mode>_multp2. */
5577 if (GET_CODE (x) == SIGN_EXTRACT
5578 || GET_CODE (x) == ZERO_EXTRACT)
5580 rtx op0 = XEXP (x, 0);
5581 rtx op1 = XEXP (x, 1);
5582 rtx op2 = XEXP (x, 2);
5584 if (GET_CODE (op0) == MULT
5585 && CONST_INT_P (op1)
5586 && op2 == const0_rtx
5587 && CONST_INT_P (XEXP (op0, 1))
5588 && aarch64_is_extend_from_extract (mode,
5589 XEXP (op0, 1),
5590 op1))
5592 return true;
5596 return false;
5599 static bool
5600 aarch64_frint_unspec_p (unsigned int u)
5602 switch (u)
5604 case UNSPEC_FRINTZ:
5605 case UNSPEC_FRINTP:
5606 case UNSPEC_FRINTM:
5607 case UNSPEC_FRINTA:
5608 case UNSPEC_FRINTN:
5609 case UNSPEC_FRINTX:
5610 case UNSPEC_FRINTI:
5611 return true;
5613 default:
5614 return false;
5618 /* Return true iff X is an rtx that will match an extr instruction
5619 i.e. as described in the *extr<mode>5_insn family of patterns.
5620 OP0 and OP1 will be set to the operands of the shifts involved
5621 on success and will be NULL_RTX otherwise. */
5623 static bool
5624 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5626 rtx op0, op1;
5627 machine_mode mode = GET_MODE (x);
5629 *res_op0 = NULL_RTX;
5630 *res_op1 = NULL_RTX;
5632 if (GET_CODE (x) != IOR)
5633 return false;
5635 op0 = XEXP (x, 0);
5636 op1 = XEXP (x, 1);
5638 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5639 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5641 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5642 if (GET_CODE (op1) == ASHIFT)
5643 std::swap (op0, op1);
5645 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5646 return false;
5648 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5649 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5651 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5652 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5654 *res_op0 = XEXP (op0, 0);
5655 *res_op1 = XEXP (op1, 0);
5656 return true;
5660 return false;
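/* For example, in DImode
     (ior (ashift (reg a) (const_int 48)) (lshiftrt (reg b) (const_int 16)))
   is accepted because 48 + 16 == 64; *RES_OP0 is set to a and
   *RES_OP1 to b, and the whole expression maps onto a single EXTR
   instruction.  */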
5663 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5664 storing it in *COST. Result is true if the total cost of the operation
5665 has now been calculated. */
5666 static bool
5667 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5669 rtx inner;
5670 rtx comparator;
5671 enum rtx_code cmpcode;
5673 if (COMPARISON_P (op0))
5675 inner = XEXP (op0, 0);
5676 comparator = XEXP (op0, 1);
5677 cmpcode = GET_CODE (op0);
5679 else
5681 inner = op0;
5682 comparator = const0_rtx;
5683 cmpcode = NE;
5686 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5688 /* Conditional branch. */
5689 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5690 return true;
5691 else
5693 if (cmpcode == NE || cmpcode == EQ)
5695 if (comparator == const0_rtx)
5697 /* TBZ/TBNZ/CBZ/CBNZ. */
5698 if (GET_CODE (inner) == ZERO_EXTRACT)
5699 /* TBZ/TBNZ. */
5700 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5701 0, speed);
5702 else
5703 /* CBZ/CBNZ. */
5704 *cost += rtx_cost (inner, cmpcode, 0, speed);
5706 return true;
5709 else if (cmpcode == LT || cmpcode == GE)
5711 /* TBZ/TBNZ. */
5712 if (comparator == const0_rtx)
5713 return true;
5717 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5719 /* It's a conditional operation based on the status flags,
5720 so it must be some flavor of CSEL. */
5722 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5723 if (GET_CODE (op1) == NEG
5724 || GET_CODE (op1) == NOT
5725 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5726 op1 = XEXP (op1, 0);
5728 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5729 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5730 return true;
5733 /* We don't know what this is, so cost all operands. */
5734 return false;
5737 /* Calculate the cost of calculating X, storing it in *COST. Result
5738 is true if the total cost of the operation has now been calculated. */
5739 static bool
5740 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5741 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5743 rtx op0, op1, op2;
5744 const struct cpu_cost_table *extra_cost
5745 = aarch64_tune_params.insn_extra_cost;
5746 machine_mode mode = GET_MODE (x);
5748 /* By default, assume that everything has equivalent cost to the
5749 cheapest instruction. Any additional costs are applied as a delta
5750 above this default. */
5751 *cost = COSTS_N_INSNS (1);
5753 switch (code)
5755 case SET:
5756 /* The cost depends entirely on the operands to SET. */
5757 *cost = 0;
5758 op0 = SET_DEST (x);
5759 op1 = SET_SRC (x);
5761 switch (GET_CODE (op0))
5763 case MEM:
5764 if (speed)
5766 rtx address = XEXP (op0, 0);
5767 if (VECTOR_MODE_P (mode))
5768 *cost += extra_cost->ldst.storev;
5769 else if (GET_MODE_CLASS (mode) == MODE_INT)
5770 *cost += extra_cost->ldst.store;
5771 else if (mode == SFmode)
5772 *cost += extra_cost->ldst.storef;
5773 else if (mode == DFmode)
5774 *cost += extra_cost->ldst.stored;
5776 *cost +=
5777 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5778 0, speed));
5781 *cost += rtx_cost (op1, SET, 1, speed);
5782 return true;
5784 case SUBREG:
5785 if (! REG_P (SUBREG_REG (op0)))
5786 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5788 /* Fall through. */
5789 case REG:
5790 /* The cost is one per vector-register copied. */
5791 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5793 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5794 / GET_MODE_SIZE (V4SImode);
5795 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5797 /* const0_rtx is in general free, but we will use an
5798 instruction to set a register to 0. */
5799 else if (REG_P (op1) || op1 == const0_rtx)
5801 /* The cost is 1 per register copied. */
5802 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5803 / UNITS_PER_WORD;
5804 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5806 else
5807 /* Cost is just the cost of the RHS of the set. */
5808 *cost += rtx_cost (op1, SET, 1, speed);
5809 return true;
5811 case ZERO_EXTRACT:
5812 case SIGN_EXTRACT:
5813 /* Bit-field insertion. Strip any redundant widening of
5814 the RHS to meet the width of the target. */
5815 if (GET_CODE (op1) == SUBREG)
5816 op1 = SUBREG_REG (op1);
5817 if ((GET_CODE (op1) == ZERO_EXTEND
5818 || GET_CODE (op1) == SIGN_EXTEND)
5819 && CONST_INT_P (XEXP (op0, 1))
5820 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5821 >= INTVAL (XEXP (op0, 1))))
5822 op1 = XEXP (op1, 0);
5824 if (CONST_INT_P (op1))
5826 /* MOV immediate is assumed to always be cheap. */
5827 *cost = COSTS_N_INSNS (1);
5829 else
5831 /* BFM. */
5832 if (speed)
5833 *cost += extra_cost->alu.bfi;
5834 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5837 return true;
5839 default:
5840 /* We can't make sense of this, assume default cost. */
5841 *cost = COSTS_N_INSNS (1);
5842 return false;
5844 return false;
5846 case CONST_INT:
5847 /* If an instruction can incorporate a constant within the
5848 instruction, the instruction's expression avoids calling
5849 rtx_cost() on the constant. If rtx_cost() is called on a
5850 constant, then it is usually because the constant must be
5851 moved into a register by one or more instructions.
5853 The exception is constant 0, which can be expressed
5854 as XZR/WZR and is therefore free. The one case where it is not
5855 free is (set (reg) (const0_rtx)), in which case we must cost
5856 the move. However, we can catch that when we cost the SET, so
5857 we don't need to consider it here. */
5858 if (x == const0_rtx)
5859 *cost = 0;
5860 else
5862 /* To an approximation, building any other constant is
5863 proportionally expensive to the number of instructions
5864 required to build that constant. This is true whether we
5865 are compiling for SPEED or otherwise. */
5866 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5867 (NULL_RTX, x, false, mode));
5869 return true;
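/* For example, const0_rtx is costed as 0 (it is just XZR/WZR), a
   constant such as 0x1234 that fits a single MOVZ costs one
   instruction, and a full 64-bit constant typically needs a MOVZ
   plus up to three MOVKs, so it can cost as much as four
   instructions.  */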
5871 case CONST_DOUBLE:
5872 if (speed)
5874 /* mov[df,sf]_aarch64. */
5875 if (aarch64_float_const_representable_p (x))
5876 /* FMOV (scalar immediate). */
5877 *cost += extra_cost->fp[mode == DFmode].fpconst;
5878 else if (!aarch64_float_const_zero_rtx_p (x))
5880 /* This will be a load from memory. */
5881 if (mode == DFmode)
5882 *cost += extra_cost->ldst.loadd;
5883 else
5884 *cost += extra_cost->ldst.loadf;
5886 else
5887 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5888 or MOV v0.s[0], wzr, neither of which is modeled by the
5889 cost tables. Just use the default cost. */
5894 return true;
5896 case MEM:
5897 if (speed)
5899 /* For loads we want the base cost of a load, plus an
5900 approximation for the additional cost of the addressing
5901 mode. */
5902 rtx address = XEXP (x, 0);
5903 if (VECTOR_MODE_P (mode))
5904 *cost += extra_cost->ldst.loadv;
5905 else if (GET_MODE_CLASS (mode) == MODE_INT)
5906 *cost += extra_cost->ldst.load;
5907 else if (mode == SFmode)
5908 *cost += extra_cost->ldst.loadf;
5909 else if (mode == DFmode)
5910 *cost += extra_cost->ldst.loadd;
5912 *cost +=
5913 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5914 0, speed));
5917 return true;
5919 case NEG:
5920 op0 = XEXP (x, 0);
5922 if (VECTOR_MODE_P (mode))
5924 if (speed)
5926 /* FNEG. */
5927 *cost += extra_cost->vect.alu;
5929 return false;
5932 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5934 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5935 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5937 /* CSETM. */
5938 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5939 return true;
5942 /* Cost this as SUB wzr, X. */
5943 op0 = CONST0_RTX (GET_MODE (x));
5944 op1 = XEXP (x, 0);
5945 goto cost_minus;
5948 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5950 /* Support (neg(fma...)) as a single instruction only if
5951 sign of zeros is unimportant. This matches the decision
5952 making in aarch64.md. */
5953 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5955 /* FNMADD. */
5956 *cost = rtx_cost (op0, NEG, 0, speed);
5957 return true;
5959 if (speed)
5960 /* FNEG. */
5961 *cost += extra_cost->fp[mode == DFmode].neg;
5962 return false;
5965 return false;
5967 case CLRSB:
5968 case CLZ:
5969 if (speed)
5971 if (VECTOR_MODE_P (mode))
5972 *cost += extra_cost->vect.alu;
5973 else
5974 *cost += extra_cost->alu.clz;
5977 return false;
5979 case COMPARE:
5980 op0 = XEXP (x, 0);
5981 op1 = XEXP (x, 1);
5983 if (op1 == const0_rtx
5984 && GET_CODE (op0) == AND)
5986 x = op0;
5987 goto cost_logic;
5990 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5992 /* TODO: A write to the CC flags possibly costs extra; this
5993 needs encoding in the cost tables. */
5995 /* CC_ZESWPmode supports zero extend for free. */
5996 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5997 op0 = XEXP (op0, 0);
5999 /* ANDS. */
6000 if (GET_CODE (op0) == AND)
6002 x = op0;
6003 goto cost_logic;
6006 if (GET_CODE (op0) == PLUS)
6008 /* ADDS (and CMN alias). */
6009 x = op0;
6010 goto cost_plus;
6013 if (GET_CODE (op0) == MINUS)
6015 /* SUBS. */
6016 x = op0;
6017 goto cost_minus;
6020 if (GET_CODE (op1) == NEG)
6022 /* CMN. */
6023 if (speed)
6024 *cost += extra_cost->alu.arith;
6026 *cost += rtx_cost (op0, COMPARE, 0, speed);
6027 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
6028 return true;
6031 /* CMP.
6033 Compare can freely swap the order of operands, and
6034 canonicalization puts the more complex operation first.
6035 But the integer MINUS logic expects the shift/extend
6036 operation in op1. */
6037 if (! (REG_P (op0)
6038 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6040 op0 = XEXP (x, 1);
6041 op1 = XEXP (x, 0);
6043 goto cost_minus;
6046 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6048 /* FCMP. */
6049 if (speed)
6050 *cost += extra_cost->fp[mode == DFmode].compare;
6052 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6054 *cost += rtx_cost (op0, COMPARE, 0, speed);
6055 /* FCMP supports constant 0.0 for no extra cost. */
6056 return true;
6058 return false;
6061 if (VECTOR_MODE_P (mode))
6063 /* Vector compare. */
6064 if (speed)
6065 *cost += extra_cost->vect.alu;
6067 if (aarch64_float_const_zero_rtx_p (op1))
6069 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6070 cost. */
6071 return true;
6073 return false;
6075 return false;
6077 case MINUS:
6079 op0 = XEXP (x, 0);
6080 op1 = XEXP (x, 1);
6082 cost_minus:
6083 *cost += rtx_cost (op0, MINUS, 0, speed);
6085 /* Detect valid immediates. */
6086 if ((GET_MODE_CLASS (mode) == MODE_INT
6087 || (GET_MODE_CLASS (mode) == MODE_CC
6088 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6089 && CONST_INT_P (op1)
6090 && aarch64_uimm12_shift (INTVAL (op1)))
6092 if (speed)
6093 /* SUB(S) (immediate). */
6094 *cost += extra_cost->alu.arith;
6095 return true;
6098 /* Look for SUB (extended register). */
6099 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6101 if (speed)
6102 *cost += extra_cost->alu.extend_arith;
6104 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
6105 (enum rtx_code) GET_CODE (op1),
6106 0, speed);
6107 return true;
6110 rtx new_op1 = aarch64_strip_extend (op1);
6112 /* Cost this as an FMA-alike operation. */
6113 if ((GET_CODE (new_op1) == MULT
6114 || aarch64_shift_p (GET_CODE (new_op1)))
6115 && code != COMPARE)
6117 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6118 (enum rtx_code) code,
6119 speed);
6120 return true;
6123 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6125 if (speed)
6127 if (VECTOR_MODE_P (mode))
6129 /* Vector SUB. */
6130 *cost += extra_cost->vect.alu;
6132 else if (GET_MODE_CLASS (mode) == MODE_INT)
6134 /* SUB(S). */
6135 *cost += extra_cost->alu.arith;
6137 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6139 /* FSUB. */
6140 *cost += extra_cost->fp[mode == DFmode].addsub;
6143 return true;
6146 case PLUS:
6148 rtx new_op0;
6150 op0 = XEXP (x, 0);
6151 op1 = XEXP (x, 1);
6153 cost_plus:
6154 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6155 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6157 /* CSINC. */
6158 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6159 *cost += rtx_cost (op1, PLUS, 1, speed);
6160 return true;
6163 if (GET_MODE_CLASS (mode) == MODE_INT
6164 && CONST_INT_P (op1)
6165 && aarch64_uimm12_shift (INTVAL (op1)))
6167 *cost += rtx_cost (op0, PLUS, 0, speed);
6169 if (speed)
6170 /* ADD (immediate). */
6171 *cost += extra_cost->alu.arith;
6172 return true;
6175 *cost += rtx_cost (op1, PLUS, 1, speed);
6177 /* Look for ADD (extended register). */
6178 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6180 if (speed)
6181 *cost += extra_cost->alu.extend_arith;
6183 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6184 (enum rtx_code) GET_CODE (op0),
6185 0, speed);
6186 return true;
6189 /* Strip any extend, leave shifts behind as we will
6190 cost them through mult_cost. */
6191 new_op0 = aarch64_strip_extend (op0);
6193 if (GET_CODE (new_op0) == MULT
6194 || aarch64_shift_p (GET_CODE (new_op0)))
6196 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6197 speed);
6198 return true;
6201 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6203 if (speed)
6205 if (VECTOR_MODE_P (mode))
6207 /* Vector ADD. */
6208 *cost += extra_cost->vect.alu;
6210 else if (GET_MODE_CLASS (mode) == MODE_INT)
6212 /* ADD. */
6213 *cost += extra_cost->alu.arith;
6215 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6217 /* FADD. */
6218 *cost += extra_cost->fp[mode == DFmode].addsub;
6221 return true;
6224 case BSWAP:
6225 *cost = COSTS_N_INSNS (1);
6227 if (speed)
6229 if (VECTOR_MODE_P (mode))
6230 *cost += extra_cost->vect.alu;
6231 else
6232 *cost += extra_cost->alu.rev;
6234 return false;
6236 case IOR:
6237 if (aarch_rev16_p (x))
6239 *cost = COSTS_N_INSNS (1);
6241 if (speed)
6243 if (VECTOR_MODE_P (mode))
6244 *cost += extra_cost->vect.alu;
6245 else
6246 *cost += extra_cost->alu.rev;
6248 return true;
6251 if (aarch64_extr_rtx_p (x, &op0, &op1))
6253 *cost += rtx_cost (op0, IOR, 0, speed)
6254 + rtx_cost (op1, IOR, 1, speed);
6255 if (speed)
6256 *cost += extra_cost->alu.shift;
6258 return true;
6260 /* Fall through. */
6261 case XOR:
6262 case AND:
6263 cost_logic:
6264 op0 = XEXP (x, 0);
6265 op1 = XEXP (x, 1);
6267 if (VECTOR_MODE_P (mode))
6269 if (speed)
6270 *cost += extra_cost->vect.alu;
6271 return true;
6274 if (code == AND
6275 && GET_CODE (op0) == MULT
6276 && CONST_INT_P (XEXP (op0, 1))
6277 && CONST_INT_P (op1)
6278 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6279 INTVAL (op1)) != 0)
6281 /* This is a UBFM/SBFM. */
6282 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6283 if (speed)
6284 *cost += extra_cost->alu.bfx;
6285 return true;
6288 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6290 /* We possibly get the immediate for free; this is not
6291 modelled. */
6292 if (CONST_INT_P (op1)
6293 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6295 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6297 if (speed)
6298 *cost += extra_cost->alu.logical;
6300 return true;
6302 else
6304 rtx new_op0 = op0;
6306 /* Handle ORN, EON, or BIC. */
6307 if (GET_CODE (op0) == NOT)
6308 op0 = XEXP (op0, 0);
6310 new_op0 = aarch64_strip_shift (op0);
6312 /* If we had a shift on op0 then this is a logical-shift-
6313 by-register/immediate operation. Otherwise, this is just
6314 a logical operation. */
6315 if (speed)
6317 if (new_op0 != op0)
6319 /* Shift by immediate. */
6320 if (CONST_INT_P (XEXP (op0, 1)))
6321 *cost += extra_cost->alu.log_shift;
6322 else
6323 *cost += extra_cost->alu.log_shift_reg;
6325 else
6326 *cost += extra_cost->alu.logical;
6329 /* In both cases we want to cost both operands. */
6330 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6331 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6333 return true;
6336 return false;
6338 case NOT:
6339 x = XEXP (x, 0);
6340 op0 = aarch64_strip_shift (x);
6342 if (VECTOR_MODE_P (mode))
6344 /* Vector NOT. */
6345 *cost += extra_cost->vect.alu;
6346 return false;
6349 /* MVN-shifted-reg. */
6350 if (op0 != x)
6352 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6354 if (speed)
6355 *cost += extra_cost->alu.log_shift;
6357 return true;
6359 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6360 Handle the second form here taking care that 'a' in the above can
6361 be a shift. */
6362 else if (GET_CODE (op0) == XOR)
6364 rtx newop0 = XEXP (op0, 0);
6365 rtx newop1 = XEXP (op0, 1);
6366 rtx op0_stripped = aarch64_strip_shift (newop0);
6368 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6369 + rtx_cost (op0_stripped, XOR, 0, speed);
6371 if (speed)
6373 if (op0_stripped != newop0)
6374 *cost += extra_cost->alu.log_shift;
6375 else
6376 *cost += extra_cost->alu.logical;
6379 return true;
6381 /* MVN. */
6382 if (speed)
6383 *cost += extra_cost->alu.logical;
6385 return false;
6387 case ZERO_EXTEND:
6389 op0 = XEXP (x, 0);
6390 /* If a value is written in SI mode, then zero extended to DI
6391 mode, the operation will in general be free as a write to
6392 a 'w' register implicitly zeroes the upper bits of an 'x'
6393 register. However, if this is
6395 (set (reg) (zero_extend (reg)))
6397 we must cost the explicit register move. */
6398 if (mode == DImode
6399 && GET_MODE (op0) == SImode
6400 && outer == SET)
6402 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6404 if (!op_cost && speed)
6405 /* MOV. */
6406 *cost += extra_cost->alu.extend;
6407 else
6408 /* Free, the cost is that of the SI mode operation. */
6409 *cost = op_cost;
6411 return true;
6413 else if (MEM_P (XEXP (x, 0)))
6415 /* All loads can zero extend to any size for free. */
6416 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6417 return true;
6420 if (speed)
6422 if (VECTOR_MODE_P (mode))
6424 /* UMOV. */
6425 *cost += extra_cost->vect.alu;
6427 else
6429 /* UXTB/UXTH. */
6430 *cost += extra_cost->alu.extend;
6433 return false;
6435 case SIGN_EXTEND:
6436 if (MEM_P (XEXP (x, 0)))
6438 /* LDRSH. */
6439 if (speed)
6441 rtx address = XEXP (XEXP (x, 0), 0);
6442 *cost += extra_cost->ldst.load_sign_extend;
6444 *cost +=
6445 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6446 0, speed));
6448 return true;
6451 if (speed)
6453 if (VECTOR_MODE_P (mode))
6454 *cost += extra_cost->vect.alu;
6455 else
6456 *cost += extra_cost->alu.extend;
6458 return false;
6460 case ASHIFT:
6461 op0 = XEXP (x, 0);
6462 op1 = XEXP (x, 1);
6464 if (CONST_INT_P (op1))
6466 if (speed)
6468 if (VECTOR_MODE_P (mode))
6470 /* Vector shift (immediate). */
6471 *cost += extra_cost->vect.alu;
6473 else
6475 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6476 aliases. */
6477 *cost += extra_cost->alu.shift;
6481 /* We can incorporate zero/sign extend for free. */
6482 if (GET_CODE (op0) == ZERO_EXTEND
6483 || GET_CODE (op0) == SIGN_EXTEND)
6484 op0 = XEXP (op0, 0);
6486 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6487 return true;
6489 else
6491 if (speed)
6493 if (VECTOR_MODE_P (mode))
6495 /* Vector shift (register). */
6496 *cost += extra_cost->vect.alu;
6498 else
6500 /* LSLV. */
6501 *cost += extra_cost->alu.shift_reg;
6504 return false; /* All arguments need to be in registers. */
6507 case ROTATE:
6508 case ROTATERT:
6509 case LSHIFTRT:
6510 case ASHIFTRT:
6511 op0 = XEXP (x, 0);
6512 op1 = XEXP (x, 1);
6514 if (CONST_INT_P (op1))
6516 /* ASR (immediate) and friends. */
6517 if (speed)
6519 if (VECTOR_MODE_P (mode))
6520 *cost += extra_cost->vect.alu;
6521 else
6522 *cost += extra_cost->alu.shift;
6525 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6526 return true;
6528 else
6531 /* ASR (register) and friends. */
6532 if (speed)
6534 if (VECTOR_MODE_P (mode))
6535 *cost += extra_cost->vect.alu;
6536 else
6537 *cost += extra_cost->alu.shift_reg;
6539 return false; /* All arguments need to be in registers. */
6542 case SYMBOL_REF:
6544 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6545 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6547 /* LDR. */
6548 if (speed)
6549 *cost += extra_cost->ldst.load;
6551 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6552 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6554 /* ADRP, followed by ADD. */
6555 *cost += COSTS_N_INSNS (1);
6556 if (speed)
6557 *cost += 2 * extra_cost->alu.arith;
6559 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6560 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6562 /* ADR. */
6563 if (speed)
6564 *cost += extra_cost->alu.arith;
6567 if (flag_pic)
6569 /* One extra load instruction, after accessing the GOT. */
6570 *cost += COSTS_N_INSNS (1);
6571 if (speed)
6572 *cost += extra_cost->ldst.load;
6574 return true;
6576 case HIGH:
6577 case LO_SUM:
6578 /* ADRP/ADD (immediate). */
6579 if (speed)
6580 *cost += extra_cost->alu.arith;
6581 return true;
6583 case ZERO_EXTRACT:
6584 case SIGN_EXTRACT:
6585 /* UBFX/SBFX. */
6586 if (speed)
6588 if (VECTOR_MODE_P (mode))
6589 *cost += extra_cost->vect.alu;
6590 else
6591 *cost += extra_cost->alu.bfx;
6594 /* We can trust that the immediates used will be correct (there
6595 are no by-register forms), so we need only cost op0. */
6596 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6597 return true;
6599 case MULT:
6600 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6601 /* aarch64_rtx_mult_cost always handles recursion to its
6602 operands. */
6603 return true;
6605 case MOD:
6606 case UMOD:
6607 if (speed)
6609 if (VECTOR_MODE_P (mode))
6610 *cost += extra_cost->vect.alu;
6611 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6612 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6613 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6614 else if (GET_MODE (x) == DFmode)
6615 *cost += (extra_cost->fp[1].mult
6616 + extra_cost->fp[1].div);
6617 else if (GET_MODE (x) == SFmode)
6618 *cost += (extra_cost->fp[0].mult
6619 + extra_cost->fp[0].div);
6621 return false; /* All arguments need to be in registers. */
6623 case DIV:
6624 case UDIV:
6625 case SQRT:
6626 if (speed)
6628 if (VECTOR_MODE_P (mode))
6629 *cost += extra_cost->vect.alu;
6630 else if (GET_MODE_CLASS (mode) == MODE_INT)
6631 /* There is no integer SQRT, so only DIV and UDIV can get
6632 here. */
6633 *cost += extra_cost->mult[mode == DImode].idiv;
6634 else
6635 *cost += extra_cost->fp[mode == DFmode].div;
6637 return false; /* All arguments need to be in registers. */
6639 case IF_THEN_ELSE:
6640 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6641 XEXP (x, 2), cost, speed);
6643 case EQ:
6644 case NE:
6645 case GT:
6646 case GTU:
6647 case LT:
6648 case LTU:
6649 case GE:
6650 case GEU:
6651 case LE:
6652 case LEU:
6654 return false; /* All arguments must be in registers. */
6656 case FMA:
6657 op0 = XEXP (x, 0);
6658 op1 = XEXP (x, 1);
6659 op2 = XEXP (x, 2);
6661 if (speed)
6663 if (VECTOR_MODE_P (mode))
6664 *cost += extra_cost->vect.alu;
6665 else
6666 *cost += extra_cost->fp[mode == DFmode].fma;
6669 /* FMSUB, FNMADD, and FNMSUB are free. */
6670 if (GET_CODE (op0) == NEG)
6671 op0 = XEXP (op0, 0);
6673 if (GET_CODE (op2) == NEG)
6674 op2 = XEXP (op2, 0);
6676 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6677 and the by-element operand as operand 0. */
6678 if (GET_CODE (op1) == NEG)
6679 op1 = XEXP (op1, 0);
6681 /* Catch vector-by-element operations. The by-element operand can
6682 either be (vec_duplicate (vec_select (x))) or just
6683 (vec_select (x)), depending on whether we are multiplying by
6684 a vector or a scalar.
6686 Canonicalization is not very good in these cases: FMA4 will put the
6687 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6688 if (GET_CODE (op0) == VEC_DUPLICATE)
6689 op0 = XEXP (op0, 0);
6690 else if (GET_CODE (op1) == VEC_DUPLICATE)
6691 op1 = XEXP (op1, 0);
6693 if (GET_CODE (op0) == VEC_SELECT)
6694 op0 = XEXP (op0, 0);
6695 else if (GET_CODE (op1) == VEC_SELECT)
6696 op1 = XEXP (op1, 0);
6698 /* If the remaining parameters are not registers,
6699 get the cost to put them into registers. */
6700 *cost += rtx_cost (op0, FMA, 0, speed);
6701 *cost += rtx_cost (op1, FMA, 1, speed);
6702 *cost += rtx_cost (op2, FMA, 2, speed);
6703 return true;
6705 case FLOAT:
6706 case UNSIGNED_FLOAT:
6707 if (speed)
6708 *cost += extra_cost->fp[mode == DFmode].fromint;
6709 return false;
6711 case FLOAT_EXTEND:
6712 if (speed)
6714 if (VECTOR_MODE_P (mode))
6716 /* Vector widening conversion. */
6717 *cost += extra_cost->vect.alu;
6719 else
6720 *cost += extra_cost->fp[mode == DFmode].widen;
6722 return false;
6724 case FLOAT_TRUNCATE:
6725 if (speed)
6727 if (VECTOR_MODE_P (mode))
6729 /* Vector narrowing conversion. */
6730 *cost += extra_cost->vect.alu;
6732 else
6733 *cost += extra_cost->fp[mode == DFmode].narrow;
6735 return false;
6737 case FIX:
6738 case UNSIGNED_FIX:
6739 x = XEXP (x, 0);
6740 /* Strip the rounding part. They will all be implemented
6741 by the fcvt* family of instructions anyway. */
6742 if (GET_CODE (x) == UNSPEC)
6744 unsigned int uns_code = XINT (x, 1);
6746 if (uns_code == UNSPEC_FRINTA
6747 || uns_code == UNSPEC_FRINTM
6748 || uns_code == UNSPEC_FRINTN
6749 || uns_code == UNSPEC_FRINTP
6750 || uns_code == UNSPEC_FRINTZ)
6751 x = XVECEXP (x, 0, 0);
6754 if (speed)
6756 if (VECTOR_MODE_P (mode))
6757 *cost += extra_cost->vect.alu;
6758 else
6759 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6761 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6762 return true;
6764 case ABS:
6765 if (VECTOR_MODE_P (mode))
6767 /* ABS (vector). */
6768 if (speed)
6769 *cost += extra_cost->vect.alu;
6771 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6773 op0 = XEXP (x, 0);
6775 /* FABD, which is analogous to FADD. */
6776 if (GET_CODE (op0) == MINUS)
6778 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6779 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6780 if (speed)
6781 *cost += extra_cost->fp[mode == DFmode].addsub;
6783 return true;
6785 /* Simple FABS is analogous to FNEG. */
6786 if (speed)
6787 *cost += extra_cost->fp[mode == DFmode].neg;
6789 else
6791 /* Integer ABS will either be split to
6792 two arithmetic instructions, or will be an ABS
6793 (scalar), which we don't model. */
6794 *cost = COSTS_N_INSNS (2);
6795 if (speed)
6796 *cost += 2 * extra_cost->alu.arith;
6798 return false;
6800 case SMAX:
6801 case SMIN:
6802 if (speed)
6804 if (VECTOR_MODE_P (mode))
6805 *cost += extra_cost->vect.alu;
6806 else
6808 /* FMAXNM/FMINNM/FMAX/FMIN.
6809 TODO: This may not be accurate for all implementations, but
6810 we do not model this in the cost tables. */
6811 *cost += extra_cost->fp[mode == DFmode].addsub;
6814 return false;
6816 case UNSPEC:
6817 /* The floating point round to integer frint* instructions. */
6818 if (aarch64_frint_unspec_p (XINT (x, 1)))
6820 if (speed)
6821 *cost += extra_cost->fp[mode == DFmode].roundint;
6823 return false;
6826 if (XINT (x, 1) == UNSPEC_RBIT)
6828 if (speed)
6829 *cost += extra_cost->alu.rev;
6831 return false;
6833 break;
6835 case TRUNCATE:
6837 /* Decompose <su>muldi3_highpart. */
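/* E.g. the signed form matched below looks roughly like
   (truncate:DI
     (lshiftrt:TI
       (mult:TI (sign_extend:TI (reg:DI)) (sign_extend:TI (reg:DI)))
       (const_int 64)))
   which is a single SMULH.  */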
6838 if (/* (truncate:DI */
6839 mode == DImode
6840 /* (lshiftrt:TI */
6841 && GET_MODE (XEXP (x, 0)) == TImode
6842 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6843 /* (mult:TI */
6844 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6845 /* (ANY_EXTEND:TI (reg:DI))
6846 (ANY_EXTEND:TI (reg:DI))) */
6847 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6848 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6849 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6850 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6851 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6852 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6853 /* (const_int 64) */
6854 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6855 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6857 /* UMULH/SMULH. */
6858 if (speed)
6859 *cost += extra_cost->mult[mode == DImode].extend;
6860 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6861 MULT, 0, speed);
6862 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6863 MULT, 1, speed);
6864 return true;
6867 /* Fall through. */
6868 default:
6869 break;
6872 if (dump_file && (dump_flags & TDF_DETAILS))
6873 fprintf (dump_file,
6874 "\nFailed to cost RTX. Assuming default cost.\n");
6876 return true;
6879 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6880 calculated for X. This cost is stored in *COST. Returns true
6881 if the total cost of X was calculated. */
6882 static bool
6883 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6884 int param, int *cost, bool speed)
6886 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6888 if (dump_file && (dump_flags & TDF_DETAILS))
6890 print_rtl_single (dump_file, x);
6891 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6892 speed ? "Hot" : "Cold",
6893 *cost, result ? "final" : "partial");
6896 return result;
6899 static int
6900 aarch64_register_move_cost (machine_mode mode,
6901 reg_class_t from_i, reg_class_t to_i)
6903 enum reg_class from = (enum reg_class) from_i;
6904 enum reg_class to = (enum reg_class) to_i;
6905 const struct cpu_regmove_cost *regmove_cost
6906 = aarch64_tune_params.regmove_cost;
6908 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6909 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6910 to = GENERAL_REGS;
6912 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6913 from = GENERAL_REGS;
6915 /* Moving between GPR and stack cost is the same as GP2GP. */
6916 if ((from == GENERAL_REGS && to == STACK_REG)
6917 || (to == GENERAL_REGS && from == STACK_REG))
6918 return regmove_cost->GP2GP;
6920 /* To/From the stack register, we move via the gprs. */
6921 if (to == STACK_REG || from == STACK_REG)
6922 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6923 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6925 if (GET_MODE_SIZE (mode) == 16)
6927 /* 128-bit operations on general registers require 2 instructions. */
6928 if (from == GENERAL_REGS && to == GENERAL_REGS)
6929 return regmove_cost->GP2GP * 2;
6930 else if (from == GENERAL_REGS)
6931 return regmove_cost->GP2FP * 2;
6932 else if (to == GENERAL_REGS)
6933 return regmove_cost->FP2GP * 2;
6935 /* When AdvSIMD instructions are disabled it is not possible to move
6936 a 128-bit value directly between Q registers. This is handled in
6937 secondary reload. A general register is used as a scratch to move
6938 the upper DI value and the lower DI value is moved directly,
6939 hence the cost is the sum of three moves. */
6940 if (! TARGET_SIMD)
6941 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6943 return regmove_cost->FP2FP;
6946 if (from == GENERAL_REGS && to == GENERAL_REGS)
6947 return regmove_cost->GP2GP;
6948 else if (from == GENERAL_REGS)
6949 return regmove_cost->GP2FP;
6950 else if (to == GENERAL_REGS)
6951 return regmove_cost->FP2GP;
6953 return regmove_cost->FP2FP;
6956 static int
6957 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6958 reg_class_t rclass ATTRIBUTE_UNUSED,
6959 bool in ATTRIBUTE_UNUSED)
6961 return aarch64_tune_params.memmov_cost;
6964 /* Return the number of instructions that can be issued per cycle. */
6965 static int
6966 aarch64_sched_issue_rate (void)
6968 return aarch64_tune_params.issue_rate;
6971 static int
6972 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6974 int issue_rate = aarch64_sched_issue_rate ();
6976 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6979 /* Vectorizer cost model target hooks. */
6981 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6982 static int
6983 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6984 tree vectype,
6985 int misalign ATTRIBUTE_UNUSED)
6987 unsigned elements;
6989 switch (type_of_cost)
6991 case scalar_stmt:
6992 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
6994 case scalar_load:
6995 return aarch64_tune_params.vec_costs->scalar_load_cost;
6997 case scalar_store:
6998 return aarch64_tune_params.vec_costs->scalar_store_cost;
7000 case vector_stmt:
7001 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7003 case vector_load:
7004 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7006 case vector_store:
7007 return aarch64_tune_params.vec_costs->vec_store_cost;
7009 case vec_to_scalar:
7010 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7012 case scalar_to_vec:
7013 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7015 case unaligned_load:
7016 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7018 case unaligned_store:
7019 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7021 case cond_branch_taken:
7022 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7024 case cond_branch_not_taken:
7025 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7027 case vec_perm:
7028 case vec_promote_demote:
7029 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7031 case vec_construct:
7032 elements = TYPE_VECTOR_SUBPARTS (vectype);
7033 return elements / 2 + 1;
7035 default:
7036 gcc_unreachable ();
7040 /* Implement targetm.vectorize.add_stmt_cost. */
7041 static unsigned
7042 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7043 struct _stmt_vec_info *stmt_info, int misalign,
7044 enum vect_cost_model_location where)
7046 unsigned *cost = (unsigned *) data;
7047 unsigned retval = 0;
7049 if (flag_vect_cost_model)
7051 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7052 int stmt_cost =
7053 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7055 /* Statements in an inner loop relative to the loop being
7056 vectorized are weighted more heavily. The value here is
7057 a function (linear for now) of the loop nest level. */
7058 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7060 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
7061 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
7062 unsigned nest_level = loop_depth (loop);
7064 count *= nest_level;
7067 retval = (unsigned) (count * stmt_cost);
7068 cost[where] += retval;
7071 return retval;
7074 static void initialize_aarch64_code_model (void);
7076 /* Parse the architecture extension string. */
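/* For example, with -march=armv8-a+crc+nocrypto this is called on the
   substring "+crc+nocrypto": "+crc" enables the CRC extension and
   "+nocrypto" disables the crypto extension.  (The names here are only
   illustrative; the full list lives in aarch64-option-extensions.def.)  */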
7078 static void
7079 aarch64_parse_extension (char *str)
7081 /* The extension string is parsed left to right. */
7082 const struct aarch64_option_extension *opt = NULL;
7084 /* Flag to say whether we are adding or removing an extension. */
7085 int adding_ext = -1;
7087 while (str != NULL && *str != 0)
7089 char *ext;
7090 size_t len;
7092 str++;
7093 ext = strchr (str, '+');
7095 if (ext != NULL)
7096 len = ext - str;
7097 else
7098 len = strlen (str);
7100 if (len >= 2 && strncmp (str, "no", 2) == 0)
7102 adding_ext = 0;
7103 len -= 2;
7104 str += 2;
7106 else if (len > 0)
7107 adding_ext = 1;
7109 if (len == 0)
7111 error ("missing feature modifier after %qs", adding_ext ? "+"
7112 : "+no");
7113 return;
7116 /* Scan over the extensions table trying to find an exact match. */
7117 for (opt = all_extensions; opt->name != NULL; opt++)
7119 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7121 /* Add or remove the extension. */
7122 if (adding_ext)
7123 aarch64_isa_flags |= opt->flags_on;
7124 else
7125 aarch64_isa_flags &= ~(opt->flags_off);
7126 break;
7130 if (opt->name == NULL)
7132 /* Extension not found in list. */
7133 error ("unknown feature modifier %qs", str);
7134 return;
7137 str = ext;
7140 return;
7143 /* Parse the ARCH string. */
7145 static void
7146 aarch64_parse_arch (void)
7148 char *ext;
7149 const struct processor *arch;
7150 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7151 size_t len;
7153 strcpy (str, aarch64_arch_string);
7155 ext = strchr (str, '+');
7157 if (ext != NULL)
7158 len = ext - str;
7159 else
7160 len = strlen (str);
7162 if (len == 0)
7164 error ("missing arch name in -march=%qs", str);
7165 return;
7168 /* Loop through the list of supported ARCHs to find a match. */
7169 for (arch = all_architectures; arch->name != NULL; arch++)
7171 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7173 selected_arch = arch;
7174 aarch64_isa_flags = selected_arch->flags;
7176 if (!selected_cpu)
7177 selected_cpu = &all_cores[selected_arch->core];
7179 if (ext != NULL)
7181 /* ARCH string contains at least one extension. */
7182 aarch64_parse_extension (ext);
7185 if (strcmp (selected_arch->arch, selected_cpu->arch))
7187 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7188 selected_cpu->name, selected_arch->name);
7191 return;
7195 /* ARCH name not found in list. */
7196 error ("unknown value %qs for -march", str);
7197 return;
7200 /* Parse the CPU string. */
7202 static void
7203 aarch64_parse_cpu (void)
7205 char *ext;
7206 const struct processor *cpu;
7207 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7208 size_t len;
7210 strcpy (str, aarch64_cpu_string);
7212 ext = strchr (str, '+');
7214 if (ext != NULL)
7215 len = ext - str;
7216 else
7217 len = strlen (str);
7219 if (len == 0)
7221 error ("missing cpu name in -mcpu=%qs", str);
7222 return;
7225 /* Loop through the list of supported CPUs to find a match. */
7226 for (cpu = all_cores; cpu->name != NULL; cpu++)
7228 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7230 selected_cpu = cpu;
7231 aarch64_isa_flags = selected_cpu->flags;
7233 if (ext != NULL)
7235 /* CPU string contains at least one extension. */
7236 aarch64_parse_extension (ext);
7239 return;
7243 /* CPU name not found in list. */
7244 error ("unknown value %qs for -mcpu", str);
7245 return;
7248 /* Parse the TUNE string. */
7250 static void
7251 aarch64_parse_tune (void)
7253 const struct processor *cpu;
7254 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7255 strcpy (str, aarch64_tune_string);
7257 /* Loop through the list of supported CPUs to find a match. */
7258 for (cpu = all_cores; cpu->name != NULL; cpu++)
7260 if (strcmp (cpu->name, str) == 0)
7262 selected_tune = cpu;
7263 return;
7267 /* CPU name not found in list. */
7268 error ("unknown value %qs for -mtune", str);
7269 return;
7272 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7273 described in FLAG. If it is, return the index bit for that fusion type.
7274 If not, error (printing OPTION_NAME) and return zero. */
7276 static unsigned int
7277 aarch64_parse_one_option_token (const char *token,
7278 size_t length,
7279 const struct aarch64_flag_desc *flag,
7280 const char *option_name)
7282 for (; flag->name != NULL; flag++)
7284 if (length == strlen (flag->name)
7285 && !strncmp (flag->name, token, length))
7286 return flag->flag;
7289 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7290 return 0;
7293 /* Parse OPTION which is a comma-separated list of flags to enable.
7294 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7295 default state we inherit from the CPU tuning structures. OPTION_NAME
7296 gives the top-level option we are parsing in the -moverride string,
7297 for use in error messages. */
7299 static unsigned int
7300 aarch64_parse_boolean_options (const char *option,
7301 const struct aarch64_flag_desc *flags,
7302 unsigned int initial_state,
7303 const char *option_name)
7305 const char separator = '.';
7306 const char* specs = option;
7307 const char* ntoken = option;
7308 unsigned int found_flags = initial_state;
7310 while ((ntoken = strchr (specs, separator)))
7312 size_t token_length = ntoken - specs;
7313 unsigned token_ops = aarch64_parse_one_option_token (specs,
7314 token_length,
7315 flags,
7316 option_name);
7317 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7318 in the token stream, reset the supported operations. So:
7320 adrp+add.cmp+branch.none.adrp+add
7322 would have the result of turning on only adrp+add fusion. */
7323 if (!token_ops)
7324 found_flags = 0;
7326 found_flags |= token_ops;
7327 specs = ++ntoken;
7330 /* The string ended with a trailing separator; diagnose the ill-formed string. */
7331 if (!(*specs))
7333 error ("%s string ill-formed\n", option_name);
7334 return 0;
7337 /* We still have one more token to parse. */
7338 size_t token_length = strlen (specs);
7339 unsigned token_ops = aarch64_parse_one_option_token (specs,
7340 token_length,
7341 flags,
7342 option_name);
7343 if (!token_ops)
7344 found_flags = 0;
7346 found_flags |= token_ops;
7347 return found_flags;
7350 /* Support for overriding instruction fusion. */
7352 static void
7353 aarch64_parse_fuse_string (const char *fuse_string,
7354 struct tune_params *tune)
7356 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7357 aarch64_fusible_pairs,
7358 tune->fusible_ops,
7359 "fuse=");
7362 /* Support for overriding other tuning flags. */
7364 static void
7365 aarch64_parse_tune_string (const char *tune_string,
7366 struct tune_params *tune)
7368 tune->extra_tuning_flags
7369 = aarch64_parse_boolean_options (tune_string,
7370 aarch64_tuning_flags,
7371 tune->extra_tuning_flags,
7372 "tune=");
7375 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7376 we understand. If it is, extract the option string and hand off to
7377 the appropriate function. */
7379 void
7380 aarch64_parse_one_override_token (const char* token,
7381 size_t length,
7382 struct tune_params *tune)
7384 const struct aarch64_tuning_override_function *fn
7385 = aarch64_tuning_override_functions;
7387 const char *option_part = strchr (token, '=');
7388 if (!option_part)
7390 error ("tuning string missing in option (%s)", token);
7391 return;
7394 /* Get the length of the option name. */
7395 length = option_part - token;
7396 /* Skip the '=' to get to the option string. */
7397 option_part++;
7399 for (; fn->name != NULL; fn++)
7401 if (!strncmp (fn->name, token, length))
7403 fn->parse_override (option_part, tune);
7404 return;
7408 error ("unknown tuning option (%s)",token);
7409 return;
7412 /* Parse STRING looking for options in the format:
7413 string :: option:string
7414 option :: name=substring
7415 name :: {a-z}
7416 substring :: defined by option. */
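/* For example, -moverride=fuse=adrp+add.cmp+branch is split at ':' into
   a single "fuse=..." option, whose value is then handled by
   aarch64_parse_fuse_string above.  */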
7418 static void
7419 aarch64_parse_override_string (const char* input_string,
7420 struct tune_params* tune)
7422 const char separator = ':';
7423 size_t string_length = strlen (input_string) + 1;
7424 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7425 char *string = string_root;
7426 strncpy (string, input_string, string_length);
7427 string[string_length - 1] = '\0';
7429 char* ntoken = string;
7431 while ((ntoken = strchr (string, separator)))
7433 size_t token_length = ntoken - string;
7434 /* Make this substring look like a string. */
7435 *ntoken = '\0';
7436 aarch64_parse_one_override_token (string, token_length, tune);
7437 string = ++ntoken;
7440 /* One last option to parse. */
7441 aarch64_parse_one_override_token (string, strlen (string), tune);
7442 free (string_root);
7445 /* Implement TARGET_OPTION_OVERRIDE. */
7447 static void
7448 aarch64_override_options (void)
7450 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7451 If either of -march or -mtune is given, they override their
7452 respective component of -mcpu.
7454 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7455 with -march: if -mcpu is not present on the command line, -march
7456 must set a sensible default CPU. */
7457 if (aarch64_cpu_string)
7459 aarch64_parse_cpu ();
7462 if (aarch64_arch_string)
7464 aarch64_parse_arch ();
7467 if (aarch64_tune_string)
7469 aarch64_parse_tune ();
7472 #ifndef HAVE_AS_MABI_OPTION
7473 /* The compiler may have been configured with 2.23.* binutils, which does
7474 not have support for ILP32. */
7475 if (TARGET_ILP32)
7476 error ("Assembler does not support -mabi=ilp32");
7477 #endif
7479 initialize_aarch64_code_model ();
7481 aarch64_build_bitmask_table ();
7483 /* This target defaults to strict volatile bitfields. */
7484 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7485 flag_strict_volatile_bitfields = 1;
7487 /* If the user did not specify a processor, choose the default
7488 one for them. This will be the CPU set during configuration using
7489 --with-cpu; otherwise it is "generic". */
7490 if (!selected_cpu)
7492 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7493 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7496 gcc_assert (selected_cpu);
7498 if (!selected_tune)
7499 selected_tune = selected_cpu;
7501 aarch64_tune_flags = selected_tune->flags;
7502 aarch64_tune = selected_tune->core;
7503 /* Make a copy of the tuning parameters attached to the core, which
7504 we may later overwrite. */
7505 aarch64_tune_params = *(selected_tune->tune);
7506 aarch64_architecture_version = selected_cpu->architecture_version;
7508 if (aarch64_override_tune_string)
7509 aarch64_parse_override_string (aarch64_override_tune_string,
7510 &aarch64_tune_params);
7512 if (aarch64_fix_a53_err835769 == 2)
7514 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7515 aarch64_fix_a53_err835769 = 1;
7516 #else
7517 aarch64_fix_a53_err835769 = 0;
7518 #endif
7521 aarch64_register_fma_steering ();
7523 aarch64_override_options_after_change ();
7526 /* Implement targetm.override_options_after_change. */
7528 static void
7529 aarch64_override_options_after_change (void)
7531 if (flag_omit_frame_pointer)
7532 flag_omit_leaf_frame_pointer = false;
7533 else if (flag_omit_leaf_frame_pointer)
7534 flag_omit_frame_pointer = true;
7536 /* If not optimizing for size, set the default
7537 alignment to what the target wants. */
7538 if (!optimize_size)
7540 if (align_loops <= 0)
7541 align_loops = aarch64_tune_params.loop_align;
7542 if (align_jumps <= 0)
7543 align_jumps = aarch64_tune_params.jump_align;
7544 if (align_functions <= 0)
7545 align_functions = aarch64_tune_params.function_align;
7549 static struct machine_function *
7550 aarch64_init_machine_status (void)
7552 struct machine_function *machine;
7553 machine = ggc_cleared_alloc<machine_function> ();
7554 return machine;
7557 void
7558 aarch64_init_expanders (void)
7560 init_machine_status = aarch64_init_machine_status;
7563 /* A checking mechanism for the implementation of the various code models. */
7564 static void
7565 initialize_aarch64_code_model (void)
7567 if (flag_pic)
7569 switch (aarch64_cmodel_var)
7571 case AARCH64_CMODEL_TINY:
7572 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7573 break;
7574 case AARCH64_CMODEL_SMALL:
7575 #ifdef HAVE_AS_SMALL_PIC_RELOCS
7576 aarch64_cmodel = (flag_pic == 2
7577 ? AARCH64_CMODEL_SMALL_PIC
7578 : AARCH64_CMODEL_SMALL_SPIC);
7579 #else
7580 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7581 #endif
7582 break;
7583 case AARCH64_CMODEL_LARGE:
7584 sorry ("code model %qs with -f%s", "large",
7585 flag_pic > 1 ? "PIC" : "pic");
7586 default:
7587 gcc_unreachable ();
7590 else
7591 aarch64_cmodel = aarch64_cmodel_var;
7594 /* Return true if SYMBOL_REF X binds locally. */
7596 static bool
7597 aarch64_symbol_binds_local_p (const_rtx x)
7599 return (SYMBOL_REF_DECL (x)
7600 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7601 : SYMBOL_REF_LOCAL_P (x));
7604 /* Return true if SYMBOL_REF X is thread local */
7605 static bool
7606 aarch64_tls_symbol_p (rtx x)
7608 if (! TARGET_HAVE_TLS)
7609 return false;
7611 if (GET_CODE (x) != SYMBOL_REF)
7612 return false;
7614 return SYMBOL_REF_TLS_MODEL (x) != 0;
7617 /* Classify a TLS symbol into one of the TLS kinds. */
7618 enum aarch64_symbol_type
7619 aarch64_classify_tls_symbol (rtx x)
7621 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7623 switch (tls_kind)
7625 case TLS_MODEL_GLOBAL_DYNAMIC:
7626 case TLS_MODEL_LOCAL_DYNAMIC:
7627 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7629 case TLS_MODEL_INITIAL_EXEC:
7630 return SYMBOL_SMALL_GOTTPREL;
7632 case TLS_MODEL_LOCAL_EXEC:
7633 return SYMBOL_TLSLE;
7635 case TLS_MODEL_EMULATED:
7636 case TLS_MODEL_NONE:
7637 return SYMBOL_FORCE_TO_MEM;
7639 default:
7640 gcc_unreachable ();
7644 /* Return the method that should be used to access SYMBOL_REF or
7645 LABEL_REF X in context CONTEXT. */
7647 enum aarch64_symbol_type
7648 aarch64_classify_symbol (rtx x, rtx offset,
7649 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7651 if (GET_CODE (x) == LABEL_REF)
7653 switch (aarch64_cmodel)
7655 case AARCH64_CMODEL_LARGE:
7656 return SYMBOL_FORCE_TO_MEM;
7658 case AARCH64_CMODEL_TINY_PIC:
7659 case AARCH64_CMODEL_TINY:
7660 return SYMBOL_TINY_ABSOLUTE;
7662 case AARCH64_CMODEL_SMALL_SPIC:
7663 case AARCH64_CMODEL_SMALL_PIC:
7664 case AARCH64_CMODEL_SMALL:
7665 return SYMBOL_SMALL_ABSOLUTE;
7667 default:
7668 gcc_unreachable ();
7672 if (GET_CODE (x) == SYMBOL_REF)
7674 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7675 return SYMBOL_FORCE_TO_MEM;
7677 if (aarch64_tls_symbol_p (x))
7678 return aarch64_classify_tls_symbol (x);
7680 switch (aarch64_cmodel)
7682 case AARCH64_CMODEL_TINY:
7683 /* When we retrieve a symbol + offset address, we have to make sure
7684 the offset does not cause overflow of the final address. But
7685 we have no way of knowing the address of symbol at compile time
7686 so we can't accurately say if the distance between the PC and
7687 symbol + offset is outside the addressable range of +/-1M in the
7688 TINY code model. So we rely on images not being greater than
7689 1M and cap the offset at 1M and anything beyond 1M will have to
7690 be loaded using an alternative mechanism. */
7691 if (SYMBOL_REF_WEAK (x)
7692 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7693 return SYMBOL_FORCE_TO_MEM;
7694 return SYMBOL_TINY_ABSOLUTE;
7696 case AARCH64_CMODEL_SMALL:
7697 /* Same reasoning as the tiny code model, but the offset cap here is
7698 4G. */
7699 if (SYMBOL_REF_WEAK (x)
7700 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7701 HOST_WIDE_INT_C (4294967264)))
7702 return SYMBOL_FORCE_TO_MEM;
7703 return SYMBOL_SMALL_ABSOLUTE;
7705 case AARCH64_CMODEL_TINY_PIC:
7706 if (!aarch64_symbol_binds_local_p (x))
7707 return SYMBOL_TINY_GOT;
7708 return SYMBOL_TINY_ABSOLUTE;
7710 case AARCH64_CMODEL_SMALL_SPIC:
7711 case AARCH64_CMODEL_SMALL_PIC:
7712 if (!aarch64_symbol_binds_local_p (x))
7713 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
7714 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
7715 return SYMBOL_SMALL_ABSOLUTE;
7717 default:
7718 gcc_unreachable ();
7722 /* By default push everything into the constant pool. */
7723 return SYMBOL_FORCE_TO_MEM;
7726 bool
7727 aarch64_constant_address_p (rtx x)
7729 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7732 bool
7733 aarch64_legitimate_pic_operand_p (rtx x)
7735 if (GET_CODE (x) == SYMBOL_REF
7736 || (GET_CODE (x) == CONST
7737 && GET_CODE (XEXP (x, 0)) == PLUS
7738 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7739 return false;
7741 return true;
7744 /* Return true if X holds either a quarter-precision or
7745 floating-point +0.0 constant. */
7746 static bool
7747 aarch64_valid_floating_const (machine_mode mode, rtx x)
7749 if (!CONST_DOUBLE_P (x))
7750 return false;
7752 if (aarch64_float_const_zero_rtx_p (x))
7753 return true;
7755 /* We only handle moving 0.0 to a TFmode register. */
7756 if (!(mode == SFmode || mode == DFmode))
7757 return false;
7759 return aarch64_float_const_representable_p (x);
7762 static bool
7763 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7765 /* Do not allow vector struct mode constants. We could support
7766 0 and -1 easily, but they need support in aarch64-simd.md. */
7767 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7768 return false;
7770 /* This could probably go away because
7771 we now decompose CONST_INTs according to expand_mov_immediate. */
7772 if ((GET_CODE (x) == CONST_VECTOR
7773 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7774 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7775 return !targetm.cannot_force_const_mem (mode, x);
7777 if (GET_CODE (x) == HIGH
7778 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7779 return true;
7781 return aarch64_constant_address_p (x);
7785 aarch64_load_tp (rtx target)
7787 if (!target
7788 || GET_MODE (target) != Pmode
7789 || !register_operand (target, Pmode))
7790 target = gen_reg_rtx (Pmode);
7792 /* Can return in any reg. */
7793 emit_insn (gen_aarch64_load_tp_hard (target));
7794 return target;
7797 /* On AAPCS systems, this is the "struct __va_list". */
7798 static GTY(()) tree va_list_type;
7800 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7801 Return the type to use as __builtin_va_list.
7803 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7805 struct __va_list
7807 void *__stack;
7808 void *__gr_top;
7809 void *__vr_top;
7810 int __gr_offs;
7811 int __vr_offs;
7812 }; */
7814 static tree
7815 aarch64_build_builtin_va_list (void)
7817 tree va_list_name;
7818 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7820 /* Create the type. */
7821 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7822 /* Give it the required name. */
7823 va_list_name = build_decl (BUILTINS_LOCATION,
7824 TYPE_DECL,
7825 get_identifier ("__va_list"),
7826 va_list_type);
7827 DECL_ARTIFICIAL (va_list_name) = 1;
7828 TYPE_NAME (va_list_type) = va_list_name;
7829 TYPE_STUB_DECL (va_list_type) = va_list_name;
7831 /* Create the fields. */
7832 f_stack = build_decl (BUILTINS_LOCATION,
7833 FIELD_DECL, get_identifier ("__stack"),
7834 ptr_type_node);
7835 f_grtop = build_decl (BUILTINS_LOCATION,
7836 FIELD_DECL, get_identifier ("__gr_top"),
7837 ptr_type_node);
7838 f_vrtop = build_decl (BUILTINS_LOCATION,
7839 FIELD_DECL, get_identifier ("__vr_top"),
7840 ptr_type_node);
7841 f_groff = build_decl (BUILTINS_LOCATION,
7842 FIELD_DECL, get_identifier ("__gr_offs"),
7843 integer_type_node);
7844 f_vroff = build_decl (BUILTINS_LOCATION,
7845 FIELD_DECL, get_identifier ("__vr_offs"),
7846 integer_type_node);
7848 DECL_ARTIFICIAL (f_stack) = 1;
7849 DECL_ARTIFICIAL (f_grtop) = 1;
7850 DECL_ARTIFICIAL (f_vrtop) = 1;
7851 DECL_ARTIFICIAL (f_groff) = 1;
7852 DECL_ARTIFICIAL (f_vroff) = 1;
7854 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7855 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7856 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7857 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7858 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7860 TYPE_FIELDS (va_list_type) = f_stack;
7861 DECL_CHAIN (f_stack) = f_grtop;
7862 DECL_CHAIN (f_grtop) = f_vrtop;
7863 DECL_CHAIN (f_vrtop) = f_groff;
7864 DECL_CHAIN (f_groff) = f_vroff;
7866 /* Compute its layout. */
7867 layout_type (va_list_type);
7869 return va_list_type;
7872 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7873 static void
7874 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7876 const CUMULATIVE_ARGS *cum;
7877 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7878 tree stack, grtop, vrtop, groff, vroff;
7879 tree t;
7880 int gr_save_area_size;
7881 int vr_save_area_size;
7882 int vr_offset;
7884 cum = &crtl->args.info;
7885 gr_save_area_size
7886 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7887 vr_save_area_size
7888 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7890 if (!TARGET_FLOAT)
7892 gcc_assert (cum->aapcs_nvrn == 0);
7893 vr_save_area_size = 0;
7896 f_stack = TYPE_FIELDS (va_list_type_node);
7897 f_grtop = DECL_CHAIN (f_stack);
7898 f_vrtop = DECL_CHAIN (f_grtop);
7899 f_groff = DECL_CHAIN (f_vrtop);
7900 f_vroff = DECL_CHAIN (f_groff);
7902 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7903 NULL_TREE);
7904 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7905 NULL_TREE);
7906 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7907 NULL_TREE);
7908 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7909 NULL_TREE);
7910 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7911 NULL_TREE);
7913 /* Emit code to initialize STACK, which points to the next varargs stack
7914 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7915 by named arguments. STACK is 8-byte aligned. */
7916 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7917 if (cum->aapcs_stack_size > 0)
7918 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7919 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7920 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7922 /* Emit code to initialize GRTOP, the top of the GR save area.
7923 virtual_incoming_args_rtx should have been 16 byte aligned. */
7924 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7925 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7926 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7928 /* Emit code to initialize VRTOP, the top of the VR save area.
7929 This address is gr_save_area_bytes below GRTOP, rounded
7930 down to the next 16-byte boundary. */
7931 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7932 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7933 STACK_BOUNDARY / BITS_PER_UNIT);
7935 if (vr_offset)
7936 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7937 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7938 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7940 /* Emit code to initialize GROFF, the offset from GRTOP of the
7941 next GPR argument. */
7942 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7943 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7944 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7946 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7947 of the next VR argument. */
7948 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7949 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7950 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7953 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7955 static tree
7956 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7957 gimple_seq *post_p ATTRIBUTE_UNUSED)
7959 tree addr;
7960 bool indirect_p;
7961 bool is_ha; /* is HFA or HVA. */
7962 bool dw_align; /* double-word align. */
7963 machine_mode ag_mode = VOIDmode;
7964 int nregs;
7965 machine_mode mode;
7967 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7968 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7969 HOST_WIDE_INT size, rsize, adjust, align;
7970 tree t, u, cond1, cond2;
7972 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7973 if (indirect_p)
7974 type = build_pointer_type (type);
7976 mode = TYPE_MODE (type);
7978 f_stack = TYPE_FIELDS (va_list_type_node);
7979 f_grtop = DECL_CHAIN (f_stack);
7980 f_vrtop = DECL_CHAIN (f_grtop);
7981 f_groff = DECL_CHAIN (f_vrtop);
7982 f_vroff = DECL_CHAIN (f_groff);
7984 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7985 f_stack, NULL_TREE);
7986 size = int_size_in_bytes (type);
7987 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7989 dw_align = false;
7990 adjust = 0;
7991 if (aarch64_vfp_is_call_or_return_candidate (mode,
7992 type,
7993 &ag_mode,
7994 &nregs,
7995 &is_ha))
7997 /* TYPE passed in fp/simd registers. */
7998 if (!TARGET_FLOAT)
7999 aarch64_err_no_fpadvsimd (mode, "varargs");
8001 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
8002 unshare_expr (valist), f_vrtop, NULL_TREE);
8003 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
8004 unshare_expr (valist), f_vroff, NULL_TREE);
8006 rsize = nregs * UNITS_PER_VREG;
8008 if (is_ha)
8010 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
8011 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
8013 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
8014 && size < UNITS_PER_VREG)
8016 adjust = UNITS_PER_VREG - size;
8019 else
8021 /* TYPE passed in general registers. */
8022 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
8023 unshare_expr (valist), f_grtop, NULL_TREE);
8024 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
8025 unshare_expr (valist), f_groff, NULL_TREE);
8026 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
8027 nregs = rsize / UNITS_PER_WORD;
8029 if (align > 8)
8030 dw_align = true;
8032 if (BLOCK_REG_PADDING (mode, type, 1) == downward
8033 && size < UNITS_PER_WORD)
8035 adjust = UNITS_PER_WORD - size;
8039 /* Get a local temporary for the field value. */
8040 off = get_initialized_tmp_var (f_off, pre_p, NULL);
8042 /* Emit code to branch if off >= 0. */
8043 t = build2 (GE_EXPR, boolean_type_node, off,
8044 build_int_cst (TREE_TYPE (off), 0));
8045 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
8047 if (dw_align)
8049 /* Emit: offs = (offs + 15) & -16. */
8050 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8051 build_int_cst (TREE_TYPE (off), 15));
8052 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
8053 build_int_cst (TREE_TYPE (off), -16));
8054 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
8056 else
8057 roundup = NULL;
8059 /* Update ap.__[g|v]r_offs */
8060 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8061 build_int_cst (TREE_TYPE (off), rsize));
8062 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
8064 /* String up. */
8065 if (roundup)
8066 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8068 /* [cond2] if (ap.__[g|v]r_offs > 0) */
8069 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
8070 build_int_cst (TREE_TYPE (f_off), 0));
8071 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
8073 /* String up: make sure the assignment happens before the use. */
8074 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
8075 COND_EXPR_ELSE (cond1) = t;
8077 /* Prepare the trees handling the argument that is passed on the stack;
8078 the top level node will store in ON_STACK. */
8079 arg = get_initialized_tmp_var (stack, pre_p, NULL);
8080 if (align > 8)
8082 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
8083 t = fold_convert (intDI_type_node, arg);
8084 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8085 build_int_cst (TREE_TYPE (t), 15));
8086 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8087 build_int_cst (TREE_TYPE (t), -16));
8088 t = fold_convert (TREE_TYPE (arg), t);
8089 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
8091 else
8092 roundup = NULL;
8093 /* Advance ap.__stack */
8094 t = fold_convert (intDI_type_node, arg);
8095 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8096 build_int_cst (TREE_TYPE (t), size + 7));
8097 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8098 build_int_cst (TREE_TYPE (t), -8));
8099 t = fold_convert (TREE_TYPE (arg), t);
8100 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
8101 /* String up roundup and advance. */
8102 if (roundup)
8103 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8104 /* String up with arg */
8105 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
8106 /* Big-endianness related address adjustment. */
8107 if (BLOCK_REG_PADDING (mode, type, 1) == downward
8108 && size < UNITS_PER_WORD)
8110 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
8111 size_int (UNITS_PER_WORD - size));
8112 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
8115 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
8116 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
8118 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
8119 t = off;
8120 if (adjust)
8121 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
8122 build_int_cst (TREE_TYPE (off), adjust));
8124 t = fold_convert (sizetype, t);
8125 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
8127 if (is_ha)
8129 /* type ha; // treat as "struct {ftype field[n];}"
8130 ... [computing offs]
8131 for (i = 0; i < nregs; ++i, offs += 16)
8132 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
8133 return ha; */
8134 int i;
8135 tree tmp_ha, field_t, field_ptr_t;
8137 /* Declare a local variable. */
8138 tmp_ha = create_tmp_var_raw (type, "ha");
8139 gimple_add_tmp_var (tmp_ha);
8141 /* Establish the base type. */
8142 switch (ag_mode)
8144 case SFmode:
8145 field_t = float_type_node;
8146 field_ptr_t = float_ptr_type_node;
8147 break;
8148 case DFmode:
8149 field_t = double_type_node;
8150 field_ptr_t = double_ptr_type_node;
8151 break;
8152 case TFmode:
8153 field_t = long_double_type_node;
8154 field_ptr_t = long_double_ptr_type_node;
8155 break;
8156 /* Half precision and quad precision are not fully supported yet.
8157 Enable the following code once support is complete; the correct
8158 type node for __fp16 * still needs to be found. */
8159 #if 0
8160 case HFmode:
8161 field_t = float_type_node;
8162 field_ptr_t = float_ptr_type_node;
8163 break;
8164 #endif
8165 case V2SImode:
8166 case V4SImode:
8168 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
8169 field_t = build_vector_type_for_mode (innertype, ag_mode);
8170 field_ptr_t = build_pointer_type (field_t);
8172 break;
8173 default:
8174 gcc_assert (0);
8177 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
8178 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
8179 addr = t;
8180 t = fold_convert (field_ptr_t, addr);
8181 t = build2 (MODIFY_EXPR, field_t,
8182 build1 (INDIRECT_REF, field_t, tmp_ha),
8183 build1 (INDIRECT_REF, field_t, t));
8185 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
8186 for (i = 1; i < nregs; ++i)
8188 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
8189 u = fold_convert (field_ptr_t, addr);
8190 u = build2 (MODIFY_EXPR, field_t,
8191 build2 (MEM_REF, field_t, tmp_ha,
8192 build_int_cst (field_ptr_t,
8193 (i *
8194 int_size_in_bytes (field_t)))),
8195 build1 (INDIRECT_REF, field_t, u));
8196 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
8199 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
8200 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
8203 COND_EXPR_ELSE (cond2) = t;
8204 addr = fold_convert (build_pointer_type (type), cond1);
8205 addr = build_va_arg_indirect_ref (addr);
8207 if (indirect_p)
8208 addr = build_va_arg_indirect_ref (addr);
8210 return addr;
8213 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
8215 static void
8216 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
8217 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8218 int no_rtl)
8220 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8221 CUMULATIVE_ARGS local_cum;
8222 int gr_saved, vr_saved;
8224 /* The caller has advanced CUM up to, but not beyond, the last named
8225 argument. Advance a local copy of CUM past the last "real" named
8226 argument, to find out how many registers are left over. */
8227 local_cum = *cum;
8228 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
8230 /* Find out how many registers we need to save. */
8231 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
8232 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
8234 if (!TARGET_FLOAT)
8236 gcc_assert (local_cum.aapcs_nvrn == 0);
8237 vr_saved = 0;
8240 if (!no_rtl)
8242 if (gr_saved > 0)
8244 rtx ptr, mem;
8246 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
8247 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
8248 - gr_saved * UNITS_PER_WORD);
8249 mem = gen_frame_mem (BLKmode, ptr);
8250 set_mem_alias_set (mem, get_varargs_alias_set ());
8252 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
8253 mem, gr_saved);
8255 if (vr_saved > 0)
8257 /* We can't use move_block_from_reg, because it will use
8258 the wrong mode, storing D regs only. */
8259 machine_mode mode = TImode;
8260 int off, i;
8262 /* Set OFF to the offset from virtual_incoming_args_rtx of
8263 the first vector register. The VR save area lies below
8264 the GR one, and is aligned to 16 bytes. */
8265 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8266 STACK_BOUNDARY / BITS_PER_UNIT);
8267 off -= vr_saved * UNITS_PER_VREG;
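/* For example, with gr_saved == 3 and vr_saved == 2:
   3 * 8 bytes rounds up to 32, so OFF becomes -32 - 2 * 16 = -64.  */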
8269 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
8271 rtx ptr, mem;
8273 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
8274 mem = gen_frame_mem (mode, ptr);
8275 set_mem_alias_set (mem, get_varargs_alias_set ());
8276 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
8277 off += UNITS_PER_VREG;
8282 /* We don't save the size into *PRETEND_SIZE because we want to avoid
8283 any complication of having crtl->args.pretend_args_size changed. */
8284 cfun->machine->frame.saved_varargs_size
8285 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8286 STACK_BOUNDARY / BITS_PER_UNIT)
8287 + vr_saved * UNITS_PER_VREG);
8290 static void
8291 aarch64_conditional_register_usage (void)
8293 int i;
8294 if (!TARGET_FLOAT)
8296 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
8298 fixed_regs[i] = 1;
8299 call_used_regs[i] = 1;
8304 /* Walk down the type tree of TYPE counting consecutive base elements.
8305 If *MODEP is VOIDmode, then set it to the first valid floating point
8306 type. If a non-floating point type is found, or if a floating point
8307 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
8308 otherwise return the count in the sub-tree. */
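/* For example, for
   struct { double x; double y; };
   each field contributes one DFmode element, so this returns 2 with
   *MODEP set to DFmode, i.e. a homogeneous floating-point aggregate of
   two doubles.  */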
8309 static int
8310 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8312 machine_mode mode;
8313 HOST_WIDE_INT size;
8315 switch (TREE_CODE (type))
8317 case REAL_TYPE:
8318 mode = TYPE_MODE (type);
8319 if (mode != DFmode && mode != SFmode && mode != TFmode)
8320 return -1;
8322 if (*modep == VOIDmode)
8323 *modep = mode;
8325 if (*modep == mode)
8326 return 1;
8328 break;
8330 case COMPLEX_TYPE:
8331 mode = TYPE_MODE (TREE_TYPE (type));
8332 if (mode != DFmode && mode != SFmode && mode != TFmode)
8333 return -1;
8335 if (*modep == VOIDmode)
8336 *modep = mode;
8338 if (*modep == mode)
8339 return 2;
8341 break;
8343 case VECTOR_TYPE:
8344 /* Use V2SImode and V4SImode as representatives of all 64-bit
8345 and 128-bit vector types. */
8346 size = int_size_in_bytes (type);
8347 switch (size)
8349 case 8:
8350 mode = V2SImode;
8351 break;
8352 case 16:
8353 mode = V4SImode;
8354 break;
8355 default:
8356 return -1;
8359 if (*modep == VOIDmode)
8360 *modep = mode;
8362 /* Vector modes are considered to be opaque: two vectors are
8363 equivalent for the purposes of being homogeneous aggregates
8364 if they are the same size. */
8365 if (*modep == mode)
8366 return 1;
8368 break;
8370 case ARRAY_TYPE:
8372 int count;
8373 tree index = TYPE_DOMAIN (type);
8375 /* Can't handle incomplete types nor sizes that are not
8376 fixed. */
8377 if (!COMPLETE_TYPE_P (type)
8378 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8379 return -1;
8381 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8382 if (count == -1
8383 || !index
8384 || !TYPE_MAX_VALUE (index)
8385 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8386 || !TYPE_MIN_VALUE (index)
8387 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8388 || count < 0)
8389 return -1;
8391 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8392 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8394 /* There must be no padding. */
8395 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8396 return -1;
8398 return count;
8401 case RECORD_TYPE:
8403 int count = 0;
8404 int sub_count;
8405 tree field;
8407 /* Can't handle incomplete types nor sizes that are not
8408 fixed. */
8409 if (!COMPLETE_TYPE_P (type)
8410 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8411 return -1;
8413 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8415 if (TREE_CODE (field) != FIELD_DECL)
8416 continue;
8418 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8419 if (sub_count < 0)
8420 return -1;
8421 count += sub_count;
8424 /* There must be no padding. */
8425 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8426 return -1;
8428 return count;
8431 case UNION_TYPE:
8432 case QUAL_UNION_TYPE:
8434 /* These aren't very interesting except in a degenerate case. */
8435 int count = 0;
8436 int sub_count;
8437 tree field;
8439 /* Can't handle incomplete types nor sizes that are not
8440 fixed. */
8441 if (!COMPLETE_TYPE_P (type)
8442 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8443 return -1;
8445 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8447 if (TREE_CODE (field) != FIELD_DECL)
8448 continue;
8450 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8451 if (sub_count < 0)
8452 return -1;
8453 count = count > sub_count ? count : sub_count;
8456 /* There must be no padding. */
8457 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8458 return -1;
8460 return count;
8463 default:
8464 break;
8467 return -1;
8470 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8471 type as described in AAPCS64 \S 4.1.2.
8473 See the comment above aarch64_composite_type_p for the notes on MODE. */
8475 static bool
8476 aarch64_short_vector_p (const_tree type,
8477 machine_mode mode)
8479 HOST_WIDE_INT size = -1;
8481 if (type && TREE_CODE (type) == VECTOR_TYPE)
8482 size = int_size_in_bytes (type);
8483 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8484 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8485 size = GET_MODE_SIZE (mode);
8487 return (size == 8 || size == 16);
8490 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8491 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8492 array types. The C99 floating-point complex types are also considered
8493 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8494 types, which are GCC extensions and out of the scope of AAPCS64, are
8495 treated as composite types here as well.
8497 Note that MODE itself is not sufficient in determining whether a type
8498 is such a composite type or not. This is because
8499 stor-layout.c:compute_record_mode may have already changed the MODE
8500 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8501 structure with only one field may have its MODE set to the mode of the
8502 field. Also an integer mode whose size matches the size of the
8503 RECORD_TYPE type may be used to substitute the original mode
8504 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8505 solely relied on. */
8507 static bool
8508 aarch64_composite_type_p (const_tree type,
8509 machine_mode mode)
8511 if (aarch64_short_vector_p (type, mode))
8512 return false;
8514 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8515 return true;
8517 if (mode == BLKmode
8518 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8519 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8520 return true;
8522 return false;
8525 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8526 shall be passed or returned in simd/fp register(s) (providing these
8527 parameter passing registers are available).
8529 Upon successful return, *COUNT returns the number of needed registers,
8530 *BASE_MODE returns the mode of the individual register and when IS_HA
8531 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8532 floating-point aggregate or a homogeneous short-vector aggregate. */
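/* For example, under the AAPCS64 a type such as

     struct { float x, y, z; };

   is a homogeneous floating-point aggregate: *BASE_MODE is set to
   SFmode, *COUNT to 3 and *IS_HA to true.  A structure mixing float
   and integer members is not, and is passed under the ordinary
   integer/stack rules instead.  */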
8534 static bool
8535 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8536 const_tree type,
8537 machine_mode *base_mode,
8538 int *count,
8539 bool *is_ha)
8541 machine_mode new_mode = VOIDmode;
8542 bool composite_p = aarch64_composite_type_p (type, mode);
8544 if (is_ha != NULL) *is_ha = false;
8546 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8547 || aarch64_short_vector_p (type, mode))
8549 *count = 1;
8550 new_mode = mode;
8552 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8554 if (is_ha != NULL) *is_ha = true;
8555 *count = 2;
8556 new_mode = GET_MODE_INNER (mode);
8558 else if (type && composite_p)
8560 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8562 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8564 if (is_ha != NULL) *is_ha = true;
8565 *count = ag_count;
8567 else
8568 return false;
8570 else
8571 return false;
8573 *base_mode = new_mode;
8574 return true;
8577 /* Implement TARGET_STRUCT_VALUE_RTX. */
8579 static rtx
8580 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8581 int incoming ATTRIBUTE_UNUSED)
8583 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8586 /* Implements target hook vector_mode_supported_p. */
8587 static bool
8588 aarch64_vector_mode_supported_p (machine_mode mode)
8590 if (TARGET_SIMD
8591 && (mode == V4SImode || mode == V8HImode
8592 || mode == V16QImode || mode == V2DImode
8593 || mode == V2SImode || mode == V4HImode
8594 || mode == V8QImode || mode == V2SFmode
8595 || mode == V4SFmode || mode == V2DFmode
8596 || mode == V1DFmode))
8597 return true;
8599 return false;
8602 /* Return appropriate SIMD container
8603 for MODE within a vector of WIDTH bits. */
8604 static machine_mode
8605 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8607 gcc_assert (width == 64 || width == 128);
8608 if (TARGET_SIMD)
8610 if (width == 128)
8611 switch (mode)
8613 case DFmode:
8614 return V2DFmode;
8615 case SFmode:
8616 return V4SFmode;
8617 case SImode:
8618 return V4SImode;
8619 case HImode:
8620 return V8HImode;
8621 case QImode:
8622 return V16QImode;
8623 case DImode:
8624 return V2DImode;
8625 default:
8626 break;
8628 else
8629 switch (mode)
8631 case SFmode:
8632 return V2SFmode;
8633 case SImode:
8634 return V2SImode;
8635 case HImode:
8636 return V4HImode;
8637 case QImode:
8638 return V8QImode;
8639 default:
8640 break;
8643 return word_mode;
8646 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8647 static machine_mode
8648 aarch64_preferred_simd_mode (machine_mode mode)
8650 return aarch64_simd_container_mode (mode, 128);
8653 /* Return the bitmask of possible vector sizes for the vectorizer
8654 to iterate over. */
8655 static unsigned int
8656 aarch64_autovectorize_vector_sizes (void)
8658 return (16 | 8);
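/* That is, the returned mask 16 | 8 == 24 tells the vectorizer that
   both 128-bit and 64-bit Advanced SIMD vectors may be tried.  */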
8661 /* Implement TARGET_MANGLE_TYPE. */
8663 static const char *
8664 aarch64_mangle_type (const_tree type)
8666 /* The AArch64 ABI documents say that "__va_list" has to be
8667 mangled as if it is in the "std" namespace. */
8668 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8669 return "St9__va_list";
8671 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8672 builtin types. */
8673 if (TYPE_NAME (type) != NULL)
8674 return aarch64_mangle_builtin_type (type);
8676 /* Use the default mangling. */
8677 return NULL;
8681 /* Return true if the rtx_insn contains a MEM RTX somewhere
8682 in it. */
8684 static bool
8685 has_memory_op (rtx_insn *mem_insn)
8687 subrtx_iterator::array_type array;
8688 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8689 if (MEM_P (*iter))
8690 return true;
8692 return false;
8695 /* Find the first rtx_insn before insn that will generate an assembly
8696 instruction. */
8698 static rtx_insn *
8699 aarch64_prev_real_insn (rtx_insn *insn)
8701 if (!insn)
8702 return NULL;
8706 insn = prev_real_insn (insn);
8708 while (insn && recog_memoized (insn) < 0);
8710 return insn;
8713 static bool
8714 is_madd_op (enum attr_type t1)
8716 unsigned int i;
8717 /* A number of these may be AArch32 only. */
8718 enum attr_type mlatypes[] = {
8719 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8720 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8720 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8724 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8726 if (t1 == mlatypes[i])
8727 return true;
8730 return false;
8733 /* Check if there is a register dependency between a load and the insn
8734 for which we hold recog_data. */
8736 static bool
8737 dep_between_memop_and_curr (rtx memop)
8739 rtx load_reg;
8740 int opno;
8742 gcc_assert (GET_CODE (memop) == SET);
8744 if (!REG_P (SET_DEST (memop)))
8745 return false;
8747 load_reg = SET_DEST (memop);
8748 for (opno = 1; opno < recog_data.n_operands; opno++)
8750 rtx operand = recog_data.operand[opno];
8751 if (REG_P (operand)
8752 && reg_overlap_mentioned_p (load_reg, operand))
8753 return true;
8756 return false;
8760 /* When working around the Cortex-A53 erratum 835769,
8761 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8762 instruction and has a preceding memory instruction such that a NOP
8763 should be inserted between them. */
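/* For example, in a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   where the multiply-accumulate does not depend on the loaded value,
   the workaround below emits

     nop     // between mem op and mult-accumulate

   between the two instructions (register choices are illustrative).  */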
8765 bool
8766 aarch64_madd_needs_nop (rtx_insn* insn)
8768 enum attr_type attr_type;
8769 rtx_insn *prev;
8770 rtx body;
8772 if (!aarch64_fix_a53_err835769)
8773 return false;
8775 if (recog_memoized (insn) < 0)
8776 return false;
8778 attr_type = get_attr_type (insn);
8779 if (!is_madd_op (attr_type))
8780 return false;
8782 prev = aarch64_prev_real_insn (insn);
8783 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8784 Restore recog state to INSN to avoid state corruption. */
8785 extract_constrain_insn_cached (insn);
8787 if (!prev || !has_memory_op (prev))
8788 return false;
8790 body = single_set (prev);
8792 /* If the previous insn is a memory op and there is no dependency between
8793 it and the DImode madd, emit a NOP between them. If body is NULL then we
8794 have a complex memory operation, probably a load/store pair.
8795 Be conservative for now and emit a NOP. */
8796 if (GET_MODE (recog_data.operand[0]) == DImode
8797 && (!body || !dep_between_memop_and_curr (body)))
8798 return true;
8800 return false;
8805 /* Implement FINAL_PRESCAN_INSN. */
8807 void
8808 aarch64_final_prescan_insn (rtx_insn *insn)
8810 if (aarch64_madd_needs_nop (insn))
8811 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8815 /* Return the equivalent letter for size. */
8816 static char
8817 sizetochar (int size)
8819 switch (size)
8821 case 64: return 'd';
8822 case 32: return 's';
8823 case 16: return 'h';
8824 case 8 : return 'b';
8825 default: gcc_unreachable ();
8829 /* Return true iff x is a uniform vector of floating-point
8830 constants, and the constant can be represented in
8831 quarter-precision form. Note, as aarch64_float_const_representable_p
8832 rejects both +0.0 and -0.0, this function also rejects +0.0 and -0.0. */
8833 static bool
8834 aarch64_vect_float_const_representable_p (rtx x)
8836 int i = 0;
8837 REAL_VALUE_TYPE r0, ri;
8838 rtx x0, xi;
8840 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8841 return false;
8843 x0 = CONST_VECTOR_ELT (x, 0);
8844 if (!CONST_DOUBLE_P (x0))
8845 return false;
8847 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8849 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8851 xi = CONST_VECTOR_ELT (x, i);
8852 if (!CONST_DOUBLE_P (xi))
8853 return false;
8855 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8856 if (!REAL_VALUES_EQUAL (r0, ri))
8857 return false;
8860 return aarch64_float_const_representable_p (x0);
8863 /* Return true for valid and false for invalid. */
8864 bool
8865 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8866 struct simd_immediate_info *info)
8868 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8869 matches = 1; \
8870 for (i = 0; i < idx; i += (STRIDE)) \
8871 if (!(TEST)) \
8872 matches = 0; \
8873 if (matches) \
8875 immtype = (CLASS); \
8876 elsize = (ELSIZE); \
8877 eshift = (SHIFT); \
8878 emvn = (NEG); \
8879 break; \
8882 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8883 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8884 unsigned char bytes[16];
8885 int immtype = -1, matches;
8886 unsigned int invmask = inverse ? 0xff : 0;
8887 int eshift, emvn;
8889 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8891 if (! (aarch64_simd_imm_zero_p (op, mode)
8892 || aarch64_vect_float_const_representable_p (op)))
8893 return false;
8895 if (info)
8897 info->value = CONST_VECTOR_ELT (op, 0);
8898 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8899 info->mvn = false;
8900 info->shift = 0;
8903 return true;
8906 /* Splat vector constant out into a byte vector. */
8907 for (i = 0; i < n_elts; i++)
8909 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8910 it must be laid out in the vector register in reverse order. */
8911 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8912 unsigned HOST_WIDE_INT elpart;
8913 unsigned int part, parts;
8915 if (CONST_INT_P (el))
8917 elpart = INTVAL (el);
8918 parts = 1;
8920 else if (GET_CODE (el) == CONST_DOUBLE)
8922 elpart = CONST_DOUBLE_LOW (el);
8923 parts = 2;
8925 else
8926 gcc_unreachable ();
8928 for (part = 0; part < parts; part++)
8930 unsigned int byte;
8931 for (byte = 0; byte < innersize; byte++)
8933 bytes[idx++] = (elpart & 0xff) ^ invmask;
8934 elpart >>= BITS_PER_UNIT;
8936 if (GET_CODE (el) == CONST_DOUBLE)
8937 elpart = CONST_DOUBLE_HIGH (el);
8941 /* Sanity check. */
8942 gcc_assert (idx == GET_MODE_SIZE (mode));
8946 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8947 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8949 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8950 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8952 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8953 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8955 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8956 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8958 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8960 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8962 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8963 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8965 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8966 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8968 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8969 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8971 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8972 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8974 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8976 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8978 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8979 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8981 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8982 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8984 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8985 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8987 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8988 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8990 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8992 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8993 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8995 while (0);
8997 if (immtype == -1)
8998 return false;
9000 if (info)
9002 info->element_width = elsize;
9003 info->mvn = emvn != 0;
9004 info->shift = eshift;
9006 unsigned HOST_WIDE_INT imm = 0;
9008 if (immtype >= 12 && immtype <= 15)
9009 info->msl = true;
9011 /* Un-invert bytes of recognized vector, if necessary. */
9012 if (invmask != 0)
9013 for (i = 0; i < idx; i++)
9014 bytes[i] ^= invmask;
9016 if (immtype == 17)
9018 /* FIXME: Broken on 32-bit H_W_I hosts. */
9019 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
9021 for (i = 0; i < 8; i++)
9022 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
9023 << (i * BITS_PER_UNIT);
9026 info->value = GEN_INT (imm);
9028 else
9030 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
9031 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
9033 /* Construct 'abcdefgh' because the assembler cannot handle
9034 generic constants. */
9035 if (info->mvn)
9036 imm = ~imm;
9037 imm = (imm >> info->shift) & 0xff;
9038 info->value = GEN_INT (imm);
9042 return true;
9043 #undef CHECK
9046 /* Check if immediate shift constants are within range. */
9047 bool
9048 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
9050 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
9051 if (left)
9052 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
9053 else
9054 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
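/* For a V4SImode operand, for instance, the element width is 32 bits,
   so left-shift immediates 0..31 and right-shift immediates 1..32 are
   accepted.  */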
9057 /* Return true if X is a uniform vector where all elements
9058 are either the floating-point constant 0.0 or the
9059 integer constant 0. */
9060 bool
9061 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
9063 return x == CONST0_RTX (mode);
9066 bool
9067 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
9069 HOST_WIDE_INT imm = INTVAL (x);
9070 int i;
9072 for (i = 0; i < 8; i++)
9074 unsigned int byte = imm & 0xff;
9075 if (byte != 0xff && byte != 0)
9076 return false;
9077 imm >>= 8;
9080 return true;
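/* For example, 0x00ff00ff00ff00ff and 0xffffffff00000000 are accepted
   (every byte is either 0x00 or 0xff), while 0x0000000000000001 is
   rejected.  */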
9083 bool
9084 aarch64_mov_operand_p (rtx x,
9085 enum aarch64_symbol_context context,
9086 machine_mode mode)
9088 if (GET_CODE (x) == HIGH
9089 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9090 return true;
9092 if (CONST_INT_P (x))
9093 return true;
9095 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
9096 return true;
9098 return aarch64_classify_symbolic_expression (x, context)
9099 == SYMBOL_TINY_ABSOLUTE;
9102 /* Return a const_int vector of VAL. */
9104 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
9106 int nunits = GET_MODE_NUNITS (mode);
9107 rtvec v = rtvec_alloc (nunits);
9108 int i;
9110 for (i = 0; i < nunits; i++)
9111 RTVEC_ELT (v, i) = GEN_INT (val);
9113 return gen_rtx_CONST_VECTOR (mode, v);
9116 /* Check OP is a legal scalar immediate for the MOVI instruction. */
9118 bool
9119 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
9121 machine_mode vmode;
9123 gcc_assert (!VECTOR_MODE_P (mode));
9124 vmode = aarch64_preferred_simd_mode (mode);
9125 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
9126 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
9129 /* Construct and return a PARALLEL RTX vector with elements numbering the
9130 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
9131 the vector - from the perspective of the architecture. This does not
9132 line up with GCC's perspective on lane numbers, so we end up with
9133 different masks depending on our target endian-ness. The diagram
9134 below may help. We must draw the distinction when building masks
9135 which select one half of the vector. An instruction selecting
9136 architectural low-lanes for a big-endian target must be described using
9137 a mask selecting GCC high-lanes.
9139 Big-Endian Little-Endian
9141 GCC 0 1 2 3 3 2 1 0
9142 | x | x | x | x | | x | x | x | x |
9143 Architecture 3 2 1 0 3 2 1 0
9145 Low Mask: { 2, 3 } { 0, 1 }
9146 High Mask: { 0, 1 } { 2, 3 }
9150 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
9152 int nunits = GET_MODE_NUNITS (mode);
9153 rtvec v = rtvec_alloc (nunits / 2);
9154 int high_base = nunits / 2;
9155 int low_base = 0;
9156 int base;
9157 rtx t1;
9158 int i;
9160 if (BYTES_BIG_ENDIAN)
9161 base = high ? low_base : high_base;
9162 else
9163 base = high ? high_base : low_base;
9165 for (i = 0; i < nunits / 2; i++)
9166 RTVEC_ELT (v, i) = GEN_INT (base + i);
9168 t1 = gen_rtx_PARALLEL (mode, v);
9169 return t1;
9172 /* Check OP for validity as a PARALLEL RTX vector with elements
9173 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
9174 from the perspective of the architecture. See the diagram above
9175 aarch64_simd_vect_par_cnst_half for more details. */
9177 bool
9178 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
9179 bool high)
9181 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
9182 HOST_WIDE_INT count_op = XVECLEN (op, 0);
9183 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
9184 int i = 0;
9186 if (!VECTOR_MODE_P (mode))
9187 return false;
9189 if (count_op != count_ideal)
9190 return false;
9192 for (i = 0; i < count_ideal; i++)
9194 rtx elt_op = XVECEXP (op, 0, i);
9195 rtx elt_ideal = XVECEXP (ideal, 0, i);
9197 if (!CONST_INT_P (elt_op)
9198 || INTVAL (elt_ideal) != INTVAL (elt_op))
9199 return false;
9201 return true;
9204 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
9205 HIGH (exclusive). */
9206 void
9207 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
9208 const_tree exp)
9210 HOST_WIDE_INT lane;
9211 gcc_assert (CONST_INT_P (operand));
9212 lane = INTVAL (operand);
9214 if (lane < low || lane >= high)
9216 if (exp)
9217 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
9218 else
9219 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
9223 /* Return TRUE if OP is a valid vector addressing mode. */
9224 bool
9225 aarch64_simd_mem_operand_p (rtx op)
9227 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
9228 || REG_P (XEXP (op, 0)));
9231 /* Emit a register copy from operand to operand, taking care not to
9232 early-clobber source registers in the process.
9234 COUNT is the number of components into which the copy needs to be
9235 decomposed. */
9236 void
9237 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
9238 unsigned int count)
9240 unsigned int i;
9241 int rdest = REGNO (operands[0]);
9242 int rsrc = REGNO (operands[1]);
9244 if (!reg_overlap_mentioned_p (operands[0], operands[1])
9245 || rdest < rsrc)
9246 for (i = 0; i < count; i++)
9247 emit_move_insn (gen_rtx_REG (mode, rdest + i),
9248 gen_rtx_REG (mode, rsrc + i));
9249 else
9250 for (i = 0; i < count; i++)
9251 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
9252 gen_rtx_REG (mode, rsrc + count - i - 1));
9255 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
9256 one of VSTRUCT modes: OI, CI or XI. */
9258 aarch64_simd_attr_length_move (rtx_insn *insn)
9260 machine_mode mode;
9262 extract_insn_cached (insn);
9264 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
9266 mode = GET_MODE (recog_data.operand[0]);
9267 switch (mode)
9269 case OImode:
9270 return 8;
9271 case CImode:
9272 return 12;
9273 case XImode:
9274 return 16;
9275 default:
9276 gcc_unreachable ();
9279 return 4;
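/* That is, a register-to-register OImode move is split into two 4-byte
   vector moves, CImode into three and XImode into four, while the
   memory forms are a single load/store-multiple instruction.  */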
9282 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
9283 one of VSTRUCT modes: OI, CI, EI, or XI. */
9285 aarch64_simd_attr_length_rglist (enum machine_mode mode)
9287 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
9290 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
9291 alignment of a vector to 128 bits. */
9292 static HOST_WIDE_INT
9293 aarch64_simd_vector_alignment (const_tree type)
9295 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
9296 return MIN (align, 128);
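/* So a GNU vector type declared with vector_size (32) reports an
   alignment of 128 bits here rather than its full 256-bit size.  */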
9299 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
9300 static bool
9301 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
9303 if (is_packed)
9304 return false;
9306 /* We guarantee alignment for vectors up to 128 bits. */
9307 if (tree_int_cst_compare (TYPE_SIZE (type),
9308 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
9309 return false;
9311 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9312 return true;
9315 /* If VALS is a vector constant that can be loaded into a register
9316 using DUP, generate instructions to do so and return an RTX to
9317 assign to the register. Otherwise return NULL_RTX. */
9318 static rtx
9319 aarch64_simd_dup_constant (rtx vals)
9321 machine_mode mode = GET_MODE (vals);
9322 machine_mode inner_mode = GET_MODE_INNER (mode);
9323 int n_elts = GET_MODE_NUNITS (mode);
9324 bool all_same = true;
9325 rtx x;
9326 int i;
9328 if (GET_CODE (vals) != CONST_VECTOR)
9329 return NULL_RTX;
9331 for (i = 1; i < n_elts; ++i)
9333 x = CONST_VECTOR_ELT (vals, i);
9334 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9335 all_same = false;
9338 if (!all_same)
9339 return NULL_RTX;
9341 /* We can load this constant by using DUP and a constant in a
9342 single ARM register. This will be cheaper than a vector
9343 load. */
9344 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9345 return gen_rtx_VEC_DUPLICATE (mode, x);
9349 /* Generate code to load VALS, which is a PARALLEL containing only
9350 constants (for vec_init) or CONST_VECTOR, efficiently into a
9351 register. Returns an RTX to copy into the register, or NULL_RTX
9352 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9353 static rtx
9354 aarch64_simd_make_constant (rtx vals)
9356 machine_mode mode = GET_MODE (vals);
9357 rtx const_dup;
9358 rtx const_vec = NULL_RTX;
9359 int n_elts = GET_MODE_NUNITS (mode);
9360 int n_const = 0;
9361 int i;
9363 if (GET_CODE (vals) == CONST_VECTOR)
9364 const_vec = vals;
9365 else if (GET_CODE (vals) == PARALLEL)
9367 /* A CONST_VECTOR must contain only CONST_INTs and
9368 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9369 Only store valid constants in a CONST_VECTOR. */
9370 for (i = 0; i < n_elts; ++i)
9372 rtx x = XVECEXP (vals, 0, i);
9373 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9374 n_const++;
9376 if (n_const == n_elts)
9377 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9379 else
9380 gcc_unreachable ();
9382 if (const_vec != NULL_RTX
9383 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9384 /* Load using MOVI/MVNI. */
9385 return const_vec;
9386 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9387 /* Loaded using DUP. */
9388 return const_dup;
9389 else if (const_vec != NULL_RTX)
9390 /* Load from constant pool. We can not take advantage of single-cycle
9391 LD1 because we need a PC-relative addressing mode. */
9392 return const_vec;
9393 else
9394 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9395 We can not construct an initializer. */
9396 return NULL_RTX;
9399 void
9400 aarch64_expand_vector_init (rtx target, rtx vals)
9402 machine_mode mode = GET_MODE (target);
9403 machine_mode inner_mode = GET_MODE_INNER (mode);
9404 int n_elts = GET_MODE_NUNITS (mode);
9405 int n_var = 0;
9406 rtx any_const = NULL_RTX;
9407 bool all_same = true;
9409 for (int i = 0; i < n_elts; ++i)
9411 rtx x = XVECEXP (vals, 0, i);
9412 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9413 ++n_var;
9414 else
9415 any_const = x;
9417 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9418 all_same = false;
9421 if (n_var == 0)
9423 rtx constant = aarch64_simd_make_constant (vals);
9424 if (constant != NULL_RTX)
9426 emit_move_insn (target, constant);
9427 return;
9431 /* Splat a single non-constant element if we can. */
9432 if (all_same)
9434 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9435 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9436 return;
9439 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9440 varying fields. Hope that this is more efficient than using the stack. */
9441 if (n_var <= n_elts/2)
9443 rtx copy = copy_rtx (vals);
9445 /* Load constant part of vector. We really don't care what goes into the
9446 parts we will overwrite, but we're more likely to be able to load the
9447 constant efficiently if it has fewer, larger, repeating parts
9448 (see aarch64_simd_valid_immediate). */
9449 for (int i = 0; i < n_elts; i++)
9451 rtx x = XVECEXP (vals, 0, i);
9452 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9453 continue;
9454 rtx subst = any_const;
9455 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9457 /* Look in the copied vector, as more elements are const. */
9458 rtx test = XVECEXP (copy, 0, i ^ bit);
9459 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9461 subst = test;
9462 break;
9465 XVECEXP (copy, 0, i) = subst;
9467 aarch64_expand_vector_init (target, copy);
9469 /* Insert variables. */
9470 enum insn_code icode = optab_handler (vec_set_optab, mode);
9471 gcc_assert (icode != CODE_FOR_nothing);
9473 for (int i = 0; i < n_elts; i++)
9475 rtx x = XVECEXP (vals, 0, i);
9476 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9477 continue;
9478 x = copy_to_mode_reg (inner_mode, x);
9479 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9481 return;
9484 /* Construct the vector in memory one field at a time
9485 and load the whole vector. */
9486 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9487 for (int i = 0; i < n_elts; i++)
9488 emit_move_insn (adjust_address_nv (mem, inner_mode,
9489 i * GET_MODE_SIZE (inner_mode)),
9490 XVECEXP (vals, 0, i));
9491 emit_move_insn (target, mem);
9495 static unsigned HOST_WIDE_INT
9496 aarch64_shift_truncation_mask (machine_mode mode)
9498 return
9499 (aarch64_vector_mode_supported_p (mode)
9500 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
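/* For scalar modes this gives e.g. 31 for SImode and 63 for DImode,
   matching the variable shift instructions, which use the shift amount
   modulo the register width; vector and vector-structure modes return
   0, i.e. no truncation is assumed.  */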
9503 #ifndef TLS_SECTION_ASM_FLAG
9504 #define TLS_SECTION_ASM_FLAG 'T'
9505 #endif
9507 void
9508 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9509 tree decl ATTRIBUTE_UNUSED)
9511 char flagchars[10], *f = flagchars;
9513 /* If we have already declared this section, we can use an
9514 abbreviated form to switch back to it -- unless this section is
9515 part of a COMDAT group, in which case GAS requires the full
9516 declaration every time. */
9517 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9518 && (flags & SECTION_DECLARED))
9520 fprintf (asm_out_file, "\t.section\t%s\n", name);
9521 return;
9524 if (!(flags & SECTION_DEBUG))
9525 *f++ = 'a';
9526 if (flags & SECTION_WRITE)
9527 *f++ = 'w';
9528 if (flags & SECTION_CODE)
9529 *f++ = 'x';
9530 if (flags & SECTION_SMALL)
9531 *f++ = 's';
9532 if (flags & SECTION_MERGE)
9533 *f++ = 'M';
9534 if (flags & SECTION_STRINGS)
9535 *f++ = 'S';
9536 if (flags & SECTION_TLS)
9537 *f++ = TLS_SECTION_ASM_FLAG;
9538 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9539 *f++ = 'G';
9540 *f = '\0';
9542 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9544 if (!(flags & SECTION_NOTYPE))
9546 const char *type;
9547 const char *format;
9549 if (flags & SECTION_BSS)
9550 type = "nobits";
9551 else
9552 type = "progbits";
9554 #ifdef TYPE_OPERAND_FMT
9555 format = "," TYPE_OPERAND_FMT;
9556 #else
9557 format = ",@%s";
9558 #endif
9560 fprintf (asm_out_file, format, type);
9562 if (flags & SECTION_ENTSIZE)
9563 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9564 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9566 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9567 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9568 else
9569 fprintf (asm_out_file, ",%s,comdat",
9570 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9574 putc ('\n', asm_out_file);
9577 /* Select a format to encode pointers in exception handling data. */
9579 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9581 int type;
9582 switch (aarch64_cmodel)
9584 case AARCH64_CMODEL_TINY:
9585 case AARCH64_CMODEL_TINY_PIC:
9586 case AARCH64_CMODEL_SMALL:
9587 case AARCH64_CMODEL_SMALL_PIC:
9588 case AARCH64_CMODEL_SMALL_SPIC:
9589 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9590 for everything. */
9591 type = DW_EH_PE_sdata4;
9592 break;
9593 default:
9594 /* No assumptions here. 8-byte relocs required. */
9595 type = DW_EH_PE_sdata8;
9596 break;
9598 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9601 /* Emit load exclusive. */
9603 static void
9604 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9605 rtx mem, rtx model_rtx)
9607 rtx (*gen) (rtx, rtx, rtx);
9609 switch (mode)
9611 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9612 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9613 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9614 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9615 default:
9616 gcc_unreachable ();
9619 emit_insn (gen (rval, mem, model_rtx));
9622 /* Emit store exclusive. */
9624 static void
9625 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9626 rtx rval, rtx mem, rtx model_rtx)
9628 rtx (*gen) (rtx, rtx, rtx, rtx);
9630 switch (mode)
9632 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9633 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9634 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9635 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9636 default:
9637 gcc_unreachable ();
9640 emit_insn (gen (bval, rval, mem, model_rtx));
9643 /* Mark the previous jump instruction as unlikely. */
9645 static void
9646 aarch64_emit_unlikely_jump (rtx insn)
9648 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9650 insn = emit_jump_insn (insn);
9651 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9654 /* Expand a compare and swap pattern. */
9656 void
9657 aarch64_expand_compare_and_swap (rtx operands[])
9659 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9660 machine_mode mode, cmp_mode;
9661 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9663 bval = operands[0];
9664 rval = operands[1];
9665 mem = operands[2];
9666 oldval = operands[3];
9667 newval = operands[4];
9668 is_weak = operands[5];
9669 mod_s = operands[6];
9670 mod_f = operands[7];
9671 mode = GET_MODE (mem);
9672 cmp_mode = mode;
9674 /* Normally the succ memory model must be stronger than fail, but in the
9675 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9676 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9678 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9679 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9680 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9682 switch (mode)
9684 case QImode:
9685 case HImode:
9686 /* For short modes, we're going to perform the comparison in SImode,
9687 so do the zero-extension now. */
9688 cmp_mode = SImode;
9689 rval = gen_reg_rtx (SImode);
9690 oldval = convert_modes (SImode, mode, oldval, true);
9691 /* Fall through. */
9693 case SImode:
9694 case DImode:
9695 /* Force the value into a register if needed. */
9696 if (!aarch64_plus_operand (oldval, mode))
9697 oldval = force_reg (cmp_mode, oldval);
9698 break;
9700 default:
9701 gcc_unreachable ();
9704 switch (mode)
9706 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9707 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9708 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9709 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9710 default:
9711 gcc_unreachable ();
9714 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9716 if (mode == QImode || mode == HImode)
9717 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9719 x = gen_rtx_REG (CCmode, CC_REGNUM);
9720 x = gen_rtx_EQ (SImode, x, const0_rtx);
9721 emit_insn (gen_rtx_SET (bval, x));
9724 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
9725 sequence implementing an atomic operation. */
9727 static void
9728 aarch64_emit_post_barrier (enum memmodel model)
9730 const enum memmodel base_model = memmodel_base (model);
9732 if (is_mm_sync (model)
9733 && (base_model == MEMMODEL_ACQUIRE
9734 || base_model == MEMMODEL_ACQ_REL
9735 || base_model == MEMMODEL_SEQ_CST))
9737 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9741 /* Split a compare and swap pattern. */
9743 void
9744 aarch64_split_compare_and_swap (rtx operands[])
9746 rtx rval, mem, oldval, newval, scratch;
9747 machine_mode mode;
9748 bool is_weak;
9749 rtx_code_label *label1, *label2;
9750 rtx x, cond;
9751 enum memmodel model;
9752 rtx model_rtx;
9754 rval = operands[0];
9755 mem = operands[1];
9756 oldval = operands[2];
9757 newval = operands[3];
9758 is_weak = (operands[4] != const0_rtx);
9759 model_rtx = operands[5];
9760 scratch = operands[7];
9761 mode = GET_MODE (mem);
9762 model = memmodel_from_int (INTVAL (model_rtx));
9764 label1 = NULL;
9765 if (!is_weak)
9767 label1 = gen_label_rtx ();
9768 emit_label (label1);
9770 label2 = gen_label_rtx ();
9772 /* The initial load can be relaxed for a __sync operation since a final
9773 barrier will be emitted to stop code hoisting. */
9774 if (is_mm_sync (model))
9775 aarch64_emit_load_exclusive (mode, rval, mem,
9776 GEN_INT (MEMMODEL_RELAXED));
9777 else
9778 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9780 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9781 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9782 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9783 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9784 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9786 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9788 if (!is_weak)
9790 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9791 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9792 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9793 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9795 else
9797 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9798 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9799 emit_insn (gen_rtx_SET (cond, x));
9802 emit_label (label2);
9804 /* Emit any final barrier needed for a __sync operation. */
9805 if (is_mm_sync (model))
9806 aarch64_emit_post_barrier (model);
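/* The strong form of the sequence above therefore expands to roughly
   (register choices are illustrative):

     .Lretry:
       ldaxr  w0, [x1]        // load-exclusive, acquire if required
       cmp    w0, w2
       bne    .Ldone          // unexpected value: fail
       stlxr  w3, w4, [x1]    // store-exclusive, release if required
       cbnz   w3, .Lretry     // lost the exclusive monitor: retry
     .Ldone:

   The weak form omits the retry loop and simply reports whether the
   store-exclusive succeeded.  */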
9809 /* Split an atomic operation. */
9811 void
9812 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9813 rtx value, rtx model_rtx, rtx cond)
9815 machine_mode mode = GET_MODE (mem);
9816 machine_mode wmode = (mode == DImode ? DImode : SImode);
9817 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9818 const bool is_sync = is_mm_sync (model);
9819 rtx_code_label *label;
9820 rtx x;
9822 label = gen_label_rtx ();
9823 emit_label (label);
9825 if (new_out)
9826 new_out = gen_lowpart (wmode, new_out);
9827 if (old_out)
9828 old_out = gen_lowpart (wmode, old_out);
9829 else
9830 old_out = new_out;
9831 value = simplify_gen_subreg (wmode, value, mode, 0);
9833 /* The initial load can be relaxed for a __sync operation since a final
9834 barrier will be emitted to stop code hoisting. */
9835 if (is_sync)
9836 aarch64_emit_load_exclusive (mode, old_out, mem,
9837 GEN_INT (MEMMODEL_RELAXED));
9838 else
9839 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9841 switch (code)
9843 case SET:
9844 new_out = value;
9845 break;
9847 case NOT:
9848 x = gen_rtx_AND (wmode, old_out, value);
9849 emit_insn (gen_rtx_SET (new_out, x));
9850 x = gen_rtx_NOT (wmode, new_out);
9851 emit_insn (gen_rtx_SET (new_out, x));
9852 break;
9854 case MINUS:
9855 if (CONST_INT_P (value))
9857 value = GEN_INT (-INTVAL (value));
9858 code = PLUS;
9860 /* Fall through. */
9862 default:
9863 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9864 emit_insn (gen_rtx_SET (new_out, x));
9865 break;
9868 aarch64_emit_store_exclusive (mode, cond, mem,
9869 gen_lowpart (mode, new_out), model_rtx);
9871 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9872 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9873 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9874 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9876 /* Emit any final barrier needed for a __sync operation. */
9877 if (is_sync)
9878 aarch64_emit_post_barrier (model);
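/* For a relaxed atomic add on an SImode location, for example, this
   produces a loop of roughly the following shape (register choices are
   illustrative):

     .Lretry:
       ldxr  w0, [x2]
       add   w1, w0, w3
       stxr  w4, w1, [x2]
       cbnz  w4, .Lretry  */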
9881 static void
9882 aarch64_print_extension (void)
9884 const struct aarch64_option_extension *opt = NULL;
9886 for (opt = all_extensions; opt->name != NULL; opt++)
9887 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9888 asm_fprintf (asm_out_file, "+%s", opt->name);
9890 asm_fprintf (asm_out_file, "\n");
9893 static void
9894 aarch64_start_file (void)
9896 if (selected_arch)
9898 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9899 aarch64_print_extension ();
9901 else if (selected_cpu)
9903 const char *truncated_name
9904 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9905 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9906 aarch64_print_extension ();
9908 default_file_start ();
9911 /* Target hook for c_mode_for_suffix. */
9912 static machine_mode
9913 aarch64_c_mode_for_suffix (char suffix)
9915 if (suffix == 'q')
9916 return TFmode;
9918 return VOIDmode;
9921 /* We can only represent floating point constants which will fit in
9922 "quarter-precision" values. These values are characterised by
9923 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
9926 (-1)^s * (n/16) * 2^r
9928 Where:
9929 's' is the sign bit.
9930 'n' is an integer in the range 16 <= n <= 31.
9931 'r' is an integer in the range -3 <= r <= 4. */
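/* For example, 0.5 = (-1)^0 * (16/16) * 2^-1 (s = 0, n = 16, r = -1)
   and 31.0 = (-1)^0 * (31/16) * 2^4 (s = 0, n = 31, r = 4) are
   representable, whereas 0.1 has no such decomposition and so cannot
   be used as an FMOV immediate.  */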
9933 /* Return true iff X can be represented by a quarter-precision
9934 floating point immediate operand. Note, we cannot represent 0.0. */
9935 bool
9936 aarch64_float_const_representable_p (rtx x)
9938 /* This represents our current view of how many bits
9939 make up the mantissa. */
9940 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9941 int exponent;
9942 unsigned HOST_WIDE_INT mantissa, mask;
9943 REAL_VALUE_TYPE r, m;
9944 bool fail;
9946 if (!CONST_DOUBLE_P (x))
9947 return false;
9949 if (GET_MODE (x) == VOIDmode)
9950 return false;
9952 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9954 /* We cannot represent infinities, NaNs or +/-zero. We won't
9955 know if we have +zero until we analyse the mantissa, but we
9956 can reject the other invalid values. */
9957 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9958 || REAL_VALUE_MINUS_ZERO (r))
9959 return false;
9961 /* Extract exponent. */
9962 r = real_value_abs (&r);
9963 exponent = REAL_EXP (&r);
9965 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9966 highest (sign) bit, with a fixed binary point at bit point_pos.
9967 m1 holds the low part of the mantissa, m2 the high part.
9968 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9969 bits for the mantissa, this can fail (low bits will be lost). */
9970 real_ldexp (&m, &r, point_pos - exponent);
9971 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9973 /* If the low part of the mantissa has bits set we cannot represent
9974 the value. */
9975 if (w.elt (0) != 0)
9976 return false;
9977 /* We have rejected the lower HOST_WIDE_INT, so update our
9978 understanding of how many bits lie in the mantissa and
9979 look only at the high HOST_WIDE_INT. */
9980 mantissa = w.elt (1);
9981 point_pos -= HOST_BITS_PER_WIDE_INT;
9983 /* We can only represent values with a mantissa of the form 1.xxxx. */
9984 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9985 if ((mantissa & mask) != 0)
9986 return false;
9988 /* Having filtered unrepresentable values, we may now remove all
9989 but the highest 5 bits. */
9990 mantissa >>= point_pos - 5;
9992 /* We cannot represent the value 0.0, so reject it. This is handled
9993 elsewhere. */
9994 if (mantissa == 0)
9995 return false;
9997 /* Then, as bit 4 is always set, we can mask it off, leaving
9998 the mantissa in the range [0, 15]. */
9999 mantissa &= ~(1 << 4);
10000 gcc_assert (mantissa <= 15);
10002 /* GCC internally does not use IEEE754-like encoding (where normalized
10003 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
10004 Our mantissa values are shifted 4 places to the left relative to
10005 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
10006 by 5 places to correct for GCC's representation. */
10007 exponent = 5 - exponent;
10009 return (exponent >= 0 && exponent <= 7);
10012 char*
10013 aarch64_output_simd_mov_immediate (rtx const_vector,
10014 machine_mode mode,
10015 unsigned width)
10017 bool is_valid;
10018 static char templ[40];
10019 const char *mnemonic;
10020 const char *shift_op;
10021 unsigned int lane_count = 0;
10022 char element_char;
10024 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
10026 /* This will return true to show const_vector is legal for use as either
10027 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
10028 also update INFO to show how the immediate should be generated. */
10029 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
10030 gcc_assert (is_valid);
10032 element_char = sizetochar (info.element_width);
10033 lane_count = width / info.element_width;
10035 mode = GET_MODE_INNER (mode);
10036 if (mode == SFmode || mode == DFmode)
10038 gcc_assert (info.shift == 0 && ! info.mvn);
10039 if (aarch64_float_const_zero_rtx_p (info.value))
10040 info.value = GEN_INT (0);
10041 else
10043 #define buf_size 20
10044 REAL_VALUE_TYPE r;
10045 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
10046 char float_buf[buf_size] = {'\0'};
10047 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
10048 #undef buf_size
10050 if (lane_count == 1)
10051 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
10052 else
10053 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
10054 lane_count, element_char, float_buf);
10055 return templ;
10059 mnemonic = info.mvn ? "mvni" : "movi";
10060 shift_op = info.msl ? "msl" : "lsl";
10062 if (lane_count == 1)
10063 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
10064 mnemonic, UINTVAL (info.value));
10065 else if (info.shift)
10066 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
10067 ", %s %d", mnemonic, lane_count, element_char,
10068 UINTVAL (info.value), shift_op, info.shift);
10069 else
10070 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
10071 mnemonic, lane_count, element_char, UINTVAL (info.value));
10072 return templ;
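/* Typical results are strings along the lines of "movi v0.4s, 0x1"
   for a replicated 32-bit 1, "mvni v0.4s, 0xfe" for a replicated
   0xffffff01, or "fmov v0.2d, 1.0e+0" for a vector of doubles set to
   1.0 (the v0 operand shown here stands for the %0 placeholder in the
   returned template).  */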
10075 char*
10076 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
10077 machine_mode mode)
10079 machine_mode vmode;
10081 gcc_assert (!VECTOR_MODE_P (mode));
10082 vmode = aarch64_simd_container_mode (mode, 64);
10083 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
10084 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
10087 /* Split operands into moves from op[1] + op[2] into op[0]. */
10089 void
10090 aarch64_split_combinev16qi (rtx operands[3])
10092 unsigned int dest = REGNO (operands[0]);
10093 unsigned int src1 = REGNO (operands[1]);
10094 unsigned int src2 = REGNO (operands[2]);
10095 machine_mode halfmode = GET_MODE (operands[1]);
10096 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
10097 rtx destlo, desthi;
10099 gcc_assert (halfmode == V16QImode);
10101 if (src1 == dest && src2 == dest + halfregs)
10103 /* No-op move. Can't split to nothing; emit something. */
10104 emit_note (NOTE_INSN_DELETED);
10105 return;
10108 /* Preserve register attributes for variable tracking. */
10109 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
10110 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
10111 GET_MODE_SIZE (halfmode));
10113 /* Special case of reversed high/low parts. */
10114 if (reg_overlap_mentioned_p (operands[2], destlo)
10115 && reg_overlap_mentioned_p (operands[1], desthi))
10117 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10118 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
10119 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10121 else if (!reg_overlap_mentioned_p (operands[2], destlo))
10123 /* Try to avoid unnecessary moves if part of the result
10124 is in the right place already. */
10125 if (src1 != dest)
10126 emit_move_insn (destlo, operands[1]);
10127 if (src2 != dest + halfregs)
10128 emit_move_insn (desthi, operands[2]);
10130 else
10132 if (src2 != dest + halfregs)
10133 emit_move_insn (desthi, operands[2]);
10134 if (src1 != dest)
10135 emit_move_insn (destlo, operands[1]);
10139 /* vec_perm support. */
10141 #define MAX_VECT_LEN 16
10143 struct expand_vec_perm_d
10145 rtx target, op0, op1;
10146 unsigned char perm[MAX_VECT_LEN];
10147 machine_mode vmode;
10148 unsigned char nelt;
10149 bool one_vector_p;
10150 bool testing_p;
10153 /* Generate a variable permutation. */
10155 static void
10156 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
10158 machine_mode vmode = GET_MODE (target);
10159 bool one_vector_p = rtx_equal_p (op0, op1);
10161 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
10162 gcc_checking_assert (GET_MODE (op0) == vmode);
10163 gcc_checking_assert (GET_MODE (op1) == vmode);
10164 gcc_checking_assert (GET_MODE (sel) == vmode);
10165 gcc_checking_assert (TARGET_SIMD);
10167 if (one_vector_p)
10169 if (vmode == V8QImode)
10171 /* Expand the argument to a V16QI mode by duplicating it. */
10172 rtx pair = gen_reg_rtx (V16QImode);
10173 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
10174 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10176 else
10178 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
10181 else
10183 rtx pair;
10185 if (vmode == V8QImode)
10187 pair = gen_reg_rtx (V16QImode);
10188 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
10189 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10191 else
10193 pair = gen_reg_rtx (OImode);
10194 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
10195 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
10200 void
10201 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
10203 machine_mode vmode = GET_MODE (target);
10204 unsigned int nelt = GET_MODE_NUNITS (vmode);
10205 bool one_vector_p = rtx_equal_p (op0, op1);
10206 rtx mask;
10208 /* The TBL instruction does not use a modulo index, so we must take care
10209 of that ourselves. */
10210 mask = aarch64_simd_gen_const_vector_dup (vmode,
10211 one_vector_p ? nelt - 1 : 2 * nelt - 1);
10212 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
10214 /* For big-endian, we also need to reverse the index within the vector
10215 (but not which vector). */
10216 if (BYTES_BIG_ENDIAN)
10218 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
10219 if (!one_vector_p)
10220 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
10221 sel = expand_simple_binop (vmode, XOR, sel, mask,
10222 NULL, 0, OPTAB_LIB_WIDEN);
10224 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
10227 /* Recognize patterns suitable for the TRN instructions. */
10228 static bool
10229 aarch64_evpc_trn (struct expand_vec_perm_d *d)
10231 unsigned int i, odd, mask, nelt = d->nelt;
10232 rtx out, in0, in1, x;
10233 rtx (*gen) (rtx, rtx, rtx);
10234 machine_mode vmode = d->vmode;
10236 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10237 return false;
10239 /* Note that these are little-endian tests.
10240 We correct for big-endian later. */
10241 if (d->perm[0] == 0)
10242 odd = 0;
10243 else if (d->perm[0] == 1)
10244 odd = 1;
10245 else
10246 return false;
10247 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10249 for (i = 0; i < nelt; i += 2)
10251 if (d->perm[i] != i + odd)
10252 return false;
10253 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
10254 return false;
10257 /* Success! */
10258 if (d->testing_p)
10259 return true;
10261 in0 = d->op0;
10262 in1 = d->op1;
10263 if (BYTES_BIG_ENDIAN)
10265 x = in0, in0 = in1, in1 = x;
10266 odd = !odd;
10268 out = d->target;
10270 if (odd)
10272 switch (vmode)
10274 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
10275 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
10276 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
10277 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
10278 case V4SImode: gen = gen_aarch64_trn2v4si; break;
10279 case V2SImode: gen = gen_aarch64_trn2v2si; break;
10280 case V2DImode: gen = gen_aarch64_trn2v2di; break;
10281 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
10282 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
10283 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
10284 default:
10285 return false;
10288 else
10290 switch (vmode)
10292 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
10293 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
10294 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
10295 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
10296 case V4SImode: gen = gen_aarch64_trn1v4si; break;
10297 case V2SImode: gen = gen_aarch64_trn1v2si; break;
10298 case V2DImode: gen = gen_aarch64_trn1v2di; break;
10299 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
10300 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
10301 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
10302 default:
10303 return false;
10307 emit_insn (gen (out, in0, in1));
10308 return true;
10311 /* Recognize patterns suitable for the UZP instructions. */
10312 static bool
10313 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10315 unsigned int i, odd, mask, nelt = d->nelt;
10316 rtx out, in0, in1, x;
10317 rtx (*gen) (rtx, rtx, rtx);
10318 machine_mode vmode = d->vmode;
10320 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10321 return false;
10323 /* Note that these are little-endian tests.
10324 We correct for big-endian later. */
10325 if (d->perm[0] == 0)
10326 odd = 0;
10327 else if (d->perm[0] == 1)
10328 odd = 1;
10329 else
10330 return false;
10331 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10333 for (i = 0; i < nelt; i++)
10335 unsigned elt = (i * 2 + odd) & mask;
10336 if (d->perm[i] != elt)
10337 return false;
10340 /* Success! */
10341 if (d->testing_p)
10342 return true;
10344 in0 = d->op0;
10345 in1 = d->op1;
10346 if (BYTES_BIG_ENDIAN)
10348 x = in0, in0 = in1, in1 = x;
10349 odd = !odd;
10351 out = d->target;
10353 if (odd)
10355 switch (vmode)
10357 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10358 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10359 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10360 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10361 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10362 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10363 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10364 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10365 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10366 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10367 default:
10368 return false;
10371 else
10373 switch (vmode)
10375 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10376 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10377 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10378 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10379 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10380 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10381 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10382 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10383 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10384 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10385 default:
10386 return false;
10390 emit_insn (gen (out, in0, in1));
10391 return true;
10394 /* Recognize patterns suitable for the ZIP instructions. */
10395 static bool
10396 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10398 unsigned int i, high, mask, nelt = d->nelt;
10399 rtx out, in0, in1, x;
10400 rtx (*gen) (rtx, rtx, rtx);
10401 machine_mode vmode = d->vmode;
10403 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10404 return false;
10406 /* Note that these are little-endian tests.
10407 We correct for big-endian later. */
10408 high = nelt / 2;
10409 if (d->perm[0] == high)
10410 /* Do Nothing. */
10412 else if (d->perm[0] == 0)
10413 high = 0;
10414 else
10415 return false;
10416 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10418 for (i = 0; i < nelt / 2; i++)
10420 unsigned elt = (i + high) & mask;
10421 if (d->perm[i * 2] != elt)
10422 return false;
10423 elt = (elt + nelt) & mask;
10424 if (d->perm[i * 2 + 1] != elt)
10425 return false;
10428 /* Success! */
10429 if (d->testing_p)
10430 return true;
10432 in0 = d->op0;
10433 in1 = d->op1;
10434 if (BYTES_BIG_ENDIAN)
10436 x = in0, in0 = in1, in1 = x;
10437 high = !high;
10439 out = d->target;
10441 if (high)
10443 switch (vmode)
10445 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10446 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10447 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10448 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10449 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10450 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10451 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10452 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10453 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10454 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10455 default:
10456 return false;
10459 else
10461 switch (vmode)
10463 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10464 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10465 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10466 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10467 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10468 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10469 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10470 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10471 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10472 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10473 default:
10474 return false;
10478 emit_insn (gen (out, in0, in1));
10479 return true;
10482 /* Recognize patterns for the EXT insn. */
10484 static bool
10485 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10487 unsigned int i, nelt = d->nelt;
10488 rtx (*gen) (rtx, rtx, rtx, rtx);
10489 rtx offset;
10491 unsigned int location = d->perm[0]; /* Always < nelt. */
10493 /* Check if the extracted indices are increasing by one. */
10494 for (i = 1; i < nelt; i++)
10496 unsigned int required = location + i;
10497 if (d->one_vector_p)
10499 /* We'll pass the same vector in twice, so allow indices to wrap. */
10500 required &= (nelt - 1);
10502 if (d->perm[i] != required)
10503 return false;
10506 switch (d->vmode)
10508 case V16QImode: gen = gen_aarch64_extv16qi; break;
10509 case V8QImode: gen = gen_aarch64_extv8qi; break;
10510 case V4HImode: gen = gen_aarch64_extv4hi; break;
10511 case V8HImode: gen = gen_aarch64_extv8hi; break;
10512 case V2SImode: gen = gen_aarch64_extv2si; break;
10513 case V4SImode: gen = gen_aarch64_extv4si; break;
10514 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10515 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10516 case V2DImode: gen = gen_aarch64_extv2di; break;
10517 case V2DFmode: gen = gen_aarch64_extv2df; break;
10518 default:
10519 return false;
10522 /* Success! */
10523 if (d->testing_p)
10524 return true;
10526 /* The case where (location == 0) is a no-op for both big- and little-endian,
10527 and is removed by the mid-end at optimization levels -O1 and higher. */
10529 if (BYTES_BIG_ENDIAN && (location != 0))
10531 /* After setup, we want the high elements of the first vector (stored
10532 at the LSB end of the register), and the low elements of the second
10533 vector (stored at the MSB end of the register). So swap. */
10534 std::swap (d->op0, d->op1);
10535 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10536 location = nelt - location;
10539 offset = GEN_INT (location);
10540 emit_insn (gen (d->target, d->op0, d->op1, offset));
10541 return true;
10544 /* Recognize patterns for the REV insns. */
10546 static bool
10547 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10549 unsigned int i, j, diff, nelt = d->nelt;
10550 rtx (*gen) (rtx, rtx);
10552 if (!d->one_vector_p)
10553 return false;
10555 diff = d->perm[0];
10556 switch (diff)
10558 case 7:
10559 switch (d->vmode)
10561 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10562 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10563 default:
10564 return false;
10566 break;
10567 case 3:
10568 switch (d->vmode)
10570 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10571 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10572 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10573 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10574 default:
10575 return false;
10577 break;
10578 case 1:
10579 switch (d->vmode)
10581 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10582 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10583 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10584 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10585 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10586 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10587 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10588 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10589 default:
10590 return false;
10592 break;
10593 default:
10594 return false;
10597 for (i = 0; i < nelt ; i += diff + 1)
10598 for (j = 0; j <= diff; j += 1)
10600 /* This is guaranteed to be true, as the value of diff
10601 is 7, 3 or 1 and we should have enough elements in the
10602 queue to generate this. Getting a vector mask with a
10603 value of diff other than these values implies that
10604 something has gone wrong by the time we get here. */
10605 gcc_assert (i + j < nelt);
10606 if (d->perm[i + j] != i + diff - j)
10607 return false;
10610 /* Success! */
10611 if (d->testing_p)
10612 return true;
10614 emit_insn (gen (d->target, d->op0));
10615 return true;
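/* Recognize a lane broadcast, i.e. a permutation in which every index selects
   the same element, and expand it with a DUP-lane instruction. */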
10618 static bool
10619 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10621 rtx (*gen) (rtx, rtx, rtx);
10622 rtx out = d->target;
10623 rtx in0;
10624 machine_mode vmode = d->vmode;
10625 unsigned int i, elt, nelt = d->nelt;
10626 rtx lane;
10628 elt = d->perm[0];
10629 for (i = 1; i < nelt; i++)
10631 if (elt != d->perm[i])
10632 return false;
10635 /* The generic preparation in aarch64_expand_vec_perm_const_1
10636 swaps the operand order and the permute indices if it finds
10637 d->perm[0] to be in the second operand. Thus, we can always
10638 use d->op0 and need not do any extra arithmetic to get the
10639 correct lane number. */
10640 in0 = d->op0;
10641 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10643 switch (vmode)
10645 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10646 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10647 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10648 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10649 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10650 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10651 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10652 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10653 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10654 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10655 default:
10656 return false;
10659 emit_insn (gen (out, in0, lane));
10660 return true;
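/* Fall back to a full table lookup using the TBL instruction. Only QImode
   element vectors are handled here; for other modes the generic code retries
   with the indices lowered to QImode, as noted below. */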
10663 static bool
10664 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10666 rtx rperm[MAX_VECT_LEN], sel;
10667 machine_mode vmode = d->vmode;
10668 unsigned int i, nelt = d->nelt;
10670 if (d->testing_p)
10671 return true;
10673 /* Generic code will try constant permutation twice. Once with the
10674 original mode and again with the elements lowered to QImode.
10675 So wait and don't do the selector expansion ourselves. */
10676 if (vmode != V8QImode && vmode != V16QImode)
10677 return false;
10679 for (i = 0; i < nelt; ++i)
10681 int nunits = GET_MODE_NUNITS (vmode);
10683 /* If big-endian and two vectors we end up with a weird mixed-endian
10684 mode on NEON. Reverse the index within each word but not the word
10685 itself. */
10686 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10687 : d->perm[i]);
10689 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10690 sel = force_reg (vmode, sel);
10692 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10693 return true;
10696 static bool
10697 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10699 /* The pattern matching functions above are written to look for a small
10700 number to begin the sequence (0, 1, N/2). If we begin with an index
10701 from the second operand, we can swap the operands. */
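/* For example, with nelt == 4 a permutation {5, 6, 7, 4} becomes {1, 2, 3, 0}
   applied to the swapped operands: each index is XORed with nelt, which the
   assertion below requires to be a power of two. */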
10702 if (d->perm[0] >= d->nelt)
10704 unsigned i, nelt = d->nelt;
10706 gcc_assert (nelt == (nelt & -nelt));
10707 for (i = 0; i < nelt; ++i)
10708 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10710 std::swap (d->op0, d->op1);
10713 if (TARGET_SIMD)
10715 if (aarch64_evpc_rev (d))
10716 return true;
10717 else if (aarch64_evpc_ext (d))
10718 return true;
10719 else if (aarch64_evpc_dup (d))
10720 return true;
10721 else if (aarch64_evpc_zip (d))
10722 return true;
10723 else if (aarch64_evpc_uzp (d))
10724 return true;
10725 else if (aarch64_evpc_trn (d))
10726 return true;
10727 return aarch64_evpc_tbl (d);
10729 return false;
10732 /* Expand a vec_perm_const pattern. */
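/* TARGET is the destination, OP0/OP1 the two input vectors, and SEL a
   CONST_VECTOR of indices in the range [0, 2 * nelt). The WHICH bitmask
   computed below records whether the indices refer only to OP0 (1), only to
   OP1 (2), or to both (3), so single-input cases can be simplified first. */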
10734 bool
10735 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10737 struct expand_vec_perm_d d;
10738 int i, nelt, which;
10740 d.target = target;
10741 d.op0 = op0;
10742 d.op1 = op1;
10744 d.vmode = GET_MODE (target);
10745 gcc_assert (VECTOR_MODE_P (d.vmode));
10746 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10747 d.testing_p = false;
10749 for (i = which = 0; i < nelt; ++i)
10751 rtx e = XVECEXP (sel, 0, i);
10752 int ei = INTVAL (e) & (2 * nelt - 1);
10753 which |= (ei < nelt ? 1 : 2);
10754 d.perm[i] = ei;
10757 switch (which)
10759 default:
10760 gcc_unreachable ();
10762 case 3:
10763 d.one_vector_p = false;
10764 if (!rtx_equal_p (op0, op1))
10765 break;
10767 /* The elements of PERM do not suggest that only the first operand
10768 is used, but both operands are identical. Allow easier matching
10769 of the permutation by folding the permutation into the single
10770 input vector. */
10771 /* Fall Through. */
10772 case 2:
10773 for (i = 0; i < nelt; ++i)
10774 d.perm[i] &= nelt - 1;
10775 d.op0 = op1;
10776 d.one_vector_p = true;
10777 break;
10779 case 1:
10780 d.op1 = op0;
10781 d.one_vector_p = true;
10782 break;
10785 return aarch64_expand_vec_perm_const_1 (&d);
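/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. Check, without emitting any
   insns (d.testing_p is set and the generated sequence is discarded), whether
   the permutation described by the NELT indices in SEL can be expanded for
   mode VMODE. */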
10788 static bool
10789 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10790 const unsigned char *sel)
10792 struct expand_vec_perm_d d;
10793 unsigned int i, nelt, which;
10794 bool ret;
10796 d.vmode = vmode;
10797 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10798 d.testing_p = true;
10799 memcpy (d.perm, sel, nelt);
10801 /* Calculate whether all elements are in one vector. */
10802 for (i = which = 0; i < nelt; ++i)
10804 unsigned char e = d.perm[i];
10805 gcc_assert (e < 2 * nelt);
10806 which |= (e < nelt ? 1 : 2);
10809 /* If all elements are from the second vector, reindex as if from the
10810 first vector. */
10811 if (which == 2)
10812 for (i = 0; i < nelt; ++i)
10813 d.perm[i] -= nelt;
10815 /* Check whether the mask can be applied to a single vector. */
10816 d.one_vector_p = (which != 3);
10818 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10819 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10820 if (!d.one_vector_p)
10821 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10823 start_sequence ();
10824 ret = aarch64_expand_vec_perm_const_1 (&d);
10825 end_sequence ();
10827 return ret;
10830 rtx
10831 aarch64_reverse_mask (enum machine_mode mode)
10833 /* We have to reverse each vector because we don't have
10834 a permuted load that can reverse-load according to ABI rules. */
10835 rtx mask;
10836 rtvec v = rtvec_alloc (16);
10837 int i, j;
10838 int nunits = GET_MODE_NUNITS (mode);
10839 int usize = GET_MODE_UNIT_SIZE (mode);
10841 gcc_assert (BYTES_BIG_ENDIAN);
10842 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10844 for (i = 0; i < nunits; i++)
10845 for (j = 0; j < usize; j++)
10846 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
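/* For example, for V4SImode (nunits == 4, usize == 4) the mask bytes are
   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12: each element has its bytes
   reversed in place. */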
10847 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10848 return force_reg (V16QImode, mask);
10851 /* Implement MODES_TIEABLE_P. */
10853 bool
10854 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10856 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10857 return true;
10859 /* We specifically want to allow elements of "structure" modes to
10860 be tieable to the structure. This more general condition allows
10861 other rarer situations too. */
10862 if (TARGET_SIMD
10863 && aarch64_vector_mode_p (mode1)
10864 && aarch64_vector_mode_p (mode2))
10865 return true;
10867 return false;
10870 /* Return a new RTX holding the result of moving POINTER forward by
10871 AMOUNT bytes. */
10873 static rtx
10874 aarch64_move_pointer (rtx pointer, int amount)
10876 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10878 return adjust_automodify_address (pointer, GET_MODE (pointer),
10879 next, amount);
10882 /* Return a new RTX holding the result of moving POINTER forward by the
10883 size of the mode it points to. */
10885 static rtx
10886 aarch64_progress_pointer (rtx pointer)
10888 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10890 return aarch64_move_pointer (pointer, amount);
10893 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10894 MODE bytes. */
10896 static void
10897 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10898 machine_mode mode)
10900 rtx reg = gen_reg_rtx (mode);
10902 /* "Cast" the pointers to the correct mode. */
10903 *src = adjust_address (*src, mode, 0);
10904 *dst = adjust_address (*dst, mode, 0);
10905 /* Emit the memcpy. */
10906 emit_move_insn (reg, *src);
10907 emit_move_insn (*dst, reg);
10908 /* Move the pointers forward. */
10909 *src = aarch64_progress_pointer (*src);
10910 *dst = aarch64_progress_pointer (*dst);
10913 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10914 we succeed, otherwise return false. */
10916 bool
10917 aarch64_expand_movmem (rtx *operands)
10919 unsigned int n;
10920 rtx dst = operands[0];
10921 rtx src = operands[1];
10922 rtx base;
10923 bool speed_p = !optimize_function_for_size_p (cfun);
10925 /* When optimizing for size, give a better estimate of the length of a
10926 memcpy call, but use the default otherwise. */
10927 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10929 /* We can't do anything smart if the amount to copy is not constant. */
10930 if (!CONST_INT_P (operands[2]))
10931 return false;
10933 n = UINTVAL (operands[2]);
10935 /* Try to keep the number of instructions low. For cases below 16 bytes we
10936 need to make at most two moves. For cases above 16 bytes it will be one
10937 move for each 16-byte chunk, then at most two additional moves. */
10938 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10939 return false;
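/* For example, n == 35 is estimated as 35/16 + 2 == 4 instructions, which is
   within the speed limit of 15/2 == 7, so the expansion proceeds. */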
10941 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10942 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10944 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10945 src = adjust_automodify_address (src, VOIDmode, base, 0);
10947 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10948 1-byte chunk. */
10949 if (n < 4)
10951 if (n >= 2)
10953 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10954 n -= 2;
10957 if (n == 1)
10958 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10960 return true;
10963 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10964 4-byte chunk, partially overlapping with the previously copied chunk. */
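/* For example, n == 7 copies bytes 0-3, leaving 3 bytes; both pointers are
   then moved back by one byte so that a second 4-byte copy covers bytes
   3-6. */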
10965 if (n < 8)
10967 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10968 n -= 4;
10969 if (n > 0)
10971 int move = n - 4;
10973 src = aarch64_move_pointer (src, move);
10974 dst = aarch64_move_pointer (dst, move);
10975 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10977 return true;
10980 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10981 them, then (if applicable) an 8-byte chunk. */
10982 while (n >= 8)
10984 if (n / 16)
10986 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10987 n -= 16;
10989 else
10991 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10992 n -= 8;
10996 /* Finish the final bytes of the copy. We can always do this in one
10997 instruction. We either copy the exact amount we need, or partially
10998 overlap with the previous chunk we copied and copy 8 bytes. */
10999 if (n == 0)
11000 return true;
11001 else if (n == 1)
11002 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
11003 else if (n == 2)
11004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
11005 else if (n == 4)
11006 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11007 else
11009 if (n == 3)
11011 src = aarch64_move_pointer (src, -1);
11012 dst = aarch64_move_pointer (dst, -1);
11013 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11015 else
11017 int move = n - 8;
11019 src = aarch64_move_pointer (src, move);
11020 dst = aarch64_move_pointer (dst, move);
11021 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
11025 return true;
11028 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
11030 static unsigned HOST_WIDE_INT
11031 aarch64_asan_shadow_offset (void)
11033 return (HOST_WIDE_INT_1 << 36);
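/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */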
11036 static bool
11037 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
11038 unsigned int align,
11039 enum by_pieces_operation op,
11040 bool speed_p)
11042 /* STORE_BY_PIECES can be used when copying a constant string, but
11043 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
11044 For now we always fail this and let the move_by_pieces code copy
11045 the string from read-only memory. */
11046 if (op == STORE_BY_PIECES)
11047 return false;
11049 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
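/* Map the comparison code CODE to the CC_D* condition mode used by the
   conditional-compare (CCMP) expansion below, or return CCmode if CODE is not
   supported. */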
11052 static enum machine_mode
11053 aarch64_code_to_ccmode (enum rtx_code code)
11055 switch (code)
11057 case NE:
11058 return CC_DNEmode;
11060 case EQ:
11061 return CC_DEQmode;
11063 case LE:
11064 return CC_DLEmode;
11066 case LT:
11067 return CC_DLTmode;
11069 case GE:
11070 return CC_DGEmode;
11072 case GT:
11073 return CC_DGTmode;
11075 case LEU:
11076 return CC_DLEUmode;
11078 case LTU:
11079 return CC_DLTUmode;
11081 case GEU:
11082 return CC_DGEUmode;
11084 case GTU:
11085 return CC_DGTUmode;
11087 default:
11088 return CCmode;
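/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
   conditional-compare chain: compare TREEOP0 against TREEOP1 with CODE,
   storing the operand-preparation insns in *PREP_SEQ and the compare itself
   in *GEN_SEQ. Return the CC register in the selected CC_D* mode, or
   NULL_RTX if the comparison cannot be handled. */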
11092 static rtx
11093 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
11094 int code, tree treeop0, tree treeop1)
11096 enum machine_mode op_mode, cmp_mode, cc_mode;
11097 rtx op0, op1, cmp, target;
11098 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11099 enum insn_code icode;
11100 struct expand_operand ops[4];
11102 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
11103 if (cc_mode == CCmode)
11104 return NULL_RTX;
11106 start_sequence ();
11107 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11109 op_mode = GET_MODE (op0);
11110 if (op_mode == VOIDmode)
11111 op_mode = GET_MODE (op1);
11113 switch (op_mode)
11115 case QImode:
11116 case HImode:
11117 case SImode:
11118 cmp_mode = SImode;
11119 icode = CODE_FOR_cmpsi;
11120 break;
11122 case DImode:
11123 cmp_mode = DImode;
11124 icode = CODE_FOR_cmpdi;
11125 break;
11127 default:
11128 end_sequence ();
11129 return NULL_RTX;
11132 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11133 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11134 if (!op0 || !op1)
11136 end_sequence ();
11137 return NULL_RTX;
11139 *prep_seq = get_insns ();
11140 end_sequence ();
11142 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
11143 target = gen_rtx_REG (CCmode, CC_REGNUM);
11145 create_output_operand (&ops[0], target, CCmode);
11146 create_fixed_operand (&ops[1], cmp);
11147 create_fixed_operand (&ops[2], op0);
11148 create_fixed_operand (&ops[3], op1);
11150 start_sequence ();
11151 if (!maybe_expand_insn (icode, 4, ops))
11153 end_sequence ();
11154 return NULL_RTX;
11156 *gen_seq = get_insns ();
11157 end_sequence ();
11159 return gen_rtx_REG (cc_mode, CC_REGNUM);
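/* Implement TARGET_GEN_CCMP_NEXT. Expand a subsequent comparison of the
   chain, combining the new comparison CMP_CODE of TREEOP0 and TREEOP1 with
   the previous CC result PREV under BIT_CODE (AND or IOR), appending to
   *PREP_SEQ and *GEN_SEQ. Return the CC register, or NULL_RTX on failure. */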
11162 static rtx
11163 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
11164 tree treeop0, tree treeop1, int bit_code)
11166 rtx op0, op1, cmp0, cmp1, target;
11167 enum machine_mode op_mode, cmp_mode, cc_mode;
11168 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11169 enum insn_code icode = CODE_FOR_ccmp_andsi;
11170 struct expand_operand ops[6];
11172 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
11173 if (cc_mode == CCmode)
11174 return NULL_RTX;
11176 push_to_sequence ((rtx_insn*) *prep_seq);
11177 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11179 op_mode = GET_MODE (op0);
11180 if (op_mode == VOIDmode)
11181 op_mode = GET_MODE (op1);
11183 switch (op_mode)
11185 case QImode:
11186 case HImode:
11187 case SImode:
11188 cmp_mode = SImode;
11189 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
11190 : CODE_FOR_ccmp_iorsi;
11191 break;
11193 case DImode:
11194 cmp_mode = DImode;
11195 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
11196 : CODE_FOR_ccmp_iordi;
11197 break;
11199 default:
11200 end_sequence ();
11201 return NULL_RTX;
11204 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11205 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11206 if (!op0 || !op1)
11208 end_sequence ();
11209 return NULL_RTX;
11211 *prep_seq = get_insns ();
11212 end_sequence ();
11214 target = gen_rtx_REG (cc_mode, CC_REGNUM);
11215 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
11216 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
11218 create_fixed_operand (&ops[0], prev);
11219 create_fixed_operand (&ops[1], target);
11220 create_fixed_operand (&ops[2], op0);
11221 create_fixed_operand (&ops[3], op1);
11222 create_fixed_operand (&ops[4], cmp0);
11223 create_fixed_operand (&ops[5], cmp1);
11225 push_to_sequence ((rtx_insn*) *gen_seq);
11226 if (!maybe_expand_insn (icode, 6, ops))
11228 end_sequence ();
11229 return NULL_RTX;
11232 *gen_seq = get_insns ();
11233 end_sequence ();
11235 return target;
11238 #undef TARGET_GEN_CCMP_FIRST
11239 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
11241 #undef TARGET_GEN_CCMP_NEXT
11242 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
11244 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target
11245 supports instruction fusion of some sort. */
11247 static bool
11248 aarch64_macro_fusion_p (void)
11250 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
11254 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
11255 should be kept together during scheduling. */
11257 static bool
11258 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
11260 rtx set_dest;
11261 rtx prev_set = single_set (prev);
11262 rtx curr_set = single_set (curr);
11263 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
11264 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
11266 if (!aarch64_macro_fusion_p ())
11267 return false;
11269 if (simple_sets_p
11270 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
11272 /* We are trying to match:
11273 prev (mov) == (set (reg r0) (const_int imm16))
11274 curr (movk) == (set (zero_extract (reg r0)
11275 (const_int 16)
11276 (const_int 16))
11277 (const_int imm16_1)) */
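/* In assembly terms this is a pair such as:
   mov w0, #imm16 followed by movk w0, #imm16_1, lsl #16. */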
11279 set_dest = SET_DEST (curr_set);
11281 if (GET_CODE (set_dest) == ZERO_EXTRACT
11282 && CONST_INT_P (SET_SRC (curr_set))
11283 && CONST_INT_P (SET_SRC (prev_set))
11284 && CONST_INT_P (XEXP (set_dest, 2))
11285 && INTVAL (XEXP (set_dest, 2)) == 16
11286 && REG_P (XEXP (set_dest, 0))
11287 && REG_P (SET_DEST (prev_set))
11288 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
11290 return true;
11294 if (simple_sets_p
11295 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
11298 /* We're trying to match:
11299 prev (adrp) == (set (reg r1)
11300 (high (symbol_ref ("SYM"))))
11301 curr (add) == (set (reg r0)
11302 (lo_sum (reg r1)
11303 (symbol_ref ("SYM"))))
11304 Note that r0 need not necessarily be the same as r1, especially
11305 during pre-regalloc scheduling. */
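/* In assembly terms: adrp x1, SYM followed by add x0, x1, :lo12:SYM. */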
11307 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11308 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11310 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11311 && REG_P (XEXP (SET_SRC (curr_set), 0))
11312 && REGNO (XEXP (SET_SRC (curr_set), 0))
11313 == REGNO (SET_DEST (prev_set))
11314 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11315 XEXP (SET_SRC (curr_set), 1)))
11316 return true;
11320 if (simple_sets_p
11321 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11324 /* We're trying to match:
11325 prev (movk) == (set (zero_extract (reg r0)
11326 (const_int 16)
11327 (const_int 32))
11328 (const_int imm16_1))
11329 curr (movk) == (set (zero_extract (reg r0)
11330 (const_int 16)
11331 (const_int 48))
11332 (const_int imm16_2)) */
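/* In assembly terms: movk x0, #imm16_1, lsl #32 followed by
   movk x0, #imm16_2, lsl #48. */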
11334 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11335 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11336 && REG_P (XEXP (SET_DEST (prev_set), 0))
11337 && REG_P (XEXP (SET_DEST (curr_set), 0))
11338 && REGNO (XEXP (SET_DEST (prev_set), 0))
11339 == REGNO (XEXP (SET_DEST (curr_set), 0))
11340 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11341 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11342 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11343 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11344 && CONST_INT_P (SET_SRC (prev_set))
11345 && CONST_INT_P (SET_SRC (curr_set)))
11346 return true;
11349 if (simple_sets_p
11350 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
11352 /* We're trying to match:
11353 prev (adrp) == (set (reg r0)
11354 (high (symbol_ref ("SYM"))))
11355 curr (ldr) == (set (reg r1)
11356 (mem (lo_sum (reg r0)
11357 (symbol_ref ("SYM")))))
11359 curr (ldr) == (set (reg r1)
11360 (zero_extend (mem
11361 (lo_sum (reg r0)
11362 (symbol_ref ("SYM")))))) */
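/* In assembly terms: adrp x0, SYM followed by ldr w1, [x0, :lo12:SYM],
   possibly in a zero-extending (ldrb/ldrh) form. */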
11363 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11364 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11366 rtx curr_src = SET_SRC (curr_set);
11368 if (GET_CODE (curr_src) == ZERO_EXTEND)
11369 curr_src = XEXP (curr_src, 0);
11371 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11372 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11373 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11374 == REGNO (SET_DEST (prev_set))
11375 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11376 XEXP (SET_SRC (prev_set), 0)))
11377 return true;
11381 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11382 && any_condjump_p (curr))
11384 enum attr_type prev_type = get_attr_type (prev);
11386 /* FIXME: this misses some cases that are considered simple arithmetic
11387 instructions for ThunderX. Simple shifts are missed here. */
11388 if (prev_type == TYPE_ALUS_SREG
11389 || prev_type == TYPE_ALUS_IMM
11390 || prev_type == TYPE_LOGICS_REG
11391 || prev_type == TYPE_LOGICS_IMM)
11392 return true;
11395 return false;
11398 /* If MEM is in the form of [base+offset], extract the two parts
11399 of the address into BASE and OFFSET, otherwise return false
11400 after clearing BASE and OFFSET. */
11402 bool
11403 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11405 rtx addr;
11407 gcc_assert (MEM_P (mem));
11409 addr = XEXP (mem, 0);
11411 if (REG_P (addr))
11413 *base = addr;
11414 *offset = const0_rtx;
11415 return true;
11418 if (GET_CODE (addr) == PLUS
11419 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11421 *base = XEXP (addr, 0);
11422 *offset = XEXP (addr, 1);
11423 return true;
11426 *base = NULL_RTX;
11427 *offset = NULL_RTX;
11429 return false;
11432 /* Types for scheduling fusion. */
11433 enum sched_fusion_type
11435 SCHED_FUSION_NONE = 0,
11436 SCHED_FUSION_LD_SIGN_EXTEND,
11437 SCHED_FUSION_LD_ZERO_EXTEND,
11438 SCHED_FUSION_LD,
11439 SCHED_FUSION_ST,
11440 SCHED_FUSION_NUM
11443 /* If INSN is a load or store whose address is in the form of [base+offset],
11444 extract the two parts into BASE and OFFSET. Return the scheduling
11445 fusion type of this INSN. */
11447 static enum sched_fusion_type
11448 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11450 rtx x, dest, src;
11451 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11453 gcc_assert (INSN_P (insn));
11454 x = PATTERN (insn);
11455 if (GET_CODE (x) != SET)
11456 return SCHED_FUSION_NONE;
11458 src = SET_SRC (x);
11459 dest = SET_DEST (x);
11461 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11462 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11463 return SCHED_FUSION_NONE;
11465 if (GET_CODE (src) == SIGN_EXTEND)
11467 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11468 src = XEXP (src, 0);
11469 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11470 return SCHED_FUSION_NONE;
11472 else if (GET_CODE (src) == ZERO_EXTEND)
11474 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11475 src = XEXP (src, 0);
11476 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11477 return SCHED_FUSION_NONE;
11480 if (GET_CODE (src) == MEM && REG_P (dest))
11481 extract_base_offset_in_addr (src, base, offset);
11482 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11484 fusion = SCHED_FUSION_ST;
11485 extract_base_offset_in_addr (dest, base, offset);
11487 else
11488 return SCHED_FUSION_NONE;
11490 if (*base == NULL_RTX || *offset == NULL_RTX)
11491 fusion = SCHED_FUSION_NONE;
11493 return fusion;
11496 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11498 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11499 and PRI are only calculated for these instructions. For other instructions,
11500 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
11501 types of instruction fusion can be added by returning different priorities.
11503 It's important that irrelevant instructions get the largest FUSION_PRI. */
11505 static void
11506 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11507 int *fusion_pri, int *pri)
11509 int tmp, off_val;
11510 rtx base, offset;
11511 enum sched_fusion_type fusion;
11513 gcc_assert (INSN_P (insn));
11515 tmp = max_pri - 1;
11516 fusion = fusion_load_store (insn, &base, &offset);
11517 if (fusion == SCHED_FUSION_NONE)
11519 *pri = tmp;
11520 *fusion_pri = tmp;
11521 return;
11524 /* Set FUSION_PRI according to fusion type and base register. */
11525 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11527 /* Calculate PRI. */
11528 tmp /= 2;
11530 /* The INSN with the smaller offset goes first. */
11531 off_val = (int)(INTVAL (offset));
11532 if (off_val >= 0)
11533 tmp -= (off_val & 0xfffff);
11534 else
11535 tmp += ((- off_val) & 0xfffff);
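/* For example, two SImode loads from the same base with offsets 4 and 8
   receive the same FUSION_PRI, while the offset-4 load gets the larger PRI
   and is therefore scheduled first. */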
11537 *pri = tmp;
11538 return;
11541 /* Given OPERANDS of consecutive load/store instructions, check if we can
11542 merge them into an ldp/stp. LOAD is true if they are load instructions.
11543 MODE is the mode of the memory operands. */
11545 bool
11546 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11547 enum machine_mode mode)
11549 HOST_WIDE_INT offval_1, offval_2, msize;
11550 enum reg_class rclass_1, rclass_2;
11551 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11553 if (load)
11555 mem_1 = operands[1];
11556 mem_2 = operands[3];
11557 reg_1 = operands[0];
11558 reg_2 = operands[2];
11559 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11560 if (REGNO (reg_1) == REGNO (reg_2))
11561 return false;
11563 else
11565 mem_1 = operands[0];
11566 mem_2 = operands[2];
11567 reg_1 = operands[1];
11568 reg_2 = operands[3];
11571 /* The mems cannot be volatile. */
11572 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11573 return false;
11575 /* Check if the addresses are in the form of [base+offset]. */
11576 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11577 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11578 return false;
11579 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11580 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11581 return false;
11583 /* Check if the bases are the same. */
11584 if (!rtx_equal_p (base_1, base_2))
11585 return false;
11587 offval_1 = INTVAL (offset_1);
11588 offval_2 = INTVAL (offset_2);
11589 msize = GET_MODE_SIZE (mode);
11590 /* Check if the offsets are consecutive. */
11591 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11592 return false;
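/* For example, for SImode (msize == 4) the offset pairs 0x20/0x24 and
   0x24/0x20 are consecutive, while 0x20/0x28 is not. */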
11594 /* Check if the addresses are clobbered by load. */
11595 if (load)
11597 if (reg_mentioned_p (reg_1, mem_1))
11598 return false;
11600 /* In increasing order, the last load can clobber the address. */
11601 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11602 return false;
11605 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11606 rclass_1 = FP_REGS;
11607 else
11608 rclass_1 = GENERAL_REGS;
11610 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11611 rclass_2 = FP_REGS;
11612 else
11613 rclass_2 = GENERAL_REGS;
11615 /* Check if the registers are of the same class. */
11616 if (rclass_1 != rclass_2)
11617 return false;
11619 return true;
11622 /* Given OPERANDS of consecutive load/store, check if we can merge
11623 them into ldp/stp by adjusting the offset. LOAD is true if they
11624 are load instructions. MODE is the mode of the memory operands.
11626 Given consecutive stores such as:
11628 str w1, [xb, 0x100]
11629 str w1, [xb, 0x104]
11630 str w1, [xb, 0x108]
11631 str w1, [xb, 0x10c]
11633 Though the offsets are out of the range supported by stp, we can
11634 still pair them after adjusting the offset, like:
11636 add scratch, xb, 0x100
11637 stp w1, w1, [scratch]
11638 stp w1, w1, [scratch, 0x8]
11640 The peephole patterns detecting this opportunity should guarantee
11641 the scratch register is available. */
11643 bool
11644 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11645 enum machine_mode mode)
11647 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11648 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11649 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11650 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11652 if (load)
11654 reg_1 = operands[0];
11655 mem_1 = operands[1];
11656 reg_2 = operands[2];
11657 mem_2 = operands[3];
11658 reg_3 = operands[4];
11659 mem_3 = operands[5];
11660 reg_4 = operands[6];
11661 mem_4 = operands[7];
11662 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11663 && REG_P (reg_3) && REG_P (reg_4));
11664 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11665 return false;
11667 else
11669 mem_1 = operands[0];
11670 reg_1 = operands[1];
11671 mem_2 = operands[2];
11672 reg_2 = operands[3];
11673 mem_3 = operands[4];
11674 reg_3 = operands[5];
11675 mem_4 = operands[6];
11676 reg_4 = operands[7];
11678 /* Skip if the memory operand is by itself valid for ldp/stp. */
11679 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11680 return false;
11682 /* The mems cannot be volatile. */
11683 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11684 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11685 return false;
11687 /* Check if the addresses are in the form of [base+offset]. */
11688 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11689 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11690 return false;
11691 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11692 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11693 return false;
11694 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11695 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11696 return false;
11697 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11698 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11699 return false;
11701 /* Check if the bases are the same. */
11702 if (!rtx_equal_p (base_1, base_2)
11703 || !rtx_equal_p (base_2, base_3)
11704 || !rtx_equal_p (base_3, base_4))
11705 return false;
11707 offval_1 = INTVAL (offset_1);
11708 offval_2 = INTVAL (offset_2);
11709 offval_3 = INTVAL (offset_3);
11710 offval_4 = INTVAL (offset_4);
11711 msize = GET_MODE_SIZE (mode);
11712 /* Check if the offsets are consecutive. */
11713 if ((offval_1 != (offval_2 + msize)
11714 || offval_1 != (offval_3 + msize * 2)
11715 || offval_1 != (offval_4 + msize * 3))
11716 && (offval_4 != (offval_3 + msize)
11717 || offval_4 != (offval_2 + msize * 2)
11718 || offval_4 != (offval_1 + msize * 3)))
11719 return false;
11721 /* Check if the addresses are clobbered by load. */
11722 if (load)
11724 if (reg_mentioned_p (reg_1, mem_1)
11725 || reg_mentioned_p (reg_2, mem_2)
11726 || reg_mentioned_p (reg_3, mem_3))
11727 return false;
11729 /* In increasing order, the last load can clobber the address. */
11730 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11731 return false;
11734 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11735 rclass_1 = FP_REGS;
11736 else
11737 rclass_1 = GENERAL_REGS;
11739 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11740 rclass_2 = FP_REGS;
11741 else
11742 rclass_2 = GENERAL_REGS;
11744 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11745 rclass_3 = FP_REGS;
11746 else
11747 rclass_3 = GENERAL_REGS;
11749 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11750 rclass_4 = FP_REGS;
11751 else
11752 rclass_4 = GENERAL_REGS;
11754 /* Check if the registers are of the same class. */
11755 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11756 return false;
11758 return true;
11761 /* Given OPERANDS of consecutive load/store, this function pairs them
11762 into ldp/stp after adjusting the offset. It depends on the fact
11763 that addresses of load/store instructions are in increasing order.
11764 MODE is the mode of memory operands. CODE is the rtl operator
11765 which should be applied to all memory operands, it's SIGN_EXTEND,
11766 ZERO_EXTEND or UNKNOWN. */
11768 bool
11769 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11770 enum machine_mode mode, RTX_CODE code)
11772 rtx base, offset, t1, t2;
11773 rtx mem_1, mem_2, mem_3, mem_4;
11774 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11776 if (load)
11778 mem_1 = operands[1];
11779 mem_2 = operands[3];
11780 mem_3 = operands[5];
11781 mem_4 = operands[7];
11783 else
11785 mem_1 = operands[0];
11786 mem_2 = operands[2];
11787 mem_3 = operands[4];
11788 mem_4 = operands[6];
11789 gcc_assert (code == UNKNOWN);
11792 extract_base_offset_in_addr (mem_1, &base, &offset);
11793 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11795 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11796 msize = GET_MODE_SIZE (mode);
11797 stp_off_limit = msize * 0x40;
11798 off_val = INTVAL (offset);
11799 abs_off = (off_val < 0) ? -off_val : off_val;
11800 new_off = abs_off % stp_off_limit;
11801 adj_off = abs_off - new_off;
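/* For the SImode stores shown in the comment before
   aarch64_operands_adjust_ok_for_ldpstp (msize == 4, stp_off_limit == 0x100,
   off_val == 0x100) this gives new_off == 0 and adj_off == 0x100, producing
   the add/stp sequence shown there. */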
11803 /* Further adjust to make sure all offsets are OK. */
11804 if ((new_off + msize * 2) >= stp_off_limit)
11806 adj_off += stp_off_limit;
11807 new_off -= stp_off_limit;
11810 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11811 if (adj_off >= 0x1000)
11812 return false;
11814 if (off_val < 0)
11816 adj_off = -adj_off;
11817 new_off = -new_off;
11820 /* Create new memory references. */
11821 mem_1 = change_address (mem_1, VOIDmode,
11822 plus_constant (DImode, operands[8], new_off));
11824 /* Check if the adjusted address is OK for ldp/stp. */
11825 if (!aarch64_mem_pair_operand (mem_1, mode))
11826 return false;
11828 msize = GET_MODE_SIZE (mode);
11829 mem_2 = change_address (mem_2, VOIDmode,
11830 plus_constant (DImode,
11831 operands[8],
11832 new_off + msize));
11833 mem_3 = change_address (mem_3, VOIDmode,
11834 plus_constant (DImode,
11835 operands[8],
11836 new_off + msize * 2));
11837 mem_4 = change_address (mem_4, VOIDmode,
11838 plus_constant (DImode,
11839 operands[8],
11840 new_off + msize * 3));
11842 if (code == ZERO_EXTEND)
11844 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11845 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11846 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11847 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11849 else if (code == SIGN_EXTEND)
11851 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11852 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11853 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11854 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11857 if (load)
11859 operands[1] = mem_1;
11860 operands[3] = mem_2;
11861 operands[5] = mem_3;
11862 operands[7] = mem_4;
11864 else
11866 operands[0] = mem_1;
11867 operands[2] = mem_2;
11868 operands[4] = mem_3;
11869 operands[6] = mem_4;
11872 /* Emit adjusting instruction. */
11873 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11874 /* Emit ldp/stp instructions. */
11875 t1 = gen_rtx_SET (operands[0], operands[1]);
11876 t2 = gen_rtx_SET (operands[2], operands[3]);
11877 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11878 t1 = gen_rtx_SET (operands[4], operands[5]);
11879 t2 = gen_rtx_SET (operands[6], operands[7]);
11880 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11881 return true;
11884 /* Return true if a pseudo register should be created and used to hold
11885 the GOT address for PIC code. */
11887 bool
11888 aarch64_use_pseudo_pic_reg (void)
11890 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
11893 #undef TARGET_ADDRESS_COST
11894 #define TARGET_ADDRESS_COST aarch64_address_cost
11896 /* This hook determines whether unnamed bitfields affect the alignment
11897 of the containing structure. The hook returns true if the structure
11898 should inherit the alignment requirements of an unnamed bitfield's
11899 type. */
11900 #undef TARGET_ALIGN_ANON_BITFIELD
11901 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11903 #undef TARGET_ASM_ALIGNED_DI_OP
11904 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11906 #undef TARGET_ASM_ALIGNED_HI_OP
11907 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11909 #undef TARGET_ASM_ALIGNED_SI_OP
11910 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11912 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11913 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11914 hook_bool_const_tree_hwi_hwi_const_tree_true
11916 #undef TARGET_ASM_FILE_START
11917 #define TARGET_ASM_FILE_START aarch64_start_file
11919 #undef TARGET_ASM_OUTPUT_MI_THUNK
11920 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11922 #undef TARGET_ASM_SELECT_RTX_SECTION
11923 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11925 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11926 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11928 #undef TARGET_BUILD_BUILTIN_VA_LIST
11929 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11931 #undef TARGET_CALLEE_COPIES
11932 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11934 #undef TARGET_CAN_ELIMINATE
11935 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11937 #undef TARGET_CANNOT_FORCE_CONST_MEM
11938 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11940 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11941 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11943 /* Only the least significant bit is used for initialization guard
11944 variables. */
11945 #undef TARGET_CXX_GUARD_MASK_BIT
11946 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11948 #undef TARGET_C_MODE_FOR_SUFFIX
11949 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11951 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11952 #undef TARGET_DEFAULT_TARGET_FLAGS
11953 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11954 #endif
11956 #undef TARGET_CLASS_MAX_NREGS
11957 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11959 #undef TARGET_BUILTIN_DECL
11960 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11962 #undef TARGET_EXPAND_BUILTIN
11963 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11965 #undef TARGET_EXPAND_BUILTIN_VA_START
11966 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11968 #undef TARGET_FOLD_BUILTIN
11969 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11971 #undef TARGET_FUNCTION_ARG
11972 #define TARGET_FUNCTION_ARG aarch64_function_arg
11974 #undef TARGET_FUNCTION_ARG_ADVANCE
11975 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11977 #undef TARGET_FUNCTION_ARG_BOUNDARY
11978 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11980 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11981 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11983 #undef TARGET_FUNCTION_VALUE
11984 #define TARGET_FUNCTION_VALUE aarch64_function_value
11986 #undef TARGET_FUNCTION_VALUE_REGNO_P
11987 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11989 #undef TARGET_FRAME_POINTER_REQUIRED
11990 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11992 #undef TARGET_GIMPLE_FOLD_BUILTIN
11993 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11995 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11996 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11998 #undef TARGET_INIT_BUILTINS
11999 #define TARGET_INIT_BUILTINS aarch64_init_builtins
12001 #undef TARGET_LEGITIMATE_ADDRESS_P
12002 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
12004 #undef TARGET_LEGITIMATE_CONSTANT_P
12005 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
12007 #undef TARGET_LIBGCC_CMP_RETURN_MODE
12008 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
12010 #undef TARGET_LRA_P
12011 #define TARGET_LRA_P hook_bool_void_true
12013 #undef TARGET_MANGLE_TYPE
12014 #define TARGET_MANGLE_TYPE aarch64_mangle_type
12016 #undef TARGET_MEMORY_MOVE_COST
12017 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
12019 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
12020 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
12022 #undef TARGET_MUST_PASS_IN_STACK
12023 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
12025 /* This target hook should return true if accesses to volatile bitfields
12026 should use the narrowest mode possible. It should return false if these
12027 accesses should use the bitfield container type. */
12028 #undef TARGET_NARROW_VOLATILE_BITFIELD
12029 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
12031 #undef TARGET_OPTION_OVERRIDE
12032 #define TARGET_OPTION_OVERRIDE aarch64_override_options
12034 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
12035 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
12036 aarch64_override_options_after_change
12038 #undef TARGET_PASS_BY_REFERENCE
12039 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
12041 #undef TARGET_PREFERRED_RELOAD_CLASS
12042 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
12044 #undef TARGET_SCHED_REASSOCIATION_WIDTH
12045 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
12047 #undef TARGET_SECONDARY_RELOAD
12048 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
12050 #undef TARGET_SHIFT_TRUNCATION_MASK
12051 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
12053 #undef TARGET_SETUP_INCOMING_VARARGS
12054 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
12056 #undef TARGET_STRUCT_VALUE_RTX
12057 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
12059 #undef TARGET_REGISTER_MOVE_COST
12060 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
12062 #undef TARGET_RETURN_IN_MEMORY
12063 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
12065 #undef TARGET_RETURN_IN_MSB
12066 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
12068 #undef TARGET_RTX_COSTS
12069 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
12071 #undef TARGET_SCHED_ISSUE_RATE
12072 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
12074 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
12075 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
12076 aarch64_sched_first_cycle_multipass_dfa_lookahead
12078 #undef TARGET_TRAMPOLINE_INIT
12079 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
12081 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
12082 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
12084 #undef TARGET_VECTOR_MODE_SUPPORTED_P
12085 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
12087 #undef TARGET_ARRAY_MODE_SUPPORTED_P
12088 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
12090 #undef TARGET_VECTORIZE_ADD_STMT_COST
12091 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
12093 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
12094 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
12095 aarch64_builtin_vectorization_cost
12097 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
12098 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
12100 #undef TARGET_VECTORIZE_BUILTINS
12101 #define TARGET_VECTORIZE_BUILTINS
12103 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
12104 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
12105 aarch64_builtin_vectorized_function
12107 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
12108 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
12109 aarch64_autovectorize_vector_sizes
12111 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
12112 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
12113 aarch64_atomic_assign_expand_fenv
12115 /* Section anchor support. */
12117 #undef TARGET_MIN_ANCHOR_OFFSET
12118 #define TARGET_MIN_ANCHOR_OFFSET -256
12120 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
12121 byte offset; we can do much more for larger data types, but have no way
12122 to determine the size of the access. We assume accesses are aligned. */
12123 #undef TARGET_MAX_ANCHOR_OFFSET
12124 #define TARGET_MAX_ANCHOR_OFFSET 4095
12126 #undef TARGET_VECTOR_ALIGNMENT
12127 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
12129 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
12130 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
12131 aarch64_simd_vector_alignment_reachable
12133 /* vec_perm support. */
12135 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
12136 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
12137 aarch64_vectorize_vec_perm_const_ok
12140 #undef TARGET_FIXED_CONDITION_CODE_REGS
12141 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
12143 #undef TARGET_FLAGS_REGNUM
12144 #define TARGET_FLAGS_REGNUM CC_REGNUM
12146 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
12147 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
12149 #undef TARGET_ASAN_SHADOW_OFFSET
12150 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
12152 #undef TARGET_LEGITIMIZE_ADDRESS
12153 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
12155 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
12156 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
12157 aarch64_use_by_pieces_infrastructure_p
12159 #undef TARGET_CAN_USE_DOLOOP_P
12160 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
12162 #undef TARGET_SCHED_MACRO_FUSION_P
12163 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
12165 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
12166 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
12168 #undef TARGET_SCHED_FUSION_PRIORITY
12169 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
12171 #undef TARGET_USE_PSEUDO_PIC_REG
12172 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
12174 struct gcc_target targetm = TARGET_INITIALIZER;
12176 #include "gt-aarch64.h"