[AArch64] PR target/68129: Define TARGET_SUPPORTS_WIDE_INT
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "target.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "insn-attr.h"
40 #include "alias.h"
41 #include "fold-const.h"
42 #include "stor-layout.h"
43 #include "calls.h"
44 #include "varasm.h"
45 #include "output.h"
46 #include "flags.h"
47 #include "explow.h"
48 #include "expr.h"
49 #include "reload.h"
50 #include "langhooks.h"
51 #include "opts.h"
52 #include "params.h"
53 #include "gimplify.h"
54 #include "dwarf2.h"
55 #include "gimple-iterator.h"
56 #include "tree-vectorizer.h"
57 #include "aarch64-cost-tables.h"
58 #include "dumpfile.h"
59 #include "builtins.h"
60 #include "rtl-iter.h"
61 #include "tm-constrs.h"
62 #include "sched-int.h"
63 #include "cortex-a57-fma-steering.h"
64 #include "target-globals.h"
66 /* This file should be included last. */
67 #include "target-def.h"
69 /* Defined for convenience. */
70 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
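/* For example, POINTER_BYTES is 8 under the default LP64 ABI and 4 under
   ILP32, since POINTER_SIZE is 64 and 32 bits respectively.  */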
72 /* Classifies an address.
74 ADDRESS_REG_IMM
75 A simple base register plus immediate offset.
77 ADDRESS_REG_WB
78 A base register indexed by immediate offset with writeback.
80 ADDRESS_REG_REG
81 A base register indexed by (optionally scaled) register.
83 ADDRESS_REG_UXTW
84 A base register indexed by (optionally scaled) zero-extended register.
86 ADDRESS_REG_SXTW
87 A base register indexed by (optionally scaled) sign-extended register.
89 ADDRESS_LO_SUM
90 A LO_SUM rtx with a base register and "LO12" symbol relocation.
92 ADDRESS_SYMBOLIC:
93 A constant symbolic address, in pc-relative literal pool. */
95 enum aarch64_address_type {
96 ADDRESS_REG_IMM,
97 ADDRESS_REG_WB,
98 ADDRESS_REG_REG,
99 ADDRESS_REG_UXTW,
100 ADDRESS_REG_SXTW,
101 ADDRESS_LO_SUM,
102 ADDRESS_SYMBOLIC
105 struct aarch64_address_info {
106 enum aarch64_address_type type;
107 rtx base;
108 rtx offset;
109 int shift;
110 enum aarch64_symbol_type symbol_type;
113 struct simd_immediate_info
115 rtx value;
116 int shift;
117 int element_width;
118 bool mvn;
119 bool msl;
122 /* The current code model. */
123 enum aarch64_code_model aarch64_cmodel;
125 #ifdef HAVE_AS_TLS
126 #undef TARGET_HAVE_TLS
127 #define TARGET_HAVE_TLS 1
128 #endif
130 static bool aarch64_composite_type_p (const_tree, machine_mode);
131 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
132 const_tree,
133 machine_mode *, int *,
134 bool *);
135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_override_options_after_change (void);
138 static bool aarch64_vector_mode_supported_p (machine_mode);
139 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
140 const unsigned char *sel);
141 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
143 /* Major revision number of the ARM Architecture implemented by the target. */
144 unsigned aarch64_architecture_version;
146 /* The processor for which instructions should be scheduled. */
147 enum aarch64_processor aarch64_tune = cortexa53;
149 /* Mask to specify which instruction scheduling options should be used. */
150 unsigned long aarch64_tune_flags = 0;
152 /* Global flag: when true, PC-relative literal loads are not used. */
153 bool aarch64_nopcrelative_literal_loads;
155 /* Support for command line parsing of boolean flags in the tuning
156 structures. */
157 struct aarch64_flag_desc
159 const char* name;
160 unsigned int flag;
163 #define AARCH64_FUSION_PAIR(name, internal_name) \
164 { name, AARCH64_FUSE_##internal_name },
165 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
167 { "none", AARCH64_FUSE_NOTHING },
168 #include "aarch64-fusion-pairs.def"
169 { "all", AARCH64_FUSE_ALL },
170 { NULL, AARCH64_FUSE_NOTHING }
172 #undef AARCH64_FUSION_PAIR
174 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
175 { name, AARCH64_EXTRA_TUNE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
178 { "none", AARCH64_EXTRA_TUNE_NONE },
179 #include "aarch64-tuning-flags.def"
180 { "all", AARCH64_EXTRA_TUNE_ALL },
181 { NULL, AARCH64_EXTRA_TUNE_NONE }
183 #undef AARCH64_EXTRA_TUNING_OPTION
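/* The two tables above back the fine-grained tuning overrides handled by
   aarch64_parse_fuse_string and aarch64_parse_tune_string below (the
   "fuse=..." and "tune=..." components of the -moverride option); the
   "none" and "all" entries act as shorthands for clearing or setting
   every flag.  */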
185 /* Tuning parameters. */
187 static const struct cpu_addrcost_table generic_addrcost_table =
190 0, /* hi */
191 0, /* si */
192 0, /* di */
193 0, /* ti */
195 0, /* pre_modify */
196 0, /* post_modify */
197 0, /* register_offset */
198 0, /* register_sextend */
199 0, /* register_zextend */
200 0 /* imm_offset */
203 static const struct cpu_addrcost_table cortexa57_addrcost_table =
206 1, /* hi */
207 0, /* si */
208 0, /* di */
209 1, /* ti */
211 0, /* pre_modify */
212 0, /* post_modify */
213 0, /* register_offset */
214 0, /* register_sextend */
215 0, /* register_zextend */
216 0, /* imm_offset */
219 static const struct cpu_addrcost_table xgene1_addrcost_table =
222 1, /* hi */
223 0, /* si */
224 0, /* di */
225 1, /* ti */
227 1, /* pre_modify */
228 0, /* post_modify */
229 0, /* register_offset */
230 1, /* register_sextend */
231 1, /* register_zextend */
232 0, /* imm_offset */
235 static const struct cpu_regmove_cost generic_regmove_cost =
237 1, /* GP2GP */
238 /* Avoid the use of slow int<->fp moves for spilling by setting
239 their cost higher than memmov_cost. */
240 5, /* GP2FP */
241 5, /* FP2GP */
242 2 /* FP2FP */
245 static const struct cpu_regmove_cost cortexa57_regmove_cost =
247 1, /* GP2GP */
248 /* Avoid the use of slow int<->fp moves for spilling by setting
249 their cost higher than memmov_cost. */
250 5, /* GP2FP */
251 5, /* FP2GP */
252 2 /* FP2FP */
255 static const struct cpu_regmove_cost cortexa53_regmove_cost =
257 1, /* GP2GP */
258 /* Avoid the use of slow int<->fp moves for spilling by setting
259 their cost higher than memmov_cost. */
260 5, /* GP2FP */
261 5, /* FP2GP */
262 2 /* FP2FP */
265 static const struct cpu_regmove_cost thunderx_regmove_cost =
267 2, /* GP2GP */
268 2, /* GP2FP */
269 6, /* FP2GP */
270 4 /* FP2FP */
273 static const struct cpu_regmove_cost xgene1_regmove_cost =
275 1, /* GP2GP */
276 /* Avoid the use of slow int<->fp moves for spilling by setting
277 their cost higher than memmov_cost. */
278 8, /* GP2FP */
279 8, /* FP2GP */
280 2 /* FP2FP */
283 /* Generic costs for vector insn classes. */
284 static const struct cpu_vector_cost generic_vector_cost =
286 1, /* scalar_stmt_cost */
287 1, /* scalar_load_cost */
288 1, /* scalar_store_cost */
289 1, /* vec_stmt_cost */
290 1, /* vec_to_scalar_cost */
291 1, /* scalar_to_vec_cost */
292 1, /* vec_align_load_cost */
293 1, /* vec_unalign_load_cost */
294 1, /* vec_unalign_store_cost */
295 1, /* vec_store_cost */
296 3, /* cond_taken_branch_cost */
297 1 /* cond_not_taken_branch_cost */
300 /* Cortex-A57 costs for vector insn classes. */
301 static const struct cpu_vector_cost cortexa57_vector_cost =
303 1, /* scalar_stmt_cost */
304 4, /* scalar_load_cost */
305 1, /* scalar_store_cost */
306 3, /* vec_stmt_cost */
307 8, /* vec_to_scalar_cost */
308 8, /* scalar_to_vec_cost */
309 5, /* vec_align_load_cost */
310 5, /* vec_unalign_load_cost */
311 1, /* vec_unalign_store_cost */
312 1, /* vec_store_cost */
313 1, /* cond_taken_branch_cost */
314 1 /* cond_not_taken_branch_cost */
317 /* Xgene1 costs for vector insn classes. */
318 static const struct cpu_vector_cost xgene1_vector_cost =
320 1, /* scalar_stmt_cost */
321 5, /* scalar_load_cost */
322 1, /* scalar_store_cost */
323 2, /* vec_stmt_cost */
324 4, /* vec_to_scalar_cost */
325 4, /* scalar_to_vec_cost */
326 10, /* vec_align_load_cost */
327 10, /* vec_unalign_load_cost */
328 2, /* vec_unalign_store_cost */
329 2, /* vec_store_cost */
330 2, /* cond_taken_branch_cost */
331 1 /* cond_not_taken_branch_cost */
334 /* Generic costs for branch instructions. */
335 static const struct cpu_branch_cost generic_branch_cost =
337 2, /* Predictable. */
338 2 /* Unpredictable. */
341 static const struct tune_params generic_tunings =
343 &cortexa57_extra_costs,
344 &generic_addrcost_table,
345 &generic_regmove_cost,
346 &generic_vector_cost,
347 &generic_branch_cost,
348 4, /* memmov_cost */
349 2, /* issue_rate */
350 AARCH64_FUSE_NOTHING, /* fusible_ops */
351 8, /* function_align. */
352 8, /* jump_align. */
353 4, /* loop_align. */
354 2, /* int_reassoc_width. */
355 4, /* fp_reassoc_width. */
356 1, /* vec_reassoc_width. */
357 2, /* min_div_recip_mul_sf. */
358 2, /* min_div_recip_mul_df. */
359 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
360 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
363 static const struct tune_params cortexa53_tunings =
365 &cortexa53_extra_costs,
366 &generic_addrcost_table,
367 &cortexa53_regmove_cost,
368 &generic_vector_cost,
369 &generic_branch_cost,
370 4, /* memmov_cost */
371 2, /* issue_rate */
372 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
373 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
374 8, /* function_align. */
375 8, /* jump_align. */
376 4, /* loop_align. */
377 2, /* int_reassoc_width. */
378 4, /* fp_reassoc_width. */
379 1, /* vec_reassoc_width. */
380 2, /* min_div_recip_mul_sf. */
381 2, /* min_div_recip_mul_df. */
382 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
383 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
386 static const struct tune_params cortexa57_tunings =
388 &cortexa57_extra_costs,
389 &cortexa57_addrcost_table,
390 &cortexa57_regmove_cost,
391 &cortexa57_vector_cost,
392 &generic_branch_cost,
393 4, /* memmov_cost */
394 3, /* issue_rate */
395 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
396 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
397 16, /* function_align. */
398 8, /* jump_align. */
399 4, /* loop_align. */
400 2, /* int_reassoc_width. */
401 4, /* fp_reassoc_width. */
402 1, /* vec_reassoc_width. */
403 2, /* min_div_recip_mul_sf. */
404 2, /* min_div_recip_mul_df. */
405 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
406 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
407 | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
410 static const struct tune_params cortexa72_tunings =
412 &cortexa57_extra_costs,
413 &cortexa57_addrcost_table,
414 &cortexa57_regmove_cost,
415 &cortexa57_vector_cost,
416 &generic_branch_cost,
417 4, /* memmov_cost */
418 3, /* issue_rate */
419 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
420 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
421 16, /* function_align. */
422 8, /* jump_align. */
423 4, /* loop_align. */
424 2, /* int_reassoc_width. */
425 4, /* fp_reassoc_width. */
426 1, /* vec_reassoc_width. */
427 2, /* min_div_recip_mul_sf. */
428 2, /* min_div_recip_mul_df. */
429 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
430 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
433 static const struct tune_params thunderx_tunings =
435 &thunderx_extra_costs,
436 &generic_addrcost_table,
437 &thunderx_regmove_cost,
438 &generic_vector_cost,
439 &generic_branch_cost,
440 6, /* memmov_cost */
441 2, /* issue_rate */
442 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
443 8, /* function_align. */
444 8, /* jump_align. */
445 8, /* loop_align. */
446 2, /* int_reassoc_width. */
447 4, /* fp_reassoc_width. */
448 1, /* vec_reassoc_width. */
449 2, /* min_div_recip_mul_sf. */
450 2, /* min_div_recip_mul_df. */
451 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
452 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
455 static const struct tune_params xgene1_tunings =
457 &xgene1_extra_costs,
458 &xgene1_addrcost_table,
459 &xgene1_regmove_cost,
460 &xgene1_vector_cost,
461 &generic_branch_cost,
462 6, /* memmov_cost */
463 4, /* issue_rate */
464 AARCH64_FUSE_NOTHING, /* fusible_ops */
465 16, /* function_align. */
466 8, /* jump_align. */
467 16, /* loop_align. */
468 2, /* int_reassoc_width. */
469 4, /* fp_reassoc_width. */
470 1, /* vec_reassoc_width. */
471 2, /* min_div_recip_mul_sf. */
472 2, /* min_div_recip_mul_df. */
473 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
474 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
477 /* Support for fine-grained override of the tuning structures. */
478 struct aarch64_tuning_override_function
480 const char* name;
481 void (*parse_override)(const char*, struct tune_params*);
484 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
485 static void aarch64_parse_tune_string (const char*, struct tune_params*);
487 static const struct aarch64_tuning_override_function
488 aarch64_tuning_override_functions[] =
490 { "fuse", aarch64_parse_fuse_string },
491 { "tune", aarch64_parse_tune_string },
492 { NULL, NULL }
495 /* A processor implementing AArch64. */
496 struct processor
498 const char *const name;
499 enum aarch64_processor ident;
500 enum aarch64_processor sched_core;
501 enum aarch64_arch arch;
502 unsigned architecture_version;
503 const unsigned long flags;
504 const struct tune_params *const tune;
507 /* Architectures implementing AArch64. */
508 static const struct processor all_architectures[] =
510 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
511 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
512 #include "aarch64-arches.def"
513 #undef AARCH64_ARCH
514 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
517 /* Processor cores implementing AArch64. */
518 static const struct processor all_cores[] =
520 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
521 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
522 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
523 FLAGS, &COSTS##_tunings},
524 #include "aarch64-cores.def"
525 #undef AARCH64_CORE
526 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
527 AARCH64_FL_FOR_ARCH8, &generic_tunings},
528 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
532 /* Target specification. These are populated by the -march, -mtune, -mcpu
533 handling code or by target attributes. */
534 static const struct processor *selected_arch;
535 static const struct processor *selected_cpu;
536 static const struct processor *selected_tune;
538 /* The current tuning set. */
539 struct tune_params aarch64_tune_params = generic_tunings;
541 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
543 /* An ISA extension in the co-processor and main instruction set space. */
544 struct aarch64_option_extension
546 const char *const name;
547 const unsigned long flags_on;
548 const unsigned long flags_off;
551 /* ISA extensions in AArch64. */
552 static const struct aarch64_option_extension all_extensions[] =
554 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
555 {NAME, FLAGS_ON, FLAGS_OFF},
556 #include "aarch64-option-extensions.def"
557 #undef AARCH64_OPT_EXTENSION
558 {NULL, 0, 0}
561 typedef enum aarch64_cond_code
563 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
564 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
565 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
567 aarch64_cc;
569 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
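/* The AArch64 condition codes are encoded so that each even/odd pair are
   logical inverses (EQ/NE, CS/CC, MI/PL, HI/LS, GE/LT, GT/LE, ...), which
   is why flipping the low bit yields the inverse condition, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */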
571 /* The condition codes of the processor, and the inverse function. */
572 static const char * const aarch64_condition_codes[] =
574 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
575 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
578 /* Generate code to enable conditional branches in functions over 1 MiB. */
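/* The callers are expected to pass BRANCH_FORMAT with the condition already
   inverted, so the emitted sequence is a short conditional branch around an
   unconditional B that can reach the far destination, roughly:

       <inverted cond branch>  .Ltmp
       b       <original destination>
     .Ltmp:

   (A sketch of the intended expansion; the exact mnemonics depend on the
   caller.)  */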
579 const char *
580 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
581 const char * branch_format)
583 rtx_code_label * tmp_label = gen_label_rtx ();
584 char label_buf[256];
585 char buffer[128];
586 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
587 CODE_LABEL_NUMBER (tmp_label));
588 const char *label_ptr = targetm.strip_name_encoding (label_buf);
589 rtx dest_label = operands[pos_label];
590 operands[pos_label] = tmp_label;
592 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
593 output_asm_insn (buffer, operands);
595 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
596 operands[pos_label] = dest_label;
597 output_asm_insn (buffer, operands);
598 return "";
601 void
602 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
604 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
605 if (TARGET_GENERAL_REGS_ONLY)
606 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
607 else
608 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
611 static unsigned int
612 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
614 if (GET_MODE_UNIT_SIZE (mode) == 4)
615 return aarch64_tune_params.min_div_recip_mul_sf;
616 return aarch64_tune_params.min_div_recip_mul_df;
619 static int
620 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
621 enum machine_mode mode)
623 if (VECTOR_MODE_P (mode))
624 return aarch64_tune_params.vec_reassoc_width;
625 if (INTEGRAL_MODE_P (mode))
626 return aarch64_tune_params.int_reassoc_width;
627 if (FLOAT_MODE_P (mode))
628 return aarch64_tune_params.fp_reassoc_width;
629 return 1;
632 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
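/* The AArch64 DWARF register numbering is R0-R30 -> 0-30, SP -> 31 and
   V0-V31 -> 64-95, so for example x0 maps to 0, sp to 31 and v0 to 64.  */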
633 unsigned
634 aarch64_dbx_register_number (unsigned regno)
636 if (GP_REGNUM_P (regno))
637 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
638 else if (regno == SP_REGNUM)
639 return AARCH64_DWARF_SP;
640 else if (FP_REGNUM_P (regno))
641 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
643 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
644 equivalent DWARF register. */
645 return DWARF_FRAME_REGISTERS;
648 /* Return TRUE if MODE is one of the large integer modes used for AdvSIMD structure types (OImode, CImode or XImode). */
649 static bool
650 aarch64_vect_struct_mode_p (machine_mode mode)
652 return mode == OImode || mode == CImode || mode == XImode;
655 /* Return TRUE if MODE is any of the vector modes. */
656 static bool
657 aarch64_vector_mode_p (machine_mode mode)
659 return aarch64_vector_mode_supported_p (mode)
660 || aarch64_vect_struct_mode_p (mode);
663 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
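/* For example, this is what allows an array of 2-4 AdvSIMD vectors (such as
   the val[3] array inside the arm_neon.h int32x4x3_t type) to be given one
   of the OI/CI/XI structure modes rather than BLKmode.  */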
664 static bool
665 aarch64_array_mode_supported_p (machine_mode mode,
666 unsigned HOST_WIDE_INT nelems)
668 if (TARGET_SIMD
669 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
670 || AARCH64_VALID_SIMD_DREG_MODE (mode))
671 && (nelems >= 2 && nelems <= 4))
672 return true;
674 return false;
677 /* Implement HARD_REGNO_NREGS. */
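/* For example, an OImode (32-byte) value needs 2 registers in the FP/SIMD
   register file (UNITS_PER_VREG is 16) but 4 registers in the general
   register file (UNITS_PER_WORD is 8).  */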
680 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
682 switch (aarch64_regno_regclass (regno))
684 case FP_REGS:
685 case FP_LO_REGS:
686 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
687 default:
688 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
690 gcc_unreachable ();
693 /* Implement HARD_REGNO_MODE_OK. */
696 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
698 if (GET_MODE_CLASS (mode) == MODE_CC)
699 return regno == CC_REGNUM;
701 if (regno == SP_REGNUM)
702 /* The purpose of comparing with ptr_mode is to support the
703 global register variable associated with the stack pointer
704 register via the syntax of asm ("wsp") in ILP32. */
705 return mode == Pmode || mode == ptr_mode;
707 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
708 return mode == Pmode;
710 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
711 return 1;
713 if (FP_REGNUM_P (regno))
715 if (aarch64_vect_struct_mode_p (mode))
716 return
717 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
718 else
719 return 1;
722 return 0;
725 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
726 machine_mode
727 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
728 machine_mode mode)
730 /* Handle modes that fit within single registers. */
731 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
733 if (GET_MODE_SIZE (mode) >= 4)
734 return mode;
735 else
736 return SImode;
738 /* Fall back to generic for multi-reg and very large modes. */
739 else
740 return choose_hard_reg_mode (regno, nregs, false);
743 /* Return true if calls to DECL should be treated as
744 long-calls (i.e. called via a register). */
745 static bool
746 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
748 return false;
751 /* Return true if calls to symbol-ref SYM should be treated as
752 long-calls (i.e. called via a register). */
753 bool
754 aarch64_is_long_call_p (rtx sym)
756 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
759 /* Return true if calls to symbol-ref SYM should not go through
760 plt stubs. */
762 bool
763 aarch64_is_noplt_call_p (rtx sym)
765 const_tree decl = SYMBOL_REF_DECL (sym);
767 if (flag_pic
768 && decl
769 && (!flag_plt
770 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
771 && !targetm.binds_local_p (decl))
772 return true;
774 return false;
777 /* Return true if the offsets to a zero/sign-extract operation
778 represent an expression that matches an extend operation. The
779 operands represent the parameters from
781 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
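/* For example, MULT_IMM == 4 and EXTRACT_IMM == 34 satisfy the checks below
   and correspond to a zero/sign-extended 32-bit register shifted left by 2,
   i.e. the UXTW/SXTW #2 extended-register form.  */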
782 bool
783 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
784 rtx extract_imm)
786 HOST_WIDE_INT mult_val, extract_val;
788 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
789 return false;
791 mult_val = INTVAL (mult_imm);
792 extract_val = INTVAL (extract_imm);
794 if (extract_val > 8
795 && extract_val < GET_MODE_BITSIZE (mode)
796 && exact_log2 (extract_val & ~7) > 0
797 && (extract_val & 7) <= 4
798 && mult_val == (1 << (extract_val & 7)))
799 return true;
801 return false;
804 /* Emit an insn that's a simple single-set. Both the operands must be
805 known to be valid. */
806 inline static rtx
807 emit_set_insn (rtx x, rtx y)
809 return emit_insn (gen_rtx_SET (x, y));
812 /* X and Y are two things to compare using CODE. Emit the compare insn and
813 return the rtx for the CC register in the appropriate mode. */
815 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
817 machine_mode mode = SELECT_CC_MODE (code, x, y);
818 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
820 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
821 return cc_reg;
824 /* Build the SYMBOL_REF for __tls_get_addr. */
826 static GTY(()) rtx tls_get_addr_libfunc;
829 aarch64_tls_get_addr (void)
831 if (!tls_get_addr_libfunc)
832 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
833 return tls_get_addr_libfunc;
836 /* Return the TLS model to use for ADDR. */
838 static enum tls_model
839 tls_symbolic_operand_type (rtx addr)
841 enum tls_model tls_kind = TLS_MODEL_NONE;
842 rtx sym, addend;
844 if (GET_CODE (addr) == CONST)
846 split_const (addr, &sym, &addend);
847 if (GET_CODE (sym) == SYMBOL_REF)
848 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
850 else if (GET_CODE (addr) == SYMBOL_REF)
851 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
853 return tls_kind;
856 /* We'll allow lo_sum's in addresses in our legitimate addresses
857 so that combine can take care of combining addresses where
858 necessary, but for generation purposes we'll generate the address
859 as:
860 RTL Absolute
861 tmp = hi (symbol_ref); adrp x1, foo
862 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
865 PIC TLS
866 adrp x1, :got:foo adrp tmp, :tlsgd:foo
867 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
868 bl __tls_get_addr
871 Load TLS symbol, depending on TLS mechanism and TLS access model.
873 Global Dynamic - Traditional TLS:
874 adrp tmp, :tlsgd:imm
875 add dest, tmp, #:tlsgd_lo12:imm
876 bl __tls_get_addr
878 Global Dynamic - TLS Descriptors:
879 adrp dest, :tlsdesc:imm
880 ldr tmp, [dest, #:tlsdesc_lo12:imm]
881 add dest, dest, #:tlsdesc_lo12:imm
882 blr tmp
883 mrs tp, tpidr_el0
884 add dest, dest, tp
886 Initial Exec:
887 mrs tp, tpidr_el0
888 adrp tmp, :gottprel:imm
889 ldr dest, [tmp, #:gottprel_lo12:imm]
890 add dest, dest, tp
892 Local Exec:
893 mrs tp, tpidr_el0
894 add t0, tp, #:tprel_hi12:imm, lsl #12
895 add t0, t0, #:tprel_lo12_nc:imm
898 static void
899 aarch64_load_symref_appropriately (rtx dest, rtx imm,
900 enum aarch64_symbol_type type)
902 switch (type)
904 case SYMBOL_SMALL_ABSOLUTE:
906 /* In ILP32, the mode of dest can be either SImode or DImode. */
907 rtx tmp_reg = dest;
908 machine_mode mode = GET_MODE (dest);
910 gcc_assert (mode == Pmode || mode == ptr_mode);
912 if (can_create_pseudo_p ())
913 tmp_reg = gen_reg_rtx (mode);
915 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
916 emit_insn (gen_add_losym (dest, tmp_reg, imm));
917 return;
920 case SYMBOL_TINY_ABSOLUTE:
921 emit_insn (gen_rtx_SET (dest, imm));
922 return;
924 case SYMBOL_SMALL_GOT_28K:
926 machine_mode mode = GET_MODE (dest);
927 rtx gp_rtx = pic_offset_table_rtx;
928 rtx insn;
929 rtx mem;
931 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
932 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
933 decide rtx costs, in which case pic_offset_table_rtx is not
934 initialized. In that case there is no need to generate the first
935 adrp instruction, as the final cost for global variable access is
936 one instruction. */
937 if (gp_rtx != NULL)
939 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
940 use the page base as the GOT base, the first page may be wasted;
941 in the worst case there is only 28K of space for the GOT).
943 The instruction sequence generated for accessing a global variable is:
946 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
948 Only one instruction is needed, but we must initialize
949 pic_offset_table_rtx properly. We generate an initialization insn
950 for every global access, and let CSE remove all redundant ones.
952 The final instruction sequence will look like the following
953 for multiple global variable accesses.
955 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
957 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
958 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
959 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
960 ... */
962 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
963 crtl->uses_pic_offset_table = 1;
964 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
966 if (mode != GET_MODE (gp_rtx))
967 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
970 if (mode == ptr_mode)
972 if (mode == DImode)
973 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
974 else
975 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
977 mem = XVECEXP (SET_SRC (insn), 0, 0);
979 else
981 gcc_assert (mode == Pmode);
983 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
984 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
987 /* The operand is expected to be a MEM. Whenever the related insn
988 pattern is changed, the above code which computes mem should be
989 updated. */
990 gcc_assert (GET_CODE (mem) == MEM);
991 MEM_READONLY_P (mem) = 1;
992 MEM_NOTRAP_P (mem) = 1;
993 emit_insn (insn);
994 return;
997 case SYMBOL_SMALL_GOT_4G:
999 /* In ILP32, the mode of dest can be either SImode or DImode,
1000 while the got entry is always of SImode size. The mode of
1001 dest depends on how dest is used: if dest is assigned to a
1002 pointer (e.g. in the memory), it has SImode; it may have
1003 DImode if dest is dereferenced to access the memory.
1004 This is why we have to handle three different ldr_got_small
1005 patterns here (two patterns for ILP32). */
1007 rtx insn;
1008 rtx mem;
1009 rtx tmp_reg = dest;
1010 machine_mode mode = GET_MODE (dest);
1012 if (can_create_pseudo_p ())
1013 tmp_reg = gen_reg_rtx (mode);
1015 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1016 if (mode == ptr_mode)
1018 if (mode == DImode)
1019 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1020 else
1021 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1023 mem = XVECEXP (SET_SRC (insn), 0, 0);
1025 else
1027 gcc_assert (mode == Pmode);
1029 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1030 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1033 gcc_assert (GET_CODE (mem) == MEM);
1034 MEM_READONLY_P (mem) = 1;
1035 MEM_NOTRAP_P (mem) = 1;
1036 emit_insn (insn);
1037 return;
1040 case SYMBOL_SMALL_TLSGD:
1042 rtx_insn *insns;
1043 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1045 start_sequence ();
1046 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1047 insns = get_insns ();
1048 end_sequence ();
1050 RTL_CONST_CALL_P (insns) = 1;
1051 emit_libcall_block (insns, dest, result, imm);
1052 return;
1055 case SYMBOL_SMALL_TLSDESC:
1057 machine_mode mode = GET_MODE (dest);
1058 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1059 rtx tp;
1061 gcc_assert (mode == Pmode || mode == ptr_mode);
1063 /* In ILP32, the got entry is always of SImode size. Unlike
1064 small GOT, the dest is fixed at reg 0. */
1065 if (TARGET_ILP32)
1066 emit_insn (gen_tlsdesc_small_si (imm));
1067 else
1068 emit_insn (gen_tlsdesc_small_di (imm));
1069 tp = aarch64_load_tp (NULL);
1071 if (mode != Pmode)
1072 tp = gen_lowpart (mode, tp);
1074 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1075 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1076 return;
1079 case SYMBOL_SMALL_TLSIE:
1081 /* In ILP32, the mode of dest can be either SImode or DImode,
1082 while the got entry is always of SImode size. The mode of
1083 dest depends on how dest is used: if dest is assigned to a
1084 pointer (e.g. in the memory), it has SImode; it may have
1085 DImode if dest is dereferenced to access the memory.
1086 This is why we have to handle three different tlsie_small
1087 patterns here (two patterns for ILP32). */
1088 machine_mode mode = GET_MODE (dest);
1089 rtx tmp_reg = gen_reg_rtx (mode);
1090 rtx tp = aarch64_load_tp (NULL);
1092 if (mode == ptr_mode)
1094 if (mode == DImode)
1095 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1096 else
1098 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1099 tp = gen_lowpart (mode, tp);
1102 else
1104 gcc_assert (mode == Pmode);
1105 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1108 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1109 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1110 return;
1113 case SYMBOL_TLSLE12:
1114 case SYMBOL_TLSLE24:
1115 case SYMBOL_TLSLE32:
1116 case SYMBOL_TLSLE48:
1118 machine_mode mode = GET_MODE (dest);
1119 rtx tp = aarch64_load_tp (NULL);
1121 if (mode != Pmode)
1122 tp = gen_lowpart (mode, tp);
1124 switch (type)
1126 case SYMBOL_TLSLE12:
1127 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1128 (dest, tp, imm));
1129 break;
1130 case SYMBOL_TLSLE24:
1131 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1132 (dest, tp, imm));
1133 break;
1134 case SYMBOL_TLSLE32:
1135 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1136 (dest, imm));
1137 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1138 (dest, dest, tp));
1139 break;
1140 case SYMBOL_TLSLE48:
1141 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1142 (dest, imm));
1143 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1144 (dest, dest, tp));
1145 break;
1146 default:
1147 gcc_unreachable ();
1150 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1151 return;
1154 case SYMBOL_TINY_GOT:
1155 emit_insn (gen_ldr_got_tiny (dest, imm));
1156 return;
1158 case SYMBOL_TINY_TLSIE:
1160 machine_mode mode = GET_MODE (dest);
1161 rtx tp = aarch64_load_tp (NULL);
1163 if (mode == ptr_mode)
1165 if (mode == DImode)
1166 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1167 else
1169 tp = gen_lowpart (mode, tp);
1170 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1173 else
1175 gcc_assert (mode == Pmode);
1176 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1179 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1180 return;
1183 default:
1184 gcc_unreachable ();
1188 /* Emit a move from SRC to DEST. Assume that the move expanders can
1189 handle all moves if !can_create_pseudo_p (). The distinction is
1190 important because, unlike emit_move_insn, the move expanders know
1191 how to force Pmode objects into the constant pool even when the
1192 constant pool address is not itself legitimate. */
1193 static rtx
1194 aarch64_emit_move (rtx dest, rtx src)
1196 return (can_create_pseudo_p ()
1197 ? emit_move_insn (dest, src)
1198 : emit_move_insn_1 (dest, src));
1201 /* Split a 128-bit move operation into two 64-bit move operations,
1202 taking care to handle partial overlap of register to register
1203 copies. Special cases are needed when moving between GP regs and
1204 FP regs. SRC can be a register, constant or memory; DST a register
1205 or memory. If either operand is memory it must not have any side
1206 effects. */
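/* For example, copying a TImode value from {x0,x1} to {x1,x2} must move the
   high half first, since x1 (the low half of the destination) overlaps the
   high half of the source; the overlap check below handles this.  */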
1207 void
1208 aarch64_split_128bit_move (rtx dst, rtx src)
1210 rtx dst_lo, dst_hi;
1211 rtx src_lo, src_hi;
1213 machine_mode mode = GET_MODE (dst);
1215 gcc_assert (mode == TImode || mode == TFmode);
1216 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1217 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1219 if (REG_P (dst) && REG_P (src))
1221 int src_regno = REGNO (src);
1222 int dst_regno = REGNO (dst);
1224 /* Handle FP <-> GP regs. */
1225 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1227 src_lo = gen_lowpart (word_mode, src);
1228 src_hi = gen_highpart (word_mode, src);
1230 if (mode == TImode)
1232 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1233 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1235 else
1237 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1238 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1240 return;
1242 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1244 dst_lo = gen_lowpart (word_mode, dst);
1245 dst_hi = gen_highpart (word_mode, dst);
1247 if (mode == TImode)
1249 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1250 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1252 else
1254 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1255 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1257 return;
1261 dst_lo = gen_lowpart (word_mode, dst);
1262 dst_hi = gen_highpart (word_mode, dst);
1263 src_lo = gen_lowpart (word_mode, src);
1264 src_hi = gen_highpart_mode (word_mode, mode, src);
1266 /* At most one pairing may overlap. */
1267 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1269 aarch64_emit_move (dst_hi, src_hi);
1270 aarch64_emit_move (dst_lo, src_lo);
1272 else
1274 aarch64_emit_move (dst_lo, src_lo);
1275 aarch64_emit_move (dst_hi, src_hi);
1279 bool
1280 aarch64_split_128bit_move_p (rtx dst, rtx src)
1282 return (! REG_P (src)
1283 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1286 /* Split a complex SIMD combine. */
1288 void
1289 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1291 machine_mode src_mode = GET_MODE (src1);
1292 machine_mode dst_mode = GET_MODE (dst);
1294 gcc_assert (VECTOR_MODE_P (dst_mode));
1296 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1298 rtx (*gen) (rtx, rtx, rtx);
1300 switch (src_mode)
1302 case V8QImode:
1303 gen = gen_aarch64_simd_combinev8qi;
1304 break;
1305 case V4HImode:
1306 gen = gen_aarch64_simd_combinev4hi;
1307 break;
1308 case V2SImode:
1309 gen = gen_aarch64_simd_combinev2si;
1310 break;
1311 case V4HFmode:
1312 gen = gen_aarch64_simd_combinev4hf;
1313 break;
1314 case V2SFmode:
1315 gen = gen_aarch64_simd_combinev2sf;
1316 break;
1317 case DImode:
1318 gen = gen_aarch64_simd_combinedi;
1319 break;
1320 case DFmode:
1321 gen = gen_aarch64_simd_combinedf;
1322 break;
1323 default:
1324 gcc_unreachable ();
1327 emit_insn (gen (dst, src1, src2));
1328 return;
1332 /* Split a complex SIMD move. */
1334 void
1335 aarch64_split_simd_move (rtx dst, rtx src)
1337 machine_mode src_mode = GET_MODE (src);
1338 machine_mode dst_mode = GET_MODE (dst);
1340 gcc_assert (VECTOR_MODE_P (dst_mode));
1342 if (REG_P (dst) && REG_P (src))
1344 rtx (*gen) (rtx, rtx);
1346 gcc_assert (VECTOR_MODE_P (src_mode));
1348 switch (src_mode)
1350 case V16QImode:
1351 gen = gen_aarch64_split_simd_movv16qi;
1352 break;
1353 case V8HImode:
1354 gen = gen_aarch64_split_simd_movv8hi;
1355 break;
1356 case V4SImode:
1357 gen = gen_aarch64_split_simd_movv4si;
1358 break;
1359 case V2DImode:
1360 gen = gen_aarch64_split_simd_movv2di;
1361 break;
1362 case V8HFmode:
1363 gen = gen_aarch64_split_simd_movv8hf;
1364 break;
1365 case V4SFmode:
1366 gen = gen_aarch64_split_simd_movv4sf;
1367 break;
1368 case V2DFmode:
1369 gen = gen_aarch64_split_simd_movv2df;
1370 break;
1371 default:
1372 gcc_unreachable ();
1375 emit_insn (gen (dst, src));
1376 return;
1380 static rtx
1381 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1383 if (can_create_pseudo_p ())
1384 return force_reg (mode, value);
1385 else
1387 x = aarch64_emit_move (x, value);
1388 return x;
1393 static rtx
1394 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1396 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1398 rtx high;
1399 /* Load the full offset into a register. This
1400 might be improvable in the future. */
1401 high = GEN_INT (offset);
1402 offset = 0;
1403 high = aarch64_force_temporary (mode, temp, high);
1404 reg = aarch64_force_temporary (mode, temp,
1405 gen_rtx_PLUS (mode, high, reg));
1407 return plus_constant (mode, reg, offset);
1410 static int
1411 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1412 machine_mode mode)
1414 int i;
1415 unsigned HOST_WIDE_INT val, val2, mask;
1416 int one_match, zero_match;
1417 int num_insns;
1419 val = INTVAL (imm);
1421 if (aarch64_move_imm (val, mode))
1423 if (generate)
1424 emit_insn (gen_rtx_SET (dest, imm));
1425 return 1;
1428 if ((val >> 32) == 0 || mode == SImode)
1430 if (generate)
1432 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1433 if (mode == SImode)
1434 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1435 GEN_INT ((val >> 16) & 0xffff)));
1436 else
1437 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1438 GEN_INT ((val >> 16) & 0xffff)));
1440 return 2;
1443 /* Remaining cases are all for DImode. */
1445 mask = 0xffff;
1446 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1447 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1448 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1449 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1451 if (zero_match != 2 && one_match != 2)
1453 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1454 For a 64-bit bitmask try whether changing 16 bits to all ones or
1455 zeroes creates a valid bitmask. To check any repeated bitmask,
1456 try using 16 bits from the other 32-bit half of val. */
1458 for (i = 0; i < 64; i += 16, mask <<= 16)
1460 val2 = val & ~mask;
1461 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1462 break;
1463 val2 = val | mask;
1464 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1465 break;
1466 val2 = val2 & ~mask;
1467 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1468 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1469 break;
1471 if (i != 64)
1473 if (generate)
1475 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1476 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1477 GEN_INT ((val >> i) & 0xffff)));
1482 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1483 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1484 otherwise skip zero bits. */
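/* For example, 0x0000123400005678 is emitted as
     mov  dest, #0x5678
     movk dest, #0x1234, lsl #32
   since the two all-zero 16-bit chunks need no MOVK.  */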
1486 num_insns = 1;
1487 mask = 0xffff;
1488 val2 = one_match > zero_match ? ~val : val;
1489 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1491 if (generate)
1492 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1493 ? (val | ~(mask << i))
1494 : (val & (mask << i)))));
1495 for (i += 16; i < 64; i += 16)
1497 if ((val2 & (mask << i)) == 0)
1498 continue;
1499 if (generate)
1500 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1501 GEN_INT ((val >> i) & 0xffff)));
1502 num_insns ++;
1505 return num_insns;
1509 void
1510 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1512 machine_mode mode = GET_MODE (dest);
1514 gcc_assert (mode == SImode || mode == DImode);
1516 /* Check on what type of symbol it is. */
1517 if (GET_CODE (imm) == SYMBOL_REF
1518 || GET_CODE (imm) == LABEL_REF
1519 || GET_CODE (imm) == CONST)
1521 rtx mem, base, offset;
1522 enum aarch64_symbol_type sty;
1524 /* If we have (const (plus symbol offset)), separate out the offset
1525 before we start classifying the symbol. */
1526 split_const (imm, &base, &offset);
1528 sty = aarch64_classify_symbol (base, offset);
1529 switch (sty)
1531 case SYMBOL_FORCE_TO_MEM:
1532 if (offset != const0_rtx
1533 && targetm.cannot_force_const_mem (mode, imm))
1535 gcc_assert (can_create_pseudo_p ());
1536 base = aarch64_force_temporary (mode, dest, base);
1537 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1538 aarch64_emit_move (dest, base);
1539 return;
1542 mem = force_const_mem (ptr_mode, imm);
1543 gcc_assert (mem);
1545 /* If we aren't generating PC relative literals, then
1546 we need to expand the literal pool access carefully.
1547 This is something that needs to be done in a number
1548 of places, so could well live as a separate function. */
1549 if (aarch64_nopcrelative_literal_loads)
1551 gcc_assert (can_create_pseudo_p ());
1552 base = gen_reg_rtx (ptr_mode);
1553 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1554 mem = gen_rtx_MEM (ptr_mode, base);
1557 if (mode != ptr_mode)
1558 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1560 emit_insn (gen_rtx_SET (dest, mem));
1562 return;
1564 case SYMBOL_SMALL_TLSGD:
1565 case SYMBOL_SMALL_TLSDESC:
1566 case SYMBOL_SMALL_TLSIE:
1567 case SYMBOL_SMALL_GOT_28K:
1568 case SYMBOL_SMALL_GOT_4G:
1569 case SYMBOL_TINY_GOT:
1570 case SYMBOL_TINY_TLSIE:
1571 if (offset != const0_rtx)
1573 gcc_assert(can_create_pseudo_p ());
1574 base = aarch64_force_temporary (mode, dest, base);
1575 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1576 aarch64_emit_move (dest, base);
1577 return;
1579 /* FALLTHRU */
1581 case SYMBOL_SMALL_ABSOLUTE:
1582 case SYMBOL_TINY_ABSOLUTE:
1583 case SYMBOL_TLSLE12:
1584 case SYMBOL_TLSLE24:
1585 case SYMBOL_TLSLE32:
1586 case SYMBOL_TLSLE48:
1587 aarch64_load_symref_appropriately (dest, imm, sty);
1588 return;
1590 default:
1591 gcc_unreachable ();
1595 if (!CONST_INT_P (imm))
1597 if (GET_CODE (imm) == HIGH)
1598 emit_insn (gen_rtx_SET (dest, imm));
1599 else
1601 rtx mem = force_const_mem (mode, imm);
1602 gcc_assert (mem);
1603 emit_insn (gen_rtx_SET (dest, mem));
1606 return;
1609 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1612 static bool
1613 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1614 tree exp ATTRIBUTE_UNUSED)
1616 /* Currently, always true. */
1617 return true;
1620 /* Implement TARGET_PASS_BY_REFERENCE. */
1622 static bool
1623 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1624 machine_mode mode,
1625 const_tree type,
1626 bool named ATTRIBUTE_UNUSED)
1628 HOST_WIDE_INT size;
1629 machine_mode dummymode;
1630 int nregs;
1632 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1633 size = (mode == BLKmode && type)
1634 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1636 /* Aggregates are passed by reference based on their size. */
1637 if (type && AGGREGATE_TYPE_P (type))
1639 size = int_size_in_bytes (type);
1642 /* Variable sized arguments are always passed by reference. */
1643 if (size < 0)
1644 return true;
1646 /* Can this be a candidate to be passed in fp/simd register(s)? */
1647 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1648 &dummymode, &nregs,
1649 NULL))
1650 return false;
1652 /* Arguments which are variable sized or larger than 2 registers are
1653 passed by reference unless they are a homogeneous floating-point
1654 aggregate. */
1655 return size > 2 * UNITS_PER_WORD;
1658 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1659 static bool
1660 aarch64_return_in_msb (const_tree valtype)
1662 machine_mode dummy_mode;
1663 int dummy_int;
1665 /* Never happens in little-endian mode. */
1666 if (!BYTES_BIG_ENDIAN)
1667 return false;
1669 /* Only composite types smaller than or equal to 16 bytes can
1670 be potentially returned in registers. */
1671 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1672 || int_size_in_bytes (valtype) <= 0
1673 || int_size_in_bytes (valtype) > 16)
1674 return false;
1676 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1677 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1678 is always passed/returned in the least significant bits of fp/simd
1679 register(s). */
1680 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1681 &dummy_mode, &dummy_int, NULL))
1682 return false;
1684 return true;
1687 /* Implement TARGET_FUNCTION_VALUE.
1688 Define how to find the value returned by a function. */
1690 static rtx
1691 aarch64_function_value (const_tree type, const_tree func,
1692 bool outgoing ATTRIBUTE_UNUSED)
1694 machine_mode mode;
1695 int unsignedp;
1696 int count;
1697 machine_mode ag_mode;
1699 mode = TYPE_MODE (type);
1700 if (INTEGRAL_TYPE_P (type))
1701 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1703 if (aarch64_return_in_msb (type))
1705 HOST_WIDE_INT size = int_size_in_bytes (type);
1707 if (size % UNITS_PER_WORD != 0)
1709 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1710 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1714 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1715 &ag_mode, &count, NULL))
1717 if (!aarch64_composite_type_p (type, mode))
1719 gcc_assert (count == 1 && mode == ag_mode);
1720 return gen_rtx_REG (mode, V0_REGNUM);
1722 else
1724 int i;
1725 rtx par;
1727 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1728 for (i = 0; i < count; i++)
1730 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1731 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1732 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1733 XVECEXP (par, 0, i) = tmp;
1735 return par;
1738 else
1739 return gen_rtx_REG (mode, R0_REGNUM);
1742 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1743 Return true if REGNO is the number of a hard register in which the value
1744 of a called function may come back. */
1746 static bool
1747 aarch64_function_value_regno_p (const unsigned int regno)
1749 /* Maximum of 16 bytes can be returned in the general registers. Examples
1750 of 16-byte return values are: 128-bit integers and 16-byte small
1751 structures (excluding homogeneous floating-point aggregates). */
1752 if (regno == R0_REGNUM || regno == R1_REGNUM)
1753 return true;
1755 /* Up to four fp/simd registers can return a function value, e.g. a
1756 homogeneous floating-point aggregate having four members. */
1757 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1758 return TARGET_FLOAT;
1760 return false;
1763 /* Implement TARGET_RETURN_IN_MEMORY.
1765 If the type T of the result of a function is such that
1766 void func (T arg)
1767 would require that arg be passed as a value in a register (or set of
1768 registers) according to the parameter passing rules, then the result
1769 is returned in the same registers as would be used for such an
1770 argument. */
1772 static bool
1773 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1775 HOST_WIDE_INT size;
1776 machine_mode ag_mode;
1777 int count;
1779 if (!AGGREGATE_TYPE_P (type)
1780 && TREE_CODE (type) != COMPLEX_TYPE
1781 && TREE_CODE (type) != VECTOR_TYPE)
1782 /* Simple scalar types are always returned in registers. */
1783 return false;
1785 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1786 type,
1787 &ag_mode,
1788 &count,
1789 NULL))
1790 return false;
1792 /* Types larger than 2 registers are returned in memory. */
1793 size = int_size_in_bytes (type);
1794 return (size < 0 || size > 2 * UNITS_PER_WORD);
1797 static bool
1798 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1799 const_tree type, int *nregs)
1801 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1802 return aarch64_vfp_is_call_or_return_candidate (mode,
1803 type,
1804 &pcum->aapcs_vfp_rmode,
1805 nregs,
1806 NULL);
1809 /* Given MODE and TYPE of a function argument, return the alignment in
1810 bits. The idea is to suppress any stronger alignment requested by
1811 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1812 This is a helper function for local use only. */
1814 static unsigned int
1815 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1817 unsigned int alignment;
1819 if (type)
1821 if (!integer_zerop (TYPE_SIZE (type)))
1823 if (TYPE_MODE (type) == mode)
1824 alignment = TYPE_ALIGN (type);
1825 else
1826 alignment = GET_MODE_ALIGNMENT (mode);
1828 else
1829 alignment = 0;
1831 else
1832 alignment = GET_MODE_ALIGNMENT (mode);
1834 return alignment;
1837 /* Layout a function argument according to the AAPCS64 rules. The rule
1838 numbers refer to the rule numbers in the AAPCS64. */
1840 static void
1841 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1842 const_tree type,
1843 bool named ATTRIBUTE_UNUSED)
1845 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1846 int ncrn, nvrn, nregs;
1847 bool allocate_ncrn, allocate_nvrn;
1848 HOST_WIDE_INT size;
1850 /* We need to do this once per argument. */
1851 if (pcum->aapcs_arg_processed)
1852 return;
1854 pcum->aapcs_arg_processed = true;
1856 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1857 size
1858 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1859 UNITS_PER_WORD);
1861 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1862 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1863 mode,
1864 type,
1865 &nregs);
1867 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1868 The following code thus handles passing by SIMD/FP registers first. */
1870 nvrn = pcum->aapcs_nvrn;
1872 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1873 and homogeneous short-vector aggregates (HVA). */
1874 if (allocate_nvrn)
1876 if (!TARGET_FLOAT)
1877 aarch64_err_no_fpadvsimd (mode, "argument");
1879 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1881 pcum->aapcs_nextnvrn = nvrn + nregs;
1882 if (!aarch64_composite_type_p (type, mode))
1884 gcc_assert (nregs == 1);
1885 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1887 else
1889 rtx par;
1890 int i;
1891 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1892 for (i = 0; i < nregs; i++)
1894 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1895 V0_REGNUM + nvrn + i);
1896 tmp = gen_rtx_EXPR_LIST
1897 (VOIDmode, tmp,
1898 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1899 XVECEXP (par, 0, i) = tmp;
1901 pcum->aapcs_reg = par;
1903 return;
1905 else
1907 /* C.3 NSRN is set to 8. */
1908 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1909 goto on_stack;
1913 ncrn = pcum->aapcs_ncrn;
1914 nregs = size / UNITS_PER_WORD;
1916 /* C6 - C9, though the sign and zero extension semantics are
1917 handled elsewhere. This is the case where the argument fits
1918 entirely in general registers. */
1919 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1921 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1923 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1925 /* C.8 if the argument has an alignment of 16 then the NGRN is
1926 rounded up to the next even number. */
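/* For example, a 16-byte aggregate with 16-byte alignment arriving when
   NGRN == 1 is allocated x2/x3, and x1 is left unused, as required by C.8.  */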
1927 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1929 ++ncrn;
1930 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1932 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1933 A reg is still generated for it, but the caller should be smart
1934 enough not to use it. */
1935 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1937 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1939 else
1941 rtx par;
1942 int i;
1944 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1945 for (i = 0; i < nregs; i++)
1947 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1948 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1949 GEN_INT (i * UNITS_PER_WORD));
1950 XVECEXP (par, 0, i) = tmp;
1952 pcum->aapcs_reg = par;
1955 pcum->aapcs_nextncrn = ncrn + nregs;
1956 return;
1959 /* C.11 */
1960 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1962 /* The argument is passed on the stack; record the needed number of words for
1963 this argument and align the total size if necessary. */
1964 on_stack:
1965 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1966 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1967 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
1968 16 / UNITS_PER_WORD);
1969 return;
1972 /* Implement TARGET_FUNCTION_ARG. */
1974 static rtx
1975 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1976 const_tree type, bool named)
1978 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1979 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1981 if (mode == VOIDmode)
1982 return NULL_RTX;
1984 aarch64_layout_arg (pcum_v, mode, type, named);
1985 return pcum->aapcs_reg;
1988 void
1989 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1990 const_tree fntype ATTRIBUTE_UNUSED,
1991 rtx libname ATTRIBUTE_UNUSED,
1992 const_tree fndecl ATTRIBUTE_UNUSED,
1993 unsigned n_named ATTRIBUTE_UNUSED)
1995 pcum->aapcs_ncrn = 0;
1996 pcum->aapcs_nvrn = 0;
1997 pcum->aapcs_nextncrn = 0;
1998 pcum->aapcs_nextnvrn = 0;
1999 pcum->pcs_variant = ARM_PCS_AAPCS64;
2000 pcum->aapcs_reg = NULL_RTX;
2001 pcum->aapcs_arg_processed = false;
2002 pcum->aapcs_stack_words = 0;
2003 pcum->aapcs_stack_size = 0;
2005 if (!TARGET_FLOAT
2006 && fndecl && TREE_PUBLIC (fndecl)
2007 && fntype && fntype != error_mark_node)
2009 const_tree type = TREE_TYPE (fntype);
2010 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2011 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2012 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2013 &mode, &nregs, NULL))
2014 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2016 return;
2019 static void
2020 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2021 machine_mode mode,
2022 const_tree type,
2023 bool named)
2025 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2026 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2028 aarch64_layout_arg (pcum_v, mode, type, named);
2029 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2030 != (pcum->aapcs_stack_words != 0));
2031 pcum->aapcs_arg_processed = false;
2032 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2033 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2034 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2035 pcum->aapcs_stack_words = 0;
2036 pcum->aapcs_reg = NULL_RTX;
2040 bool
2041 aarch64_function_arg_regno_p (unsigned regno)
2043 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2044 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2047 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2048 PARM_BOUNDARY bits of alignment, but will be given anything up
2049 to STACK_BOUNDARY bits if the type requires it. This makes sure
2050 that both before and after the layout of each argument, the Next
2051 Stacked Argument Address (NSAA) will have a minimum alignment of
2052 8 bytes. */
2054 static unsigned int
2055 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2057 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2059 if (alignment < PARM_BOUNDARY)
2060 alignment = PARM_BOUNDARY;
2061 if (alignment > STACK_BOUNDARY)
2062 alignment = STACK_BOUNDARY;
2063 return alignment;
2066 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2068 Return true if an argument passed on the stack should be padded upwards,
2069 i.e. if the least-significant byte of the stack slot has useful data.
2071 Small aggregate types are placed at the lowest memory address.

2073 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2075 bool
2076 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2078 /* On little-endian targets, the least significant byte of every stack
2079 argument is passed at the lowest byte address of the stack slot. */
2080 if (!BYTES_BIG_ENDIAN)
2081 return true;
2083 /* Otherwise, integral, floating-point and pointer types are padded downward:
2084 the least significant byte of a stack argument is passed at the highest
2085 byte address of the stack slot. */
2086 if (type
2087 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2088 || POINTER_TYPE_P (type))
2089 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2090 return false;
2092 /* Everything else is padded upward, i.e. the data sits in the first byte of the stack slot. */
2093 return true;
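/* A hedged sketch of the rule above (illustrative, not authoritative):
   on a big-endian target an 'int' passed in an 8-byte stack slot is
   padded downward, i.e. its four data bytes occupy the high-addressed
   half of the slot, whereas a 6-byte struct is padded upward and its
   data starts at the slot's lowest address.  */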
2096 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2098 It specifies padding for the last (and possibly the only)
2099 element of a block move between registers and memory.  Viewing
2100 the block as it sits in memory, padding upward means that the
2101 last element is padded after its most significant byte, while
2102 padding downward means that the last element is padded at its
2103 least significant byte side.
2105 Small aggregates and small complex types are always padded
2106 upwards.
2108 We don't need to worry about homogeneous floating-point or
2109 short-vector aggregates; their move is not affected by the
2110 padding direction determined here. Regardless of endianness,
2111 each element of such an aggregate is put in the least
2112 significant bits of an fp/simd register.
2114 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2115 register has useful data, and return the opposite if the most
2116 significant byte does. */
2118 bool
2119 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2120 bool first ATTRIBUTE_UNUSED)
2123 /* Small composite types are always padded upward. */
2124 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2126 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2127 : GET_MODE_SIZE (mode));
2128 if (size < 2 * UNITS_PER_WORD)
2129 return true;
2132 /* Otherwise, use the default padding. */
2133 return !BYTES_BIG_ENDIAN;
2136 static machine_mode
2137 aarch64_libgcc_cmp_return_mode (void)
2139 return SImode;
2142 static bool
2143 aarch64_frame_pointer_required (void)
2145 /* In aarch64_override_options_after_change
2146 flag_omit_leaf_frame_pointer turns off the frame pointer by
2147 default. Turn it back on now if we've not got a leaf
2148 function. */
2149 if (flag_omit_leaf_frame_pointer
2150 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2151 return true;
2153 return false;
2156 /* Mark the registers that need to be saved by the callee and calculate
2157 the size of the callee-saved registers area and frame record (both FP
2158 and LR may be omitted). */
2159 static void
2160 aarch64_layout_frame (void)
2162 HOST_WIDE_INT offset = 0;
2163 int regno;
2165 if (reload_completed && cfun->machine->frame.laid_out)
2166 return;
2168 #define SLOT_NOT_REQUIRED (-2)
2169 #define SLOT_REQUIRED (-1)
2171 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2172 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2174 /* First mark all the registers that really need to be saved... */
2175 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2176 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2178 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2179 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2181 /* ... that includes the eh data registers (if needed)... */
2182 if (crtl->calls_eh_return)
2183 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2184 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2185 = SLOT_REQUIRED;
2187 /* ... and any callee saved register that dataflow says is live. */
2188 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2189 if (df_regs_ever_live_p (regno)
2190 && (regno == R30_REGNUM
2191 || !call_used_regs[regno]))
2192 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2194 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2195 if (df_regs_ever_live_p (regno)
2196 && !call_used_regs[regno])
2197 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2199 if (frame_pointer_needed)
2201 /* FP and LR are placed in the linkage record. */
2202 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2203 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2204 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2205 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2206 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2207 offset += 2 * UNITS_PER_WORD;
2210 /* Now assign stack slots for them. */
2211 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2212 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2214 cfun->machine->frame.reg_offset[regno] = offset;
2215 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2216 cfun->machine->frame.wb_candidate1 = regno;
2217 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2218 cfun->machine->frame.wb_candidate2 = regno;
2219 offset += UNITS_PER_WORD;
2222 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2223 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2225 cfun->machine->frame.reg_offset[regno] = offset;
2226 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2227 cfun->machine->frame.wb_candidate1 = regno;
2228 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2229 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2230 cfun->machine->frame.wb_candidate2 = regno;
2231 offset += UNITS_PER_WORD;
2234 cfun->machine->frame.padding0 =
2235 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2236 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2238 cfun->machine->frame.saved_regs_size = offset;
2240 cfun->machine->frame.hard_fp_offset
2241 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2242 + get_frame_size ()
2243 + cfun->machine->frame.saved_regs_size,
2244 STACK_BOUNDARY / BITS_PER_UNIT);
2246 cfun->machine->frame.frame_size
2247 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2248 + crtl->outgoing_args_size,
2249 STACK_BOUNDARY / BITS_PER_UNIT);
2251 cfun->machine->frame.laid_out = true;
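/* A worked example of the layout above (a hypothetical function, not
   taken from the sources): assume frame_pointer_needed, callee saves
   of x19, x20 and d8, no varargs save area, 24 bytes of locals and no
   outgoing arguments.  Then reg_offset[R29] == 0, reg_offset[R30] == 8,
   reg_offset[x19] == 16, reg_offset[x20] == 24, reg_offset[d8] == 32,
   padding0 == 8 and saved_regs_size == 48; wb_candidate1/2 are R29 and
   R30.  With get_frame_size () == 24, hard_fp_offset
   == ROUND_UP (24 + 48, 16) == 80, and since outgoing_args_size is 0,
   frame_size == 80 as well.  */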
2254 static bool
2255 aarch64_register_saved_on_entry (int regno)
2257 return cfun->machine->frame.reg_offset[regno] >= 0;
2260 static unsigned
2261 aarch64_next_callee_save (unsigned regno, unsigned limit)
2263 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2264 regno ++;
2265 return regno;
2268 static void
2269 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2270 HOST_WIDE_INT adjustment)
2272 rtx base_rtx = stack_pointer_rtx;
2273 rtx insn, reg, mem;
2275 reg = gen_rtx_REG (mode, regno);
2276 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2277 plus_constant (Pmode, base_rtx, -adjustment));
2278 mem = gen_rtx_MEM (mode, mem);
2280 insn = emit_move_insn (mem, reg);
2281 RTX_FRAME_RELATED_P (insn) = 1;
2284 static rtx
2285 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2286 HOST_WIDE_INT adjustment)
2288 switch (mode)
2290 case DImode:
2291 return gen_storewb_pairdi_di (base, base, reg, reg2,
2292 GEN_INT (-adjustment),
2293 GEN_INT (UNITS_PER_WORD - adjustment));
2294 case DFmode:
2295 return gen_storewb_pairdf_di (base, base, reg, reg2,
2296 GEN_INT (-adjustment),
2297 GEN_INT (UNITS_PER_WORD - adjustment));
2298 default:
2299 gcc_unreachable ();
2303 static void
2304 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2305 unsigned regno2, HOST_WIDE_INT adjustment)
2307 rtx_insn *insn;
2308 rtx reg1 = gen_rtx_REG (mode, regno1);
2309 rtx reg2 = gen_rtx_REG (mode, regno2);
2311 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2312 reg2, adjustment));
2313 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2314 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2315 RTX_FRAME_RELATED_P (insn) = 1;
2318 static rtx
2319 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2320 HOST_WIDE_INT adjustment)
2322 switch (mode)
2324 case DImode:
2325 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2326 GEN_INT (UNITS_PER_WORD));
2327 case DFmode:
2328 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2329 GEN_INT (UNITS_PER_WORD));
2330 default:
2331 gcc_unreachable ();
2335 static rtx
2336 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2337 rtx reg2)
2339 switch (mode)
2341 case DImode:
2342 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2344 case DFmode:
2345 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2347 default:
2348 gcc_unreachable ();
2352 static rtx
2353 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2354 rtx mem2)
2356 switch (mode)
2358 case DImode:
2359 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2361 case DFmode:
2362 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2364 default:
2365 gcc_unreachable ();
2370 static void
2371 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2372 unsigned start, unsigned limit, bool skip_wb)
2374 rtx_insn *insn;
2375 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2376 ? gen_frame_mem : gen_rtx_MEM);
2377 unsigned regno;
2378 unsigned regno2;
2380 for (regno = aarch64_next_callee_save (start, limit);
2381 regno <= limit;
2382 regno = aarch64_next_callee_save (regno + 1, limit))
2384 rtx reg, mem;
2385 HOST_WIDE_INT offset;
2387 if (skip_wb
2388 && (regno == cfun->machine->frame.wb_candidate1
2389 || regno == cfun->machine->frame.wb_candidate2))
2390 continue;
2392 reg = gen_rtx_REG (mode, regno);
2393 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2394 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2395 offset));
2397 regno2 = aarch64_next_callee_save (regno + 1, limit);
2399 if (regno2 <= limit
2400 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2401 == cfun->machine->frame.reg_offset[regno2]))
2404 rtx reg2 = gen_rtx_REG (mode, regno2);
2405 rtx mem2;
2407 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2408 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2409 offset));
2410 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2411 reg2));
2413 /* The first part of a frame-related parallel insn is
2414 always assumed to be relevant to the frame
2415 calculations; subsequent parts are only
2416 frame-related if explicitly marked. */
2417 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2418 regno = regno2;
2420 else
2421 insn = emit_move_insn (mem, reg);
2423 RTX_FRAME_RELATED_P (insn) = 1;
2427 static void
2428 aarch64_restore_callee_saves (machine_mode mode,
2429 HOST_WIDE_INT start_offset, unsigned start,
2430 unsigned limit, bool skip_wb, rtx *cfi_ops)
2432 rtx base_rtx = stack_pointer_rtx;
2433 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2434 ? gen_frame_mem : gen_rtx_MEM);
2435 unsigned regno;
2436 unsigned regno2;
2437 HOST_WIDE_INT offset;
2439 for (regno = aarch64_next_callee_save (start, limit);
2440 regno <= limit;
2441 regno = aarch64_next_callee_save (regno + 1, limit))
2443 rtx reg, mem;
2445 if (skip_wb
2446 && (regno == cfun->machine->frame.wb_candidate1
2447 || regno == cfun->machine->frame.wb_candidate2))
2448 continue;
2450 reg = gen_rtx_REG (mode, regno);
2451 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2452 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2454 regno2 = aarch64_next_callee_save (regno + 1, limit);
2456 if (regno2 <= limit
2457 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2458 == cfun->machine->frame.reg_offset[regno2]))
2460 rtx reg2 = gen_rtx_REG (mode, regno2);
2461 rtx mem2;
2463 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2464 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2465 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2467 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2468 regno = regno2;
2470 else
2471 emit_move_insn (reg, mem);
2472 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2476 /* AArch64 stack frames generated by this compiler look like:
2478 +-------------------------------+
2480 | incoming stack arguments |
2482 +-------------------------------+
2483 | | <-- incoming stack pointer (aligned)
2484 | callee-allocated save area |
2485 | for register varargs |
2487 +-------------------------------+
2488 | local variables | <-- frame_pointer_rtx
2490 +-------------------------------+
2491 | padding0 | \
2492 +-------------------------------+ |
2493 | callee-saved registers | | frame.saved_regs_size
2494 +-------------------------------+ |
2495 | LR' | |
2496 +-------------------------------+ |
2497 | FP' | / <- hard_frame_pointer_rtx (aligned)
2498 +-------------------------------+
2499 | dynamic allocation |
2500 +-------------------------------+
2501 | padding |
2502 +-------------------------------+
2503 | outgoing stack arguments | <-- arg_pointer
2505 +-------------------------------+
2506 | | <-- stack_pointer_rtx (aligned)
2508 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2509 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2510 unchanged. */
2512 /* Generate the prologue instructions for entry into a function.
2513 Establish the stack frame by decreasing the stack pointer with a
2514 properly calculated size and, if necessary, create a frame record
2515 filled with the values of LR and previous frame pointer. The
2516 current FP is also set up if it is in use. */
2518 void
2519 aarch64_expand_prologue (void)
2521 /* sub sp, sp, #<frame_size>
2522 stp {fp, lr}, [sp, #<frame_size> - 16]
2523 add fp, sp, #<frame_size> - hardfp_offset
2524 stp {cs_reg}, [fp, #-16] etc.
2526 sub sp, sp, <final_adjustment_if_any>
2528 HOST_WIDE_INT frame_size, offset;
2529 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2530 HOST_WIDE_INT hard_fp_offset;
2531 rtx_insn *insn;
2533 aarch64_layout_frame ();
2535 offset = frame_size = cfun->machine->frame.frame_size;
2536 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2537 fp_offset = frame_size - hard_fp_offset;
2539 if (flag_stack_usage_info)
2540 current_function_static_stack_size = frame_size;
2542 /* Store pairs and load pairs have a range of only -512 to 504. */
2543 if (offset >= 512)
2545 /* When the frame has a large size, an initial decrease is done on
2546 the stack pointer to jump over the callee-allocated save area for
2547 register varargs, the local variable area and/or the callee-saved
2548 register area. This will allow the pre-index write-back
2549 store pair instructions to be used for setting up the stack frame
2550 efficiently. */
2551 offset = hard_fp_offset;
2552 if (offset >= 512)
2553 offset = cfun->machine->frame.saved_regs_size;
2555 frame_size -= (offset + crtl->outgoing_args_size);
2556 fp_offset = 0;
2558 if (frame_size >= 0x1000000)
2560 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2561 emit_move_insn (op0, GEN_INT (-frame_size));
2562 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2564 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2565 gen_rtx_SET (stack_pointer_rtx,
2566 plus_constant (Pmode, stack_pointer_rtx,
2567 -frame_size)));
2568 RTX_FRAME_RELATED_P (insn) = 1;
2570 else if (frame_size > 0)
2572 int hi_ofs = frame_size & 0xfff000;
2573 int lo_ofs = frame_size & 0x000fff;
2575 if (hi_ofs)
2577 insn = emit_insn (gen_add2_insn
2578 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2579 RTX_FRAME_RELATED_P (insn) = 1;
2581 if (lo_ofs)
2583 insn = emit_insn (gen_add2_insn
2584 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2585 RTX_FRAME_RELATED_P (insn) = 1;
2589 else
2590 frame_size = -1;
2592 if (offset > 0)
2594 bool skip_wb = false;
2596 if (frame_pointer_needed)
2598 skip_wb = true;
2600 if (fp_offset)
2602 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2603 GEN_INT (-offset)));
2604 RTX_FRAME_RELATED_P (insn) = 1;
2606 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2607 R30_REGNUM, false);
2609 else
2610 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2612 /* Set up frame pointer to point to the location of the
2613 previous frame pointer on the stack. */
2614 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2615 stack_pointer_rtx,
2616 GEN_INT (fp_offset)));
2617 RTX_FRAME_RELATED_P (insn) = 1;
2618 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2620 else
2622 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2623 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2625 if (fp_offset
2626 || reg1 == FIRST_PSEUDO_REGISTER
2627 || (reg2 == FIRST_PSEUDO_REGISTER
2628 && offset >= 256))
2630 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2631 GEN_INT (-offset)));
2632 RTX_FRAME_RELATED_P (insn) = 1;
2634 else
2636 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2638 skip_wb = true;
2640 if (reg2 == FIRST_PSEUDO_REGISTER)
2641 aarch64_pushwb_single_reg (mode1, reg1, offset);
2642 else
2643 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2647 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2648 skip_wb);
2649 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2650 skip_wb);
2653 /* When offset >= 512,
2654 sub sp, sp, #<outgoing_args_size> */
2655 if (frame_size > -1)
2657 if (crtl->outgoing_args_size > 0)
2659 insn = emit_insn (gen_add2_insn
2660 (stack_pointer_rtx,
2661 GEN_INT (- crtl->outgoing_args_size)));
2662 RTX_FRAME_RELATED_P (insn) = 1;
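/* A hedged sketch of the large-frame path above (register choices and
   exact encodings are illustrative): if the remaining adjustment, after
   the save area and outgoing-args region have been peeled off, is
   0x12345 bytes, it is split into hi_ofs == 0x12000 and lo_ofs ==
   0x345, giving roughly

      sub	sp, sp, #0x12000
      sub	sp, sp, #0x345

   while a remaining adjustment of 0x1000000 bytes or more instead goes
   through the IP0 scratch register (typically x16):

      mov	x16, #-<frame_size>	// whatever mov/movk sequence is needed
      add	sp, sp, x16

   before the callee saves are stored.  */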
2667 /* Return TRUE if we can use a simple_return insn.
2669 This function checks whether the callee saved stack is empty, which
2670 means no restore actions are needed. The pro_and_epilogue pass uses
2671 this to check whether the shrink-wrapping optimization is feasible. */
2673 bool
2674 aarch64_use_return_insn_p (void)
2676 if (!reload_completed)
2677 return false;
2679 if (crtl->profile)
2680 return false;
2682 aarch64_layout_frame ();
2684 return cfun->machine->frame.frame_size == 0;
2687 /* Generate the epilogue instructions for returning from a function. */
2688 void
2689 aarch64_expand_epilogue (bool for_sibcall)
2691 HOST_WIDE_INT frame_size, offset;
2692 HOST_WIDE_INT fp_offset;
2693 HOST_WIDE_INT hard_fp_offset;
2694 rtx_insn *insn;
2695 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2696 bool need_barrier_p = (get_frame_size () != 0
2697 || cfun->machine->frame.saved_varargs_size);
2699 aarch64_layout_frame ();
2701 offset = frame_size = cfun->machine->frame.frame_size;
2702 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2703 fp_offset = frame_size - hard_fp_offset;
2705 /* Store pairs and load pairs have a range of only -512 to 504. */
2706 if (offset >= 512)
2708 offset = hard_fp_offset;
2709 if (offset >= 512)
2710 offset = cfun->machine->frame.saved_regs_size;
2712 frame_size -= (offset + crtl->outgoing_args_size);
2713 fp_offset = 0;
2714 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2716 insn = emit_insn (gen_add2_insn
2717 (stack_pointer_rtx,
2718 GEN_INT (crtl->outgoing_args_size)));
2719 RTX_FRAME_RELATED_P (insn) = 1;
2722 else
2723 frame_size = -1;
2725 /* If there were outgoing arguments or we've done dynamic stack
2726 allocation, then restore the stack pointer from the frame
2727 pointer. This is at most one insn and more efficient than using
2728 GCC's internal mechanism. */
2729 if (frame_pointer_needed
2730 && (crtl->outgoing_args_size || cfun->calls_alloca))
2732 if (cfun->calls_alloca)
2733 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2735 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2736 hard_frame_pointer_rtx,
2737 GEN_INT (0)));
2738 offset = offset - fp_offset;
2741 if (offset > 0)
2743 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2744 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2745 bool skip_wb = true;
2746 rtx cfi_ops = NULL;
2748 if (frame_pointer_needed)
2749 fp_offset = 0;
2750 else if (fp_offset
2751 || reg1 == FIRST_PSEUDO_REGISTER
2752 || (reg2 == FIRST_PSEUDO_REGISTER
2753 && offset >= 256))
2754 skip_wb = false;
2756 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2757 skip_wb, &cfi_ops);
2758 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2759 skip_wb, &cfi_ops);
2761 if (need_barrier_p)
2762 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2764 if (skip_wb)
2766 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2767 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2769 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2770 if (reg2 == FIRST_PSEUDO_REGISTER)
2772 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2773 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2774 mem = gen_rtx_MEM (mode1, mem);
2775 insn = emit_move_insn (rreg1, mem);
2777 else
2779 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2781 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2782 insn = emit_insn (aarch64_gen_loadwb_pair
2783 (mode1, stack_pointer_rtx, rreg1,
2784 rreg2, offset));
2787 else
2789 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2790 GEN_INT (offset)));
2793 /* Reset the CFA to be SP + FRAME_SIZE. */
2794 rtx new_cfa = stack_pointer_rtx;
2795 if (frame_size > 0)
2796 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2797 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2798 REG_NOTES (insn) = cfi_ops;
2799 RTX_FRAME_RELATED_P (insn) = 1;
2802 if (frame_size > 0)
2804 if (need_barrier_p)
2805 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2807 if (frame_size >= 0x1000000)
2809 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2810 emit_move_insn (op0, GEN_INT (frame_size));
2811 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2813 else
2815 int hi_ofs = frame_size & 0xfff000;
2816 int lo_ofs = frame_size & 0x000fff;
2818 if (hi_ofs && lo_ofs)
2820 insn = emit_insn (gen_add2_insn
2821 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2822 RTX_FRAME_RELATED_P (insn) = 1;
2823 frame_size = lo_ofs;
2825 insn = emit_insn (gen_add2_insn
2826 (stack_pointer_rtx, GEN_INT (frame_size)));
2829 /* Reset the CFA to be SP + 0. */
2830 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2831 RTX_FRAME_RELATED_P (insn) = 1;
2834 /* Stack adjustment for exception handler. */
2835 if (crtl->calls_eh_return)
2837 /* We need to unwind the stack by the offset computed by
2838 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2839 to be SP; letting the CFA move during this adjustment
2840 is just as correct as retaining the CFA from the body
2841 of the function. Therefore, do nothing special. */
2842 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2845 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2846 if (!for_sibcall)
2847 emit_jump_insn (ret_rtx);
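/* For a small frame the code above typically reduces to something like
   (illustrative only; assumes frame_pointer_needed, saves of x29, x30,
   x19 and x20, and no locals or outgoing arguments):

      ldp	x19, x20, [sp, #16]
      ldp	x29, x30, [sp], #32
      ret

   i.e. plain restores of the remaining callee saves followed by a
   single post-indexed load pair that also deallocates the frame.  */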
2850 /* Return the place to copy the exception unwinding return address to.
2851 This will probably be a stack slot, but could (in theory) be the
2852 return register). */
2854 aarch64_final_eh_return_addr (void)
2856 HOST_WIDE_INT fp_offset;
2858 aarch64_layout_frame ();
2860 fp_offset = cfun->machine->frame.frame_size
2861 - cfun->machine->frame.hard_fp_offset;
2863 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2864 return gen_rtx_REG (DImode, LR_REGNUM);
2866 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2867 result in a store to save LR introduced by builtin_eh_return () being
2868 incorrectly deleted because the alias is not detected.
2869 So in the calculation of the address to copy the exception unwinding
2870 return address to, we distinguish two cases.
2871 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2872 we return a SP-relative location since all the addresses are SP-relative
2873 in this case. This prevents the store from being optimized away.
2874 If the fp_offset is not 0, then the addresses will be FP-relative and
2875 therefore we return a FP-relative location. */
2877 if (frame_pointer_needed)
2879 if (fp_offset)
2880 return gen_frame_mem (DImode,
2881 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2882 else
2883 return gen_frame_mem (DImode,
2884 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2887 /* If FP is not needed, we calculate the location of LR, which would be
2888 at the top of the saved registers block. */
2890 return gen_frame_mem (DImode,
2891 plus_constant (Pmode,
2892 stack_pointer_rtx,
2893 fp_offset
2894 + cfun->machine->frame.saved_regs_size
2895 - 2 * UNITS_PER_WORD));
2898 /* Possibly output code to build up a constant in a register. For
2899 the benefit of the costs infrastructure, returns the number of
2900 instructions which would be emitted. GENERATE inhibits or
2901 enables code generation. */
2903 static int
2904 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2906 int insns = 0;
2908 if (aarch64_bitmask_imm (val, DImode))
2910 if (generate)
2911 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2912 insns = 1;
2914 else
2916 int i;
2917 int ncount = 0;
2918 int zcount = 0;
2919 HOST_WIDE_INT valp = val >> 16;
2920 HOST_WIDE_INT valm;
2921 HOST_WIDE_INT tval;
2923 for (i = 16; i < 64; i += 16)
2925 valm = (valp & 0xffff);
2927 if (valm != 0)
2928 ++ zcount;
2930 if (valm != 0xffff)
2931 ++ ncount;
2933 valp >>= 16;
2936 /* zcount contains the number of additional MOVK instructions
2937 required if the constant is built up with an initial MOVZ instruction,
2938 while ncount is the number of MOVK instructions required if starting
2939 with a MOVN instruction. Choose the sequence that yields the fewest
2940 instructions, preferring MOVZ instructions when both counts are
2941 the same. */
2942 if (ncount < zcount)
2944 if (generate)
2945 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2946 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2947 tval = 0xffff;
2948 insns++;
2950 else
2952 if (generate)
2953 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2954 GEN_INT (val & 0xffff));
2955 tval = 0;
2956 insns++;
2959 val >>= 16;
2961 for (i = 16; i < 64; i += 16)
2963 if ((val & 0xffff) != tval)
2965 if (generate)
2966 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2967 GEN_INT (i),
2968 GEN_INT (val & 0xffff)));
2969 insns++;
2971 val >>= 16;
2974 return insns;
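/* Worked example (illustrative, not from the sources): for
   val == 0xffffffff00001234 the three upper 16-bit chunks are 0x0000,
   0xffff and 0xffff, so zcount == 2 (chunks that would need a MOVK
   after an initial MOVZ) and ncount == 1 (chunks that would need a
   MOVK after an initial MOVN).  Since ncount < zcount the MOVN route
   is chosen: a single move of 0xffffffffffff1234 (a MOVN-style
   constant) followed by one MOVK of 0x0000 into bits [31:16], i.e.
   two instructions instead of three.  */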
2977 static void
2978 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2980 HOST_WIDE_INT mdelta = delta;
2981 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2982 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2984 if (mdelta < 0)
2985 mdelta = -mdelta;
2987 if (mdelta >= 4096 * 4096)
2989 (void) aarch64_build_constant (scratchreg, delta, true);
2990 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2992 else if (mdelta > 0)
2994 if (mdelta >= 4096)
2996 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2997 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2998 if (delta < 0)
2999 emit_insn (gen_rtx_SET (this_rtx,
3000 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3001 else
3002 emit_insn (gen_rtx_SET (this_rtx,
3003 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3005 if (mdelta % 4096 != 0)
3007 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3008 emit_insn (gen_rtx_SET (this_rtx,
3009 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
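/* Illustrative example (register names are placeholders): a delta of
   10000 takes the second branch above.  10000 / 4096 == 2 is moved
   into the scratch register and added shifted left by 12 (adding
   8192), after which the remainder 10000 % 4096 == 1808 is added
   directly, giving roughly

      mov	<scratch>, #2
      add	<this>, <this>, <scratch>, lsl #12
      add	<this>, <this>, #1808

   A delta of 4096 * 4096 or more instead builds the full constant via
   aarch64_build_constant and adds it with a single register-register
   add.  */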
3014 /* Output code to add DELTA to the first argument, and then jump
3015 to FUNCTION. Used for C++ multiple inheritance. */
3016 static void
3017 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3018 HOST_WIDE_INT delta,
3019 HOST_WIDE_INT vcall_offset,
3020 tree function)
3022 /* The this pointer is always in x0. Note that this differs from
3023 Arm where the this pointer may be bumped to r1 if r0 is required
3024 to return a pointer to an aggregate. On AArch64 a result value
3025 pointer will be in x8. */
3026 int this_regno = R0_REGNUM;
3027 rtx this_rtx, temp0, temp1, addr, funexp;
3028 rtx_insn *insn;
3030 reload_completed = 1;
3031 emit_note (NOTE_INSN_PROLOGUE_END);
3033 if (vcall_offset == 0)
3034 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3035 else
3037 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3039 this_rtx = gen_rtx_REG (Pmode, this_regno);
3040 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3041 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3043 addr = this_rtx;
3044 if (delta != 0)
3046 if (delta >= -256 && delta < 256)
3047 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3048 plus_constant (Pmode, this_rtx, delta));
3049 else
3050 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3053 if (Pmode == ptr_mode)
3054 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3055 else
3056 aarch64_emit_move (temp0,
3057 gen_rtx_ZERO_EXTEND (Pmode,
3058 gen_rtx_MEM (ptr_mode, addr)));
3060 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3061 addr = plus_constant (Pmode, temp0, vcall_offset);
3062 else
3064 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3065 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3068 if (Pmode == ptr_mode)
3069 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3070 else
3071 aarch64_emit_move (temp1,
3072 gen_rtx_SIGN_EXTEND (Pmode,
3073 gen_rtx_MEM (ptr_mode, addr)));
3075 emit_insn (gen_add2_insn (this_rtx, temp1));
3078 /* Generate a tail call to the target function. */
3079 if (!TREE_USED (function))
3081 assemble_external (function);
3082 TREE_USED (function) = 1;
3084 funexp = XEXP (DECL_RTL (function), 0);
3085 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3086 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3087 SIBLING_CALL_P (insn) = 1;
3089 insn = get_insns ();
3090 shorten_branches (insn);
3091 final_start_function (insn, file, 1);
3092 final (insn, file, 1);
3093 final_end_function ();
3095 /* Stop pretending to be a post-reload pass. */
3096 reload_completed = 0;
3099 static bool
3100 aarch64_tls_referenced_p (rtx x)
3102 if (!TARGET_HAVE_TLS)
3103 return false;
3104 subrtx_iterator::array_type array;
3105 FOR_EACH_SUBRTX (iter, array, x, ALL)
3107 const_rtx x = *iter;
3108 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3109 return true;
3110 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3111 TLS offsets, not real symbol references. */
3112 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3113 iter.skip_subrtxes ();
3115 return false;
3119 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3120 a left shift of 0 or 12 bits. */
3121 bool
3122 aarch64_uimm12_shift (HOST_WIDE_INT val)
3124 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3125 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3130 /* Return true if val is an immediate that can be loaded into a
3131 register by a MOVZ instruction. */
3132 static bool
3133 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3135 if (GET_MODE_SIZE (mode) > 4)
3137 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3138 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3139 return 1;
3141 else
3143 /* Ignore sign extension. */
3144 val &= (HOST_WIDE_INT) 0xffffffff;
3146 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3147 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
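/* For instance (illustrative): 0x12340000 has its only non-zero bits
   in the single 16-bit chunk at position 16, so it is MOVZ-encodable
   in both SImode and DImode; 0xabcd000000000000 fits the chunk at
   position 48 and is accepted for DImode; 0x12345678 spans two chunks
   and is rejected here, leaving it to aarch64_bitmask_imm (or a
   multi-instruction sequence).  Note that aarch64_move_imm below also
   tries ~val, which picks up the corresponding MOVN forms.  */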
3150 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3152 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3154 0x0000000100000001ull,
3155 0x0001000100010001ull,
3156 0x0101010101010101ull,
3157 0x1111111111111111ull,
3158 0x5555555555555555ull,
3162 /* Return true if val is a valid bitmask immediate. */
3164 bool
3165 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3167 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3168 int bits;
3170 /* Check for a single sequence of one bits and return quickly if so.
3171 The special cases of all ones and all zeroes return false. */
3172 val = (unsigned HOST_WIDE_INT) val_in;
3173 tmp = val + (val & -val);
3175 if (tmp == (tmp & -tmp))
3176 return (val + 1) > 1;
3178 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3179 if (mode == SImode)
3180 val = (val << 32) | (val & 0xffffffff);
3182 /* Invert if the immediate doesn't start with a zero bit - this means we
3183 only need to search for sequences of one bits. */
3184 if (val & 1)
3185 val = ~val;
3187 /* Find the first set bit and set tmp to val with the first sequence of one
3188 bits removed. Return success if there is a single sequence of ones. */
3189 first_one = val & -val;
3190 tmp = val & (val + first_one);
3192 if (tmp == 0)
3193 return true;
3195 /* Find the next set bit and compute the difference in bit position. */
3196 next_one = tmp & -tmp;
3197 bits = clz_hwi (first_one) - clz_hwi (next_one);
3198 mask = val ^ tmp;
3200 /* Check the bit position difference is a power of 2, and that the first
3201 sequence of one bits fits within 'bits' bits. */
3202 if ((mask >> bits) != 0 || bits != (bits & -bits))
3203 return false;
3205 /* Check the sequence of one bits is repeated 64/bits times. */
3206 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
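/* A worked run of the algorithm above (illustrative): val ==
   0x3333333333333333, the 64-bit repetition of binary 0011.  The quick
   single-run test fails; the value starts with a one bit, so it is
   inverted to 0xcccccccccccccccc; the first run of ones starts at bit 2
   and the next run 4 bit positions later; the run fits within those 4
   bits; and 0xc * 0x1111111111111111 reproduces the inverted value, so
   the constant is accepted as a bitmask (logical) immediate.  By
   contrast, 0x12345678 is rejected.  */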
3210 /* Return true if val is an immediate that can be loaded into a
3211 register in a single instruction. */
3212 bool
3213 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3215 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3216 return 1;
3217 return aarch64_bitmask_imm (val, mode);
3220 static bool
3221 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3223 rtx base, offset;
3225 if (GET_CODE (x) == HIGH)
3226 return true;
3228 split_const (x, &base, &offset);
3229 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3231 if (aarch64_classify_symbol (base, offset)
3232 != SYMBOL_FORCE_TO_MEM)
3233 return true;
3234 else
3235 /* Avoid generating a 64-bit relocation in ILP32; leave it
3236 to aarch64_expand_mov_immediate to handle it properly. */
3237 return mode != ptr_mode;
3240 return aarch64_tls_referenced_p (x);
3243 /* Return true if register REGNO is a valid index register.
3244 STRICT_P is true if REG_OK_STRICT is in effect. */
3246 bool
3247 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3249 if (!HARD_REGISTER_NUM_P (regno))
3251 if (!strict_p)
3252 return true;
3254 if (!reg_renumber)
3255 return false;
3257 regno = reg_renumber[regno];
3259 return GP_REGNUM_P (regno);
3262 /* Return true if register REGNO is a valid base register for mode MODE.
3263 STRICT_P is true if REG_OK_STRICT is in effect. */
3265 bool
3266 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3268 if (!HARD_REGISTER_NUM_P (regno))
3270 if (!strict_p)
3271 return true;
3273 if (!reg_renumber)
3274 return false;
3276 regno = reg_renumber[regno];
3279 /* The fake registers will be eliminated to either the stack or
3280 hard frame pointer, both of which are usually valid base registers.
3281 Reload deals with the cases where the eliminated form isn't valid. */
3282 return (GP_REGNUM_P (regno)
3283 || regno == SP_REGNUM
3284 || regno == FRAME_POINTER_REGNUM
3285 || regno == ARG_POINTER_REGNUM);
3288 /* Return true if X is a valid base register for mode MODE.
3289 STRICT_P is true if REG_OK_STRICT is in effect. */
3291 static bool
3292 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3294 if (!strict_p && GET_CODE (x) == SUBREG)
3295 x = SUBREG_REG (x);
3297 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3300 /* Return true if address offset is a valid index. If it is, fill in INFO
3301 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3303 static bool
3304 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3305 machine_mode mode, bool strict_p)
3307 enum aarch64_address_type type;
3308 rtx index;
3309 int shift;
3311 /* (reg:P) */
3312 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3313 && GET_MODE (x) == Pmode)
3315 type = ADDRESS_REG_REG;
3316 index = x;
3317 shift = 0;
3319 /* (sign_extend:DI (reg:SI)) */
3320 else if ((GET_CODE (x) == SIGN_EXTEND
3321 || GET_CODE (x) == ZERO_EXTEND)
3322 && GET_MODE (x) == DImode
3323 && GET_MODE (XEXP (x, 0)) == SImode)
3325 type = (GET_CODE (x) == SIGN_EXTEND)
3326 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3327 index = XEXP (x, 0);
3328 shift = 0;
3330 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3331 else if (GET_CODE (x) == MULT
3332 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3333 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3334 && GET_MODE (XEXP (x, 0)) == DImode
3335 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3336 && CONST_INT_P (XEXP (x, 1)))
3338 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3339 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3340 index = XEXP (XEXP (x, 0), 0);
3341 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3343 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3344 else if (GET_CODE (x) == ASHIFT
3345 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3346 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3347 && GET_MODE (XEXP (x, 0)) == DImode
3348 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3349 && CONST_INT_P (XEXP (x, 1)))
3351 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3352 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3353 index = XEXP (XEXP (x, 0), 0);
3354 shift = INTVAL (XEXP (x, 1));
3356 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3357 else if ((GET_CODE (x) == SIGN_EXTRACT
3358 || GET_CODE (x) == ZERO_EXTRACT)
3359 && GET_MODE (x) == DImode
3360 && GET_CODE (XEXP (x, 0)) == MULT
3361 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3362 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3364 type = (GET_CODE (x) == SIGN_EXTRACT)
3365 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3366 index = XEXP (XEXP (x, 0), 0);
3367 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3368 if (INTVAL (XEXP (x, 1)) != 32 + shift
3369 || INTVAL (XEXP (x, 2)) != 0)
3370 shift = -1;
3372 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3373 (const_int 0xffffffff<<shift)) */
3374 else if (GET_CODE (x) == AND
3375 && GET_MODE (x) == DImode
3376 && GET_CODE (XEXP (x, 0)) == MULT
3377 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3378 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3379 && CONST_INT_P (XEXP (x, 1)))
3381 type = ADDRESS_REG_UXTW;
3382 index = XEXP (XEXP (x, 0), 0);
3383 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3384 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3385 shift = -1;
3387 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3388 else if ((GET_CODE (x) == SIGN_EXTRACT
3389 || GET_CODE (x) == ZERO_EXTRACT)
3390 && GET_MODE (x) == DImode
3391 && GET_CODE (XEXP (x, 0)) == ASHIFT
3392 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3393 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3395 type = (GET_CODE (x) == SIGN_EXTRACT)
3396 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3397 index = XEXP (XEXP (x, 0), 0);
3398 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3399 if (INTVAL (XEXP (x, 1)) != 32 + shift
3400 || INTVAL (XEXP (x, 2)) != 0)
3401 shift = -1;
3403 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3404 (const_int 0xffffffff<<shift)) */
3405 else if (GET_CODE (x) == AND
3406 && GET_MODE (x) == DImode
3407 && GET_CODE (XEXP (x, 0)) == ASHIFT
3408 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3409 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3410 && CONST_INT_P (XEXP (x, 1)))
3412 type = ADDRESS_REG_UXTW;
3413 index = XEXP (XEXP (x, 0), 0);
3414 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3415 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3416 shift = -1;
3418 /* (mult:P (reg:P) (const_int scale)) */
3419 else if (GET_CODE (x) == MULT
3420 && GET_MODE (x) == Pmode
3421 && GET_MODE (XEXP (x, 0)) == Pmode
3422 && CONST_INT_P (XEXP (x, 1)))
3424 type = ADDRESS_REG_REG;
3425 index = XEXP (x, 0);
3426 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3428 /* (ashift:P (reg:P) (const_int shift)) */
3429 else if (GET_CODE (x) == ASHIFT
3430 && GET_MODE (x) == Pmode
3431 && GET_MODE (XEXP (x, 0)) == Pmode
3432 && CONST_INT_P (XEXP (x, 1)))
3434 type = ADDRESS_REG_REG;
3435 index = XEXP (x, 0);
3436 shift = INTVAL (XEXP (x, 1));
3438 else
3439 return false;
3441 if (GET_CODE (index) == SUBREG)
3442 index = SUBREG_REG (index);
3444 if ((shift == 0 ||
3445 (shift > 0 && shift <= 3
3446 && (1 << shift) == GET_MODE_SIZE (mode)))
3447 && REG_P (index)
3448 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3450 info->type = type;
3451 info->offset = index;
3452 info->shift = shift;
3453 return true;
3456 return false;
3459 bool
3460 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3462 return (offset >= -64 * GET_MODE_SIZE (mode)
3463 && offset < 64 * GET_MODE_SIZE (mode)
3464 && offset % GET_MODE_SIZE (mode) == 0);
3467 static inline bool
3468 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3469 HOST_WIDE_INT offset)
3471 return offset >= -256 && offset < 256;
3474 static inline bool
3475 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3477 return (offset >= 0
3478 && offset < 4096 * GET_MODE_SIZE (mode)
3479 && offset % GET_MODE_SIZE (mode) == 0);
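/* Concretely (illustrative, for 8-byte DImode accesses): the 7-bit
   signed scaled form covers multiples of 8 in [-512, 504], the 9-bit
   signed unscaled form covers any offset in [-256, 255], and the
   12-bit unsigned scaled form covers multiples of 8 in [0, 32760].
   So an offset of 264 is representable only in the scaled forms,
   while an offset of -12 fits only the 9-bit unscaled form.  */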
3482 /* Return true if MODE is one of the modes for which we
3483 support LDP/STP operations. */
3485 static bool
3486 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3488 return mode == SImode || mode == DImode
3489 || mode == SFmode || mode == DFmode
3490 || (aarch64_vector_mode_supported_p (mode)
3491 && GET_MODE_SIZE (mode) == 8);
3494 /* Return true if X is a valid address for machine mode MODE. If it is,
3495 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3496 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3498 static bool
3499 aarch64_classify_address (struct aarch64_address_info *info,
3500 rtx x, machine_mode mode,
3501 RTX_CODE outer_code, bool strict_p)
3503 enum rtx_code code = GET_CODE (x);
3504 rtx op0, op1;
3506 /* On BE, we use load/store pair for all large int mode load/stores. */
3507 bool load_store_pair_p = (outer_code == PARALLEL
3508 || (BYTES_BIG_ENDIAN
3509 && aarch64_vect_struct_mode_p (mode)));
3511 bool allow_reg_index_p =
3512 !load_store_pair_p
3513 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3514 && !aarch64_vect_struct_mode_p (mode);
3516 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3517 REG addressing. */
3518 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3519 && (code != POST_INC && code != REG))
3520 return false;
3522 switch (code)
3524 case REG:
3525 case SUBREG:
3526 info->type = ADDRESS_REG_IMM;
3527 info->base = x;
3528 info->offset = const0_rtx;
3529 return aarch64_base_register_rtx_p (x, strict_p);
3531 case PLUS:
3532 op0 = XEXP (x, 0);
3533 op1 = XEXP (x, 1);
3535 if (! strict_p
3536 && REG_P (op0)
3537 && (op0 == virtual_stack_vars_rtx
3538 || op0 == frame_pointer_rtx
3539 || op0 == arg_pointer_rtx)
3540 && CONST_INT_P (op1))
3542 info->type = ADDRESS_REG_IMM;
3543 info->base = op0;
3544 info->offset = op1;
3546 return true;
3549 if (GET_MODE_SIZE (mode) != 0
3550 && CONST_INT_P (op1)
3551 && aarch64_base_register_rtx_p (op0, strict_p))
3553 HOST_WIDE_INT offset = INTVAL (op1);
3555 info->type = ADDRESS_REG_IMM;
3556 info->base = op0;
3557 info->offset = op1;
3559 /* TImode and TFmode values are allowed in both pairs of X
3560 registers and individual Q registers. The available
3561 address modes are:
3562 X,X: 7-bit signed scaled offset
3563 Q: 9-bit signed offset
3564 We conservatively require an offset representable in either mode.
3566 if (mode == TImode || mode == TFmode)
3567 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3568 && offset_9bit_signed_unscaled_p (mode, offset));
3570 /* A 7-bit offset check because OImode will emit an ldp/stp
3571 instruction (only big endian will get here).
3572 For ldp/stp instructions, the offset is scaled for the size of a
3573 single element of the pair. */
3574 if (mode == OImode)
3575 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3577 /* Three 9/12-bit offset checks because CImode will emit three
3578 ldr/str instructions (only big endian will get here). */
3579 if (mode == CImode)
3580 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3581 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3582 || offset_12bit_unsigned_scaled_p (V16QImode,
3583 offset + 32)));
3585 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3586 instructions (only big endian will get here). */
3587 if (mode == XImode)
3588 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3589 && aarch64_offset_7bit_signed_scaled_p (TImode,
3590 offset + 32));
3592 if (load_store_pair_p)
3593 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3594 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3595 else
3596 return (offset_9bit_signed_unscaled_p (mode, offset)
3597 || offset_12bit_unsigned_scaled_p (mode, offset));
3600 if (allow_reg_index_p)
3602 /* Look for base + (scaled/extended) index register. */
3603 if (aarch64_base_register_rtx_p (op0, strict_p)
3604 && aarch64_classify_index (info, op1, mode, strict_p))
3606 info->base = op0;
3607 return true;
3609 if (aarch64_base_register_rtx_p (op1, strict_p)
3610 && aarch64_classify_index (info, op0, mode, strict_p))
3612 info->base = op1;
3613 return true;
3617 return false;
3619 case POST_INC:
3620 case POST_DEC:
3621 case PRE_INC:
3622 case PRE_DEC:
3623 info->type = ADDRESS_REG_WB;
3624 info->base = XEXP (x, 0);
3625 info->offset = NULL_RTX;
3626 return aarch64_base_register_rtx_p (info->base, strict_p);
3628 case POST_MODIFY:
3629 case PRE_MODIFY:
3630 info->type = ADDRESS_REG_WB;
3631 info->base = XEXP (x, 0);
3632 if (GET_CODE (XEXP (x, 1)) == PLUS
3633 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3634 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3635 && aarch64_base_register_rtx_p (info->base, strict_p))
3637 HOST_WIDE_INT offset;
3638 info->offset = XEXP (XEXP (x, 1), 1);
3639 offset = INTVAL (info->offset);
3641 /* TImode and TFmode values are allowed in both pairs of X
3642 registers and individual Q registers. The available
3643 address modes are:
3644 X,X: 7-bit signed scaled offset
3645 Q: 9-bit signed offset
3646 We conservatively require an offset representable in either mode.
3648 if (mode == TImode || mode == TFmode)
3649 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3650 && offset_9bit_signed_unscaled_p (mode, offset));
3652 if (load_store_pair_p)
3653 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3654 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3655 else
3656 return offset_9bit_signed_unscaled_p (mode, offset);
3658 return false;
3660 case CONST:
3661 case SYMBOL_REF:
3662 case LABEL_REF:
3663 /* load literal: pc-relative constant pool entry. Only supported
3664 for SI mode or larger. */
3665 info->type = ADDRESS_SYMBOLIC;
3667 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3669 rtx sym, addend;
3671 split_const (x, &sym, &addend);
3672 return ((GET_CODE (sym) == LABEL_REF
3673 || (GET_CODE (sym) == SYMBOL_REF
3674 && CONSTANT_POOL_ADDRESS_P (sym)
3675 && !aarch64_nopcrelative_literal_loads)));
3677 return false;
3679 case LO_SUM:
3680 info->type = ADDRESS_LO_SUM;
3681 info->base = XEXP (x, 0);
3682 info->offset = XEXP (x, 1);
3683 if (allow_reg_index_p
3684 && aarch64_base_register_rtx_p (info->base, strict_p))
3686 rtx sym, offs;
3687 split_const (info->offset, &sym, &offs);
3688 if (GET_CODE (sym) == SYMBOL_REF
3689 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
3691 /* The symbol and offset must be aligned to the access size. */
3692 unsigned int align;
3693 unsigned int ref_size;
3695 if (CONSTANT_POOL_ADDRESS_P (sym))
3696 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3697 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3699 tree exp = SYMBOL_REF_DECL (sym);
3700 align = TYPE_ALIGN (TREE_TYPE (exp));
3701 align = CONSTANT_ALIGNMENT (exp, align);
3703 else if (SYMBOL_REF_DECL (sym))
3704 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3705 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3706 && SYMBOL_REF_BLOCK (sym) != NULL)
3707 align = SYMBOL_REF_BLOCK (sym)->alignment;
3708 else
3709 align = BITS_PER_UNIT;
3711 ref_size = GET_MODE_SIZE (mode);
3712 if (ref_size == 0)
3713 ref_size = GET_MODE_SIZE (DImode);
3715 return ((INTVAL (offs) & (ref_size - 1)) == 0
3716 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3719 return false;
3721 default:
3722 return false;
3726 bool
3727 aarch64_symbolic_address_p (rtx x)
3729 rtx offset;
3731 split_const (x, &x, &offset);
3732 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3735 /* Classify the base of symbolic expression X. */
3737 enum aarch64_symbol_type
3738 aarch64_classify_symbolic_expression (rtx x)
3740 rtx offset;
3742 split_const (x, &x, &offset);
3743 return aarch64_classify_symbol (x, offset);
3747 /* Return TRUE if X is a legitimate address for accessing memory in
3748 mode MODE. */
3749 static bool
3750 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3752 struct aarch64_address_info addr;
3754 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3757 /* Return TRUE if X is a legitimate address for accessing memory in
3758 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3759 pair operation. */
3760 bool
3761 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3762 RTX_CODE outer_code, bool strict_p)
3764 struct aarch64_address_info addr;
3766 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3769 /* Return TRUE if rtx X is immediate constant 0.0 */
3770 bool
3771 aarch64_float_const_zero_rtx_p (rtx x)
3773 if (GET_MODE (x) == VOIDmode)
3774 return false;
3776 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
3777 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3778 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
3781 /* Return the fixed registers used for condition codes. */
3783 static bool
3784 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3786 *p1 = CC_REGNUM;
3787 *p2 = INVALID_REGNUM;
3788 return true;
3791 /* Emit call insn with PAT and do aarch64-specific handling. */
3793 void
3794 aarch64_emit_call_insn (rtx pat)
3796 rtx insn = emit_call_insn (pat);
3798 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3799 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3800 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3803 machine_mode
3804 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3806 /* All floating point compares return CCFP if it is an equality
3807 comparison, and CCFPE otherwise. */
3808 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3810 switch (code)
3812 case EQ:
3813 case NE:
3814 case UNORDERED:
3815 case ORDERED:
3816 case UNLT:
3817 case UNLE:
3818 case UNGT:
3819 case UNGE:
3820 case UNEQ:
3821 case LTGT:
3822 return CCFPmode;
3824 case LT:
3825 case LE:
3826 case GT:
3827 case GE:
3828 return CCFPEmode;
3830 default:
3831 gcc_unreachable ();
3835 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3836 && y == const0_rtx
3837 && (code == EQ || code == NE || code == LT || code == GE)
3838 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3839 || GET_CODE (x) == NEG))
3840 return CC_NZmode;
3842 /* A compare with a shifted operand. Because of canonicalization,
3843 the comparison will have to be swapped when we emit the assembly
3844 code. */
3845 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3846 && (REG_P (y) || GET_CODE (y) == SUBREG)
3847 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3848 || GET_CODE (x) == LSHIFTRT
3849 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3850 return CC_SWPmode;
3852 /* Similarly for a negated operand, but we can only do this for
3853 equalities. */
3854 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3855 && (REG_P (y) || GET_CODE (y) == SUBREG)
3856 && (code == EQ || code == NE)
3857 && GET_CODE (x) == NEG)
3858 return CC_Zmode;
3860 /* A compare of a mode narrower than SI mode against zero can be done
3861 by extending the value in the comparison. */
3862 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3863 && y == const0_rtx)
3864 /* Only use sign-extension if we really need it. */
3865 return ((code == GT || code == GE || code == LE || code == LT)
3866 ? CC_SESWPmode : CC_ZESWPmode);
3868 /* For everything else, return CCmode. */
3869 return CCmode;
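/* A hedged example of the swapped-operand case above: for a comparison
   such as (lt (ashift:DI x 3) y), the shifted operand has to appear as
   the second operand of the actual CMP instruction (roughly
   "cmp y, x, lsl #3"), so CC_SWPmode is returned and
   aarch64_get_condition_code_1 below maps LT to the AArch64 GT
   condition for the swapped comparison.  */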
3872 static int
3873 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3876 aarch64_get_condition_code (rtx x)
3878 machine_mode mode = GET_MODE (XEXP (x, 0));
3879 enum rtx_code comp_code = GET_CODE (x);
3881 if (GET_MODE_CLASS (mode) != MODE_CC)
3882 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3883 return aarch64_get_condition_code_1 (mode, comp_code);
3886 static int
3887 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3889 int ne = -1, eq = -1;
3890 switch (mode)
3892 case CCFPmode:
3893 case CCFPEmode:
3894 switch (comp_code)
3896 case GE: return AARCH64_GE;
3897 case GT: return AARCH64_GT;
3898 case LE: return AARCH64_LS;
3899 case LT: return AARCH64_MI;
3900 case NE: return AARCH64_NE;
3901 case EQ: return AARCH64_EQ;
3902 case ORDERED: return AARCH64_VC;
3903 case UNORDERED: return AARCH64_VS;
3904 case UNLT: return AARCH64_LT;
3905 case UNLE: return AARCH64_LE;
3906 case UNGT: return AARCH64_HI;
3907 case UNGE: return AARCH64_PL;
3908 default: return -1;
3910 break;
3912 case CC_DNEmode:
3913 ne = AARCH64_NE;
3914 eq = AARCH64_EQ;
3915 break;
3917 case CC_DEQmode:
3918 ne = AARCH64_EQ;
3919 eq = AARCH64_NE;
3920 break;
3922 case CC_DGEmode:
3923 ne = AARCH64_GE;
3924 eq = AARCH64_LT;
3925 break;
3927 case CC_DLTmode:
3928 ne = AARCH64_LT;
3929 eq = AARCH64_GE;
3930 break;
3932 case CC_DGTmode:
3933 ne = AARCH64_GT;
3934 eq = AARCH64_LE;
3935 break;
3937 case CC_DLEmode:
3938 ne = AARCH64_LE;
3939 eq = AARCH64_GT;
3940 break;
3942 case CC_DGEUmode:
3943 ne = AARCH64_CS;
3944 eq = AARCH64_CC;
3945 break;
3947 case CC_DLTUmode:
3948 ne = AARCH64_CC;
3949 eq = AARCH64_CS;
3950 break;
3952 case CC_DGTUmode:
3953 ne = AARCH64_HI;
3954 eq = AARCH64_LS;
3955 break;
3957 case CC_DLEUmode:
3958 ne = AARCH64_LS;
3959 eq = AARCH64_HI;
3960 break;
3962 case CCmode:
3963 switch (comp_code)
3965 case NE: return AARCH64_NE;
3966 case EQ: return AARCH64_EQ;
3967 case GE: return AARCH64_GE;
3968 case GT: return AARCH64_GT;
3969 case LE: return AARCH64_LE;
3970 case LT: return AARCH64_LT;
3971 case GEU: return AARCH64_CS;
3972 case GTU: return AARCH64_HI;
3973 case LEU: return AARCH64_LS;
3974 case LTU: return AARCH64_CC;
3975 default: return -1;
3977 break;
3979 case CC_SWPmode:
3980 case CC_ZESWPmode:
3981 case CC_SESWPmode:
3982 switch (comp_code)
3984 case NE: return AARCH64_NE;
3985 case EQ: return AARCH64_EQ;
3986 case GE: return AARCH64_LE;
3987 case GT: return AARCH64_LT;
3988 case LE: return AARCH64_GE;
3989 case LT: return AARCH64_GT;
3990 case GEU: return AARCH64_LS;
3991 case GTU: return AARCH64_CC;
3992 case LEU: return AARCH64_CS;
3993 case LTU: return AARCH64_HI;
3994 default: return -1;
3996 break;
3998 case CC_NZmode:
3999 switch (comp_code)
4001 case NE: return AARCH64_NE;
4002 case EQ: return AARCH64_EQ;
4003 case GE: return AARCH64_PL;
4004 case LT: return AARCH64_MI;
4005 default: return -1;
4007 break;
4009 case CC_Zmode:
4010 switch (comp_code)
4012 case NE: return AARCH64_NE;
4013 case EQ: return AARCH64_EQ;
4014 default: return -1;
4016 break;
4018 default:
4019 return -1;
4020 break;
4023 if (comp_code == NE)
4024 return ne;
4026 if (comp_code == EQ)
4027 return eq;
4029 return -1;
4032 bool
4033 aarch64_const_vec_all_same_in_range_p (rtx x,
4034 HOST_WIDE_INT minval,
4035 HOST_WIDE_INT maxval)
4037 HOST_WIDE_INT firstval;
4038 int count, i;
4040 if (GET_CODE (x) != CONST_VECTOR
4041 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4042 return false;
4044 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4045 if (firstval < minval || firstval > maxval)
4046 return false;
4048 count = CONST_VECTOR_NUNITS (x);
4049 for (i = 1; i < count; i++)
4050 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4051 return false;
4053 return true;
4056 bool
4057 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4059 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4063 /* N Z C V. */
4064 #define AARCH64_CC_V 1
4065 #define AARCH64_CC_C (1 << 1)
4066 #define AARCH64_CC_Z (1 << 2)
4067 #define AARCH64_CC_N (1 << 3)
4069 /* N Z C V flags for ccmp. The first code is for AND op and the other
4070 is for IOR op. Indexed by AARCH64_COND_CODE. */
4071 static const int aarch64_nzcv_codes[][2] =
4073 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4074 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4075 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4076 {0, AARCH64_CC_C}, /* CC, C == 0. */
4077 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4078 {0, AARCH64_CC_N}, /* PL, N == 0. */
4079 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4080 {0, AARCH64_CC_V}, /* VC, V == 0. */
4081 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4082 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4083 {0, AARCH64_CC_V}, /* GE, N == V. */
4084 {AARCH64_CC_V, 0}, /* LT, N != V. */
4085 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4086 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4087 {0, 0}, /* AL, Any. */
4088 {0, 0}, /* NV, Any. */
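/* Editorial illustration (added; not part of the original source): each
   entry above packs an immediate for the CCMP instruction using the bit
   layout defined by AARCH64_CC_{N,Z,C,V} above (N=8, Z=4, C=2, V=1).  The
   first column is used when conditions are combined with AND, the second
   with IOR; they are emitted through the '%K' and '%k' operand modifiers
   further down in aarch64_print_operand.  A readable, non-compiled
   sketch:  */
#if 0
int and_nzcv = aarch64_nzcv_codes[AARCH64_MI][0];   /* AARCH64_CC_N == 8 */
int ior_nzcv = aarch64_nzcv_codes[AARCH64_MI][1];   /* 0 */
#endif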
4092 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4094 switch (mode)
4096 case CC_DNEmode:
4097 return NE;
4099 case CC_DEQmode:
4100 return EQ;
4102 case CC_DLEmode:
4103 return LE;
4105 case CC_DGTmode:
4106 return GT;
4108 case CC_DLTmode:
4109 return LT;
4111 case CC_DGEmode:
4112 return GE;
4114 case CC_DLEUmode:
4115 return LEU;
4117 case CC_DGTUmode:
4118 return GTU;
4120 case CC_DLTUmode:
4121 return LTU;
4123 case CC_DGEUmode:
4124 return GEU;
4126 default:
4127 gcc_unreachable ();
4132 static void
4133 aarch64_print_operand (FILE *f, rtx x, int code)
4135 switch (code)
4137 /* An integer or symbol address without a preceding # sign. */
4138 case 'c':
4139 switch (GET_CODE (x))
4141 case CONST_INT:
4142 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4143 break;
4145 case SYMBOL_REF:
4146 output_addr_const (f, x);
4147 break;
4149 case CONST:
4150 if (GET_CODE (XEXP (x, 0)) == PLUS
4151 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4153 output_addr_const (f, x);
4154 break;
4156 /* Fall through. */
4158 default:
4159 output_operand_lossage ("Unsupported operand for code '%c'", code);
4161 break;
4163 case 'e':
4164 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4166 int n;
4168 if (!CONST_INT_P (x)
4169 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4171 output_operand_lossage ("invalid operand for '%%%c'", code);
4172 return;
4175 switch (n)
4177 case 3:
4178 fputc ('b', f);
4179 break;
4180 case 4:
4181 fputc ('h', f);
4182 break;
4183 case 5:
4184 fputc ('w', f);
4185 break;
4186 default:
4187 output_operand_lossage ("invalid operand for '%%%c'", code);
4188 return;
4191 break;
4193 case 'p':
4195 int n;
4197 /* Print N such that 2^N == X. */
4198 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4200 output_operand_lossage ("invalid operand for '%%%c'", code);
4201 return;
4204 asm_fprintf (f, "%d", n);
4206 break;
4208 case 'P':
4209 /* Print the number of non-zero bits in X (a const_int). */
4210 if (!CONST_INT_P (x))
4212 output_operand_lossage ("invalid operand for '%%%c'", code);
4213 return;
4216 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4217 break;
4219 case 'H':
4220 /* Print the higher numbered register of a pair (TImode) of regs. */
4221 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4223 output_operand_lossage ("invalid operand for '%%%c'", code);
4224 return;
4227 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4228 break;
4230 case 'm':
4232 int cond_code;
4233 /* Print a condition (eq, ne, etc). */
4235 /* CONST_TRUE_RTX means always -- that's the default. */
4236 if (x == const_true_rtx)
4237 return;
4239 if (!COMPARISON_P (x))
4241 output_operand_lossage ("invalid operand for '%%%c'", code);
4242 return;
4245 cond_code = aarch64_get_condition_code (x);
4246 gcc_assert (cond_code >= 0);
4247 fputs (aarch64_condition_codes[cond_code], f);
4249 break;
4251 case 'M':
4253 int cond_code;
4254 /* Print the inverse of a condition (eq <-> ne, etc). */
4256 /* CONST_TRUE_RTX means never -- that's the default. */
4257 if (x == const_true_rtx)
4259 fputs ("nv", f);
4260 return;
4263 if (!COMPARISON_P (x))
4265 output_operand_lossage ("invalid operand for '%%%c'", code);
4266 return;
4268 cond_code = aarch64_get_condition_code (x);
4269 gcc_assert (cond_code >= 0);
4270 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4271 (cond_code)], f);
4273 break;
4275 case 'b':
4276 case 'h':
4277 case 's':
4278 case 'd':
4279 case 'q':
4280 /* Print a scalar FP/SIMD register name. */
4281 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4283 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4284 return;
4286 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4287 break;
4289 case 'S':
4290 case 'T':
4291 case 'U':
4292 case 'V':
4293 /* Print the first FP/SIMD register name in a list. */
4294 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4296 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4297 return;
4299 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4300 break;
4302 case 'R':
4303 /* Print a scalar FP/SIMD register name + 1. */
4304 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4306 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4307 return;
4309 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4310 break;
4312 case 'X':
4313 /* Print bottom 16 bits of integer constant in hex. */
4314 if (!CONST_INT_P (x))
4316 output_operand_lossage ("invalid operand for '%%%c'", code);
4317 return;
4319 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4320 break;
4322 case 'w':
4323 case 'x':
4324 /* Print a general register name or the zero register (32-bit or
4325 64-bit). */
4326 if (x == const0_rtx
4327 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4329 asm_fprintf (f, "%czr", code);
4330 break;
4333 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4335 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4336 break;
4339 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4341 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4342 break;
4345 /* Fall through */
4347 case 0:
4348 /* Print a normal operand. If it's a general register, then we
4349 assume DImode. */
4350 if (x == NULL)
4352 output_operand_lossage ("missing operand");
4353 return;
4356 switch (GET_CODE (x))
4358 case REG:
4359 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4360 break;
4362 case MEM:
4363 output_address (GET_MODE (x), XEXP (x, 0));
4364 break;
4366 case CONST:
4367 case LABEL_REF:
4368 case SYMBOL_REF:
4369 output_addr_const (asm_out_file, x);
4370 break;
4372 case CONST_INT:
4373 asm_fprintf (f, "%wd", INTVAL (x));
4374 break;
4376 case CONST_VECTOR:
4377 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4379 gcc_assert (
4380 aarch64_const_vec_all_same_in_range_p (x,
4381 HOST_WIDE_INT_MIN,
4382 HOST_WIDE_INT_MAX));
4383 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4385 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4387 fputc ('0', f);
4389 else
4390 gcc_unreachable ();
4391 break;
4393 case CONST_DOUBLE:
4394 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4395 be getting CONST_DOUBLEs holding integers. */
4396 gcc_assert (GET_MODE (x) != VOIDmode);
4397 if (aarch64_float_const_zero_rtx_p (x))
4399 fputc ('0', f);
4400 break;
4402 else if (aarch64_float_const_representable_p (x))
4404 #define buf_size 20
4405 char float_buf[buf_size] = {'\0'};
4406 real_to_decimal_for_mode (float_buf,
4407 CONST_DOUBLE_REAL_VALUE (x),
4408 buf_size, buf_size,
4409 1, GET_MODE (x));
4410 asm_fprintf (asm_out_file, "%s", float_buf);
4411 break;
4412 #undef buf_size
4414 output_operand_lossage ("invalid constant");
4415 return;
4416 default:
4417 output_operand_lossage ("invalid operand");
4418 return;
4420 break;
4422 case 'A':
4423 if (GET_CODE (x) == HIGH)
4424 x = XEXP (x, 0);
4426 switch (aarch64_classify_symbolic_expression (x))
4428 case SYMBOL_SMALL_GOT_4G:
4429 asm_fprintf (asm_out_file, ":got:");
4430 break;
4432 case SYMBOL_SMALL_TLSGD:
4433 asm_fprintf (asm_out_file, ":tlsgd:");
4434 break;
4436 case SYMBOL_SMALL_TLSDESC:
4437 asm_fprintf (asm_out_file, ":tlsdesc:");
4438 break;
4440 case SYMBOL_SMALL_TLSIE:
4441 asm_fprintf (asm_out_file, ":gottprel:");
4442 break;
4444 case SYMBOL_TLSLE24:
4445 asm_fprintf (asm_out_file, ":tprel:");
4446 break;
4448 case SYMBOL_TINY_GOT:
4449 gcc_unreachable ();
4450 break;
4452 default:
4453 break;
4455 output_addr_const (asm_out_file, x);
4456 break;
4458 case 'L':
4459 switch (aarch64_classify_symbolic_expression (x))
4461 case SYMBOL_SMALL_GOT_4G:
4462 asm_fprintf (asm_out_file, ":lo12:");
4463 break;
4465 case SYMBOL_SMALL_TLSGD:
4466 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4467 break;
4469 case SYMBOL_SMALL_TLSDESC:
4470 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4471 break;
4473 case SYMBOL_SMALL_TLSIE:
4474 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4475 break;
4477 case SYMBOL_TLSLE12:
4478 asm_fprintf (asm_out_file, ":tprel_lo12:");
4479 break;
4481 case SYMBOL_TLSLE24:
4482 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4483 break;
4485 case SYMBOL_TINY_GOT:
4486 asm_fprintf (asm_out_file, ":got:");
4487 break;
4489 case SYMBOL_TINY_TLSIE:
4490 asm_fprintf (asm_out_file, ":gottprel:");
4491 break;
4493 default:
4494 break;
4496 output_addr_const (asm_out_file, x);
4497 break;
4499 case 'G':
4501 switch (aarch64_classify_symbolic_expression (x))
4503 case SYMBOL_TLSLE24:
4504 asm_fprintf (asm_out_file, ":tprel_hi12:");
4505 break;
4506 default:
4507 break;
4509 output_addr_const (asm_out_file, x);
4510 break;
4512 case 'K':
4514 int cond_code;
4515 /* Print nzcv. */
4517 if (!COMPARISON_P (x))
4519 output_operand_lossage ("invalid operand for '%%%c'", code);
4520 return;
4523 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4524 gcc_assert (cond_code >= 0);
4525 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4527 break;
4529 case 'k':
4531 int cond_code;
4532 /* Print nzcv. */
4534 if (!COMPARISON_P (x))
4536 output_operand_lossage ("invalid operand for '%%%c'", code);
4537 return;
4540 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4541 gcc_assert (cond_code >= 0);
4542 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4544 break;
4546 default:
4547 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4548 return;
4552 static void
4553 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4555 struct aarch64_address_info addr;
4557 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4558 switch (addr.type)
4560 case ADDRESS_REG_IMM:
4561 if (addr.offset == const0_rtx)
4562 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4563 else
4564 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4565 INTVAL (addr.offset));
4566 return;
4568 case ADDRESS_REG_REG:
4569 if (addr.shift == 0)
4570 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4571 reg_names [REGNO (addr.offset)]);
4572 else
4573 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4574 reg_names [REGNO (addr.offset)], addr.shift);
4575 return;
4577 case ADDRESS_REG_UXTW:
4578 if (addr.shift == 0)
4579 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4580 REGNO (addr.offset) - R0_REGNUM);
4581 else
4582 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4583 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4584 return;
4586 case ADDRESS_REG_SXTW:
4587 if (addr.shift == 0)
4588 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4589 REGNO (addr.offset) - R0_REGNUM);
4590 else
4591 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4592 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4593 return;
4595 case ADDRESS_REG_WB:
4596 switch (GET_CODE (x))
4598 case PRE_INC:
4599 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4600 GET_MODE_SIZE (mode));
4601 return;
4602 case POST_INC:
4603 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4604 GET_MODE_SIZE (mode));
4605 return;
4606 case PRE_DEC:
4607 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4608 GET_MODE_SIZE (mode));
4609 return;
4610 case POST_DEC:
4611 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4612 GET_MODE_SIZE (mode));
4613 return;
4614 case PRE_MODIFY:
4615 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4616 INTVAL (addr.offset));
4617 return;
4618 case POST_MODIFY:
4619 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4620 INTVAL (addr.offset));
4621 return;
4622 default:
4623 break;
4625 break;
4627 case ADDRESS_LO_SUM:
4628 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4629 output_addr_const (f, addr.offset);
4630 asm_fprintf (f, "]");
4631 return;
4633 case ADDRESS_SYMBOLIC:
4634 break;
4637 output_addr_const (f, x);
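/* Editorial illustration (added; not part of the original source): with
   hypothetical registers and offsets, the cases above print addresses in
   the usual AArch64 assembly forms, e.g.:

     ADDRESS_REG_IMM    [x0]            or  [x0, 16]
     ADDRESS_REG_REG    [x0, x1]        or  [x0, x1, lsl 3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw]  or  [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw]  or  [x0, w1, sxtw 2]
     ADDRESS_REG_WB     [x0, 16]!   [x0], 16   [x0, -16]!   [x0], -16
     ADDRESS_LO_SUM     [x0, #:lo12:some_symbol]

   Anything not handled falls through to output_addr_const.  */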
4640 bool
4641 aarch64_label_mentioned_p (rtx x)
4643 const char *fmt;
4644 int i;
4646 if (GET_CODE (x) == LABEL_REF)
4647 return true;
4649 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4650 referencing instruction, but they are constant offsets, not
4651 symbols. */
4652 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4653 return false;
4655 fmt = GET_RTX_FORMAT (GET_CODE (x));
4656 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4658 if (fmt[i] == 'E')
4660 int j;
4662 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4663 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4664 return 1;
4666 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4667 return 1;
4670 return 0;
4673 /* Implement REGNO_REG_CLASS. */
4675 enum reg_class
4676 aarch64_regno_regclass (unsigned regno)
4678 if (GP_REGNUM_P (regno))
4679 return GENERAL_REGS;
4681 if (regno == SP_REGNUM)
4682 return STACK_REG;
4684 if (regno == FRAME_POINTER_REGNUM
4685 || regno == ARG_POINTER_REGNUM)
4686 return POINTER_REGS;
4688 if (FP_REGNUM_P (regno))
4689 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4691 return NO_REGS;
4694 static rtx
4695 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4697 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4698 where mask is selected by alignment and size of the offset.
4699 We try to pick as large a range for the offset as possible to
4700 maximize the chance of a CSE. However, for aligned addresses
4701 we limit the range to 4k so that structures with different sized
4702 elements are likely to use the same base. */
4704 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4706 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4707 HOST_WIDE_INT base_offset;
4709 /* Does it look like we'll need a load/store-pair operation? */
4710 if (GET_MODE_SIZE (mode) > 16
4711 || mode == TImode)
4712 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4713 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4714 /* For offsets that aren't a multiple of the access size, the limit is
4715 -256...255. */
4716 else if (offset & (GET_MODE_SIZE (mode) - 1))
4717 base_offset = (offset + 0x100) & ~0x1ff;
4718 else
4719 base_offset = offset & ~0xfff;
4721 if (base_offset == 0)
4722 return x;
4724 offset -= base_offset;
4725 rtx base_reg = gen_reg_rtx (Pmode);
4726 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4727 NULL_RTX);
4728 emit_move_insn (base_reg, val);
4729 x = plus_constant (Pmode, base_reg, offset);
4732 return x;
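/* Editorial illustration (added; not part of the original source): a
   standalone sketch of the anchor/offset split above, using plain C in
   place of the rtl machinery ("mode_size" stands for GET_MODE_SIZE; the
   TImode special case is folded into the size check for brevity).  */
#if 0
#include <stdio.h>

static long
split_base_offset (long offset, long mode_size)
{
  if (mode_size > 16)                   /* Likely a load/store-pair.  */
    return (offset + 64 * mode_size) & ~(128 * mode_size - 1);
  else if (offset & (mode_size - 1))    /* Misaligned: -256..255 range.  */
    return (offset + 0x100) & ~0x1ff;
  else                                  /* Aligned: 4k range.  */
    return offset & ~0xfff;
}

int
main (void)
{
  /* Aligned DImode access at base + 0x12008: anchor 0x12000, residual 8.  */
  printf ("%lx\n", split_base_offset (0x12008, 8));
  /* Misaligned SImode access at base + 0x12345: anchor 0x12400,
     residual -0xbb, which fits the signed 9-bit offset form.  */
  printf ("%lx\n", split_base_offset (0x12345, 4));
  return 0;
}
#endif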
4735 /* Try a machine-dependent way of reloading an illegitimate address
4736 operand. If we find one, push the reload and return the new rtx. */
4739 aarch64_legitimize_reload_address (rtx *x_p,
4740 machine_mode mode,
4741 int opnum, int type,
4742 int ind_levels ATTRIBUTE_UNUSED)
4744 rtx x = *x_p;
4746 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4747 if (aarch64_vect_struct_mode_p (mode)
4748 && GET_CODE (x) == PLUS
4749 && REG_P (XEXP (x, 0))
4750 && CONST_INT_P (XEXP (x, 1)))
4752 rtx orig_rtx = x;
4753 x = copy_rtx (x);
4754 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4755 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4756 opnum, (enum reload_type) type);
4757 return x;
4760 /* We must recognize output that we have already generated ourselves. */
4761 if (GET_CODE (x) == PLUS
4762 && GET_CODE (XEXP (x, 0)) == PLUS
4763 && REG_P (XEXP (XEXP (x, 0), 0))
4764 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4765 && CONST_INT_P (XEXP (x, 1)))
4767 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4768 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4769 opnum, (enum reload_type) type);
4770 return x;
4773 /* We wish to handle large displacements off a base register by splitting
4774 the addend across an add and the mem insn. This can cut the number of
4775 extra insns needed from 3 to 1. It is only useful for load/store of a
4776 single register with 12 bit offset field. */
4777 if (GET_CODE (x) == PLUS
4778 && REG_P (XEXP (x, 0))
4779 && CONST_INT_P (XEXP (x, 1))
4780 && HARD_REGISTER_P (XEXP (x, 0))
4781 && mode != TImode
4782 && mode != TFmode
4783 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4785 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4786 HOST_WIDE_INT low = val & 0xfff;
4787 HOST_WIDE_INT high = val - low;
4788 HOST_WIDE_INT offs;
4789 rtx cst;
4790 machine_mode xmode = GET_MODE (x);
4792 /* In ILP32, xmode can be either DImode or SImode. */
4793 gcc_assert (xmode == DImode || xmode == SImode);
4795 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4796 BLKmode alignment. */
4797 if (GET_MODE_SIZE (mode) == 0)
4798 return NULL_RTX;
4800 offs = low % GET_MODE_SIZE (mode);
4802 /* Align misaligned offset by adjusting high part to compensate. */
4803 if (offs != 0)
4805 if (aarch64_uimm12_shift (high + offs))
4807 /* Align down. */
4808 low = low - offs;
4809 high = high + offs;
4811 else
4813 /* Align up. */
4814 offs = GET_MODE_SIZE (mode) - offs;
4815 low = low + offs;
4816 high = high + (low & 0x1000) - offs;
4817 low &= 0xfff;
4821 /* Check for overflow. */
4822 if (high + low != val)
4823 return NULL_RTX;
4825 cst = GEN_INT (high);
4826 if (!aarch64_uimm12_shift (high))
4827 cst = force_const_mem (xmode, cst);
4829 /* Reload high part into base reg, leaving the low part
4830 in the mem instruction.
4831 Note that replacing this gen_rtx_PLUS with plus_constant is
4832 wrong in this case because we rely on the
4833 (plus (plus reg c1) c2) structure being preserved so that
4834 XEXP (*p, 0) in push_reload below uses the correct term. */
4835 x = gen_rtx_PLUS (xmode,
4836 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4837 GEN_INT (low));
4839 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4840 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4841 opnum, (enum reload_type) type);
4842 return x;
4845 return NULL_RTX;
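/* Editorial illustration (added; not part of the original source): for a
   DImode load at base + 0x13008 the split above gives low = 0x008 and
   high = 0x13000; high is a 12-bit immediate shifted by 12, so the reload
   comes out roughly as (with hypothetical register numbers):

     add  x16, x0, #0x13000
     ldr  x1, [x16, 8]

   A readable, non-compiled fragment of the arithmetic:  */
#if 0
long val  = 0x13008;
long low  = val & 0xfff;        /* 0x008: stays in the load/store.  */
long high = val - low;          /* 0x13000: passes aarch64_uimm12_shift.  */
#endif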
4849 /* Return the reload icode required for a constant pool in mode. */
4850 static enum insn_code
4851 aarch64_constant_pool_reload_icode (machine_mode mode)
4853 switch (mode)
4855 case SFmode:
4856 return CODE_FOR_aarch64_reload_movcpsfdi;
4858 case DFmode:
4859 return CODE_FOR_aarch64_reload_movcpdfdi;
4861 case TFmode:
4862 return CODE_FOR_aarch64_reload_movcptfdi;
4864 case V8QImode:
4865 return CODE_FOR_aarch64_reload_movcpv8qidi;
4867 case V16QImode:
4868 return CODE_FOR_aarch64_reload_movcpv16qidi;
4870 case V4HImode:
4871 return CODE_FOR_aarch64_reload_movcpv4hidi;
4873 case V8HImode:
4874 return CODE_FOR_aarch64_reload_movcpv8hidi;
4876 case V2SImode:
4877 return CODE_FOR_aarch64_reload_movcpv2sidi;
4879 case V4SImode:
4880 return CODE_FOR_aarch64_reload_movcpv4sidi;
4882 case V2DImode:
4883 return CODE_FOR_aarch64_reload_movcpv2didi;
4885 case V2DFmode:
4886 return CODE_FOR_aarch64_reload_movcpv2dfdi;
4888 default:
4889 gcc_unreachable ();
4892 gcc_unreachable ();
4894 static reg_class_t
4895 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4896 reg_class_t rclass,
4897 machine_mode mode,
4898 secondary_reload_info *sri)
4901 /* If we have to disable direct literal pool loads and stores because the
4902 function is too big, then we need a scratch register. */
4903 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
4904 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
4905 || targetm.vector_mode_supported_p (GET_MODE (x)))
4906 && aarch64_nopcrelative_literal_loads)
4908 sri->icode = aarch64_constant_pool_reload_icode (mode);
4909 return NO_REGS;
4912 /* Without the TARGET_SIMD instructions we cannot move a Q register
4913 to a Q register directly. We need a scratch. */
4914 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4915 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4916 && reg_class_subset_p (rclass, FP_REGS))
4918 if (mode == TFmode)
4919 sri->icode = CODE_FOR_aarch64_reload_movtf;
4920 else if (mode == TImode)
4921 sri->icode = CODE_FOR_aarch64_reload_movti;
4922 return NO_REGS;
4925 /* A TFmode or TImode memory access should be handled via FP_REGS
4926 because AArch64 has richer addressing modes for LDR/STR instructions
4927 than for LDP/STP instructions. */
4928 if (TARGET_FLOAT && rclass == GENERAL_REGS
4929 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4930 return FP_REGS;
4932 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4933 return GENERAL_REGS;
4935 return NO_REGS;
4938 static bool
4939 aarch64_can_eliminate (const int from, const int to)
4941 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4942 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4944 if (frame_pointer_needed)
4946 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4947 return true;
4948 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4949 return false;
4950 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4951 && !cfun->calls_alloca)
4952 return true;
4953 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4954 return true;
4956 return false;
4958 else
4960 /* If we decided that we didn't need a leaf frame pointer but then used
4961 LR in the function, then we'll want a frame pointer after all, so
4962 prevent this elimination to ensure a frame pointer is used. */
4963 if (to == STACK_POINTER_REGNUM
4964 && flag_omit_leaf_frame_pointer
4965 && df_regs_ever_live_p (LR_REGNUM))
4966 return false;
4969 return true;
4972 HOST_WIDE_INT
4973 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4975 aarch64_layout_frame ();
4977 if (to == HARD_FRAME_POINTER_REGNUM)
4979 if (from == ARG_POINTER_REGNUM)
4980 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4982 if (from == FRAME_POINTER_REGNUM)
4983 return (cfun->machine->frame.hard_fp_offset
4984 - cfun->machine->frame.saved_varargs_size);
4987 if (to == STACK_POINTER_REGNUM)
4989 if (from == FRAME_POINTER_REGNUM)
4990 return (cfun->machine->frame.frame_size
4991 - cfun->machine->frame.saved_varargs_size);
4994 return cfun->machine->frame.frame_size;
4997 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4998 previous frame. */
5001 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5003 if (count != 0)
5004 return const0_rtx;
5005 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5009 static void
5010 aarch64_asm_trampoline_template (FILE *f)
5012 if (TARGET_ILP32)
5014 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5015 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5017 else
5019 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5020 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5022 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5023 assemble_aligned_integer (4, const0_rtx);
5024 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5025 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5028 static void
5029 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5031 rtx fnaddr, mem, a_tramp;
5032 const int tramp_code_sz = 16;
5034 /* Don't need to copy the trailing D-words, we fill those in below. */
5035 emit_block_move (m_tramp, assemble_trampoline_template (),
5036 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5037 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5038 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5039 if (GET_MODE (fnaddr) != ptr_mode)
5040 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5041 emit_move_insn (mem, fnaddr);
5043 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5044 emit_move_insn (mem, chain_value);
5046 /* XXX We should really define a "clear_cache" pattern and use
5047 gen_clear_cache(). */
5048 a_tramp = XEXP (m_tramp, 0);
5049 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5050 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5051 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5052 ptr_mode);
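/* Editorial illustration (added; not part of the original source): in the
   LP64 case the initialised trampoline laid out by the two functions above
   looks roughly like this (IP1 and the static-chain register are shown with
   their usual names x17 and x18):

     0:  ldr  x17, .+16    ; loads the target address from offset 16
     4:  ldr  x18, .+20    ; loads the static chain from offset 24
     8:  br   x17
     12: .word 0           ; padding
     16: <address of the nested function>
     24: <static chain value>
*/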
5055 static unsigned char
5056 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5058 switch (regclass)
5060 case CALLER_SAVE_REGS:
5061 case POINTER_REGS:
5062 case GENERAL_REGS:
5063 case ALL_REGS:
5064 case FP_REGS:
5065 case FP_LO_REGS:
5066 return
5067 aarch64_vector_mode_p (mode)
5068 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5069 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5070 case STACK_REG:
5071 return 1;
5073 case NO_REGS:
5074 return 0;
5076 default:
5077 break;
5079 gcc_unreachable ();
5082 static reg_class_t
5083 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5085 if (regclass == POINTER_REGS)
5086 return GENERAL_REGS;
5088 if (regclass == STACK_REG)
5090 if (REG_P (x)
5091 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5092 return regclass;
5094 return NO_REGS;
5097 /* If it's an integer immediate that MOVI can't handle, then
5098 FP_REGS is not an option, so we return NO_REGS instead. */
5099 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5100 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5101 return NO_REGS;
5103 /* Register elimination can result in a request for
5104 SP+constant->FP_REGS. We cannot support such operations, which
5105 use SP as source and an FP_REG as destination, so reject them
5106 right now. */
5107 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5109 rtx lhs = XEXP (x, 0);
5111 /* Look through a possible SUBREG introduced by ILP32. */
5112 if (GET_CODE (lhs) == SUBREG)
5113 lhs = SUBREG_REG (lhs);
5115 gcc_assert (REG_P (lhs));
5116 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5117 POINTER_REGS));
5118 return NO_REGS;
5121 return regclass;
5124 void
5125 aarch64_asm_output_labelref (FILE* f, const char *name)
5127 asm_fprintf (f, "%U%s", name);
5130 static void
5131 aarch64_elf_asm_constructor (rtx symbol, int priority)
5133 if (priority == DEFAULT_INIT_PRIORITY)
5134 default_ctor_section_asm_out_constructor (symbol, priority);
5135 else
5137 section *s;
5138 char buf[18];
5139 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5140 s = get_section (buf, SECTION_WRITE, NULL);
5141 switch_to_section (s);
5142 assemble_align (POINTER_SIZE);
5143 assemble_aligned_integer (POINTER_BYTES, symbol);
5147 static void
5148 aarch64_elf_asm_destructor (rtx symbol, int priority)
5150 if (priority == DEFAULT_INIT_PRIORITY)
5151 default_dtor_section_asm_out_destructor (symbol, priority);
5152 else
5154 section *s;
5155 char buf[18];
5156 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5157 s = get_section (buf, SECTION_WRITE, NULL);
5158 switch_to_section (s);
5159 assemble_align (POINTER_SIZE);
5160 assemble_aligned_integer (POINTER_BYTES, symbol);
5164 const char*
5165 aarch64_output_casesi (rtx *operands)
5167 char buf[100];
5168 char label[100];
5169 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5170 int index;
5171 static const char *const patterns[4][2] =
5174 "ldrb\t%w3, [%0,%w1,uxtw]",
5175 "add\t%3, %4, %w3, sxtb #2"
5178 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5179 "add\t%3, %4, %w3, sxth #2"
5182 "ldr\t%w3, [%0,%w1,uxtw #2]",
5183 "add\t%3, %4, %w3, sxtw #2"
5185 /* We assume that DImode is only generated when not optimizing and
5186 that we don't really need 64-bit address offsets. That would
5187 imply an object file with 8GB of code in a single function! */
5189 "ldr\t%w3, [%0,%w1,uxtw #2]",
5190 "add\t%3, %4, %w3, sxtw #2"
5194 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5196 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5198 gcc_assert (index >= 0 && index <= 3);
5200 /* Need to implement table size reduction, by changing the code below. */
5201 output_asm_insn (patterns[index][0], operands);
5202 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5203 snprintf (buf, sizeof (buf),
5204 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5205 output_asm_insn (buf, operands);
5206 output_asm_insn (patterns[index][1], operands);
5207 output_asm_insn ("br\t%3", operands);
5208 assemble_label (asm_out_file, label);
5209 return "";
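/* Editorial illustration (added; not part of the original source): for a
   byte-sized dispatch table the sequence above comes out roughly as (label
   number and register numbers are hypothetical):

     ldrb  w3, [x0, w1, uxtw]    ; load the table entry
     adr   x4, .Lrtx4            ; address of the table anchor label
     add   x3, x4, w3, sxtb #2   ; entry is the target offset scaled by 4
     br    x3
   .Lrtx4:
*/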
5213 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5214 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5215 operator. */
5218 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5220 if (shift >= 0 && shift <= 3)
5222 int size;
5223 for (size = 8; size <= 32; size *= 2)
5225 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5226 if (mask == bits << shift)
5227 return size;
5230 return 0;
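/* Editorial illustration (added; not part of the original source): a few
   expected results, as a readable (non-compiled) fragment:  */
#if 0
aarch64_uxt_size (2, 0x3fc) == 8;     /* 0xff << 2: usable as uxtb #2.  */
aarch64_uxt_size (0, 0xffff) == 16;   /* 0xffff: usable as uxth.  */
aarch64_uxt_size (1, 0xff) == 0;      /* Mask does not match the shift.  */
#endif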
5233 /* Constant pools are per-function only when PC-relative
5234 literal loads are enabled or we are in the large memory
5235 model. */
5237 static inline bool
5238 aarch64_can_use_per_function_literal_pools_p (void)
5240 return (!aarch64_nopcrelative_literal_loads
5241 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5244 static bool
5245 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5247 /* We can't use blocks for constants when we're using a per-function
5248 constant pool. */
5249 return !aarch64_can_use_per_function_literal_pools_p ();
5252 /* Select appropriate section for constants depending
5253 on where we place literal pools. */
5255 static section *
5256 aarch64_select_rtx_section (machine_mode mode,
5257 rtx x,
5258 unsigned HOST_WIDE_INT align)
5260 if (aarch64_can_use_per_function_literal_pools_p ())
5261 return function_section (current_function_decl);
5263 return default_elf_select_rtx_section (mode, x, align);
5266 /* Costs. */
5268 /* Helper function for rtx cost calculation. Strip a shift expression
5269 from X. Returns the inner operand if successful, or the original
5270 expression on failure. */
5271 static rtx
5272 aarch64_strip_shift (rtx x)
5274 rtx op = x;
5276 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5277 we can convert both to ROR during final output. */
5278 if ((GET_CODE (op) == ASHIFT
5279 || GET_CODE (op) == ASHIFTRT
5280 || GET_CODE (op) == LSHIFTRT
5281 || GET_CODE (op) == ROTATERT
5282 || GET_CODE (op) == ROTATE)
5283 && CONST_INT_P (XEXP (op, 1)))
5284 return XEXP (op, 0);
5286 if (GET_CODE (op) == MULT
5287 && CONST_INT_P (XEXP (op, 1))
5288 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5289 return XEXP (op, 0);
5291 return x;
5294 /* Helper function for rtx cost calculation. Strip an extend
5295 expression from X. Returns the inner operand if successful, or the
5296 original expression on failure. We deal with a number of possible
5297 canonicalization variations here. */
5298 static rtx
5299 aarch64_strip_extend (rtx x)
5301 rtx op = x;
5303 /* Zero and sign extraction of a widened value. */
5304 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5305 && XEXP (op, 2) == const0_rtx
5306 && GET_CODE (XEXP (op, 0)) == MULT
5307 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5308 XEXP (op, 1)))
5309 return XEXP (XEXP (op, 0), 0);
5311 /* It can also be represented (for zero-extend) as an AND with an
5312 immediate. */
5313 if (GET_CODE (op) == AND
5314 && GET_CODE (XEXP (op, 0)) == MULT
5315 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5316 && CONST_INT_P (XEXP (op, 1))
5317 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5318 INTVAL (XEXP (op, 1))) != 0)
5319 return XEXP (XEXP (op, 0), 0);
5321 /* Now handle extended register, as this may also have an optional
5322 left shift by 1..4. */
5323 if (GET_CODE (op) == ASHIFT
5324 && CONST_INT_P (XEXP (op, 1))
5325 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5326 op = XEXP (op, 0);
5328 if (GET_CODE (op) == ZERO_EXTEND
5329 || GET_CODE (op) == SIGN_EXTEND)
5330 op = XEXP (op, 0);
5332 if (op != x)
5333 return op;
5335 return x;
5338 /* Return true iff CODE is a shift supported in combination
5339 with arithmetic instructions. */
5341 static bool
5342 aarch64_shift_p (enum rtx_code code)
5344 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5347 /* Helper function for rtx cost calculation. Calculate the cost of
5348 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5349 Return the calculated cost of the expression, recursing manually in to
5350 operands where needed. */
5352 static int
5353 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5355 rtx op0, op1;
5356 const struct cpu_cost_table *extra_cost
5357 = aarch64_tune_params.insn_extra_cost;
5358 int cost = 0;
5359 bool compound_p = (outer == PLUS || outer == MINUS);
5360 machine_mode mode = GET_MODE (x);
5362 gcc_checking_assert (code == MULT);
5364 op0 = XEXP (x, 0);
5365 op1 = XEXP (x, 1);
5367 if (VECTOR_MODE_P (mode))
5368 mode = GET_MODE_INNER (mode);
5370 /* Integer multiply/fma. */
5371 if (GET_MODE_CLASS (mode) == MODE_INT)
5373 /* The multiply will be canonicalized as a shift, cost it as such. */
5374 if (aarch64_shift_p (GET_CODE (x))
5375 || (CONST_INT_P (op1)
5376 && exact_log2 (INTVAL (op1)) > 0))
5378 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5379 || GET_CODE (op0) == SIGN_EXTEND;
5380 if (speed)
5382 if (compound_p)
5384 if (REG_P (op1))
5385 /* ARITH + shift-by-register. */
5386 cost += extra_cost->alu.arith_shift_reg;
5387 else if (is_extend)
5388 /* ARITH + extended register. We don't have a cost field
5389 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5390 cost += extra_cost->alu.extend_arith;
5391 else
5392 /* ARITH + shift-by-immediate. */
5393 cost += extra_cost->alu.arith_shift;
5395 else
5396 /* LSL (immediate). */
5397 cost += extra_cost->alu.shift;
5400 /* Strip extends as we will have costed them in the case above. */
5401 if (is_extend)
5402 op0 = aarch64_strip_extend (op0);
5404 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5406 return cost;
5409 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5410 compound and let the below cases handle it. After all, MNEG is a
5411 special-case alias of MSUB. */
5412 if (GET_CODE (op0) == NEG)
5414 op0 = XEXP (op0, 0);
5415 compound_p = true;
5418 /* Integer multiplies or FMAs have zero/sign extending variants. */
5419 if ((GET_CODE (op0) == ZERO_EXTEND
5420 && GET_CODE (op1) == ZERO_EXTEND)
5421 || (GET_CODE (op0) == SIGN_EXTEND
5422 && GET_CODE (op1) == SIGN_EXTEND))
5424 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5425 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5427 if (speed)
5429 if (compound_p)
5430 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5431 cost += extra_cost->mult[0].extend_add;
5432 else
5433 /* MUL/SMULL/UMULL. */
5434 cost += extra_cost->mult[0].extend;
5437 return cost;
5440 /* This is either an integer multiply or a MADD. In both cases
5441 we want to recurse and cost the operands. */
5442 cost += rtx_cost (op0, mode, MULT, 0, speed);
5443 cost += rtx_cost (op1, mode, MULT, 1, speed);
5445 if (speed)
5447 if (compound_p)
5448 /* MADD/MSUB. */
5449 cost += extra_cost->mult[mode == DImode].add;
5450 else
5451 /* MUL. */
5452 cost += extra_cost->mult[mode == DImode].simple;
5455 return cost;
5457 else
5459 if (speed)
5461 /* Floating-point FMA/FMUL can also support negations of the
5462 operands, unless the rounding mode is upward or downward in
5463 which case FNMUL is different from FMUL with operand negation. */
5464 bool neg0 = GET_CODE (op0) == NEG;
5465 bool neg1 = GET_CODE (op1) == NEG;
5466 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5468 if (neg0)
5469 op0 = XEXP (op0, 0);
5470 if (neg1)
5471 op1 = XEXP (op1, 0);
5474 if (compound_p)
5475 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5476 cost += extra_cost->fp[mode == DFmode].fma;
5477 else
5478 /* FMUL/FNMUL. */
5479 cost += extra_cost->fp[mode == DFmode].mult;
5482 cost += rtx_cost (op0, mode, MULT, 0, speed);
5483 cost += rtx_cost (op1, mode, MULT, 1, speed);
5484 return cost;
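/* Editorial illustration (added; not part of the original source): under
   the rules above, an rtx such as

     (plus:DI (mult:DI (sign_extend:DI (reg:SI a))
                       (sign_extend:DI (reg:SI b)))
              (reg:DI c))

   is costed as a widening multiply-accumulate (extend_add, i.e. SMADDL),
   a plain multiply-add as MADD, and a multiply by a power of two as a
   shift folded into the enclosing ADD/SUB where possible.  */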
5488 static int
5489 aarch64_address_cost (rtx x,
5490 machine_mode mode,
5491 addr_space_t as ATTRIBUTE_UNUSED,
5492 bool speed)
5494 enum rtx_code c = GET_CODE (x);
5495 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5496 struct aarch64_address_info info;
5497 int cost = 0;
5498 info.shift = 0;
5500 if (!aarch64_classify_address (&info, x, mode, c, false))
5502 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5504 /* This is a CONST or SYMBOL ref which will be split
5505 in a different way depending on the code model in use.
5506 Cost it through the generic infrastructure. */
5507 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5508 /* Divide through by the cost of one instruction to
5509 bring it to the same units as the address costs. */
5510 cost_symbol_ref /= COSTS_N_INSNS (1);
5511 /* The cost is then the cost of preparing the address,
5512 followed by an immediate (possibly 0) offset. */
5513 return cost_symbol_ref + addr_cost->imm_offset;
5515 else
5517 /* This is most likely a jump table from a case
5518 statement. */
5519 return addr_cost->register_offset;
5523 switch (info.type)
5525 case ADDRESS_LO_SUM:
5526 case ADDRESS_SYMBOLIC:
5527 case ADDRESS_REG_IMM:
5528 cost += addr_cost->imm_offset;
5529 break;
5531 case ADDRESS_REG_WB:
5532 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5533 cost += addr_cost->pre_modify;
5534 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5535 cost += addr_cost->post_modify;
5536 else
5537 gcc_unreachable ();
5539 break;
5541 case ADDRESS_REG_REG:
5542 cost += addr_cost->register_offset;
5543 break;
5545 case ADDRESS_REG_SXTW:
5546 cost += addr_cost->register_sextend;
5547 break;
5549 case ADDRESS_REG_UXTW:
5550 cost += addr_cost->register_zextend;
5551 break;
5553 default:
5554 gcc_unreachable ();
5558 if (info.shift > 0)
5560 /* For the sake of calculating the cost of the shifted register
5561 component, we can treat same sized modes in the same way. */
5562 switch (GET_MODE_BITSIZE (mode))
5564 case 16:
5565 cost += addr_cost->addr_scale_costs.hi;
5566 break;
5568 case 32:
5569 cost += addr_cost->addr_scale_costs.si;
5570 break;
5572 case 64:
5573 cost += addr_cost->addr_scale_costs.di;
5574 break;
5576 /* We can't tell, or this is a 128-bit vector. */
5577 default:
5578 cost += addr_cost->addr_scale_costs.ti;
5579 break;
5583 return cost;
5586 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5587 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5588 to be taken. */
5591 aarch64_branch_cost (bool speed_p, bool predictable_p)
5593 /* When optimizing for speed, use the cost of unpredictable branches. */
5594 const struct cpu_branch_cost *branch_costs =
5595 aarch64_tune_params.branch_costs;
5597 if (!speed_p || predictable_p)
5598 return branch_costs->predictable;
5599 else
5600 return branch_costs->unpredictable;
5603 /* Return true if the RTX X in mode MODE is a zero or sign extract
5604 usable in an ADD or SUB (extended register) instruction. */
5605 static bool
5606 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5608 /* Catch add with a sign extract.
5609 This is add_<optab><mode>_multp2. */
5610 if (GET_CODE (x) == SIGN_EXTRACT
5611 || GET_CODE (x) == ZERO_EXTRACT)
5613 rtx op0 = XEXP (x, 0);
5614 rtx op1 = XEXP (x, 1);
5615 rtx op2 = XEXP (x, 2);
5617 if (GET_CODE (op0) == MULT
5618 && CONST_INT_P (op1)
5619 && op2 == const0_rtx
5620 && CONST_INT_P (XEXP (op0, 1))
5621 && aarch64_is_extend_from_extract (mode,
5622 XEXP (op0, 1),
5623 op1))
5625 return true;
5628 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5629 No shift. */
5630 else if (GET_CODE (x) == SIGN_EXTEND
5631 || GET_CODE (x) == ZERO_EXTEND)
5632 return REG_P (XEXP (x, 0));
5634 return false;
5637 static bool
5638 aarch64_frint_unspec_p (unsigned int u)
5640 switch (u)
5642 case UNSPEC_FRINTZ:
5643 case UNSPEC_FRINTP:
5644 case UNSPEC_FRINTM:
5645 case UNSPEC_FRINTA:
5646 case UNSPEC_FRINTN:
5647 case UNSPEC_FRINTX:
5648 case UNSPEC_FRINTI:
5649 return true;
5651 default:
5652 return false;
5656 /* Return true iff X is an rtx that will match an extr instruction
5657 i.e. as described in the *extr<mode>5_insn family of patterns.
5658 OP0 and OP1 will be set to the operands of the shifts involved
5659 on success and will be NULL_RTX otherwise. */
5661 static bool
5662 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5664 rtx op0, op1;
5665 machine_mode mode = GET_MODE (x);
5667 *res_op0 = NULL_RTX;
5668 *res_op1 = NULL_RTX;
5670 if (GET_CODE (x) != IOR)
5671 return false;
5673 op0 = XEXP (x, 0);
5674 op1 = XEXP (x, 1);
5676 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5677 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5679 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5680 if (GET_CODE (op1) == ASHIFT)
5681 std::swap (op0, op1);
5683 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5684 return false;
5686 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5687 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5689 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5690 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5692 *res_op0 = XEXP (op0, 0);
5693 *res_op1 = XEXP (op1, 0);
5694 return true;
5698 return false;
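/* Editorial illustration (added; not part of the original source): in
   DImode, (ior (ashift x 16) (lshiftrt y 48)) satisfies the check above
   because the shift amounts sum to 64, so it can be emitted as a single
   EXTR instruction that concatenates bits taken from the two shifted
   operands.  */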
5701 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5702 storing it in *COST. Result is true if the total cost of the operation
5703 has now been calculated. */
5704 static bool
5705 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5707 rtx inner;
5708 rtx comparator;
5709 enum rtx_code cmpcode;
5711 if (COMPARISON_P (op0))
5713 inner = XEXP (op0, 0);
5714 comparator = XEXP (op0, 1);
5715 cmpcode = GET_CODE (op0);
5717 else
5719 inner = op0;
5720 comparator = const0_rtx;
5721 cmpcode = NE;
5724 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5726 /* Conditional branch. */
5727 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5728 return true;
5729 else
5731 if (cmpcode == NE || cmpcode == EQ)
5733 if (comparator == const0_rtx)
5735 /* TBZ/TBNZ/CBZ/CBNZ. */
5736 if (GET_CODE (inner) == ZERO_EXTRACT)
5737 /* TBZ/TBNZ. */
5738 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5739 ZERO_EXTRACT, 0, speed);
5740 else
5741 /* CBZ/CBNZ. */
5742 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5744 return true;
5747 else if (cmpcode == LT || cmpcode == GE)
5749 /* TBZ/TBNZ. */
5750 if (comparator == const0_rtx)
5751 return true;
5755 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5757 /* It's a conditional operation based on the status flags,
5758 so it must be some flavor of CSEL. */
5760 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5761 if (GET_CODE (op1) == NEG
5762 || GET_CODE (op1) == NOT
5763 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5764 op1 = XEXP (op1, 0);
5766 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
5767 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
5768 return true;
5771 /* We don't know what this is, cost all operands. */
5772 return false;
5775 /* Calculate the cost of calculating X, storing it in *COST. Result
5776 is true if the total cost of the operation has now been calculated. */
5777 static bool
5778 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
5779 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5781 rtx op0, op1, op2;
5782 const struct cpu_cost_table *extra_cost
5783 = aarch64_tune_params.insn_extra_cost;
5784 int code = GET_CODE (x);
5786 /* By default, assume that everything has equivalent cost to the
5787 cheapest instruction. Any additional costs are applied as a delta
5788 above this default. */
5789 *cost = COSTS_N_INSNS (1);
5791 switch (code)
5793 case SET:
5794 /* The cost depends entirely on the operands to SET. */
5795 *cost = 0;
5796 op0 = SET_DEST (x);
5797 op1 = SET_SRC (x);
5799 switch (GET_CODE (op0))
5801 case MEM:
5802 if (speed)
5804 rtx address = XEXP (op0, 0);
5805 if (VECTOR_MODE_P (mode))
5806 *cost += extra_cost->ldst.storev;
5807 else if (GET_MODE_CLASS (mode) == MODE_INT)
5808 *cost += extra_cost->ldst.store;
5809 else if (mode == SFmode)
5810 *cost += extra_cost->ldst.storef;
5811 else if (mode == DFmode)
5812 *cost += extra_cost->ldst.stored;
5814 *cost +=
5815 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5816 0, speed));
5819 *cost += rtx_cost (op1, mode, SET, 1, speed);
5820 return true;
5822 case SUBREG:
5823 if (! REG_P (SUBREG_REG (op0)))
5824 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
5826 /* Fall through. */
5827 case REG:
5828 /* The cost is one per vector-register copied. */
5829 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5831 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5832 / GET_MODE_SIZE (V4SImode);
5833 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5835 /* const0_rtx is in general free, but we will use an
5836 instruction to set a register to 0. */
5837 else if (REG_P (op1) || op1 == const0_rtx)
5839 /* The cost is 1 per register copied. */
5840 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5841 / UNITS_PER_WORD;
5842 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5844 else
5845 /* Cost is just the cost of the RHS of the set. */
5846 *cost += rtx_cost (op1, mode, SET, 1, speed);
5847 return true;
5849 case ZERO_EXTRACT:
5850 case SIGN_EXTRACT:
5851 /* Bit-field insertion. Strip any redundant widening of
5852 the RHS to meet the width of the target. */
5853 if (GET_CODE (op1) == SUBREG)
5854 op1 = SUBREG_REG (op1);
5855 if ((GET_CODE (op1) == ZERO_EXTEND
5856 || GET_CODE (op1) == SIGN_EXTEND)
5857 && CONST_INT_P (XEXP (op0, 1))
5858 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5859 >= INTVAL (XEXP (op0, 1))))
5860 op1 = XEXP (op1, 0);
5862 if (CONST_INT_P (op1))
5864 /* MOV immediate is assumed to always be cheap. */
5865 *cost = COSTS_N_INSNS (1);
5867 else
5869 /* BFM. */
5870 if (speed)
5871 *cost += extra_cost->alu.bfi;
5872 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
5875 return true;
5877 default:
5878 /* We can't make sense of this, assume default cost. */
5879 *cost = COSTS_N_INSNS (1);
5880 return false;
5882 return false;
5884 case CONST_INT:
5885 /* If an instruction can incorporate a constant within the
5886 instruction, the instruction's expression avoids calling
5887 rtx_cost() on the constant. If rtx_cost() is called on a
5888 constant, then it is usually because the constant must be
5889 moved into a register by one or more instructions.
5891 The exception is constant 0, which can be expressed
5892 as XZR/WZR and is therefore free. The exception to this is
5893 if we have (set (reg) (const0_rtx)) in which case we must cost
5894 the move. However, we can catch that when we cost the SET, so
5895 we don't need to consider that here. */
5896 if (x == const0_rtx)
5897 *cost = 0;
5898 else
5900 /* To an approximation, building any other constant is
5901 proportionally expensive to the number of instructions
5902 required to build that constant. This is true whether we
5903 are compiling for SPEED or otherwise. */
5904 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5905 (NULL_RTX, x, false, mode));
5907 return true;
5909 case CONST_DOUBLE:
5910 if (speed)
5912 /* mov[df,sf]_aarch64. */
5913 if (aarch64_float_const_representable_p (x))
5914 /* FMOV (scalar immediate). */
5915 *cost += extra_cost->fp[mode == DFmode].fpconst;
5916 else if (!aarch64_float_const_zero_rtx_p (x))
5918 /* This will be a load from memory. */
5919 if (mode == DFmode)
5920 *cost += extra_cost->ldst.loadd;
5921 else
5922 *cost += extra_cost->ldst.loadf;
5924 else
5925 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5926 or MOV v0.s[0], wzr - neither of which are modeled by the
5927 cost tables. Just use the default cost. */
5932 return true;
5934 case MEM:
5935 if (speed)
5937 /* For loads we want the base cost of a load, plus an
5938 approximation for the additional cost of the addressing
5939 mode. */
5940 rtx address = XEXP (x, 0);
5941 if (VECTOR_MODE_P (mode))
5942 *cost += extra_cost->ldst.loadv;
5943 else if (GET_MODE_CLASS (mode) == MODE_INT)
5944 *cost += extra_cost->ldst.load;
5945 else if (mode == SFmode)
5946 *cost += extra_cost->ldst.loadf;
5947 else if (mode == DFmode)
5948 *cost += extra_cost->ldst.loadd;
5950 *cost +=
5951 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5952 0, speed));
5955 return true;
5957 case NEG:
5958 op0 = XEXP (x, 0);
5960 if (VECTOR_MODE_P (mode))
5962 if (speed)
5964 /* FNEG. */
5965 *cost += extra_cost->vect.alu;
5967 return false;
5970 if (GET_MODE_CLASS (mode) == MODE_INT)
5972 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5973 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5975 /* CSETM. */
5976 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
5977 return true;
5980 /* Cost this as SUB wzr, X. */
5981 op0 = CONST0_RTX (mode);
5982 op1 = XEXP (x, 0);
5983 goto cost_minus;
5986 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5988 /* Support (neg(fma...)) as a single instruction only if
5989 sign of zeros is unimportant. This matches the decision
5990 making in aarch64.md. */
5991 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5993 /* FNMADD. */
5994 *cost = rtx_cost (op0, mode, NEG, 0, speed);
5995 return true;
5997 if (GET_CODE (op0) == MULT)
5999 /* FNMUL. */
6000 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6001 return true;
6003 if (speed)
6004 /* FNEG. */
6005 *cost += extra_cost->fp[mode == DFmode].neg;
6006 return false;
6009 return false;
6011 case CLRSB:
6012 case CLZ:
6013 if (speed)
6015 if (VECTOR_MODE_P (mode))
6016 *cost += extra_cost->vect.alu;
6017 else
6018 *cost += extra_cost->alu.clz;
6021 return false;
6023 case COMPARE:
6024 op0 = XEXP (x, 0);
6025 op1 = XEXP (x, 1);
6027 if (op1 == const0_rtx
6028 && GET_CODE (op0) == AND)
6030 x = op0;
6031 mode = GET_MODE (op0);
6032 goto cost_logic;
6035 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6037 /* TODO: A write to the CC flags possibly costs extra, this
6038 needs encoding in the cost tables. */
6040 /* CC_ZESWPmode supports zero extend for free. */
6041 if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6042 op0 = XEXP (op0, 0);
6044 mode = GET_MODE (op0);
6045 /* ANDS. */
6046 if (GET_CODE (op0) == AND)
6048 x = op0;
6049 goto cost_logic;
6052 if (GET_CODE (op0) == PLUS)
6054 /* ADDS (and CMN alias). */
6055 x = op0;
6056 goto cost_plus;
6059 if (GET_CODE (op0) == MINUS)
6061 /* SUBS. */
6062 x = op0;
6063 goto cost_minus;
6066 if (GET_CODE (op1) == NEG)
6068 /* CMN. */
6069 if (speed)
6070 *cost += extra_cost->alu.arith;
6072 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6073 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6074 return true;
6077 /* CMP.
6079 Compare can freely swap the order of operands, and
6080 canonicalization puts the more complex operation first.
6081 But the integer MINUS logic expects the shift/extend
6082 operation in op1. */
6083 if (! (REG_P (op0)
6084 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6086 op0 = XEXP (x, 1);
6087 op1 = XEXP (x, 0);
6089 goto cost_minus;
6092 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6094 /* FCMP. */
6095 if (speed)
6096 *cost += extra_cost->fp[mode == DFmode].compare;
6098 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6100 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6101 /* FCMP supports constant 0.0 for no extra cost. */
6102 return true;
6104 return false;
6107 if (VECTOR_MODE_P (mode))
6109 /* Vector compare. */
6110 if (speed)
6111 *cost += extra_cost->vect.alu;
6113 if (aarch64_float_const_zero_rtx_p (op1))
6115 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6116 cost. */
6117 return true;
6119 return false;
6121 return false;
6123 case MINUS:
6125 op0 = XEXP (x, 0);
6126 op1 = XEXP (x, 1);
6128 cost_minus:
6129 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6131 /* Detect valid immediates. */
6132 if ((GET_MODE_CLASS (mode) == MODE_INT
6133 || (GET_MODE_CLASS (mode) == MODE_CC
6134 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6135 && CONST_INT_P (op1)
6136 && aarch64_uimm12_shift (INTVAL (op1)))
6138 if (speed)
6139 /* SUB(S) (immediate). */
6140 *cost += extra_cost->alu.arith;
6141 return true;
6144 /* Look for SUB (extended register). */
6145 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6147 if (speed)
6148 *cost += extra_cost->alu.extend_arith;
6150 op1 = aarch64_strip_extend (op1);
6151 *cost += rtx_cost (op1, VOIDmode,
6152 (enum rtx_code) GET_CODE (op1), 0, speed);
6153 return true;
6156 rtx new_op1 = aarch64_strip_extend (op1);
6158 /* Cost this as an FMA-alike operation. */
6159 if ((GET_CODE (new_op1) == MULT
6160 || aarch64_shift_p (GET_CODE (new_op1)))
6161 && code != COMPARE)
6163 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6164 (enum rtx_code) code,
6165 speed);
6166 return true;
6169 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6171 if (speed)
6173 if (VECTOR_MODE_P (mode))
6175 /* Vector SUB. */
6176 *cost += extra_cost->vect.alu;
6178 else if (GET_MODE_CLASS (mode) == MODE_INT)
6180 /* SUB(S). */
6181 *cost += extra_cost->alu.arith;
6183 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6185 /* FSUB. */
6186 *cost += extra_cost->fp[mode == DFmode].addsub;
6189 return true;
6192 case PLUS:
6194 rtx new_op0;
6196 op0 = XEXP (x, 0);
6197 op1 = XEXP (x, 1);
6199 cost_plus:
6200 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6201 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6203 /* CSINC. */
6204 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6205 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6206 return true;
6209 if (GET_MODE_CLASS (mode) == MODE_INT
6210 && CONST_INT_P (op1)
6211 && aarch64_uimm12_shift (INTVAL (op1)))
6213 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6215 if (speed)
6216 /* ADD (immediate). */
6217 *cost += extra_cost->alu.arith;
6218 return true;
6221 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6223 /* Look for ADD (extended register). */
6224 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6226 if (speed)
6227 *cost += extra_cost->alu.extend_arith;
6229 op0 = aarch64_strip_extend (op0);
6230 *cost += rtx_cost (op0, VOIDmode,
6231 (enum rtx_code) GET_CODE (op0), 0, speed);
6232 return true;
6235 /* Strip any extend, leave shifts behind as we will
6236 cost them through mult_cost. */
6237 new_op0 = aarch64_strip_extend (op0);
6239 if (GET_CODE (new_op0) == MULT
6240 || aarch64_shift_p (GET_CODE (new_op0)))
6242 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6243 speed);
6244 return true;
6247 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6249 if (speed)
6251 if (VECTOR_MODE_P (mode))
6253 /* Vector ADD. */
6254 *cost += extra_cost->vect.alu;
6256 else if (GET_MODE_CLASS (mode) == MODE_INT)
6258 /* ADD. */
6259 *cost += extra_cost->alu.arith;
6261 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6263 /* FADD. */
6264 *cost += extra_cost->fp[mode == DFmode].addsub;
6267 return true;
6270 case BSWAP:
6271 *cost = COSTS_N_INSNS (1);
6273 if (speed)
6275 if (VECTOR_MODE_P (mode))
6276 *cost += extra_cost->vect.alu;
6277 else
6278 *cost += extra_cost->alu.rev;
6280 return false;
6282 case IOR:
6283 if (aarch_rev16_p (x))
6285 *cost = COSTS_N_INSNS (1);
6287 if (speed)
6289 if (VECTOR_MODE_P (mode))
6290 *cost += extra_cost->vect.alu;
6291 else
6292 *cost += extra_cost->alu.rev;
6294 return true;
6297 if (aarch64_extr_rtx_p (x, &op0, &op1))
6299 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6300 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6301 if (speed)
6302 *cost += extra_cost->alu.shift;
6304 return true;
6306 /* Fall through. */
6307 case XOR:
6308 case AND:
6309 cost_logic:
6310 op0 = XEXP (x, 0);
6311 op1 = XEXP (x, 1);
6313 if (VECTOR_MODE_P (mode))
6315 if (speed)
6316 *cost += extra_cost->vect.alu;
6317 return true;
6320 if (code == AND
6321 && GET_CODE (op0) == MULT
6322 && CONST_INT_P (XEXP (op0, 1))
6323 && CONST_INT_P (op1)
6324 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6325 INTVAL (op1)) != 0)
6327 /* This is a UBFM/SBFM. */
6328 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6329 if (speed)
6330 *cost += extra_cost->alu.bfx;
6331 return true;
6334 if (GET_MODE_CLASS (mode) == MODE_INT)
6336 /* We possibly get the immediate for free; this is not
6337 modelled. */
6338 if (CONST_INT_P (op1)
6339 && aarch64_bitmask_imm (INTVAL (op1), mode))
6341 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6343 if (speed)
6344 *cost += extra_cost->alu.logical;
6346 return true;
6348 else
6350 rtx new_op0 = op0;
6352 /* Handle ORN, EON, or BIC. */
6353 if (GET_CODE (op0) == NOT)
6354 op0 = XEXP (op0, 0);
6356 new_op0 = aarch64_strip_shift (op0);
6358 /* If we had a shift on op0 then this is a logical-shift-
6359 by-register/immediate operation. Otherwise, this is just
6360 a logical operation. */
6361 if (speed)
6363 if (new_op0 != op0)
6365 /* Shift by immediate. */
6366 if (CONST_INT_P (XEXP (op0, 1)))
6367 *cost += extra_cost->alu.log_shift;
6368 else
6369 *cost += extra_cost->alu.log_shift_reg;
6371 else
6372 *cost += extra_cost->alu.logical;
6375 /* In both cases we want to cost both operands. */
6376 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6377 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6379 return true;
6382 return false;
6384 case NOT:
6385 x = XEXP (x, 0);
6386 op0 = aarch64_strip_shift (x);
6388 if (VECTOR_MODE_P (mode))
6390 /* Vector NOT. */
6391 *cost += extra_cost->vect.alu;
6392 return false;
6395 /* MVN-shifted-reg. */
6396 if (op0 != x)
6398 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6400 if (speed)
6401 *cost += extra_cost->alu.log_shift;
6403 return true;
6405 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
6406 Handle the second form here, taking care that 'a' in the above can
6407 be a shift. */
6408 else if (GET_CODE (op0) == XOR)
6410 rtx newop0 = XEXP (op0, 0);
6411 rtx newop1 = XEXP (op0, 1);
6412 rtx op0_stripped = aarch64_strip_shift (newop0);
6414 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6415 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6417 if (speed)
6419 if (op0_stripped != newop0)
6420 *cost += extra_cost->alu.log_shift;
6421 else
6422 *cost += extra_cost->alu.logical;
6425 return true;
6427 /* MVN. */
6428 if (speed)
6429 *cost += extra_cost->alu.logical;
6431 return false;
6433 case ZERO_EXTEND:
6435 op0 = XEXP (x, 0);
6436 /* If a value is written in SI mode, then zero extended to DI
6437 mode, the operation will in general be free as a write to
6438 a 'w' register implicitly zeroes the upper bits of an 'x'
6439 register. However, if this is
6441 (set (reg) (zero_extend (reg)))
6443 we must cost the explicit register move. */
6444 if (mode == DImode
6445 && GET_MODE (op0) == SImode
6446 && outer == SET)
6448 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6450 if (!op_cost && speed)
6451 /* MOV. */
6452 *cost += extra_cost->alu.extend;
6453 else
6454 /* Free, the cost is that of the SI mode operation. */
6455 *cost = op_cost;
6457 return true;
6459 else if (MEM_P (op0))
6461 /* All loads can zero extend to any size for free. */
6462 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6463 return true;
6466 if (speed)
6468 if (VECTOR_MODE_P (mode))
6470 /* UMOV. */
6471 *cost += extra_cost->vect.alu;
6473 else
6475 /* UXTB/UXTH. */
6476 *cost += extra_cost->alu.extend;
6479 return false;
6481 case SIGN_EXTEND:
6482 if (MEM_P (XEXP (x, 0)))
6484 /* LDRSH. */
6485 if (speed)
6487 rtx address = XEXP (XEXP (x, 0), 0);
6488 *cost += extra_cost->ldst.load_sign_extend;
6490 *cost +=
6491 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6492 0, speed));
6494 return true;
6497 if (speed)
6499 if (VECTOR_MODE_P (mode))
6500 *cost += extra_cost->vect.alu;
6501 else
6502 *cost += extra_cost->alu.extend;
6504 return false;
6506 case ASHIFT:
6507 op0 = XEXP (x, 0);
6508 op1 = XEXP (x, 1);
6510 if (CONST_INT_P (op1))
6512 if (speed)
6514 if (VECTOR_MODE_P (mode))
6516 /* Vector shift (immediate). */
6517 *cost += extra_cost->vect.alu;
6519 else
6521 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6522 aliases. */
6523 *cost += extra_cost->alu.shift;
6527 /* We can incorporate zero/sign extend for free. */
6528 if (GET_CODE (op0) == ZERO_EXTEND
6529 || GET_CODE (op0) == SIGN_EXTEND)
6530 op0 = XEXP (op0, 0);
6532 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6533 return true;
6535 else
6537 if (speed)
6539 if (VECTOR_MODE_P (mode))
6541 /* Vector shift (register). */
6542 *cost += extra_cost->vect.alu;
6544 else
6546 /* LSLV. */
6547 *cost += extra_cost->alu.shift_reg;
6550 return false; /* All arguments need to be in registers. */
6553 case ROTATE:
6554 case ROTATERT:
6555 case LSHIFTRT:
6556 case ASHIFTRT:
6557 op0 = XEXP (x, 0);
6558 op1 = XEXP (x, 1);
6560 if (CONST_INT_P (op1))
6562 /* ASR (immediate) and friends. */
6563 if (speed)
6565 if (VECTOR_MODE_P (mode))
6566 *cost += extra_cost->vect.alu;
6567 else
6568 *cost += extra_cost->alu.shift;
6571 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6572 return true;
6574 else
6577 /* ASR (register) and friends. */
6578 if (speed)
6580 if (VECTOR_MODE_P (mode))
6581 *cost += extra_cost->vect.alu;
6582 else
6583 *cost += extra_cost->alu.shift_reg;
6585 return false; /* All arguments need to be in registers. */
6588 case SYMBOL_REF:
6590 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6591 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6593 /* LDR. */
6594 if (speed)
6595 *cost += extra_cost->ldst.load;
6597 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6598 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6600 /* ADRP, followed by ADD. */
6601 *cost += COSTS_N_INSNS (1);
6602 if (speed)
6603 *cost += 2 * extra_cost->alu.arith;
6605 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6606 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6608 /* ADR. */
6609 if (speed)
6610 *cost += extra_cost->alu.arith;
6613 if (flag_pic)
6615 /* One extra load instruction, after accessing the GOT. */
6616 *cost += COSTS_N_INSNS (1);
6617 if (speed)
6618 *cost += extra_cost->ldst.load;
6620 return true;
6622 case HIGH:
6623 case LO_SUM:
6624 /* ADRP/ADD (immediate). */
6625 if (speed)
6626 *cost += extra_cost->alu.arith;
6627 return true;
6629 case ZERO_EXTRACT:
6630 case SIGN_EXTRACT:
6631 /* UBFX/SBFX. */
6632 if (speed)
6634 if (VECTOR_MODE_P (mode))
6635 *cost += extra_cost->vect.alu;
6636 else
6637 *cost += extra_cost->alu.bfx;
6640 /* We can trust that the immediates used will be correct (there
6641 are no by-register forms), so we need only cost op0. */
6642 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
6643 return true;
6645 case MULT:
6646 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6647 /* aarch64_rtx_mult_cost always handles recursion to its
6648 operands. */
6649 return true;
6651 case MOD:
6652 /* We can expand signed mod by power of 2 using a NEGS, two parallel
6653 ANDs and a CSNEG. Assume here that CSNEG has the same cost as
6654 an unconditional negate. This case should only ever be reached through
6655 the set_smod_pow2_cheap check in expmed.c. */
6656 if (CONST_INT_P (XEXP (x, 1))
6657 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
6658 && (mode == SImode || mode == DImode))
6660 /* We expand to 4 instructions. Reset the baseline. */
6661 *cost = COSTS_N_INSNS (4);
6663 if (speed)
6664 *cost += 2 * extra_cost->alu.logical
6665 + 2 * extra_cost->alu.arith;
6667 return true;
6670 /* Fall-through. */
6671 case UMOD:
6672 if (speed)
6674 if (VECTOR_MODE_P (mode))
6675 *cost += extra_cost->vect.alu;
6676 else if (GET_MODE_CLASS (mode) == MODE_INT)
6677 *cost += (extra_cost->mult[mode == DImode].add
6678 + extra_cost->mult[mode == DImode].idiv);
6679 else if (mode == DFmode)
6680 *cost += (extra_cost->fp[1].mult
6681 + extra_cost->fp[1].div);
6682 else if (mode == SFmode)
6683 *cost += (extra_cost->fp[0].mult
6684 + extra_cost->fp[0].div);
6686 return false; /* All arguments need to be in registers. */
6688 case DIV:
6689 case UDIV:
6690 case SQRT:
6691 if (speed)
6693 if (VECTOR_MODE_P (mode))
6694 *cost += extra_cost->vect.alu;
6695 else if (GET_MODE_CLASS (mode) == MODE_INT)
6696 /* There is no integer SQRT, so only DIV and UDIV can get
6697 here. */
6698 *cost += extra_cost->mult[mode == DImode].idiv;
6699 else
6700 *cost += extra_cost->fp[mode == DFmode].div;
6702 return false; /* All arguments need to be in registers. */
6704 case IF_THEN_ELSE:
6705 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6706 XEXP (x, 2), cost, speed);
6708 case EQ:
6709 case NE:
6710 case GT:
6711 case GTU:
6712 case LT:
6713 case LTU:
6714 case GE:
6715 case GEU:
6716 case LE:
6717 case LEU:
6719 return false; /* All arguments must be in registers. */
6721 case FMA:
6722 op0 = XEXP (x, 0);
6723 op1 = XEXP (x, 1);
6724 op2 = XEXP (x, 2);
6726 if (speed)
6728 if (VECTOR_MODE_P (mode))
6729 *cost += extra_cost->vect.alu;
6730 else
6731 *cost += extra_cost->fp[mode == DFmode].fma;
6734 /* FMSUB, FNMADD, and FNMSUB are free. */
6735 if (GET_CODE (op0) == NEG)
6736 op0 = XEXP (op0, 0);
6738 if (GET_CODE (op2) == NEG)
6739 op2 = XEXP (op2, 0);
6741 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6742 and the by-element operand as operand 0. */
6743 if (GET_CODE (op1) == NEG)
6744 op1 = XEXP (op1, 0);
6746 /* Catch vector-by-element operations. The by-element operand can
6747 either be (vec_duplicate (vec_select (x))) or just
6748 (vec_select (x)), depending on whether we are multiplying by
6749 a vector or a scalar.
6751 Canonicalization is not very good in these cases: FMA4 will put the
6752 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6753 if (GET_CODE (op0) == VEC_DUPLICATE)
6754 op0 = XEXP (op0, 0);
6755 else if (GET_CODE (op1) == VEC_DUPLICATE)
6756 op1 = XEXP (op1, 0);
6758 if (GET_CODE (op0) == VEC_SELECT)
6759 op0 = XEXP (op0, 0);
6760 else if (GET_CODE (op1) == VEC_SELECT)
6761 op1 = XEXP (op1, 0);
6763 /* If the remaining parameters are not registers,
6764 get the cost to put them into registers. */
6765 *cost += rtx_cost (op0, mode, FMA, 0, speed);
6766 *cost += rtx_cost (op1, mode, FMA, 1, speed);
6767 *cost += rtx_cost (op2, mode, FMA, 2, speed);
6768 return true;
6770 case FLOAT:
6771 case UNSIGNED_FLOAT:
6772 if (speed)
6773 *cost += extra_cost->fp[mode == DFmode].fromint;
6774 return false;
6776 case FLOAT_EXTEND:
6777 if (speed)
6779 if (VECTOR_MODE_P (mode))
6781 /* Vector widening conversion. */
6782 *cost += extra_cost->vect.alu;
6784 else
6785 *cost += extra_cost->fp[mode == DFmode].widen;
6787 return false;
6789 case FLOAT_TRUNCATE:
6790 if (speed)
6792 if (VECTOR_MODE_P (mode))
6794 /* Vector narrowing conversion. */
6795 *cost += extra_cost->vect.alu;
6797 else
6798 *cost += extra_cost->fp[mode == DFmode].narrow;
6800 return false;
6802 case FIX:
6803 case UNSIGNED_FIX:
6804 x = XEXP (x, 0);
6805 /* Strip the rounding part. They will all be implemented
6806 by the fcvt* family of instructions anyway. */
6807 if (GET_CODE (x) == UNSPEC)
6809 unsigned int uns_code = XINT (x, 1);
6811 if (uns_code == UNSPEC_FRINTA
6812 || uns_code == UNSPEC_FRINTM
6813 || uns_code == UNSPEC_FRINTN
6814 || uns_code == UNSPEC_FRINTP
6815 || uns_code == UNSPEC_FRINTZ)
6816 x = XVECEXP (x, 0, 0);
6819 if (speed)
6821 if (VECTOR_MODE_P (mode))
6822 *cost += extra_cost->vect.alu;
6823 else
6824 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6827 /* We can combine fmul by a power of 2 followed by a fcvt into a single
6828 fixed-point fcvt. */
6829 if (GET_CODE (x) == MULT
6830 && ((VECTOR_MODE_P (mode)
6831 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
6832 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
6834 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
6835 0, speed);
6836 return true;
6839 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
6840 return true;
6842 case ABS:
6843 if (VECTOR_MODE_P (mode))
6845 /* ABS (vector). */
6846 if (speed)
6847 *cost += extra_cost->vect.alu;
6849 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6851 op0 = XEXP (x, 0);
6853 /* FABD, which is analogous to FADD. */
6854 if (GET_CODE (op0) == MINUS)
6856 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
6857 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
6858 if (speed)
6859 *cost += extra_cost->fp[mode == DFmode].addsub;
6861 return true;
6863 /* Simple FABS is analogous to FNEG. */
6864 if (speed)
6865 *cost += extra_cost->fp[mode == DFmode].neg;
6867 else
6869 /* Integer ABS will either be split into
6870 two arithmetic instructions, or will be an ABS
6871 (scalar), which we don't model. */
6872 *cost = COSTS_N_INSNS (2);
6873 if (speed)
6874 *cost += 2 * extra_cost->alu.arith;
6876 return false;
6878 case SMAX:
6879 case SMIN:
6880 if (speed)
6882 if (VECTOR_MODE_P (mode))
6883 *cost += extra_cost->vect.alu;
6884 else
6886 /* FMAXNM/FMINNM/FMAX/FMIN.
6887 TODO: This may not be accurate for all implementations, but
6888 we do not model this in the cost tables. */
6889 *cost += extra_cost->fp[mode == DFmode].addsub;
6892 return false;
6894 case UNSPEC:
6895 /* The floating point round to integer frint* instructions. */
6896 if (aarch64_frint_unspec_p (XINT (x, 1)))
6898 if (speed)
6899 *cost += extra_cost->fp[mode == DFmode].roundint;
6901 return false;
6904 if (XINT (x, 1) == UNSPEC_RBIT)
6906 if (speed)
6907 *cost += extra_cost->alu.rev;
6909 return false;
6911 break;
6913 case TRUNCATE:
6915 /* Decompose <su>muldi3_highpart. */
6916 if (/* (truncate:DI */
6917 mode == DImode
6918 /* (lshiftrt:TI */
6919 && GET_MODE (XEXP (x, 0)) == TImode
6920 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6921 /* (mult:TI */
6922 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6923 /* (ANY_EXTEND:TI (reg:DI))
6924 (ANY_EXTEND:TI (reg:DI))) */
6925 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6926 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6927 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6928 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6929 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6930 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6931 /* (const_int 64) */
6932 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6933 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6935 /* UMULH/SMULH. */
6936 if (speed)
6937 *cost += extra_cost->mult[mode == DImode].extend;
6938 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6939 mode, MULT, 0, speed);
6940 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6941 mode, MULT, 1, speed);
6942 return true;
6945 /* Fall through. */
6946 default:
6947 break;
6950 if (dump_file && (dump_flags & TDF_DETAILS))
6951 fprintf (dump_file,
6952 "\nFailed to cost RTX. Assuming default cost.\n");
6954 return true;
6957 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6958 calculated for X. This cost is stored in *COST. Returns true
6959 if the total cost of X was calculated. */
6960 static bool
6961 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
6962 int param, int *cost, bool speed)
6964 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
6966 if (dump_file && (dump_flags & TDF_DETAILS))
6968 print_rtl_single (dump_file, x);
6969 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6970 speed ? "Hot" : "Cold",
6971 *cost, result ? "final" : "partial");
6974 return result;
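/* Implement TARGET_REGISTER_MOVE_COST. Return the cost of moving a value
   of MODE between register classes FROM_I and TO_I, based on the
   regmove_cost table of the current tuning. */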
6977 static int
6978 aarch64_register_move_cost (machine_mode mode,
6979 reg_class_t from_i, reg_class_t to_i)
6981 enum reg_class from = (enum reg_class) from_i;
6982 enum reg_class to = (enum reg_class) to_i;
6983 const struct cpu_regmove_cost *regmove_cost
6984 = aarch64_tune_params.regmove_cost;
6986 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6987 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6988 to = GENERAL_REGS;
6990 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6991 from = GENERAL_REGS;
6993 /* Moving between GPR and stack cost is the same as GP2GP. */
6994 if ((from == GENERAL_REGS && to == STACK_REG)
6995 || (to == GENERAL_REGS && from == STACK_REG))
6996 return regmove_cost->GP2GP;
6998 /* To/From the stack register, we move via the gprs. */
6999 if (to == STACK_REG || from == STACK_REG)
7000 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7001 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7003 if (GET_MODE_SIZE (mode) == 16)
7005 /* 128-bit operations on general registers require 2 instructions. */
7006 if (from == GENERAL_REGS && to == GENERAL_REGS)
7007 return regmove_cost->GP2GP * 2;
7008 else if (from == GENERAL_REGS)
7009 return regmove_cost->GP2FP * 2;
7010 else if (to == GENERAL_REGS)
7011 return regmove_cost->FP2GP * 2;
7013 /* When AdvSIMD instructions are disabled it is not possible to move
7014 a 128-bit value directly between Q registers. This is handled in
7015 secondary reload. A general register is used as a scratch to move
7016 the upper DI value and the lower DI value is moved directly,
7017 hence the cost is the sum of three moves. */
7018 if (! TARGET_SIMD)
7019 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7021 return regmove_cost->FP2FP;
7024 if (from == GENERAL_REGS && to == GENERAL_REGS)
7025 return regmove_cost->GP2GP;
7026 else if (from == GENERAL_REGS)
7027 return regmove_cost->GP2FP;
7028 else if (to == GENERAL_REGS)
7029 return regmove_cost->FP2GP;
7031 return regmove_cost->FP2FP;
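/* Implement TARGET_MEMORY_MOVE_COST. All modes and register classes use
   the single memmov_cost value from the current tuning. */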
7034 static int
7035 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7036 reg_class_t rclass ATTRIBUTE_UNUSED,
7037 bool in ATTRIBUTE_UNUSED)
7039 return aarch64_tune_params.memmov_cost;
7042 /* Function to decide when to use
7043 reciprocal square root builtins. */
7045 static tree
7046 aarch64_builtin_reciprocal (unsigned int fn,
7047 bool md_fn,
7048 bool)
7050 if (flag_trapping_math
7051 || !flag_unsafe_math_optimizations
7052 || optimize_size
7053 || ! (aarch64_tune_params.extra_tuning_flags
7054 & AARCH64_EXTRA_TUNE_RECIP_SQRT))
7056 return NULL_TREE;
7059 return aarch64_builtin_rsqrt (fn, md_fn);
7062 typedef rtx (*rsqrte_type) (rtx, rtx);
7064 /* Select reciprocal square root initial estimate
7065 insn depending on machine mode. */
7067 rsqrte_type
7068 get_rsqrte_type (machine_mode mode)
7070 switch (mode)
7072 case DFmode: return gen_aarch64_rsqrte_df2;
7073 case SFmode: return gen_aarch64_rsqrte_sf2;
7074 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7075 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7076 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7077 default: gcc_unreachable ();
7081 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7083 /* Select reciprocal square root Newton-Raphson step
7084 insn depending on machine mode. */
7086 rsqrts_type
7087 get_rsqrts_type (machine_mode mode)
7089 switch (mode)
7091 case DFmode: return gen_aarch64_rsqrts_df3;
7092 case SFmode: return gen_aarch64_rsqrts_sf3;
7093 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7094 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7095 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7096 default: gcc_unreachable ();
7100 /* Emit instruction sequence to compute
7101 reciprocal square root. Use two Newton-Raphson steps
7102 for single precision and three for double precision. */
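/* Each iteration below refines the estimate as x1 = x0 * RSQRTS (src, x0 * x0),
   where the RSQRTS step computes (3 - src * x0 * x0) / 2; this is the
   standard Newton-Raphson update for 1 / sqrt (src). */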
7104 void
7105 aarch64_emit_swrsqrt (rtx dst, rtx src)
7107 machine_mode mode = GET_MODE (src);
7108 gcc_assert (
7109 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7110 || mode == DFmode || mode == V2DFmode);
7112 rtx xsrc = gen_reg_rtx (mode);
7113 emit_move_insn (xsrc, src);
7114 rtx x0 = gen_reg_rtx (mode);
7116 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7118 bool double_mode = (mode == DFmode || mode == V2DFmode);
7120 int iterations = double_mode ? 3 : 2;
7122 if (flag_mrecip_low_precision_sqrt)
7123 iterations--;
7125 for (int i = 0; i < iterations; ++i)
7127 rtx x1 = gen_reg_rtx (mode);
7128 rtx x2 = gen_reg_rtx (mode);
7129 rtx x3 = gen_reg_rtx (mode);
7130 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7132 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7134 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7135 x0 = x1;
7138 emit_move_insn (dst, x0);
7141 /* Return the number of instructions that can be issued per cycle. */
7142 static int
7143 aarch64_sched_issue_rate (void)
7145 return aarch64_tune_params.issue_rate;
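/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD. Use the
   issue rate as the lookahead depth on multi-issue cores, and disable
   lookahead when scheduling for fusion. */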
7148 static int
7149 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7151 int issue_rate = aarch64_sched_issue_rate ();
7153 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7157 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7158 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7159 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7161 static int
7162 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7163 int ready_index)
7165 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7169 /* Vectorizer cost model target hooks. */
7171 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7172 static int
7173 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7174 tree vectype,
7175 int misalign ATTRIBUTE_UNUSED)
7177 unsigned elements;
7179 switch (type_of_cost)
7181 case scalar_stmt:
7182 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7184 case scalar_load:
7185 return aarch64_tune_params.vec_costs->scalar_load_cost;
7187 case scalar_store:
7188 return aarch64_tune_params.vec_costs->scalar_store_cost;
7190 case vector_stmt:
7191 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7193 case vector_load:
7194 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7196 case vector_store:
7197 return aarch64_tune_params.vec_costs->vec_store_cost;
7199 case vec_to_scalar:
7200 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7202 case scalar_to_vec:
7203 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7205 case unaligned_load:
7206 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7208 case unaligned_store:
7209 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7211 case cond_branch_taken:
7212 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7214 case cond_branch_not_taken:
7215 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7217 case vec_perm:
7218 case vec_promote_demote:
7219 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7221 case vec_construct:
7222 elements = TYPE_VECTOR_SUBPARTS (vectype);
7223 return elements / 2 + 1;
7225 default:
7226 gcc_unreachable ();
7230 /* Implement targetm.vectorize.add_stmt_cost. */
7231 static unsigned
7232 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7233 struct _stmt_vec_info *stmt_info, int misalign,
7234 enum vect_cost_model_location where)
7236 unsigned *cost = (unsigned *) data;
7237 unsigned retval = 0;
7239 if (flag_vect_cost_model)
7241 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7242 int stmt_cost =
7243 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7245 /* Statements in an inner loop relative to the loop being
7246 vectorized are weighted more heavily. The value here is
7247 arbitrary and could potentially be improved with analysis. */
7248 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7249 count *= 50; /* FIXME */
7251 retval = (unsigned) (count * stmt_cost);
7252 cost[where] += retval;
7255 return retval;
7258 static void initialize_aarch64_code_model (struct gcc_options *);
7260 /* Enum describing the various ways that the
7261 aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7262 This way their callers can choose what kind of error to give. */
7264 enum aarch64_parse_opt_result
7266 AARCH64_PARSE_OK, /* Parsing was successful. */
7267 AARCH64_PARSE_MISSING_ARG, /* Missing argument. */
7268 AARCH64_PARSE_INVALID_FEATURE, /* Invalid feature modifier. */
7269 AARCH64_PARSE_INVALID_ARG /* Invalid arch, tune, cpu arg. */
7272 /* Parse the architecture extension string STR and update ISA_FLAGS
7273 with the architecture features turned on or off. Return a
7274 aarch64_parse_opt_result describing the result. */
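/* For example, parsing "+crc+nofp" first turns on the features implied by
   "crc" and then turns off those implied by "fp". */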
7276 static enum aarch64_parse_opt_result
7277 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7279 /* The extension string is parsed left to right. */
7280 const struct aarch64_option_extension *opt = NULL;
7282 /* Flag to say whether we are adding or removing an extension. */
7283 int adding_ext = -1;
7285 while (str != NULL && *str != 0)
7287 char *ext;
7288 size_t len;
7290 str++;
7291 ext = strchr (str, '+');
7293 if (ext != NULL)
7294 len = ext - str;
7295 else
7296 len = strlen (str);
7298 if (len >= 2 && strncmp (str, "no", 2) == 0)
7300 adding_ext = 0;
7301 len -= 2;
7302 str += 2;
7304 else if (len > 0)
7305 adding_ext = 1;
7307 if (len == 0)
7308 return AARCH64_PARSE_MISSING_ARG;
7311 /* Scan over the extensions table trying to find an exact match. */
7312 for (opt = all_extensions; opt->name != NULL; opt++)
7314 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7316 /* Add or remove the extension. */
7317 if (adding_ext)
7318 *isa_flags |= opt->flags_on;
7319 else
7320 *isa_flags &= ~(opt->flags_off);
7321 break;
7325 if (opt->name == NULL)
7327 /* Extension not found in list. */
7328 return AARCH64_PARSE_INVALID_FEATURE;
7331 str = ext;
7334 return AARCH64_PARSE_OK;
7337 /* Parse the TO_PARSE string and put the architecture struct that it
7338 selects into RES and the architectural features into ISA_FLAGS.
7339 Return an aarch64_parse_opt_result describing the parse result.
7340 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7342 static enum aarch64_parse_opt_result
7343 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7344 unsigned long *isa_flags)
7346 char *ext;
7347 const struct processor *arch;
7348 char *str = (char *) alloca (strlen (to_parse) + 1);
7349 size_t len;
7351 strcpy (str, to_parse);
7353 ext = strchr (str, '+');
7355 if (ext != NULL)
7356 len = ext - str;
7357 else
7358 len = strlen (str);
7360 if (len == 0)
7361 return AARCH64_PARSE_MISSING_ARG;
7364 /* Loop through the list of supported ARCHes to find a match. */
7365 for (arch = all_architectures; arch->name != NULL; arch++)
7367 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7369 unsigned long isa_temp = arch->flags;
7371 if (ext != NULL)
7373 /* TO_PARSE string contains at least one extension. */
7374 enum aarch64_parse_opt_result ext_res
7375 = aarch64_parse_extension (ext, &isa_temp);
7377 if (ext_res != AARCH64_PARSE_OK)
7378 return ext_res;
7380 /* Extension parsing was successful. Confirm the result
7381 arch and ISA flags. */
7382 *res = arch;
7383 *isa_flags = isa_temp;
7384 return AARCH64_PARSE_OK;
7388 /* ARCH name not found in list. */
7389 return AARCH64_PARSE_INVALID_ARG;
7392 /* Parse the TO_PARSE string and put the result tuning in RES and the
7393 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7394 describing the parse result. If there is an error parsing, RES and
7395 ISA_FLAGS are left unchanged. */
7397 static enum aarch64_parse_opt_result
7398 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7399 unsigned long *isa_flags)
7401 char *ext;
7402 const struct processor *cpu;
7403 char *str = (char *) alloca (strlen (to_parse) + 1);
7404 size_t len;
7406 strcpy (str, to_parse);
7408 ext = strchr (str, '+');
7410 if (ext != NULL)
7411 len = ext - str;
7412 else
7413 len = strlen (str);
7415 if (len == 0)
7416 return AARCH64_PARSE_MISSING_ARG;
7419 /* Loop through the list of supported CPUs to find a match. */
7420 for (cpu = all_cores; cpu->name != NULL; cpu++)
7422 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7424 unsigned long isa_temp = cpu->flags;
7427 if (ext != NULL)
7429 /* TO_PARSE string contains at least one extension. */
7430 enum aarch64_parse_opt_result ext_res
7431 = aarch64_parse_extension (ext, &isa_temp);
7433 if (ext_res != AARCH64_PARSE_OK)
7434 return ext_res;
7436 /* Extension parsing was successful. Confirm the result
7437 cpu and ISA flags. */
7438 *res = cpu;
7439 *isa_flags = isa_temp;
7440 return AARCH64_PARSE_OK;
7444 /* CPU name not found in list. */
7445 return AARCH64_PARSE_INVALID_ARG;
7448 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7449 Return an aarch64_parse_opt_result describing the parse result.
7450 If the parsing fails, RES does not change. */
7452 static enum aarch64_parse_opt_result
7453 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7455 const struct processor *cpu;
7456 char *str = (char *) alloca (strlen (to_parse) + 1);
7458 strcpy (str, to_parse);
7460 /* Loop through the list of supported CPUs to find a match. */
7461 for (cpu = all_cores; cpu->name != NULL; cpu++)
7463 if (strcmp (cpu->name, str) == 0)
7465 *res = cpu;
7466 return AARCH64_PARSE_OK;
7470 /* CPU name not found in list. */
7471 return AARCH64_PARSE_INVALID_ARG;
7474 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7475 described in FLAG. If it is, return the index bit for that fusion type.
7476 If not, error (printing OPTION_NAME) and return zero. */
7478 static unsigned int
7479 aarch64_parse_one_option_token (const char *token,
7480 size_t length,
7481 const struct aarch64_flag_desc *flag,
7482 const char *option_name)
7484 for (; flag->name != NULL; flag++)
7486 if (length == strlen (flag->name)
7487 && !strncmp (flag->name, token, length))
7488 return flag->flag;
7491 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7492 return 0;
7495 /* Parse OPTION which is a comma-separated list of flags to enable.
7496 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7497 default state we inherit from the CPU tuning structures. OPTION_NAME
7498 gives the top-level option we are parsing in the -moverride string,
7499 for use in error messages. */
7501 static unsigned int
7502 aarch64_parse_boolean_options (const char *option,
7503 const struct aarch64_flag_desc *flags,
7504 unsigned int initial_state,
7505 const char *option_name)
7507 const char separator = '.';
7508 const char* specs = option;
7509 const char* ntoken = option;
7510 unsigned int found_flags = initial_state;
7512 while ((ntoken = strchr (specs, separator)))
7514 size_t token_length = ntoken - specs;
7515 unsigned token_ops = aarch64_parse_one_option_token (specs,
7516 token_length,
7517 flags,
7518 option_name);
7519 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7520 in the token stream, reset the supported operations. So:
7522 adrp+add.cmp+branch.none.adrp+add
7524 would have the result of turning on only adrp+add fusion. */
7525 if (!token_ops)
7526 found_flags = 0;
7528 found_flags |= token_ops;
7529 specs = ++ntoken;
7532 /* The string ended with a trailing separator; diagnose it as ill-formed. */
7533 if (!(*specs))
7535 error ("%s string ill-formed\n", option_name);
7536 return 0;
7539 /* We still have one more token to parse. */
7540 size_t token_length = strlen (specs);
7541 unsigned token_ops = aarch64_parse_one_option_token (specs,
7542 token_length,
7543 flags,
7544 option_name);
7545 if (!token_ops)
7546 found_flags = 0;
7548 found_flags |= token_ops;
7549 return found_flags;
7552 /* Support for overriding instruction fusion. */
7554 static void
7555 aarch64_parse_fuse_string (const char *fuse_string,
7556 struct tune_params *tune)
7558 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7559 aarch64_fusible_pairs,
7560 tune->fusible_ops,
7561 "fuse=");
7564 /* Support for overriding other tuning flags. */
7566 static void
7567 aarch64_parse_tune_string (const char *tune_string,
7568 struct tune_params *tune)
7570 tune->extra_tuning_flags
7571 = aarch64_parse_boolean_options (tune_string,
7572 aarch64_tuning_flags,
7573 tune->extra_tuning_flags,
7574 "tune=");
7577 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7578 we understand. If it is, extract the option string and hand it off to
7579 the appropriate function. */
7581 void
7582 aarch64_parse_one_override_token (const char* token,
7583 size_t length,
7584 struct tune_params *tune)
7586 const struct aarch64_tuning_override_function *fn
7587 = aarch64_tuning_override_functions;
7589 const char *option_part = strchr (token, '=');
7590 if (!option_part)
7592 error ("tuning string missing in option (%s)", token);
7593 return;
7596 /* Get the length of the option name. */
7597 length = option_part - token;
7598 /* Skip the '=' to get to the option string. */
7599 option_part++;
7601 for (; fn->name != NULL; fn++)
7603 if (!strncmp (fn->name, token, length))
7605 fn->parse_override (option_part, tune);
7606 return;
7610 error ("unknown tuning option (%s)",token);
7611 return;
7614 /* Validate and clamp the TLS size to what the selected code model supports. */
7616 static void
7617 initialize_aarch64_tls_size (struct gcc_options *opts)
7619 if (aarch64_tls_size == 0)
7620 aarch64_tls_size = 24;
7622 switch (opts->x_aarch64_cmodel_var)
7624 case AARCH64_CMODEL_TINY:
7625 /* Both the default and maximum TLS size allowed under tiny is 1M which
7626 needs two instructions to address, so we clamp the size to 24. */
7627 if (aarch64_tls_size > 24)
7628 aarch64_tls_size = 24;
7629 break;
7630 case AARCH64_CMODEL_SMALL:
7631 /* The maximum TLS size allowed under small is 4G. */
7632 if (aarch64_tls_size > 32)
7633 aarch64_tls_size = 32;
7634 break;
7635 case AARCH64_CMODEL_LARGE:
7636 /* The maximum TLS size allowed under large is 16E.
7637 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
7638 if (aarch64_tls_size > 48)
7639 aarch64_tls_size = 48;
7640 break;
7641 default:
7642 gcc_unreachable ();
7645 return;
7648 /* Parse STRING looking for options in the format:
7649 string :: option:string
7650 option :: name=substring
7651 name :: {a-z}
7652 substring :: defined by option. */
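/* For example, "-moverride=fuse=adrp+add.cmp+branch" enables the two named
   fusion pairs; several options can be chained with ':', and the boolean
   sub-options within one option are separated by '.'. */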
7654 static void
7655 aarch64_parse_override_string (const char* input_string,
7656 struct tune_params* tune)
7658 const char separator = ':';
7659 size_t string_length = strlen (input_string) + 1;
7660 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7661 char *string = string_root;
7662 strncpy (string, input_string, string_length);
7663 string[string_length - 1] = '\0';
7665 char* ntoken = string;
7667 while ((ntoken = strchr (string, separator)))
7669 size_t token_length = ntoken - string;
7670 /* Make this substring look like a string. */
7671 *ntoken = '\0';
7672 aarch64_parse_one_override_token (string, token_length, tune);
7673 string = ++ntoken;
7676 /* One last option to parse. */
7677 aarch64_parse_one_override_token (string, strlen (string), tune);
7678 free (string_root);
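/* Shared worker for aarch64_override_options_internal and
   aarch64_override_options_after_change: reconcile the frame-pointer flags,
   apply the tuning's default alignments when not optimizing for size, and
   decide whether PC-relative literal loads are used. */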
7682 static void
7683 aarch64_override_options_after_change_1 (struct gcc_options *opts)
7685 if (opts->x_flag_omit_frame_pointer)
7686 opts->x_flag_omit_leaf_frame_pointer = false;
7687 else if (opts->x_flag_omit_leaf_frame_pointer)
7688 opts->x_flag_omit_frame_pointer = true;
7690 /* If not optimizing for size, set the default
7691 alignment to what the target wants. */
7692 if (!opts->x_optimize_size)
7694 if (opts->x_align_loops <= 0)
7695 opts->x_align_loops = aarch64_tune_params.loop_align;
7696 if (opts->x_align_jumps <= 0)
7697 opts->x_align_jumps = aarch64_tune_params.jump_align;
7698 if (opts->x_align_functions <= 0)
7699 opts->x_align_functions = aarch64_tune_params.function_align;
7702 /* If nopcrelative_literal_loads is set on the command line, this
7703 implies that the user asked for PC relative literal loads. */
7704 if (opts->x_nopcrelative_literal_loads == 1)
7705 aarch64_nopcrelative_literal_loads = false;
7707 /* If it is not set on the command line, we default to no
7708 pc relative literal loads. */
7709 if (opts->x_nopcrelative_literal_loads == 2)
7710 aarch64_nopcrelative_literal_loads = true;
7712 /* In the tiny memory model it makes no sense
7713 to disallow non-PC-relative literal pool loads,
7714 as many other things will break anyway. */
7715 if (opts->x_nopcrelative_literal_loads
7716 && (aarch64_cmodel == AARCH64_CMODEL_TINY
7717 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
7718 aarch64_nopcrelative_literal_loads = false;
7721 /* 'Unpack' the internal tuning structs and update the options
7722 in OPTS. The caller must have set up selected_tune and selected_arch
7723 as all the other target-specific codegen decisions are
7724 derived from them. */
7726 void
7727 aarch64_override_options_internal (struct gcc_options *opts)
7729 aarch64_tune_flags = selected_tune->flags;
7730 aarch64_tune = selected_tune->sched_core;
7731 /* Make a copy of the tuning parameters attached to the core, which
7732 we may later overwrite. */
7733 aarch64_tune_params = *(selected_tune->tune);
7734 aarch64_architecture_version = selected_arch->architecture_version;
7736 if (opts->x_aarch64_override_tune_string)
7737 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
7738 &aarch64_tune_params);
7740 /* This target defaults to strict volatile bitfields. */
7741 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7742 opts->x_flag_strict_volatile_bitfields = 1;
7744 /* -mgeneral-regs-only sets a mask in target_flags, make sure that
7745 aarch64_isa_flags does not contain the FP/SIMD/Crypto feature flags
7746 in case some code tries reading aarch64_isa_flags directly to check if
7747 FP is available. Reuse the aarch64_parse_extension machinery since it
7748 knows how to disable any other flags that fp implies. */
7749 if (TARGET_GENERAL_REGS_ONLY_P (opts->x_target_flags))
7751 /* aarch64_parse_extension takes char* rather than const char* because
7752 it is usually called from within other parsing functions. */
7753 char tmp_str[] = "+nofp";
7754 aarch64_parse_extension (tmp_str, &opts->x_aarch64_isa_flags);
7757 initialize_aarch64_code_model (opts);
7758 initialize_aarch64_tls_size (opts);
7760 int queue_depth = 0;
7761 switch (aarch64_tune_params.autoprefetcher_model)
7763 case tune_params::AUTOPREFETCHER_OFF:
7764 queue_depth = -1;
7765 break;
7766 case tune_params::AUTOPREFETCHER_WEAK:
7767 queue_depth = 0;
7768 break;
7769 case tune_params::AUTOPREFETCHER_STRONG:
7770 queue_depth = max_insn_queue_index + 1;
7771 break;
7772 default:
7773 gcc_unreachable ();
7776 /* We don't mind passing in global_options_set here as we don't use
7777 the *options_set structs anyway. */
7778 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
7779 queue_depth,
7780 opts->x_param_values,
7781 global_options_set.x_param_values);
7783 aarch64_override_options_after_change_1 (opts);
7786 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
7787 specified in STR and throw errors if appropriate. Put the results if
7788 they are valid in RES and ISA_FLAGS. Return whether the option is
7789 valid. */
7791 static bool
7792 aarch64_validate_mcpu (const char *str, const struct processor **res,
7793 unsigned long *isa_flags)
7795 enum aarch64_parse_opt_result parse_res
7796 = aarch64_parse_cpu (str, res, isa_flags);
7798 if (parse_res == AARCH64_PARSE_OK)
7799 return true;
7801 switch (parse_res)
7803 case AARCH64_PARSE_MISSING_ARG:
7804 error ("missing cpu name in -mcpu=%qs", str);
7805 break;
7806 case AARCH64_PARSE_INVALID_ARG:
7807 error ("unknown value %qs for -mcpu", str);
7808 break;
7809 case AARCH64_PARSE_INVALID_FEATURE:
7810 error ("invalid feature modifier in -mcpu=%qs", str);
7811 break;
7812 default:
7813 gcc_unreachable ();
7816 return false;
7819 /* Validate a command-line -march option. Parse the arch and extensions
7820 (if any) specified in STR and throw errors if appropriate. Put the
7821 results, if they are valid, in RES and ISA_FLAGS. Return whether the
7822 option is valid. */
7824 static bool
7825 aarch64_validate_march (const char *str, const struct processor **res,
7826 unsigned long *isa_flags)
7828 enum aarch64_parse_opt_result parse_res
7829 = aarch64_parse_arch (str, res, isa_flags);
7831 if (parse_res == AARCH64_PARSE_OK)
7832 return true;
7834 switch (parse_res)
7836 case AARCH64_PARSE_MISSING_ARG:
7837 error ("missing arch name in -march=%qs", str);
7838 break;
7839 case AARCH64_PARSE_INVALID_ARG:
7840 error ("unknown value %qs for -march", str);
7841 break;
7842 case AARCH64_PARSE_INVALID_FEATURE:
7843 error ("invalid feature modifier in -march=%qs", str);
7844 break;
7845 default:
7846 gcc_unreachable ();
7849 return false;
7852 /* Validate a command-line -mtune option. Parse the cpu
7853 specified in STR and throw errors if appropriate. Put the
7854 result, if it is valid, in RES. Return whether the option is
7855 valid. */
7857 static bool
7858 aarch64_validate_mtune (const char *str, const struct processor **res)
7860 enum aarch64_parse_opt_result parse_res
7861 = aarch64_parse_tune (str, res);
7863 if (parse_res == AARCH64_PARSE_OK)
7864 return true;
7866 switch (parse_res)
7868 case AARCH64_PARSE_MISSING_ARG:
7869 error ("missing cpu name in -mtune=%qs", str);
7870 break;
7871 case AARCH64_PARSE_INVALID_ARG:
7872 error ("unknown value %qs for -mtune", str);
7873 break;
7874 default:
7875 gcc_unreachable ();
7877 return false;
7880 /* Return the CPU corresponding to the enum CPU.
7881 If it doesn't specify a cpu, return the default. */
7883 static const struct processor *
7884 aarch64_get_tune_cpu (enum aarch64_processor cpu)
7886 if (cpu != aarch64_none)
7887 return &all_cores[cpu];
7889 /* The & 0x3f is to extract the bottom 6 bits that encode the
7890 default cpu as selected by the --with-cpu GCC configure option
7891 in config.gcc.
7892 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
7893 flags mechanism should be reworked to make it more sane. */
7894 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7897 /* Return the architecture corresponding to the enum ARCH.
7898 If it doesn't specify a valid architecture, return the default. */
7900 static const struct processor *
7901 aarch64_get_arch (enum aarch64_arch arch)
7903 if (arch != aarch64_no_arch)
7904 return &all_architectures[arch];
7906 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7908 return &all_architectures[cpu->arch];
7911 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
7912 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
7913 tuning structs. In particular it must set selected_tune and
7914 aarch64_isa_flags that define the available ISA features and tuning
7915 decisions. It must also set selected_arch as this will be used to
7916 output the .arch asm tags for each function. */
7918 static void
7919 aarch64_override_options (void)
7921 unsigned long cpu_isa = 0;
7922 unsigned long arch_isa = 0;
7923 aarch64_isa_flags = 0;
7925 bool valid_cpu = true;
7926 bool valid_tune = true;
7927 bool valid_arch = true;
7929 selected_cpu = NULL;
7930 selected_arch = NULL;
7931 selected_tune = NULL;
7933 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7934 If either of -march or -mtune is given, they override their
7935 respective component of -mcpu. */
7936 if (aarch64_cpu_string)
7937 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
7938 &cpu_isa);
7940 if (aarch64_arch_string)
7941 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
7942 &arch_isa);
7944 if (aarch64_tune_string)
7945 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
7947 /* If the user did not specify a processor, choose the default
7948 one for them. This will be the CPU set during configuration using
7949 --with-cpu, otherwise it is "generic". */
7950 if (!selected_cpu)
7952 if (selected_arch)
7954 selected_cpu = &all_cores[selected_arch->ident];
7955 aarch64_isa_flags = arch_isa;
7956 explicit_arch = selected_arch->arch;
7958 else
7960 /* Get default configure-time CPU. */
7961 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
7962 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7965 if (selected_tune)
7966 explicit_tune_core = selected_tune->ident;
7968 /* If both -mcpu and -march are specified check that they are architecturally
7969 compatible, warn if they're not and prefer the -march ISA flags. */
7970 else if (selected_arch)
7972 if (selected_arch->arch != selected_cpu->arch)
7974 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7975 all_architectures[selected_cpu->arch].name,
7976 selected_arch->name);
7978 aarch64_isa_flags = arch_isa;
7979 explicit_arch = selected_arch->arch;
7980 explicit_tune_core = selected_tune ? selected_tune->ident
7981 : selected_cpu->ident;
7983 else
7985 /* -mcpu but no -march. */
7986 aarch64_isa_flags = cpu_isa;
7987 explicit_tune_core = selected_tune ? selected_tune->ident
7988 : selected_cpu->ident;
7989 gcc_assert (selected_cpu);
7990 selected_arch = &all_architectures[selected_cpu->arch];
7991 explicit_arch = selected_arch->arch;
7994 /* Set the arch as well, since we will need it when outputting
7995 the .arch directive in assembly. */
7996 if (!selected_arch)
7998 gcc_assert (selected_cpu);
7999 selected_arch = &all_architectures[selected_cpu->arch];
8002 if (!selected_tune)
8003 selected_tune = selected_cpu;
8005 #ifndef HAVE_AS_MABI_OPTION
8006 /* The compiler may have been configured with 2.23.* binutils, which does
8007 not have support for ILP32. */
8008 if (TARGET_ILP32)
8009 error ("Assembler does not support -mabi=ilp32");
8010 #endif
8012 /* Make sure we properly set up the explicit options. */
8013 if ((aarch64_cpu_string && valid_cpu)
8014 || (aarch64_tune_string && valid_tune))
8015 gcc_assert (explicit_tune_core != aarch64_none);
8017 if ((aarch64_cpu_string && valid_cpu)
8018 || (aarch64_arch_string && valid_arch))
8019 gcc_assert (explicit_arch != aarch64_no_arch);
8021 aarch64_override_options_internal (&global_options);
8023 /* Save these options as the default ones in case we push and pop them later
8024 while processing functions with potential target attributes. */
8025 target_option_default_node = target_option_current_node
8026 = build_target_option_node (&global_options);
8028 aarch64_register_fma_steering ();
8032 /* Implement targetm.override_options_after_change. */
8034 static void
8035 aarch64_override_options_after_change (void)
8037 aarch64_override_options_after_change_1 (&global_options);
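/* Allocate and return a zero-initialized machine_function structure for the
   function currently being compiled. */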
8040 static struct machine_function *
8041 aarch64_init_machine_status (void)
8043 struct machine_function *machine;
8044 machine = ggc_cleared_alloc<machine_function> ();
8045 return machine;
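/* Set up per-function expansion state: install aarch64_init_machine_status
   as the machine_function allocator. */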
8048 void
8049 aarch64_init_expanders (void)
8051 init_machine_status = aarch64_init_machine_status;
8054 /* Select the code model to use, switching to the PIC variants of the tiny and small models when -fpic/-fPIC is in effect. */
8055 static void
8056 initialize_aarch64_code_model (struct gcc_options *opts)
8058 if (opts->x_flag_pic)
8060 switch (opts->x_aarch64_cmodel_var)
8062 case AARCH64_CMODEL_TINY:
8063 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8064 break;
8065 case AARCH64_CMODEL_SMALL:
8066 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8067 aarch64_cmodel = (flag_pic == 2
8068 ? AARCH64_CMODEL_SMALL_PIC
8069 : AARCH64_CMODEL_SMALL_SPIC);
8070 #else
8071 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8072 #endif
8073 break;
8074 case AARCH64_CMODEL_LARGE:
8075 sorry ("code model %qs with -f%s", "large",
8076 opts->x_flag_pic > 1 ? "PIC" : "pic");
8077 break;
8078 default:
8079 gcc_unreachable ();
8082 else
8083 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8086 /* Implement TARGET_OPTION_SAVE. */
8088 static void
8089 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8091 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8094 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8095 using the information saved in PTR. */
8097 static void
8098 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8100 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8101 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8102 opts->x_explicit_arch = ptr->x_explicit_arch;
8103 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8104 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8106 aarch64_override_options_internal (opts);
8109 /* Implement TARGET_OPTION_PRINT. */
8111 static void
8112 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8114 const struct processor *cpu
8115 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8116 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8117 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8118 std::string extension
8119 = aarch64_get_extension_string_for_isa_flags (isa_flags);
8121 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8122 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8123 arch->name, extension.c_str ());
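/* The last function whose target options were applied via
   aarch64_set_current_function; cached so that repeated calls for the same
   function do not redo the work. */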
8126 static GTY(()) tree aarch64_previous_fndecl;
8128 void
8129 aarch64_reset_previous_fndecl (void)
8131 aarch64_previous_fndecl = NULL;
8134 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8135 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8136 of the function, if such exists. This function may be called multiple
8137 times on a single function so use aarch64_previous_fndecl to avoid
8138 setting up identical state. */
8140 static void
8141 aarch64_set_current_function (tree fndecl)
8143 tree old_tree = (aarch64_previous_fndecl
8144 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8145 : NULL_TREE);
8147 tree new_tree = (fndecl
8148 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
8149 : NULL_TREE);
8152 if (fndecl && fndecl != aarch64_previous_fndecl)
8154 aarch64_previous_fndecl = fndecl;
8155 if (old_tree == new_tree)
8158 else if (new_tree && new_tree != target_option_default_node)
8160 cl_target_option_restore (&global_options,
8161 TREE_TARGET_OPTION (new_tree));
8162 if (TREE_TARGET_GLOBALS (new_tree))
8163 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8164 else
8165 TREE_TARGET_GLOBALS (new_tree)
8166 = save_target_globals_default_opts ();
8169 else if (old_tree && old_tree != target_option_default_node)
8171 new_tree = target_option_current_node;
8172 cl_target_option_restore (&global_options,
8173 TREE_TARGET_OPTION (new_tree));
8174 if (TREE_TARGET_GLOBALS (new_tree))
8175 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8176 else if (new_tree == target_option_default_node)
8177 restore_target_globals (&default_target_globals);
8178 else
8179 TREE_TARGET_GLOBALS (new_tree)
8180 = save_target_globals_default_opts ();
8184 if (!fndecl)
8185 return;
8187 /* If we turned on SIMD make sure that any vector parameters are re-laid out
8188 so that they use proper vector modes. */
8189 if (TARGET_SIMD)
8191 tree parms = DECL_ARGUMENTS (fndecl);
8192 for (; parms && parms != void_list_node; parms = TREE_CHAIN (parms))
8194 if (TREE_CODE (parms) == PARM_DECL
8195 && VECTOR_TYPE_P (TREE_TYPE (parms))
8196 && DECL_MODE (parms) != TYPE_MODE (TREE_TYPE (parms)))
8197 relayout_decl (parms);
8202 /* Enum describing the various ways we can handle attributes.
8203 In many cases we can reuse the generic option handling machinery. */
8205 enum aarch64_attr_opt_type
8207 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8208 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8209 aarch64_attr_enum, /* Attribute sets an enum variable. */
8210 aarch64_attr_custom /* Attribute requires a custom handling function. */
8213 /* All the information needed to handle a target attribute.
8214 NAME is the name of the attribute.
8215 ATTR_TYPE specifies the type of behaviour of the attribute as described
8216 in the definition of enum aarch64_attr_opt_type.
8217 ALLOW_NEG is true if the attribute supports a "no-" form.
8218 HANDLER is the function that takes the attribute string and whether
8219 it is a pragma or attribute and handles the option. It is needed only
8220 when the ATTR_TYPE is aarch64_attr_custom.
8221 OPT_NUM is the enum specifying the option that the attribute modifies.
8222 This is needed for attributes that mirror the behaviour of a command-line
8223 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8224 aarch64_attr_enum. */
8226 struct aarch64_attribute_info
8228 const char *name;
8229 enum aarch64_attr_opt_type attr_type;
8230 bool allow_neg;
8231 bool (*handler) (const char *, const char *);
8232 enum opt_code opt_num;
8235 /* Handle the ARCH_STR argument to the arch= target attribute.
8236 PRAGMA_OR_ATTR is used in potential error messages. */
8238 static bool
8239 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8241 const struct processor *tmp_arch = NULL;
8242 enum aarch64_parse_opt_result parse_res
8243 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8245 if (parse_res == AARCH64_PARSE_OK)
8247 gcc_assert (tmp_arch);
8248 selected_arch = tmp_arch;
8249 explicit_arch = selected_arch->arch;
8250 return true;
8253 switch (parse_res)
8255 case AARCH64_PARSE_MISSING_ARG:
8256 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8257 break;
8258 case AARCH64_PARSE_INVALID_ARG:
8259 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8260 break;
8261 case AARCH64_PARSE_INVALID_FEATURE:
8262 error ("invalid feature modifier %qs for 'arch' target %s",
8263 str, pragma_or_attr);
8264 break;
8265 default:
8266 gcc_unreachable ();
8269 return false;
8272 /* Handle the argument CPU_STR to the cpu= target attribute.
8273 PRAGMA_OR_ATTR is used in potential error messages. */
8275 static bool
8276 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8278 const struct processor *tmp_cpu = NULL;
8279 enum aarch64_parse_opt_result parse_res
8280 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8282 if (parse_res == AARCH64_PARSE_OK)
8284 gcc_assert (tmp_cpu);
8285 selected_tune = tmp_cpu;
8286 explicit_tune_core = selected_tune->ident;
8288 selected_arch = &all_architectures[tmp_cpu->arch];
8289 explicit_arch = selected_arch->arch;
8290 return true;
8293 switch (parse_res)
8295 case AARCH64_PARSE_MISSING_ARG:
8296 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8297 break;
8298 case AARCH64_PARSE_INVALID_ARG:
8299 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8300 break;
8301 case AARCH64_PARSE_INVALID_FEATURE:
8302 error ("invalid feature modifier %qs for 'cpu' target %s",
8303 str, pragma_or_attr);
8304 break;
8305 default:
8306 gcc_unreachable ();
8309 return false;
8312 /* Handle the argument STR to the tune= target attribute.
8313 PRAGMA_OR_ATTR is used in potential error messages. */
8315 static bool
8316 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8318 const struct processor *tmp_tune = NULL;
8319 enum aarch64_parse_opt_result parse_res
8320 = aarch64_parse_tune (str, &tmp_tune);
8322 if (parse_res == AARCH64_PARSE_OK)
8324 gcc_assert (tmp_tune);
8325 selected_tune = tmp_tune;
8326 explicit_tune_core = selected_tune->ident;
8327 return true;
8330 switch (parse_res)
8332 case AARCH64_PARSE_INVALID_ARG:
8333 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8334 break;
8335 default:
8336 gcc_unreachable ();
8339 return false;
8342 /* Parse an architecture extensions target attribute string specified in STR.
8343 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8344 if successful. Update aarch64_isa_flags to reflect the ISA features
8345 modified.
8346 PRAGMA_OR_ATTR is used in potential error messages. */
8348 static bool
8349 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8351 enum aarch64_parse_opt_result parse_res;
8352 unsigned long isa_flags = aarch64_isa_flags;
8354 /* We allow "+nothing" at the beginning to clear out all architectural
8355 features if the user wants to handpick specific features. */
8356 if (strncmp ("+nothing", str, 8) == 0)
8358 isa_flags = 0;
8359 str += 8;
8362 parse_res = aarch64_parse_extension (str, &isa_flags);
8364 if (parse_res == AARCH64_PARSE_OK)
8366 aarch64_isa_flags = isa_flags;
8367 return true;
8370 switch (parse_res)
8372 case AARCH64_PARSE_MISSING_ARG:
8373 error ("missing feature modifier in target %s %qs",
8374 pragma_or_attr, str);
8375 break;
8377 case AARCH64_PARSE_INVALID_FEATURE:
8378 error ("invalid feature modifier in target %s %qs",
8379 pragma_or_attr, str);
8380 break;
8382 default:
8383 gcc_unreachable ();
8386 return false;
8389 /* The target attributes that we support. On top of these we also support just
8390 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8391 handled explicitly in aarch64_process_one_target_attr. */
8393 static const struct aarch64_attribute_info aarch64_attributes[] =
8395 { "general-regs-only", aarch64_attr_mask, false, NULL,
8396 OPT_mgeneral_regs_only },
8397 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8398 OPT_mfix_cortex_a53_835769 },
8399 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8400 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8401 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8402 OPT_momit_leaf_frame_pointer },
8403 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8404 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8405 OPT_march_ },
8406 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8407 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8408 OPT_mtune_ },
8409 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8412 /* Parse ARG_STR which contains the definition of one target attribute.
8413 Show appropriate errors if any or return true if the attribute is valid.
8414 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8415 we're processing a target attribute or pragma. */
8417 static bool
8418 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8420 bool invert = false;
8422 size_t len = strlen (arg_str);
8424 if (len == 0)
8426 error ("malformed target %s", pragma_or_attr);
8427 return false;
8430 char *str_to_check = (char *) alloca (len + 1);
8431 strcpy (str_to_check, arg_str);
8433 /* Skip leading whitespace. */
8434 while (*str_to_check == ' ' || *str_to_check == '\t')
8435 str_to_check++;
8437 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8438 It is easier to detect and handle it explicitly here rather than going
8439 through the machinery for the rest of the target attributes in this
8440 function. */
8441 if (*str_to_check == '+')
8442 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8444 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8446 invert = true;
8447 str_to_check += 3;
8449 char *arg = strchr (str_to_check, '=');
8451 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8452 and point ARG to "foo". */
8453 if (arg)
8455 *arg = '\0';
8456 arg++;
8458 const struct aarch64_attribute_info *p_attr;
8459 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8461 /* If the names don't match up, or the user has given an argument
8462 to an attribute that doesn't accept one, or didn't give an argument
8463 to an attribute that expects one, fail to match. */
8464 if (strcmp (str_to_check, p_attr->name) != 0)
8465 continue;
8467 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8468 || p_attr->attr_type == aarch64_attr_enum;
8470 if (attr_need_arg_p ^ (arg != NULL))
8472 error ("target %s %qs does not accept an argument",
8473 pragma_or_attr, str_to_check);
8474 return false;
8477 /* If the name matches but the attribute does not allow "no-" versions
8478 then we can't match. */
8479 if (invert && !p_attr->allow_neg)
8481 error ("target %s %qs does not allow a negated form",
8482 pragma_or_attr, str_to_check);
8483 return false;
8486 switch (p_attr->attr_type)
8488 /* Has a custom handler registered.
8489 For example, cpu=, arch=, tune=. */
8490 case aarch64_attr_custom:
8491 gcc_assert (p_attr->handler);
8492 if (!p_attr->handler (arg, pragma_or_attr))
8493 return false;
8494 break;
8496 /* Either set or unset a boolean option. */
8497 case aarch64_attr_bool:
8499 struct cl_decoded_option decoded;
8501 generate_option (p_attr->opt_num, NULL, !invert,
8502 CL_TARGET, &decoded);
8503 aarch64_handle_option (&global_options, &global_options_set,
8504 &decoded, input_location);
8505 break;
8507 /* Set or unset a bit in the target_flags. aarch64_handle_option
8508 should know what mask to apply given the option number. */
8509 case aarch64_attr_mask:
8511 struct cl_decoded_option decoded;
8512 /* We only need to specify the option number.
8513 aarch64_handle_option will know which mask to apply. */
8514 decoded.opt_index = p_attr->opt_num;
8515 decoded.value = !invert;
8516 aarch64_handle_option (&global_options, &global_options_set,
8517 &decoded, input_location);
8518 break;
8520 /* Use the option setting machinery to set an option to an enum. */
8521 case aarch64_attr_enum:
8523 gcc_assert (arg);
8524 bool valid;
8525 int value;
8526 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8527 &value, CL_TARGET);
8528 if (valid)
8530 set_option (&global_options, NULL, p_attr->opt_num, value,
8531 NULL, DK_UNSPECIFIED, input_location,
8532 global_dc);
8534 else
8536 error ("target %s %s=%s is not valid",
8537 pragma_or_attr, str_to_check, arg);
8539 break;
8541 default:
8542 gcc_unreachable ();
8546 return true;
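/* Illustrative decomposition of single attribute tokens by the routine
   above, using names from the aarch64_attributes table:

     "arch=armv8-a+crc"           -> name "arch", ARG "armv8-a+crc"  (custom)
     "no-omit-leaf-frame-pointer" -> name "omit-leaf-frame-pointer",
                                     INVERT set                      (bool)
     "cmodel=small"               -> name "cmodel", ARG "small"      (enum)
     "+fp+nosimd"                 -> handed directly to
                                     aarch64_handle_attr_isa_flags.  */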
8549 /* Count how many times the character C appears in
8550 NULL-terminated string STR. */
8552 static unsigned int
8553 num_occurences_in_str (char c, char *str)
8555 unsigned int res = 0;
8556 while (*str != '\0')
8558 if (*str == c)
8559 res++;
8561 str++;
8564 return res;
8567 /* Parse the tree in ARGS that contains the target attribute information
8568 and update the global target options space. PRAGMA_OR_ATTR is a string
8569 to be used in error messages, specifying whether this is processing
8570 a target attribute or a target pragma. */
8572 bool
8573 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
8575 if (TREE_CODE (args) == TREE_LIST)
8579 tree head = TREE_VALUE (args);
8580 if (head)
8582 if (!aarch64_process_target_attr (head, pragma_or_attr))
8583 return false;
8585 args = TREE_CHAIN (args);
8586 } while (args);
8588 return true;
8590 /* We expect to find a string to parse. */
8591 gcc_assert (TREE_CODE (args) == STRING_CST);
8593 size_t len = strlen (TREE_STRING_POINTER (args));
8594 char *str_to_check = (char *) alloca (len + 1);
8595 strcpy (str_to_check, TREE_STRING_POINTER (args));
8597 if (len == 0)
8599 error ("malformed target %s value", pragma_or_attr);
8600 return false;
8603 /* Used to catch empty attribute strings between commas, i.e.
8604 attribute ((target ("attr1,,attr2"))). */
8605 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
8607 /* Handle multiple target attributes separated by ','. */
8608 char *token = strtok (str_to_check, ",");
8610 unsigned int num_attrs = 0;
8611 while (token)
8613 num_attrs++;
8614 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
8616 error ("target %s %qs is invalid", pragma_or_attr, token);
8617 return false;
8620 token = strtok (NULL, ",");
8623 if (num_attrs != num_commas + 1)
8625 error ("malformed target %s list %qs",
8626 pragma_or_attr, TREE_STRING_POINTER (args));
8627 return false;
8630 return true;
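/* Illustrative behaviour of the comma-count check above:

     "arch=armv8-a,strict-align"   1 comma, strtok yields 2 tokens -> accepted
     "arch=armv8-a,,strict-align"  2 commas, but strtok still yields only
                                   2 tokens (adjacent delimiters are merged),
                                   so num_attrs != num_commas + 1 and the
                                   list is rejected as malformed.  */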
8633 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
8634 process attribute ((target ("..."))). */
8636 static bool
8637 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
8639 struct cl_target_option cur_target;
8640 bool ret;
8641 tree old_optimize;
8642 tree new_target, new_optimize;
8643 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8645 /* If what we're processing is the current pragma string then the
8646 target option node is already stored in target_option_current_node
8647 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
8648 having to re-parse the string. This is especially useful to keep
8649 arm_neon.h compile times down since that header contains a lot
8650 of intrinsics enclosed in pragmas. */
8651 if (!existing_target && args == current_target_pragma)
8653 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
8654 return true;
8656 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8658 old_optimize = build_optimization_node (&global_options);
8659 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8661 /* If the function changed the optimization levels as well as setting
8662 target options, start with the optimizations specified. */
8663 if (func_optimize && func_optimize != old_optimize)
8664 cl_optimization_restore (&global_options,
8665 TREE_OPTIMIZATION (func_optimize));
8667 /* Save the current target options to restore at the end. */
8668 cl_target_option_save (&cur_target, &global_options);
8670 /* If fndecl already has some target attributes applied to it, unpack
8671 them so that we add this attribute on top of them, rather than
8672 overwriting them. */
8673 if (existing_target)
8675 struct cl_target_option *existing_options
8676 = TREE_TARGET_OPTION (existing_target);
8678 if (existing_options)
8679 cl_target_option_restore (&global_options, existing_options);
8681 else
8682 cl_target_option_restore (&global_options,
8683 TREE_TARGET_OPTION (target_option_current_node));
8686 ret = aarch64_process_target_attr (args, "attribute");
8688 /* Set up any additional state. */
8689 if (ret)
8691 aarch64_override_options_internal (&global_options);
8692 /* Initialize SIMD builtins if we haven't already.
8693 Set current_target_pragma to NULL for the duration so that
8694 the builtin initialization code doesn't try to tag the functions
8695 being built with the attributes specified by any current pragma, thus
8696 going into an infinite recursion. */
8697 if (TARGET_SIMD)
8699 tree saved_current_target_pragma = current_target_pragma;
8700 current_target_pragma = NULL;
8701 aarch64_init_simd_builtins ();
8702 current_target_pragma = saved_current_target_pragma;
8704 new_target = build_target_option_node (&global_options);
8706 else
8707 new_target = NULL;
8709 new_optimize = build_optimization_node (&global_options);
8711 if (fndecl && ret)
8713 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
8715 if (old_optimize != new_optimize)
8716 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
8719 cl_target_option_restore (&global_options, &cur_target);
8721 if (old_optimize != new_optimize)
8722 cl_optimization_restore (&global_options,
8723 TREE_OPTIMIZATION (old_optimize));
8724 return ret;
8727 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
8728 tri-bool options (yes, no, don't care) and the default value is
8729 DEF, determine whether to reject inlining. */
8731 static bool
8732 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
8733 int dont_care, int def)
8735 /* If the callee doesn't care, always allow inlining. */
8736 if (callee == dont_care)
8737 return true;
8739 /* If the caller doesn't care, always allow inlining. */
8740 if (caller == dont_care)
8741 return true;
8743 /* Otherwise, allow inlining if either the callee and caller values
8744 agree, or if the callee is using the default value. */
8745 return (callee == caller || callee == def);
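/* Illustrative truth table for the rule above, with DONT_CARE == 2 and
   DEF == 1 as used for -momit-leaf-frame-pointer further down:

     caller  callee   inline?
       2      any       yes   (caller doesn't care)
      any      2        yes   (callee doesn't care)
       0       0        yes   (explicit values agree)
       0       1        yes   (callee uses the default)
       1       0        no    (explicit values disagree, callee non-default)  */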
8748 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
8749 to inline CALLEE into CALLER based on target-specific info.
8750 Make sure that the caller and callee have compatible architectural
8751 features. Then go through the other possible target attributes
8752 and see if they can block inlining. Try not to reject always_inline
8753 callees unless they are incompatible architecturally. */
8755 static bool
8756 aarch64_can_inline_p (tree caller, tree callee)
8758 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
8759 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
8761 /* If callee has no option attributes, then it is ok to inline. */
8762 if (!callee_tree)
8763 return true;
8765 struct cl_target_option *caller_opts
8766 = TREE_TARGET_OPTION (caller_tree ? caller_tree
8767 : target_option_default_node);
8769 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
8772 /* Callee's ISA flags should be a subset of the caller's. */
8773 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
8774 != callee_opts->x_aarch64_isa_flags)
8775 return false;
8777 /* Allow a non-strict-aligned function to be inlined into a
8778 strict-aligned one. */
8779 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
8780 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
8781 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
8782 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
8783 return false;
8785 bool always_inline = lookup_attribute ("always_inline",
8786 DECL_ATTRIBUTES (callee));
8788 /* If the architectural features match up and the callee is always_inline
8789 then the other attributes don't matter. */
8790 if (always_inline)
8791 return true;
8793 if (caller_opts->x_aarch64_cmodel_var
8794 != callee_opts->x_aarch64_cmodel_var)
8795 return false;
8797 if (caller_opts->x_aarch64_tls_dialect
8798 != callee_opts->x_aarch64_tls_dialect)
8799 return false;
8801 /* Honour explicit requests to work around errata. */
8802 if (!aarch64_tribools_ok_for_inlining_p (
8803 caller_opts->x_aarch64_fix_a53_err835769,
8804 callee_opts->x_aarch64_fix_a53_err835769,
8805 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
8806 return false;
8808 /* If the user explicitly specified -momit-leaf-frame-pointer for the
8809 caller and callee and they don't match up, reject inlining. */
8810 if (!aarch64_tribools_ok_for_inlining_p (
8811 caller_opts->x_flag_omit_leaf_frame_pointer,
8812 callee_opts->x_flag_omit_leaf_frame_pointer,
8813 2, 1))
8814 return false;
8816 /* If the callee has specific tuning overrides, respect them. */
8817 if (callee_opts->x_aarch64_override_tune_string != NULL
8818 && caller_opts->x_aarch64_override_tune_string == NULL)
8819 return false;
8821 /* If the user specified tuning override strings for the
8822 caller and callee and they don't match up, reject inlining.
8823 We just do a string compare here, we don't analyze the meaning
8824 of the string, as it would be too costly for little gain. */
8825 if (callee_opts->x_aarch64_override_tune_string
8826 && caller_opts->x_aarch64_override_tune_string
8827 && (strcmp (callee_opts->x_aarch64_override_tune_string,
8828 caller_opts->x_aarch64_override_tune_string) != 0))
8829 return false;
8831 return true;
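/* Illustrative application of the ISA-subset rule above, assuming the usual
   "+feature" modifier names:

     caller: __attribute__ ((target ("+simd+crc")))
     callee: __attribute__ ((target ("+simd")))         -> may be inlined
     callee: __attribute__ ((target ("+simd+crypto")))  -> rejected; the
             callee's flags include CRYPTO, which is not a subset of the
             caller's flags.  */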
8834 /* Return true if SYMBOL_REF X binds locally. */
8836 static bool
8837 aarch64_symbol_binds_local_p (const_rtx x)
8839 return (SYMBOL_REF_DECL (x)
8840 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
8841 : SYMBOL_REF_LOCAL_P (x));
8844 /* Return true if SYMBOL_REF X is thread-local. */
8845 static bool
8846 aarch64_tls_symbol_p (rtx x)
8848 if (! TARGET_HAVE_TLS)
8849 return false;
8851 if (GET_CODE (x) != SYMBOL_REF)
8852 return false;
8854 return SYMBOL_REF_TLS_MODEL (x) != 0;
8857 /* Classify a TLS symbol into one of the TLS kinds. */
8858 enum aarch64_symbol_type
8859 aarch64_classify_tls_symbol (rtx x)
8861 enum tls_model tls_kind = tls_symbolic_operand_type (x);
8863 switch (tls_kind)
8865 case TLS_MODEL_GLOBAL_DYNAMIC:
8866 case TLS_MODEL_LOCAL_DYNAMIC:
8867 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
8869 case TLS_MODEL_INITIAL_EXEC:
8870 switch (aarch64_cmodel)
8872 case AARCH64_CMODEL_TINY:
8873 case AARCH64_CMODEL_TINY_PIC:
8874 return SYMBOL_TINY_TLSIE;
8875 default:
8876 return SYMBOL_SMALL_TLSIE;
8879 case TLS_MODEL_LOCAL_EXEC:
8880 if (aarch64_tls_size == 12)
8881 return SYMBOL_TLSLE12;
8882 else if (aarch64_tls_size == 24)
8883 return SYMBOL_TLSLE24;
8884 else if (aarch64_tls_size == 32)
8885 return SYMBOL_TLSLE32;
8886 else if (aarch64_tls_size == 48)
8887 return SYMBOL_TLSLE48;
8888 else
8889 gcc_unreachable ();
8891 case TLS_MODEL_EMULATED:
8892 case TLS_MODEL_NONE:
8893 return SYMBOL_FORCE_TO_MEM;
8895 default:
8896 gcc_unreachable ();
8900 /* Return the method that should be used to access SYMBOL_REF or
8901 LABEL_REF X. */
8903 enum aarch64_symbol_type
8904 aarch64_classify_symbol (rtx x, rtx offset)
8906 if (GET_CODE (x) == LABEL_REF)
8908 switch (aarch64_cmodel)
8910 case AARCH64_CMODEL_LARGE:
8911 return SYMBOL_FORCE_TO_MEM;
8913 case AARCH64_CMODEL_TINY_PIC:
8914 case AARCH64_CMODEL_TINY:
8915 return SYMBOL_TINY_ABSOLUTE;
8917 case AARCH64_CMODEL_SMALL_SPIC:
8918 case AARCH64_CMODEL_SMALL_PIC:
8919 case AARCH64_CMODEL_SMALL:
8920 return SYMBOL_SMALL_ABSOLUTE;
8922 default:
8923 gcc_unreachable ();
8927 if (GET_CODE (x) == SYMBOL_REF)
8929 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
8931 /* This is alright even in PIC code as the constant
8932 pool reference is always PC relative and within
8933 the same translation unit. */
8934 if (nopcrelative_literal_loads
8935 && CONSTANT_POOL_ADDRESS_P (x))
8936 return SYMBOL_SMALL_ABSOLUTE;
8937 else
8938 return SYMBOL_FORCE_TO_MEM;
8941 if (aarch64_tls_symbol_p (x))
8942 return aarch64_classify_tls_symbol (x);
8944 switch (aarch64_cmodel)
8946 case AARCH64_CMODEL_TINY:
8947 /* When we retrieve a symbol + offset address, we have to make sure
8948 the offset does not cause overflow of the final address. But
8949 we have no way of knowing the address of the symbol at compile time,
8950 so we can't accurately say whether the distance between the PC and
8951 symbol + offset is outside the addressable range of +/-1M in the
8952 TINY code model. So we rely on images not being greater than
8953 1M, cap the offset at 1M, and load anything beyond that using an
8954 alternative mechanism. */
8955 if (SYMBOL_REF_WEAK (x)
8956 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
8957 return SYMBOL_FORCE_TO_MEM;
8958 return SYMBOL_TINY_ABSOLUTE;
8960 case AARCH64_CMODEL_SMALL:
8961 /* Same reasoning as the tiny code model, but the offset cap here is
8962 4G. */
8963 if (SYMBOL_REF_WEAK (x)
8964 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
8965 HOST_WIDE_INT_C (4294967264)))
8966 return SYMBOL_FORCE_TO_MEM;
8967 return SYMBOL_SMALL_ABSOLUTE;
8969 case AARCH64_CMODEL_TINY_PIC:
8970 if (!aarch64_symbol_binds_local_p (x))
8971 return SYMBOL_TINY_GOT;
8972 return SYMBOL_TINY_ABSOLUTE;
8974 case AARCH64_CMODEL_SMALL_SPIC:
8975 case AARCH64_CMODEL_SMALL_PIC:
8976 if (!aarch64_symbol_binds_local_p (x))
8977 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
8978 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
8979 return SYMBOL_SMALL_ABSOLUTE;
8981 default:
8982 gcc_unreachable ();
8986 /* By default push everything into the constant pool. */
8987 return SYMBOL_FORCE_TO_MEM;
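/* Illustrative classifications from the function above under the default
   small code model (-mcmodel=small), assuming a zero offset:

     int x;                        -> SYMBOL_SMALL_ABSOLUTE (adrp/add pair)
     extern int y;  (with -fPIC,   -> SYMBOL_SMALL_GOT_4G when the symbol
                     non-local)       does not bind locally
     __thread int t;               -> one of the TLS kinds, via
                                      aarch64_classify_tls_symbol
     __attribute__((weak)) int w;  -> SYMBOL_FORCE_TO_MEM, since a weak
                                      symbol's address cannot be range-checked
                                      at compile time.  */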
8990 bool
8991 aarch64_constant_address_p (rtx x)
8993 return (CONSTANT_P (x) && memory_address_p (DImode, x));
8996 bool
8997 aarch64_legitimate_pic_operand_p (rtx x)
8999 if (GET_CODE (x) == SYMBOL_REF
9000 || (GET_CODE (x) == CONST
9001 && GET_CODE (XEXP (x, 0)) == PLUS
9002 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9003 return false;
9005 return true;
9008 /* Return true if X holds either a quarter-precision floating-point
9009 constant or the floating-point constant +0.0. */
9010 static bool
9011 aarch64_valid_floating_const (machine_mode mode, rtx x)
9013 if (!CONST_DOUBLE_P (x))
9014 return false;
9016 if (aarch64_float_const_zero_rtx_p (x))
9017 return true;
9019 /* We only handle moving 0.0 to a TFmode register. */
9020 if (!(mode == SFmode || mode == DFmode))
9021 return false;
9023 return aarch64_float_const_representable_p (x);
9026 static bool
9027 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9029 /* Do not allow vector struct mode constants. We could support
9030 0 and -1 easily, but they need support in aarch64-simd.md. */
9031 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9032 return false;
9034 /* This could probably go away because
9035 we now decompose CONST_INTs according to expand_mov_immediate. */
9036 if ((GET_CODE (x) == CONST_VECTOR
9037 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9038 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9039 return !targetm.cannot_force_const_mem (mode, x);
9041 if (GET_CODE (x) == HIGH
9042 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9043 return true;
9045 return aarch64_constant_address_p (x);
9049 aarch64_load_tp (rtx target)
9051 if (!target
9052 || GET_MODE (target) != Pmode
9053 || !register_operand (target, Pmode))
9054 target = gen_reg_rtx (Pmode);
9056 /* Can return in any reg. */
9057 emit_insn (gen_aarch64_load_tp_hard (target));
9058 return target;
9061 /* On AAPCS systems, this is the "struct __va_list". */
9062 static GTY(()) tree va_list_type;
9064 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9065 Return the type to use as __builtin_va_list.
9067 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9069 struct __va_list
9071 void *__stack;
9072 void *__gr_top;
9073 void *__vr_top;
9074 int __gr_offs;
9075 int __vr_offs;
9076 }; */
9078 static tree
9079 aarch64_build_builtin_va_list (void)
9081 tree va_list_name;
9082 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9084 /* Create the type. */
9085 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9086 /* Give it the required name. */
9087 va_list_name = build_decl (BUILTINS_LOCATION,
9088 TYPE_DECL,
9089 get_identifier ("__va_list"),
9090 va_list_type);
9091 DECL_ARTIFICIAL (va_list_name) = 1;
9092 TYPE_NAME (va_list_type) = va_list_name;
9093 TYPE_STUB_DECL (va_list_type) = va_list_name;
9095 /* Create the fields. */
9096 f_stack = build_decl (BUILTINS_LOCATION,
9097 FIELD_DECL, get_identifier ("__stack"),
9098 ptr_type_node);
9099 f_grtop = build_decl (BUILTINS_LOCATION,
9100 FIELD_DECL, get_identifier ("__gr_top"),
9101 ptr_type_node);
9102 f_vrtop = build_decl (BUILTINS_LOCATION,
9103 FIELD_DECL, get_identifier ("__vr_top"),
9104 ptr_type_node);
9105 f_groff = build_decl (BUILTINS_LOCATION,
9106 FIELD_DECL, get_identifier ("__gr_offs"),
9107 integer_type_node);
9108 f_vroff = build_decl (BUILTINS_LOCATION,
9109 FIELD_DECL, get_identifier ("__vr_offs"),
9110 integer_type_node);
9112 DECL_ARTIFICIAL (f_stack) = 1;
9113 DECL_ARTIFICIAL (f_grtop) = 1;
9114 DECL_ARTIFICIAL (f_vrtop) = 1;
9115 DECL_ARTIFICIAL (f_groff) = 1;
9116 DECL_ARTIFICIAL (f_vroff) = 1;
9118 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9119 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9120 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9121 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9122 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9124 TYPE_FIELDS (va_list_type) = f_stack;
9125 DECL_CHAIN (f_stack) = f_grtop;
9126 DECL_CHAIN (f_grtop) = f_vrtop;
9127 DECL_CHAIN (f_vrtop) = f_groff;
9128 DECL_CHAIN (f_groff) = f_vroff;
9130 /* Compute its layout. */
9131 layout_type (va_list_type);
9133 return va_list_type;
9136 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9137 static void
9138 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9140 const CUMULATIVE_ARGS *cum;
9141 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9142 tree stack, grtop, vrtop, groff, vroff;
9143 tree t;
9144 int gr_save_area_size;
9145 int vr_save_area_size;
9146 int vr_offset;
9148 cum = &crtl->args.info;
9149 gr_save_area_size
9150 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
9151 vr_save_area_size
9152 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
9154 if (!TARGET_FLOAT)
9156 gcc_assert (cum->aapcs_nvrn == 0);
9157 vr_save_area_size = 0;
9160 f_stack = TYPE_FIELDS (va_list_type_node);
9161 f_grtop = DECL_CHAIN (f_stack);
9162 f_vrtop = DECL_CHAIN (f_grtop);
9163 f_groff = DECL_CHAIN (f_vrtop);
9164 f_vroff = DECL_CHAIN (f_groff);
9166 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9167 NULL_TREE);
9168 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9169 NULL_TREE);
9170 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9171 NULL_TREE);
9172 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9173 NULL_TREE);
9174 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9175 NULL_TREE);
9177 /* Emit code to initialize STACK, which points to the next varargs stack
9178 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9179 by named arguments. STACK is 8-byte aligned. */
9180 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9181 if (cum->aapcs_stack_size > 0)
9182 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9183 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9184 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9186 /* Emit code to initialize GRTOP, the top of the GR save area.
9187 virtual_incoming_args_rtx should have been 16 byte aligned. */
9188 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9189 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9190 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9192 /* Emit code to initialize VRTOP, the top of the VR save area.
9193 This address is gr_save_area_bytes below GRTOP, rounded
9194 down to the next 16-byte boundary. */
9195 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9196 vr_offset = ROUND_UP (gr_save_area_size,
9197 STACK_BOUNDARY / BITS_PER_UNIT);
9199 if (vr_offset)
9200 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9201 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9204 /* Emit code to initialize GROFF, the offset from GRTOP of the
9205 next GPR argument. */
9206 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9207 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9210 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9211 of the next VR argument. */
9212 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9213 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9214 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
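/* Worked example of the va_list set up above, for a callee

     void f (int fixed, ...);

   where one GP argument register and no FP/SIMD registers are used by named
   arguments (assuming the usual AArch64 values NUM_ARG_REGS == 8,
   NUM_FP_ARG_REGS == 8, UNITS_PER_WORD == 8, UNITS_PER_VREG == 16):

     gr_save_area_size = 7 * 8  = 56   so  __gr_offs = -56
     vr_save_area_size = 8 * 16 = 128  so  __vr_offs = -128
     __gr_top  = incoming argument pointer (top of the GR save area)
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __stack   = first stack-passed anonymous argument, 8-byte aligned.  */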
9217 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9219 static tree
9220 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9221 gimple_seq *post_p ATTRIBUTE_UNUSED)
9223 tree addr;
9224 bool indirect_p;
9225 bool is_ha; /* is HFA or HVA. */
9226 bool dw_align; /* double-word align. */
9227 machine_mode ag_mode = VOIDmode;
9228 int nregs;
9229 machine_mode mode;
9231 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9232 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9233 HOST_WIDE_INT size, rsize, adjust, align;
9234 tree t, u, cond1, cond2;
9236 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9237 if (indirect_p)
9238 type = build_pointer_type (type);
9240 mode = TYPE_MODE (type);
9242 f_stack = TYPE_FIELDS (va_list_type_node);
9243 f_grtop = DECL_CHAIN (f_stack);
9244 f_vrtop = DECL_CHAIN (f_grtop);
9245 f_groff = DECL_CHAIN (f_vrtop);
9246 f_vroff = DECL_CHAIN (f_groff);
9248 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9249 f_stack, NULL_TREE);
9250 size = int_size_in_bytes (type);
9251 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9253 dw_align = false;
9254 adjust = 0;
9255 if (aarch64_vfp_is_call_or_return_candidate (mode,
9256 type,
9257 &ag_mode,
9258 &nregs,
9259 &is_ha))
9261 /* TYPE passed in fp/simd registers. */
9262 if (!TARGET_FLOAT)
9263 aarch64_err_no_fpadvsimd (mode, "varargs");
9265 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9266 unshare_expr (valist), f_vrtop, NULL_TREE);
9267 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9268 unshare_expr (valist), f_vroff, NULL_TREE);
9270 rsize = nregs * UNITS_PER_VREG;
9272 if (is_ha)
9274 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9275 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9277 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9278 && size < UNITS_PER_VREG)
9280 adjust = UNITS_PER_VREG - size;
9283 else
9285 /* TYPE passed in general registers. */
9286 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9287 unshare_expr (valist), f_grtop, NULL_TREE);
9288 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9289 unshare_expr (valist), f_groff, NULL_TREE);
9290 rsize = ROUND_UP (size, UNITS_PER_WORD);
9291 nregs = rsize / UNITS_PER_WORD;
9293 if (align > 8)
9294 dw_align = true;
9296 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9297 && size < UNITS_PER_WORD)
9299 adjust = UNITS_PER_WORD - size;
9303 /* Get a local temporary for the field value. */
9304 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9306 /* Emit code to branch if off >= 0. */
9307 t = build2 (GE_EXPR, boolean_type_node, off,
9308 build_int_cst (TREE_TYPE (off), 0));
9309 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9311 if (dw_align)
9313 /* Emit: offs = (offs + 15) & -16. */
9314 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9315 build_int_cst (TREE_TYPE (off), 15));
9316 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9317 build_int_cst (TREE_TYPE (off), -16));
9318 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9320 else
9321 roundup = NULL;
9323 /* Update ap.__[g|v]r_offs */
9324 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9325 build_int_cst (TREE_TYPE (off), rsize));
9326 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9328 /* String up. */
9329 if (roundup)
9330 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9332 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9333 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9334 build_int_cst (TREE_TYPE (f_off), 0));
9335 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9337 /* String up: make sure the assignment happens before the use. */
9338 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9339 COND_EXPR_ELSE (cond1) = t;
9341 /* Prepare the trees handling the argument that is passed on the stack;
9342 the top-level node will be stored in ON_STACK. */
9343 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9344 if (align > 8)
9346 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9347 t = fold_convert (intDI_type_node, arg);
9348 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9349 build_int_cst (TREE_TYPE (t), 15));
9350 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9351 build_int_cst (TREE_TYPE (t), -16));
9352 t = fold_convert (TREE_TYPE (arg), t);
9353 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9355 else
9356 roundup = NULL;
9357 /* Advance ap.__stack */
9358 t = fold_convert (intDI_type_node, arg);
9359 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9360 build_int_cst (TREE_TYPE (t), size + 7));
9361 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9362 build_int_cst (TREE_TYPE (t), -8));
9363 t = fold_convert (TREE_TYPE (arg), t);
9364 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9365 /* String up roundup and advance. */
9366 if (roundup)
9367 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9368 /* String up with arg */
9369 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9370 /* Big-endianness related address adjustment. */
9371 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9372 && size < UNITS_PER_WORD)
9374 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9375 size_int (UNITS_PER_WORD - size));
9376 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9379 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9380 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9382 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9383 t = off;
9384 if (adjust)
9385 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9386 build_int_cst (TREE_TYPE (off), adjust));
9388 t = fold_convert (sizetype, t);
9389 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9391 if (is_ha)
9393 /* type ha; // treat as "struct {ftype field[n];}"
9394 ... [computing offs]
9395 for (i = 0; i <nregs; ++i, offs += 16)
9396 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9397 return ha; */
9398 int i;
9399 tree tmp_ha, field_t, field_ptr_t;
9401 /* Declare a local variable. */
9402 tmp_ha = create_tmp_var_raw (type, "ha");
9403 gimple_add_tmp_var (tmp_ha);
9405 /* Establish the base type. */
9406 switch (ag_mode)
9408 case SFmode:
9409 field_t = float_type_node;
9410 field_ptr_t = float_ptr_type_node;
9411 break;
9412 case DFmode:
9413 field_t = double_type_node;
9414 field_ptr_t = double_ptr_type_node;
9415 break;
9416 case TFmode:
9417 field_t = long_double_type_node;
9418 field_ptr_t = long_double_ptr_type_node;
9419 break;
9420 /* Half-precision and quad-precision types are not fully supported yet.
9421 Enable the following code once that support is complete; the correct
9422 type node for __fp16 * still needs to be found. */
9423 #if 0
9424 case HFmode:
9425 field_t = float_type_node;
9426 field_ptr_t = float_ptr_type_node;
9427 break;
9428 #endif
9429 case V2SImode:
9430 case V4SImode:
9432 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9433 field_t = build_vector_type_for_mode (innertype, ag_mode);
9434 field_ptr_t = build_pointer_type (field_t);
9436 break;
9437 default:
9438 gcc_assert (0);
9441 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9442 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9443 addr = t;
9444 t = fold_convert (field_ptr_t, addr);
9445 t = build2 (MODIFY_EXPR, field_t,
9446 build1 (INDIRECT_REF, field_t, tmp_ha),
9447 build1 (INDIRECT_REF, field_t, t));
9449 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9450 for (i = 1; i < nregs; ++i)
9452 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9453 u = fold_convert (field_ptr_t, addr);
9454 u = build2 (MODIFY_EXPR, field_t,
9455 build2 (MEM_REF, field_t, tmp_ha,
9456 build_int_cst (field_ptr_t,
9457 (i *
9458 int_size_in_bytes (field_t)))),
9459 build1 (INDIRECT_REF, field_t, u));
9460 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9463 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9464 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9467 COND_EXPR_ELSE (cond2) = t;
9468 addr = fold_convert (build_pointer_type (type), cond1);
9469 addr = build_va_arg_indirect_ref (addr);
9471 if (indirect_p)
9472 addr = build_va_arg_indirect_ref (addr);
9474 return addr;
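/* Worked example for the homogeneous-aggregate path above:

     struct hfa { float a, b, c; };

   aarch64_vfp_is_call_or_return_candidate reports is_ha = true, nregs = 3
   and ag_mode = SFmode, so va_arg reserves 3 * UNITS_PER_VREG = 48 bytes of
   the VR save area and gathers the three single-precision values from
   16-byte-spaced slots into the local temporary "ha" built above.  */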
9477 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9479 static void
9480 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9481 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9482 int no_rtl)
9484 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9485 CUMULATIVE_ARGS local_cum;
9486 int gr_saved, vr_saved;
9488 /* The caller has advanced CUM up to, but not beyond, the last named
9489 argument. Advance a local copy of CUM past the last "real" named
9490 argument, to find out how many registers are left over. */
9491 local_cum = *cum;
9492 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9494 /* Find out how many registers we need to save. */
9495 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9496 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9498 if (!TARGET_FLOAT)
9500 gcc_assert (local_cum.aapcs_nvrn == 0);
9501 vr_saved = 0;
9504 if (!no_rtl)
9506 if (gr_saved > 0)
9508 rtx ptr, mem;
9510 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9511 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9512 - gr_saved * UNITS_PER_WORD);
9513 mem = gen_frame_mem (BLKmode, ptr);
9514 set_mem_alias_set (mem, get_varargs_alias_set ());
9516 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9517 mem, gr_saved);
9519 if (vr_saved > 0)
9521 /* We can't use move_block_from_reg, because it will use
9522 the wrong mode, storing D regs only. */
9523 machine_mode mode = TImode;
9524 int off, i;
9526 /* Set OFF to the offset from virtual_incoming_args_rtx of
9527 the first vector register. The VR save area lies below
9528 the GR one, and is aligned to 16 bytes. */
9529 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
9530 STACK_BOUNDARY / BITS_PER_UNIT);
9531 off -= vr_saved * UNITS_PER_VREG;
9533 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
9535 rtx ptr, mem;
9537 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9538 mem = gen_frame_mem (mode, ptr);
9539 set_mem_alias_set (mem, get_varargs_alias_set ());
9540 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
9541 off += UNITS_PER_VREG;
9546 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9547 any complication of having crtl->args.pretend_args_size changed. */
9548 cfun->machine->frame.saved_varargs_size
9549 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
9550 STACK_BOUNDARY / BITS_PER_UNIT)
9551 + vr_saved * UNITS_PER_VREG);
9554 static void
9555 aarch64_conditional_register_usage (void)
9557 int i;
9558 if (!TARGET_FLOAT)
9560 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9562 fixed_regs[i] = 1;
9563 call_used_regs[i] = 1;
9568 /* Walk down the type tree of TYPE counting consecutive base elements.
9569 If *MODEP is VOIDmode, then set it to the first valid floating point
9570 type. If a non-floating point type is found, or if a floating point
9571 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9572 otherwise return the count in the sub-tree. */
9573 static int
9574 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
9576 machine_mode mode;
9577 HOST_WIDE_INT size;
9579 switch (TREE_CODE (type))
9581 case REAL_TYPE:
9582 mode = TYPE_MODE (type);
9583 if (mode != DFmode && mode != SFmode && mode != TFmode)
9584 return -1;
9586 if (*modep == VOIDmode)
9587 *modep = mode;
9589 if (*modep == mode)
9590 return 1;
9592 break;
9594 case COMPLEX_TYPE:
9595 mode = TYPE_MODE (TREE_TYPE (type));
9596 if (mode != DFmode && mode != SFmode && mode != TFmode)
9597 return -1;
9599 if (*modep == VOIDmode)
9600 *modep = mode;
9602 if (*modep == mode)
9603 return 2;
9605 break;
9607 case VECTOR_TYPE:
9608 /* Use V2SImode and V4SImode as representatives of all 64-bit
9609 and 128-bit vector types. */
9610 size = int_size_in_bytes (type);
9611 switch (size)
9613 case 8:
9614 mode = V2SImode;
9615 break;
9616 case 16:
9617 mode = V4SImode;
9618 break;
9619 default:
9620 return -1;
9623 if (*modep == VOIDmode)
9624 *modep = mode;
9626 /* Vector modes are considered to be opaque: two vectors are
9627 equivalent for the purposes of being homogeneous aggregates
9628 if they are the same size. */
9629 if (*modep == mode)
9630 return 1;
9632 break;
9634 case ARRAY_TYPE:
9636 int count;
9637 tree index = TYPE_DOMAIN (type);
9639 /* Can't handle incomplete types nor sizes that are not
9640 fixed. */
9641 if (!COMPLETE_TYPE_P (type)
9642 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9643 return -1;
9645 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
9646 if (count == -1
9647 || !index
9648 || !TYPE_MAX_VALUE (index)
9649 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
9650 || !TYPE_MIN_VALUE (index)
9651 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
9652 || count < 0)
9653 return -1;
9655 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
9656 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
9658 /* There must be no padding. */
9659 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9660 return -1;
9662 return count;
9665 case RECORD_TYPE:
9667 int count = 0;
9668 int sub_count;
9669 tree field;
9671 /* Can't handle incomplete types nor sizes that are not
9672 fixed. */
9673 if (!COMPLETE_TYPE_P (type)
9674 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9675 return -1;
9677 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9679 if (TREE_CODE (field) != FIELD_DECL)
9680 continue;
9682 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9683 if (sub_count < 0)
9684 return -1;
9685 count += sub_count;
9688 /* There must be no padding. */
9689 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9690 return -1;
9692 return count;
9695 case UNION_TYPE:
9696 case QUAL_UNION_TYPE:
9698 /* These aren't very interesting except in a degenerate case. */
9699 int count = 0;
9700 int sub_count;
9701 tree field;
9703 /* Can't handle incomplete types nor sizes that are not
9704 fixed. */
9705 if (!COMPLETE_TYPE_P (type)
9706 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9707 return -1;
9709 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9711 if (TREE_CODE (field) != FIELD_DECL)
9712 continue;
9714 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9715 if (sub_count < 0)
9716 return -1;
9717 count = count > sub_count ? count : sub_count;
9720 /* There must be no padding. */
9721 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9722 return -1;
9724 return count;
9727 default:
9728 break;
9731 return -1;
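/* Illustrative results of the walk above:

     struct { double a, b; }         -> 2, *MODEP = DFmode
     _Complex double                 -> 2, *MODEP = DFmode
     struct { float a[3]; }          -> 3, *MODEP = SFmode
     struct { float a; double b; }   -> -1 (mixed element modes)
     struct { float a; int b; }      -> -1 (non-floating-point member)  */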
9734 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
9735 type as described in AAPCS64 \S 4.1.2.
9737 See the comment above aarch64_composite_type_p for the notes on MODE. */
9739 static bool
9740 aarch64_short_vector_p (const_tree type,
9741 machine_mode mode)
9743 HOST_WIDE_INT size = -1;
9745 if (type && TREE_CODE (type) == VECTOR_TYPE)
9746 size = int_size_in_bytes (type);
9747 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
9748 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
9749 size = GET_MODE_SIZE (mode);
9751 return (size == 8 || size == 16);
9754 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
9755 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
9756 array types. The C99 floating-point complex types are also considered
9757 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
9758 types, which are GCC extensions and out of the scope of AAPCS64, are
9759 treated as composite types here as well.
9761 Note that MODE itself is not sufficient in determining whether a type
9762 is such a composite type or not. This is because
9763 stor-layout.c:compute_record_mode may have already changed the MODE
9764 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
9765 structure with only one field may have its MODE set to the mode of the
9766 field. Also an integer mode whose size matches the size of the
9767 RECORD_TYPE type may be used to substitute the original mode
9768 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
9769 solely relied on. */
9771 static bool
9772 aarch64_composite_type_p (const_tree type,
9773 machine_mode mode)
9775 if (aarch64_short_vector_p (type, mode))
9776 return false;
9778 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
9779 return true;
9781 if (mode == BLKmode
9782 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
9783 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
9784 return true;
9786 return false;
9789 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
9790 shall be passed or returned in simd/fp register(s) (providing these
9791 parameter passing registers are available).
9793 Upon successful return, *COUNT returns the number of needed registers,
9794 *BASE_MODE returns the mode of the individual register and when IS_HA
9795 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
9796 floating-point aggregate or a homogeneous short-vector aggregate. */
9798 static bool
9799 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
9800 const_tree type,
9801 machine_mode *base_mode,
9802 int *count,
9803 bool *is_ha)
9805 machine_mode new_mode = VOIDmode;
9806 bool composite_p = aarch64_composite_type_p (type, mode);
9808 if (is_ha != NULL) *is_ha = false;
9810 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
9811 || aarch64_short_vector_p (type, mode))
9813 *count = 1;
9814 new_mode = mode;
9816 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
9818 if (is_ha != NULL) *is_ha = true;
9819 *count = 2;
9820 new_mode = GET_MODE_INNER (mode);
9822 else if (type && composite_p)
9824 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
9826 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
9828 if (is_ha != NULL) *is_ha = true;
9829 *count = ag_count;
9831 else
9832 return false;
9834 else
9835 return false;
9837 *base_mode = new_mode;
9838 return true;
9841 /* Implement TARGET_STRUCT_VALUE_RTX. */
9843 static rtx
9844 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
9845 int incoming ATTRIBUTE_UNUSED)
9847 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
9850 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
9851 static bool
9852 aarch64_vector_mode_supported_p (machine_mode mode)
9854 if (TARGET_SIMD
9855 && (mode == V4SImode || mode == V8HImode
9856 || mode == V16QImode || mode == V2DImode
9857 || mode == V2SImode || mode == V4HImode
9858 || mode == V8QImode || mode == V2SFmode
9859 || mode == V4SFmode || mode == V2DFmode
9860 || mode == V4HFmode || mode == V8HFmode
9861 || mode == V1DFmode))
9862 return true;
9864 return false;
9867 /* Return appropriate SIMD container
9868 for MODE within a vector of WIDTH bits. */
9869 static machine_mode
9870 aarch64_simd_container_mode (machine_mode mode, unsigned width)
9872 gcc_assert (width == 64 || width == 128);
9873 if (TARGET_SIMD)
9875 if (width == 128)
9876 switch (mode)
9878 case DFmode:
9879 return V2DFmode;
9880 case SFmode:
9881 return V4SFmode;
9882 case SImode:
9883 return V4SImode;
9884 case HImode:
9885 return V8HImode;
9886 case QImode:
9887 return V16QImode;
9888 case DImode:
9889 return V2DImode;
9890 default:
9891 break;
9893 else
9894 switch (mode)
9896 case SFmode:
9897 return V2SFmode;
9898 case SImode:
9899 return V2SImode;
9900 case HImode:
9901 return V4HImode;
9902 case QImode:
9903 return V8QImode;
9904 default:
9905 break;
9908 return word_mode;
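/* Illustrative results of the mapping above when SIMD is enabled:

     aarch64_simd_container_mode (SImode, 128) == V4SImode
     aarch64_simd_container_mode (HImode,  64) == V4HImode

   Without SIMD both queries fall back to word_mode (DImode).  */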
9911 /* Return 128-bit container as the preferred SIMD mode for MODE. */
9912 static machine_mode
9913 aarch64_preferred_simd_mode (machine_mode mode)
9915 return aarch64_simd_container_mode (mode, 128);
9918 /* Return the bitmask of possible vector sizes for the vectorizer
9919 to iterate over. */
9920 static unsigned int
9921 aarch64_autovectorize_vector_sizes (void)
9923 return (16 | 8);
9926 /* Implement TARGET_MANGLE_TYPE. */
9928 static const char *
9929 aarch64_mangle_type (const_tree type)
9931 /* The AArch64 ABI documents say that "__va_list" has to be
9932 mangled as if it is in the "std" namespace. */
9933 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
9934 return "St9__va_list";
9936 /* Half-precision float. */
9937 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
9938 return "Dh";
9940 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
9941 builtin types. */
9942 if (TYPE_NAME (type) != NULL)
9943 return aarch64_mangle_builtin_type (type);
9945 /* Use the default mangling. */
9946 return NULL;
9950 /* Return true if the rtx_insn contains a MEM RTX somewhere
9951 in it. */
9953 static bool
9954 has_memory_op (rtx_insn *mem_insn)
9956 subrtx_iterator::array_type array;
9957 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
9958 if (MEM_P (*iter))
9959 return true;
9961 return false;
9964 /* Find the first rtx_insn before insn that will generate an assembly
9965 instruction. */
9967 static rtx_insn *
9968 aarch64_prev_real_insn (rtx_insn *insn)
9970 if (!insn)
9971 return NULL;
9975 insn = prev_real_insn (insn);
9977 while (insn && recog_memoized (insn) < 0);
9979 return insn;
9982 static bool
9983 is_madd_op (enum attr_type t1)
9985 unsigned int i;
9986 /* A number of these may be AArch32 only. */
9987 enum attr_type mlatypes[] = {
9988 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
9989 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
9990 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
9993 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
9995 if (t1 == mlatypes[i])
9996 return true;
9999 return false;
10002 /* Check if there is a register dependency between a load and the insn
10003 for which we hold recog_data. */
10005 static bool
10006 dep_between_memop_and_curr (rtx memop)
10008 rtx load_reg;
10009 int opno;
10011 gcc_assert (GET_CODE (memop) == SET);
10013 if (!REG_P (SET_DEST (memop)))
10014 return false;
10016 load_reg = SET_DEST (memop);
10017 for (opno = 1; opno < recog_data.n_operands; opno++)
10019 rtx operand = recog_data.operand[opno];
10020 if (REG_P (operand)
10021 && reg_overlap_mentioned_p (load_reg, operand))
10022 return true;
10025 return false;
10029 /* When working around the Cortex-A53 erratum 835769,
10030 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10031 instruction and has a preceding memory instruction such that a NOP
10032 should be inserted between them. */
10034 bool
10035 aarch64_madd_needs_nop (rtx_insn* insn)
10037 enum attr_type attr_type;
10038 rtx_insn *prev;
10039 rtx body;
10041 if (!TARGET_FIX_ERR_A53_835769)
10042 return false;
10044 if (recog_memoized (insn) < 0)
10045 return false;
10047 attr_type = get_attr_type (insn);
10048 if (!is_madd_op (attr_type))
10049 return false;
10051 prev = aarch64_prev_real_insn (insn);
10052 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10053 Restore recog state to INSN to avoid state corruption. */
10054 extract_constrain_insn_cached (insn);
10056 if (!prev || !has_memory_op (prev))
10057 return false;
10059 body = single_set (prev);
10061 /* If the previous insn is a memory op and there is no dependency between
10062 it and the DImode madd, emit a NOP between them. If body is NULL then we
10063 have a complex memory operation, probably a load/store pair.
10064 Be conservative for now and emit a NOP. */
10065 if (GET_MODE (recog_data.operand[0]) == DImode
10066 && (!body || !dep_between_memop_and_curr (body)))
10067 return true;
10069 return false;
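/* Illustrative sequence targeted by the workaround above when
   -mfix-cortex-a53-835769 is in effect:

     ldr   x1, [x2]            // 64-bit memory operation
     madd  x3, x4, x5, x6      // 64-bit multiply-accumulate with no register
                               // dependency on the preceding load

   aarch64_final_prescan_insn (below) emits a NOP between the two.  */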
10074 /* Implement FINAL_PRESCAN_INSN. */
10076 void
10077 aarch64_final_prescan_insn (rtx_insn *insn)
10079 if (aarch64_madd_needs_nop (insn))
10080 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10084 /* Return the equivalent letter for size. */
10085 static char
10086 sizetochar (int size)
10088 switch (size)
10090 case 64: return 'd';
10091 case 32: return 's';
10092 case 16: return 'h';
10093 case 8 : return 'b';
10094 default: gcc_unreachable ();
10098 /* Return true iff X is a uniform vector of floating-point
10099 constants, and the constant can be represented in
10100 quarter-precision form. Note that, as aarch64_float_const_representable_p
10101 rejects both +0.0 and -0.0, we will also reject them here. */
10102 static bool
10103 aarch64_vect_float_const_representable_p (rtx x)
10105 rtx elt;
10106 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10107 && const_vec_duplicate_p (x, &elt)
10108 && aarch64_float_const_representable_p (elt));
10111 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise. */
10112 bool
10113 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10114 struct simd_immediate_info *info)
10116 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10117 matches = 1; \
10118 for (i = 0; i < idx; i += (STRIDE)) \
10119 if (!(TEST)) \
10120 matches = 0; \
10121 if (matches) \
10123 immtype = (CLASS); \
10124 elsize = (ELSIZE); \
10125 eshift = (SHIFT); \
10126 emvn = (NEG); \
10127 break; \
10130 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10131 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10132 unsigned char bytes[16];
10133 int immtype = -1, matches;
10134 unsigned int invmask = inverse ? 0xff : 0;
10135 int eshift, emvn;
10137 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10139 if (! (aarch64_simd_imm_zero_p (op, mode)
10140 || aarch64_vect_float_const_representable_p (op)))
10141 return false;
10143 if (info)
10145 info->value = CONST_VECTOR_ELT (op, 0);
10146 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10147 info->mvn = false;
10148 info->shift = 0;
10151 return true;
10154 /* Splat vector constant out into a byte vector. */
10155 for (i = 0; i < n_elts; i++)
10157 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10158 it must be laid out in the vector register in reverse order. */
10159 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10160 unsigned HOST_WIDE_INT elpart;
10161 unsigned int part, parts;
10163 if (CONST_INT_P (el))
10165 elpart = INTVAL (el);
10166 parts = 1;
10168 else if (GET_CODE (el) == CONST_DOUBLE)
10170 elpart = CONST_DOUBLE_LOW (el);
10171 parts = 2;
10173 else
10174 gcc_unreachable ();
10176 for (part = 0; part < parts; part++)
10178 unsigned int byte;
10179 for (byte = 0; byte < innersize; byte++)
10181 bytes[idx++] = (elpart & 0xff) ^ invmask;
10182 elpart >>= BITS_PER_UNIT;
10184 if (GET_CODE (el) == CONST_DOUBLE)
10185 elpart = CONST_DOUBLE_HIGH (el);
10189 /* Sanity check. */
10190 gcc_assert (idx == GET_MODE_SIZE (mode));
10194 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10195 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10197 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10198 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10200 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10201 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10203 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10204 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10206 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10208 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10210 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10211 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10213 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10214 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10216 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10217 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10219 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10220 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10222 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10224 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10226 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10227 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10229 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10230 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10232 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10233 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10235 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10236 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10238 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10240 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10241 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10243 while (0);
10245 if (immtype == -1)
10246 return false;
10248 if (info)
10250 info->element_width = elsize;
10251 info->mvn = emvn != 0;
10252 info->shift = eshift;
10254 unsigned HOST_WIDE_INT imm = 0;
10256 if (immtype >= 12 && immtype <= 15)
10257 info->msl = true;
10259 /* Un-invert bytes of recognized vector, if necessary. */
10260 if (invmask != 0)
10261 for (i = 0; i < idx; i++)
10262 bytes[i] ^= invmask;
10264 if (immtype == 17)
10266 /* FIXME: Broken on 32-bit H_W_I hosts. */
10267 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10269 for (i = 0; i < 8; i++)
10270 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10271 << (i * BITS_PER_UNIT);
10274 info->value = GEN_INT (imm);
10276 else
10278 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10279 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10281 /* Construct 'abcdefgh' because the assembler cannot handle
10282 generic constants. */
10283 if (info->mvn)
10284 imm = ~imm;
10285 imm = (imm >> info->shift) & 0xff;
10286 info->value = GEN_INT (imm);
10290 return true;
10291 #undef CHECK
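/* Worked example for the recognizer above: the V4SI constant with every
   element equal to 0x2500 matches the "byte 1 set, other bytes zero" case,
   so INFO reports element_width = 32, shift = 8, mvn = false and
   value = 0x25, corresponding to

     movi  v0.4s, #0x25, lsl #8                                              */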
10294 /* Check if immediate shift constants are within range. */
10295 bool
10296 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10298 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10299 if (left)
10300 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10301 else
10302 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10305 /* Return true if X is a uniform vector where all elements
10306 are either the floating-point constant 0.0 or the
10307 integer constant 0. */
10308 bool
10309 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10311 return x == CONST0_RTX (mode);
10314 bool
10315 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10317 HOST_WIDE_INT imm = INTVAL (x);
10318 int i;
10320 for (i = 0; i < 8; i++)
10322 unsigned int byte = imm & 0xff;
10323 if (byte != 0xff && byte != 0)
10324 return false;
10325 imm >>= 8;
10328 return true;
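/* Illustrative values for the predicate above, which accepts 64-bit values
   in which every byte is either 0x00 or 0xff (the byte-mask form of the
   64-bit MOVI immediate):

     0x00000000000000ff  -> true
     0xff00ff00ff00ff00  -> true
     0x0000000000000123  -> false (byte 0 is 0x23)  */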
10331 bool
10332 aarch64_mov_operand_p (rtx x, machine_mode mode)
10334 if (GET_CODE (x) == HIGH
10335 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10336 return true;
10338 if (CONST_INT_P (x))
10339 return true;
10341 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10342 return true;
10344 return aarch64_classify_symbolic_expression (x)
10345 == SYMBOL_TINY_ABSOLUTE;
10348 /* Return a const_int vector of VAL. */
10350 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10352 int nunits = GET_MODE_NUNITS (mode);
10353 rtvec v = rtvec_alloc (nunits);
10354 int i;
10356 for (i=0; i < nunits; i++)
10357 RTVEC_ELT (v, i) = GEN_INT (val);
10359 return gen_rtx_CONST_VECTOR (mode, v);
10362 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10364 bool
10365 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10367 machine_mode vmode;
10369 gcc_assert (!VECTOR_MODE_P (mode));
10370 vmode = aarch64_preferred_simd_mode (mode);
10371 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10372 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10375 /* Construct and return a PARALLEL RTX vector with elements numbering the
10376 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10377 the vector - from the perspective of the architecture. This does not
10378 line up with GCC's perspective on lane numbers, so we end up with
10379 different masks depending on our target endian-ness. The diagram
10380 below may help. We must draw the distinction when building masks
10381 which select one half of the vector. An instruction selecting
10382 architectural low-lanes for a big-endian target must be described using
10383 a mask selecting GCC high-lanes.
10385 Big-Endian Little-Endian
10387 GCC 0 1 2 3 3 2 1 0
10388 | x | x | x | x | | x | x | x | x |
10389 Architecture 3 2 1 0 3 2 1 0
10391 Low Mask: { 2, 3 } { 0, 1 }
10392 High Mask: { 0, 1 } { 2, 3 }
10395 rtx
10396 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10398 int nunits = GET_MODE_NUNITS (mode);
10399 rtvec v = rtvec_alloc (nunits / 2);
10400 int high_base = nunits / 2;
10401 int low_base = 0;
10402 int base;
10403 rtx t1;
10404 int i;
10406 if (BYTES_BIG_ENDIAN)
10407 base = high ? low_base : high_base;
10408 else
10409 base = high ? high_base : low_base;
10411 for (i = 0; i < nunits / 2; i++)
10412 RTVEC_ELT (v, i) = GEN_INT (base + i);
10414 t1 = gen_rtx_PARALLEL (mode, v);
10415 return t1;
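/* Illustrative example: aarch64_simd_vect_par_cnst_half (V4SImode, true)
   yields (parallel [(const_int 2) (const_int 3)]) for little-endian but
   (parallel [(const_int 0) (const_int 1)]) for big-endian, matching the
   "High Mask" row of the diagram above.  */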
10418 /* Check OP for validity as a PARALLEL RTX vector with elements
10419 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10420 from the perspective of the architecture. See the diagram above
10421 aarch64_simd_vect_par_cnst_half for more details. */
10423 bool
10424 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10425 bool high)
10427 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10428 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10429 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10430 int i = 0;
10432 if (!VECTOR_MODE_P (mode))
10433 return false;
10435 if (count_op != count_ideal)
10436 return false;
10438 for (i = 0; i < count_ideal; i++)
10440 rtx elt_op = XVECEXP (op, 0, i);
10441 rtx elt_ideal = XVECEXP (ideal, 0, i);
10443 if (!CONST_INT_P (elt_op)
10444 || INTVAL (elt_ideal) != INTVAL (elt_op))
10445 return false;
10447 return true;
10450 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10451 HIGH (exclusive). */
10452 void
10453 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10454 const_tree exp)
10456 HOST_WIDE_INT lane;
10457 gcc_assert (CONST_INT_P (operand));
10458 lane = INTVAL (operand);
10460 if (lane < low || lane >= high)
10462 if (exp)
10463 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10464 else
10465 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10469 /* Return TRUE if OP is a valid vector addressing mode. */
10470 bool
10471 aarch64_simd_mem_operand_p (rtx op)
10473 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10474 || REG_P (XEXP (op, 0)));
10477 /* Emit a register copy from operand to operand, taking care not to
10478 early-clobber source registers in the process.
10480 COUNT is the number of components into which the copy needs to be
10481 decomposed. */
10482 void
10483 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10484 unsigned int count)
10486 unsigned int i;
10487 int rdest = REGNO (operands[0]);
10488 int rsrc = REGNO (operands[1]);
10490 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10491 || rdest < rsrc)
10492 for (i = 0; i < count; i++)
10493 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10494 gen_rtx_REG (mode, rsrc + i));
10495 else
10496 for (i = 0; i < count; i++)
10497 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10498 gen_rtx_REG (mode, rsrc + count - i - 1));
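/* Example of why the copy order above matters: moving a two-register value
   from {v1, v2} into {v2, v3} must be done high-to-low (v3 <- v2, then
   v2 <- v1), because copying low-to-high would clobber v2 before it is
   read.  When the ranges do not overlap, or the destination starts below
   the source, the simple low-to-high order is safe.  The register names
   are only illustrative.  */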
10501 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10502 one of the VSTRUCT modes: OI, CI or XI. */
10503 int
10504 aarch64_simd_attr_length_move (rtx_insn *insn)
10506 machine_mode mode;
10508 extract_insn_cached (insn);
10510 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10512 mode = GET_MODE (recog_data.operand[0]);
10513 switch (mode)
10515 case OImode:
10516 return 8;
10517 case CImode:
10518 return 12;
10519 case XImode:
10520 return 16;
10521 default:
10522 gcc_unreachable ();
10525 return 4;
10528 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10529 one of the VSTRUCT modes: OI, CI, or XI. */
10530 int
10531 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10533 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
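/* For instance, a register-to-register OImode move is split into two
   128-bit register moves, so aarch64_simd_attr_length_move returns 8 bytes
   (CImode: 12, XImode: 16), while a move involving memory is a single
   LD1/ST1-style instruction of 4 bytes.  Likewise
   aarch64_simd_attr_length_rglist gives 4 bytes per vector register in the
   mode.  This is just a restatement of the arithmetic above.  */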
10536 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10537 alignment of a vector to 128 bits. */
10538 static HOST_WIDE_INT
10539 aarch64_simd_vector_alignment (const_tree type)
10541 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10542 return MIN (align, 128);
10545 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10546 static bool
10547 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10549 if (is_packed)
10550 return false;
10552 /* We guarantee alignment for vectors up to 128-bits. */
10553 if (tree_int_cst_compare (TYPE_SIZE (type),
10554 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10555 return false;
10557 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10558 return true;
10561 /* If VALS is a vector constant that can be loaded into a register
10562 using DUP, generate instructions to do so and return an RTX to
10563 assign to the register. Otherwise return NULL_RTX. */
10564 static rtx
10565 aarch64_simd_dup_constant (rtx vals)
10567 machine_mode mode = GET_MODE (vals);
10568 machine_mode inner_mode = GET_MODE_INNER (mode);
10569 rtx x;
10571 if (!const_vec_duplicate_p (vals, &x))
10572 return NULL_RTX;
10574 /* We can load this constant by using DUP and a constant in a
10575 single ARM register. This will be cheaper than a vector
10576 load. */
10577 x = copy_to_mode_reg (inner_mode, x);
10578 return gen_rtx_VEC_DUPLICATE (mode, x);
10582 /* Generate code to load VALS, which is a PARALLEL containing only
10583 constants (for vec_init) or CONST_VECTOR, efficiently into a
10584 register. Returns an RTX to copy into the register, or NULL_RTX
10585 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
10586 static rtx
10587 aarch64_simd_make_constant (rtx vals)
10589 machine_mode mode = GET_MODE (vals);
10590 rtx const_dup;
10591 rtx const_vec = NULL_RTX;
10592 int n_elts = GET_MODE_NUNITS (mode);
10593 int n_const = 0;
10594 int i;
10596 if (GET_CODE (vals) == CONST_VECTOR)
10597 const_vec = vals;
10598 else if (GET_CODE (vals) == PARALLEL)
10600 /* A CONST_VECTOR must contain only CONST_INTs and
10601 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
10602 Only store valid constants in a CONST_VECTOR. */
10603 for (i = 0; i < n_elts; ++i)
10605 rtx x = XVECEXP (vals, 0, i);
10606 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10607 n_const++;
10609 if (n_const == n_elts)
10610 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
10612 else
10613 gcc_unreachable ();
10615 if (const_vec != NULL_RTX
10616 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
10617 /* Load using MOVI/MVNI. */
10618 return const_vec;
10619 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
10620 /* Loaded using DUP. */
10621 return const_dup;
10622 else if (const_vec != NULL_RTX)
10623 /* Load from constant pool. We cannot take advantage of single-cycle
10624 LD1 because we need a PC-relative addressing mode. */
10625 return const_vec;
10626 else
10627 /* A PARALLEL containing something not valid inside CONST_VECTOR.
10628 We cannot construct an initializer. */
10629 return NULL_RTX;
10632 void
10633 aarch64_expand_vector_init (rtx target, rtx vals)
10635 machine_mode mode = GET_MODE (target);
10636 machine_mode inner_mode = GET_MODE_INNER (mode);
10637 int n_elts = GET_MODE_NUNITS (mode);
10638 int n_var = 0;
10639 rtx any_const = NULL_RTX;
10640 bool all_same = true;
10642 for (int i = 0; i < n_elts; ++i)
10644 rtx x = XVECEXP (vals, 0, i);
10645 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
10646 ++n_var;
10647 else
10648 any_const = x;
10650 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
10651 all_same = false;
10654 if (n_var == 0)
10656 rtx constant = aarch64_simd_make_constant (vals);
10657 if (constant != NULL_RTX)
10659 emit_move_insn (target, constant);
10660 return;
10664 /* Splat a single non-constant element if we can. */
10665 if (all_same)
10667 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
10668 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
10669 return;
10672 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
10673 varying fields. Hope that this is more efficient than using the stack. */
10674 if (n_var <= n_elts/2)
10676 rtx copy = copy_rtx (vals);
10678 /* Load constant part of vector. We really don't care what goes into the
10679 parts we will overwrite, but we're more likely to be able to load the
10680 constant efficiently if it has fewer, larger, repeating parts
10681 (see aarch64_simd_valid_immediate). */
10682 for (int i = 0; i < n_elts; i++)
10684 rtx x = XVECEXP (vals, 0, i);
10685 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10686 continue;
10687 rtx subst = any_const;
10688 for (int bit = n_elts / 2; bit > 0; bit /= 2)
10690 /* Look in the copied vector, as more elements are const. */
10691 rtx test = XVECEXP (copy, 0, i ^ bit);
10692 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
10694 subst = test;
10695 break;
10698 XVECEXP (copy, 0, i) = subst;
10700 aarch64_expand_vector_init (target, copy);
10702 /* Insert variables. */
10703 enum insn_code icode = optab_handler (vec_set_optab, mode);
10704 gcc_assert (icode != CODE_FOR_nothing);
10706 for (int i = 0; i < n_elts; i++)
10708 rtx x = XVECEXP (vals, 0, i);
10709 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10710 continue;
10711 x = copy_to_mode_reg (inner_mode, x);
10712 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
10714 return;
10717 /* Construct the vector in memory one field at a time
10718 and load the whole vector. */
10719 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
10720 for (int i = 0; i < n_elts; i++)
10721 emit_move_insn (adjust_address_nv (mem, inner_mode,
10722 i * GET_MODE_SIZE (inner_mode)),
10723 XVECEXP (vals, 0, i));
10724 emit_move_insn (target, mem);
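/* Illustrative trace of the strategy above, assuming V4SImode and an
   initializer {1, 2, x, 4} where x is the only variable element:
   n_var == 1 <= n_elts / 2, so the code first materialises the constant
   vector {1, 2, 1, 4} (the variable slot is filled from a neighbouring
   constant found via the i ^ bit search), then overwrites lane 2 with x
   through the vec_set pattern.  The values and lane numbers here are only
   an example.  */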
10728 static unsigned HOST_WIDE_INT
10729 aarch64_shift_truncation_mask (machine_mode mode)
10731 return
10732 (aarch64_vector_mode_supported_p (mode)
10733 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
10736 /* Select a format to encode pointers in exception handling data. */
10737 int
10738 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
10740 int type;
10741 switch (aarch64_cmodel)
10743 case AARCH64_CMODEL_TINY:
10744 case AARCH64_CMODEL_TINY_PIC:
10745 case AARCH64_CMODEL_SMALL:
10746 case AARCH64_CMODEL_SMALL_PIC:
10747 case AARCH64_CMODEL_SMALL_SPIC:
10748 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
10749 for everything. */
10750 type = DW_EH_PE_sdata4;
10751 break;
10752 default:
10753 /* No assumptions here. 8-byte relocs required. */
10754 type = DW_EH_PE_sdata8;
10755 break;
10757 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
10760 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
10761 by the function fndecl. */
10763 void
10764 aarch64_declare_function_name (FILE *stream, const char* name,
10765 tree fndecl)
10767 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10769 struct cl_target_option *targ_options;
10770 if (target_parts)
10771 targ_options = TREE_TARGET_OPTION (target_parts);
10772 else
10773 targ_options = TREE_TARGET_OPTION (target_option_current_node);
10774 gcc_assert (targ_options);
10776 const struct processor *this_arch
10777 = aarch64_get_arch (targ_options->x_explicit_arch);
10779 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
10780 std::string extension
10781 = aarch64_get_extension_string_for_isa_flags (isa_flags);
10782 asm_fprintf (asm_out_file, "\t.arch %s%s\n",
10783 this_arch->name, extension.c_str ());
10785 /* Print the cpu name we're tuning for in the comments; this might be
10786 useful to readers of the generated asm. */
10788 const struct processor *this_tune
10789 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
10791 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
10792 this_tune->name);
10794 /* Don't forget the type directive for ELF. */
10795 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
10796 ASM_OUTPUT_LABEL (stream, name);
10799 /* Emit load exclusive. */
10801 static void
10802 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
10803 rtx mem, rtx model_rtx)
10805 rtx (*gen) (rtx, rtx, rtx);
10807 switch (mode)
10809 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
10810 case HImode: gen = gen_aarch64_load_exclusivehi; break;
10811 case SImode: gen = gen_aarch64_load_exclusivesi; break;
10812 case DImode: gen = gen_aarch64_load_exclusivedi; break;
10813 default:
10814 gcc_unreachable ();
10817 emit_insn (gen (rval, mem, model_rtx));
10820 /* Emit store exclusive. */
10822 static void
10823 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
10824 rtx rval, rtx mem, rtx model_rtx)
10826 rtx (*gen) (rtx, rtx, rtx, rtx);
10828 switch (mode)
10830 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
10831 case HImode: gen = gen_aarch64_store_exclusivehi; break;
10832 case SImode: gen = gen_aarch64_store_exclusivesi; break;
10833 case DImode: gen = gen_aarch64_store_exclusivedi; break;
10834 default:
10835 gcc_unreachable ();
10838 emit_insn (gen (bval, rval, mem, model_rtx));
10841 /* Emit the jump instruction INSN and mark it as unlikely to be taken. */
10843 static void
10844 aarch64_emit_unlikely_jump (rtx insn)
10846 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
10848 insn = emit_jump_insn (insn);
10849 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
10852 /* Expand a compare and swap pattern. */
10854 void
10855 aarch64_expand_compare_and_swap (rtx operands[])
10857 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
10858 machine_mode mode, cmp_mode;
10859 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
10860 int idx;
10861 gen_cas_fn gen;
10862 const gen_cas_fn split_cas[] =
10864 gen_aarch64_compare_and_swapqi,
10865 gen_aarch64_compare_and_swaphi,
10866 gen_aarch64_compare_and_swapsi,
10867 gen_aarch64_compare_and_swapdi
10869 const gen_cas_fn atomic_cas[] =
10871 gen_aarch64_compare_and_swapqi_lse,
10872 gen_aarch64_compare_and_swaphi_lse,
10873 gen_aarch64_compare_and_swapsi_lse,
10874 gen_aarch64_compare_and_swapdi_lse
10877 bval = operands[0];
10878 rval = operands[1];
10879 mem = operands[2];
10880 oldval = operands[3];
10881 newval = operands[4];
10882 is_weak = operands[5];
10883 mod_s = operands[6];
10884 mod_f = operands[7];
10885 mode = GET_MODE (mem);
10886 cmp_mode = mode;
10888 /* Normally the succ memory model must be stronger than fail, but in the
10889 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
10890 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
10892 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
10893 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
10894 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
10896 switch (mode)
10898 case QImode:
10899 case HImode:
10900 /* For short modes, we're going to perform the comparison in SImode,
10901 so do the zero-extension now. */
10902 cmp_mode = SImode;
10903 rval = gen_reg_rtx (SImode);
10904 oldval = convert_modes (SImode, mode, oldval, true);
10905 /* Fall through. */
10907 case SImode:
10908 case DImode:
10909 /* Force the value into a register if needed. */
10910 if (!aarch64_plus_operand (oldval, mode))
10911 oldval = force_reg (cmp_mode, oldval);
10912 break;
10914 default:
10915 gcc_unreachable ();
10918 switch (mode)
10920 case QImode: idx = 0; break;
10921 case HImode: idx = 1; break;
10922 case SImode: idx = 2; break;
10923 case DImode: idx = 3; break;
10924 default:
10925 gcc_unreachable ();
10927 if (TARGET_LSE)
10928 gen = atomic_cas[idx];
10929 else
10930 gen = split_cas[idx];
10932 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
10934 if (mode == QImode || mode == HImode)
10935 emit_move_insn (operands[1], gen_lowpart (mode, rval));
10937 x = gen_rtx_REG (CCmode, CC_REGNUM);
10938 x = gen_rtx_EQ (SImode, x, const0_rtx);
10939 emit_insn (gen_rtx_SET (bval, x));
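/* As an example, a strong DImode compare-and-swap with SEQ_CST ordering
   uses gen_aarch64_compare_and_swapdi_lse when TARGET_LSE is set, which is
   intended to end up as a single compare-and-swap instruction (compare
   aarch64_gen_atomic_cas below), and gen_aarch64_compare_and_swapdi
   otherwise, which is later split into a load-exclusive/store-exclusive
   loop by aarch64_split_compare_and_swap below.  */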
10942 /* Test whether the target supports using an atomic load-operate instruction
10943 for operation CODE. Returns FALSE if the operation isn't supported by the
10944 architecture. */
10948 bool
10949 aarch64_atomic_ldop_supported_p (enum rtx_code code)
10951 if (!TARGET_LSE)
10952 return false;
10954 switch (code)
10956 case SET:
10957 case AND:
10958 case IOR:
10959 case XOR:
10960 case MINUS:
10961 case PLUS:
10962 return true;
10963 default:
10964 return false;
10968 /* Emit a barrier appropriate for memory model MODEL at the end of a
10969 sequence implementing an atomic operation. */
10971 static void
10972 aarch64_emit_post_barrier (enum memmodel model)
10974 const enum memmodel base_model = memmodel_base (model);
10976 if (is_mm_sync (model)
10977 && (base_model == MEMMODEL_ACQUIRE
10978 || base_model == MEMMODEL_ACQ_REL
10979 || base_model == MEMMODEL_SEQ_CST))
10981 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
10985 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
10986 for the data in memory. EXPECTED is the value expected to be in memory.
10987 DESIRED is the value to store to memory. MEM is the memory location. MODEL
10988 is the memory ordering to use. */
10990 void
10991 aarch64_gen_atomic_cas (rtx rval, rtx mem,
10992 rtx expected, rtx desired,
10993 rtx model)
10995 rtx (*gen) (rtx, rtx, rtx, rtx);
10996 machine_mode mode;
10998 mode = GET_MODE (mem);
11000 switch (mode)
11002 case QImode: gen = gen_aarch64_atomic_casqi; break;
11003 case HImode: gen = gen_aarch64_atomic_cashi; break;
11004 case SImode: gen = gen_aarch64_atomic_cassi; break;
11005 case DImode: gen = gen_aarch64_atomic_casdi; break;
11006 default:
11007 gcc_unreachable ();
11010 /* Move the expected value into the CAS destination register. */
11011 emit_insn (gen_rtx_SET (rval, expected));
11013 /* Emit the CAS. */
11014 emit_insn (gen (rval, mem, desired, model));
11016 /* Compare the expected value with the value loaded by the CAS, to establish
11017 whether the swap was made. */
11018 aarch64_gen_compare_reg (EQ, rval, expected);
11021 /* Split a compare and swap pattern. */
11023 void
11024 aarch64_split_compare_and_swap (rtx operands[])
11026 rtx rval, mem, oldval, newval, scratch;
11027 machine_mode mode;
11028 bool is_weak;
11029 rtx_code_label *label1, *label2;
11030 rtx x, cond;
11031 enum memmodel model;
11032 rtx model_rtx;
11034 rval = operands[0];
11035 mem = operands[1];
11036 oldval = operands[2];
11037 newval = operands[3];
11038 is_weak = (operands[4] != const0_rtx);
11039 model_rtx = operands[5];
11040 scratch = operands[7];
11041 mode = GET_MODE (mem);
11042 model = memmodel_from_int (INTVAL (model_rtx));
11044 label1 = NULL;
11045 if (!is_weak)
11047 label1 = gen_label_rtx ();
11048 emit_label (label1);
11050 label2 = gen_label_rtx ();
11052 /* The initial load can be relaxed for a __sync operation since a final
11053 barrier will be emitted to stop code hoisting. */
11054 if (is_mm_sync (model))
11055 aarch64_emit_load_exclusive (mode, rval, mem,
11056 GEN_INT (MEMMODEL_RELAXED));
11057 else
11058 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11060 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11061 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11062 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11063 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11064 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11066 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11068 if (!is_weak)
11070 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11071 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11072 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11075 else
11077 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11078 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11079 emit_insn (gen_rtx_SET (cond, x));
11082 emit_label (label2);
11084 /* Emit any final barrier needed for a __sync operation. */
11085 if (is_mm_sync (model))
11086 aarch64_emit_post_barrier (model);
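/* The split sequence above corresponds roughly to the following assembly
   for a strong DImode compare-and-swap with SEQ_CST ordering (register
   names are only illustrative):

       .Lretry:
           ldaxr   x0, [x1]        // load rval exclusively
           cmp     x0, x2          // compare with oldval
           b.ne    .Ldone
           stlxr   w3, x4, [x1]    // try to store newval; w3 = scratch
           cbnz    w3, .Lretry     // strong CAS retries on spurious failure
       .Ldone:

   A weak CAS instead just copies the store-exclusive status into the
   condition flags, and a __sync variant is followed by a final barrier.  */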
11089 /* Emit a BIC instruction. */
11091 static void
11092 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11094 rtx shift_rtx = GEN_INT (shift);
11095 rtx (*gen) (rtx, rtx, rtx, rtx);
11097 switch (mode)
11099 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11100 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11101 default:
11102 gcc_unreachable ();
11105 emit_insn (gen (dst, s2, shift_rtx, s1));
11108 /* Emit an atomic swap. */
11110 static void
11111 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11112 rtx mem, rtx model)
11114 rtx (*gen) (rtx, rtx, rtx, rtx);
11116 switch (mode)
11118 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11119 case HImode: gen = gen_aarch64_atomic_swphi; break;
11120 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11121 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11122 default:
11123 gcc_unreachable ();
11126 emit_insn (gen (dst, mem, value, model));
11129 /* Operations supported by aarch64_emit_atomic_load_op. */
11131 enum aarch64_atomic_load_op_code
11133 AARCH64_LDOP_PLUS, /* A + B */
11134 AARCH64_LDOP_XOR, /* A ^ B */
11135 AARCH64_LDOP_OR, /* A | B */
11136 AARCH64_LDOP_BIC /* A & ~B */
11139 /* Emit an atomic load-operate. */
11141 static void
11142 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11143 machine_mode mode, rtx dst, rtx src,
11144 rtx mem, rtx model)
11146 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11147 const aarch64_atomic_load_op_fn plus[] =
11149 gen_aarch64_atomic_loadaddqi,
11150 gen_aarch64_atomic_loadaddhi,
11151 gen_aarch64_atomic_loadaddsi,
11152 gen_aarch64_atomic_loadadddi
11154 const aarch64_atomic_load_op_fn eor[] =
11156 gen_aarch64_atomic_loadeorqi,
11157 gen_aarch64_atomic_loadeorhi,
11158 gen_aarch64_atomic_loadeorsi,
11159 gen_aarch64_atomic_loadeordi
11161 const aarch64_atomic_load_op_fn ior[] =
11163 gen_aarch64_atomic_loadsetqi,
11164 gen_aarch64_atomic_loadsethi,
11165 gen_aarch64_atomic_loadsetsi,
11166 gen_aarch64_atomic_loadsetdi
11168 const aarch64_atomic_load_op_fn bic[] =
11170 gen_aarch64_atomic_loadclrqi,
11171 gen_aarch64_atomic_loadclrhi,
11172 gen_aarch64_atomic_loadclrsi,
11173 gen_aarch64_atomic_loadclrdi
11175 aarch64_atomic_load_op_fn gen;
11176 int idx = 0;
11178 switch (mode)
11180 case QImode: idx = 0; break;
11181 case HImode: idx = 1; break;
11182 case SImode: idx = 2; break;
11183 case DImode: idx = 3; break;
11184 default:
11185 gcc_unreachable ();
11188 switch (code)
11190 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11191 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11192 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11193 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11194 default:
11195 gcc_unreachable ();
11198 emit_insn (gen (dst, mem, src, model));
11201 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11202 location to store the data read from memory. OUT_RESULT is the location to
11203 store the result of the operation. MEM is the memory location to read and
11204 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11205 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11206 be NULL. */
11208 void
11209 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11210 rtx mem, rtx value, rtx model_rtx)
11212 machine_mode mode = GET_MODE (mem);
11213 machine_mode wmode = (mode == DImode ? DImode : SImode);
11214 const bool short_mode = (mode < SImode);
11215 aarch64_atomic_load_op_code ldop_code;
11216 rtx src;
11217 rtx x;
11219 if (out_data)
11220 out_data = gen_lowpart (mode, out_data);
11222 if (out_result)
11223 out_result = gen_lowpart (mode, out_result);
11225 /* Make sure the value is in a register, putting it into a destination
11226 register if it needs to be manipulated. */
11227 if (!register_operand (value, mode)
11228 || code == AND || code == MINUS)
11230 src = out_result ? out_result : out_data;
11231 emit_move_insn (src, gen_lowpart (mode, value));
11233 else
11234 src = value;
11235 gcc_assert (register_operand (src, mode));
11237 /* Preprocess the data for the operation as necessary. If the operation is
11238 a SET then emit a swap instruction and finish. */
11239 switch (code)
11241 case SET:
11242 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11243 return;
11245 case MINUS:
11246 /* Negate the value and treat it as a PLUS. */
11248 rtx neg_src;
11250 /* Resize the value if necessary. */
11251 if (short_mode)
11252 src = gen_lowpart (wmode, src);
11254 neg_src = gen_rtx_NEG (wmode, src);
11255 emit_insn (gen_rtx_SET (src, neg_src));
11257 if (short_mode)
11258 src = gen_lowpart (mode, src);
11260 /* Fall-through. */
11261 case PLUS:
11262 ldop_code = AARCH64_LDOP_PLUS;
11263 break;
11265 case IOR:
11266 ldop_code = AARCH64_LDOP_OR;
11267 break;
11269 case XOR:
11270 ldop_code = AARCH64_LDOP_XOR;
11271 break;
11273 case AND:
11275 rtx not_src;
11277 /* Resize the value if necessary. */
11278 if (short_mode)
11279 src = gen_lowpart (wmode, src);
11281 not_src = gen_rtx_NOT (wmode, src);
11282 emit_insn (gen_rtx_SET (src, not_src));
11284 if (short_mode)
11285 src = gen_lowpart (mode, src);
11287 ldop_code = AARCH64_LDOP_BIC;
11288 break;
11290 default:
11291 /* The operation can't be done with atomic instructions. */
11292 gcc_unreachable ();
11295 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11297 /* If necessary, calculate the data in memory after the update by redoing the
11298 operation from values in registers. */
11299 if (!out_result)
11300 return;
11302 if (short_mode)
11304 src = gen_lowpart (wmode, src);
11305 out_data = gen_lowpart (wmode, out_data);
11306 out_result = gen_lowpart (wmode, out_result);
11309 x = NULL_RTX;
11311 switch (code)
11313 case MINUS:
11314 case PLUS:
11315 x = gen_rtx_PLUS (wmode, out_data, src);
11316 break;
11317 case IOR:
11318 x = gen_rtx_IOR (wmode, out_data, src);
11319 break;
11320 case XOR:
11321 x = gen_rtx_XOR (wmode, out_data, src);
11322 break;
11323 case AND:
11324 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11325 return;
11326 default:
11327 gcc_unreachable ();
11330 emit_set_insn (out_result, x);
11332 return;
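/* Two examples of the preprocessing above, assuming TARGET_LSE:
   - __atomic_fetch_sub: MINUS is handled by negating VALUE into the
     destination register and then issuing the LDADD-based pattern, since
     there is no load-and-subtract instruction.
   - __atomic_fetch_and: AND is handled by inverting VALUE and issuing the
     LDCLR-based pattern (A & ~B), so memory ends up ANDed with the
     original VALUE; the "data after" result is then rebuilt with BIC.
   The builtin names are just the usual C front-end spellings.  */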
11335 /* Split an atomic operation. */
11337 void
11338 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11339 rtx value, rtx model_rtx, rtx cond)
11341 machine_mode mode = GET_MODE (mem);
11342 machine_mode wmode = (mode == DImode ? DImode : SImode);
11343 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11344 const bool is_sync = is_mm_sync (model);
11345 rtx_code_label *label;
11346 rtx x;
11348 /* Split the atomic operation into a sequence. */
11349 label = gen_label_rtx ();
11350 emit_label (label);
11352 if (new_out)
11353 new_out = gen_lowpart (wmode, new_out);
11354 if (old_out)
11355 old_out = gen_lowpart (wmode, old_out);
11356 else
11357 old_out = new_out;
11358 value = simplify_gen_subreg (wmode, value, mode, 0);
11360 /* The initial load can be relaxed for a __sync operation since a final
11361 barrier will be emitted to stop code hoisting. */
11362 if (is_sync)
11363 aarch64_emit_load_exclusive (mode, old_out, mem,
11364 GEN_INT (MEMMODEL_RELAXED));
11365 else
11366 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11368 switch (code)
11370 case SET:
11371 new_out = value;
11372 break;
11374 case NOT:
11375 x = gen_rtx_AND (wmode, old_out, value);
11376 emit_insn (gen_rtx_SET (new_out, x));
11377 x = gen_rtx_NOT (wmode, new_out);
11378 emit_insn (gen_rtx_SET (new_out, x));
11379 break;
11381 case MINUS:
11382 if (CONST_INT_P (value))
11384 value = GEN_INT (-INTVAL (value));
11385 code = PLUS;
11387 /* Fall through. */
11389 default:
11390 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11391 emit_insn (gen_rtx_SET (new_out, x));
11392 break;
11395 aarch64_emit_store_exclusive (mode, cond, mem,
11396 gen_lowpart (mode, new_out), model_rtx);
11398 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11399 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11400 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11401 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11403 /* Emit any final barrier needed for a __sync operation. */
11404 if (is_sync)
11405 aarch64_emit_post_barrier (model);
11408 static void
11409 aarch64_init_libfuncs (void)
11411 /* Half-precision float operations. The compiler handles all operations
11412 with NULL libfuncs by converting to SFmode. */
11414 /* Conversions. */
11415 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11416 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11418 /* Arithmetic. */
11419 set_optab_libfunc (add_optab, HFmode, NULL);
11420 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11421 set_optab_libfunc (smul_optab, HFmode, NULL);
11422 set_optab_libfunc (neg_optab, HFmode, NULL);
11423 set_optab_libfunc (sub_optab, HFmode, NULL);
11425 /* Comparisons. */
11426 set_optab_libfunc (eq_optab, HFmode, NULL);
11427 set_optab_libfunc (ne_optab, HFmode, NULL);
11428 set_optab_libfunc (lt_optab, HFmode, NULL);
11429 set_optab_libfunc (le_optab, HFmode, NULL);
11430 set_optab_libfunc (ge_optab, HFmode, NULL);
11431 set_optab_libfunc (gt_optab, HFmode, NULL);
11432 set_optab_libfunc (unord_optab, HFmode, NULL);
11435 /* Target hook for c_mode_for_suffix. */
11436 static machine_mode
11437 aarch64_c_mode_for_suffix (char suffix)
11439 if (suffix == 'q')
11440 return TFmode;
11442 return VOIDmode;
11445 /* We can only represent floating point constants which will fit in
11446 "quarter-precision" values. These values are characterised by
11447 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
11448 by the formula:
11450 (-1)^s * (n/16) * 2^r
11452 Where:
11453 's' is the sign bit.
11454 'n' is an integer in the range 16 <= n <= 31.
11455 'r' is an integer in the range -3 <= r <= 4. */
11457 /* Return true iff X can be represented as a quarter-precision
11458 floating point immediate operand. Note, we cannot represent 0.0. */
11459 bool
11460 aarch64_float_const_representable_p (rtx x)
11462 /* This represents our current view of how many bits
11463 make up the mantissa. */
11464 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11465 int exponent;
11466 unsigned HOST_WIDE_INT mantissa, mask;
11467 REAL_VALUE_TYPE r, m;
11468 bool fail;
11470 if (!CONST_DOUBLE_P (x))
11471 return false;
11473 /* We don't support HFmode constants yet. */
11474 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11475 return false;
11477 r = *CONST_DOUBLE_REAL_VALUE (x);
11479 /* We cannot represent infinities, NaNs or +/-zero. We won't
11480 know if we have +zero until we analyse the mantissa, but we
11481 can reject the other invalid values. */
11482 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11483 || REAL_VALUE_MINUS_ZERO (r))
11484 return false;
11486 /* Extract exponent. */
11487 r = real_value_abs (&r);
11488 exponent = REAL_EXP (&r);
11490 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11491 highest (sign) bit, with a fixed binary point at bit point_pos.
11492 m1 holds the low part of the mantissa, m2 the high part.
11493 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11494 bits for the mantissa, this can fail (low bits will be lost). */
11495 real_ldexp (&m, &r, point_pos - exponent);
11496 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11498 /* If the low part of the mantissa has bits set we cannot represent
11499 the value. */
11500 if (w.elt (0) != 0)
11501 return false;
11502 /* We have rejected the lower HOST_WIDE_INT, so update our
11503 understanding of how many bits lie in the mantissa and
11504 look only at the high HOST_WIDE_INT. */
11505 mantissa = w.elt (1);
11506 point_pos -= HOST_BITS_PER_WIDE_INT;
11508 /* We can only represent values with a mantissa of the form 1.xxxx. */
11509 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11510 if ((mantissa & mask) != 0)
11511 return false;
11513 /* Having filtered unrepresentable values, we may now remove all
11514 but the highest 5 bits. */
11515 mantissa >>= point_pos - 5;
11517 /* We cannot represent the value 0.0, so reject it. This is handled
11518 elsewhere. */
11519 if (mantissa == 0)
11520 return false;
11522 /* Then, as bit 4 is always set, we can mask it off, leaving
11523 the mantissa in the range [0, 15]. */
11524 mantissa &= ~(1 << 4);
11525 gcc_assert (mantissa <= 15);
11527 /* GCC internally does not use IEEE754-like encoding (where normalized
11528 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
11529 Our mantissa values are shifted 4 places to the left relative to
11530 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11531 by 5 places to correct for GCC's representation. */
11532 exponent = 5 - exponent;
11534 return (exponent >= 0 && exponent <= 7);
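/* Worked examples of the encoding checked above:
     0.25 = (-1)^0 * (16/16) * 2^-2   -- representable (n = 16, r = -2).
     2.75 = (-1)^0 * (22/16) * 2^1    -- representable (n = 22, r = 1).
     0.1 and 3.1 are not representable: no n in [16, 31] and r in [-3, 4]
     reproduce them exactly.
   The representable magnitudes therefore range from 0.125 (16/16 * 2^-3)
   up to 31.0 (31/16 * 2^4).  */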
11537 char*
11538 aarch64_output_simd_mov_immediate (rtx const_vector,
11539 machine_mode mode,
11540 unsigned width)
11542 bool is_valid;
11543 static char templ[40];
11544 const char *mnemonic;
11545 const char *shift_op;
11546 unsigned int lane_count = 0;
11547 char element_char;
11549 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
11551 /* This will return true to show that CONST_VECTOR is legal for use as
11552 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
11553 also update INFO to show how the immediate should be generated. */
11554 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
11555 gcc_assert (is_valid);
11557 element_char = sizetochar (info.element_width);
11558 lane_count = width / info.element_width;
11560 mode = GET_MODE_INNER (mode);
11561 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11563 gcc_assert (info.shift == 0 && ! info.mvn);
11564 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
11565 move immediate path. */
11566 if (aarch64_float_const_zero_rtx_p (info.value))
11567 info.value = GEN_INT (0);
11568 else
11570 #define buf_size 20
11571 char float_buf[buf_size] = {'\0'};
11572 real_to_decimal_for_mode (float_buf,
11573 CONST_DOUBLE_REAL_VALUE (info.value),
11574 buf_size, buf_size, 1, mode);
11575 #undef buf_size
11577 if (lane_count == 1)
11578 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
11579 else
11580 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
11581 lane_count, element_char, float_buf);
11582 return templ;
11586 mnemonic = info.mvn ? "mvni" : "movi";
11587 shift_op = info.msl ? "msl" : "lsl";
11589 gcc_assert (CONST_INT_P (info.value));
11590 if (lane_count == 1)
11591 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
11592 mnemonic, UINTVAL (info.value));
11593 else if (info.shift)
11594 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
11595 ", %s %d", mnemonic, lane_count, element_char,
11596 UINTVAL (info.value), shift_op, info.shift);
11597 else
11598 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
11599 mnemonic, lane_count, element_char, UINTVAL (info.value));
11600 return templ;
11603 char*
11604 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
11605 machine_mode mode)
11607 machine_mode vmode;
11609 gcc_assert (!VECTOR_MODE_P (mode));
11610 vmode = aarch64_simd_container_mode (mode, 64);
11611 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
11612 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
11615 /* Split operands into moves from op[1] + op[2] into op[0]. */
11617 void
11618 aarch64_split_combinev16qi (rtx operands[3])
11620 unsigned int dest = REGNO (operands[0]);
11621 unsigned int src1 = REGNO (operands[1]);
11622 unsigned int src2 = REGNO (operands[2]);
11623 machine_mode halfmode = GET_MODE (operands[1]);
11624 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
11625 rtx destlo, desthi;
11627 gcc_assert (halfmode == V16QImode);
11629 if (src1 == dest && src2 == dest + halfregs)
11631 /* No-op move. Can't split to nothing; emit something. */
11632 emit_note (NOTE_INSN_DELETED);
11633 return;
11636 /* Preserve register attributes for variable tracking. */
11637 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
11638 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
11639 GET_MODE_SIZE (halfmode));
11641 /* Special case of reversed high/low parts. */
11642 if (reg_overlap_mentioned_p (operands[2], destlo)
11643 && reg_overlap_mentioned_p (operands[1], desthi))
11645 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11646 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
11647 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11649 else if (!reg_overlap_mentioned_p (operands[2], destlo))
11651 /* Try to avoid unnecessary moves if part of the result
11652 is in the right place already. */
11653 if (src1 != dest)
11654 emit_move_insn (destlo, operands[1]);
11655 if (src2 != dest + halfregs)
11656 emit_move_insn (desthi, operands[2]);
11658 else
11660 if (src2 != dest + halfregs)
11661 emit_move_insn (desthi, operands[2]);
11662 if (src1 != dest)
11663 emit_move_insn (destlo, operands[1]);
11667 /* vec_perm support. */
11669 #define MAX_VECT_LEN 16
11671 struct expand_vec_perm_d
11673 rtx target, op0, op1;
11674 unsigned char perm[MAX_VECT_LEN];
11675 machine_mode vmode;
11676 unsigned char nelt;
11677 bool one_vector_p;
11678 bool testing_p;
11681 /* Generate a variable permutation. */
11683 static void
11684 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
11686 machine_mode vmode = GET_MODE (target);
11687 bool one_vector_p = rtx_equal_p (op0, op1);
11689 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
11690 gcc_checking_assert (GET_MODE (op0) == vmode);
11691 gcc_checking_assert (GET_MODE (op1) == vmode);
11692 gcc_checking_assert (GET_MODE (sel) == vmode);
11693 gcc_checking_assert (TARGET_SIMD);
11695 if (one_vector_p)
11697 if (vmode == V8QImode)
11699 /* Expand the argument to a V16QI mode by duplicating it. */
11700 rtx pair = gen_reg_rtx (V16QImode);
11701 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
11702 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11704 else
11706 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
11709 else
11711 rtx pair;
11713 if (vmode == V8QImode)
11715 pair = gen_reg_rtx (V16QImode);
11716 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
11717 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11719 else
11721 pair = gen_reg_rtx (OImode);
11722 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
11723 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
11728 void
11729 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
11731 machine_mode vmode = GET_MODE (target);
11732 unsigned int nelt = GET_MODE_NUNITS (vmode);
11733 bool one_vector_p = rtx_equal_p (op0, op1);
11734 rtx mask;
11736 /* The TBL instruction does not use a modulo index, so we must take care
11737 of that ourselves. */
11738 mask = aarch64_simd_gen_const_vector_dup (vmode,
11739 one_vector_p ? nelt - 1 : 2 * nelt - 1);
11740 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
11742 /* For big-endian, we also need to reverse the index within the vector
11743 (but not which vector). */
11744 if (BYTES_BIG_ENDIAN)
11746 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
11747 if (!one_vector_p)
11748 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
11749 sel = expand_simple_binop (vmode, XOR, sel, mask,
11750 NULL, 0, OPTAB_LIB_WIDEN);
11752 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
11755 /* Recognize patterns suitable for the TRN instructions. */
11756 static bool
11757 aarch64_evpc_trn (struct expand_vec_perm_d *d)
11759 unsigned int i, odd, mask, nelt = d->nelt;
11760 rtx out, in0, in1, x;
11761 rtx (*gen) (rtx, rtx, rtx);
11762 machine_mode vmode = d->vmode;
11764 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11765 return false;
11767 /* Note that these are little-endian tests.
11768 We correct for big-endian later. */
11769 if (d->perm[0] == 0)
11770 odd = 0;
11771 else if (d->perm[0] == 1)
11772 odd = 1;
11773 else
11774 return false;
11775 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11777 for (i = 0; i < nelt; i += 2)
11779 if (d->perm[i] != i + odd)
11780 return false;
11781 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
11782 return false;
11785 /* Success! */
11786 if (d->testing_p)
11787 return true;
11789 in0 = d->op0;
11790 in1 = d->op1;
11791 if (BYTES_BIG_ENDIAN)
11793 x = in0, in0 = in1, in1 = x;
11794 odd = !odd;
11796 out = d->target;
11798 if (odd)
11800 switch (vmode)
11802 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
11803 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
11804 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
11805 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
11806 case V4SImode: gen = gen_aarch64_trn2v4si; break;
11807 case V2SImode: gen = gen_aarch64_trn2v2si; break;
11808 case V2DImode: gen = gen_aarch64_trn2v2di; break;
11809 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
11810 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
11811 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
11812 default:
11813 return false;
11816 else
11818 switch (vmode)
11820 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
11821 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
11822 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
11823 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
11824 case V4SImode: gen = gen_aarch64_trn1v4si; break;
11825 case V2SImode: gen = gen_aarch64_trn1v2si; break;
11826 case V2DImode: gen = gen_aarch64_trn1v2di; break;
11827 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
11828 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
11829 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
11830 default:
11831 return false;
11835 emit_insn (gen (out, in0, in1));
11836 return true;
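/* Example of a permutation accepted above, for V4SImode with two distinct
   operands (little-endian indices): { 0, 4, 2, 6 } matches with odd == 0
   and is emitted as TRN1, while { 1, 5, 3, 7 } matches with odd == 1 and
   is emitted as TRN2.  */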
11839 /* Recognize patterns suitable for the UZP instructions. */
11840 static bool
11841 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
11843 unsigned int i, odd, mask, nelt = d->nelt;
11844 rtx out, in0, in1, x;
11845 rtx (*gen) (rtx, rtx, rtx);
11846 machine_mode vmode = d->vmode;
11848 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11849 return false;
11851 /* Note that these are little-endian tests.
11852 We correct for big-endian later. */
11853 if (d->perm[0] == 0)
11854 odd = 0;
11855 else if (d->perm[0] == 1)
11856 odd = 1;
11857 else
11858 return false;
11859 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11861 for (i = 0; i < nelt; i++)
11863 unsigned elt = (i * 2 + odd) & mask;
11864 if (d->perm[i] != elt)
11865 return false;
11868 /* Success! */
11869 if (d->testing_p)
11870 return true;
11872 in0 = d->op0;
11873 in1 = d->op1;
11874 if (BYTES_BIG_ENDIAN)
11876 x = in0, in0 = in1, in1 = x;
11877 odd = !odd;
11879 out = d->target;
11881 if (odd)
11883 switch (vmode)
11885 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
11886 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
11887 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
11888 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
11889 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
11890 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
11891 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
11892 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
11893 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
11894 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
11895 default:
11896 return false;
11899 else
11901 switch (vmode)
11903 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
11904 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
11905 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
11906 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
11907 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
11908 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
11909 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
11910 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
11911 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
11912 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
11913 default:
11914 return false;
11918 emit_insn (gen (out, in0, in1));
11919 return true;
11922 /* Recognize patterns suitable for the ZIP instructions. */
11923 static bool
11924 aarch64_evpc_zip (struct expand_vec_perm_d *d)
11926 unsigned int i, high, mask, nelt = d->nelt;
11927 rtx out, in0, in1, x;
11928 rtx (*gen) (rtx, rtx, rtx);
11929 machine_mode vmode = d->vmode;
11931 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11932 return false;
11934 /* Note that these are little-endian tests.
11935 We correct for big-endian later. */
11936 high = nelt / 2;
11937 if (d->perm[0] == high)
11938 /* Do Nothing. */
11940 else if (d->perm[0] == 0)
11941 high = 0;
11942 else
11943 return false;
11944 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11946 for (i = 0; i < nelt / 2; i++)
11948 unsigned elt = (i + high) & mask;
11949 if (d->perm[i * 2] != elt)
11950 return false;
11951 elt = (elt + nelt) & mask;
11952 if (d->perm[i * 2 + 1] != elt)
11953 return false;
11956 /* Success! */
11957 if (d->testing_p)
11958 return true;
11960 in0 = d->op0;
11961 in1 = d->op1;
11962 if (BYTES_BIG_ENDIAN)
11964 x = in0, in0 = in1, in1 = x;
11965 high = !high;
11967 out = d->target;
11969 if (high)
11971 switch (vmode)
11973 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
11974 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
11975 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
11976 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
11977 case V4SImode: gen = gen_aarch64_zip2v4si; break;
11978 case V2SImode: gen = gen_aarch64_zip2v2si; break;
11979 case V2DImode: gen = gen_aarch64_zip2v2di; break;
11980 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
11981 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
11982 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
11983 default:
11984 return false;
11987 else
11989 switch (vmode)
11991 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
11992 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
11993 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
11994 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
11995 case V4SImode: gen = gen_aarch64_zip1v4si; break;
11996 case V2SImode: gen = gen_aarch64_zip1v2si; break;
11997 case V2DImode: gen = gen_aarch64_zip1v2di; break;
11998 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
11999 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12000 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12001 default:
12002 return false;
12006 emit_insn (gen (out, in0, in1));
12007 return true;
12010 /* Recognize patterns for the EXT insn. */
12012 static bool
12013 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12015 unsigned int i, nelt = d->nelt;
12016 rtx (*gen) (rtx, rtx, rtx, rtx);
12017 rtx offset;
12019 unsigned int location = d->perm[0]; /* Always < nelt. */
12021 /* Check if the extracted indices are increasing by one. */
12022 for (i = 1; i < nelt; i++)
12024 unsigned int required = location + i;
12025 if (d->one_vector_p)
12027 /* We'll pass the same vector in twice, so allow indices to wrap. */
12028 required &= (nelt - 1);
12030 if (d->perm[i] != required)
12031 return false;
12034 switch (d->vmode)
12036 case V16QImode: gen = gen_aarch64_extv16qi; break;
12037 case V8QImode: gen = gen_aarch64_extv8qi; break;
12038 case V4HImode: gen = gen_aarch64_extv4hi; break;
12039 case V8HImode: gen = gen_aarch64_extv8hi; break;
12040 case V2SImode: gen = gen_aarch64_extv2si; break;
12041 case V4SImode: gen = gen_aarch64_extv4si; break;
12042 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12043 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12044 case V2DImode: gen = gen_aarch64_extv2di; break;
12045 case V2DFmode: gen = gen_aarch64_extv2df; break;
12046 default:
12047 return false;
12050 /* Success! */
12051 if (d->testing_p)
12052 return true;
12054 /* The case where (location == 0) is a no-op for both big- and little-endian,
12055 and is removed by the mid-end at optimization levels -O1 and higher. */
12057 if (BYTES_BIG_ENDIAN && (location != 0))
12059 /* After setup, we want the high elements of the first vector (stored
12060 at the LSB end of the register), and the low elements of the second
12061 vector (stored at the MSB end of the register). So swap. */
12062 std::swap (d->op0, d->op1);
12063 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12064 location = nelt - location;
12067 offset = GEN_INT (location);
12068 emit_insn (gen (d->target, d->op0, d->op1, offset));
12069 return true;
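/* Example: for V4SImode with two operands, the permutation { 1, 2, 3, 4 }
   has location == 1 and strictly increasing indices, so it is accepted
   here and emitted as an EXT of the two inputs with an element offset
   of 1.  */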
12072 /* Recognize patterns for the REV insns. */
12074 static bool
12075 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12077 unsigned int i, j, diff, nelt = d->nelt;
12078 rtx (*gen) (rtx, rtx);
12080 if (!d->one_vector_p)
12081 return false;
12083 diff = d->perm[0];
12084 switch (diff)
12086 case 7:
12087 switch (d->vmode)
12089 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12090 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12091 default:
12092 return false;
12094 break;
12095 case 3:
12096 switch (d->vmode)
12098 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12099 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12100 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12101 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12102 default:
12103 return false;
12105 break;
12106 case 1:
12107 switch (d->vmode)
12109 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12110 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12111 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12112 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12113 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12114 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12115 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12116 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12117 default:
12118 return false;
12120 break;
12121 default:
12122 return false;
12125 for (i = 0; i < nelt ; i += diff + 1)
12126 for (j = 0; j <= diff; j += 1)
12128 /* This is guaranteed to be true because the value of diff
12129 is 7, 3 or 1, and we always have enough elements in the
12130 queue to generate this. Getting a vector mask with a
12131 value of diff other than these values implies that
12132 something is wrong by the time we get here. */
12133 gcc_assert (i + j < nelt);
12134 if (d->perm[i + j] != i + diff - j)
12135 return false;
12138 /* Success! */
12139 if (d->testing_p)
12140 return true;
12142 emit_insn (gen (d->target, d->op0));
12143 return true;
12146 static bool
12147 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12149 rtx (*gen) (rtx, rtx, rtx);
12150 rtx out = d->target;
12151 rtx in0;
12152 machine_mode vmode = d->vmode;
12153 unsigned int i, elt, nelt = d->nelt;
12154 rtx lane;
12156 elt = d->perm[0];
12157 for (i = 1; i < nelt; i++)
12159 if (elt != d->perm[i])
12160 return false;
12163 /* The generic preparation in aarch64_expand_vec_perm_const_1
12164 swaps the operand order and the permute indices if it finds
12165 d->perm[0] to be in the second operand. Thus, we can always
12166 use d->op0 and need not do any extra arithmetic to get the
12167 correct lane number. */
12168 in0 = d->op0;
12169 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12171 switch (vmode)
12173 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12174 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12175 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12176 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12177 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12178 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12179 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12180 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12181 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12182 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12183 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12184 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12185 default:
12186 return false;
12189 emit_insn (gen (out, in0, lane));
12190 return true;
12193 static bool
12194 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12196 rtx rperm[MAX_VECT_LEN], sel;
12197 machine_mode vmode = d->vmode;
12198 unsigned int i, nelt = d->nelt;
12200 if (d->testing_p)
12201 return true;
12203 /* Generic code will try constant permutation twice: once with the
12204 original mode and again with the elements lowered to QImode.
12205 So wait and don't do the selector expansion ourselves. */
12206 if (vmode != V8QImode && vmode != V16QImode)
12207 return false;
12209 for (i = 0; i < nelt; ++i)
12211 int nunits = GET_MODE_NUNITS (vmode);
12213 /* If big-endian and two vectors we end up with a weird mixed-endian
12214 mode on NEON. Reverse the index within each word but not the word
12215 itself. */
12216 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12217 : d->perm[i]);
12219 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12220 sel = force_reg (vmode, sel);
12222 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12223 return true;
12226 static bool
12227 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12229 /* The pattern matching functions above are written to look for a small
12230 number to begin the sequence (0, 1, N/2). If we begin with an index
12231 from the second operand, we can swap the operands. */
12232 if (d->perm[0] >= d->nelt)
12234 unsigned i, nelt = d->nelt;
12236 gcc_assert (nelt == (nelt & -nelt));
12237 for (i = 0; i < nelt; ++i)
12238 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12240 std::swap (d->op0, d->op1);
12243 if (TARGET_SIMD)
12245 if (aarch64_evpc_rev (d))
12246 return true;
12247 else if (aarch64_evpc_ext (d))
12248 return true;
12249 else if (aarch64_evpc_dup (d))
12250 return true;
12251 else if (aarch64_evpc_zip (d))
12252 return true;
12253 else if (aarch64_evpc_uzp (d))
12254 return true;
12255 else if (aarch64_evpc_trn (d))
12256 return true;
12257 return aarch64_evpc_tbl (d);
12259 return false;
12262 /* Expand a vec_perm_const pattern. */
12264 bool
12265 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12267 struct expand_vec_perm_d d;
12268 int i, nelt, which;
12270 d.target = target;
12271 d.op0 = op0;
12272 d.op1 = op1;
12274 d.vmode = GET_MODE (target);
12275 gcc_assert (VECTOR_MODE_P (d.vmode));
12276 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12277 d.testing_p = false;
12279 for (i = which = 0; i < nelt; ++i)
12281 rtx e = XVECEXP (sel, 0, i);
12282 int ei = INTVAL (e) & (2 * nelt - 1);
12283 which |= (ei < nelt ? 1 : 2);
12284 d.perm[i] = ei;
12287 switch (which)
12289 default:
12290 gcc_unreachable ();
12292 case 3:
12293 d.one_vector_p = false;
12294 if (!rtx_equal_p (op0, op1))
12295 break;
12297 /* The elements of PERM do not suggest that only the first operand
12298 is used, but both operands are identical. Allow easier matching
12299 of the permutation by folding the permutation into the single
12300 input vector. */
12301 /* Fall Through. */
12302 case 2:
12303 for (i = 0; i < nelt; ++i)
12304 d.perm[i] &= nelt - 1;
12305 d.op0 = op1;
12306 d.one_vector_p = true;
12307 break;
12309 case 1:
12310 d.op1 = op0;
12311 d.one_vector_p = true;
12312 break;
12315 return aarch64_expand_vec_perm_const_1 (&d);
12318 static bool
12319 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12320 const unsigned char *sel)
12322 struct expand_vec_perm_d d;
12323 unsigned int i, nelt, which;
12324 bool ret;
12326 d.vmode = vmode;
12327 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12328 d.testing_p = true;
12329 memcpy (d.perm, sel, nelt);
12331 /* Calculate whether all elements are in one vector. */
12332 for (i = which = 0; i < nelt; ++i)
12334 unsigned char e = d.perm[i];
12335 gcc_assert (e < 2 * nelt);
12336 which |= (e < nelt ? 1 : 2);
12339 /* If all elements are from the second vector, reindex as if from the
12340 first vector. */
12341 if (which == 2)
12342 for (i = 0; i < nelt; ++i)
12343 d.perm[i] -= nelt;
12345 /* Check whether the mask can be applied to a single vector. */
12346 d.one_vector_p = (which != 3);
12348 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12349 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12350 if (!d.one_vector_p)
12351 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12353 start_sequence ();
12354 ret = aarch64_expand_vec_perm_const_1 (&d);
12355 end_sequence ();
12357 return ret;
12360 rtx
12361 aarch64_reverse_mask (enum machine_mode mode)
12363 /* We have to reverse each vector because we don't have
12364 a permuted load that can reverse-load according to ABI rules. */
12365 rtx mask;
12366 rtvec v = rtvec_alloc (16);
12367 int i, j;
12368 int nunits = GET_MODE_NUNITS (mode);
12369 int usize = GET_MODE_UNIT_SIZE (mode);
12371 gcc_assert (BYTES_BIG_ENDIAN);
12372 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12374 for (i = 0; i < nunits; i++)
12375 for (j = 0; j < usize; j++)
12376 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12377 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12378 return force_reg (V16QImode, mask);
12381 /* Implement MODES_TIEABLE_P. */
12383 bool
12384 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12386 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12387 return true;
12389 /* We specifically want to allow elements of "structure" modes to
12390 be tieable to the structure. This more general condition allows
12391 other rarer situations too. */
12392 if (TARGET_SIMD
12393 && aarch64_vector_mode_p (mode1)
12394 && aarch64_vector_mode_p (mode2))
12395 return true;
12397 return false;
12400 /* Return a new RTX holding the result of moving POINTER forward by
12401 AMOUNT bytes. */
12403 static rtx
12404 aarch64_move_pointer (rtx pointer, int amount)
12406 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12408 return adjust_automodify_address (pointer, GET_MODE (pointer),
12409 next, amount);
12412 /* Return a new RTX holding the result of moving POINTER forward by the
12413 size of the mode it points to. */
12415 static rtx
12416 aarch64_progress_pointer (rtx pointer)
12418 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12420 return aarch64_move_pointer (pointer, amount);
12423 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12424 MODE bytes. */
12426 static void
12427 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12428 machine_mode mode)
12430 rtx reg = gen_reg_rtx (mode);
12432 /* "Cast" the pointers to the correct mode. */
12433 *src = adjust_address (*src, mode, 0);
12434 *dst = adjust_address (*dst, mode, 0);
12435 /* Emit the memcpy. */
12436 emit_move_insn (reg, *src);
12437 emit_move_insn (*dst, reg);
12438 /* Move the pointers forward. */
12439 *src = aarch64_progress_pointer (*src);
12440 *dst = aarch64_progress_pointer (*dst);
12443 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12444 we succeed, otherwise return false. */
12446 bool
12447 aarch64_expand_movmem (rtx *operands)
12449 unsigned int n;
12450 rtx dst = operands[0];
12451 rtx src = operands[1];
12452 rtx base;
12453 bool speed_p = !optimize_function_for_size_p (cfun);
12455 /* When optimizing for size, give a better estimate of the length of a
12456 memcpy call, but use the default otherwise. */
12457 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12459 /* We can't do anything smart if the amount to copy is not constant. */
12460 if (!CONST_INT_P (operands[2]))
12461 return false;
12463 n = UINTVAL (operands[2]);
12465 /* Try to keep the number of instructions low. For cases below 16 bytes we
12466 need to make at most two moves. For cases above 16 bytes it will be one
12467 move for each 16 byte chunk, then at most two additional moves. */
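/* For example, a 35-byte copy is estimated as 35/16 + 2 = 4 moves:
   two 16-byte chunks plus at most two moves for the 3-byte tail.  */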
12468 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12469 return false;
12471 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12472 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12474 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12475 src = adjust_automodify_address (src, VOIDmode, base, 0);
12477 /* Simple cases. Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
12478 then a 1-byte chunk. */
12479 if (n < 4)
12481 if (n >= 2)
12483 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12484 n -= 2;
12487 if (n == 1)
12488 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12490 return true;
12493 /* Copy 4-7 bytes. First a 4-byte chunk, then (if applicable) a second
12494 4-byte chunk, partially overlapping with the previously copied chunk. */
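/* For example, a 7-byte copy is done as bytes 0-3 followed by bytes 3-6,
   overlapping by one byte.  */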
12495 if (n < 8)
12497 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12498 n -= 4;
12499 if (n > 0)
12501 int move = n - 4;
12503 src = aarch64_move_pointer (src, move);
12504 dst = aarch64_move_pointer (dst, move);
12505 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12507 return true;
12510 /* Copy 8 or more bytes. Copy chunks of 16 bytes until we run out of
12511 them, then (if applicable) an 8-byte chunk. */
12512 while (n >= 8)
12514 if (n / 16)
12516 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12517 n -= 16;
12519 else
12521 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12522 n -= 8;
12526 /* Finish the final bytes of the copy. We can always do this in one
12527 instruction. We either copy the exact amount we need, or partially
12528 overlap with the previous chunk we copied and copy 8 bytes. */
12529 if (n == 0)
12530 return true;
12531 else if (n == 1)
12532 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12533 else if (n == 2)
12534 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12535 else if (n == 4)
12536 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12537 else
12539 if (n == 3)
12541 src = aarch64_move_pointer (src, -1);
12542 dst = aarch64_move_pointer (dst, -1);
12543 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12545 else
12547 int move = n - 8;
12549 src = aarch64_move_pointer (src, move);
12550 dst = aarch64_move_pointer (dst, move);
12551 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12555 return true;
12558 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
12560 static unsigned HOST_WIDE_INT
12561 aarch64_asan_shadow_offset (void)
12563 return (HOST_WIDE_INT_1 << 36);
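/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */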
12566 static bool
12567 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
12568 unsigned int align,
12569 enum by_pieces_operation op,
12570 bool speed_p)
12572 /* STORE_BY_PIECES can be used when copying a constant string, but
12573 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
12574 For now we always fail this and let the move_by_pieces code copy
12575 the string from read-only memory. */
12576 if (op == STORE_BY_PIECES)
12577 return false;
12579 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
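/* Return the CC_D mode that a conditional compare for comparison CODE
   should produce, or CCmode if CODE has no dedicated CC_D mode (in which
   case the callers below give up).  */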
12582 static enum machine_mode
12583 aarch64_code_to_ccmode (enum rtx_code code)
12585 switch (code)
12587 case NE:
12588 return CC_DNEmode;
12590 case EQ:
12591 return CC_DEQmode;
12593 case LE:
12594 return CC_DLEmode;
12596 case LT:
12597 return CC_DLTmode;
12599 case GE:
12600 return CC_DGEmode;
12602 case GT:
12603 return CC_DGTmode;
12605 case LEU:
12606 return CC_DLEUmode;
12608 case LTU:
12609 return CC_DLTUmode;
12611 case GEU:
12612 return CC_DGEUmode;
12614 case GTU:
12615 return CC_DGTUmode;
12617 default:
12618 return CCmode;
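/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first compare of a
   conditional-compare chain for CODE applied to TREEOP0 and TREEOP1,
   storing the preparation and compare sequences in *PREP_SEQ and
   *GEN_SEQ.  Return the CC register on success, NULL_RTX otherwise.  */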
12622 static rtx
12623 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
12624 int code, tree treeop0, tree treeop1)
12626 enum machine_mode op_mode, cmp_mode, cc_mode;
12627 rtx op0, op1, cmp, target;
12628 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12629 enum insn_code icode;
12630 struct expand_operand ops[4];
12632 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
12633 if (cc_mode == CCmode)
12634 return NULL_RTX;
12636 start_sequence ();
12637 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12639 op_mode = GET_MODE (op0);
12640 if (op_mode == VOIDmode)
12641 op_mode = GET_MODE (op1);
12643 switch (op_mode)
12645 case QImode:
12646 case HImode:
12647 case SImode:
12648 cmp_mode = SImode;
12649 icode = CODE_FOR_cmpsi;
12650 break;
12652 case DImode:
12653 cmp_mode = DImode;
12654 icode = CODE_FOR_cmpdi;
12655 break;
12657 default:
12658 end_sequence ();
12659 return NULL_RTX;
12662 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12663 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12664 if (!op0 || !op1)
12666 end_sequence ();
12667 return NULL_RTX;
12669 *prep_seq = get_insns ();
12670 end_sequence ();
12672 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
12673 target = gen_rtx_REG (CCmode, CC_REGNUM);
12675 create_output_operand (&ops[0], target, CCmode);
12676 create_fixed_operand (&ops[1], cmp);
12677 create_fixed_operand (&ops[2], op0);
12678 create_fixed_operand (&ops[3], op1);
12680 start_sequence ();
12681 if (!maybe_expand_insn (icode, 4, ops))
12683 end_sequence ();
12684 return NULL_RTX;
12686 *gen_seq = get_insns ();
12687 end_sequence ();
12689 return gen_rtx_REG (cc_mode, CC_REGNUM);
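/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent compare in the
   chain, combining the CC result PREV with the comparison CMP_CODE of
   TREEOP0 and TREEOP1 under BIT_CODE (AND or IOR).  Return the CC
   register on success, NULL_RTX otherwise.  */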
12692 static rtx
12693 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
12694 tree treeop0, tree treeop1, int bit_code)
12696 rtx op0, op1, cmp0, cmp1, target;
12697 enum machine_mode op_mode, cmp_mode, cc_mode;
12698 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12699 enum insn_code icode = CODE_FOR_ccmp_andsi;
12700 struct expand_operand ops[6];
12702 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
12703 if (cc_mode == CCmode)
12704 return NULL_RTX;
12706 push_to_sequence ((rtx_insn*) *prep_seq);
12707 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12709 op_mode = GET_MODE (op0);
12710 if (op_mode == VOIDmode)
12711 op_mode = GET_MODE (op1);
12713 switch (op_mode)
12715 case QImode:
12716 case HImode:
12717 case SImode:
12718 cmp_mode = SImode;
12719 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
12720 : CODE_FOR_ccmp_iorsi;
12721 break;
12723 case DImode:
12724 cmp_mode = DImode;
12725 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
12726 : CODE_FOR_ccmp_iordi;
12727 break;
12729 default:
12730 end_sequence ();
12731 return NULL_RTX;
12734 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12735 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12736 if (!op0 || !op1)
12738 end_sequence ();
12739 return NULL_RTX;
12741 *prep_seq = get_insns ();
12742 end_sequence ();
12744 target = gen_rtx_REG (cc_mode, CC_REGNUM);
12745 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
12746 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
12748 create_fixed_operand (&ops[0], prev);
12749 create_fixed_operand (&ops[1], target);
12750 create_fixed_operand (&ops[2], op0);
12751 create_fixed_operand (&ops[3], op1);
12752 create_fixed_operand (&ops[4], cmp0);
12753 create_fixed_operand (&ops[5], cmp1);
12755 push_to_sequence ((rtx_insn*) *gen_seq);
12756 if (!maybe_expand_insn (icode, 6, ops))
12758 end_sequence ();
12759 return NULL_RTX;
12762 *gen_seq = get_insns ();
12763 end_sequence ();
12765 return target;
12768 #undef TARGET_GEN_CCMP_FIRST
12769 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
12771 #undef TARGET_GEN_CCMP_NEXT
12772 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
12774 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
12775 instruction fusion of some sort. */
12777 static bool
12778 aarch64_macro_fusion_p (void)
12780 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
12784 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
12785 should be kept together during scheduling. */
12787 static bool
12788 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
12790 rtx set_dest;
12791 rtx prev_set = single_set (prev);
12792 rtx curr_set = single_set (curr);
12793 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
12794 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
12796 if (!aarch64_macro_fusion_p ())
12797 return false;
12799 if (simple_sets_p
12800 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
12802 /* We are trying to match:
12803 prev (mov) == (set (reg r0) (const_int imm16))
12804 curr (movk) == (set (zero_extract (reg r0)
12805 (const_int 16)
12806 (const_int 16))
12807 (const_int imm16_1)) */
12809 set_dest = SET_DEST (curr_set);
12811 if (GET_CODE (set_dest) == ZERO_EXTRACT
12812 && CONST_INT_P (SET_SRC (curr_set))
12813 && CONST_INT_P (SET_SRC (prev_set))
12814 && CONST_INT_P (XEXP (set_dest, 2))
12815 && INTVAL (XEXP (set_dest, 2)) == 16
12816 && REG_P (XEXP (set_dest, 0))
12817 && REG_P (SET_DEST (prev_set))
12818 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
12820 return true;
12824 if (simple_sets_p
12825 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
12828 /* We're trying to match:
12829 prev (adrp) == (set (reg r1)
12830 (high (symbol_ref ("SYM"))))
12831 curr (add) == (set (reg r0)
12832 (lo_sum (reg r1)
12833 (symbol_ref ("SYM"))))
12834 Note that r0 need not necessarily be the same as r1, especially
12835 during pre-regalloc scheduling. */
12837 if (satisfies_constraint_Ush (SET_SRC (prev_set))
12838 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
12840 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
12841 && REG_P (XEXP (SET_SRC (curr_set), 0))
12842 && REGNO (XEXP (SET_SRC (curr_set), 0))
12843 == REGNO (SET_DEST (prev_set))
12844 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
12845 XEXP (SET_SRC (curr_set), 1)))
12846 return true;
12850 if (simple_sets_p
12851 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
12854 /* We're trying to match:
12855 prev (movk) == (set (zero_extract (reg r0)
12856 (const_int 16)
12857 (const_int 32))
12858 (const_int imm16_1))
12859 curr (movk) == (set (zero_extract (reg r0)
12860 (const_int 16)
12861 (const_int 48))
12862 (const_int imm16_2)) */
12864 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
12865 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
12866 && REG_P (XEXP (SET_DEST (prev_set), 0))
12867 && REG_P (XEXP (SET_DEST (curr_set), 0))
12868 && REGNO (XEXP (SET_DEST (prev_set), 0))
12869 == REGNO (XEXP (SET_DEST (curr_set), 0))
12870 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
12871 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
12872 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
12873 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
12874 && CONST_INT_P (SET_SRC (prev_set))
12875 && CONST_INT_P (SET_SRC (curr_set)))
12876 return true;
12879 if (simple_sets_p
12880 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
12882 /* We're trying to match:
12883 prev (adrp) == (set (reg r0)
12884 (high (symbol_ref ("SYM"))))
12885 curr (ldr) == (set (reg r1)
12886 (mem (lo_sum (reg r0)
12887 (symbol_ref ("SYM")))))
12889 curr (ldr) == (set (reg r1)
12890 (zero_extend (mem
12891 (lo_sum (reg r0)
12892 (symbol_ref ("SYM")))))) */
12893 if (satisfies_constraint_Ush (SET_SRC (prev_set))
12894 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
12896 rtx curr_src = SET_SRC (curr_set);
12898 if (GET_CODE (curr_src) == ZERO_EXTEND)
12899 curr_src = XEXP (curr_src, 0);
12901 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
12902 && REG_P (XEXP (XEXP (curr_src, 0), 0))
12903 && REGNO (XEXP (XEXP (curr_src, 0), 0))
12904 == REGNO (SET_DEST (prev_set))
12905 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
12906 XEXP (SET_SRC (prev_set), 0)))
12907 return true;
12911 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
12912 && any_condjump_p (curr))
12914 enum attr_type prev_type = get_attr_type (prev);
12916 /* FIXME: this misses some instructions which ThunderX considers simple
12917 arithmetic; simple shifts, for example, are missed here. */
12918 if (prev_type == TYPE_ALUS_SREG
12919 || prev_type == TYPE_ALUS_IMM
12920 || prev_type == TYPE_LOGICS_REG
12921 || prev_type == TYPE_LOGICS_IMM)
12922 return true;
12925 return false;
12928 /* If MEM is in the form of [base+offset], extract the two parts
12929 of the address into BASE and OFFSET; otherwise return false
12930 after clearing BASE and OFFSET. */
12932 bool
12933 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
12935 rtx addr;
12937 gcc_assert (MEM_P (mem));
12939 addr = XEXP (mem, 0);
12941 if (REG_P (addr))
12943 *base = addr;
12944 *offset = const0_rtx;
12945 return true;
12948 if (GET_CODE (addr) == PLUS
12949 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
12951 *base = XEXP (addr, 0);
12952 *offset = XEXP (addr, 1);
12953 return true;
12956 *base = NULL_RTX;
12957 *offset = NULL_RTX;
12959 return false;
12962 /* Types for scheduling fusion. */
12963 enum sched_fusion_type
12965 SCHED_FUSION_NONE = 0,
12966 SCHED_FUSION_LD_SIGN_EXTEND,
12967 SCHED_FUSION_LD_ZERO_EXTEND,
12968 SCHED_FUSION_LD,
12969 SCHED_FUSION_ST,
12970 SCHED_FUSION_NUM
12973 /* If INSN is a load or store whose address is in the form [base+offset],
12974 extract the two parts into BASE and OFFSET. Return the scheduling
12975 fusion type of this INSN. */
12977 static enum sched_fusion_type
12978 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
12980 rtx x, dest, src;
12981 enum sched_fusion_type fusion = SCHED_FUSION_LD;
12983 gcc_assert (INSN_P (insn));
12984 x = PATTERN (insn);
12985 if (GET_CODE (x) != SET)
12986 return SCHED_FUSION_NONE;
12988 src = SET_SRC (x);
12989 dest = SET_DEST (x);
12991 machine_mode dest_mode = GET_MODE (dest);
12993 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
12994 return SCHED_FUSION_NONE;
12996 if (GET_CODE (src) == SIGN_EXTEND)
12998 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
12999 src = XEXP (src, 0);
13000 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13001 return SCHED_FUSION_NONE;
13003 else if (GET_CODE (src) == ZERO_EXTEND)
13005 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13006 src = XEXP (src, 0);
13007 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13008 return SCHED_FUSION_NONE;
13011 if (GET_CODE (src) == MEM && REG_P (dest))
13012 extract_base_offset_in_addr (src, base, offset);
13013 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13015 fusion = SCHED_FUSION_ST;
13016 extract_base_offset_in_addr (dest, base, offset);
13018 else
13019 return SCHED_FUSION_NONE;
13021 if (*base == NULL_RTX || *offset == NULL_RTX)
13022 fusion = SCHED_FUSION_NONE;
13024 return fusion;
13027 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13029 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13030 and PRI are only calculated for these instructions. For other instructions,
13031 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
13032 types of instruction fusion can be added by returning different priorities.
13034 It's important that irrelevant instructions get the largest FUSION_PRI. */
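/* For example, two qualifying loads from the same base register get
   equal FUSION_PRI values, and the one with the smaller offset gets the
   larger PRI, so they tend to be scheduled next to each other, which
   helps the ldp/stp peepholes find them.  */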
13036 static void
13037 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13038 int *fusion_pri, int *pri)
13040 int tmp, off_val;
13041 rtx base, offset;
13042 enum sched_fusion_type fusion;
13044 gcc_assert (INSN_P (insn));
13046 tmp = max_pri - 1;
13047 fusion = fusion_load_store (insn, &base, &offset);
13048 if (fusion == SCHED_FUSION_NONE)
13050 *pri = tmp;
13051 *fusion_pri = tmp;
13052 return;
13055 /* Set FUSION_PRI according to fusion type and base register. */
13056 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13058 /* Calculate PRI. */
13059 tmp /= 2;
13061 /* INSN with smaller offset goes first. */
13062 off_val = (int)(INTVAL (offset));
13063 if (off_val >= 0)
13064 tmp -= (off_val & 0xfffff);
13065 else
13066 tmp += ((- off_val) & 0xfffff);
13068 *pri = tmp;
13069 return;
13072 /* Given OPERANDS of consecutive load/store instructions, check if we
13073 can merge them into an ldp/stp. LOAD is true if they are load
13074 instructions. MODE is the mode of the memory operands. */
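/* For example, this returns true for a pair of SImode loads

     ldr w0, [x2]
     ldr w1, [x2, 4]

   which can then be emitted as a single

     ldp w0, w1, [x2]  */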
13076 bool
13077 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13078 enum machine_mode mode)
13080 HOST_WIDE_INT offval_1, offval_2, msize;
13081 enum reg_class rclass_1, rclass_2;
13082 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13084 if (load)
13086 mem_1 = operands[1];
13087 mem_2 = operands[3];
13088 reg_1 = operands[0];
13089 reg_2 = operands[2];
13090 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13091 if (REGNO (reg_1) == REGNO (reg_2))
13092 return false;
13094 else
13096 mem_1 = operands[0];
13097 mem_2 = operands[2];
13098 reg_1 = operands[1];
13099 reg_2 = operands[3];
13102 /* The mems cannot be volatile. */
13103 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13104 return false;
13106 /* Check if the addresses are in the form of [base+offset]. */
13107 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13108 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13109 return false;
13110 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13111 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13112 return false;
13114 /* Check if the bases are the same. */
13115 if (!rtx_equal_p (base_1, base_2))
13116 return false;
13118 offval_1 = INTVAL (offset_1);
13119 offval_2 = INTVAL (offset_2);
13120 msize = GET_MODE_SIZE (mode);
13121 /* Check if the offsets are consecutive. */
13122 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13123 return false;
13125 /* Check if the addresses are clobbered by load. */
13126 if (load)
13128 if (reg_mentioned_p (reg_1, mem_1))
13129 return false;
13131 /* In increasing order, the last load can clobber the address. */
13132 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13133 return false;
13136 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13137 rclass_1 = FP_REGS;
13138 else
13139 rclass_1 = GENERAL_REGS;
13141 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13142 rclass_2 = FP_REGS;
13143 else
13144 rclass_2 = GENERAL_REGS;
13146 /* Check if the registers are of the same class. */
13147 if (rclass_1 != rclass_2)
13148 return false;
13150 return true;
13153 /* Given OPERANDS of consecutive load/store instructions, check if we
13154 can merge them into an ldp/stp by adjusting the offset. LOAD is true
13155 if they are load instructions. MODE is the mode of the memory operands.
13157 Given the following consecutive stores:
13159 str w1, [xb, 0x100]
13160 str w1, [xb, 0x104]
13161 str w1, [xb, 0x108]
13162 str w1, [xb, 0x10c]
13164 Though the offsets are out of the range supported by stp, we can
13165 still pair them after adjusting the offset, like:
13167 add scratch, xb, 0x100
13168 stp w1, w1, [scratch]
13169 stp w1, w1, [scratch, 0x8]
13171 The peephole patterns detecting this opportunity should guarantee
13172 the scratch register is available. */
13174 bool
13175 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13176 enum machine_mode mode)
13178 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13179 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13180 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13181 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13183 if (load)
13185 reg_1 = operands[0];
13186 mem_1 = operands[1];
13187 reg_2 = operands[2];
13188 mem_2 = operands[3];
13189 reg_3 = operands[4];
13190 mem_3 = operands[5];
13191 reg_4 = operands[6];
13192 mem_4 = operands[7];
13193 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13194 && REG_P (reg_3) && REG_P (reg_4));
13195 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13196 return false;
13198 else
13200 mem_1 = operands[0];
13201 reg_1 = operands[1];
13202 mem_2 = operands[2];
13203 reg_2 = operands[3];
13204 mem_3 = operands[4];
13205 reg_3 = operands[5];
13206 mem_4 = operands[6];
13207 reg_4 = operands[7];
13209 /* Skip if the memory operand is by itself valid for ldp/stp. */
13210 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13211 return false;
13213 /* The mems cannot be volatile. */
13214 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13215 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13216 return false;
13218 /* Check if the addresses are in the form of [base+offset]. */
13219 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13220 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13221 return false;
13222 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13223 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13224 return false;
13225 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13226 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13227 return false;
13228 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13229 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13230 return false;
13232 /* Check if the bases are the same. */
13233 if (!rtx_equal_p (base_1, base_2)
13234 || !rtx_equal_p (base_2, base_3)
13235 || !rtx_equal_p (base_3, base_4))
13236 return false;
13238 offval_1 = INTVAL (offset_1);
13239 offval_2 = INTVAL (offset_2);
13240 offval_3 = INTVAL (offset_3);
13241 offval_4 = INTVAL (offset_4);
13242 msize = GET_MODE_SIZE (mode);
13243 /* Check if the offsets are consecutive. */
13244 if ((offval_1 != (offval_2 + msize)
13245 || offval_1 != (offval_3 + msize * 2)
13246 || offval_1 != (offval_4 + msize * 3))
13247 && (offval_4 != (offval_3 + msize)
13248 || offval_4 != (offval_2 + msize * 2)
13249 || offval_4 != (offval_1 + msize * 3)))
13250 return false;
13252 /* Check if the addresses are clobbered by load. */
13253 if (load)
13255 if (reg_mentioned_p (reg_1, mem_1)
13256 || reg_mentioned_p (reg_2, mem_2)
13257 || reg_mentioned_p (reg_3, mem_3))
13258 return false;
13260 /* In increasing order, the last load can clobber the address. */
13261 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13262 return false;
13265 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13266 rclass_1 = FP_REGS;
13267 else
13268 rclass_1 = GENERAL_REGS;
13270 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13271 rclass_2 = FP_REGS;
13272 else
13273 rclass_2 = GENERAL_REGS;
13275 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13276 rclass_3 = FP_REGS;
13277 else
13278 rclass_3 = GENERAL_REGS;
13280 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13281 rclass_4 = FP_REGS;
13282 else
13283 rclass_4 = GENERAL_REGS;
13285 /* Check if the registers are of the same class. */
13286 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13287 return false;
13289 return true;
13292 /* Given OPERANDS of consecutive load/store instructions, this function
13293 pairs them into ldp/stp after adjusting the offset. It depends on the
13294 fact that the addresses of the load/store instructions are in increasing
13295 order. MODE is the mode of the memory operands. CODE is the rtl operator
13296 which should be applied to all memory operands; it is SIGN_EXTEND,
13297 ZERO_EXTEND or UNKNOWN. */
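/* For example, for the SImode stores at offsets 0x100 to 0x10c shown
   earlier: msize is 4, so stp_off_limit is 0x100; adj_off becomes 0x100
   and new_off 0, so the scratch base is set to base + 0x100 and the
   stores pair up at offsets 0/4 and 8/12.  */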
13299 bool
13300 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13301 enum machine_mode mode, RTX_CODE code)
13303 rtx base, offset, t1, t2;
13304 rtx mem_1, mem_2, mem_3, mem_4;
13305 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13307 if (load)
13309 mem_1 = operands[1];
13310 mem_2 = operands[3];
13311 mem_3 = operands[5];
13312 mem_4 = operands[7];
13314 else
13316 mem_1 = operands[0];
13317 mem_2 = operands[2];
13318 mem_3 = operands[4];
13319 mem_4 = operands[6];
13320 gcc_assert (code == UNKNOWN);
13323 extract_base_offset_in_addr (mem_1, &base, &offset);
13324 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13326 /* Adjust the offset so that it fits in an ldp/stp instruction. */
13327 msize = GET_MODE_SIZE (mode);
13328 stp_off_limit = msize * 0x40;
13329 off_val = INTVAL (offset);
13330 abs_off = (off_val < 0) ? -off_val : off_val;
13331 new_off = abs_off % stp_off_limit;
13332 adj_off = abs_off - new_off;
13334 /* Further adjust to make sure all offsets are OK. */
13335 if ((new_off + msize * 2) >= stp_off_limit)
13337 adj_off += stp_off_limit;
13338 new_off -= stp_off_limit;
13341 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13342 if (adj_off >= 0x1000)
13343 return false;
13345 if (off_val < 0)
13347 adj_off = -adj_off;
13348 new_off = -new_off;
13351 /* Create new memory references. */
13352 mem_1 = change_address (mem_1, VOIDmode,
13353 plus_constant (DImode, operands[8], new_off));
13355 /* Check if the adjusted address is OK for ldp/stp. */
13356 if (!aarch64_mem_pair_operand (mem_1, mode))
13357 return false;
13359 msize = GET_MODE_SIZE (mode);
13360 mem_2 = change_address (mem_2, VOIDmode,
13361 plus_constant (DImode,
13362 operands[8],
13363 new_off + msize));
13364 mem_3 = change_address (mem_3, VOIDmode,
13365 plus_constant (DImode,
13366 operands[8],
13367 new_off + msize * 2));
13368 mem_4 = change_address (mem_4, VOIDmode,
13369 plus_constant (DImode,
13370 operands[8],
13371 new_off + msize * 3));
13373 if (code == ZERO_EXTEND)
13375 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13376 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13377 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13378 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13380 else if (code == SIGN_EXTEND)
13382 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13383 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13384 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13385 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13388 if (load)
13390 operands[1] = mem_1;
13391 operands[3] = mem_2;
13392 operands[5] = mem_3;
13393 operands[7] = mem_4;
13395 else
13397 operands[0] = mem_1;
13398 operands[2] = mem_2;
13399 operands[4] = mem_3;
13400 operands[6] = mem_4;
13403 /* Emit adjusting instruction. */
13404 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13405 /* Emit ldp/stp instructions. */
13406 t1 = gen_rtx_SET (operands[0], operands[1]);
13407 t2 = gen_rtx_SET (operands[2], operands[3]);
13408 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13409 t1 = gen_rtx_SET (operands[4], operands[5]);
13410 t2 = gen_rtx_SET (operands[6], operands[7]);
13411 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13412 return true;
13415 /* Return true if a pseudo register should be created and used to hold
13416 the GOT address for PIC code. */
13418 bool
13419 aarch64_use_pseudo_pic_reg (void)
13421 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13424 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13426 static int
13427 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13429 switch (XINT (x, 1))
13431 case UNSPEC_GOTSMALLPIC:
13432 case UNSPEC_GOTSMALLPIC28K:
13433 case UNSPEC_GOTTINYPIC:
13434 return 0;
13435 default:
13436 break;
13439 return default_unspec_may_trap_p (x, flags);
13443 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13444 return the log2 of that value. Otherwise return -1. */
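/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and -2.0
   all yield -1.  */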
13446 int
13447 aarch64_fpconst_pow_of_2 (rtx x)
13449 const REAL_VALUE_TYPE *r;
13451 if (!CONST_DOUBLE_P (x))
13452 return -1;
13454 r = CONST_DOUBLE_REAL_VALUE (x);
13456 if (REAL_VALUE_NEGATIVE (*r)
13457 || REAL_VALUE_ISNAN (*r)
13458 || REAL_VALUE_ISINF (*r)
13459 || !real_isinteger (r, DFmode))
13460 return -1;
13462 return exact_log2 (real_to_integer (r));
13465 /* If X is a vector of equal CONST_DOUBLE values and that value is
13466 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13468 int
13469 aarch64_vec_fpconst_pow_of_2 (rtx x)
13471 if (GET_CODE (x) != CONST_VECTOR)
13472 return -1;
13474 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13475 return -1;
13477 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13478 if (firstval <= 0)
13479 return -1;
13481 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13482 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13483 return -1;
13485 return firstval;
13488 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13489 static tree
13490 aarch64_promoted_type (const_tree t)
13492 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13493 return float_type_node;
13494 return NULL_TREE;
13496 #undef TARGET_ADDRESS_COST
13497 #define TARGET_ADDRESS_COST aarch64_address_cost
13499 /* This hook determines whether unnamed bitfields affect the alignment
13500 of the containing structure. The hook returns true if the structure
13501 should inherit the alignment requirements of an unnamed bitfield's
13502 type. */
13503 #undef TARGET_ALIGN_ANON_BITFIELD
13504 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
13506 #undef TARGET_ASM_ALIGNED_DI_OP
13507 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
13509 #undef TARGET_ASM_ALIGNED_HI_OP
13510 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
13512 #undef TARGET_ASM_ALIGNED_SI_OP
13513 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
13515 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
13516 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
13517 hook_bool_const_tree_hwi_hwi_const_tree_true
13519 #undef TARGET_ASM_OUTPUT_MI_THUNK
13520 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
13522 #undef TARGET_ASM_SELECT_RTX_SECTION
13523 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
13525 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
13526 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
13528 #undef TARGET_BUILD_BUILTIN_VA_LIST
13529 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
13531 #undef TARGET_CALLEE_COPIES
13532 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
13534 #undef TARGET_CAN_ELIMINATE
13535 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
13537 #undef TARGET_CAN_INLINE_P
13538 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
13540 #undef TARGET_CANNOT_FORCE_CONST_MEM
13541 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
13543 #undef TARGET_CONDITIONAL_REGISTER_USAGE
13544 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
13546 /* Only the least significant bit is used for initialization guard
13547 variables. */
13548 #undef TARGET_CXX_GUARD_MASK_BIT
13549 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
13551 #undef TARGET_C_MODE_FOR_SUFFIX
13552 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
13554 #ifdef TARGET_BIG_ENDIAN_DEFAULT
13555 #undef TARGET_DEFAULT_TARGET_FLAGS
13556 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
13557 #endif
13559 #undef TARGET_CLASS_MAX_NREGS
13560 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
13562 #undef TARGET_BUILTIN_DECL
13563 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
13565 #undef TARGET_BUILTIN_RECIPROCAL
13566 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
13568 #undef TARGET_EXPAND_BUILTIN
13569 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
13571 #undef TARGET_EXPAND_BUILTIN_VA_START
13572 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
13574 #undef TARGET_FOLD_BUILTIN
13575 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
13577 #undef TARGET_FUNCTION_ARG
13578 #define TARGET_FUNCTION_ARG aarch64_function_arg
13580 #undef TARGET_FUNCTION_ARG_ADVANCE
13581 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
13583 #undef TARGET_FUNCTION_ARG_BOUNDARY
13584 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
13586 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
13587 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
13589 #undef TARGET_FUNCTION_VALUE
13590 #define TARGET_FUNCTION_VALUE aarch64_function_value
13592 #undef TARGET_FUNCTION_VALUE_REGNO_P
13593 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
13595 #undef TARGET_FRAME_POINTER_REQUIRED
13596 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
13598 #undef TARGET_GIMPLE_FOLD_BUILTIN
13599 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
13601 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
13602 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
13604 #undef TARGET_INIT_BUILTINS
13605 #define TARGET_INIT_BUILTINS aarch64_init_builtins
13607 #undef TARGET_LEGITIMATE_ADDRESS_P
13608 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
13610 #undef TARGET_LEGITIMATE_CONSTANT_P
13611 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
13613 #undef TARGET_LIBGCC_CMP_RETURN_MODE
13614 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
13616 #undef TARGET_LRA_P
13617 #define TARGET_LRA_P hook_bool_void_true
13619 #undef TARGET_MANGLE_TYPE
13620 #define TARGET_MANGLE_TYPE aarch64_mangle_type
13622 #undef TARGET_MEMORY_MOVE_COST
13623 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
13625 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
13626 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
13628 #undef TARGET_MUST_PASS_IN_STACK
13629 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
13631 /* This target hook should return true if accesses to volatile bitfields
13632 should use the narrowest mode possible. It should return false if these
13633 accesses should use the bitfield container type. */
13634 #undef TARGET_NARROW_VOLATILE_BITFIELD
13635 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
13637 #undef TARGET_OPTION_OVERRIDE
13638 #define TARGET_OPTION_OVERRIDE aarch64_override_options
13640 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
13641 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
13642 aarch64_override_options_after_change
13644 #undef TARGET_OPTION_SAVE
13645 #define TARGET_OPTION_SAVE aarch64_option_save
13647 #undef TARGET_OPTION_RESTORE
13648 #define TARGET_OPTION_RESTORE aarch64_option_restore
13650 #undef TARGET_OPTION_PRINT
13651 #define TARGET_OPTION_PRINT aarch64_option_print
13653 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
13654 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
13656 #undef TARGET_SET_CURRENT_FUNCTION
13657 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
13659 #undef TARGET_PASS_BY_REFERENCE
13660 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
13662 #undef TARGET_PREFERRED_RELOAD_CLASS
13663 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
13665 #undef TARGET_SCHED_REASSOCIATION_WIDTH
13666 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
13668 #undef TARGET_PROMOTED_TYPE
13669 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
13671 #undef TARGET_SECONDARY_RELOAD
13672 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
13674 #undef TARGET_SHIFT_TRUNCATION_MASK
13675 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
13677 #undef TARGET_SETUP_INCOMING_VARARGS
13678 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
13680 #undef TARGET_STRUCT_VALUE_RTX
13681 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
13683 #undef TARGET_REGISTER_MOVE_COST
13684 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
13686 #undef TARGET_RETURN_IN_MEMORY
13687 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
13689 #undef TARGET_RETURN_IN_MSB
13690 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
13692 #undef TARGET_RTX_COSTS
13693 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
13695 #undef TARGET_SCHED_ISSUE_RATE
13696 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
13698 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
13699 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
13700 aarch64_sched_first_cycle_multipass_dfa_lookahead
13702 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
13703 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
13704 aarch64_first_cycle_multipass_dfa_lookahead_guard
13706 #undef TARGET_TRAMPOLINE_INIT
13707 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
13709 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
13710 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
13712 #undef TARGET_VECTOR_MODE_SUPPORTED_P
13713 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
13715 #undef TARGET_ARRAY_MODE_SUPPORTED_P
13716 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
13718 #undef TARGET_VECTORIZE_ADD_STMT_COST
13719 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
13721 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
13722 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
13723 aarch64_builtin_vectorization_cost
13725 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
13726 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
13728 #undef TARGET_VECTORIZE_BUILTINS
13729 #define TARGET_VECTORIZE_BUILTINS
13731 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
13732 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
13733 aarch64_builtin_vectorized_function
13735 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
13736 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
13737 aarch64_autovectorize_vector_sizes
13739 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
13740 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
13741 aarch64_atomic_assign_expand_fenv
13743 /* Section anchor support. */
13745 #undef TARGET_MIN_ANCHOR_OFFSET
13746 #define TARGET_MIN_ANCHOR_OFFSET -256
13748 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
13749 byte offset; we can do much more for larger data types, but have no way
13750 to determine the size of the access. We assume accesses are aligned. */
13751 #undef TARGET_MAX_ANCHOR_OFFSET
13752 #define TARGET_MAX_ANCHOR_OFFSET 4095
13754 #undef TARGET_VECTOR_ALIGNMENT
13755 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
13757 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
13758 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
13759 aarch64_simd_vector_alignment_reachable
13761 /* vec_perm support. */
13763 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
13764 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
13765 aarch64_vectorize_vec_perm_const_ok
13767 #undef TARGET_INIT_LIBFUNCS
13768 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
13770 #undef TARGET_FIXED_CONDITION_CODE_REGS
13771 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
13773 #undef TARGET_FLAGS_REGNUM
13774 #define TARGET_FLAGS_REGNUM CC_REGNUM
13776 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
13777 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
13779 #undef TARGET_ASAN_SHADOW_OFFSET
13780 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
13782 #undef TARGET_LEGITIMIZE_ADDRESS
13783 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
13785 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
13786 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
13787 aarch64_use_by_pieces_infrastructure_p
13789 #undef TARGET_CAN_USE_DOLOOP_P
13790 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
13792 #undef TARGET_SCHED_MACRO_FUSION_P
13793 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
13795 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
13796 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
13798 #undef TARGET_SCHED_FUSION_PRIORITY
13799 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
13801 #undef TARGET_UNSPEC_MAY_TRAP_P
13802 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
13804 #undef TARGET_USE_PSEUDO_PIC_REG
13805 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
13807 #undef TARGET_PRINT_OPERAND
13808 #define TARGET_PRINT_OPERAND aarch64_print_operand
13810 #undef TARGET_PRINT_OPERAND_ADDRESS
13811 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
13813 struct gcc_target targetm = TARGET_INITIALIZER;
13815 #include "gt-aarch64.h"