gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2015 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "insn-codes.h"
  26 #include "rtl.h"
  27 #include "insn-attr.h"
  28 #include "alias.h"
  29 #include "symtab.h"
  30 #include "tree.h"
  31 #include "fold-const.h"
  32 #include "stringpool.h"
  33 #include "stor-layout.h"
  34 #include "calls.h"
  35 #include "varasm.h"
  36 #include "regs.h"
  37 #include "dominance.h"
  38 #include "cfg.h"
  39 #include "cfgrtl.h"
  40 #include "cfganal.h"
  41 #include "lcm.h"
  42 #include "cfgbuild.h"
  43 #include "cfgcleanup.h"
  44 #include "predict.h"
  45 #include "basic-block.h"
  46 #include "df.h"
  47 #include "hard-reg-set.h"
  48 #include "output.h"
  49 #include "function.h"
  50 #include "flags.h"
  51 #include "insn-config.h"
  52 #include "expmed.h"
  53 #include "dojump.h"
  54 #include "explow.h"
  55 #include "emit-rtl.h"
  56 #include "stmt.h"
  57 #include "expr.h"
  58 #include "reload.h"
  59 #include "toplev.h"
  60 #include "target.h"
  61 #include "targhooks.h"
  62 #include "tm_p.h"
  63 #include "recog.h"
  64 #include "langhooks.h"
  65 #include "diagnostic-core.h"
  66 #include "tree-ssa-alias.h"
  67 #include "internal-fn.h"
  68 #include "gimple-fold.h"
  69 #include "tree-eh.h"
  70 #include "gimple-expr.h"
  71 #include "gimple.h"
  72 #include "gimplify.h"
  73 #include "optabs.h"
  74 #include "dwarf2.h"
  75 #include "cfgloop.h"
  76 #include "tree-vectorizer.h"
  77 #include "aarch64-cost-tables.h"
  78 #include "dumpfile.h"
  79 #include "builtins.h"
  80 #include "rtl-iter.h"
  81 #include "tm-constrs.h"
  82 #include "sched-int.h"
  83 #include "cortex-a57-fma-steering.h"
  84
  85 /* This file should be included last.  */
  86 #include "target-def.h"
  87
  88 /* Defined for convenience.  */
  89 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  90
  91 /* Classifies an address.
  92
  93    ADDRESS_REG_IMM
  94        A simple base register plus immediate offset.
  95
  96    ADDRESS_REG_WB
  97        A base register indexed by immediate offset with writeback.
  98
  99    ADDRESS_REG_REG
 100        A base register indexed by (optionally scaled) register.
 101
 102    ADDRESS_REG_UXTW
 103        A base register indexed by (optionally scaled) zero-extended register.
 104
 105    ADDRESS_REG_SXTW
 106        A base register indexed by (optionally scaled) sign-extended register.
 107
 108    ADDRESS_LO_SUM
 109        A LO_SUM rtx with a base register and "LO12" symbol relocation.
 110
 111    ADDRESS_SYMBOLIC:
 112        A constant symbolic address, in pc-relative literal pool.  */
 113
 114 enum aarch64_address_type {
 115   ADDRESS_REG_IMM,
 116   ADDRESS_REG_WB,
 117   ADDRESS_REG_REG,
 118   ADDRESS_REG_UXTW,
 119   ADDRESS_REG_SXTW,
 120   ADDRESS_LO_SUM,
 121   ADDRESS_SYMBOLIC
 122 };
 123
 124 struct aarch64_address_info {
 125   enum aarch64_address_type type;
 126   rtx base;
 127   rtx offset;
 128   int shift;
 129   enum aarch64_symbol_type symbol_type;
 130 };
 131
 132 struct simd_immediate_info
 133 {
 134   rtx value;
 135   int shift;
 136   int element_width;
 137   bool mvn;
 138   bool msl;
 139 };
 140
 141 /* The current code model.  */
 142 enum aarch64_code_model aarch64_cmodel;
 143
 144 #ifdef HAVE_AS_TLS
 145 #undef TARGET_HAVE_TLS
 146 #define TARGET_HAVE_TLS 1
 147 #endif
 148
 149 static bool aarch64_composite_type_p (const_tree, machine_mode);
 150 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 151                                                      const_tree,
 152                                                      machine_mode *, int *,
 153                                                      bool *);
 154 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 155 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 156 static void aarch64_override_options_after_change (void);
 157 static bool aarch64_vector_mode_supported_p (machine_mode);
 158 static unsigned bit_count (unsigned HOST_WIDE_INT);
 159 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 160                                                  const unsigned char *sel);
 161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 162
 163 /* Major revision number of the ARM Architecture implemented by the target.  */
 164 unsigned aarch64_architecture_version;
 165
 166 /* The processor for which instructions should be scheduled.  */
 167 enum aarch64_processor aarch64_tune = cortexa53;
 168
 169 /* Mask to specify which instructions we are allowed to generate.  */
 170 unsigned long aarch64_isa_flags = 0;
 171
 172 /* Mask to specify which instruction scheduling options should be used.  */
 173 unsigned long aarch64_tune_flags = 0;
 174
 175 /* Support for command line parsing of boolean flags in the tuning
 176    structures.  */
 177 struct aarch64_flag_desc
 178 {
 179   const char* name;
 180   unsigned int flag;
 181 };
 182
 183 #define AARCH64_FUSION_PAIR(name, internal_name, y) \
 184   { name, AARCH64_FUSE_##internal_name },
 185 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 186 {
 187   { "none", AARCH64_FUSE_NOTHING },
 188 #include "aarch64-fusion-pairs.def"
 189   { "all", AARCH64_FUSE_ALL },
 190   { NULL, AARCH64_FUSE_NOTHING }
 191 };
 192 #undef AARCH64_FUION_PAIR
 193
 194 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name, y) \
 195   { name, AARCH64_EXTRA_TUNE_##internal_name },
 196 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 197 {
 198   { "none", AARCH64_EXTRA_TUNE_NONE },
 199 #include "aarch64-tuning-flags.def"
 200   { "all", AARCH64_EXTRA_TUNE_ALL },
 201   { NULL, AARCH64_EXTRA_TUNE_NONE }
 202 };
 203 #undef AARCH64_EXTRA_TUNING_OPTION
 204
 205 /* Tuning parameters.  */
 206
 207 static const struct cpu_addrcost_table generic_addrcost_table =
 208 {
 209     {
 210       0, /* hi  */
 211       0, /* si  */
 212       0, /* di  */
 213       0, /* ti  */
 214     },
 215   0, /* pre_modify  */
 216   0, /* post_modify  */
 217   0, /* register_offset  */
 218   0, /* register_extend  */
 219   0 /* imm_offset  */
 220 };
 221
 222 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 223 {
 224     {
 225       1, /* hi  */
 226       0, /* si  */
 227       0, /* di  */
 228       1, /* ti  */
 229     },
 230   0, /* pre_modify  */
 231   0, /* post_modify  */
 232   0, /* register_offset  */
 233   0, /* register_extend  */
 234   0, /* imm_offset  */
 235 };
 236
 237 static const struct cpu_addrcost_table xgene1_addrcost_table =
 238 {
 239     {
 240       1, /* hi  */
 241       0, /* si  */
 242       0, /* di  */
 243       1, /* ti  */
 244     },
 245   1, /* pre_modify  */
 246   0, /* post_modify  */
 247   0, /* register_offset  */
 248   1, /* register_extend  */
 249   0, /* imm_offset  */
 250 };
 251
 252 static const struct cpu_regmove_cost generic_regmove_cost =
 253 {
 254   1, /* GP2GP  */
 255   /* Avoid the use of slow int<->fp moves for spilling by setting
 256      their cost higher than memmov_cost.  */
 257   5, /* GP2FP  */
 258   5, /* FP2GP  */
 259   2 /* FP2FP  */
 260 };
 261
 262 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 263 {
 264   1, /* GP2GP  */
 265   /* Avoid the use of slow int<->fp moves for spilling by setting
 266      their cost higher than memmov_cost.  */
 267   5, /* GP2FP  */
 268   5, /* FP2GP  */
 269   2 /* FP2FP  */
 270 };
 271
 272 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 273 {
 274   1, /* GP2GP  */
 275   /* Avoid the use of slow int<->fp moves for spilling by setting
 276      their cost higher than memmov_cost.  */
 277   5, /* GP2FP  */
 278   5, /* FP2GP  */
 279   2 /* FP2FP  */
 280 };
 281
 282 static const struct cpu_regmove_cost thunderx_regmove_cost =
 283 {
 284   2, /* GP2GP  */
 285   2, /* GP2FP  */
 286   6, /* FP2GP  */
 287   4 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost xgene1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost.  */
 295   8, /* GP2FP  */
 296   8, /* FP2GP  */
 297   2 /* FP2FP  */
 298 };
 299
 300 /* Generic costs for vector insn classes.  */
 301 static const struct cpu_vector_cost generic_vector_cost =
 302 {
 303   1, /* scalar_stmt_cost  */
 304   1, /* scalar_load_cost  */
 305   1, /* scalar_store_cost  */
 306   1, /* vec_stmt_cost  */
 307   1, /* vec_to_scalar_cost  */
 308   1, /* scalar_to_vec_cost  */
 309   1, /* vec_align_load_cost  */
 310   1, /* vec_unalign_load_cost  */
 311   1, /* vec_unalign_store_cost  */
 312   1, /* vec_store_cost  */
 313   3, /* cond_taken_branch_cost  */
 314   1 /* cond_not_taken_branch_cost  */
 315 };
 316
 317 /* Generic costs for vector insn classes.  */
 318 static const struct cpu_vector_cost cortexa57_vector_cost =
 319 {
 320   1, /* scalar_stmt_cost  */
 321   4, /* scalar_load_cost  */
 322   1, /* scalar_store_cost  */
 323   3, /* vec_stmt_cost  */
 324   8, /* vec_to_scalar_cost  */
 325   8, /* scalar_to_vec_cost  */
 326   5, /* vec_align_load_cost  */
 327   5, /* vec_unalign_load_cost  */
 328   1, /* vec_unalign_store_cost  */
 329   1, /* vec_store_cost  */
 330   1, /* cond_taken_branch_cost  */
 331   1 /* cond_not_taken_branch_cost  */
 332 };
 333
 334 /* Generic costs for vector insn classes.  */
 335 static const struct cpu_vector_cost xgene1_vector_cost =
 336 {
 337   1, /* scalar_stmt_cost  */
 338   5, /* scalar_load_cost  */
 339   1, /* scalar_store_cost  */
 340   2, /* vec_stmt_cost  */
 341   4, /* vec_to_scalar_cost  */
 342   4, /* scalar_to_vec_cost  */
 343   10, /* vec_align_load_cost  */
 344   10, /* vec_unalign_load_cost  */
 345   2, /* vec_unalign_store_cost  */
 346   2, /* vec_store_cost  */
 347   2, /* cond_taken_branch_cost  */
 348   1 /* cond_not_taken_branch_cost  */
 349 };
 350
 351 /* Generic costs for branch instructions.  */
 352 static const struct cpu_branch_cost generic_branch_cost =
 353 {
 354   2,  /* Predictable.  */
 355   2   /* Unpredictable.  */
 356 };
 357
 358 static const struct tune_params generic_tunings =
 359 {
 360   &cortexa57_extra_costs,
 361   &generic_addrcost_table,
 362   &generic_regmove_cost,
 363   &generic_vector_cost,
 364   &generic_branch_cost,
 365   4, /* memmov_cost  */
 366   2, /* issue_rate  */
 367   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 368   8,    /* function_align.  */
 369   8,    /* jump_align.  */
 370   4,    /* loop_align.  */
 371   2,    /* int_reassoc_width.  */
 372   4,    /* fp_reassoc_width.  */
 373   1,    /* vec_reassoc_width.  */
 374   2,    /* min_div_recip_mul_sf.  */
 375   2,    /* min_div_recip_mul_df.  */
 376   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 377 };
 378
 379 static const struct tune_params cortexa53_tunings =
 380 {
 381   &cortexa53_extra_costs,
 382   &generic_addrcost_table,
 383   &cortexa53_regmove_cost,
 384   &generic_vector_cost,
 385   &generic_branch_cost,
 386   4, /* memmov_cost  */
 387   2, /* issue_rate  */
 388   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 389    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 390   8,    /* function_align.  */
 391   8,    /* jump_align.  */
 392   4,    /* loop_align.  */
 393   2,    /* int_reassoc_width.  */
 394   4,    /* fp_reassoc_width.  */
 395   1,    /* vec_reassoc_width.  */
 396   2,    /* min_div_recip_mul_sf.  */
 397   2,    /* min_div_recip_mul_df.  */
 398   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 399 };
 400
 401 static const struct tune_params cortexa57_tunings =
 402 {
 403   &cortexa57_extra_costs,
 404   &cortexa57_addrcost_table,
 405   &cortexa57_regmove_cost,
 406   &cortexa57_vector_cost,
 407   &generic_branch_cost,
 408   4, /* memmov_cost  */
 409   3, /* issue_rate  */
 410   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 411    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 412   16,   /* function_align.  */
 413   8,    /* jump_align.  */
 414   4,    /* loop_align.  */
 415   2,    /* int_reassoc_width.  */
 416   4,    /* fp_reassoc_width.  */
 417   1,    /* vec_reassoc_width.  */
 418   2,    /* min_div_recip_mul_sf.  */
 419   2,    /* min_div_recip_mul_df.  */
 420   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)  /* tune_flags.  */
 421 };
 422
 423 static const struct tune_params cortexa72_tunings =
 424 {
 425   &cortexa57_extra_costs,
 426   &cortexa57_addrcost_table,
 427   &cortexa57_regmove_cost,
 428   &cortexa57_vector_cost,
 429   &generic_branch_cost,
 430   4, /* memmov_cost  */
 431   3, /* issue_rate  */
 432   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 433    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 434   16,   /* function_align.  */
 435   8,    /* jump_align.  */
 436   4,    /* loop_align.  */
 437   2,    /* int_reassoc_width.  */
 438   4,    /* fp_reassoc_width.  */
 439   1,    /* vec_reassoc_width.  */
 440   2,    /* min_div_recip_mul_sf.  */
 441   2,    /* min_div_recip_mul_df.  */
 442   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 443 };
 444
 445 static const struct tune_params thunderx_tunings =
 446 {
 447   &thunderx_extra_costs,
 448   &generic_addrcost_table,
 449   &thunderx_regmove_cost,
 450   &generic_vector_cost,
 451   &generic_branch_cost,
 452   6, /* memmov_cost  */
 453   2, /* issue_rate  */
 454   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 455   8,    /* function_align.  */
 456   8,    /* jump_align.  */
 457   8,    /* loop_align.  */
 458   2,    /* int_reassoc_width.  */
 459   4,    /* fp_reassoc_width.  */
 460   1,    /* vec_reassoc_width.  */
 461   2,    /* min_div_recip_mul_sf.  */
 462   2,    /* min_div_recip_mul_df.  */
 463   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 464 };
 465
 466 static const struct tune_params xgene1_tunings =
 467 {
 468   &xgene1_extra_costs,
 469   &xgene1_addrcost_table,
 470   &xgene1_regmove_cost,
 471   &xgene1_vector_cost,
 472   &generic_branch_cost,
 473   6, /* memmov_cost  */
 474   4, /* issue_rate  */
 475   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 476   16,   /* function_align.  */
 477   8,    /* jump_align.  */
 478   16,   /* loop_align.  */
 479   2,    /* int_reassoc_width.  */
 480   4,    /* fp_reassoc_width.  */
 481   1,    /* vec_reassoc_width.  */
 482   2,    /* min_div_recip_mul_sf.  */
 483   2,    /* min_div_recip_mul_df.  */
 484   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 485 };
 486
 487 /* Support for fine-grained override of the tuning structures.  */
 488 struct aarch64_tuning_override_function
 489 {
 490   const char* name;
 491   void (*parse_override)(const char*, struct tune_params*);
 492 };
 493
 494 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 495 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 496
 497 static const struct aarch64_tuning_override_function
 498 aarch64_tuning_override_functions[] =
 499 {
 500   { "fuse", aarch64_parse_fuse_string },
 501   { "tune", aarch64_parse_tune_string },
 502   { NULL, NULL }
 503 };
 504
 505 /* A processor implementing AArch64.  */
 506 struct processor
 507 {
 508   const char *const name;
 509   enum aarch64_processor core;
 510   const char *arch;
 511   unsigned architecture_version;
 512   const unsigned long flags;
 513   const struct tune_params *const tune;
 514 };
 515
 516 /* Processor cores implementing AArch64.  */
 517 static const struct processor all_cores[] =
 518 {
 519 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
 520   {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
 521 #include "aarch64-cores.def"
 522 #undef AARCH64_CORE
 523   {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
 524   {NULL, aarch64_none, NULL, 0, 0, NULL}
 525 };
 526
 527 /* Architectures implementing AArch64.  */
 528 static const struct processor all_architectures[] =
 529 {
 530 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
 531   {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
 532 #include "aarch64-arches.def"
 533 #undef AARCH64_ARCH
 534   {NULL, aarch64_none, NULL, 0, 0, NULL}
 535 };
 536
 537 /* Target specification.  These are populated as commandline arguments
 538    are processed, or NULL if not specified.  */
 539 static const struct processor *selected_arch;
 540 static const struct processor *selected_cpu;
 541 static const struct processor *selected_tune;
 542
 543 /* The current tuning set.  */
 544 struct tune_params aarch64_tune_params = generic_tunings;
 545
 546 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 547
 548 /* An ISA extension in the co-processor and main instruction set space.  */
 549 struct aarch64_option_extension
 550 {
 551   const char *const name;
 552   const unsigned long flags_on;
 553   const unsigned long flags_off;
 554 };
 555
 556 /* ISA extensions in AArch64.  */
 557 static const struct aarch64_option_extension all_extensions[] =
 558 {
 559 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
 560   {NAME, FLAGS_ON, FLAGS_OFF},
 561 #include "aarch64-option-extensions.def"
 562 #undef AARCH64_OPT_EXTENSION
 563   {NULL, 0, 0}
 564 };
 565
 566 /* Used to track the size of an address when generating a pre/post
 567    increment address.  */
 568 static machine_mode aarch64_memory_reference_mode;
 569
 570 /* A table of valid AArch64 "bitmask immediate" values for
 571    logical instructions.  */
 572
 573 #define AARCH64_NUM_BITMASKS  5334
 574 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
 575
 576 typedef enum aarch64_cond_code
 577 {
 578   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 579   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 580   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 581 }
 582 aarch64_cc;
 583
 584 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 585
 586 /* The condition codes of the processor, and the inverse function.  */
 587 static const char * const aarch64_condition_codes[] =
 588 {
 589   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 590   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 591 };
 592
 593 void
 594 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 595 {
 596   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 597   if (TARGET_GENERAL_REGS_ONLY)
 598     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 599   else
 600     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 601 }
 602
 603 static unsigned int
 604 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 605 {
 606   if (GET_MODE_UNIT_SIZE (mode) == 4)
 607     return aarch64_tune_params.min_div_recip_mul_sf;
 608   return aarch64_tune_params.min_div_recip_mul_df;
 609 }
 610
 611 static int
 612 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 613                              enum machine_mode mode)
 614 {
 615   if (VECTOR_MODE_P (mode))
 616     return aarch64_tune_params.vec_reassoc_width;
 617   if (INTEGRAL_MODE_P (mode))
 618     return aarch64_tune_params.int_reassoc_width;
 619   if (FLOAT_MODE_P (mode))
 620     return aarch64_tune_params.fp_reassoc_width;
 621   return 1;
 622 }
 623
 624 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 625 unsigned
 626 aarch64_dbx_register_number (unsigned regno)
 627 {
 628    if (GP_REGNUM_P (regno))
 629      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 630    else if (regno == SP_REGNUM)
 631      return AARCH64_DWARF_SP;
 632    else if (FP_REGNUM_P (regno))
 633      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 634
 635    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 636       equivalent DWARF register.  */
 637    return DWARF_FRAME_REGISTERS;
 638 }
 639
 640 /* Return TRUE if MODE is any of the large INT modes.  */
 641 static bool
 642 aarch64_vect_struct_mode_p (machine_mode mode)
 643 {
 644   return mode == OImode || mode == CImode || mode == XImode;
 645 }
 646
 647 /* Return TRUE if MODE is any of the vector modes.  */
 648 static bool
 649 aarch64_vector_mode_p (machine_mode mode)
 650 {
 651   return aarch64_vector_mode_supported_p (mode)
 652          || aarch64_vect_struct_mode_p (mode);
 653 }
 654
 655 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 656 static bool
 657 aarch64_array_mode_supported_p (machine_mode mode,
 658                                 unsigned HOST_WIDE_INT nelems)
 659 {
 660   if (TARGET_SIMD
 661       && AARCH64_VALID_SIMD_QREG_MODE (mode)
 662       && (nelems >= 2 && nelems <= 4))
 663     return true;
 664
 665   return false;
 666 }
 667
 668 /* Implement HARD_REGNO_NREGS.  */
 669
 670 int
 671 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
 672 {
 673   switch (aarch64_regno_regclass (regno))
 674     {
 675     case FP_REGS:
 676     case FP_LO_REGS:
 677       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
 678     default:
 679       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 680     }
 681   gcc_unreachable ();
 682 }
 683
 684 /* Implement HARD_REGNO_MODE_OK.  */
 685
 686 int
 687 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
 688 {
 689   if (GET_MODE_CLASS (mode) == MODE_CC)
 690     return regno == CC_REGNUM;
 691
 692   if (regno == SP_REGNUM)
 693     /* The purpose of comparing with ptr_mode is to support the
 694        global register variable associated with the stack pointer
 695        register via the syntax of asm ("wsp") in ILP32.  */
 696     return mode == Pmode || mode == ptr_mode;
 697
 698   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 699     return mode == Pmode;
 700
 701   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
 702     return 1;
 703
 704   if (FP_REGNUM_P (regno))
 705     {
 706       if (aarch64_vect_struct_mode_p (mode))
 707         return
 708           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
 709       else
 710         return 1;
 711     }
 712
 713   return 0;
 714 }
 715
 716 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
 717 machine_mode
 718 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
 719                                      machine_mode mode)
 720 {
 721   /* Handle modes that fit within single registers.  */
 722   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
 723     {
 724       if (GET_MODE_SIZE (mode) >= 4)
 725         return mode;
 726       else
 727         return SImode;
 728     }
 729   /* Fall back to generic for multi-reg and very large modes.  */
 730   else
 731     return choose_hard_reg_mode (regno, nregs, false);
 732 }
 733
 734 /* Return true if calls to DECL should be treated as
 735    long-calls (ie called via a register).  */
 736 static bool
 737 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
 738 {
 739   return false;
 740 }
 741
 742 /* Return true if calls to symbol-ref SYM should be treated as
 743    long-calls (ie called via a register).  */
 744 bool
 745 aarch64_is_long_call_p (rtx sym)
 746 {
 747   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
 748 }
 749
 750 /* Return true if the offsets to a zero/sign-extract operation
 751    represent an expression that matches an extend operation.  The
 752    operands represent the paramters from
 753
 754    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 755 bool
 756 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
 757                                 rtx extract_imm)
 758 {
 759   HOST_WIDE_INT mult_val, extract_val;
 760
 761   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
 762     return false;
 763
 764   mult_val = INTVAL (mult_imm);
 765   extract_val = INTVAL (extract_imm);
 766
 767   if (extract_val > 8
 768       && extract_val < GET_MODE_BITSIZE (mode)
 769       && exact_log2 (extract_val & ~7) > 0
 770       && (extract_val & 7) <= 4
 771       && mult_val == (1 << (extract_val & 7)))
 772     return true;
 773
 774   return false;
 775 }
 776
 777 /* Emit an insn that's a simple single-set.  Both the operands must be
 778    known to be valid.  */
 779 inline static rtx
 780 emit_set_insn (rtx x, rtx y)
 781 {
 782   return emit_insn (gen_rtx_SET (x, y));
 783 }
 784
 785 /* X and Y are two things to compare using CODE.  Emit the compare insn and
 786    return the rtx for register 0 in the proper mode.  */
 787 rtx
 788 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 789 {
 790   machine_mode mode = SELECT_CC_MODE (code, x, y);
 791   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
 792
 793   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
 794   return cc_reg;
 795 }
 796
 797 /* Build the SYMBOL_REF for __tls_get_addr.  */
 798
 799 static GTY(()) rtx tls_get_addr_libfunc;
 800
 801 rtx
 802 aarch64_tls_get_addr (void)
 803 {
 804   if (!tls_get_addr_libfunc)
 805     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
 806   return tls_get_addr_libfunc;
 807 }
 808
 809 /* Return the TLS model to use for ADDR.  */
 810
 811 static enum tls_model
 812 tls_symbolic_operand_type (rtx addr)
 813 {
 814   enum tls_model tls_kind = TLS_MODEL_NONE;
 815   rtx sym, addend;
 816
 817   if (GET_CODE (addr) == CONST)
 818     {
 819       split_const (addr, &sym, &addend);
 820       if (GET_CODE (sym) == SYMBOL_REF)
 821         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
 822     }
 823   else if (GET_CODE (addr) == SYMBOL_REF)
 824     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
 825
 826   return tls_kind;
 827 }
 828
 829 /* We'll allow lo_sum's in addresses in our legitimate addresses
 830    so that combine would take care of combining addresses where
 831    necessary, but for generation purposes, we'll generate the address
 832    as :
 833    RTL                               Absolute
 834    tmp = hi (symbol_ref);            adrp  x1, foo
 835    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
 836                                      nop
 837
 838    PIC                               TLS
 839    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
 840    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
 841                                      bl   __tls_get_addr
 842                                      nop
 843
 844    Load TLS symbol, depending on TLS mechanism and TLS access model.
 845
 846    Global Dynamic - Traditional TLS:
 847    adrp tmp, :tlsgd:imm
 848    add  dest, tmp, #:tlsgd_lo12:imm
 849    bl   __tls_get_addr
 850
 851    Global Dynamic - TLS Descriptors:
 852    adrp dest, :tlsdesc:imm
 853    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
 854    add  dest, dest, #:tlsdesc_lo12:imm
 855    blr  tmp
 856    mrs  tp, tpidr_el0
 857    add  dest, dest, tp
 858
 859    Initial Exec:
 860    mrs  tp, tpidr_el0
 861    adrp tmp, :gottprel:imm
 862    ldr  dest, [tmp, #:gottprel_lo12:imm]
 863    add  dest, dest, tp
 864
 865    Local Exec:
 866    mrs  tp, tpidr_el0
 867    add  t0, tp, #:tprel_hi12:imm, lsl #12
 868    add  t0, t0, #:tprel_lo12_nc:imm
 869 */
 870
 871 static void
 872 aarch64_load_symref_appropriately (rtx dest, rtx imm,
 873                                    enum aarch64_symbol_type type)
 874 {
 875   switch (type)
 876     {
 877     case SYMBOL_SMALL_ABSOLUTE:
 878       {
 879         /* In ILP32, the mode of dest can be either SImode or DImode.  */
 880         rtx tmp_reg = dest;
 881         machine_mode mode = GET_MODE (dest);
 882
 883         gcc_assert (mode == Pmode || mode == ptr_mode);
 884
 885         if (can_create_pseudo_p ())
 886           tmp_reg = gen_reg_rtx (mode);
 887
 888         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 889         emit_insn (gen_add_losym (dest, tmp_reg, imm));
 890         return;
 891       }
 892
 893     case SYMBOL_TINY_ABSOLUTE:
 894       emit_insn (gen_rtx_SET (dest, imm));
 895       return;
 896
 897     case SYMBOL_SMALL_GOT_28K:
 898       {
 899         machine_mode mode = GET_MODE (dest);
 900         rtx gp_rtx = pic_offset_table_rtx;
 901
 902         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
 903            here before rtl expand.  Tree IVOPT will generate rtl pattern to
 904            decide rtx costs, in which case pic_offset_table_rtx is not
 905            initialized.  For that case no need to generate the first adrp
 906            instruction as the the final cost for global variable access is
 907            one instruction.  */
 908         if (gp_rtx != NULL)
 909           {
 910             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
 911                using the page base as GOT base, the first page may be wasted,
 912                in the worst scenario, there is only 28K space for GOT).
 913
 914                The generate instruction sequence for accessing global variable
 915                is:
 916
 917                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
 918
 919                Only one instruction needed. But we must initialize
 920                pic_offset_table_rtx properly.  We generate initialize insn for
 921                every global access, and allow CSE to remove all redundant.
 922
 923                The final instruction sequences will look like the following
 924                for multiply global variables access.
 925
 926                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
 927
 928                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
 929                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
 930                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
 931                  ...  */
 932
 933             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
 934             crtl->uses_pic_offset_table = 1;
 935             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
 936
 937             if (mode != GET_MODE (gp_rtx))
 938               gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
 939           }
 940
 941         if (mode == ptr_mode)
 942           {
 943             if (mode == DImode)
 944               emit_insn (gen_ldr_got_small_28k_di (dest, gp_rtx, imm));
 945             else
 946               emit_insn (gen_ldr_got_small_28k_si (dest, gp_rtx, imm));
 947           }
 948         else
 949           {
 950             gcc_assert (mode == Pmode);
 951             emit_insn (gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm));
 952           }
 953
 954         return;
 955       }
 956
 957     case SYMBOL_SMALL_GOT_4G:
 958       {
 959         /* In ILP32, the mode of dest can be either SImode or DImode,
 960            while the got entry is always of SImode size.  The mode of
 961            dest depends on how dest is used: if dest is assigned to a
 962            pointer (e.g. in the memory), it has SImode; it may have
 963            DImode if dest is dereferenced to access the memeory.
 964            This is why we have to handle three different ldr_got_small
 965            patterns here (two patterns for ILP32).  */
 966         rtx tmp_reg = dest;
 967         machine_mode mode = GET_MODE (dest);
 968
 969         if (can_create_pseudo_p ())
 970           tmp_reg = gen_reg_rtx (mode);
 971
 972         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 973         if (mode == ptr_mode)
 974           {
 975             if (mode == DImode)
 976               emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
 977             else
 978               emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
 979           }
 980         else
 981           {
 982             gcc_assert (mode == Pmode);
 983             emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
 984           }
 985
 986         return;
 987       }
 988
 989     case SYMBOL_SMALL_TLSGD:
 990       {
 991         rtx_insn *insns;
 992         rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
 993
 994         start_sequence ();
 995         aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
 996         insns = get_insns ();
 997         end_sequence ();
 998
 999         RTL_CONST_CALL_P (insns) = 1;
1000         emit_libcall_block (insns, dest, result, imm);
1001         return;
1002       }
1003
1004     case SYMBOL_SMALL_TLSDESC:
1005       {
1006         machine_mode mode = GET_MODE (dest);
1007         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1008         rtx tp;
1009
1010         gcc_assert (mode == Pmode || mode == ptr_mode);
1011
1012         /* In ILP32, the got entry is always of SImode size.  Unlike
1013            small GOT, the dest is fixed at reg 0.  */
1014         if (TARGET_ILP32)
1015           emit_insn (gen_tlsdesc_small_si (imm));
1016         else
1017           emit_insn (gen_tlsdesc_small_di (imm));
1018         tp = aarch64_load_tp (NULL);
1019
1020         if (mode != Pmode)
1021           tp = gen_lowpart (mode, tp);
1022
1023         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1024         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1025         return;
1026       }
1027
1028     case SYMBOL_SMALL_GOTTPREL:
1029       {
1030         /* In ILP32, the mode of dest can be either SImode or DImode,
1031            while the got entry is always of SImode size.  The mode of
1032            dest depends on how dest is used: if dest is assigned to a
1033            pointer (e.g. in the memory), it has SImode; it may have
1034            DImode if dest is dereferenced to access the memeory.
1035            This is why we have to handle three different tlsie_small
1036            patterns here (two patterns for ILP32).  */
1037         machine_mode mode = GET_MODE (dest);
1038         rtx tmp_reg = gen_reg_rtx (mode);
1039         rtx tp = aarch64_load_tp (NULL);
1040
1041         if (mode == ptr_mode)
1042           {
1043             if (mode == DImode)
1044               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1045             else
1046               {
1047                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1048                 tp = gen_lowpart (mode, tp);
1049               }
1050           }
1051         else
1052           {
1053             gcc_assert (mode == Pmode);
1054             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1055           }
1056
1057         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1058         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1059         return;
1060       }
1061
1062     case SYMBOL_TLSLE:
1063       {
1064         rtx tp = aarch64_load_tp (NULL);
1065
1066         if (GET_MODE (dest) != Pmode)
1067           tp = gen_lowpart (GET_MODE (dest), tp);
1068
1069         emit_insn (gen_tlsle (dest, tp, imm));
1070         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1071         return;
1072       }
1073
1074     case SYMBOL_TINY_GOT:
1075       emit_insn (gen_ldr_got_tiny (dest, imm));
1076       return;
1077
1078     default:
1079       gcc_unreachable ();
1080     }
1081 }
1082
1083 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1084    handle all moves if !can_create_pseudo_p ().  The distinction is
1085    important because, unlike emit_move_insn, the move expanders know
1086    how to force Pmode objects into the constant pool even when the
1087    constant pool address is not itself legitimate.  */
1088 static rtx
1089 aarch64_emit_move (rtx dest, rtx src)
1090 {
1091   return (can_create_pseudo_p ()
1092           ? emit_move_insn (dest, src)
1093           : emit_move_insn_1 (dest, src));
1094 }
1095
1096 /* Split a 128-bit move operation into two 64-bit move operations,
1097    taking care to handle partial overlap of register to register
1098    copies.  Special cases are needed when moving between GP regs and
1099    FP regs.  SRC can be a register, constant or memory; DST a register
1100    or memory.  If either operand is memory it must not have any side
1101    effects.  */
1102 void
1103 aarch64_split_128bit_move (rtx dst, rtx src)
1104 {
1105   rtx dst_lo, dst_hi;
1106   rtx src_lo, src_hi;
1107
1108   machine_mode mode = GET_MODE (dst);
1109
1110   gcc_assert (mode == TImode || mode == TFmode);
1111   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1112   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1113
1114   if (REG_P (dst) && REG_P (src))
1115     {
1116       int src_regno = REGNO (src);
1117       int dst_regno = REGNO (dst);
1118
1119       /* Handle FP <-> GP regs.  */
1120       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1121         {
1122           src_lo = gen_lowpart (word_mode, src);
1123           src_hi = gen_highpart (word_mode, src);
1124
1125           if (mode == TImode)
1126             {
1127               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1128               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1129             }
1130           else
1131             {
1132               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1133               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1134             }
1135           return;
1136         }
1137       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1138         {
1139           dst_lo = gen_lowpart (word_mode, dst);
1140           dst_hi = gen_highpart (word_mode, dst);
1141
1142           if (mode == TImode)
1143             {
1144               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1145               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1146             }
1147           else
1148             {
1149               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1150               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1151             }
1152           return;
1153         }
1154     }
1155
1156   dst_lo = gen_lowpart (word_mode, dst);
1157   dst_hi = gen_highpart (word_mode, dst);
1158   src_lo = gen_lowpart (word_mode, src);
1159   src_hi = gen_highpart_mode (word_mode, mode, src);
1160
1161   /* At most one pairing may overlap.  */
1162   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1163     {
1164       aarch64_emit_move (dst_hi, src_hi);
1165       aarch64_emit_move (dst_lo, src_lo);
1166     }
1167   else
1168     {
1169       aarch64_emit_move (dst_lo, src_lo);
1170       aarch64_emit_move (dst_hi, src_hi);
1171     }
1172 }
1173
1174 bool
1175 aarch64_split_128bit_move_p (rtx dst, rtx src)
1176 {
1177   return (! REG_P (src)
1178           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1179 }
1180
1181 /* Split a complex SIMD combine.  */
1182
1183 void
1184 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1185 {
1186   machine_mode src_mode = GET_MODE (src1);
1187   machine_mode dst_mode = GET_MODE (dst);
1188
1189   gcc_assert (VECTOR_MODE_P (dst_mode));
1190
1191   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1192     {
1193       rtx (*gen) (rtx, rtx, rtx);
1194
1195       switch (src_mode)
1196         {
1197         case V8QImode:
1198           gen = gen_aarch64_simd_combinev8qi;
1199           break;
1200         case V4HImode:
1201           gen = gen_aarch64_simd_combinev4hi;
1202           break;
1203         case V2SImode:
1204           gen = gen_aarch64_simd_combinev2si;
1205           break;
1206         case V2SFmode:
1207           gen = gen_aarch64_simd_combinev2sf;
1208           break;
1209         case DImode:
1210           gen = gen_aarch64_simd_combinedi;
1211           break;
1212         case DFmode:
1213           gen = gen_aarch64_simd_combinedf;
1214           break;
1215         default:
1216           gcc_unreachable ();
1217         }
1218
1219       emit_insn (gen (dst, src1, src2));
1220       return;
1221     }
1222 }
1223
1224 /* Split a complex SIMD move.  */
1225
1226 void
1227 aarch64_split_simd_move (rtx dst, rtx src)
1228 {
1229   machine_mode src_mode = GET_MODE (src);
1230   machine_mode dst_mode = GET_MODE (dst);
1231
1232   gcc_assert (VECTOR_MODE_P (dst_mode));
1233
1234   if (REG_P (dst) && REG_P (src))
1235     {
1236       rtx (*gen) (rtx, rtx);
1237
1238       gcc_assert (VECTOR_MODE_P (src_mode));
1239
1240       switch (src_mode)
1241         {
1242         case V16QImode:
1243           gen = gen_aarch64_split_simd_movv16qi;
1244           break;
1245         case V8HImode:
1246           gen = gen_aarch64_split_simd_movv8hi;
1247           break;
1248         case V4SImode:
1249           gen = gen_aarch64_split_simd_movv4si;
1250           break;
1251         case V2DImode:
1252           gen = gen_aarch64_split_simd_movv2di;
1253           break;
1254         case V4SFmode:
1255           gen = gen_aarch64_split_simd_movv4sf;
1256           break;
1257         case V2DFmode:
1258           gen = gen_aarch64_split_simd_movv2df;
1259           break;
1260         default:
1261           gcc_unreachable ();
1262         }
1263
1264       emit_insn (gen (dst, src));
1265       return;
1266     }
1267 }
1268
1269 static rtx
1270 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1271 {
1272   if (can_create_pseudo_p ())
1273     return force_reg (mode, value);
1274   else
1275     {
1276       x = aarch64_emit_move (x, value);
1277       return x;
1278     }
1279 }
1280
1281
1282 static rtx
1283 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1284 {
1285   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1286     {
1287       rtx high;
1288       /* Load the full offset into a register.  This
1289          might be improvable in the future.  */
1290       high = GEN_INT (offset);
1291       offset = 0;
1292       high = aarch64_force_temporary (mode, temp, high);
1293       reg = aarch64_force_temporary (mode, temp,
1294                                      gen_rtx_PLUS (mode, high, reg));
1295     }
1296   return plus_constant (mode, reg, offset);
1297 }
1298
1299 static int
1300 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1301                                 machine_mode mode)
1302 {
1303   unsigned HOST_WIDE_INT mask;
1304   int i;
1305   bool first;
1306   unsigned HOST_WIDE_INT val;
1307   bool subtargets;
1308   rtx subtarget;
1309   int one_match, zero_match, first_not_ffff_match;
1310   int num_insns = 0;
1311
1312   if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1313     {
1314       if (generate)
1315         emit_insn (gen_rtx_SET (dest, imm));
1316       num_insns++;
1317       return num_insns;
1318     }
1319
1320   if (mode == SImode)
1321     {
1322       /* We know we can't do this in 1 insn, and we must be able to do it
1323          in two; so don't mess around looking for sequences that don't buy
1324          us anything.  */
1325       if (generate)
1326         {
1327           emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1328           emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1329                                      GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1330         }
1331       num_insns += 2;
1332       return num_insns;
1333     }
1334
1335   /* Remaining cases are all for DImode.  */
1336
1337   val = INTVAL (imm);
1338   subtargets = optimize && can_create_pseudo_p ();
1339
1340   one_match = 0;
1341   zero_match = 0;
1342   mask = 0xffff;
1343   first_not_ffff_match = -1;
1344
1345   for (i = 0; i < 64; i += 16, mask <<= 16)
1346     {
1347       if ((val & mask) == mask)
1348         one_match++;
1349       else
1350         {
1351           if (first_not_ffff_match < 0)
1352             first_not_ffff_match = i;
1353           if ((val & mask) == 0)
1354             zero_match++;
1355         }
1356     }
1357
1358   if (one_match == 2)
1359     {
1360       /* Set one of the quarters and then insert back into result.  */
1361       mask = 0xffffll << first_not_ffff_match;
1362       if (generate)
1363         {
1364           emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1365           emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1366                                      GEN_INT ((val >> first_not_ffff_match)
1367                                               & 0xffff)));
1368         }
1369       num_insns += 2;
1370       return num_insns;
1371     }
1372
1373   if (zero_match == 2)
1374     goto simple_sequence;
1375
1376   mask = 0x0ffff0000UL;
1377   for (i = 16; i < 64; i += 16, mask <<= 16)
1378     {
1379       HOST_WIDE_INT comp = mask & ~(mask - 1);
1380
1381       if (aarch64_uimm12_shift (val - (val & mask)))
1382         {
1383           if (generate)
1384             {
1385               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1386               emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1387               emit_insn (gen_adddi3 (dest, subtarget,
1388                                      GEN_INT (val - (val & mask))));
1389             }
1390           num_insns += 2;
1391           return num_insns;
1392         }
1393       else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1394         {
1395           if (generate)
1396             {
1397               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1398               emit_insn (gen_rtx_SET (subtarget,
1399                                       GEN_INT ((val + comp) & mask)));
1400               emit_insn (gen_adddi3 (dest, subtarget,
1401                                      GEN_INT (val - ((val + comp) & mask))));
1402             }
1403           num_insns += 2;
1404           return num_insns;
1405         }
1406       else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1407         {
1408           if (generate)
1409             {
1410               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1411               emit_insn (gen_rtx_SET (subtarget,
1412                                       GEN_INT ((val - comp) | ~mask)));
1413               emit_insn (gen_adddi3 (dest, subtarget,
1414                                      GEN_INT (val - ((val - comp) | ~mask))));
1415             }
1416           num_insns += 2;
1417           return num_insns;
1418         }
1419       else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1420         {
1421           if (generate)
1422             {
1423               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1424               emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1425               emit_insn (gen_adddi3 (dest, subtarget,
1426                                      GEN_INT (val - (val | ~mask))));
1427             }
1428           num_insns += 2;
1429           return num_insns;
1430         }
1431     }
1432
1433   /* See if we can do it by arithmetically combining two
1434      immediates.  */
1435   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1436     {
1437       int j;
1438       mask = 0xffff;
1439
1440       if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1441           || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1442         {
1443           if (generate)
1444             {
1445               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1446               emit_insn (gen_rtx_SET (subtarget,
1447                                       GEN_INT (aarch64_bitmasks[i])));
1448               emit_insn (gen_adddi3 (dest, subtarget,
1449                                      GEN_INT (val - aarch64_bitmasks[i])));
1450             }
1451           num_insns += 2;
1452           return num_insns;
1453         }
1454
1455       for (j = 0; j < 64; j += 16, mask <<= 16)
1456         {
1457           if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1458             {
1459               if (generate)
1460                 {
1461                   emit_insn (gen_rtx_SET (dest,
1462                                           GEN_INT (aarch64_bitmasks[i])));
1463                   emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1464                                              GEN_INT ((val >> j) & 0xffff)));
1465                 }
1466               num_insns += 2;
1467               return num_insns;
1468             }
1469         }
1470     }
1471
1472   /* See if we can do it by logically combining two immediates.  */
1473   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1474     {
1475       if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1476         {
1477           int j;
1478
1479           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1480             if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1481               {
1482                 if (generate)
1483                   {
1484                     subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1485                     emit_insn (gen_rtx_SET (subtarget,
1486                                             GEN_INT (aarch64_bitmasks[i])));
1487                     emit_insn (gen_iordi3 (dest, subtarget,
1488                                            GEN_INT (aarch64_bitmasks[j])));
1489                   }
1490                 num_insns += 2;
1491                 return num_insns;
1492               }
1493         }
1494       else if ((val & aarch64_bitmasks[i]) == val)
1495         {
1496           int j;
1497
1498           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1499             if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1500               {
1501                 if (generate)
1502                   {
1503                     subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1504                     emit_insn (gen_rtx_SET (subtarget,
1505                                             GEN_INT (aarch64_bitmasks[j])));
1506                     emit_insn (gen_anddi3 (dest, subtarget,
1507                                            GEN_INT (aarch64_bitmasks[i])));
1508                   }
1509                 num_insns += 2;
1510                 return num_insns;
1511               }
1512         }
1513     }
1514
1515   if (one_match > zero_match)
1516     {
1517       /* Set either first three quarters or all but the third.   */
1518       mask = 0xffffll << (16 - first_not_ffff_match);
1519       if (generate)
1520         emit_insn (gen_rtx_SET (dest,
1521                                 GEN_INT (val | mask | 0xffffffff00000000ull)));
1522       num_insns ++;
1523
1524       /* Now insert other two quarters.  */
1525       for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1526            i < 64; i += 16, mask <<= 16)
1527         {
1528           if ((val & mask) != mask)
1529             {
1530               if (generate)
1531                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1532                                            GEN_INT ((val >> i) & 0xffff)));
1533               num_insns ++;
1534             }
1535         }
1536       return num_insns;
1537     }
1538
1539  simple_sequence:
1540   first = true;
1541   mask = 0xffff;
1542   for (i = 0; i < 64; i += 16, mask <<= 16)
1543     {
1544       if ((val & mask) != 0)
1545         {
1546           if (first)
1547             {
1548               if (generate)
1549                 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1550               num_insns ++;
1551               first = false;
1552             }
1553           else
1554             {
1555               if (generate)
1556                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1557                                            GEN_INT ((val >> i) & 0xffff)));
1558               num_insns ++;
1559             }
1560         }
1561     }
1562
1563   return num_insns;
1564 }
1565
1566
1567 void
1568 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1569 {
1570   machine_mode mode = GET_MODE (dest);
1571
1572   gcc_assert (mode == SImode || mode == DImode);
1573
1574   /* Check on what type of symbol it is.  */
1575   if (GET_CODE (imm) == SYMBOL_REF
1576       || GET_CODE (imm) == LABEL_REF
1577       || GET_CODE (imm) == CONST)
1578     {
1579       rtx mem, base, offset;
1580       enum aarch64_symbol_type sty;
1581
1582       /* If we have (const (plus symbol offset)), separate out the offset
1583          before we start classifying the symbol.  */
1584       split_const (imm, &base, &offset);
1585
1586       sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1587       switch (sty)
1588         {
1589         case SYMBOL_FORCE_TO_MEM:
1590           if (offset != const0_rtx
1591               && targetm.cannot_force_const_mem (mode, imm))
1592             {
1593               gcc_assert (can_create_pseudo_p ());
1594               base = aarch64_force_temporary (mode, dest, base);
1595               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1596               aarch64_emit_move (dest, base);
1597               return;
1598             }
1599           mem = force_const_mem (ptr_mode, imm);
1600           gcc_assert (mem);
1601           if (mode != ptr_mode)
1602             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1603           emit_insn (gen_rtx_SET (dest, mem));
1604           return;
1605
1606         case SYMBOL_SMALL_TLSGD:
1607         case SYMBOL_SMALL_TLSDESC:
1608         case SYMBOL_SMALL_GOTTPREL:
1609         case SYMBOL_SMALL_GOT_28K:
1610         case SYMBOL_SMALL_GOT_4G:
1611         case SYMBOL_TINY_GOT:
1612           if (offset != const0_rtx)
1613             {
1614               gcc_assert(can_create_pseudo_p ());
1615               base = aarch64_force_temporary (mode, dest, base);
1616               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1617               aarch64_emit_move (dest, base);
1618               return;
1619             }
1620           /* FALLTHRU */
1621
1622         case SYMBOL_SMALL_ABSOLUTE:
1623         case SYMBOL_TINY_ABSOLUTE:
1624         case SYMBOL_TLSLE:
1625           aarch64_load_symref_appropriately (dest, imm, sty);
1626           return;
1627
1628         default:
1629           gcc_unreachable ();
1630         }
1631     }
1632
1633   if (!CONST_INT_P (imm))
1634     {
1635       if (GET_CODE (imm) == HIGH)
1636         emit_insn (gen_rtx_SET (dest, imm));
1637       else
1638         {
1639           rtx mem = force_const_mem (mode, imm);
1640           gcc_assert (mem);
1641           emit_insn (gen_rtx_SET (dest, mem));
1642         }
1643
1644       return;
1645     }
1646
1647   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1648 }
1649
1650 static bool
1651 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1652                                  tree exp ATTRIBUTE_UNUSED)
1653 {
1654   /* Currently, always true.  */
1655   return true;
1656 }
1657
1658 /* Implement TARGET_PASS_BY_REFERENCE.  */
1659
1660 static bool
1661 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1662                            machine_mode mode,
1663                            const_tree type,
1664                            bool named ATTRIBUTE_UNUSED)
1665 {
1666   HOST_WIDE_INT size;
1667   machine_mode dummymode;
1668   int nregs;
1669
1670   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
1671   size = (mode == BLKmode && type)
1672     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1673
1674   /* Aggregates are passed by reference based on their size.  */
1675   if (type && AGGREGATE_TYPE_P (type))
1676     {
1677       size = int_size_in_bytes (type);
1678     }
1679
1680   /* Variable sized arguments are always returned by reference.  */
1681   if (size < 0)
1682     return true;
1683
1684   /* Can this be a candidate to be passed in fp/simd register(s)?  */
1685   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1686                                                &dummymode, &nregs,
1687                                                NULL))
1688     return false;
1689
1690   /* Arguments which are variable sized or larger than 2 registers are
1691      passed by reference unless they are a homogenous floating point
1692      aggregate.  */
1693   return size > 2 * UNITS_PER_WORD;
1694 }
1695
1696 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
1697 static bool
1698 aarch64_return_in_msb (const_tree valtype)
1699 {
1700   machine_mode dummy_mode;
1701   int dummy_int;
1702
1703   /* Never happens in little-endian mode.  */
1704   if (!BYTES_BIG_ENDIAN)
1705     return false;
1706
1707   /* Only composite types smaller than or equal to 16 bytes can
1708      be potentially returned in registers.  */
1709   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1710       || int_size_in_bytes (valtype) <= 0
1711       || int_size_in_bytes (valtype) > 16)
1712     return false;
1713
1714   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1715      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1716      is always passed/returned in the least significant bits of fp/simd
1717      register(s).  */
1718   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1719                                                &dummy_mode, &dummy_int, NULL))
1720     return false;
1721
1722   return true;
1723 }
1724
1725 /* Implement TARGET_FUNCTION_VALUE.
1726    Define how to find the value returned by a function.  */
1727
1728 static rtx
1729 aarch64_function_value (const_tree type, const_tree func,
1730                         bool outgoing ATTRIBUTE_UNUSED)
1731 {
1732   machine_mode mode;
1733   int unsignedp;
1734   int count;
1735   machine_mode ag_mode;
1736
1737   mode = TYPE_MODE (type);
1738   if (INTEGRAL_TYPE_P (type))
1739     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1740
1741   if (aarch64_return_in_msb (type))
1742     {
1743       HOST_WIDE_INT size = int_size_in_bytes (type);
1744
1745       if (size % UNITS_PER_WORD != 0)
1746         {
1747           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1748           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1749         }
1750     }
1751
1752   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1753                                                &ag_mode, &count, NULL))
1754     {
1755       if (!aarch64_composite_type_p (type, mode))
1756         {
1757           gcc_assert (count == 1 && mode == ag_mode);
1758           return gen_rtx_REG (mode, V0_REGNUM);
1759         }
1760       else
1761         {
1762           int i;
1763           rtx par;
1764
1765           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1766           for (i = 0; i < count; i++)
1767             {
1768               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1769               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1770                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1771               XVECEXP (par, 0, i) = tmp;
1772             }
1773           return par;
1774         }
1775     }
1776   else
1777     return gen_rtx_REG (mode, R0_REGNUM);
1778 }
1779
1780 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1781    Return true if REGNO is the number of a hard register in which the values
1782    of called function may come back.  */
1783
1784 static bool
1785 aarch64_function_value_regno_p (const unsigned int regno)
1786 {
1787   /* Maximum of 16 bytes can be returned in the general registers.  Examples
1788      of 16-byte return values are: 128-bit integers and 16-byte small
1789      structures (excluding homogeneous floating-point aggregates).  */
1790   if (regno == R0_REGNUM || regno == R1_REGNUM)
1791     return true;
1792
1793   /* Up to four fp/simd registers can return a function value, e.g. a
1794      homogeneous floating-point aggregate having four members.  */
1795   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1796     return TARGET_FLOAT;
1797
1798   return false;
1799 }
1800
1801 /* Implement TARGET_RETURN_IN_MEMORY.
1802
1803    If the type T of the result of a function is such that
1804      void func (T arg)
1805    would require that arg be passed as a value in a register (or set of
1806    registers) according to the parameter passing rules, then the result
1807    is returned in the same registers as would be used for such an
1808    argument.  */
1809
1810 static bool
1811 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1812 {
1813   HOST_WIDE_INT size;
1814   machine_mode ag_mode;
1815   int count;
1816
1817   if (!AGGREGATE_TYPE_P (type)
1818       && TREE_CODE (type) != COMPLEX_TYPE
1819       && TREE_CODE (type) != VECTOR_TYPE)
1820     /* Simple scalar types always returned in registers.  */
1821     return false;
1822
1823   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1824                                                type,
1825                                                &ag_mode,
1826                                                &count,
1827                                                NULL))
1828     return false;
1829
1830   /* Types larger than 2 registers returned in memory.  */
1831   size = int_size_in_bytes (type);
1832   return (size < 0 || size > 2 * UNITS_PER_WORD);
1833 }
1834
1835 static bool
1836 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1837                                const_tree type, int *nregs)
1838 {
1839   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1840   return aarch64_vfp_is_call_or_return_candidate (mode,
1841                                                   type,
1842                                                   &pcum->aapcs_vfp_rmode,
1843                                                   nregs,
1844                                                   NULL);
1845 }
1846
1847 /* Given MODE and TYPE of a function argument, return the alignment in
1848    bits.  The idea is to suppress any stronger alignment requested by
1849    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1850    This is a helper function for local use only.  */
1851
1852 static unsigned int
1853 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1854 {
1855   unsigned int alignment;
1856
1857   if (type)
1858     {
1859       if (!integer_zerop (TYPE_SIZE (type)))
1860         {
1861           if (TYPE_MODE (type) == mode)
1862             alignment = TYPE_ALIGN (type);
1863           else
1864             alignment = GET_MODE_ALIGNMENT (mode);
1865         }
1866       else
1867         alignment = 0;
1868     }
1869   else
1870     alignment = GET_MODE_ALIGNMENT (mode);
1871
1872   return alignment;
1873 }
1874
1875 /* Layout a function argument according to the AAPCS64 rules.  The rule
1876    numbers refer to the rule numbers in the AAPCS64.  */
1877
1878 static void
1879 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1880                     const_tree type,
1881                     bool named ATTRIBUTE_UNUSED)
1882 {
1883   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1884   int ncrn, nvrn, nregs;
1885   bool allocate_ncrn, allocate_nvrn;
1886   HOST_WIDE_INT size;
1887
1888   /* We need to do this once per argument.  */
1889   if (pcum->aapcs_arg_processed)
1890     return;
1891
1892   pcum->aapcs_arg_processed = true;
1893
1894   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
1895   size
1896     = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1897                         UNITS_PER_WORD);
1898
1899   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1900   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1901                                                  mode,
1902                                                  type,
1903                                                  &nregs);
1904
1905   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1906      The following code thus handles passing by SIMD/FP registers first.  */
1907
1908   nvrn = pcum->aapcs_nvrn;
1909
1910   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1911      and homogenous short-vector aggregates (HVA).  */
1912   if (allocate_nvrn)
1913     {
1914       if (!TARGET_FLOAT)
1915         aarch64_err_no_fpadvsimd (mode, "argument");
1916
1917       if (nvrn + nregs <= NUM_FP_ARG_REGS)
1918         {
1919           pcum->aapcs_nextnvrn = nvrn + nregs;
1920           if (!aarch64_composite_type_p (type, mode))
1921             {
1922               gcc_assert (nregs == 1);
1923               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1924             }
1925           else
1926             {
1927               rtx par;
1928               int i;
1929               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1930               for (i = 0; i < nregs; i++)
1931                 {
1932                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1933                                          V0_REGNUM + nvrn + i);
1934                   tmp = gen_rtx_EXPR_LIST
1935                     (VOIDmode, tmp,
1936                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1937                   XVECEXP (par, 0, i) = tmp;
1938                 }
1939               pcum->aapcs_reg = par;
1940             }
1941           return;
1942         }
1943       else
1944         {
1945           /* C.3 NSRN is set to 8.  */
1946           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1947           goto on_stack;
1948         }
1949     }
1950
1951   ncrn = pcum->aapcs_ncrn;
1952   nregs = size / UNITS_PER_WORD;
1953
1954   /* C6 - C9.  though the sign and zero extension semantics are
1955      handled elsewhere.  This is the case where the argument fits
1956      entirely general registers.  */
1957   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1958     {
1959       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1960
1961       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1962
1963       /* C.8 if the argument has an alignment of 16 then the NGRN is
1964          rounded up to the next even number.  */
1965       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1966         {
1967           ++ncrn;
1968           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1969         }
1970       /* NREGS can be 0 when e.g. an empty structure is to be passed.
1971          A reg is still generated for it, but the caller should be smart
1972          enough not to use it.  */
1973       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1974         {
1975           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1976         }
1977       else
1978         {
1979           rtx par;
1980           int i;
1981
1982           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1983           for (i = 0; i < nregs; i++)
1984             {
1985               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1986               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1987                                        GEN_INT (i * UNITS_PER_WORD));
1988               XVECEXP (par, 0, i) = tmp;
1989             }
1990           pcum->aapcs_reg = par;
1991         }
1992
1993       pcum->aapcs_nextncrn = ncrn + nregs;
1994       return;
1995     }
1996
1997   /* C.11  */
1998   pcum->aapcs_nextncrn = NUM_ARG_REGS;
1999
2000   /* The argument is passed on stack; record the needed number of words for
2001      this argument and align the total size if necessary.  */
2002 on_stack:
2003   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2004   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2005     pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
2006                                                16 / UNITS_PER_WORD);
2007   return;
2008 }
2009
2010 /* Implement TARGET_FUNCTION_ARG.  */
2011
2012 static rtx
2013 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2014                       const_tree type, bool named)
2015 {
2016   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2017   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2018
2019   if (mode == VOIDmode)
2020     return NULL_RTX;
2021
2022   aarch64_layout_arg (pcum_v, mode, type, named);
2023   return pcum->aapcs_reg;
2024 }
2025
2026 void
2027 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2028                            const_tree fntype ATTRIBUTE_UNUSED,
2029                            rtx libname ATTRIBUTE_UNUSED,
2030                            const_tree fndecl ATTRIBUTE_UNUSED,
2031                            unsigned n_named ATTRIBUTE_UNUSED)
2032 {
2033   pcum->aapcs_ncrn = 0;
2034   pcum->aapcs_nvrn = 0;
2035   pcum->aapcs_nextncrn = 0;
2036   pcum->aapcs_nextnvrn = 0;
2037   pcum->pcs_variant = ARM_PCS_AAPCS64;
2038   pcum->aapcs_reg = NULL_RTX;
2039   pcum->aapcs_arg_processed = false;
2040   pcum->aapcs_stack_words = 0;
2041   pcum->aapcs_stack_size = 0;
2042
2043   if (!TARGET_FLOAT
2044       && fndecl && TREE_PUBLIC (fndecl)
2045       && fntype && fntype != error_mark_node)
2046     {
2047       const_tree type = TREE_TYPE (fntype);
2048       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2049       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2050       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2051                                                    &mode, &nregs, NULL))
2052         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2053     }
2054   return;
2055 }
2056
2057 static void
2058 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2059                               machine_mode mode,
2060                               const_tree type,
2061                               bool named)
2062 {
2063   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2064   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2065     {
2066       aarch64_layout_arg (pcum_v, mode, type, named);
2067       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2068                   != (pcum->aapcs_stack_words != 0));
2069       pcum->aapcs_arg_processed = false;
2070       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2071       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2072       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2073       pcum->aapcs_stack_words = 0;
2074       pcum->aapcs_reg = NULL_RTX;
2075     }
2076 }
2077
2078 bool
2079 aarch64_function_arg_regno_p (unsigned regno)
2080 {
2081   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2082           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2083 }
2084
2085 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2086    PARM_BOUNDARY bits of alignment, but will be given anything up
2087    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2088    that both before and after the layout of each argument, the Next
2089    Stacked Argument Address (NSAA) will have a minimum alignment of
2090    8 bytes.  */
2091
2092 static unsigned int
2093 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2094 {
2095   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2096
2097   if (alignment < PARM_BOUNDARY)
2098     alignment = PARM_BOUNDARY;
2099   if (alignment > STACK_BOUNDARY)
2100     alignment = STACK_BOUNDARY;
2101   return alignment;
2102 }
2103
2104 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2105
2106    Return true if an argument passed on the stack should be padded upwards,
2107    i.e. if the least-significant byte of the stack slot has useful data.
2108
2109    Small aggregate types are placed in the lowest memory address.
2110
2111    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2112
2113 bool
2114 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2115 {
2116   /* On little-endian targets, the least significant byte of every stack
2117      argument is passed at the lowest byte address of the stack slot.  */
2118   if (!BYTES_BIG_ENDIAN)
2119     return true;
2120
2121   /* Otherwise, integral, floating-point and pointer types are padded downward:
2122      the least significant byte of a stack argument is passed at the highest
2123      byte address of the stack slot.  */
2124   if (type
2125       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2126          || POINTER_TYPE_P (type))
2127       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2128     return false;
2129
2130   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2131   return true;
2132 }
2133
2134 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2135
2136    It specifies padding for the last (may also be the only)
2137    element of a block move between registers and memory.  If
2138    assuming the block is in the memory, padding upward means that
2139    the last element is padded after its highest significant byte,
2140    while in downward padding, the last element is padded at the
2141    its least significant byte side.
2142
2143    Small aggregates and small complex types are always padded
2144    upwards.
2145
2146    We don't need to worry about homogeneous floating-point or
2147    short-vector aggregates; their move is not affected by the
2148    padding direction determined here.  Regardless of endianness,
2149    each element of such an aggregate is put in the least
2150    significant bits of a fp/simd register.
2151
2152    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2153    register has useful data, and return the opposite if the most
2154    significant byte does.  */
2155
2156 bool
2157 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2158                      bool first ATTRIBUTE_UNUSED)
2159 {
2160
2161   /* Small composite types are always padded upward.  */
2162   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2163     {
2164       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2165                             : GET_MODE_SIZE (mode));
2166       if (size < 2 * UNITS_PER_WORD)
2167         return true;
2168     }
2169
2170   /* Otherwise, use the default padding.  */
2171   return !BYTES_BIG_ENDIAN;
2172 }
2173
2174 static machine_mode
2175 aarch64_libgcc_cmp_return_mode (void)
2176 {
2177   return SImode;
2178 }
2179
2180 static bool
2181 aarch64_frame_pointer_required (void)
2182 {
2183   /* In aarch64_override_options_after_change
2184      flag_omit_leaf_frame_pointer turns off the frame pointer by
2185      default.  Turn it back on now if we've not got a leaf
2186      function.  */
2187   if (flag_omit_leaf_frame_pointer
2188       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2189     return true;
2190
2191   return false;
2192 }
2193
2194 /* Mark the registers that need to be saved by the callee and calculate
2195    the size of the callee-saved registers area and frame record (both FP
2196    and LR may be omitted).  */
2197 static void
2198 aarch64_layout_frame (void)
2199 {
2200   HOST_WIDE_INT offset = 0;
2201   int regno;
2202
2203   if (reload_completed && cfun->machine->frame.laid_out)
2204     return;
2205
2206 #define SLOT_NOT_REQUIRED (-2)
2207 #define SLOT_REQUIRED     (-1)
2208
2209   cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2210   cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2211
2212   /* First mark all the registers that really need to be saved...  */
2213   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2214     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2215
2216   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2217     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2218
2219   /* ... that includes the eh data registers (if needed)...  */
2220   if (crtl->calls_eh_return)
2221     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2222       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2223         = SLOT_REQUIRED;
2224
2225   /* ... and any callee saved register that dataflow says is live.  */
2226   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2227     if (df_regs_ever_live_p (regno)
2228         && (regno == R30_REGNUM
2229             || !call_used_regs[regno]))
2230       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2231
2232   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2233     if (df_regs_ever_live_p (regno)
2234         && !call_used_regs[regno])
2235       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2236
2237   if (frame_pointer_needed)
2238     {
2239       /* FP and LR are placed in the linkage record.  */
2240       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2241       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2242       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2243       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2244       cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2245       offset += 2 * UNITS_PER_WORD;
2246     }
2247
2248   /* Now assign stack slots for them.  */
2249   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2250     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2251       {
2252         cfun->machine->frame.reg_offset[regno] = offset;
2253         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2254           cfun->machine->frame.wb_candidate1 = regno;
2255         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2256           cfun->machine->frame.wb_candidate2 = regno;
2257         offset += UNITS_PER_WORD;
2258       }
2259
2260   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2261     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2262       {
2263         cfun->machine->frame.reg_offset[regno] = offset;
2264         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2265           cfun->machine->frame.wb_candidate1 = regno;
2266         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2267                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2268           cfun->machine->frame.wb_candidate2 = regno;
2269         offset += UNITS_PER_WORD;
2270       }
2271
2272   cfun->machine->frame.padding0 =
2273     (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2274   offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2275
2276   cfun->machine->frame.saved_regs_size = offset;
2277
2278   cfun->machine->frame.hard_fp_offset
2279     = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2280                         + get_frame_size ()
2281                         + cfun->machine->frame.saved_regs_size,
2282                         STACK_BOUNDARY / BITS_PER_UNIT);
2283
2284   cfun->machine->frame.frame_size
2285     = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2286                         + crtl->outgoing_args_size,
2287                         STACK_BOUNDARY / BITS_PER_UNIT);
2288
2289   cfun->machine->frame.laid_out = true;
2290 }
2291
2292 static bool
2293 aarch64_register_saved_on_entry (int regno)
2294 {
2295   return cfun->machine->frame.reg_offset[regno] >= 0;
2296 }
2297
2298 static unsigned
2299 aarch64_next_callee_save (unsigned regno, unsigned limit)
2300 {
2301   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2302     regno ++;
2303   return regno;
2304 }
2305
2306 static void
2307 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2308                            HOST_WIDE_INT adjustment)
2309  {
2310   rtx base_rtx = stack_pointer_rtx;
2311   rtx insn, reg, mem;
2312
2313   reg = gen_rtx_REG (mode, regno);
2314   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2315                             plus_constant (Pmode, base_rtx, -adjustment));
2316   mem = gen_rtx_MEM (mode, mem);
2317
2318   insn = emit_move_insn (mem, reg);
2319   RTX_FRAME_RELATED_P (insn) = 1;
2320 }
2321
2322 static rtx
2323 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2324                           HOST_WIDE_INT adjustment)
2325 {
2326   switch (mode)
2327     {
2328     case DImode:
2329       return gen_storewb_pairdi_di (base, base, reg, reg2,
2330                                     GEN_INT (-adjustment),
2331                                     GEN_INT (UNITS_PER_WORD - adjustment));
2332     case DFmode:
2333       return gen_storewb_pairdf_di (base, base, reg, reg2,
2334                                     GEN_INT (-adjustment),
2335                                     GEN_INT (UNITS_PER_WORD - adjustment));
2336     default:
2337       gcc_unreachable ();
2338     }
2339 }
2340
2341 static void
2342 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2343                          unsigned regno2, HOST_WIDE_INT adjustment)
2344 {
2345   rtx_insn *insn;
2346   rtx reg1 = gen_rtx_REG (mode, regno1);
2347   rtx reg2 = gen_rtx_REG (mode, regno2);
2348
2349   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2350                                               reg2, adjustment));
2351   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2352   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2353   RTX_FRAME_RELATED_P (insn) = 1;
2354 }
2355
2356 static rtx
2357 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2358                          HOST_WIDE_INT adjustment)
2359 {
2360   switch (mode)
2361     {
2362     case DImode:
2363       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2364                                    GEN_INT (UNITS_PER_WORD));
2365     case DFmode:
2366       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2367                                    GEN_INT (UNITS_PER_WORD));
2368     default:
2369       gcc_unreachable ();
2370     }
2371 }
2372
2373 static rtx
2374 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2375                         rtx reg2)
2376 {
2377   switch (mode)
2378     {
2379     case DImode:
2380       return gen_store_pairdi (mem1, reg1, mem2, reg2);
2381
2382     case DFmode:
2383       return gen_store_pairdf (mem1, reg1, mem2, reg2);
2384
2385     default:
2386       gcc_unreachable ();
2387     }
2388 }
2389
2390 static rtx
2391 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2392                        rtx mem2)
2393 {
2394   switch (mode)
2395     {
2396     case DImode:
2397       return gen_load_pairdi (reg1, mem1, reg2, mem2);
2398
2399     case DFmode:
2400       return gen_load_pairdf (reg1, mem1, reg2, mem2);
2401
2402     default:
2403       gcc_unreachable ();
2404     }
2405 }
2406
2407
2408 static void
2409 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2410                            unsigned start, unsigned limit, bool skip_wb)
2411 {
2412   rtx_insn *insn;
2413   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2414                                                  ? gen_frame_mem : gen_rtx_MEM);
2415   unsigned regno;
2416   unsigned regno2;
2417
2418   for (regno = aarch64_next_callee_save (start, limit);
2419        regno <= limit;
2420        regno = aarch64_next_callee_save (regno + 1, limit))
2421     {
2422       rtx reg, mem;
2423       HOST_WIDE_INT offset;
2424
2425       if (skip_wb
2426           && (regno == cfun->machine->frame.wb_candidate1
2427               || regno == cfun->machine->frame.wb_candidate2))
2428         continue;
2429
2430       reg = gen_rtx_REG (mode, regno);
2431       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2432       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2433                                               offset));
2434
2435       regno2 = aarch64_next_callee_save (regno + 1, limit);
2436
2437       if (regno2 <= limit
2438           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2439               == cfun->machine->frame.reg_offset[regno2]))
2440
2441         {
2442           rtx reg2 = gen_rtx_REG (mode, regno2);
2443           rtx mem2;
2444
2445           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2446           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2447                                                    offset));
2448           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2449                                                     reg2));
2450
2451           /* The first part of a frame-related parallel insn is
2452              always assumed to be relevant to the frame
2453              calculations; subsequent parts, are only
2454              frame-related if explicitly marked.  */
2455           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2456           regno = regno2;
2457         }
2458       else
2459         insn = emit_move_insn (mem, reg);
2460
2461       RTX_FRAME_RELATED_P (insn) = 1;
2462     }
2463 }
2464
2465 static void
2466 aarch64_restore_callee_saves (machine_mode mode,
2467                               HOST_WIDE_INT start_offset, unsigned start,
2468                               unsigned limit, bool skip_wb, rtx *cfi_ops)
2469 {
2470   rtx base_rtx = stack_pointer_rtx;
2471   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2472                                                  ? gen_frame_mem : gen_rtx_MEM);
2473   unsigned regno;
2474   unsigned regno2;
2475   HOST_WIDE_INT offset;
2476
2477   for (regno = aarch64_next_callee_save (start, limit);
2478        regno <= limit;
2479        regno = aarch64_next_callee_save (regno + 1, limit))
2480     {
2481       rtx reg, mem;
2482
2483       if (skip_wb
2484           && (regno == cfun->machine->frame.wb_candidate1
2485               || regno == cfun->machine->frame.wb_candidate2))
2486         continue;
2487
2488       reg = gen_rtx_REG (mode, regno);
2489       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2490       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2491
2492       regno2 = aarch64_next_callee_save (regno + 1, limit);
2493
2494       if (regno2 <= limit
2495           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2496               == cfun->machine->frame.reg_offset[regno2]))
2497         {
2498           rtx reg2 = gen_rtx_REG (mode, regno2);
2499           rtx mem2;
2500
2501           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2502           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2503           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2504
2505           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2506           regno = regno2;
2507         }
2508       else
2509         emit_move_insn (reg, mem);
2510       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2511     }
2512 }
2513
2514 /* AArch64 stack frames generated by this compiler look like:
2515
2516         +-------------------------------+
2517         |                               |
2518         |  incoming stack arguments     |
2519         |                               |
2520         +-------------------------------+
2521         |                               | <-- incoming stack pointer (aligned)
2522         |  callee-allocated save area   |
2523         |  for register varargs         |
2524         |                               |
2525         +-------------------------------+
2526         |  local variables              | <-- frame_pointer_rtx
2527         |                               |
2528         +-------------------------------+
2529         |  padding0                     | \
2530         +-------------------------------+  |
2531         |  callee-saved registers       |  | frame.saved_regs_size
2532         +-------------------------------+  |
2533         |  LR'                          |  |
2534         +-------------------------------+  |
2535         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2536         +-------------------------------+
2537         |  dynamic allocation           |
2538         +-------------------------------+
2539         |  padding                      |
2540         +-------------------------------+
2541         |  outgoing stack arguments     | <-- arg_pointer
2542         |                               |
2543         +-------------------------------+
2544         |                               | <-- stack_pointer_rtx (aligned)
2545
2546    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2547    but leave frame_pointer_rtx and hard_frame_pointer_rtx
2548    unchanged.  */
2549
2550 /* Generate the prologue instructions for entry into a function.
2551    Establish the stack frame by decreasing the stack pointer with a
2552    properly calculated size and, if necessary, create a frame record
2553    filled with the values of LR and previous frame pointer.  The
2554    current FP is also set up if it is in use.  */
2555
2556 void
2557 aarch64_expand_prologue (void)
2558 {
2559   /* sub sp, sp, #<frame_size>
2560      stp {fp, lr}, [sp, #<frame_size> - 16]
2561      add fp, sp, #<frame_size> - hardfp_offset
2562      stp {cs_reg}, [fp, #-16] etc.
2563
2564      sub sp, sp, <final_adjustment_if_any>
2565   */
2566   HOST_WIDE_INT frame_size, offset;
2567   HOST_WIDE_INT fp_offset;              /* Offset from hard FP to SP.  */
2568   HOST_WIDE_INT hard_fp_offset;
2569   rtx_insn *insn;
2570
2571   aarch64_layout_frame ();
2572
2573   offset = frame_size = cfun->machine->frame.frame_size;
2574   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2575   fp_offset = frame_size - hard_fp_offset;
2576
2577   if (flag_stack_usage_info)
2578     current_function_static_stack_size = frame_size;
2579
2580   /* Store pairs and load pairs have a range only -512 to 504.  */
2581   if (offset >= 512)
2582     {
2583       /* When the frame has a large size, an initial decrease is done on
2584          the stack pointer to jump over the callee-allocated save area for
2585          register varargs, the local variable area and/or the callee-saved
2586          register area.  This will allow the pre-index write-back
2587          store pair instructions to be used for setting up the stack frame
2588          efficiently.  */
2589       offset = hard_fp_offset;
2590       if (offset >= 512)
2591         offset = cfun->machine->frame.saved_regs_size;
2592
2593       frame_size -= (offset + crtl->outgoing_args_size);
2594       fp_offset = 0;
2595
2596       if (frame_size >= 0x1000000)
2597         {
2598           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2599           emit_move_insn (op0, GEN_INT (-frame_size));
2600           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2601
2602           add_reg_note (insn, REG_CFA_ADJUST_CFA,
2603                         gen_rtx_SET (stack_pointer_rtx,
2604                                      plus_constant (Pmode, stack_pointer_rtx,
2605                                                     -frame_size)));
2606           RTX_FRAME_RELATED_P (insn) = 1;
2607         }
2608       else if (frame_size > 0)
2609         {
2610           int hi_ofs = frame_size & 0xfff000;
2611           int lo_ofs = frame_size & 0x000fff;
2612
2613           if (hi_ofs)
2614             {
2615               insn = emit_insn (gen_add2_insn
2616                                 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2617               RTX_FRAME_RELATED_P (insn) = 1;
2618             }
2619           if (lo_ofs)
2620             {
2621               insn = emit_insn (gen_add2_insn
2622                                 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2623               RTX_FRAME_RELATED_P (insn) = 1;
2624             }
2625         }
2626     }
2627   else
2628     frame_size = -1;
2629
2630   if (offset > 0)
2631     {
2632       bool skip_wb = false;
2633
2634       if (frame_pointer_needed)
2635         {
2636           skip_wb = true;
2637
2638           if (fp_offset)
2639             {
2640               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2641                                                GEN_INT (-offset)));
2642               RTX_FRAME_RELATED_P (insn) = 1;
2643
2644               aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2645                                          R30_REGNUM, false);
2646             }
2647           else
2648             aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2649
2650           /* Set up frame pointer to point to the location of the
2651              previous frame pointer on the stack.  */
2652           insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2653                                            stack_pointer_rtx,
2654                                            GEN_INT (fp_offset)));
2655           RTX_FRAME_RELATED_P (insn) = 1;
2656           emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2657         }
2658       else
2659         {
2660           unsigned reg1 = cfun->machine->frame.wb_candidate1;
2661           unsigned reg2 = cfun->machine->frame.wb_candidate2;
2662
2663           if (fp_offset
2664               || reg1 == FIRST_PSEUDO_REGISTER
2665               || (reg2 == FIRST_PSEUDO_REGISTER
2666                   && offset >= 256))
2667             {
2668               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2669                                                GEN_INT (-offset)));
2670               RTX_FRAME_RELATED_P (insn) = 1;
2671             }
2672           else
2673             {
2674               machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2675
2676               skip_wb = true;
2677
2678               if (reg2 == FIRST_PSEUDO_REGISTER)
2679                 aarch64_pushwb_single_reg (mode1, reg1, offset);
2680               else
2681                 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2682             }
2683         }
2684
2685       aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2686                                  skip_wb);
2687       aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2688                                  skip_wb);
2689     }
2690
2691   /* when offset >= 512,
2692      sub sp, sp, #<outgoing_args_size> */
2693   if (frame_size > -1)
2694     {
2695       if (crtl->outgoing_args_size > 0)
2696         {
2697           insn = emit_insn (gen_add2_insn
2698                             (stack_pointer_rtx,
2699                              GEN_INT (- crtl->outgoing_args_size)));
2700           RTX_FRAME_RELATED_P (insn) = 1;
2701         }
2702     }
2703 }
2704
2705 /* Return TRUE if we can use a simple_return insn.
2706
2707    This function checks whether the callee saved stack is empty, which
2708    means no restore actions are need. The pro_and_epilogue will use
2709    this to check whether shrink-wrapping opt is feasible.  */
2710
2711 bool
2712 aarch64_use_return_insn_p (void)
2713 {
2714   if (!reload_completed)
2715     return false;
2716
2717   if (crtl->profile)
2718     return false;
2719
2720   aarch64_layout_frame ();
2721
2722   return cfun->machine->frame.frame_size == 0;
2723 }
2724
2725 /* Generate the epilogue instructions for returning from a function.  */
2726 void
2727 aarch64_expand_epilogue (bool for_sibcall)
2728 {
2729   HOST_WIDE_INT frame_size, offset;
2730   HOST_WIDE_INT fp_offset;
2731   HOST_WIDE_INT hard_fp_offset;
2732   rtx_insn *insn;
2733   /* We need to add memory barrier to prevent read from deallocated stack.  */
2734   bool need_barrier_p = (get_frame_size () != 0
2735                          || cfun->machine->frame.saved_varargs_size);
2736
2737   aarch64_layout_frame ();
2738
2739   offset = frame_size = cfun->machine->frame.frame_size;
2740   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2741   fp_offset = frame_size - hard_fp_offset;
2742
2743   /* Store pairs and load pairs have a range only -512 to 504.  */
2744   if (offset >= 512)
2745     {
2746       offset = hard_fp_offset;
2747       if (offset >= 512)
2748         offset = cfun->machine->frame.saved_regs_size;
2749
2750       frame_size -= (offset + crtl->outgoing_args_size);
2751       fp_offset = 0;
2752       if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2753         {
2754           insn = emit_insn (gen_add2_insn
2755                             (stack_pointer_rtx,
2756                              GEN_INT (crtl->outgoing_args_size)));
2757           RTX_FRAME_RELATED_P (insn) = 1;
2758         }
2759     }
2760   else
2761     frame_size = -1;
2762
2763   /* If there were outgoing arguments or we've done dynamic stack
2764      allocation, then restore the stack pointer from the frame
2765      pointer.  This is at most one insn and more efficient than using
2766      GCC's internal mechanism.  */
2767   if (frame_pointer_needed
2768       && (crtl->outgoing_args_size || cfun->calls_alloca))
2769     {
2770       if (cfun->calls_alloca)
2771         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2772
2773       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2774                                        hard_frame_pointer_rtx,
2775                                        GEN_INT (0)));
2776       offset = offset - fp_offset;
2777     }
2778
2779   if (offset > 0)
2780     {
2781       unsigned reg1 = cfun->machine->frame.wb_candidate1;
2782       unsigned reg2 = cfun->machine->frame.wb_candidate2;
2783       bool skip_wb = true;
2784       rtx cfi_ops = NULL;
2785
2786       if (frame_pointer_needed)
2787         fp_offset = 0;
2788       else if (fp_offset
2789                || reg1 == FIRST_PSEUDO_REGISTER
2790                || (reg2 == FIRST_PSEUDO_REGISTER
2791                    && offset >= 256))
2792         skip_wb = false;
2793
2794       aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2795                                     skip_wb, &cfi_ops);
2796       aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2797                                     skip_wb, &cfi_ops);
2798
2799       if (need_barrier_p)
2800         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2801
2802       if (skip_wb)
2803         {
2804           machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2805           rtx rreg1 = gen_rtx_REG (mode1, reg1);
2806
2807           cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2808           if (reg2 == FIRST_PSEUDO_REGISTER)
2809             {
2810               rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2811               mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2812               mem = gen_rtx_MEM (mode1, mem);
2813               insn = emit_move_insn (rreg1, mem);
2814             }
2815           else
2816             {
2817               rtx rreg2 = gen_rtx_REG (mode1, reg2);
2818
2819               cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2820               insn = emit_insn (aarch64_gen_loadwb_pair
2821                                 (mode1, stack_pointer_rtx, rreg1,
2822                                  rreg2, offset));
2823             }
2824         }
2825       else
2826         {
2827           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2828                                            GEN_INT (offset)));
2829         }
2830
2831       /* Reset the CFA to be SP + FRAME_SIZE.  */
2832       rtx new_cfa = stack_pointer_rtx;
2833       if (frame_size > 0)
2834         new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2835       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2836       REG_NOTES (insn) = cfi_ops;
2837       RTX_FRAME_RELATED_P (insn) = 1;
2838     }
2839
2840   if (frame_size > 0)
2841     {
2842       if (need_barrier_p)
2843         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2844
2845       if (frame_size >= 0x1000000)
2846         {
2847           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2848           emit_move_insn (op0, GEN_INT (frame_size));
2849           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2850         }
2851       else
2852         {
2853           int hi_ofs = frame_size & 0xfff000;
2854           int lo_ofs = frame_size & 0x000fff;
2855
2856           if (hi_ofs && lo_ofs)
2857             {
2858               insn = emit_insn (gen_add2_insn
2859                                 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2860               RTX_FRAME_RELATED_P (insn) = 1;
2861               frame_size = lo_ofs;
2862             }
2863           insn = emit_insn (gen_add2_insn
2864                             (stack_pointer_rtx, GEN_INT (frame_size)));
2865         }
2866
2867       /* Reset the CFA to be SP + 0.  */
2868       add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2869       RTX_FRAME_RELATED_P (insn) = 1;
2870     }
2871
2872   /* Stack adjustment for exception handler.  */
2873   if (crtl->calls_eh_return)
2874     {
2875       /* We need to unwind the stack by the offset computed by
2876          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
2877          to be SP; letting the CFA move during this adjustment
2878          is just as correct as retaining the CFA from the body
2879          of the function.  Therefore, do nothing special.  */
2880       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2881     }
2882
2883   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2884   if (!for_sibcall)
2885     emit_jump_insn (ret_rtx);
2886 }
2887
2888 /* Return the place to copy the exception unwinding return address to.
2889    This will probably be a stack slot, but could (in theory be the
2890    return register).  */
2891 rtx
2892 aarch64_final_eh_return_addr (void)
2893 {
2894   HOST_WIDE_INT fp_offset;
2895
2896   aarch64_layout_frame ();
2897
2898   fp_offset = cfun->machine->frame.frame_size
2899               - cfun->machine->frame.hard_fp_offset;
2900
2901   if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2902     return gen_rtx_REG (DImode, LR_REGNUM);
2903
2904   /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
2905      result in a store to save LR introduced by builtin_eh_return () being
2906      incorrectly deleted because the alias is not detected.
2907      So in the calculation of the address to copy the exception unwinding
2908      return address to, we note 2 cases.
2909      If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2910      we return a SP-relative location since all the addresses are SP-relative
2911      in this case.  This prevents the store from being optimized away.
2912      If the fp_offset is not 0, then the addresses will be FP-relative and
2913      therefore we return a FP-relative location.  */
2914
2915   if (frame_pointer_needed)
2916     {
2917       if (fp_offset)
2918         return gen_frame_mem (DImode,
2919                               plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2920       else
2921         return gen_frame_mem (DImode,
2922                               plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2923     }
2924
2925   /* If FP is not needed, we calculate the location of LR, which would be
2926      at the top of the saved registers block.  */
2927
2928   return gen_frame_mem (DImode,
2929                         plus_constant (Pmode,
2930                                        stack_pointer_rtx,
2931                                        fp_offset
2932                                        + cfun->machine->frame.saved_regs_size
2933                                        - 2 * UNITS_PER_WORD));
2934 }
2935
2936 /* Possibly output code to build up a constant in a register.  For
2937    the benefit of the costs infrastructure, returns the number of
2938    instructions which would be emitted.  GENERATE inhibits or
2939    enables code generation.  */
2940
2941 static int
2942 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2943 {
2944   int insns = 0;
2945
2946   if (aarch64_bitmask_imm (val, DImode))
2947     {
2948       if (generate)
2949         emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2950       insns = 1;
2951     }
2952   else
2953     {
2954       int i;
2955       int ncount = 0;
2956       int zcount = 0;
2957       HOST_WIDE_INT valp = val >> 16;
2958       HOST_WIDE_INT valm;
2959       HOST_WIDE_INT tval;
2960
2961       for (i = 16; i < 64; i += 16)
2962         {
2963           valm = (valp & 0xffff);
2964
2965           if (valm != 0)
2966             ++ zcount;
2967
2968           if (valm != 0xffff)
2969             ++ ncount;
2970
2971           valp >>= 16;
2972         }
2973
2974       /* zcount contains the number of additional MOVK instructions
2975          required if the constant is built up with an initial MOVZ instruction,
2976          while ncount is the number of MOVK instructions required if starting
2977          with a MOVN instruction.  Choose the sequence that yields the fewest
2978          number of instructions, preferring MOVZ instructions when they are both
2979          the same.  */
2980       if (ncount < zcount)
2981         {
2982           if (generate)
2983             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2984                             GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2985           tval = 0xffff;
2986           insns++;
2987         }
2988       else
2989         {
2990           if (generate)
2991             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2992                             GEN_INT (val & 0xffff));
2993           tval = 0;
2994           insns++;
2995         }
2996
2997       val >>= 16;
2998
2999       for (i = 16; i < 64; i += 16)
3000         {
3001           if ((val & 0xffff) != tval)
3002             {
3003               if (generate)
3004                 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3005                                            GEN_INT (i),
3006                                            GEN_INT (val & 0xffff)));
3007               insns++;
3008             }
3009           val >>= 16;
3010         }
3011     }
3012   return insns;
3013 }
3014
3015 static void
3016 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3017 {
3018   HOST_WIDE_INT mdelta = delta;
3019   rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3020   rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3021
3022   if (mdelta < 0)
3023     mdelta = -mdelta;
3024
3025   if (mdelta >= 4096 * 4096)
3026     {
3027       (void) aarch64_build_constant (scratchreg, delta, true);
3028       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3029     }
3030   else if (mdelta > 0)
3031     {
3032       if (mdelta >= 4096)
3033         {
3034           emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3035           rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3036           if (delta < 0)
3037             emit_insn (gen_rtx_SET (this_rtx,
3038                                     gen_rtx_MINUS (Pmode, this_rtx, shift)));
3039           else
3040             emit_insn (gen_rtx_SET (this_rtx,
3041                                     gen_rtx_PLUS (Pmode, this_rtx, shift)));
3042         }
3043       if (mdelta % 4096 != 0)
3044         {
3045           scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3046           emit_insn (gen_rtx_SET (this_rtx,
3047                                   gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3048         }
3049     }
3050 }
3051
3052 /* Output code to add DELTA to the first argument, and then jump
3053    to FUNCTION.  Used for C++ multiple inheritance.  */
3054 static void
3055 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3056                          HOST_WIDE_INT delta,
3057                          HOST_WIDE_INT vcall_offset,
3058                          tree function)
3059 {
3060   /* The this pointer is always in x0.  Note that this differs from
3061      Arm where the this pointer maybe bumped to r1 if r0 is required
3062      to return a pointer to an aggregate.  On AArch64 a result value
3063      pointer will be in x8.  */
3064   int this_regno = R0_REGNUM;
3065   rtx this_rtx, temp0, temp1, addr, funexp;
3066   rtx_insn *insn;
3067
3068   reload_completed = 1;
3069   emit_note (NOTE_INSN_PROLOGUE_END);
3070
3071   if (vcall_offset == 0)
3072     aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3073   else
3074     {
3075       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3076
3077       this_rtx = gen_rtx_REG (Pmode, this_regno);
3078       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3079       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3080
3081       addr = this_rtx;
3082       if (delta != 0)
3083         {
3084           if (delta >= -256 && delta < 256)
3085             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3086                                        plus_constant (Pmode, this_rtx, delta));
3087           else
3088             aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3089         }
3090
3091       if (Pmode == ptr_mode)
3092         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3093       else
3094         aarch64_emit_move (temp0,
3095                            gen_rtx_ZERO_EXTEND (Pmode,
3096                                                 gen_rtx_MEM (ptr_mode, addr)));
3097
3098       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3099           addr = plus_constant (Pmode, temp0, vcall_offset);
3100       else
3101         {
3102           (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3103           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3104         }
3105
3106       if (Pmode == ptr_mode)
3107         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3108       else
3109         aarch64_emit_move (temp1,
3110                            gen_rtx_SIGN_EXTEND (Pmode,
3111                                                 gen_rtx_MEM (ptr_mode, addr)));
3112
3113       emit_insn (gen_add2_insn (this_rtx, temp1));
3114     }
3115
3116   /* Generate a tail call to the target function.  */
3117   if (!TREE_USED (function))
3118     {
3119       assemble_external (function);
3120       TREE_USED (function) = 1;
3121     }
3122   funexp = XEXP (DECL_RTL (function), 0);
3123   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3124   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3125   SIBLING_CALL_P (insn) = 1;
3126
3127   insn = get_insns ();
3128   shorten_branches (insn);
3129   final_start_function (insn, file, 1);
3130   final (insn, file, 1);
3131   final_end_function ();
3132
3133   /* Stop pretending to be a post-reload pass.  */
3134   reload_completed = 0;
3135 }
3136
3137 static bool
3138 aarch64_tls_referenced_p (rtx x)
3139 {
3140   if (!TARGET_HAVE_TLS)
3141     return false;
3142   subrtx_iterator::array_type array;
3143   FOR_EACH_SUBRTX (iter, array, x, ALL)
3144     {
3145       const_rtx x = *iter;
3146       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3147         return true;
3148       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3149          TLS offsets, not real symbol references.  */
3150       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3151         iter.skip_subrtxes ();
3152     }
3153   return false;
3154 }
3155
3156
3157 static int
3158 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3159 {
3160   const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3161   const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3162
3163   if (*imm1 < *imm2)
3164     return -1;
3165   if (*imm1 > *imm2)
3166     return +1;
3167   return 0;
3168 }
3169
3170
3171 static void
3172 aarch64_build_bitmask_table (void)
3173 {
3174   unsigned HOST_WIDE_INT mask, imm;
3175   unsigned int log_e, e, s, r;
3176   unsigned int nimms = 0;
3177
3178   for (log_e = 1; log_e <= 6; log_e++)
3179     {
3180       e = 1 << log_e;
3181       if (e == 64)
3182         mask = ~(HOST_WIDE_INT) 0;
3183       else
3184         mask = ((HOST_WIDE_INT) 1 << e) - 1;
3185       for (s = 1; s < e; s++)
3186         {
3187           for (r = 0; r < e; r++)
3188             {
3189               /* set s consecutive bits to 1 (s < 64) */
3190               imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3191               /* rotate right by r */
3192               if (r != 0)
3193                 imm = ((imm >> r) | (imm << (e - r))) & mask;
3194               /* replicate the constant depending on SIMD size */
3195               switch (log_e) {
3196               case 1: imm |= (imm <<  2);
3197               case 2: imm |= (imm <<  4);
3198               case 3: imm |= (imm <<  8);
3199               case 4: imm |= (imm << 16);
3200               case 5: imm |= (imm << 32);
3201               case 6:
3202                 break;
3203               default:
3204                 gcc_unreachable ();
3205               }
3206               gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3207               aarch64_bitmasks[nimms++] = imm;
3208             }
3209         }
3210     }
3211
3212   gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3213   qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3214          aarch64_bitmasks_cmp);
3215 }
3216
3217
3218 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3219    a left shift of 0 or 12 bits.  */
3220 bool
3221 aarch64_uimm12_shift (HOST_WIDE_INT val)
3222 {
3223   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3224           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3225           );
3226 }
3227
3228
3229 /* Return true if val is an immediate that can be loaded into a
3230    register by a MOVZ instruction.  */
3231 static bool
3232 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3233 {
3234   if (GET_MODE_SIZE (mode) > 4)
3235     {
3236       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3237           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3238         return 1;
3239     }
3240   else
3241     {
3242       /* Ignore sign extension.  */
3243       val &= (HOST_WIDE_INT) 0xffffffff;
3244     }
3245   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3246           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3247 }
3248
3249
3250 /* Return true if val is a valid bitmask immediate.  */
3251 bool
3252 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3253 {
3254   if (GET_MODE_SIZE (mode) < 8)
3255     {
3256       /* Replicate bit pattern.  */
3257       val &= (HOST_WIDE_INT) 0xffffffff;
3258       val |= val << 32;
3259     }
3260   return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3261                   sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3262 }
3263
3264
3265 /* Return true if val is an immediate that can be loaded into a
3266    register in a single instruction.  */
3267 bool
3268 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3269 {
3270   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3271     return 1;
3272   return aarch64_bitmask_imm (val, mode);
3273 }
3274
3275 static bool
3276 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3277 {
3278   rtx base, offset;
3279
3280   if (GET_CODE (x) == HIGH)
3281     return true;
3282
3283   split_const (x, &base, &offset);
3284   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3285     {
3286       if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3287           != SYMBOL_FORCE_TO_MEM)
3288         return true;
3289       else
3290         /* Avoid generating a 64-bit relocation in ILP32; leave
3291            to aarch64_expand_mov_immediate to handle it properly.  */
3292         return mode != ptr_mode;
3293     }
3294
3295   return aarch64_tls_referenced_p (x);
3296 }
3297
3298 /* Return true if register REGNO is a valid index register.
3299    STRICT_P is true if REG_OK_STRICT is in effect.  */
3300
3301 bool
3302 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3303 {
3304   if (!HARD_REGISTER_NUM_P (regno))
3305     {
3306       if (!strict_p)
3307         return true;
3308
3309       if (!reg_renumber)
3310         return false;
3311
3312       regno = reg_renumber[regno];
3313     }
3314   return GP_REGNUM_P (regno);
3315 }
3316
3317 /* Return true if register REGNO is a valid base register for mode MODE.
3318    STRICT_P is true if REG_OK_STRICT is in effect.  */
3319
3320 bool
3321 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3322 {
3323   if (!HARD_REGISTER_NUM_P (regno))
3324     {
3325       if (!strict_p)
3326         return true;
3327
3328       if (!reg_renumber)
3329         return false;
3330
3331       regno = reg_renumber[regno];
3332     }
3333
3334   /* The fake registers will be eliminated to either the stack or
3335      hard frame pointer, both of which are usually valid base registers.
3336      Reload deals with the cases where the eliminated form isn't valid.  */
3337   return (GP_REGNUM_P (regno)
3338           || regno == SP_REGNUM
3339           || regno == FRAME_POINTER_REGNUM
3340           || regno == ARG_POINTER_REGNUM);
3341 }
3342
3343 /* Return true if X is a valid base register for mode MODE.
3344    STRICT_P is true if REG_OK_STRICT is in effect.  */
3345
3346 static bool
3347 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3348 {
3349   if (!strict_p && GET_CODE (x) == SUBREG)
3350     x = SUBREG_REG (x);
3351
3352   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3353 }
3354
3355 /* Return true if address offset is a valid index.  If it is, fill in INFO
3356    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
3357
3358 static bool
3359 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3360                         machine_mode mode, bool strict_p)
3361 {
3362   enum aarch64_address_type type;
3363   rtx index;
3364   int shift;
3365
3366   /* (reg:P) */
3367   if ((REG_P (x) || GET_CODE (x) == SUBREG)
3368       && GET_MODE (x) == Pmode)
3369     {
3370       type = ADDRESS_REG_REG;
3371       index = x;
3372       shift = 0;
3373     }
3374   /* (sign_extend:DI (reg:SI)) */
3375   else if ((GET_CODE (x) == SIGN_EXTEND
3376             || GET_CODE (x) == ZERO_EXTEND)
3377            && GET_MODE (x) == DImode
3378            && GET_MODE (XEXP (x, 0)) == SImode)
3379     {
3380       type = (GET_CODE (x) == SIGN_EXTEND)
3381         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3382       index = XEXP (x, 0);
3383       shift = 0;
3384     }
3385   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3386   else if (GET_CODE (x) == MULT
3387            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3388                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3389            && GET_MODE (XEXP (x, 0)) == DImode
3390            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3391            && CONST_INT_P (XEXP (x, 1)))
3392     {
3393       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3394         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3395       index = XEXP (XEXP (x, 0), 0);
3396       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3397     }
3398   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3399   else if (GET_CODE (x) == ASHIFT
3400            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3401                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3402            && GET_MODE (XEXP (x, 0)) == DImode
3403            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3404            && CONST_INT_P (XEXP (x, 1)))
3405     {
3406       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3407         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3408       index = XEXP (XEXP (x, 0), 0);
3409       shift = INTVAL (XEXP (x, 1));
3410     }
3411   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3412   else if ((GET_CODE (x) == SIGN_EXTRACT
3413             || GET_CODE (x) == ZERO_EXTRACT)
3414            && GET_MODE (x) == DImode
3415            && GET_CODE (XEXP (x, 0)) == MULT
3416            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3417            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3418     {
3419       type = (GET_CODE (x) == SIGN_EXTRACT)
3420         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3421       index = XEXP (XEXP (x, 0), 0);
3422       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3423       if (INTVAL (XEXP (x, 1)) != 32 + shift
3424           || INTVAL (XEXP (x, 2)) != 0)
3425         shift = -1;
3426     }
3427   /* (and:DI (mult:DI (reg:DI) (const_int scale))
3428      (const_int 0xffffffff<<shift)) */
3429   else if (GET_CODE (x) == AND
3430            && GET_MODE (x) == DImode
3431            && GET_CODE (XEXP (x, 0)) == MULT
3432            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3433            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3434            && CONST_INT_P (XEXP (x, 1)))
3435     {
3436       type = ADDRESS_REG_UXTW;
3437       index = XEXP (XEXP (x, 0), 0);
3438       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3439       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3440         shift = -1;
3441     }
3442   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3443   else if ((GET_CODE (x) == SIGN_EXTRACT
3444             || GET_CODE (x) == ZERO_EXTRACT)
3445            && GET_MODE (x) == DImode
3446            && GET_CODE (XEXP (x, 0)) == ASHIFT
3447            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3448            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3449     {
3450       type = (GET_CODE (x) == SIGN_EXTRACT)
3451         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3452       index = XEXP (XEXP (x, 0), 0);
3453       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3454       if (INTVAL (XEXP (x, 1)) != 32 + shift
3455           || INTVAL (XEXP (x, 2)) != 0)
3456         shift = -1;
3457     }
3458   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3459      (const_int 0xffffffff<<shift)) */
3460   else if (GET_CODE (x) == AND
3461            && GET_MODE (x) == DImode
3462            && GET_CODE (XEXP (x, 0)) == ASHIFT
3463            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3464            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3465            && CONST_INT_P (XEXP (x, 1)))
3466     {
3467       type = ADDRESS_REG_UXTW;
3468       index = XEXP (XEXP (x, 0), 0);
3469       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3470       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3471         shift = -1;
3472     }
3473   /* (mult:P (reg:P) (const_int scale)) */
3474   else if (GET_CODE (x) == MULT
3475            && GET_MODE (x) == Pmode
3476            && GET_MODE (XEXP (x, 0)) == Pmode
3477            && CONST_INT_P (XEXP (x, 1)))
3478     {
3479       type = ADDRESS_REG_REG;
3480       index = XEXP (x, 0);
3481       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3482     }
3483   /* (ashift:P (reg:P) (const_int shift)) */
3484   else if (GET_CODE (x) == ASHIFT
3485            && GET_MODE (x) == Pmode
3486            && GET_MODE (XEXP (x, 0)) == Pmode
3487            && CONST_INT_P (XEXP (x, 1)))
3488     {
3489       type = ADDRESS_REG_REG;
3490       index = XEXP (x, 0);
3491       shift = INTVAL (XEXP (x, 1));
3492     }
3493   else
3494     return false;
3495
3496   if (GET_CODE (index) == SUBREG)
3497     index = SUBREG_REG (index);
3498
3499   if ((shift == 0 ||
3500        (shift > 0 && shift <= 3
3501         && (1 << shift) == GET_MODE_SIZE (mode)))
3502       && REG_P (index)
3503       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3504     {
3505       info->type = type;
3506       info->offset = index;
3507       info->shift = shift;
3508       return true;
3509     }
3510
3511   return false;
3512 }
3513
3514 bool
3515 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3516 {
3517   return (offset >= -64 * GET_MODE_SIZE (mode)
3518           && offset < 64 * GET_MODE_SIZE (mode)
3519           && offset % GET_MODE_SIZE (mode) == 0);
3520 }
3521
3522 static inline bool
3523 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3524                                HOST_WIDE_INT offset)
3525 {
3526   return offset >= -256 && offset < 256;
3527 }
3528
3529 static inline bool
3530 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3531 {
3532   return (offset >= 0
3533           && offset < 4096 * GET_MODE_SIZE (mode)
3534           && offset % GET_MODE_SIZE (mode) == 0);
3535 }
3536
3537 /* Return true if X is a valid address for machine mode MODE.  If it is,
3538    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3539    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3540
3541 static bool
3542 aarch64_classify_address (struct aarch64_address_info *info,
3543                           rtx x, machine_mode mode,
3544                           RTX_CODE outer_code, bool strict_p)
3545 {
3546   enum rtx_code code = GET_CODE (x);
3547   rtx op0, op1;
3548
3549   /* On BE, we use load/store pair for all large int mode load/stores.  */
3550   bool load_store_pair_p = (outer_code == PARALLEL
3551                             || (BYTES_BIG_ENDIAN
3552                                 && aarch64_vect_struct_mode_p (mode)));
3553
3554   bool allow_reg_index_p =
3555     !load_store_pair_p
3556     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3557     && !aarch64_vect_struct_mode_p (mode);
3558
3559   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3560      REG addressing.  */
3561   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3562       && (code != POST_INC && code != REG))
3563     return false;
3564
3565   switch (code)
3566     {
3567     case REG:
3568     case SUBREG:
3569       info->type = ADDRESS_REG_IMM;
3570       info->base = x;
3571       info->offset = const0_rtx;
3572       return aarch64_base_register_rtx_p (x, strict_p);
3573
3574     case PLUS:
3575       op0 = XEXP (x, 0);
3576       op1 = XEXP (x, 1);
3577
3578       if (! strict_p
3579           && REG_P (op0)
3580           && (op0 == virtual_stack_vars_rtx
3581               || op0 == frame_pointer_rtx
3582               || op0 == arg_pointer_rtx)
3583           && CONST_INT_P (op1))
3584         {
3585           info->type = ADDRESS_REG_IMM;
3586           info->base = op0;
3587           info->offset = op1;
3588
3589           return true;
3590         }
3591
3592       if (GET_MODE_SIZE (mode) != 0
3593           && CONST_INT_P (op1)
3594           && aarch64_base_register_rtx_p (op0, strict_p))
3595         {
3596           HOST_WIDE_INT offset = INTVAL (op1);
3597
3598           info->type = ADDRESS_REG_IMM;
3599           info->base = op0;
3600           info->offset = op1;
3601
3602           /* TImode and TFmode values are allowed in both pairs of X
3603              registers and individual Q registers.  The available
3604              address modes are:
3605              X,X: 7-bit signed scaled offset
3606              Q:   9-bit signed offset
3607              We conservatively require an offset representable in either mode.
3608            */
3609           if (mode == TImode || mode == TFmode)
3610             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3611                     && offset_9bit_signed_unscaled_p (mode, offset));
3612
3613           /* A 7bit offset check because OImode will emit a ldp/stp
3614              instruction (only big endian will get here).
3615              For ldp/stp instructions, the offset is scaled for the size of a
3616              single element of the pair.  */
3617           if (mode == OImode)
3618             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3619
3620           /* Three 9/12 bit offsets checks because CImode will emit three
3621              ldr/str instructions (only big endian will get here).  */
3622           if (mode == CImode)
3623             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3624                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3625                         || offset_12bit_unsigned_scaled_p (V16QImode,
3626                                                            offset + 32)));
3627
3628           /* Two 7bit offsets checks because XImode will emit two ldp/stp
3629              instructions (only big endian will get here).  */
3630           if (mode == XImode)
3631             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3632                     && aarch64_offset_7bit_signed_scaled_p (TImode,
3633                                                             offset + 32));
3634
3635           if (load_store_pair_p)
3636             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3637                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3638           else
3639             return (offset_9bit_signed_unscaled_p (mode, offset)
3640                     || offset_12bit_unsigned_scaled_p (mode, offset));
3641         }
3642
3643       if (allow_reg_index_p)
3644         {
3645           /* Look for base + (scaled/extended) index register.  */
3646           if (aarch64_base_register_rtx_p (op0, strict_p)
3647               && aarch64_classify_index (info, op1, mode, strict_p))
3648             {
3649               info->base = op0;
3650               return true;
3651             }
3652           if (aarch64_base_register_rtx_p (op1, strict_p)
3653               && aarch64_classify_index (info, op0, mode, strict_p))
3654             {
3655               info->base = op1;
3656               return true;
3657             }
3658         }
3659
3660       return false;
3661
3662     case POST_INC:
3663     case POST_DEC:
3664     case PRE_INC:
3665     case PRE_DEC:
3666       info->type = ADDRESS_REG_WB;
3667       info->base = XEXP (x, 0);
3668       info->offset = NULL_RTX;
3669       return aarch64_base_register_rtx_p (info->base, strict_p);
3670
3671     case POST_MODIFY:
3672     case PRE_MODIFY:
3673       info->type = ADDRESS_REG_WB;
3674       info->base = XEXP (x, 0);
3675       if (GET_CODE (XEXP (x, 1)) == PLUS
3676           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3677           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3678           && aarch64_base_register_rtx_p (info->base, strict_p))
3679         {
3680           HOST_WIDE_INT offset;
3681           info->offset = XEXP (XEXP (x, 1), 1);
3682           offset = INTVAL (info->offset);
3683
3684           /* TImode and TFmode values are allowed in both pairs of X
3685              registers and individual Q registers.  The available
3686              address modes are:
3687              X,X: 7-bit signed scaled offset
3688              Q:   9-bit signed offset
3689              We conservatively require an offset representable in either mode.
3690            */
3691           if (mode == TImode || mode == TFmode)
3692             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3693                     && offset_9bit_signed_unscaled_p (mode, offset));
3694
3695           if (load_store_pair_p)
3696             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3697                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3698           else
3699             return offset_9bit_signed_unscaled_p (mode, offset);
3700         }
3701       return false;
3702
3703     case CONST:
3704     case SYMBOL_REF:
3705     case LABEL_REF:
3706       /* load literal: pc-relative constant pool entry.  Only supported
3707          for SI mode or larger.  */
3708       info->type = ADDRESS_SYMBOLIC;
3709
3710       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3711         {
3712           rtx sym, addend;
3713
3714           split_const (x, &sym, &addend);
3715           return (GET_CODE (sym) == LABEL_REF
3716                   || (GET_CODE (sym) == SYMBOL_REF
3717                       && CONSTANT_POOL_ADDRESS_P (sym)));
3718         }
3719       return false;
3720
3721     case LO_SUM:
3722       info->type = ADDRESS_LO_SUM;
3723       info->base = XEXP (x, 0);
3724       info->offset = XEXP (x, 1);
3725       if (allow_reg_index_p
3726           && aarch64_base_register_rtx_p (info->base, strict_p))
3727         {
3728           rtx sym, offs;
3729           split_const (info->offset, &sym, &offs);
3730           if (GET_CODE (sym) == SYMBOL_REF
3731               && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3732                   == SYMBOL_SMALL_ABSOLUTE))
3733             {
3734               /* The symbol and offset must be aligned to the access size.  */
3735               unsigned int align;
3736               unsigned int ref_size;
3737
3738               if (CONSTANT_POOL_ADDRESS_P (sym))
3739                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3740               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3741                 {
3742                   tree exp = SYMBOL_REF_DECL (sym);
3743                   align = TYPE_ALIGN (TREE_TYPE (exp));
3744                   align = CONSTANT_ALIGNMENT (exp, align);
3745                 }
3746               else if (SYMBOL_REF_DECL (sym))
3747                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3748               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3749                        && SYMBOL_REF_BLOCK (sym) != NULL)
3750                 align = SYMBOL_REF_BLOCK (sym)->alignment;
3751               else
3752                 align = BITS_PER_UNIT;
3753
3754               ref_size = GET_MODE_SIZE (mode);
3755               if (ref_size == 0)
3756                 ref_size = GET_MODE_SIZE (DImode);
3757
3758               return ((INTVAL (offs) & (ref_size - 1)) == 0
3759                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3760             }
3761         }
3762       return false;
3763
3764     default:
3765       return false;
3766     }
3767 }
3768
3769 bool
3770 aarch64_symbolic_address_p (rtx x)
3771 {
3772   rtx offset;
3773
3774   split_const (x, &x, &offset);
3775   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3776 }
3777
3778 /* Classify the base of symbolic expression X, given that X appears in
3779    context CONTEXT.  */
3780
3781 enum aarch64_symbol_type
3782 aarch64_classify_symbolic_expression (rtx x,
3783                                       enum aarch64_symbol_context context)
3784 {
3785   rtx offset;
3786
3787   split_const (x, &x, &offset);
3788   return aarch64_classify_symbol (x, offset, context);
3789 }
3790
3791
3792 /* Return TRUE if X is a legitimate address for accessing memory in
3793    mode MODE.  */
3794 static bool
3795 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3796 {
3797   struct aarch64_address_info addr;
3798
3799   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3800 }
3801
3802 /* Return TRUE if X is a legitimate address for accessing memory in
3803    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
3804    pair operation.  */
3805 bool
3806 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3807                               RTX_CODE outer_code, bool strict_p)
3808 {
3809   struct aarch64_address_info addr;
3810
3811   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3812 }
3813
3814 /* Return TRUE if rtx X is immediate constant 0.0 */
3815 bool
3816 aarch64_float_const_zero_rtx_p (rtx x)
3817 {
3818   REAL_VALUE_TYPE r;
3819
3820   if (GET_MODE (x) == VOIDmode)
3821     return false;
3822
3823   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3824   if (REAL_VALUE_MINUS_ZERO (r))
3825     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3826   return REAL_VALUES_EQUAL (r, dconst0);
3827 }
3828
3829 /* Return the fixed registers used for condition codes.  */
3830
3831 static bool
3832 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3833 {
3834   *p1 = CC_REGNUM;
3835   *p2 = INVALID_REGNUM;
3836   return true;
3837 }
3838
3839 /* Emit call insn with PAT and do aarch64-specific handling.  */
3840
3841 void
3842 aarch64_emit_call_insn (rtx pat)
3843 {
3844   rtx insn = emit_call_insn (pat);
3845
3846   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3847   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3848   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3849 }
3850
3851 machine_mode
3852 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3853 {
3854   /* All floating point compares return CCFP if it is an equality
3855      comparison, and CCFPE otherwise.  */
3856   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3857     {
3858       switch (code)
3859         {
3860         case EQ:
3861         case NE:
3862         case UNORDERED:
3863         case ORDERED:
3864         case UNLT:
3865         case UNLE:
3866         case UNGT:
3867         case UNGE:
3868         case UNEQ:
3869         case LTGT:
3870           return CCFPmode;
3871
3872         case LT:
3873         case LE:
3874         case GT:
3875         case GE:
3876           return CCFPEmode;
3877
3878         default:
3879           gcc_unreachable ();
3880         }
3881     }
3882
3883   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3884       && y == const0_rtx
3885       && (code == EQ || code == NE || code == LT || code == GE)
3886       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3887           || GET_CODE (x) == NEG))
3888     return CC_NZmode;
3889
3890   /* A compare with a shifted operand.  Because of canonicalization,
3891      the comparison will have to be swapped when we emit the assembly
3892      code.  */
3893   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3894       && (REG_P (y) || GET_CODE (y) == SUBREG)
3895       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3896           || GET_CODE (x) == LSHIFTRT
3897           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3898     return CC_SWPmode;
3899
3900   /* Similarly for a negated operand, but we can only do this for
3901      equalities.  */
3902   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3903       && (REG_P (y) || GET_CODE (y) == SUBREG)
3904       && (code == EQ || code == NE)
3905       && GET_CODE (x) == NEG)
3906     return CC_Zmode;
3907
3908   /* A compare of a mode narrower than SI mode against zero can be done
3909      by extending the value in the comparison.  */
3910   if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3911       && y == const0_rtx)
3912     /* Only use sign-extension if we really need it.  */
3913     return ((code == GT || code == GE || code == LE || code == LT)
3914             ? CC_SESWPmode : CC_ZESWPmode);
3915
3916   /* For everything else, return CCmode.  */
3917   return CCmode;
3918 }
3919
3920 static int
3921 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3922
3923 int
3924 aarch64_get_condition_code (rtx x)
3925 {
3926   machine_mode mode = GET_MODE (XEXP (x, 0));
3927   enum rtx_code comp_code = GET_CODE (x);
3928
3929   if (GET_MODE_CLASS (mode) != MODE_CC)
3930     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3931   return aarch64_get_condition_code_1 (mode, comp_code);
3932 }
3933
3934 static int
3935 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3936 {
3937   int ne = -1, eq = -1;
3938   switch (mode)
3939     {
3940     case CCFPmode:
3941     case CCFPEmode:
3942       switch (comp_code)
3943         {
3944         case GE: return AARCH64_GE;
3945         case GT: return AARCH64_GT;
3946         case LE: return AARCH64_LS;
3947         case LT: return AARCH64_MI;
3948         case NE: return AARCH64_NE;
3949         case EQ: return AARCH64_EQ;
3950         case ORDERED: return AARCH64_VC;
3951         case UNORDERED: return AARCH64_VS;
3952         case UNLT: return AARCH64_LT;
3953         case UNLE: return AARCH64_LE;
3954         case UNGT: return AARCH64_HI;
3955         case UNGE: return AARCH64_PL;
3956         default: return -1;
3957         }
3958       break;
3959
3960     case CC_DNEmode:
3961       ne = AARCH64_NE;
3962       eq = AARCH64_EQ;
3963       break;
3964
3965     case CC_DEQmode:
3966       ne = AARCH64_EQ;
3967       eq = AARCH64_NE;
3968       break;
3969
3970     case CC_DGEmode:
3971       ne = AARCH64_GE;
3972       eq = AARCH64_LT;
3973       break;
3974
3975     case CC_DLTmode:
3976       ne = AARCH64_LT;
3977       eq = AARCH64_GE;
3978       break;
3979
3980     case CC_DGTmode:
3981       ne = AARCH64_GT;
3982       eq = AARCH64_LE;
3983       break;
3984
3985     case CC_DLEmode:
3986       ne = AARCH64_LE;
3987       eq = AARCH64_GT;
3988       break;
3989
3990     case CC_DGEUmode:
3991       ne = AARCH64_CS;
3992       eq = AARCH64_CC;
3993       break;
3994
3995     case CC_DLTUmode:
3996       ne = AARCH64_CC;
3997       eq = AARCH64_CS;
3998       break;
3999
4000     case CC_DGTUmode:
4001       ne = AARCH64_HI;
4002       eq = AARCH64_LS;
4003       break;
4004
4005     case CC_DLEUmode:
4006       ne = AARCH64_LS;
4007       eq = AARCH64_HI;
4008       break;
4009
4010     case CCmode:
4011       switch (comp_code)
4012         {
4013         case NE: return AARCH64_NE;
4014         case EQ: return AARCH64_EQ;
4015         case GE: return AARCH64_GE;
4016         case GT: return AARCH64_GT;
4017         case LE: return AARCH64_LE;
4018         case LT: return AARCH64_LT;
4019         case GEU: return AARCH64_CS;
4020         case GTU: return AARCH64_HI;
4021         case LEU: return AARCH64_LS;
4022         case LTU: return AARCH64_CC;
4023         default: return -1;
4024         }
4025       break;
4026
4027     case CC_SWPmode:
4028     case CC_ZESWPmode:
4029     case CC_SESWPmode:
4030       switch (comp_code)
4031         {
4032         case NE: return AARCH64_NE;
4033         case EQ: return AARCH64_EQ;
4034         case GE: return AARCH64_LE;
4035         case GT: return AARCH64_LT;
4036         case LE: return AARCH64_GE;
4037         case LT: return AARCH64_GT;
4038         case GEU: return AARCH64_LS;
4039         case GTU: return AARCH64_CC;
4040         case LEU: return AARCH64_CS;
4041         case LTU: return AARCH64_HI;
4042         default: return -1;
4043         }
4044       break;
4045
4046     case CC_NZmode:
4047       switch (comp_code)
4048         {
4049         case NE: return AARCH64_NE;
4050         case EQ: return AARCH64_EQ;
4051         case GE: return AARCH64_PL;
4052         case LT: return AARCH64_MI;
4053         default: return -1;
4054         }
4055       break;
4056
4057     case CC_Zmode:
4058       switch (comp_code)
4059         {
4060         case NE: return AARCH64_NE;
4061         case EQ: return AARCH64_EQ;
4062         default: return -1;
4063         }
4064       break;
4065
4066     default:
4067       return -1;
4068       break;
4069     }
4070
4071   if (comp_code == NE)
4072     return ne;
4073
4074   if (comp_code == EQ)
4075     return eq;
4076
4077   return -1;
4078 }
4079
4080 bool
4081 aarch64_const_vec_all_same_in_range_p (rtx x,
4082                                   HOST_WIDE_INT minval,
4083                                   HOST_WIDE_INT maxval)
4084 {
4085   HOST_WIDE_INT firstval;
4086   int count, i;
4087
4088   if (GET_CODE (x) != CONST_VECTOR
4089       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4090     return false;
4091
4092   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4093   if (firstval < minval || firstval > maxval)
4094     return false;
4095
4096   count = CONST_VECTOR_NUNITS (x);
4097   for (i = 1; i < count; i++)
4098     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4099       return false;
4100
4101   return true;
4102 }
4103
4104 bool
4105 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4106 {
4107   return aarch64_const_vec_all_same_in_range_p (x, val, val);
4108 }
4109
4110 static unsigned
4111 bit_count (unsigned HOST_WIDE_INT value)
4112 {
4113   unsigned count = 0;
4114
4115   while (value)
4116     {
4117       count++;
4118       value &= value - 1;
4119     }
4120
4121   return count;
4122 }
4123
4124 /* N Z C V.  */
4125 #define AARCH64_CC_V 1
4126 #define AARCH64_CC_C (1 << 1)
4127 #define AARCH64_CC_Z (1 << 2)
4128 #define AARCH64_CC_N (1 << 3)
4129
4130 /* N Z C V flags for ccmp.  The first code is for AND op and the other
4131    is for IOR op.  Indexed by AARCH64_COND_CODE.  */
4132 static const int aarch64_nzcv_codes[][2] =
4133 {
4134   {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
4135   {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
4136   {AARCH64_CC_C, 0}, /* CS, C == 1.  */
4137   {0, AARCH64_CC_C}, /* CC, C == 0.  */
4138   {AARCH64_CC_N, 0}, /* MI, N == 1.  */
4139   {0, AARCH64_CC_N}, /* PL, N == 0.  */
4140   {AARCH64_CC_V, 0}, /* VS, V == 1.  */
4141   {0, AARCH64_CC_V}, /* VC, V == 0.  */
4142   {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
4143   {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
4144   {0, AARCH64_CC_V}, /* GE, N == V.  */
4145   {AARCH64_CC_V, 0}, /* LT, N != V.  */
4146   {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
4147   {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
4148   {0, 0}, /* AL, Any.  */
4149   {0, 0}, /* NV, Any.  */
4150 };
4151
4152 int
4153 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4154 {
4155   switch (mode)
4156     {
4157     case CC_DNEmode:
4158       return NE;
4159
4160     case CC_DEQmode:
4161       return EQ;
4162
4163     case CC_DLEmode:
4164       return LE;
4165
4166     case CC_DGTmode:
4167       return GT;
4168
4169     case CC_DLTmode:
4170       return LT;
4171
4172     case CC_DGEmode:
4173       return GE;
4174
4175     case CC_DLEUmode:
4176       return LEU;
4177
4178     case CC_DGTUmode:
4179       return GTU;
4180
4181     case CC_DLTUmode:
4182       return LTU;
4183
4184     case CC_DGEUmode:
4185       return GEU;
4186
4187     default:
4188       gcc_unreachable ();
4189     }
4190 }
4191
4192
4193 void
4194 aarch64_print_operand (FILE *f, rtx x, char code)
4195 {
4196   switch (code)
4197     {
4198     /* An integer or symbol address without a preceding # sign.  */
4199     case 'c':
4200       switch (GET_CODE (x))
4201         {
4202         case CONST_INT:
4203           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4204           break;
4205
4206         case SYMBOL_REF:
4207           output_addr_const (f, x);
4208           break;
4209
4210         case CONST:
4211           if (GET_CODE (XEXP (x, 0)) == PLUS
4212               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4213             {
4214               output_addr_const (f, x);
4215               break;
4216             }
4217           /* Fall through.  */
4218
4219         default:
4220           output_operand_lossage ("Unsupported operand for code '%c'", code);
4221         }
4222       break;
4223
4224     case 'e':
4225       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4226       {
4227         int n;
4228
4229         if (!CONST_INT_P (x)
4230             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4231           {
4232             output_operand_lossage ("invalid operand for '%%%c'", code);
4233             return;
4234           }
4235
4236         switch (n)
4237           {
4238           case 3:
4239             fputc ('b', f);
4240             break;
4241           case 4:
4242             fputc ('h', f);
4243             break;
4244           case 5:
4245             fputc ('w', f);
4246             break;
4247           default:
4248             output_operand_lossage ("invalid operand for '%%%c'", code);
4249             return;
4250           }
4251       }
4252       break;
4253
4254     case 'p':
4255       {
4256         int n;
4257
4258         /* Print N such that 2^N == X.  */
4259         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4260           {
4261             output_operand_lossage ("invalid operand for '%%%c'", code);
4262             return;
4263           }
4264
4265         asm_fprintf (f, "%d", n);
4266       }
4267       break;
4268
4269     case 'P':
4270       /* Print the number of non-zero bits in X (a const_int).  */
4271       if (!CONST_INT_P (x))
4272         {
4273           output_operand_lossage ("invalid operand for '%%%c'", code);
4274           return;
4275         }
4276
4277       asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4278       break;
4279
4280     case 'H':
4281       /* Print the higher numbered register of a pair (TImode) of regs.  */
4282       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4283         {
4284           output_operand_lossage ("invalid operand for '%%%c'", code);
4285           return;
4286         }
4287
4288       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4289       break;
4290
4291     case 'm':
4292       {
4293         int cond_code;
4294         /* Print a condition (eq, ne, etc).  */
4295
4296         /* CONST_TRUE_RTX means always -- that's the default.  */
4297         if (x == const_true_rtx)
4298           return;
4299
4300         if (!COMPARISON_P (x))
4301           {
4302             output_operand_lossage ("invalid operand for '%%%c'", code);
4303             return;
4304           }
4305
4306         cond_code = aarch64_get_condition_code (x);
4307         gcc_assert (cond_code >= 0);
4308         fputs (aarch64_condition_codes[cond_code], f);
4309       }
4310       break;
4311
4312     case 'M':
4313       {
4314         int cond_code;
4315         /* Print the inverse of a condition (eq <-> ne, etc).  */
4316
4317         /* CONST_TRUE_RTX means never -- that's the default.  */
4318         if (x == const_true_rtx)
4319           {
4320             fputs ("nv", f);
4321             return;
4322           }
4323
4324         if (!COMPARISON_P (x))
4325           {
4326             output_operand_lossage ("invalid operand for '%%%c'", code);
4327             return;
4328           }
4329         cond_code = aarch64_get_condition_code (x);
4330         gcc_assert (cond_code >= 0);
4331         fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4332                                        (cond_code)], f);
4333       }
4334       break;
4335
4336     case 'b':
4337     case 'h':
4338     case 's':
4339     case 'd':
4340     case 'q':
4341       /* Print a scalar FP/SIMD register name.  */
4342       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4343         {
4344           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4345           return;
4346         }
4347       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4348       break;
4349
4350     case 'S':
4351     case 'T':
4352     case 'U':
4353     case 'V':
4354       /* Print the first FP/SIMD register name in a list.  */
4355       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4356         {
4357           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4358           return;
4359         }
4360       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4361       break;
4362
4363     case 'R':
4364       /* Print a scalar FP/SIMD register name + 1.  */
4365       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4366         {
4367           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4368           return;
4369         }
4370       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4371       break;
4372
4373     case 'X':
4374       /* Print bottom 16 bits of integer constant in hex.  */
4375       if (!CONST_INT_P (x))
4376         {
4377           output_operand_lossage ("invalid operand for '%%%c'", code);
4378           return;
4379         }
4380       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4381       break;
4382
4383     case 'w':
4384     case 'x':
4385       /* Print a general register name or the zero register (32-bit or
4386          64-bit).  */
4387       if (x == const0_rtx
4388           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4389         {
4390           asm_fprintf (f, "%czr", code);
4391           break;
4392         }
4393
4394       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4395         {
4396           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4397           break;
4398         }
4399
4400       if (REG_P (x) && REGNO (x) == SP_REGNUM)
4401         {
4402           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4403           break;
4404         }
4405
4406       /* Fall through */
4407
4408     case 0:
4409       /* Print a normal operand, if it's a general register, then we
4410          assume DImode.  */
4411       if (x == NULL)
4412         {
4413           output_operand_lossage ("missing operand");
4414           return;
4415         }
4416
4417       switch (GET_CODE (x))
4418         {
4419         case REG:
4420           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4421           break;
4422
4423         case MEM:
4424           aarch64_memory_reference_mode = GET_MODE (x);
4425           output_address (XEXP (x, 0));
4426           break;
4427
4428         case LABEL_REF:
4429         case SYMBOL_REF:
4430           output_addr_const (asm_out_file, x);
4431           break;
4432
4433         case CONST_INT:
4434           asm_fprintf (f, "%wd", INTVAL (x));
4435           break;
4436
4437         case CONST_VECTOR:
4438           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4439             {
4440               gcc_assert (
4441                   aarch64_const_vec_all_same_in_range_p (x,
4442                                                          HOST_WIDE_INT_MIN,
4443                                                          HOST_WIDE_INT_MAX));
4444               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4445             }
4446           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4447             {
4448               fputc ('0', f);
4449             }
4450           else
4451             gcc_unreachable ();
4452           break;
4453
4454         case CONST_DOUBLE:
4455           /* CONST_DOUBLE can represent a double-width integer.
4456              In this case, the mode of x is VOIDmode.  */
4457           if (GET_MODE (x) == VOIDmode)
4458             ; /* Do Nothing.  */
4459           else if (aarch64_float_const_zero_rtx_p (x))
4460             {
4461               fputc ('0', f);
4462               break;
4463             }
4464           else if (aarch64_float_const_representable_p (x))
4465             {
4466 #define buf_size 20
4467               char float_buf[buf_size] = {'\0'};
4468               REAL_VALUE_TYPE r;
4469               REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4470               real_to_decimal_for_mode (float_buf, &r,
4471                                         buf_size, buf_size,
4472                                         1, GET_MODE (x));
4473               asm_fprintf (asm_out_file, "%s", float_buf);
4474               break;
4475 #undef buf_size
4476             }
4477           output_operand_lossage ("invalid constant");
4478           return;
4479         default:
4480           output_operand_lossage ("invalid operand");
4481           return;
4482         }
4483       break;
4484
4485     case 'A':
4486       if (GET_CODE (x) == HIGH)
4487         x = XEXP (x, 0);
4488
4489       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4490         {
4491         case SYMBOL_SMALL_GOT_4G:
4492           asm_fprintf (asm_out_file, ":got:");
4493           break;
4494
4495         case SYMBOL_SMALL_TLSGD:
4496           asm_fprintf (asm_out_file, ":tlsgd:");
4497           break;
4498
4499         case SYMBOL_SMALL_TLSDESC:
4500           asm_fprintf (asm_out_file, ":tlsdesc:");
4501           break;
4502
4503         case SYMBOL_SMALL_GOTTPREL:
4504           asm_fprintf (asm_out_file, ":gottprel:");
4505           break;
4506
4507         case SYMBOL_TLSLE:
4508           asm_fprintf (asm_out_file, ":tprel:");
4509           break;
4510
4511         case SYMBOL_TINY_GOT:
4512           gcc_unreachable ();
4513           break;
4514
4515         default:
4516           break;
4517         }
4518       output_addr_const (asm_out_file, x);
4519       break;
4520
4521     case 'L':
4522       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4523         {
4524         case SYMBOL_SMALL_GOT_4G:
4525           asm_fprintf (asm_out_file, ":lo12:");
4526           break;
4527
4528         case SYMBOL_SMALL_TLSGD:
4529           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4530           break;
4531
4532         case SYMBOL_SMALL_TLSDESC:
4533           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4534           break;
4535
4536         case SYMBOL_SMALL_GOTTPREL:
4537           asm_fprintf (asm_out_file, ":gottprel_lo12:");
4538           break;
4539
4540         case SYMBOL_TLSLE:
4541           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4542           break;
4543
4544         case SYMBOL_TINY_GOT:
4545           asm_fprintf (asm_out_file, ":got:");
4546           break;
4547
4548         default:
4549           break;
4550         }
4551       output_addr_const (asm_out_file, x);
4552       break;
4553
4554     case 'G':
4555
4556       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4557         {
4558         case SYMBOL_TLSLE:
4559           asm_fprintf (asm_out_file, ":tprel_hi12:");
4560           break;
4561         default:
4562           break;
4563         }
4564       output_addr_const (asm_out_file, x);
4565       break;
4566
4567     case 'K':
4568       {
4569         int cond_code;
4570         /* Print nzcv.  */
4571
4572         if (!COMPARISON_P (x))
4573           {
4574             output_operand_lossage ("invalid operand for '%%%c'", code);
4575             return;
4576           }
4577
4578         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4579         gcc_assert (cond_code >= 0);
4580         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4581       }
4582       break;
4583
4584     case 'k':
4585       {
4586         int cond_code;
4587         /* Print nzcv.  */
4588
4589         if (!COMPARISON_P (x))
4590           {
4591             output_operand_lossage ("invalid operand for '%%%c'", code);
4592             return;
4593           }
4594
4595         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4596         gcc_assert (cond_code >= 0);
4597         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4598       }
4599       break;
4600
4601     default:
4602       output_operand_lossage ("invalid operand prefix '%%%c'", code);
4603       return;
4604     }
4605 }
4606
4607 void
4608 aarch64_print_operand_address (FILE *f, rtx x)
4609 {
4610   struct aarch64_address_info addr;
4611
4612   if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4613                              MEM, true))
4614     switch (addr.type)
4615       {
4616       case ADDRESS_REG_IMM:
4617         if (addr.offset == const0_rtx)
4618           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4619         else
4620           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4621                        INTVAL (addr.offset));
4622         return;
4623
4624       case ADDRESS_REG_REG:
4625         if (addr.shift == 0)
4626           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4627                        reg_names [REGNO (addr.offset)]);
4628         else
4629           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4630                        reg_names [REGNO (addr.offset)], addr.shift);
4631         return;
4632
4633       case ADDRESS_REG_UXTW:
4634         if (addr.shift == 0)
4635           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4636                        REGNO (addr.offset) - R0_REGNUM);
4637         else
4638           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4639                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4640         return;
4641
4642       case ADDRESS_REG_SXTW:
4643         if (addr.shift == 0)
4644           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4645                        REGNO (addr.offset) - R0_REGNUM);
4646         else
4647           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4648                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4649         return;
4650
4651       case ADDRESS_REG_WB:
4652         switch (GET_CODE (x))
4653           {
4654           case PRE_INC:
4655             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4656                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4657             return;
4658           case POST_INC:
4659             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4660                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4661             return;
4662           case PRE_DEC:
4663             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4664                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4665             return;
4666           case POST_DEC:
4667             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4668                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4669             return;
4670           case PRE_MODIFY:
4671             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4672                          INTVAL (addr.offset));
4673             return;
4674           case POST_MODIFY:
4675             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4676                          INTVAL (addr.offset));
4677             return;
4678           default:
4679             break;
4680           }
4681         break;
4682
4683       case ADDRESS_LO_SUM:
4684         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4685         output_addr_const (f, addr.offset);
4686         asm_fprintf (f, "]");
4687         return;
4688
4689       case ADDRESS_SYMBOLIC:
4690         break;
4691       }
4692
4693   output_addr_const (f, x);
4694 }
4695
4696 bool
4697 aarch64_label_mentioned_p (rtx x)
4698 {
4699   const char *fmt;
4700   int i;
4701
4702   if (GET_CODE (x) == LABEL_REF)
4703     return true;
4704
4705   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4706      referencing instruction, but they are constant offsets, not
4707      symbols.  */
4708   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4709     return false;
4710
4711   fmt = GET_RTX_FORMAT (GET_CODE (x));
4712   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4713     {
4714       if (fmt[i] == 'E')
4715         {
4716           int j;
4717
4718           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4719             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4720               return 1;
4721         }
4722       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4723         return 1;
4724     }
4725
4726   return 0;
4727 }
4728
4729 /* Implement REGNO_REG_CLASS.  */
4730
4731 enum reg_class
4732 aarch64_regno_regclass (unsigned regno)
4733 {
4734   if (GP_REGNUM_P (regno))
4735     return GENERAL_REGS;
4736
4737   if (regno == SP_REGNUM)
4738     return STACK_REG;
4739
4740   if (regno == FRAME_POINTER_REGNUM
4741       || regno == ARG_POINTER_REGNUM)
4742     return POINTER_REGS;
4743
4744   if (FP_REGNUM_P (regno))
4745     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
4746
4747   return NO_REGS;
4748 }
4749
4750 static rtx
4751 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
4752 {
4753   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4754      where mask is selected by alignment and size of the offset.
4755      We try to pick as large a range for the offset as possible to
4756      maximize the chance of a CSE.  However, for aligned addresses
4757      we limit the range to 4k so that structures with different sized
4758      elements are likely to use the same base.  */
4759
4760   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4761     {
4762       HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4763       HOST_WIDE_INT base_offset;
4764
4765       /* Does it look like we'll need a load/store-pair operation?  */
4766       if (GET_MODE_SIZE (mode) > 16
4767           || mode == TImode)
4768         base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4769                        & ~((128 * GET_MODE_SIZE (mode)) - 1));
4770       /* For offsets aren't a multiple of the access size, the limit is
4771          -256...255.  */
4772       else if (offset & (GET_MODE_SIZE (mode) - 1))
4773         base_offset = (offset + 0x100) & ~0x1ff;
4774       else
4775         base_offset = offset & ~0xfff;
4776
4777       if (base_offset == 0)
4778         return x;
4779
4780       offset -= base_offset;
4781       rtx base_reg = gen_reg_rtx (Pmode);
4782       rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4783                            NULL_RTX);
4784       emit_move_insn (base_reg, val);
4785       x = plus_constant (Pmode, base_reg, offset);
4786     }
4787
4788   return x;
4789 }
4790
4791 /* Try a machine-dependent way of reloading an illegitimate address
4792    operand.  If we find one, push the reload and return the new rtx.  */
4793
4794 rtx
4795 aarch64_legitimize_reload_address (rtx *x_p,
4796                                    machine_mode mode,
4797                                    int opnum, int type,
4798                                    int ind_levels ATTRIBUTE_UNUSED)
4799 {
4800   rtx x = *x_p;
4801
4802   /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
4803   if (aarch64_vect_struct_mode_p (mode)
4804       && GET_CODE (x) == PLUS
4805       && REG_P (XEXP (x, 0))
4806       && CONST_INT_P (XEXP (x, 1)))
4807     {
4808       rtx orig_rtx = x;
4809       x = copy_rtx (x);
4810       push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4811                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4812                    opnum, (enum reload_type) type);
4813       return x;
4814     }
4815
4816   /* We must recognize output that we have already generated ourselves.  */
4817   if (GET_CODE (x) == PLUS
4818       && GET_CODE (XEXP (x, 0)) == PLUS
4819       && REG_P (XEXP (XEXP (x, 0), 0))
4820       && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4821       && CONST_INT_P (XEXP (x, 1)))
4822     {
4823       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4824                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4825                    opnum, (enum reload_type) type);
4826       return x;
4827     }
4828
4829   /* We wish to handle large displacements off a base register by splitting
4830      the addend across an add and the mem insn.  This can cut the number of
4831      extra insns needed from 3 to 1.  It is only useful for load/store of a
4832      single register with 12 bit offset field.  */
4833   if (GET_CODE (x) == PLUS
4834       && REG_P (XEXP (x, 0))
4835       && CONST_INT_P (XEXP (x, 1))
4836       && HARD_REGISTER_P (XEXP (x, 0))
4837       && mode != TImode
4838       && mode != TFmode
4839       && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4840     {
4841       HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4842       HOST_WIDE_INT low = val & 0xfff;
4843       HOST_WIDE_INT high = val - low;
4844       HOST_WIDE_INT offs;
4845       rtx cst;
4846       machine_mode xmode = GET_MODE (x);
4847
4848       /* In ILP32, xmode can be either DImode or SImode.  */
4849       gcc_assert (xmode == DImode || xmode == SImode);
4850
4851       /* Reload non-zero BLKmode offsets.  This is because we cannot ascertain
4852          BLKmode alignment.  */
4853       if (GET_MODE_SIZE (mode) == 0)
4854         return NULL_RTX;
4855
4856       offs = low % GET_MODE_SIZE (mode);
4857
4858       /* Align misaligned offset by adjusting high part to compensate.  */
4859       if (offs != 0)
4860         {
4861           if (aarch64_uimm12_shift (high + offs))
4862             {
4863               /* Align down.  */
4864               low = low - offs;
4865               high = high + offs;
4866             }
4867           else
4868             {
4869               /* Align up.  */
4870               offs = GET_MODE_SIZE (mode) - offs;
4871               low = low + offs;
4872               high = high + (low & 0x1000) - offs;
4873               low &= 0xfff;
4874             }
4875         }
4876
4877       /* Check for overflow.  */
4878       if (high + low != val)
4879         return NULL_RTX;
4880
4881       cst = GEN_INT (high);
4882       if (!aarch64_uimm12_shift (high))
4883         cst = force_const_mem (xmode, cst);
4884
4885       /* Reload high part into base reg, leaving the low part
4886          in the mem instruction.
4887          Note that replacing this gen_rtx_PLUS with plus_constant is
4888          wrong in this case because we rely on the
4889          (plus (plus reg c1) c2) structure being preserved so that
4890          XEXP (*p, 0) in push_reload below uses the correct term.  */
4891       x = gen_rtx_PLUS (xmode,
4892                         gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4893                         GEN_INT (low));
4894
4895       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4896                    BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4897                    opnum, (enum reload_type) type);
4898       return x;
4899     }
4900
4901   return NULL_RTX;
4902 }
4903
4904
4905 static reg_class_t
4906 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4907                           reg_class_t rclass,
4908                           machine_mode mode,
4909                           secondary_reload_info *sri)
4910 {
4911   /* Without the TARGET_SIMD instructions we cannot move a Q register
4912      to a Q register directly.  We need a scratch.  */
4913   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4914       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4915       && reg_class_subset_p (rclass, FP_REGS))
4916     {
4917       if (mode == TFmode)
4918         sri->icode = CODE_FOR_aarch64_reload_movtf;
4919       else if (mode == TImode)
4920         sri->icode = CODE_FOR_aarch64_reload_movti;
4921       return NO_REGS;
4922     }
4923
4924   /* A TFmode or TImode memory access should be handled via an FP_REGS
4925      because AArch64 has richer addressing modes for LDR/STR instructions
4926      than LDP/STP instructions.  */
4927   if (TARGET_FLOAT && rclass == GENERAL_REGS
4928       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4929     return FP_REGS;
4930
4931   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4932       return GENERAL_REGS;
4933
4934   return NO_REGS;
4935 }
4936
4937 static bool
4938 aarch64_can_eliminate (const int from, const int to)
4939 {
4940   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4941      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
4942
4943   if (frame_pointer_needed)
4944     {
4945       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4946         return true;
4947       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4948         return false;
4949       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4950           && !cfun->calls_alloca)
4951         return true;
4952       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4953         return true;
4954
4955       return false;
4956     }
4957   else
4958     {
4959       /* If we decided that we didn't need a leaf frame pointer but then used
4960          LR in the function, then we'll want a frame pointer after all, so
4961          prevent this elimination to ensure a frame pointer is used.  */
4962       if (to == STACK_POINTER_REGNUM
4963           && flag_omit_leaf_frame_pointer
4964           && df_regs_ever_live_p (LR_REGNUM))
4965         return false;
4966     }
4967
4968   return true;
4969 }
4970
4971 HOST_WIDE_INT
4972 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4973 {
4974   aarch64_layout_frame ();
4975
4976   if (to == HARD_FRAME_POINTER_REGNUM)
4977     {
4978       if (from == ARG_POINTER_REGNUM)
4979         return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4980
4981       if (from == FRAME_POINTER_REGNUM)
4982         return (cfun->machine->frame.hard_fp_offset
4983                 - cfun->machine->frame.saved_varargs_size);
4984     }
4985
4986   if (to == STACK_POINTER_REGNUM)
4987     {
4988       if (from == FRAME_POINTER_REGNUM)
4989           return (cfun->machine->frame.frame_size
4990                   - cfun->machine->frame.saved_varargs_size);
4991     }
4992
4993   return cfun->machine->frame.frame_size;
4994 }
4995
4996 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
4997    previous frame.  */
4998
4999 rtx
5000 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5001 {
5002   if (count != 0)
5003     return const0_rtx;
5004   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5005 }
5006
5007
5008 static void
5009 aarch64_asm_trampoline_template (FILE *f)
5010 {
5011   if (TARGET_ILP32)
5012     {
5013       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5014       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5015     }
5016   else
5017     {
5018       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5019       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5020     }
5021   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5022   assemble_aligned_integer (4, const0_rtx);
5023   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5024   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5025 }
5026
5027 static void
5028 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5029 {
5030   rtx fnaddr, mem, a_tramp;
5031   const int tramp_code_sz = 16;
5032
5033   /* Don't need to copy the trailing D-words, we fill those in below.  */
5034   emit_block_move (m_tramp, assemble_trampoline_template (),
5035                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5036   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5037   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5038   if (GET_MODE (fnaddr) != ptr_mode)
5039     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5040   emit_move_insn (mem, fnaddr);
5041
5042   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5043   emit_move_insn (mem, chain_value);
5044
5045   /* XXX We should really define a "clear_cache" pattern and use
5046      gen_clear_cache().  */
5047   a_tramp = XEXP (m_tramp, 0);
5048   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5049                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5050                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5051                      ptr_mode);
5052 }
5053
5054 static unsigned char
5055 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5056 {
5057   switch (regclass)
5058     {
5059     case CALLER_SAVE_REGS:
5060     case POINTER_REGS:
5061     case GENERAL_REGS:
5062     case ALL_REGS:
5063     case FP_REGS:
5064     case FP_LO_REGS:
5065       return
5066         aarch64_vector_mode_p (mode)
5067           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5068           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5069     case STACK_REG:
5070       return 1;
5071
5072     case NO_REGS:
5073       return 0;
5074
5075     default:
5076       break;
5077     }
5078   gcc_unreachable ();
5079 }
5080
5081 static reg_class_t
5082 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5083 {
5084   if (regclass == POINTER_REGS)
5085     return GENERAL_REGS;
5086
5087   if (regclass == STACK_REG)
5088     {
5089       if (REG_P(x)
5090           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5091           return regclass;
5092
5093       return NO_REGS;
5094     }
5095
5096   /* If it's an integer immediate that MOVI can't handle, then
5097      FP_REGS is not an option, so we return NO_REGS instead.  */
5098   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5099       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5100     return NO_REGS;
5101
5102   /* Register eliminiation can result in a request for
5103      SP+constant->FP_REGS.  We cannot support such operations which
5104      use SP as source and an FP_REG as destination, so reject out
5105      right now.  */
5106   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5107     {
5108       rtx lhs = XEXP (x, 0);
5109
5110       /* Look through a possible SUBREG introduced by ILP32.  */
5111       if (GET_CODE (lhs) == SUBREG)
5112         lhs = SUBREG_REG (lhs);
5113
5114       gcc_assert (REG_P (lhs));
5115       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5116                                       POINTER_REGS));
5117       return NO_REGS;
5118     }
5119
5120   return regclass;
5121 }
5122
5123 void
5124 aarch64_asm_output_labelref (FILE* f, const char *name)
5125 {
5126   asm_fprintf (f, "%U%s", name);
5127 }
5128
5129 static void
5130 aarch64_elf_asm_constructor (rtx symbol, int priority)
5131 {
5132   if (priority == DEFAULT_INIT_PRIORITY)
5133     default_ctor_section_asm_out_constructor (symbol, priority);
5134   else
5135     {
5136       section *s;
5137       char buf[18];
5138       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5139       s = get_section (buf, SECTION_WRITE, NULL);
5140       switch_to_section (s);
5141       assemble_align (POINTER_SIZE);
5142       assemble_aligned_integer (POINTER_BYTES, symbol);
5143     }
5144 }
5145
5146 static void
5147 aarch64_elf_asm_destructor (rtx symbol, int priority)
5148 {
5149   if (priority == DEFAULT_INIT_PRIORITY)
5150     default_dtor_section_asm_out_destructor (symbol, priority);
5151   else
5152     {
5153       section *s;
5154       char buf[18];
5155       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5156       s = get_section (buf, SECTION_WRITE, NULL);
5157       switch_to_section (s);
5158       assemble_align (POINTER_SIZE);
5159       assemble_aligned_integer (POINTER_BYTES, symbol);
5160     }
5161 }
5162
5163 const char*
5164 aarch64_output_casesi (rtx *operands)
5165 {
5166   char buf[100];
5167   char label[100];
5168   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5169   int index;
5170   static const char *const patterns[4][2] =
5171   {
5172     {
5173       "ldrb\t%w3, [%0,%w1,uxtw]",
5174       "add\t%3, %4, %w3, sxtb #2"
5175     },
5176     {
5177       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5178       "add\t%3, %4, %w3, sxth #2"
5179     },
5180     {
5181       "ldr\t%w3, [%0,%w1,uxtw #2]",
5182       "add\t%3, %4, %w3, sxtw #2"
5183     },
5184     /* We assume that DImode is only generated when not optimizing and
5185        that we don't really need 64-bit address offsets.  That would
5186        imply an object file with 8GB of code in a single function!  */
5187     {
5188       "ldr\t%w3, [%0,%w1,uxtw #2]",
5189       "add\t%3, %4, %w3, sxtw #2"
5190     }
5191   };
5192
5193   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5194
5195   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5196
5197   gcc_assert (index >= 0 && index <= 3);
5198
5199   /* Need to implement table size reduction, by chaning the code below.  */
5200   output_asm_insn (patterns[index][0], operands);
5201   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5202   snprintf (buf, sizeof (buf),
5203             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5204   output_asm_insn (buf, operands);
5205   output_asm_insn (patterns[index][1], operands);
5206   output_asm_insn ("br\t%3", operands);
5207   assemble_label (asm_out_file, label);
5208   return "";
5209 }
5210
5211
5212 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5213    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5214    operator.  */
5215
5216 int
5217 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5218 {
5219   if (shift >= 0 && shift <= 3)
5220     {
5221       int size;
5222       for (size = 8; size <= 32; size *= 2)
5223         {
5224           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5225           if (mask == bits << shift)
5226             return size;
5227         }
5228     }
5229   return 0;
5230 }
5231
5232 static bool
5233 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5234                                    const_rtx x ATTRIBUTE_UNUSED)
5235 {
5236   /* We can't use blocks for constants when we're using a per-function
5237      constant pool.  */
5238   return false;
5239 }
5240
5241 static section *
5242 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5243                             rtx x ATTRIBUTE_UNUSED,
5244                             unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5245 {
5246   /* Force all constant pool entries into the current function section.  */
5247   return function_section (current_function_decl);
5248 }
5249
5250
5251 /* Costs.  */
5252
5253 /* Helper function for rtx cost calculation.  Strip a shift expression
5254    from X.  Returns the inner operand if successful, or the original
5255    expression on failure.  */
5256 static rtx
5257 aarch64_strip_shift (rtx x)
5258 {
5259   rtx op = x;
5260
5261   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5262      we can convert both to ROR during final output.  */
5263   if ((GET_CODE (op) == ASHIFT
5264        || GET_CODE (op) == ASHIFTRT
5265        || GET_CODE (op) == LSHIFTRT
5266        || GET_CODE (op) == ROTATERT
5267        || GET_CODE (op) == ROTATE)
5268       && CONST_INT_P (XEXP (op, 1)))
5269     return XEXP (op, 0);
5270
5271   if (GET_CODE (op) == MULT
5272       && CONST_INT_P (XEXP (op, 1))
5273       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5274     return XEXP (op, 0);
5275
5276   return x;
5277 }
5278
5279 /* Helper function for rtx cost calculation.  Strip an extend
5280    expression from X.  Returns the inner operand if successful, or the
5281    original expression on failure.  We deal with a number of possible
5282    canonicalization variations here.  */
5283 static rtx
5284 aarch64_strip_extend (rtx x)
5285 {
5286   rtx op = x;
5287
5288   /* Zero and sign extraction of a widened value.  */
5289   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5290       && XEXP (op, 2) == const0_rtx
5291       && GET_CODE (XEXP (op, 0)) == MULT
5292       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5293                                          XEXP (op, 1)))
5294     return XEXP (XEXP (op, 0), 0);
5295
5296   /* It can also be represented (for zero-extend) as an AND with an
5297      immediate.  */
5298   if (GET_CODE (op) == AND
5299       && GET_CODE (XEXP (op, 0)) == MULT
5300       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5301       && CONST_INT_P (XEXP (op, 1))
5302       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5303                            INTVAL (XEXP (op, 1))) != 0)
5304     return XEXP (XEXP (op, 0), 0);
5305
5306   /* Now handle extended register, as this may also have an optional
5307      left shift by 1..4.  */
5308   if (GET_CODE (op) == ASHIFT
5309       && CONST_INT_P (XEXP (op, 1))
5310       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5311     op = XEXP (op, 0);
5312
5313   if (GET_CODE (op) == ZERO_EXTEND
5314       || GET_CODE (op) == SIGN_EXTEND)
5315     op = XEXP (op, 0);
5316
5317   if (op != x)
5318     return op;
5319
5320   return x;
5321 }
5322
5323 /* Return true iff CODE is a shift supported in combination
5324    with arithmetic instructions.  */
5325
5326 static bool
5327 aarch64_shift_p (enum rtx_code code)
5328 {
5329   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5330 }
5331
5332 /* Helper function for rtx cost calculation.  Calculate the cost of
5333    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5334    Return the calculated cost of the expression, recursing manually in to
5335    operands where needed.  */
5336
5337 static int
5338 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5339 {
5340   rtx op0, op1;
5341   const struct cpu_cost_table *extra_cost
5342     = aarch64_tune_params.insn_extra_cost;
5343   int cost = 0;
5344   bool compound_p = (outer == PLUS || outer == MINUS);
5345   machine_mode mode = GET_MODE (x);
5346
5347   gcc_checking_assert (code == MULT);
5348
5349   op0 = XEXP (x, 0);
5350   op1 = XEXP (x, 1);
5351
5352   if (VECTOR_MODE_P (mode))
5353     mode = GET_MODE_INNER (mode);
5354
5355   /* Integer multiply/fma.  */
5356   if (GET_MODE_CLASS (mode) == MODE_INT)
5357     {
5358       /* The multiply will be canonicalized as a shift, cost it as such.  */
5359       if (aarch64_shift_p (GET_CODE (x))
5360           || (CONST_INT_P (op1)
5361               && exact_log2 (INTVAL (op1)) > 0))
5362         {
5363           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5364                            || GET_CODE (op0) == SIGN_EXTEND;
5365           if (speed)
5366             {
5367               if (compound_p)
5368                 {
5369                   if (REG_P (op1))
5370                     /* ARITH + shift-by-register.  */
5371                     cost += extra_cost->alu.arith_shift_reg;
5372                   else if (is_extend)
5373                     /* ARITH + extended register.  We don't have a cost field
5374                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
5375                     cost += extra_cost->alu.extend_arith;
5376                   else
5377                     /* ARITH + shift-by-immediate.  */
5378                     cost += extra_cost->alu.arith_shift;
5379                 }
5380               else
5381                 /* LSL (immediate).  */
5382                 cost += extra_cost->alu.shift;
5383
5384             }
5385           /* Strip extends as we will have costed them in the case above.  */
5386           if (is_extend)
5387             op0 = aarch64_strip_extend (op0);
5388
5389           cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5390
5391           return cost;
5392         }
5393
5394       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
5395          compound and let the below cases handle it.  After all, MNEG is a
5396          special-case alias of MSUB.  */
5397       if (GET_CODE (op0) == NEG)
5398         {
5399           op0 = XEXP (op0, 0);
5400           compound_p = true;
5401         }
5402
5403       /* Integer multiplies or FMAs have zero/sign extending variants.  */
5404       if ((GET_CODE (op0) == ZERO_EXTEND
5405            && GET_CODE (op1) == ZERO_EXTEND)
5406           || (GET_CODE (op0) == SIGN_EXTEND
5407               && GET_CODE (op1) == SIGN_EXTEND))
5408         {
5409           cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5410                   + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5411
5412           if (speed)
5413             {
5414               if (compound_p)
5415                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
5416                 cost += extra_cost->mult[0].extend_add;
5417               else
5418                 /* MUL/SMULL/UMULL.  */
5419                 cost += extra_cost->mult[0].extend;
5420             }
5421
5422           return cost;
5423         }
5424
5425       /* This is either an integer multiply or a MADD.  In both cases
5426          we want to recurse and cost the operands.  */
5427       cost += rtx_cost (op0, MULT, 0, speed)
5428               + rtx_cost (op1, MULT, 1, speed);
5429
5430       if (speed)
5431         {
5432           if (compound_p)
5433             /* MADD/MSUB.  */
5434             cost += extra_cost->mult[mode == DImode].add;
5435           else
5436             /* MUL.  */
5437             cost += extra_cost->mult[mode == DImode].simple;
5438         }
5439
5440       return cost;
5441     }
5442   else
5443     {
5444       if (speed)
5445         {
5446           /* Floating-point FMA/FMUL can also support negations of the
5447              operands.  */
5448           if (GET_CODE (op0) == NEG)
5449             op0 = XEXP (op0, 0);
5450           if (GET_CODE (op1) == NEG)
5451             op1 = XEXP (op1, 0);
5452
5453           if (compound_p)
5454             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
5455             cost += extra_cost->fp[mode == DFmode].fma;
5456           else
5457             /* FMUL/FNMUL.  */
5458             cost += extra_cost->fp[mode == DFmode].mult;
5459         }
5460
5461       cost += rtx_cost (op0, MULT, 0, speed)
5462               + rtx_cost (op1, MULT, 1, speed);
5463       return cost;
5464     }
5465 }
5466
5467 static int
5468 aarch64_address_cost (rtx x,
5469                       machine_mode mode,
5470                       addr_space_t as ATTRIBUTE_UNUSED,
5471                       bool speed)
5472 {
5473   enum rtx_code c = GET_CODE (x);
5474   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5475   struct aarch64_address_info info;
5476   int cost = 0;
5477   info.shift = 0;
5478
5479   if (!aarch64_classify_address (&info, x, mode, c, false))
5480     {
5481       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5482         {
5483           /* This is a CONST or SYMBOL ref which will be split
5484              in a different way depending on the code model in use.
5485              Cost it through the generic infrastructure.  */
5486           int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5487           /* Divide through by the cost of one instruction to
5488              bring it to the same units as the address costs.  */
5489           cost_symbol_ref /= COSTS_N_INSNS (1);
5490           /* The cost is then the cost of preparing the address,
5491              followed by an immediate (possibly 0) offset.  */
5492           return cost_symbol_ref + addr_cost->imm_offset;
5493         }
5494       else
5495         {
5496           /* This is most likely a jump table from a case
5497              statement.  */
5498           return addr_cost->register_offset;
5499         }
5500     }
5501
5502   switch (info.type)
5503     {
5504       case ADDRESS_LO_SUM:
5505       case ADDRESS_SYMBOLIC:
5506       case ADDRESS_REG_IMM:
5507         cost += addr_cost->imm_offset;
5508         break;
5509
5510       case ADDRESS_REG_WB:
5511         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5512           cost += addr_cost->pre_modify;
5513         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5514           cost += addr_cost->post_modify;
5515         else
5516           gcc_unreachable ();
5517
5518         break;
5519
5520       case ADDRESS_REG_REG:
5521         cost += addr_cost->register_offset;
5522         break;
5523
5524       case ADDRESS_REG_UXTW:
5525       case ADDRESS_REG_SXTW:
5526         cost += addr_cost->register_extend;
5527         break;
5528
5529       default:
5530         gcc_unreachable ();
5531     }
5532
5533
5534   if (info.shift > 0)
5535     {
5536       /* For the sake of calculating the cost of the shifted register
5537          component, we can treat same sized modes in the same way.  */
5538       switch (GET_MODE_BITSIZE (mode))
5539         {
5540           case 16:
5541             cost += addr_cost->addr_scale_costs.hi;
5542             break;
5543
5544           case 32:
5545             cost += addr_cost->addr_scale_costs.si;
5546             break;
5547
5548           case 64:
5549             cost += addr_cost->addr_scale_costs.di;
5550             break;
5551
5552           /* We can't tell, or this is a 128-bit vector.  */
5553           default:
5554             cost += addr_cost->addr_scale_costs.ti;
5555             break;
5556         }
5557     }
5558
5559   return cost;
5560 }
5561
5562 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
5563    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
5564    to be taken.  */
5565
5566 int
5567 aarch64_branch_cost (bool speed_p, bool predictable_p)
5568 {
5569   /* When optimizing for speed, use the cost of unpredictable branches.  */
5570   const struct cpu_branch_cost *branch_costs =
5571     aarch64_tune_params.branch_costs;
5572
5573   if (!speed_p || predictable_p)
5574     return branch_costs->predictable;
5575   else
5576     return branch_costs->unpredictable;
5577 }
5578
5579 /* Return true if the RTX X in mode MODE is a zero or sign extract
5580    usable in an ADD or SUB (extended register) instruction.  */
5581 static bool
5582 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5583 {
5584   /* Catch add with a sign extract.
5585      This is add_<optab><mode>_multp2.  */
5586   if (GET_CODE (x) == SIGN_EXTRACT
5587       || GET_CODE (x) == ZERO_EXTRACT)
5588     {
5589       rtx op0 = XEXP (x, 0);
5590       rtx op1 = XEXP (x, 1);
5591       rtx op2 = XEXP (x, 2);
5592
5593       if (GET_CODE (op0) == MULT
5594           && CONST_INT_P (op1)
5595           && op2 == const0_rtx
5596           && CONST_INT_P (XEXP (op0, 1))
5597           && aarch64_is_extend_from_extract (mode,
5598                                              XEXP (op0, 1),
5599                                              op1))
5600         {
5601           return true;
5602         }
5603     }
5604
5605   return false;
5606 }
5607
5608 static bool
5609 aarch64_frint_unspec_p (unsigned int u)
5610 {
5611   switch (u)
5612     {
5613       case UNSPEC_FRINTZ:
5614       case UNSPEC_FRINTP:
5615       case UNSPEC_FRINTM:
5616       case UNSPEC_FRINTA:
5617       case UNSPEC_FRINTN:
5618       case UNSPEC_FRINTX:
5619       case UNSPEC_FRINTI:
5620         return true;
5621
5622       default:
5623         return false;
5624     }
5625 }
5626
5627 /* Return true iff X is an rtx that will match an extr instruction
5628    i.e. as described in the *extr<mode>5_insn family of patterns.
5629    OP0 and OP1 will be set to the operands of the shifts involved
5630    on success and will be NULL_RTX otherwise.  */
5631
5632 static bool
5633 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5634 {
5635   rtx op0, op1;
5636   machine_mode mode = GET_MODE (x);
5637
5638   *res_op0 = NULL_RTX;
5639   *res_op1 = NULL_RTX;
5640
5641   if (GET_CODE (x) != IOR)
5642     return false;
5643
5644   op0 = XEXP (x, 0);
5645   op1 = XEXP (x, 1);
5646
5647   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5648       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5649     {
5650      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
5651       if (GET_CODE (op1) == ASHIFT)
5652         std::swap (op0, op1);
5653
5654       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5655         return false;
5656
5657       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5658       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5659
5660       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5661           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5662         {
5663           *res_op0 = XEXP (op0, 0);
5664           *res_op1 = XEXP (op1, 0);
5665           return true;
5666         }
5667     }
5668
5669   return false;
5670 }
5671
5672 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5673    storing it in *COST.  Result is true if the total cost of the operation
5674    has now been calculated.  */
5675 static bool
5676 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5677 {
5678   rtx inner;
5679   rtx comparator;
5680   enum rtx_code cmpcode;
5681
5682   if (COMPARISON_P (op0))
5683     {
5684       inner = XEXP (op0, 0);
5685       comparator = XEXP (op0, 1);
5686       cmpcode = GET_CODE (op0);
5687     }
5688   else
5689     {
5690       inner = op0;
5691       comparator = const0_rtx;
5692       cmpcode = NE;
5693     }
5694
5695   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5696     {
5697       /* Conditional branch.  */
5698       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5699         return true;
5700       else
5701         {
5702           if (cmpcode == NE || cmpcode == EQ)
5703             {
5704               if (comparator == const0_rtx)
5705                 {
5706                   /* TBZ/TBNZ/CBZ/CBNZ.  */
5707                   if (GET_CODE (inner) == ZERO_EXTRACT)
5708                     /* TBZ/TBNZ.  */
5709                     *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5710                                        0, speed);
5711                 else
5712                   /* CBZ/CBNZ.  */
5713                   *cost += rtx_cost (inner, cmpcode, 0, speed);
5714
5715                 return true;
5716               }
5717             }
5718           else if (cmpcode == LT || cmpcode == GE)
5719             {
5720               /* TBZ/TBNZ.  */
5721               if (comparator == const0_rtx)
5722                 return true;
5723             }
5724         }
5725     }
5726   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5727     {
5728       /* It's a conditional operation based on the status flags,
5729          so it must be some flavor of CSEL.  */
5730
5731       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
5732       if (GET_CODE (op1) == NEG
5733           || GET_CODE (op1) == NOT
5734           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5735         op1 = XEXP (op1, 0);
5736
5737       *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5738       *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5739       return true;
5740     }
5741
5742   /* We don't know what this is, cost all operands.  */
5743   return false;
5744 }
5745
5746 /* Calculate the cost of calculating X, storing it in *COST.  Result
5747    is true if the total cost of the operation has now been calculated.  */
5748 static bool
5749 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5750                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5751 {
5752   rtx op0, op1, op2;
5753   const struct cpu_cost_table *extra_cost
5754     = aarch64_tune_params.insn_extra_cost;
5755   machine_mode mode = GET_MODE (x);
5756
5757   /* By default, assume that everything has equivalent cost to the
5758      cheapest instruction.  Any additional costs are applied as a delta
5759      above this default.  */
5760   *cost = COSTS_N_INSNS (1);
5761
5762   switch (code)
5763     {
5764     case SET:
5765       /* The cost depends entirely on the operands to SET.  */
5766       *cost = 0;
5767       op0 = SET_DEST (x);
5768       op1 = SET_SRC (x);
5769
5770       switch (GET_CODE (op0))
5771         {
5772         case MEM:
5773           if (speed)
5774             {
5775               rtx address = XEXP (op0, 0);
5776               if (VECTOR_MODE_P (mode))
5777                 *cost += extra_cost->ldst.storev;
5778               else if (GET_MODE_CLASS (mode) == MODE_INT)
5779                 *cost += extra_cost->ldst.store;
5780               else if (mode == SFmode)
5781                 *cost += extra_cost->ldst.storef;
5782               else if (mode == DFmode)
5783                 *cost += extra_cost->ldst.stored;
5784
5785               *cost +=
5786                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5787                                                      0, speed));
5788             }
5789
5790           *cost += rtx_cost (op1, SET, 1, speed);
5791           return true;
5792
5793         case SUBREG:
5794           if (! REG_P (SUBREG_REG (op0)))
5795             *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5796
5797           /* Fall through.  */
5798         case REG:
5799           /* The cost is one per vector-register copied.  */
5800           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5801             {
5802               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5803                               / GET_MODE_SIZE (V4SImode);
5804               *cost = COSTS_N_INSNS (n_minus_1 + 1);
5805             }
5806           /* const0_rtx is in general free, but we will use an
5807              instruction to set a register to 0.  */
5808           else if (REG_P (op1) || op1 == const0_rtx)
5809             {
5810               /* The cost is 1 per register copied.  */
5811               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5812                               / UNITS_PER_WORD;
5813               *cost = COSTS_N_INSNS (n_minus_1 + 1);
5814             }
5815           else
5816             /* Cost is just the cost of the RHS of the set.  */
5817             *cost += rtx_cost (op1, SET, 1, speed);
5818           return true;
5819
5820         case ZERO_EXTRACT:
5821         case SIGN_EXTRACT:
5822           /* Bit-field insertion.  Strip any redundant widening of
5823              the RHS to meet the width of the target.  */
5824           if (GET_CODE (op1) == SUBREG)
5825             op1 = SUBREG_REG (op1);
5826           if ((GET_CODE (op1) == ZERO_EXTEND
5827                || GET_CODE (op1) == SIGN_EXTEND)
5828               && CONST_INT_P (XEXP (op0, 1))
5829               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5830                   >= INTVAL (XEXP (op0, 1))))
5831             op1 = XEXP (op1, 0);
5832
5833           if (CONST_INT_P (op1))
5834             {
5835               /* MOV immediate is assumed to always be cheap.  */
5836               *cost = COSTS_N_INSNS (1);
5837             }
5838           else
5839             {
5840               /* BFM.  */
5841               if (speed)
5842                 *cost += extra_cost->alu.bfi;
5843               *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5844             }
5845
5846           return true;
5847
5848         default:
5849           /* We can't make sense of this, assume default cost.  */
5850           *cost = COSTS_N_INSNS (1);
5851           return false;
5852         }
5853       return false;
5854
5855     case CONST_INT:
5856       /* If an instruction can incorporate a constant within the
5857          instruction, the instruction's expression avoids calling
5858          rtx_cost() on the constant.  If rtx_cost() is called on a
5859          constant, then it is usually because the constant must be
5860          moved into a register by one or more instructions.
5861
5862          The exception is constant 0, which can be expressed
5863          as XZR/WZR and is therefore free.  The exception to this is
5864          if we have (set (reg) (const0_rtx)) in which case we must cost
5865          the move.  However, we can catch that when we cost the SET, so
5866          we don't need to consider that here.  */
5867       if (x == const0_rtx)
5868         *cost = 0;
5869       else
5870         {
5871           /* To an approximation, building any other constant is
5872              proportionally expensive to the number of instructions
5873              required to build that constant.  This is true whether we
5874              are compiling for SPEED or otherwise.  */
5875           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5876                                  (NULL_RTX, x, false, mode));
5877         }
5878       return true;
5879
5880     case CONST_DOUBLE:
5881       if (speed)
5882         {
5883           /* mov[df,sf]_aarch64.  */
5884           if (aarch64_float_const_representable_p (x))
5885             /* FMOV (scalar immediate).  */
5886             *cost += extra_cost->fp[mode == DFmode].fpconst;
5887           else if (!aarch64_float_const_zero_rtx_p (x))
5888             {
5889               /* This will be a load from memory.  */
5890               if (mode == DFmode)
5891                 *cost += extra_cost->ldst.loadd;
5892               else
5893                 *cost += extra_cost->ldst.loadf;
5894             }
5895           else
5896             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
5897                or MOV v0.s[0], wzr - neither of which are modeled by the
5898                cost tables.  Just use the default cost.  */
5899             {
5900             }
5901         }
5902
5903       return true;
5904
5905     case MEM:
5906       if (speed)
5907         {
5908           /* For loads we want the base cost of a load, plus an
5909              approximation for the additional cost of the addressing
5910              mode.  */
5911           rtx address = XEXP (x, 0);
5912           if (VECTOR_MODE_P (mode))
5913             *cost += extra_cost->ldst.loadv;
5914           else if (GET_MODE_CLASS (mode) == MODE_INT)
5915             *cost += extra_cost->ldst.load;
5916           else if (mode == SFmode)
5917             *cost += extra_cost->ldst.loadf;
5918           else if (mode == DFmode)
5919             *cost += extra_cost->ldst.loadd;
5920
5921           *cost +=
5922                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5923                                                      0, speed));
5924         }
5925
5926       return true;
5927
5928     case NEG:
5929       op0 = XEXP (x, 0);
5930
5931       if (VECTOR_MODE_P (mode))
5932         {
5933           if (speed)
5934             {
5935               /* FNEG.  */
5936               *cost += extra_cost->vect.alu;
5937             }
5938           return false;
5939         }
5940
5941       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5942        {
5943           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5944               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5945             {
5946               /* CSETM.  */
5947               *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5948               return true;
5949             }
5950
5951           /* Cost this as SUB wzr, X.  */
5952           op0 = CONST0_RTX (GET_MODE (x));
5953           op1 = XEXP (x, 0);
5954           goto cost_minus;
5955         }
5956
5957       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5958         {
5959           /* Support (neg(fma...)) as a single instruction only if
5960              sign of zeros is unimportant.  This matches the decision
5961              making in aarch64.md.  */
5962           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5963             {
5964               /* FNMADD.  */
5965               *cost = rtx_cost (op0, NEG, 0, speed);
5966               return true;
5967             }
5968           if (speed)
5969             /* FNEG.  */
5970             *cost += extra_cost->fp[mode == DFmode].neg;
5971           return false;
5972         }
5973
5974       return false;
5975
5976     case CLRSB:
5977     case CLZ:
5978       if (speed)
5979         {
5980           if (VECTOR_MODE_P (mode))
5981             *cost += extra_cost->vect.alu;
5982           else
5983             *cost += extra_cost->alu.clz;
5984         }
5985
5986       return false;
5987
5988     case COMPARE:
5989       op0 = XEXP (x, 0);
5990       op1 = XEXP (x, 1);
5991
5992       if (op1 == const0_rtx
5993           && GET_CODE (op0) == AND)
5994         {
5995           x = op0;
5996           goto cost_logic;
5997         }
5998
5999       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6000         {
6001           /* TODO: A write to the CC flags possibly costs extra, this
6002              needs encoding in the cost tables.  */
6003
6004           /* CC_ZESWPmode supports zero extend for free.  */
6005           if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6006             op0 = XEXP (op0, 0);
6007
6008           /* ANDS.  */
6009           if (GET_CODE (op0) == AND)
6010             {
6011               x = op0;
6012               goto cost_logic;
6013             }
6014
6015           if (GET_CODE (op0) == PLUS)
6016             {
6017               /* ADDS (and CMN alias).  */
6018               x = op0;
6019               goto cost_plus;
6020             }
6021
6022           if (GET_CODE (op0) == MINUS)
6023             {
6024               /* SUBS.  */
6025               x = op0;
6026               goto cost_minus;
6027             }
6028
6029           if (GET_CODE (op1) == NEG)
6030             {
6031               /* CMN.  */
6032               if (speed)
6033                 *cost += extra_cost->alu.arith;
6034
6035               *cost += rtx_cost (op0, COMPARE, 0, speed);
6036               *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
6037               return true;
6038             }
6039
6040           /* CMP.
6041
6042              Compare can freely swap the order of operands, and
6043              canonicalization puts the more complex operation first.
6044              But the integer MINUS logic expects the shift/extend
6045              operation in op1.  */
6046           if (! (REG_P (op0)
6047                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6048           {
6049             op0 = XEXP (x, 1);
6050             op1 = XEXP (x, 0);
6051           }
6052           goto cost_minus;
6053         }
6054
6055       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6056         {
6057           /* FCMP.  */
6058           if (speed)
6059             *cost += extra_cost->fp[mode == DFmode].compare;
6060
6061           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6062             {
6063               *cost += rtx_cost (op0, COMPARE, 0, speed);
6064               /* FCMP supports constant 0.0 for no extra cost. */
6065               return true;
6066             }
6067           return false;
6068         }
6069
6070       if (VECTOR_MODE_P (mode))
6071         {
6072           /* Vector compare.  */
6073           if (speed)
6074             *cost += extra_cost->vect.alu;
6075
6076           if (aarch64_float_const_zero_rtx_p (op1))
6077             {
6078               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6079                  cost.  */
6080               return true;
6081             }
6082           return false;
6083         }
6084       return false;
6085
6086     case MINUS:
6087       {
6088         op0 = XEXP (x, 0);
6089         op1 = XEXP (x, 1);
6090
6091 cost_minus:
6092         *cost += rtx_cost (op0, MINUS, 0, speed);
6093
6094         /* Detect valid immediates.  */
6095         if ((GET_MODE_CLASS (mode) == MODE_INT
6096              || (GET_MODE_CLASS (mode) == MODE_CC
6097                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6098             && CONST_INT_P (op1)
6099             && aarch64_uimm12_shift (INTVAL (op1)))
6100           {
6101             if (speed)
6102               /* SUB(S) (immediate).  */
6103               *cost += extra_cost->alu.arith;
6104             return true;
6105           }
6106
6107         /* Look for SUB (extended register).  */
6108         if (aarch64_rtx_arith_op_extract_p (op1, mode))
6109           {
6110             if (speed)
6111               *cost += extra_cost->alu.extend_arith;
6112
6113             *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
6114                                (enum rtx_code) GET_CODE (op1),
6115                                0, speed);
6116             return true;
6117           }
6118
6119         rtx new_op1 = aarch64_strip_extend (op1);
6120
6121         /* Cost this as an FMA-alike operation.  */
6122         if ((GET_CODE (new_op1) == MULT
6123              || aarch64_shift_p (GET_CODE (new_op1)))
6124             && code != COMPARE)
6125           {
6126             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6127                                             (enum rtx_code) code,
6128                                             speed);
6129             return true;
6130           }
6131
6132         *cost += rtx_cost (new_op1, MINUS, 1, speed);
6133
6134         if (speed)
6135           {
6136             if (VECTOR_MODE_P (mode))
6137               {
6138                 /* Vector SUB.  */
6139                 *cost += extra_cost->vect.alu;
6140               }
6141             else if (GET_MODE_CLASS (mode) == MODE_INT)
6142               {
6143                 /* SUB(S).  */
6144                 *cost += extra_cost->alu.arith;
6145               }
6146             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6147               {
6148                 /* FSUB.  */
6149                 *cost += extra_cost->fp[mode == DFmode].addsub;
6150               }
6151           }
6152         return true;
6153       }
6154
6155     case PLUS:
6156       {
6157         rtx new_op0;
6158
6159         op0 = XEXP (x, 0);
6160         op1 = XEXP (x, 1);
6161
6162 cost_plus:
6163         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6164             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6165           {
6166             /* CSINC.  */
6167             *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6168             *cost += rtx_cost (op1, PLUS, 1, speed);
6169             return true;
6170           }
6171
6172         if (GET_MODE_CLASS (mode) == MODE_INT
6173             && CONST_INT_P (op1)
6174             && aarch64_uimm12_shift (INTVAL (op1)))
6175           {
6176             *cost += rtx_cost (op0, PLUS, 0, speed);
6177
6178             if (speed)
6179               /* ADD (immediate).  */
6180               *cost += extra_cost->alu.arith;
6181             return true;
6182           }
6183
6184         *cost += rtx_cost (op1, PLUS, 1, speed);
6185
6186         /* Look for ADD (extended register).  */
6187         if (aarch64_rtx_arith_op_extract_p (op0, mode))
6188           {
6189             if (speed)
6190               *cost += extra_cost->alu.extend_arith;
6191
6192             *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6193                                (enum rtx_code) GET_CODE (op0),
6194                                0, speed);
6195             return true;
6196           }
6197
6198         /* Strip any extend, leave shifts behind as we will
6199            cost them through mult_cost.  */
6200         new_op0 = aarch64_strip_extend (op0);
6201
6202         if (GET_CODE (new_op0) == MULT
6203             || aarch64_shift_p (GET_CODE (new_op0)))
6204           {
6205             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6206                                             speed);
6207             return true;
6208           }
6209
6210         *cost += rtx_cost (new_op0, PLUS, 0, speed);
6211
6212         if (speed)
6213           {
6214             if (VECTOR_MODE_P (mode))
6215               {
6216                 /* Vector ADD.  */
6217                 *cost += extra_cost->vect.alu;
6218               }
6219             else if (GET_MODE_CLASS (mode) == MODE_INT)
6220               {
6221                 /* ADD.  */
6222                 *cost += extra_cost->alu.arith;
6223               }
6224             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6225               {
6226                 /* FADD.  */
6227                 *cost += extra_cost->fp[mode == DFmode].addsub;
6228               }
6229           }
6230         return true;
6231       }
6232
6233     case BSWAP:
6234       *cost = COSTS_N_INSNS (1);
6235
6236       if (speed)
6237         {
6238           if (VECTOR_MODE_P (mode))
6239             *cost += extra_cost->vect.alu;
6240           else
6241             *cost += extra_cost->alu.rev;
6242         }
6243       return false;
6244
6245     case IOR:
6246       if (aarch_rev16_p (x))
6247         {
6248           *cost = COSTS_N_INSNS (1);
6249
6250           if (speed)
6251             {
6252               if (VECTOR_MODE_P (mode))
6253                 *cost += extra_cost->vect.alu;
6254               else
6255                 *cost += extra_cost->alu.rev;
6256             }
6257           return true;
6258         }
6259
6260       if (aarch64_extr_rtx_p (x, &op0, &op1))
6261         {
6262           *cost += rtx_cost (op0, IOR, 0, speed)
6263                    + rtx_cost (op1, IOR, 1, speed);
6264           if (speed)
6265             *cost += extra_cost->alu.shift;
6266
6267           return true;
6268         }
6269     /* Fall through.  */
6270     case XOR:
6271     case AND:
6272     cost_logic:
6273       op0 = XEXP (x, 0);
6274       op1 = XEXP (x, 1);
6275
6276       if (VECTOR_MODE_P (mode))
6277         {
6278           if (speed)
6279             *cost += extra_cost->vect.alu;
6280           return true;
6281         }
6282
6283       if (code == AND
6284           && GET_CODE (op0) == MULT
6285           && CONST_INT_P (XEXP (op0, 1))
6286           && CONST_INT_P (op1)
6287           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6288                                INTVAL (op1)) != 0)
6289         {
6290           /* This is a UBFM/SBFM.  */
6291           *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6292           if (speed)
6293             *cost += extra_cost->alu.bfx;
6294           return true;
6295         }
6296
6297       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6298         {
6299           /* We possibly get the immediate for free, this is not
6300              modelled.  */
6301           if (CONST_INT_P (op1)
6302               && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6303             {
6304               *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6305
6306               if (speed)
6307                 *cost += extra_cost->alu.logical;
6308
6309               return true;
6310             }
6311           else
6312             {
6313               rtx new_op0 = op0;
6314
6315               /* Handle ORN, EON, or BIC.  */
6316               if (GET_CODE (op0) == NOT)
6317                 op0 = XEXP (op0, 0);
6318
6319               new_op0 = aarch64_strip_shift (op0);
6320
6321               /* If we had a shift on op0 then this is a logical-shift-
6322                  by-register/immediate operation.  Otherwise, this is just
6323                  a logical operation.  */
6324               if (speed)
6325                 {
6326                   if (new_op0 != op0)
6327                     {
6328                       /* Shift by immediate.  */
6329                       if (CONST_INT_P (XEXP (op0, 1)))
6330                         *cost += extra_cost->alu.log_shift;
6331                       else
6332                         *cost += extra_cost->alu.log_shift_reg;
6333                     }
6334                   else
6335                     *cost += extra_cost->alu.logical;
6336                 }
6337
6338               /* In both cases we want to cost both operands.  */
6339               *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6340                        + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6341
6342               return true;
6343             }
6344         }
6345       return false;
6346
6347     case NOT:
6348       x = XEXP (x, 0);
6349       op0 = aarch64_strip_shift (x);
6350
6351       if (VECTOR_MODE_P (mode))
6352         {
6353           /* Vector NOT.  */
6354           *cost += extra_cost->vect.alu;
6355           return false;
6356         }
6357
6358       /* MVN-shifted-reg.  */
6359       if (op0 != x)
6360         {
6361           *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6362
6363           if (speed)
6364             *cost += extra_cost->alu.log_shift;
6365
6366           return true;
6367         }
6368       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6369          Handle the second form here taking care that 'a' in the above can
6370          be a shift.  */
6371       else if (GET_CODE (op0) == XOR)
6372         {
6373           rtx newop0 = XEXP (op0, 0);
6374           rtx newop1 = XEXP (op0, 1);
6375           rtx op0_stripped = aarch64_strip_shift (newop0);
6376
6377           *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6378                    + rtx_cost (op0_stripped, XOR, 0, speed);
6379
6380           if (speed)
6381             {
6382               if (op0_stripped != newop0)
6383                 *cost += extra_cost->alu.log_shift;
6384               else
6385                 *cost += extra_cost->alu.logical;
6386             }
6387
6388           return true;
6389         }
6390       /* MVN.  */
6391       if (speed)
6392         *cost += extra_cost->alu.logical;
6393
6394       return false;
6395
6396     case ZERO_EXTEND:
6397
6398       op0 = XEXP (x, 0);
6399       /* If a value is written in SI mode, then zero extended to DI
6400          mode, the operation will in general be free as a write to
6401          a 'w' register implicitly zeroes the upper bits of an 'x'
6402          register.  However, if this is
6403
6404            (set (reg) (zero_extend (reg)))
6405
6406          we must cost the explicit register move.  */
6407       if (mode == DImode
6408           && GET_MODE (op0) == SImode
6409           && outer == SET)
6410         {
6411           int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6412
6413           if (!op_cost && speed)
6414             /* MOV.  */
6415             *cost += extra_cost->alu.extend;
6416           else
6417             /* Free, the cost is that of the SI mode operation.  */
6418             *cost = op_cost;
6419
6420           return true;
6421         }
6422       else if (MEM_P (XEXP (x, 0)))
6423         {
6424           /* All loads can zero extend to any size for free.  */
6425           *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6426           return true;
6427         }
6428
6429       if (speed)
6430         {
6431           if (VECTOR_MODE_P (mode))
6432             {
6433               /* UMOV.  */
6434               *cost += extra_cost->vect.alu;
6435             }
6436           else
6437             {
6438               /* UXTB/UXTH.  */
6439               *cost += extra_cost->alu.extend;
6440             }
6441         }
6442       return false;
6443
6444     case SIGN_EXTEND:
6445       if (MEM_P (XEXP (x, 0)))
6446         {
6447           /* LDRSH.  */
6448           if (speed)
6449             {
6450               rtx address = XEXP (XEXP (x, 0), 0);
6451               *cost += extra_cost->ldst.load_sign_extend;
6452
6453               *cost +=
6454                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6455                                                      0, speed));
6456             }
6457           return true;
6458         }
6459
6460       if (speed)
6461         {
6462           if (VECTOR_MODE_P (mode))
6463             *cost += extra_cost->vect.alu;
6464           else
6465             *cost += extra_cost->alu.extend;
6466         }
6467       return false;
6468
6469     case ASHIFT:
6470       op0 = XEXP (x, 0);
6471       op1 = XEXP (x, 1);
6472
6473       if (CONST_INT_P (op1))
6474         {
6475           if (speed)
6476             {
6477               if (VECTOR_MODE_P (mode))
6478                 {
6479                   /* Vector shift (immediate).  */
6480                   *cost += extra_cost->vect.alu;
6481                 }
6482               else
6483                 {
6484                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
6485                      aliases.  */
6486                   *cost += extra_cost->alu.shift;
6487                 }
6488             }
6489
6490           /* We can incorporate zero/sign extend for free.  */
6491           if (GET_CODE (op0) == ZERO_EXTEND
6492               || GET_CODE (op0) == SIGN_EXTEND)
6493             op0 = XEXP (op0, 0);
6494
6495           *cost += rtx_cost (op0, ASHIFT, 0, speed);
6496           return true;
6497         }
6498       else
6499         {
6500           if (speed)
6501             {
6502               if (VECTOR_MODE_P (mode))
6503                 {
6504                   /* Vector shift (register).  */
6505                   *cost += extra_cost->vect.alu;
6506                 }
6507               else
6508                 {
6509                   /* LSLV.  */
6510                   *cost += extra_cost->alu.shift_reg;
6511                 }
6512             }
6513           return false;  /* All arguments need to be in registers.  */
6514         }
6515
6516     case ROTATE:
6517     case ROTATERT:
6518     case LSHIFTRT:
6519     case ASHIFTRT:
6520       op0 = XEXP (x, 0);
6521       op1 = XEXP (x, 1);
6522
6523       if (CONST_INT_P (op1))
6524         {
6525           /* ASR (immediate) and friends.  */
6526           if (speed)
6527             {
6528               if (VECTOR_MODE_P (mode))
6529                 *cost += extra_cost->vect.alu;
6530               else
6531                 *cost += extra_cost->alu.shift;
6532             }
6533
6534           *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6535           return true;
6536         }
6537       else
6538         {
6539
6540           /* ASR (register) and friends.  */
6541           if (speed)
6542             {
6543               if (VECTOR_MODE_P (mode))
6544                 *cost += extra_cost->vect.alu;
6545               else
6546                 *cost += extra_cost->alu.shift_reg;
6547             }
6548           return false;  /* All arguments need to be in registers.  */
6549         }
6550
6551     case SYMBOL_REF:
6552
6553       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6554           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6555         {
6556           /* LDR.  */
6557           if (speed)
6558             *cost += extra_cost->ldst.load;
6559         }
6560       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6561                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6562         {
6563           /* ADRP, followed by ADD.  */
6564           *cost += COSTS_N_INSNS (1);
6565           if (speed)
6566             *cost += 2 * extra_cost->alu.arith;
6567         }
6568       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6569                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6570         {
6571           /* ADR.  */
6572           if (speed)
6573             *cost += extra_cost->alu.arith;
6574         }
6575
6576       if (flag_pic)
6577         {
6578           /* One extra load instruction, after accessing the GOT.  */
6579           *cost += COSTS_N_INSNS (1);
6580           if (speed)
6581             *cost += extra_cost->ldst.load;
6582         }
6583       return true;
6584
6585     case HIGH:
6586     case LO_SUM:
6587       /* ADRP/ADD (immediate).  */
6588       if (speed)
6589         *cost += extra_cost->alu.arith;
6590       return true;
6591
6592     case ZERO_EXTRACT:
6593     case SIGN_EXTRACT:
6594       /* UBFX/SBFX.  */
6595       if (speed)
6596         {
6597           if (VECTOR_MODE_P (mode))
6598             *cost += extra_cost->vect.alu;
6599           else
6600             *cost += extra_cost->alu.bfx;
6601         }
6602
6603       /* We can trust that the immediates used will be correct (there
6604          are no by-register forms), so we need only cost op0.  */
6605       *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6606       return true;
6607
6608     case MULT:
6609       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6610       /* aarch64_rtx_mult_cost always handles recursion to its
6611          operands.  */
6612       return true;
6613
6614     case MOD:
6615     case UMOD:
6616       if (speed)
6617         {
6618           if (VECTOR_MODE_P (mode))
6619             *cost += extra_cost->vect.alu;
6620           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6621             *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6622                       + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6623           else if (GET_MODE (x) == DFmode)
6624             *cost += (extra_cost->fp[1].mult
6625                       + extra_cost->fp[1].div);
6626           else if (GET_MODE (x) == SFmode)
6627             *cost += (extra_cost->fp[0].mult
6628                       + extra_cost->fp[0].div);
6629         }
6630       return false;  /* All arguments need to be in registers.  */
6631
6632     case DIV:
6633     case UDIV:
6634     case SQRT:
6635       if (speed)
6636         {
6637           if (VECTOR_MODE_P (mode))
6638             *cost += extra_cost->vect.alu;
6639           else if (GET_MODE_CLASS (mode) == MODE_INT)
6640             /* There is no integer SQRT, so only DIV and UDIV can get
6641                here.  */
6642             *cost += extra_cost->mult[mode == DImode].idiv;
6643           else
6644             *cost += extra_cost->fp[mode == DFmode].div;
6645         }
6646       return false;  /* All arguments need to be in registers.  */
6647
6648     case IF_THEN_ELSE:
6649       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6650                                          XEXP (x, 2), cost, speed);
6651
6652     case EQ:
6653     case NE:
6654     case GT:
6655     case GTU:
6656     case LT:
6657     case LTU:
6658     case GE:
6659     case GEU:
6660     case LE:
6661     case LEU:
6662
6663       return false; /* All arguments must be in registers.  */
6664
6665     case FMA:
6666       op0 = XEXP (x, 0);
6667       op1 = XEXP (x, 1);
6668       op2 = XEXP (x, 2);
6669
6670       if (speed)
6671         {
6672           if (VECTOR_MODE_P (mode))
6673             *cost += extra_cost->vect.alu;
6674           else
6675             *cost += extra_cost->fp[mode == DFmode].fma;
6676         }
6677
6678       /* FMSUB, FNMADD, and FNMSUB are free.  */
6679       if (GET_CODE (op0) == NEG)
6680         op0 = XEXP (op0, 0);
6681
6682       if (GET_CODE (op2) == NEG)
6683         op2 = XEXP (op2, 0);
6684
6685       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6686          and the by-element operand as operand 0.  */
6687       if (GET_CODE (op1) == NEG)
6688         op1 = XEXP (op1, 0);
6689
6690       /* Catch vector-by-element operations.  The by-element operand can
6691          either be (vec_duplicate (vec_select (x))) or just
6692          (vec_select (x)), depending on whether we are multiplying by
6693          a vector or a scalar.
6694
6695          Canonicalization is not very good in these cases, FMA4 will put the
6696          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
6697       if (GET_CODE (op0) == VEC_DUPLICATE)
6698         op0 = XEXP (op0, 0);
6699       else if (GET_CODE (op1) == VEC_DUPLICATE)
6700         op1 = XEXP (op1, 0);
6701
6702       if (GET_CODE (op0) == VEC_SELECT)
6703         op0 = XEXP (op0, 0);
6704       else if (GET_CODE (op1) == VEC_SELECT)
6705         op1 = XEXP (op1, 0);
6706
6707       /* If the remaining parameters are not registers,
6708          get the cost to put them into registers.  */
6709       *cost += rtx_cost (op0, FMA, 0, speed);
6710       *cost += rtx_cost (op1, FMA, 1, speed);
6711       *cost += rtx_cost (op2, FMA, 2, speed);
6712       return true;
6713
6714     case FLOAT:
6715     case UNSIGNED_FLOAT:
6716       if (speed)
6717         *cost += extra_cost->fp[mode == DFmode].fromint;
6718       return false;
6719
6720     case FLOAT_EXTEND:
6721       if (speed)
6722         {
6723           if (VECTOR_MODE_P (mode))
6724             {
6725               /*Vector truncate.  */
6726               *cost += extra_cost->vect.alu;
6727             }
6728           else
6729             *cost += extra_cost->fp[mode == DFmode].widen;
6730         }
6731       return false;
6732
6733     case FLOAT_TRUNCATE:
6734       if (speed)
6735         {
6736           if (VECTOR_MODE_P (mode))
6737             {
6738               /*Vector conversion.  */
6739               *cost += extra_cost->vect.alu;
6740             }
6741           else
6742             *cost += extra_cost->fp[mode == DFmode].narrow;
6743         }
6744       return false;
6745
6746     case FIX:
6747     case UNSIGNED_FIX:
6748       x = XEXP (x, 0);
6749       /* Strip the rounding part.  They will all be implemented
6750          by the fcvt* family of instructions anyway.  */
6751       if (GET_CODE (x) == UNSPEC)
6752         {
6753           unsigned int uns_code = XINT (x, 1);
6754
6755           if (uns_code == UNSPEC_FRINTA
6756               || uns_code == UNSPEC_FRINTM
6757               || uns_code == UNSPEC_FRINTN
6758               || uns_code == UNSPEC_FRINTP
6759               || uns_code == UNSPEC_FRINTZ)
6760             x = XVECEXP (x, 0, 0);
6761         }
6762
6763       if (speed)
6764         {
6765           if (VECTOR_MODE_P (mode))
6766             *cost += extra_cost->vect.alu;
6767           else
6768             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6769         }
6770       *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6771       return true;
6772
6773     case ABS:
6774       if (VECTOR_MODE_P (mode))
6775         {
6776           /* ABS (vector).  */
6777           if (speed)
6778             *cost += extra_cost->vect.alu;
6779         }
6780       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6781         {
6782           op0 = XEXP (x, 0);
6783
6784           /* FABD, which is analogous to FADD.  */
6785           if (GET_CODE (op0) == MINUS)
6786             {
6787               *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed);
6788                         + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6789               if (speed)
6790                 *cost += extra_cost->fp[mode == DFmode].addsub;
6791
6792               return true;
6793             }
6794           /* Simple FABS is analogous to FNEG.  */
6795           if (speed)
6796             *cost += extra_cost->fp[mode == DFmode].neg;
6797         }
6798       else
6799         {
6800           /* Integer ABS will either be split to
6801              two arithmetic instructions, or will be an ABS
6802              (scalar), which we don't model.  */
6803           *cost = COSTS_N_INSNS (2);
6804           if (speed)
6805             *cost += 2 * extra_cost->alu.arith;
6806         }
6807       return false;
6808
6809     case SMAX:
6810     case SMIN:
6811       if (speed)
6812         {
6813           if (VECTOR_MODE_P (mode))
6814             *cost += extra_cost->vect.alu;
6815           else
6816             {
6817               /* FMAXNM/FMINNM/FMAX/FMIN.
6818                  TODO: This may not be accurate for all implementations, but
6819                  we do not model this in the cost tables.  */
6820               *cost += extra_cost->fp[mode == DFmode].addsub;
6821             }
6822         }
6823       return false;
6824
6825     case UNSPEC:
6826       /* The floating point round to integer frint* instructions.  */
6827       if (aarch64_frint_unspec_p (XINT (x, 1)))
6828         {
6829           if (speed)
6830             *cost += extra_cost->fp[mode == DFmode].roundint;
6831
6832           return false;
6833         }
6834
6835       if (XINT (x, 1) == UNSPEC_RBIT)
6836         {
6837           if (speed)
6838             *cost += extra_cost->alu.rev;
6839
6840           return false;
6841         }
6842       break;
6843
6844     case TRUNCATE:
6845
6846       /* Decompose <su>muldi3_highpart.  */
6847       if (/* (truncate:DI  */
6848           mode == DImode
6849           /*   (lshiftrt:TI  */
6850           && GET_MODE (XEXP (x, 0)) == TImode
6851           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6852           /*      (mult:TI  */
6853           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6854           /*        (ANY_EXTEND:TI (reg:DI))
6855                     (ANY_EXTEND:TI (reg:DI)))  */
6856           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6857                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6858               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6859                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6860           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6861           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6862           /*     (const_int 64)  */
6863           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6864           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6865         {
6866           /* UMULH/SMULH.  */
6867           if (speed)
6868             *cost += extra_cost->mult[mode == DImode].extend;
6869           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6870                              MULT, 0, speed);
6871           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6872                              MULT, 1, speed);
6873           return true;
6874         }
6875
6876       /* Fall through.  */
6877     default:
6878       break;
6879     }
6880
6881   if (dump_file && (dump_flags & TDF_DETAILS))
6882     fprintf (dump_file,
6883       "\nFailed to cost RTX.  Assuming default cost.\n");
6884
6885   return true;
6886 }
6887
6888 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6889    calculated for X.  This cost is stored in *COST.  Returns true
6890    if the total cost of X was calculated.  */
6891 static bool
6892 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6893                    int param, int *cost, bool speed)
6894 {
6895   bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6896
6897   if (dump_file && (dump_flags & TDF_DETAILS))
6898     {
6899       print_rtl_single (dump_file, x);
6900       fprintf (dump_file, "\n%s cost: %d (%s)\n",
6901                speed ? "Hot" : "Cold",
6902                *cost, result ? "final" : "partial");
6903     }
6904
6905   return result;
6906 }
6907
6908 static int
6909 aarch64_register_move_cost (machine_mode mode,
6910                             reg_class_t from_i, reg_class_t to_i)
6911 {
6912   enum reg_class from = (enum reg_class) from_i;
6913   enum reg_class to = (enum reg_class) to_i;
6914   const struct cpu_regmove_cost *regmove_cost
6915     = aarch64_tune_params.regmove_cost;
6916
6917   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
6918   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6919     to = GENERAL_REGS;
6920
6921   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6922     from = GENERAL_REGS;
6923
6924   /* Moving between GPR and stack cost is the same as GP2GP.  */
6925   if ((from == GENERAL_REGS && to == STACK_REG)
6926       || (to == GENERAL_REGS && from == STACK_REG))
6927     return regmove_cost->GP2GP;
6928
6929   /* To/From the stack register, we move via the gprs.  */
6930   if (to == STACK_REG || from == STACK_REG)
6931     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6932             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6933
6934   if (GET_MODE_SIZE (mode) == 16)
6935     {
6936       /* 128-bit operations on general registers require 2 instructions.  */
6937       if (from == GENERAL_REGS && to == GENERAL_REGS)
6938         return regmove_cost->GP2GP * 2;
6939       else if (from == GENERAL_REGS)
6940         return regmove_cost->GP2FP * 2;
6941       else if (to == GENERAL_REGS)
6942         return regmove_cost->FP2GP * 2;
6943
6944       /* When AdvSIMD instructions are disabled it is not possible to move
6945          a 128-bit value directly between Q registers.  This is handled in
6946          secondary reload.  A general register is used as a scratch to move
6947          the upper DI value and the lower DI value is moved directly,
6948          hence the cost is the sum of three moves. */
6949       if (! TARGET_SIMD)
6950         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6951
6952       return regmove_cost->FP2FP;
6953     }
6954
6955   if (from == GENERAL_REGS && to == GENERAL_REGS)
6956     return regmove_cost->GP2GP;
6957   else if (from == GENERAL_REGS)
6958     return regmove_cost->GP2FP;
6959   else if (to == GENERAL_REGS)
6960     return regmove_cost->FP2GP;
6961
6962   return regmove_cost->FP2FP;
6963 }
6964
6965 static int
6966 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6967                           reg_class_t rclass ATTRIBUTE_UNUSED,
6968                           bool in ATTRIBUTE_UNUSED)
6969 {
6970   return aarch64_tune_params.memmov_cost;
6971 }
6972
6973 /* Return the number of instructions that can be issued per cycle.  */
6974 static int
6975 aarch64_sched_issue_rate (void)
6976 {
6977   return aarch64_tune_params.issue_rate;
6978 }
6979
6980 static int
6981 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6982 {
6983   int issue_rate = aarch64_sched_issue_rate ();
6984
6985   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6986 }
6987
6988 /* Vectorizer cost model target hooks.  */
6989
6990 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
6991 static int
6992 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6993                                     tree vectype,
6994                                     int misalign ATTRIBUTE_UNUSED)
6995 {
6996   unsigned elements;
6997
6998   switch (type_of_cost)
6999     {
7000       case scalar_stmt:
7001         return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7002
7003       case scalar_load:
7004         return aarch64_tune_params.vec_costs->scalar_load_cost;
7005
7006       case scalar_store:
7007         return aarch64_tune_params.vec_costs->scalar_store_cost;
7008
7009       case vector_stmt:
7010         return aarch64_tune_params.vec_costs->vec_stmt_cost;
7011
7012       case vector_load:
7013         return aarch64_tune_params.vec_costs->vec_align_load_cost;
7014
7015       case vector_store:
7016         return aarch64_tune_params.vec_costs->vec_store_cost;
7017
7018       case vec_to_scalar:
7019         return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7020
7021       case scalar_to_vec:
7022         return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7023
7024       case unaligned_load:
7025         return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7026
7027       case unaligned_store:
7028         return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7029
7030       case cond_branch_taken:
7031         return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7032
7033       case cond_branch_not_taken:
7034         return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7035
7036       case vec_perm:
7037       case vec_promote_demote:
7038         return aarch64_tune_params.vec_costs->vec_stmt_cost;
7039
7040       case vec_construct:
7041         elements = TYPE_VECTOR_SUBPARTS (vectype);
7042         return elements / 2 + 1;
7043
7044       default:
7045         gcc_unreachable ();
7046     }
7047 }
7048
7049 /* Implement targetm.vectorize.add_stmt_cost.  */
7050 static unsigned
7051 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7052                        struct _stmt_vec_info *stmt_info, int misalign,
7053                        enum vect_cost_model_location where)
7054 {
7055   unsigned *cost = (unsigned *) data;
7056   unsigned retval = 0;
7057
7058   if (flag_vect_cost_model)
7059     {
7060       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7061       int stmt_cost =
7062             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7063
7064       /* Statements in an inner loop relative to the loop being
7065          vectorized are weighted more heavily.  The value here is
7066          a function (linear for now) of the loop nest level.  */
7067       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7068         {
7069           loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
7070           struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
7071           unsigned nest_level = loop_depth (loop);
7072
7073           count *= nest_level;
7074         }
7075
7076       retval = (unsigned) (count * stmt_cost);
7077       cost[where] += retval;
7078     }
7079
7080   return retval;
7081 }
7082
7083 static void initialize_aarch64_code_model (void);
7084
7085 /* Parse the architecture extension string.  */
7086
7087 static void
7088 aarch64_parse_extension (char *str)
7089 {
7090   /* The extension string is parsed left to right.  */
7091   const struct aarch64_option_extension *opt = NULL;
7092
7093   /* Flag to say whether we are adding or removing an extension.  */
7094   int adding_ext = -1;
7095
7096   while (str != NULL && *str != 0)
7097     {
7098       char *ext;
7099       size_t len;
7100
7101       str++;
7102       ext = strchr (str, '+');
7103
7104       if (ext != NULL)
7105         len = ext - str;
7106       else
7107         len = strlen (str);
7108
7109       if (len >= 2 && strncmp (str, "no", 2) == 0)
7110         {
7111           adding_ext = 0;
7112           len -= 2;
7113           str += 2;
7114         }
7115       else if (len > 0)
7116         adding_ext = 1;
7117
7118       if (len == 0)
7119         {
7120           error ("missing feature modifier after %qs", adding_ext ? "+"
7121                                                                   : "+no");
7122           return;
7123         }
7124
7125       /* Scan over the extensions table trying to find an exact match.  */
7126       for (opt = all_extensions; opt->name != NULL; opt++)
7127         {
7128           if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7129             {
7130               /* Add or remove the extension.  */
7131               if (adding_ext)
7132                 aarch64_isa_flags |= opt->flags_on;
7133               else
7134                 aarch64_isa_flags &= ~(opt->flags_off);
7135               break;
7136             }
7137         }
7138
7139       if (opt->name == NULL)
7140         {
7141           /* Extension not found in list.  */
7142           error ("unknown feature modifier %qs", str);
7143           return;
7144         }
7145
7146       str = ext;
7147     };
7148
7149   return;
7150 }
7151
7152 /* Parse the ARCH string.  */
7153
7154 static void
7155 aarch64_parse_arch (void)
7156 {
7157   char *ext;
7158   const struct processor *arch;
7159   char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7160   size_t len;
7161
7162   strcpy (str, aarch64_arch_string);
7163
7164   ext = strchr (str, '+');
7165
7166   if (ext != NULL)
7167     len = ext - str;
7168   else
7169     len = strlen (str);
7170
7171   if (len == 0)
7172     {
7173       error ("missing arch name in -march=%qs", str);
7174       return;
7175     }
7176
7177   /* Loop through the list of supported ARCHs to find a match.  */
7178   for (arch = all_architectures; arch->name != NULL; arch++)
7179     {
7180       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7181         {
7182           selected_arch = arch;
7183           aarch64_isa_flags = selected_arch->flags;
7184
7185           if (!selected_cpu)
7186             selected_cpu = &all_cores[selected_arch->core];
7187
7188           if (ext != NULL)
7189             {
7190               /* ARCH string contains at least one extension.  */
7191               aarch64_parse_extension (ext);
7192             }
7193
7194           if (strcmp (selected_arch->arch, selected_cpu->arch))
7195             {
7196               warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7197                        selected_cpu->name, selected_arch->name);
7198             }
7199
7200           return;
7201         }
7202     }
7203
7204   /* ARCH name not found in list.  */
7205   error ("unknown value %qs for -march", str);
7206   return;
7207 }
7208
7209 /* Parse the CPU string.  */
7210
7211 static void
7212 aarch64_parse_cpu (void)
7213 {
7214   char *ext;
7215   const struct processor *cpu;
7216   char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7217   size_t len;
7218
7219   strcpy (str, aarch64_cpu_string);
7220
7221   ext = strchr (str, '+');
7222
7223   if (ext != NULL)
7224     len = ext - str;
7225   else
7226     len = strlen (str);
7227
7228   if (len == 0)
7229     {
7230       error ("missing cpu name in -mcpu=%qs", str);
7231       return;
7232     }
7233
7234   /* Loop through the list of supported CPUs to find a match.  */
7235   for (cpu = all_cores; cpu->name != NULL; cpu++)
7236     {
7237       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7238         {
7239           selected_cpu = cpu;
7240           aarch64_isa_flags = selected_cpu->flags;
7241
7242           if (ext != NULL)
7243             {
7244               /* CPU string contains at least one extension.  */
7245               aarch64_parse_extension (ext);
7246             }
7247
7248           return;
7249         }
7250     }
7251
7252   /* CPU name not found in list.  */
7253   error ("unknown value %qs for -mcpu", str);
7254   return;
7255 }
7256
7257 /* Parse the TUNE string.  */
7258
7259 static void
7260 aarch64_parse_tune (void)
7261 {
7262   const struct processor *cpu;
7263   char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7264   strcpy (str, aarch64_tune_string);
7265
7266   /* Loop through the list of supported CPUs to find a match.  */
7267   for (cpu = all_cores; cpu->name != NULL; cpu++)
7268     {
7269       if (strcmp (cpu->name, str) == 0)
7270         {
7271           selected_tune = cpu;
7272           return;
7273         }
7274     }
7275
7276   /* CPU name not found in list.  */
7277   error ("unknown value %qs for -mtune", str);
7278   return;
7279 }
7280
7281 /* Parse TOKEN, which has length LENGTH to see if it is an option
7282    described in FLAG.  If it is, return the index bit for that fusion type.
7283    If not, error (printing OPTION_NAME) and return zero.  */
7284
7285 static unsigned int
7286 aarch64_parse_one_option_token (const char *token,
7287                                 size_t length,
7288                                 const struct aarch64_flag_desc *flag,
7289                                 const char *option_name)
7290 {
7291   for (; flag->name != NULL; flag++)
7292     {
7293       if (length == strlen (flag->name)
7294           && !strncmp (flag->name, token, length))
7295         return flag->flag;
7296     }
7297
7298   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7299   return 0;
7300 }
7301
7302 /* Parse OPTION which is a comma-separated list of flags to enable.
7303    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7304    default state we inherit from the CPU tuning structures.  OPTION_NAME
7305    gives the top-level option we are parsing in the -moverride string,
7306    for use in error messages.  */
7307
7308 static unsigned int
7309 aarch64_parse_boolean_options (const char *option,
7310                                const struct aarch64_flag_desc *flags,
7311                                unsigned int initial_state,
7312                                const char *option_name)
7313 {
7314   const char separator = '.';
7315   const char* specs = option;
7316   const char* ntoken = option;
7317   unsigned int found_flags = initial_state;
7318
7319   while ((ntoken = strchr (specs, separator)))
7320     {
7321       size_t token_length = ntoken - specs;
7322       unsigned token_ops = aarch64_parse_one_option_token (specs,
7323                                                            token_length,
7324                                                            flags,
7325                                                            option_name);
7326       /* If we find "none" (or, for simplicity's sake, an error) anywhere
7327          in the token stream, reset the supported operations.  So:
7328
7329            adrp+add.cmp+branch.none.adrp+add
7330
7331            would have the result of turning on only adrp+add fusion.  */
7332       if (!token_ops)
7333         found_flags = 0;
7334
7335       found_flags |= token_ops;
7336       specs = ++ntoken;
7337     }
7338
7339   /* We ended with a comma, print something.  */
7340   if (!(*specs))
7341     {
7342       error ("%s string ill-formed\n", option_name);
7343       return 0;
7344     }
7345
7346   /* We still have one more token to parse.  */
7347   size_t token_length = strlen (specs);
7348   unsigned token_ops = aarch64_parse_one_option_token (specs,
7349                                                        token_length,
7350                                                        flags,
7351                                                        option_name);
7352    if (!token_ops)
7353      found_flags = 0;
7354
7355   found_flags |= token_ops;
7356   return found_flags;
7357 }
7358
7359 /* Support for overriding instruction fusion.  */
7360
7361 static void
7362 aarch64_parse_fuse_string (const char *fuse_string,
7363                             struct tune_params *tune)
7364 {
7365   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7366                                                      aarch64_fusible_pairs,
7367                                                      tune->fusible_ops,
7368                                                      "fuse=");
7369 }
7370
7371 /* Support for overriding other tuning flags.  */
7372
7373 static void
7374 aarch64_parse_tune_string (const char *tune_string,
7375                             struct tune_params *tune)
7376 {
7377   tune->extra_tuning_flags
7378     = aarch64_parse_boolean_options (tune_string,
7379                                      aarch64_tuning_flags,
7380                                      tune->extra_tuning_flags,
7381                                      "tune=");
7382 }
7383
7384 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
7385    we understand.  If it is, extract the option string and handoff to
7386    the appropriate function.  */
7387
7388 void
7389 aarch64_parse_one_override_token (const char* token,
7390                                   size_t length,
7391                                   struct tune_params *tune)
7392 {
7393   const struct aarch64_tuning_override_function *fn
7394     = aarch64_tuning_override_functions;
7395
7396   const char *option_part = strchr (token, '=');
7397   if (!option_part)
7398     {
7399       error ("tuning string missing in option (%s)", token);
7400       return;
7401     }
7402
7403   /* Get the length of the option name.  */
7404   length = option_part - token;
7405   /* Skip the '=' to get to the option string.  */
7406   option_part++;
7407
7408   for (; fn->name != NULL; fn++)
7409     {
7410       if (!strncmp (fn->name, token, length))
7411         {
7412           fn->parse_override (option_part, tune);
7413           return;
7414         }
7415     }
7416
7417   error ("unknown tuning option (%s)",token);
7418   return;
7419 }
7420
7421 /* Parse STRING looking for options in the format:
7422      string     :: option:string
7423      option     :: name=substring
7424      name       :: {a-z}
7425      substring  :: defined by option.  */
7426
7427 static void
7428 aarch64_parse_override_string (const char* input_string,
7429                                struct tune_params* tune)
7430 {
7431   const char separator = ':';
7432   size_t string_length = strlen (input_string) + 1;
7433   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7434   char *string = string_root;
7435   strncpy (string, input_string, string_length);
7436   string[string_length - 1] = '\0';
7437
7438   char* ntoken = string;
7439
7440   while ((ntoken = strchr (string, separator)))
7441     {
7442       size_t token_length = ntoken - string;
7443       /* Make this substring look like a string.  */
7444       *ntoken = '\0';
7445       aarch64_parse_one_override_token (string, token_length, tune);
7446       string = ++ntoken;
7447     }
7448
7449   /* One last option to parse.  */
7450   aarch64_parse_one_override_token (string, strlen (string), tune);
7451   free (string_root);
7452 }
7453
7454 /* Implement TARGET_OPTION_OVERRIDE.  */
7455
7456 static void
7457 aarch64_override_options (void)
7458 {
7459   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7460      If either of -march or -mtune is given, they override their
7461      respective component of -mcpu.
7462
7463      So, first parse AARCH64_CPU_STRING, then the others, be careful
7464      with -march as, if -mcpu is not present on the command line, march
7465      must set a sensible default CPU.  */
7466   if (aarch64_cpu_string)
7467     {
7468       aarch64_parse_cpu ();
7469     }
7470
7471   if (aarch64_arch_string)
7472     {
7473       aarch64_parse_arch ();
7474     }
7475
7476   if (aarch64_tune_string)
7477     {
7478       aarch64_parse_tune ();
7479     }
7480
7481 #ifndef HAVE_AS_MABI_OPTION
7482   /* The compiler may have been configured with 2.23.* binutils, which does
7483      not have support for ILP32.  */
7484   if (TARGET_ILP32)
7485     error ("Assembler does not support -mabi=ilp32");
7486 #endif
7487
7488   initialize_aarch64_code_model ();
7489
7490   aarch64_build_bitmask_table ();
7491
7492   /* This target defaults to strict volatile bitfields.  */
7493   if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7494     flag_strict_volatile_bitfields = 1;
7495
7496   /* If the user did not specify a processor, choose the default
7497      one for them.  This will be the CPU set during configuration using
7498      --with-cpu, otherwise it is "generic".  */
7499   if (!selected_cpu)
7500     {
7501       selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7502       aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7503     }
7504
7505   gcc_assert (selected_cpu);
7506
7507   if (!selected_tune)
7508     selected_tune = selected_cpu;
7509
7510   aarch64_tune_flags = selected_tune->flags;
7511   aarch64_tune = selected_tune->core;
7512   /* Make a copy of the tuning parameters attached to the core, which
7513      we may later overwrite.  */
7514   aarch64_tune_params = *(selected_tune->tune);
7515   aarch64_architecture_version = selected_cpu->architecture_version;
7516
7517   if (aarch64_override_tune_string)
7518     aarch64_parse_override_string (aarch64_override_tune_string,
7519                                    &aarch64_tune_params);
7520
7521   if (aarch64_fix_a53_err835769 == 2)
7522     {
7523 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7524       aarch64_fix_a53_err835769 = 1;
7525 #else
7526       aarch64_fix_a53_err835769 = 0;
7527 #endif
7528     }
7529
7530   aarch64_register_fma_steering ();
7531
7532   aarch64_override_options_after_change ();
7533 }
7534
7535 /* Implement targetm.override_options_after_change.  */
7536
7537 static void
7538 aarch64_override_options_after_change (void)
7539 {
7540   if (flag_omit_frame_pointer)
7541     flag_omit_leaf_frame_pointer = false;
7542   else if (flag_omit_leaf_frame_pointer)
7543     flag_omit_frame_pointer = true;
7544
7545   /* If not optimizing for size, set the default
7546      alignment to what the target wants */
7547   if (!optimize_size)
7548     {
7549       if (align_loops <= 0)
7550         align_loops = aarch64_tune_params.loop_align;
7551       if (align_jumps <= 0)
7552         align_jumps = aarch64_tune_params.jump_align;
7553       if (align_functions <= 0)
7554         align_functions = aarch64_tune_params.function_align;
7555     }
7556 }
7557
7558 static struct machine_function *
7559 aarch64_init_machine_status (void)
7560 {
7561   struct machine_function *machine;
7562   machine = ggc_cleared_alloc<machine_function> ();
7563   return machine;
7564 }
7565
7566 void
7567 aarch64_init_expanders (void)
7568 {
7569   init_machine_status = aarch64_init_machine_status;
7570 }
7571
7572 /* A checking mechanism for the implementation of the various code models.  */
7573 static void
7574 initialize_aarch64_code_model (void)
7575 {
7576    if (flag_pic)
7577      {
7578        switch (aarch64_cmodel_var)
7579          {
7580          case AARCH64_CMODEL_TINY:
7581            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7582            break;
7583          case AARCH64_CMODEL_SMALL:
7584 #ifdef HAVE_AS_SMALL_PIC_RELOCS
7585            aarch64_cmodel = (flag_pic == 2
7586                              ? AARCH64_CMODEL_SMALL_PIC
7587                              : AARCH64_CMODEL_SMALL_SPIC);
7588 #else
7589            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7590 #endif
7591            break;
7592          case AARCH64_CMODEL_LARGE:
7593            sorry ("code model %qs with -f%s", "large",
7594                   flag_pic > 1 ? "PIC" : "pic");
7595          default:
7596            gcc_unreachable ();
7597          }
7598      }
7599    else
7600      aarch64_cmodel = aarch64_cmodel_var;
7601 }
7602
7603 /* Return true if SYMBOL_REF X binds locally.  */
7604
7605 static bool
7606 aarch64_symbol_binds_local_p (const_rtx x)
7607 {
7608   return (SYMBOL_REF_DECL (x)
7609           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7610           : SYMBOL_REF_LOCAL_P (x));
7611 }
7612
7613 /* Return true if SYMBOL_REF X is thread local */
7614 static bool
7615 aarch64_tls_symbol_p (rtx x)
7616 {
7617   if (! TARGET_HAVE_TLS)
7618     return false;
7619
7620   if (GET_CODE (x) != SYMBOL_REF)
7621     return false;
7622
7623   return SYMBOL_REF_TLS_MODEL (x) != 0;
7624 }
7625
7626 /* Classify a TLS symbol into one of the TLS kinds.  */
7627 enum aarch64_symbol_type
7628 aarch64_classify_tls_symbol (rtx x)
7629 {
7630   enum tls_model tls_kind = tls_symbolic_operand_type (x);
7631
7632   switch (tls_kind)
7633     {
7634     case TLS_MODEL_GLOBAL_DYNAMIC:
7635     case TLS_MODEL_LOCAL_DYNAMIC:
7636       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7637
7638     case TLS_MODEL_INITIAL_EXEC:
7639       return SYMBOL_SMALL_GOTTPREL;
7640
7641     case TLS_MODEL_LOCAL_EXEC:
7642       return SYMBOL_TLSLE;
7643
7644     case TLS_MODEL_EMULATED:
7645     case TLS_MODEL_NONE:
7646       return SYMBOL_FORCE_TO_MEM;
7647
7648     default:
7649       gcc_unreachable ();
7650     }
7651 }
7652
7653 /* Return the method that should be used to access SYMBOL_REF or
7654    LABEL_REF X in context CONTEXT.  */
7655
7656 enum aarch64_symbol_type
7657 aarch64_classify_symbol (rtx x, rtx offset,
7658                          enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7659 {
7660   if (GET_CODE (x) == LABEL_REF)
7661     {
7662       switch (aarch64_cmodel)
7663         {
7664         case AARCH64_CMODEL_LARGE:
7665           return SYMBOL_FORCE_TO_MEM;
7666
7667         case AARCH64_CMODEL_TINY_PIC:
7668         case AARCH64_CMODEL_TINY:
7669           return SYMBOL_TINY_ABSOLUTE;
7670
7671         case AARCH64_CMODEL_SMALL_SPIC:
7672         case AARCH64_CMODEL_SMALL_PIC:
7673         case AARCH64_CMODEL_SMALL:
7674           return SYMBOL_SMALL_ABSOLUTE;
7675
7676         default:
7677           gcc_unreachable ();
7678         }
7679     }
7680
7681   if (GET_CODE (x) == SYMBOL_REF)
7682     {
7683       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7684           return SYMBOL_FORCE_TO_MEM;
7685
7686       if (aarch64_tls_symbol_p (x))
7687         return aarch64_classify_tls_symbol (x);
7688
7689       switch (aarch64_cmodel)
7690         {
7691         case AARCH64_CMODEL_TINY:
7692           /* When we retreive symbol + offset address, we have to make sure
7693              the offset does not cause overflow of the final address.  But
7694              we have no way of knowing the address of symbol at compile time
7695              so we can't accurately say if the distance between the PC and
7696              symbol + offset is outside the addressible range of +/-1M in the
7697              TINY code model.  So we rely on images not being greater than
7698              1M and cap the offset at 1M and anything beyond 1M will have to
7699              be loaded using an alternative mechanism.  */
7700           if (SYMBOL_REF_WEAK (x)
7701               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7702             return SYMBOL_FORCE_TO_MEM;
7703           return SYMBOL_TINY_ABSOLUTE;
7704
7705         case AARCH64_CMODEL_SMALL:
7706           /* Same reasoning as the tiny code model, but the offset cap here is
7707              4G.  */
7708           if (SYMBOL_REF_WEAK (x)
7709               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7710                             HOST_WIDE_INT_C (4294967264)))
7711             return SYMBOL_FORCE_TO_MEM;
7712           return SYMBOL_SMALL_ABSOLUTE;
7713
7714         case AARCH64_CMODEL_TINY_PIC:
7715           if (!aarch64_symbol_binds_local_p (x))
7716             return SYMBOL_TINY_GOT;
7717           return SYMBOL_TINY_ABSOLUTE;
7718
7719         case AARCH64_CMODEL_SMALL_SPIC:
7720         case AARCH64_CMODEL_SMALL_PIC:
7721           if (!aarch64_symbol_binds_local_p (x))
7722             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
7723                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
7724           return SYMBOL_SMALL_ABSOLUTE;
7725
7726         default:
7727           gcc_unreachable ();
7728         }
7729     }
7730
7731   /* By default push everything into the constant pool.  */
7732   return SYMBOL_FORCE_TO_MEM;
7733 }
7734
7735 bool
7736 aarch64_constant_address_p (rtx x)
7737 {
7738   return (CONSTANT_P (x) && memory_address_p (DImode, x));
7739 }
7740
7741 bool
7742 aarch64_legitimate_pic_operand_p (rtx x)
7743 {
7744   if (GET_CODE (x) == SYMBOL_REF
7745       || (GET_CODE (x) == CONST
7746           && GET_CODE (XEXP (x, 0)) == PLUS
7747           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7748      return false;
7749
7750   return true;
7751 }
7752
7753 /* Return true if X holds either a quarter-precision or
7754      floating-point +0.0 constant.  */
7755 static bool
7756 aarch64_valid_floating_const (machine_mode mode, rtx x)
7757 {
7758   if (!CONST_DOUBLE_P (x))
7759     return false;
7760
7761   if (aarch64_float_const_zero_rtx_p (x))
7762     return true;
7763
7764   /* We only handle moving 0.0 to a TFmode register.  */
7765   if (!(mode == SFmode || mode == DFmode))
7766     return false;
7767
7768   return aarch64_float_const_representable_p (x);
7769 }
7770
7771 static bool
7772 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7773 {
7774   /* Do not allow vector struct mode constants.  We could support
7775      0 and -1 easily, but they need support in aarch64-simd.md.  */
7776   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7777     return false;
7778
7779   /* This could probably go away because
7780      we now decompose CONST_INTs according to expand_mov_immediate.  */
7781   if ((GET_CODE (x) == CONST_VECTOR
7782        && aarch64_simd_valid_immediate (x, mode, false, NULL))
7783       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7784         return !targetm.cannot_force_const_mem (mode, x);
7785
7786   if (GET_CODE (x) == HIGH
7787       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7788     return true;
7789
7790   return aarch64_constant_address_p (x);
7791 }
7792
7793 rtx
7794 aarch64_load_tp (rtx target)
7795 {
7796   if (!target
7797       || GET_MODE (target) != Pmode
7798       || !register_operand (target, Pmode))
7799     target = gen_reg_rtx (Pmode);
7800
7801   /* Can return in any reg.  */
7802   emit_insn (gen_aarch64_load_tp_hard (target));
7803   return target;
7804 }
7805
7806 /* On AAPCS systems, this is the "struct __va_list".  */
7807 static GTY(()) tree va_list_type;
7808
7809 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7810    Return the type to use as __builtin_va_list.
7811
7812    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7813
7814    struct __va_list
7815    {
7816      void *__stack;
7817      void *__gr_top;
7818      void *__vr_top;
7819      int   __gr_offs;
7820      int   __vr_offs;
7821    };  */
7822
7823 static tree
7824 aarch64_build_builtin_va_list (void)
7825 {
7826   tree va_list_name;
7827   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7828
7829   /* Create the type.  */
7830   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7831   /* Give it the required name.  */
7832   va_list_name = build_decl (BUILTINS_LOCATION,
7833                              TYPE_DECL,
7834                              get_identifier ("__va_list"),
7835                              va_list_type);
7836   DECL_ARTIFICIAL (va_list_name) = 1;
7837   TYPE_NAME (va_list_type) = va_list_name;
7838   TYPE_STUB_DECL (va_list_type) = va_list_name;
7839
7840   /* Create the fields.  */
7841   f_stack = build_decl (BUILTINS_LOCATION,
7842                         FIELD_DECL, get_identifier ("__stack"),
7843                         ptr_type_node);
7844   f_grtop = build_decl (BUILTINS_LOCATION,
7845                         FIELD_DECL, get_identifier ("__gr_top"),
7846                         ptr_type_node);
7847   f_vrtop = build_decl (BUILTINS_LOCATION,
7848                         FIELD_DECL, get_identifier ("__vr_top"),
7849                         ptr_type_node);
7850   f_groff = build_decl (BUILTINS_LOCATION,
7851                         FIELD_DECL, get_identifier ("__gr_offs"),
7852                         integer_type_node);
7853   f_vroff = build_decl (BUILTINS_LOCATION,
7854                         FIELD_DECL, get_identifier ("__vr_offs"),
7855                         integer_type_node);
7856
7857   DECL_ARTIFICIAL (f_stack) = 1;
7858   DECL_ARTIFICIAL (f_grtop) = 1;
7859   DECL_ARTIFICIAL (f_vrtop) = 1;
7860   DECL_ARTIFICIAL (f_groff) = 1;
7861   DECL_ARTIFICIAL (f_vroff) = 1;
7862
7863   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7864   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7865   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7866   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7867   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7868
7869   TYPE_FIELDS (va_list_type) = f_stack;
7870   DECL_CHAIN (f_stack) = f_grtop;
7871   DECL_CHAIN (f_grtop) = f_vrtop;
7872   DECL_CHAIN (f_vrtop) = f_groff;
7873   DECL_CHAIN (f_groff) = f_vroff;
7874
7875   /* Compute its layout.  */
7876   layout_type (va_list_type);
7877
7878   return va_list_type;
7879 }
7880
7881 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
7882 static void
7883 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7884 {
7885   const CUMULATIVE_ARGS *cum;
7886   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7887   tree stack, grtop, vrtop, groff, vroff;
7888   tree t;
7889   int gr_save_area_size;
7890   int vr_save_area_size;
7891   int vr_offset;
7892
7893   cum = &crtl->args.info;
7894   gr_save_area_size
7895     = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7896   vr_save_area_size
7897     = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7898
7899   if (!TARGET_FLOAT)
7900     {
7901       gcc_assert (cum->aapcs_nvrn == 0);
7902       vr_save_area_size = 0;
7903     }
7904
7905   f_stack = TYPE_FIELDS (va_list_type_node);
7906   f_grtop = DECL_CHAIN (f_stack);
7907   f_vrtop = DECL_CHAIN (f_grtop);
7908   f_groff = DECL_CHAIN (f_vrtop);
7909   f_vroff = DECL_CHAIN (f_groff);
7910
7911   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7912                   NULL_TREE);
7913   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7914                   NULL_TREE);
7915   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7916                   NULL_TREE);
7917   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7918                   NULL_TREE);
7919   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7920                   NULL_TREE);
7921
7922   /* Emit code to initialize STACK, which points to the next varargs stack
7923      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
7924      by named arguments.  STACK is 8-byte aligned.  */
7925   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7926   if (cum->aapcs_stack_size > 0)
7927     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7928   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7929   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7930
7931   /* Emit code to initialize GRTOP, the top of the GR save area.
7932      virtual_incoming_args_rtx should have been 16 byte aligned.  */
7933   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7934   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7935   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7936
7937   /* Emit code to initialize VRTOP, the top of the VR save area.
7938      This address is gr_save_area_bytes below GRTOP, rounded
7939      down to the next 16-byte boundary.  */
7940   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7941   vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7942                              STACK_BOUNDARY / BITS_PER_UNIT);
7943
7944   if (vr_offset)
7945     t = fold_build_pointer_plus_hwi (t, -vr_offset);
7946   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7947   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7948
7949   /* Emit code to initialize GROFF, the offset from GRTOP of the
7950      next GPR argument.  */
7951   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7952               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7953   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7954
7955   /* Likewise emit code to initialize VROFF, the offset from FTOP
7956      of the next VR argument.  */
7957   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7958               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7959   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7960 }
7961
7962 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
7963
7964 static tree
7965 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7966                               gimple_seq *post_p ATTRIBUTE_UNUSED)
7967 {
7968   tree addr;
7969   bool indirect_p;
7970   bool is_ha;           /* is HFA or HVA.  */
7971   bool dw_align;        /* double-word align.  */
7972   machine_mode ag_mode = VOIDmode;
7973   int nregs;
7974   machine_mode mode;
7975
7976   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7977   tree stack, f_top, f_off, off, arg, roundup, on_stack;
7978   HOST_WIDE_INT size, rsize, adjust, align;
7979   tree t, u, cond1, cond2;
7980
7981   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7982   if (indirect_p)
7983     type = build_pointer_type (type);
7984
7985   mode = TYPE_MODE (type);
7986
7987   f_stack = TYPE_FIELDS (va_list_type_node);
7988   f_grtop = DECL_CHAIN (f_stack);
7989   f_vrtop = DECL_CHAIN (f_grtop);
7990   f_groff = DECL_CHAIN (f_vrtop);
7991   f_vroff = DECL_CHAIN (f_groff);
7992
7993   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7994                   f_stack, NULL_TREE);
7995   size = int_size_in_bytes (type);
7996   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7997
7998   dw_align = false;
7999   adjust = 0;
8000   if (aarch64_vfp_is_call_or_return_candidate (mode,
8001                                                type,
8002                                                &ag_mode,
8003                                                &nregs,
8004                                                &is_ha))
8005     {
8006       /* TYPE passed in fp/simd registers.  */
8007       if (!TARGET_FLOAT)
8008         aarch64_err_no_fpadvsimd (mode, "varargs");
8009
8010       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
8011                       unshare_expr (valist), f_vrtop, NULL_TREE);
8012       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
8013                       unshare_expr (valist), f_vroff, NULL_TREE);
8014
8015       rsize = nregs * UNITS_PER_VREG;
8016
8017       if (is_ha)
8018         {
8019           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
8020             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
8021         }
8022       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
8023                && size < UNITS_PER_VREG)
8024         {
8025           adjust = UNITS_PER_VREG - size;
8026         }
8027     }
8028   else
8029     {
8030       /* TYPE passed in general registers.  */
8031       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
8032                       unshare_expr (valist), f_grtop, NULL_TREE);
8033       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
8034                       unshare_expr (valist), f_groff, NULL_TREE);
8035       rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
8036       nregs = rsize / UNITS_PER_WORD;
8037
8038       if (align > 8)
8039         dw_align = true;
8040
8041       if (BLOCK_REG_PADDING (mode, type, 1) == downward
8042           && size < UNITS_PER_WORD)
8043         {
8044           adjust = UNITS_PER_WORD  - size;
8045         }
8046     }
8047
8048   /* Get a local temporary for the field value.  */
8049   off = get_initialized_tmp_var (f_off, pre_p, NULL);
8050
8051   /* Emit code to branch if off >= 0.  */
8052   t = build2 (GE_EXPR, boolean_type_node, off,
8053               build_int_cst (TREE_TYPE (off), 0));
8054   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
8055
8056   if (dw_align)
8057     {
8058       /* Emit: offs = (offs + 15) & -16.  */
8059       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8060                   build_int_cst (TREE_TYPE (off), 15));
8061       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
8062                   build_int_cst (TREE_TYPE (off), -16));
8063       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
8064     }
8065   else
8066     roundup = NULL;
8067
8068   /* Update ap.__[g|v]r_offs  */
8069   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8070               build_int_cst (TREE_TYPE (off), rsize));
8071   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
8072
8073   /* String up.  */
8074   if (roundup)
8075     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8076
8077   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
8078   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
8079               build_int_cst (TREE_TYPE (f_off), 0));
8080   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
8081
8082   /* String up: make sure the assignment happens before the use.  */
8083   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
8084   COND_EXPR_ELSE (cond1) = t;
8085
8086   /* Prepare the trees handling the argument that is passed on the stack;
8087      the top level node will store in ON_STACK.  */
8088   arg = get_initialized_tmp_var (stack, pre_p, NULL);
8089   if (align > 8)
8090     {
8091       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
8092       t = fold_convert (intDI_type_node, arg);
8093       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8094                   build_int_cst (TREE_TYPE (t), 15));
8095       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8096                   build_int_cst (TREE_TYPE (t), -16));
8097       t = fold_convert (TREE_TYPE (arg), t);
8098       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
8099     }
8100   else
8101     roundup = NULL;
8102   /* Advance ap.__stack  */
8103   t = fold_convert (intDI_type_node, arg);
8104   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8105               build_int_cst (TREE_TYPE (t), size + 7));
8106   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8107               build_int_cst (TREE_TYPE (t), -8));
8108   t = fold_convert (TREE_TYPE (arg), t);
8109   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
8110   /* String up roundup and advance.  */
8111   if (roundup)
8112     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8113   /* String up with arg */
8114   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
8115   /* Big-endianness related address adjustment.  */
8116   if (BLOCK_REG_PADDING (mode, type, 1) == downward
8117       && size < UNITS_PER_WORD)
8118   {
8119     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
8120                 size_int (UNITS_PER_WORD - size));
8121     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
8122   }
8123
8124   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
8125   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
8126
8127   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
8128   t = off;
8129   if (adjust)
8130     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
8131                 build_int_cst (TREE_TYPE (off), adjust));
8132
8133   t = fold_convert (sizetype, t);
8134   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
8135
8136   if (is_ha)
8137     {
8138       /* type ha; // treat as "struct {ftype field[n];}"
8139          ... [computing offs]
8140          for (i = 0; i <nregs; ++i, offs += 16)
8141            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
8142          return ha;  */
8143       int i;
8144       tree tmp_ha, field_t, field_ptr_t;
8145
8146       /* Declare a local variable.  */
8147       tmp_ha = create_tmp_var_raw (type, "ha");
8148       gimple_add_tmp_var (tmp_ha);
8149
8150       /* Establish the base type.  */
8151       switch (ag_mode)
8152         {
8153         case SFmode:
8154           field_t = float_type_node;
8155           field_ptr_t = float_ptr_type_node;
8156           break;
8157         case DFmode:
8158           field_t = double_type_node;
8159           field_ptr_t = double_ptr_type_node;
8160           break;
8161         case TFmode:
8162           field_t = long_double_type_node;
8163           field_ptr_t = long_double_ptr_type_node;
8164           break;
8165 /* The half precision and quad precision are not fully supported yet.  Enable
8166    the following code after the support is complete.  Need to find the correct
8167    type node for __fp16 *.  */
8168 #if 0
8169         case HFmode:
8170           field_t = float_type_node;
8171           field_ptr_t = float_ptr_type_node;
8172           break;
8173 #endif
8174         case V2SImode:
8175         case V4SImode:
8176             {
8177               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
8178               field_t = build_vector_type_for_mode (innertype, ag_mode);
8179               field_ptr_t = build_pointer_type (field_t);
8180             }
8181           break;
8182         default:
8183           gcc_assert (0);
8184         }
8185
8186       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
8187       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
8188       addr = t;
8189       t = fold_convert (field_ptr_t, addr);
8190       t = build2 (MODIFY_EXPR, field_t,
8191                   build1 (INDIRECT_REF, field_t, tmp_ha),
8192                   build1 (INDIRECT_REF, field_t, t));
8193
8194       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
8195       for (i = 1; i < nregs; ++i)
8196         {
8197           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
8198           u = fold_convert (field_ptr_t, addr);
8199           u = build2 (MODIFY_EXPR, field_t,
8200                       build2 (MEM_REF, field_t, tmp_ha,
8201                               build_int_cst (field_ptr_t,
8202                                              (i *
8203                                               int_size_in_bytes (field_t)))),
8204                       build1 (INDIRECT_REF, field_t, u));
8205           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
8206         }
8207
8208       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
8209       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
8210     }
8211
8212   COND_EXPR_ELSE (cond2) = t;
8213   addr = fold_convert (build_pointer_type (type), cond1);
8214   addr = build_va_arg_indirect_ref (addr);
8215
8216   if (indirect_p)
8217     addr = build_va_arg_indirect_ref (addr);
8218
8219   return addr;
8220 }
8221
8222 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
8223
8224 static void
8225 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
8226                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8227                                 int no_rtl)
8228 {
8229   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8230   CUMULATIVE_ARGS local_cum;
8231   int gr_saved, vr_saved;
8232
8233   /* The caller has advanced CUM up to, but not beyond, the last named
8234      argument.  Advance a local copy of CUM past the last "real" named
8235      argument, to find out how many registers are left over.  */
8236   local_cum = *cum;
8237   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
8238
8239   /* Found out how many registers we need to save.  */
8240   gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
8241   vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
8242
8243   if (!TARGET_FLOAT)
8244     {
8245       gcc_assert (local_cum.aapcs_nvrn == 0);
8246       vr_saved = 0;
8247     }
8248
8249   if (!no_rtl)
8250     {
8251       if (gr_saved > 0)
8252         {
8253           rtx ptr, mem;
8254
8255           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
8256           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
8257                                - gr_saved * UNITS_PER_WORD);
8258           mem = gen_frame_mem (BLKmode, ptr);
8259           set_mem_alias_set (mem, get_varargs_alias_set ());
8260
8261           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
8262                                mem, gr_saved);
8263         }
8264       if (vr_saved > 0)
8265         {
8266           /* We can't use move_block_from_reg, because it will use
8267              the wrong mode, storing D regs only.  */
8268           machine_mode mode = TImode;
8269           int off, i;
8270
8271           /* Set OFF to the offset from virtual_incoming_args_rtx of
8272              the first vector register.  The VR save area lies below
8273              the GR one, and is aligned to 16 bytes.  */
8274           off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8275                                    STACK_BOUNDARY / BITS_PER_UNIT);
8276           off -= vr_saved * UNITS_PER_VREG;
8277
8278           for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
8279             {
8280               rtx ptr, mem;
8281
8282               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
8283               mem = gen_frame_mem (mode, ptr);
8284               set_mem_alias_set (mem, get_varargs_alias_set ());
8285               aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
8286               off += UNITS_PER_VREG;
8287             }
8288         }
8289     }
8290
8291   /* We don't save the size into *PRETEND_SIZE because we want to avoid
8292      any complication of having crtl->args.pretend_args_size changed.  */
8293   cfun->machine->frame.saved_varargs_size
8294     = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8295                       STACK_BOUNDARY / BITS_PER_UNIT)
8296        + vr_saved * UNITS_PER_VREG);
8297 }
8298
8299 static void
8300 aarch64_conditional_register_usage (void)
8301 {
8302   int i;
8303   if (!TARGET_FLOAT)
8304     {
8305       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
8306         {
8307           fixed_regs[i] = 1;
8308           call_used_regs[i] = 1;
8309         }
8310     }
8311 }
8312
8313 /* Walk down the type tree of TYPE counting consecutive base elements.
8314    If *MODEP is VOIDmode, then set it to the first valid floating point
8315    type.  If a non-floating point type is found, or if a floating point
8316    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
8317    otherwise return the count in the sub-tree.  */
8318 static int
8319 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8320 {
8321   machine_mode mode;
8322   HOST_WIDE_INT size;
8323
8324   switch (TREE_CODE (type))
8325     {
8326     case REAL_TYPE:
8327       mode = TYPE_MODE (type);
8328       if (mode != DFmode && mode != SFmode && mode != TFmode)
8329         return -1;
8330
8331       if (*modep == VOIDmode)
8332         *modep = mode;
8333
8334       if (*modep == mode)
8335         return 1;
8336
8337       break;
8338
8339     case COMPLEX_TYPE:
8340       mode = TYPE_MODE (TREE_TYPE (type));
8341       if (mode != DFmode && mode != SFmode && mode != TFmode)
8342         return -1;
8343
8344       if (*modep == VOIDmode)
8345         *modep = mode;
8346
8347       if (*modep == mode)
8348         return 2;
8349
8350       break;
8351
8352     case VECTOR_TYPE:
8353       /* Use V2SImode and V4SImode as representatives of all 64-bit
8354          and 128-bit vector types.  */
8355       size = int_size_in_bytes (type);
8356       switch (size)
8357         {
8358         case 8:
8359           mode = V2SImode;
8360           break;
8361         case 16:
8362           mode = V4SImode;
8363           break;
8364         default:
8365           return -1;
8366         }
8367
8368       if (*modep == VOIDmode)
8369         *modep = mode;
8370
8371       /* Vector modes are considered to be opaque: two vectors are
8372          equivalent for the purposes of being homogeneous aggregates
8373          if they are the same size.  */
8374       if (*modep == mode)
8375         return 1;
8376
8377       break;
8378
8379     case ARRAY_TYPE:
8380       {
8381         int count;
8382         tree index = TYPE_DOMAIN (type);
8383
8384         /* Can't handle incomplete types nor sizes that are not
8385            fixed.  */
8386         if (!COMPLETE_TYPE_P (type)
8387             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8388           return -1;
8389
8390         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8391         if (count == -1
8392             || !index
8393             || !TYPE_MAX_VALUE (index)
8394             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8395             || !TYPE_MIN_VALUE (index)
8396             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8397             || count < 0)
8398           return -1;
8399
8400         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8401                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8402
8403         /* There must be no padding.  */
8404         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8405           return -1;
8406
8407         return count;
8408       }
8409
8410     case RECORD_TYPE:
8411       {
8412         int count = 0;
8413         int sub_count;
8414         tree field;
8415
8416         /* Can't handle incomplete types nor sizes that are not
8417            fixed.  */
8418         if (!COMPLETE_TYPE_P (type)
8419             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8420           return -1;
8421
8422         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8423           {
8424             if (TREE_CODE (field) != FIELD_DECL)
8425               continue;
8426
8427             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8428             if (sub_count < 0)
8429               return -1;
8430             count += sub_count;
8431           }
8432
8433         /* There must be no padding.  */
8434         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8435           return -1;
8436
8437         return count;
8438       }
8439
8440     case UNION_TYPE:
8441     case QUAL_UNION_TYPE:
8442       {
8443         /* These aren't very interesting except in a degenerate case.  */
8444         int count = 0;
8445         int sub_count;
8446         tree field;
8447
8448         /* Can't handle incomplete types nor sizes that are not
8449            fixed.  */
8450         if (!COMPLETE_TYPE_P (type)
8451             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8452           return -1;
8453
8454         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8455           {
8456             if (TREE_CODE (field) != FIELD_DECL)
8457               continue;
8458
8459             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8460             if (sub_count < 0)
8461               return -1;
8462             count = count > sub_count ? count : sub_count;
8463           }
8464
8465         /* There must be no padding.  */
8466         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8467           return -1;
8468
8469         return count;
8470       }
8471
8472     default:
8473       break;
8474     }
8475
8476   return -1;
8477 }
8478
8479 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8480    type as described in AAPCS64 \S 4.1.2.
8481
8482    See the comment above aarch64_composite_type_p for the notes on MODE.  */
8483
8484 static bool
8485 aarch64_short_vector_p (const_tree type,
8486                         machine_mode mode)
8487 {
8488   HOST_WIDE_INT size = -1;
8489
8490   if (type && TREE_CODE (type) == VECTOR_TYPE)
8491     size = int_size_in_bytes (type);
8492   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8493             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8494     size = GET_MODE_SIZE (mode);
8495
8496   return (size == 8 || size == 16);
8497 }
8498
8499 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8500    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
8501    array types.  The C99 floating-point complex types are also considered
8502    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
8503    types, which are GCC extensions and out of the scope of AAPCS64, are
8504    treated as composite types here as well.
8505
8506    Note that MODE itself is not sufficient in determining whether a type
8507    is such a composite type or not.  This is because
8508    stor-layout.c:compute_record_mode may have already changed the MODE
8509    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
8510    structure with only one field may have its MODE set to the mode of the
8511    field.  Also an integer mode whose size matches the size of the
8512    RECORD_TYPE type may be used to substitute the original mode
8513    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
8514    solely relied on.  */
8515
8516 static bool
8517 aarch64_composite_type_p (const_tree type,
8518                           machine_mode mode)
8519 {
8520   if (aarch64_short_vector_p (type, mode))
8521     return false;
8522
8523   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8524     return true;
8525
8526   if (mode == BLKmode
8527       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8528       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8529     return true;
8530
8531   return false;
8532 }
8533
8534 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8535    shall be passed or returned in simd/fp register(s) (providing these
8536    parameter passing registers are available).
8537
8538    Upon successful return, *COUNT returns the number of needed registers,
8539    *BASE_MODE returns the mode of the individual register and when IS_HAF
8540    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8541    floating-point aggregate or a homogeneous short-vector aggregate.  */
8542
8543 static bool
8544 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8545                                          const_tree type,
8546                                          machine_mode *base_mode,
8547                                          int *count,
8548                                          bool *is_ha)
8549 {
8550   machine_mode new_mode = VOIDmode;
8551   bool composite_p = aarch64_composite_type_p (type, mode);
8552
8553   if (is_ha != NULL) *is_ha = false;
8554
8555   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8556       || aarch64_short_vector_p (type, mode))
8557     {
8558       *count = 1;
8559       new_mode = mode;
8560     }
8561   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8562     {
8563       if (is_ha != NULL) *is_ha = true;
8564       *count = 2;
8565       new_mode = GET_MODE_INNER (mode);
8566     }
8567   else if (type && composite_p)
8568     {
8569       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8570
8571       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8572         {
8573           if (is_ha != NULL) *is_ha = true;
8574           *count = ag_count;
8575         }
8576       else
8577         return false;
8578     }
8579   else
8580     return false;
8581
8582   *base_mode = new_mode;
8583   return true;
8584 }
8585
8586 /* Implement TARGET_STRUCT_VALUE_RTX.  */
8587
8588 static rtx
8589 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8590                           int incoming ATTRIBUTE_UNUSED)
8591 {
8592   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8593 }
8594
8595 /* Implements target hook vector_mode_supported_p.  */
8596 static bool
8597 aarch64_vector_mode_supported_p (machine_mode mode)
8598 {
8599   if (TARGET_SIMD
8600       && (mode == V4SImode  || mode == V8HImode
8601           || mode == V16QImode || mode == V2DImode
8602           || mode == V2SImode  || mode == V4HImode
8603           || mode == V8QImode || mode == V2SFmode
8604           || mode == V4SFmode || mode == V2DFmode
8605           || mode == V1DFmode))
8606     return true;
8607
8608   return false;
8609 }
8610
8611 /* Return appropriate SIMD container
8612    for MODE within a vector of WIDTH bits.  */
8613 static machine_mode
8614 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8615 {
8616   gcc_assert (width == 64 || width == 128);
8617   if (TARGET_SIMD)
8618     {
8619       if (width == 128)
8620         switch (mode)
8621           {
8622           case DFmode:
8623             return V2DFmode;
8624           case SFmode:
8625             return V4SFmode;
8626           case SImode:
8627             return V4SImode;
8628           case HImode:
8629             return V8HImode;
8630           case QImode:
8631             return V16QImode;
8632           case DImode:
8633             return V2DImode;
8634           default:
8635             break;
8636           }
8637       else
8638         switch (mode)
8639           {
8640           case SFmode:
8641             return V2SFmode;
8642           case SImode:
8643             return V2SImode;
8644           case HImode:
8645             return V4HImode;
8646           case QImode:
8647             return V8QImode;
8648           default:
8649             break;
8650           }
8651     }
8652   return word_mode;
8653 }
8654
8655 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
8656 static machine_mode
8657 aarch64_preferred_simd_mode (machine_mode mode)
8658 {
8659   return aarch64_simd_container_mode (mode, 128);
8660 }
8661
8662 /* Return the bitmask of possible vector sizes for the vectorizer
8663    to iterate over.  */
8664 static unsigned int
8665 aarch64_autovectorize_vector_sizes (void)
8666 {
8667   return (16 | 8);
8668 }
8669
8670 /* Implement TARGET_MANGLE_TYPE.  */
8671
8672 static const char *
8673 aarch64_mangle_type (const_tree type)
8674 {
8675   /* The AArch64 ABI documents say that "__va_list" has to be
8676      managled as if it is in the "std" namespace.  */
8677   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8678     return "St9__va_list";
8679
8680   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
8681      builtin types.  */
8682   if (TYPE_NAME (type) != NULL)
8683     return aarch64_mangle_builtin_type (type);
8684
8685   /* Use the default mangling.  */
8686   return NULL;
8687 }
8688
8689
8690 /* Return true if the rtx_insn contains a MEM RTX somewhere
8691    in it.  */
8692
8693 static bool
8694 has_memory_op (rtx_insn *mem_insn)
8695 {
8696   subrtx_iterator::array_type array;
8697   FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8698     if (MEM_P (*iter))
8699       return true;
8700
8701   return false;
8702 }
8703
8704 /* Find the first rtx_insn before insn that will generate an assembly
8705    instruction.  */
8706
8707 static rtx_insn *
8708 aarch64_prev_real_insn (rtx_insn *insn)
8709 {
8710   if (!insn)
8711     return NULL;
8712
8713   do
8714     {
8715       insn = prev_real_insn (insn);
8716     }
8717   while (insn && recog_memoized (insn) < 0);
8718
8719   return insn;
8720 }
8721
8722 static bool
8723 is_madd_op (enum attr_type t1)
8724 {
8725   unsigned int i;
8726   /* A number of these may be AArch32 only.  */
8727   enum attr_type mlatypes[] = {
8728     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8729     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8730     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8731   };
8732
8733   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8734     {
8735       if (t1 == mlatypes[i])
8736         return true;
8737     }
8738
8739   return false;
8740 }
8741
8742 /* Check if there is a register dependency between a load and the insn
8743    for which we hold recog_data.  */
8744
8745 static bool
8746 dep_between_memop_and_curr (rtx memop)
8747 {
8748   rtx load_reg;
8749   int opno;
8750
8751   gcc_assert (GET_CODE (memop) == SET);
8752
8753   if (!REG_P (SET_DEST (memop)))
8754     return false;
8755
8756   load_reg = SET_DEST (memop);
8757   for (opno = 1; opno < recog_data.n_operands; opno++)
8758     {
8759       rtx operand = recog_data.operand[opno];
8760       if (REG_P (operand)
8761           && reg_overlap_mentioned_p (load_reg, operand))
8762         return true;
8763
8764     }
8765   return false;
8766 }
8767
8768
8769 /* When working around the Cortex-A53 erratum 835769,
8770    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8771    instruction and has a preceding memory instruction such that a NOP
8772    should be inserted between them.  */
8773
8774 bool
8775 aarch64_madd_needs_nop (rtx_insn* insn)
8776 {
8777   enum attr_type attr_type;
8778   rtx_insn *prev;
8779   rtx body;
8780
8781   if (!aarch64_fix_a53_err835769)
8782     return false;
8783
8784   if (recog_memoized (insn) < 0)
8785     return false;
8786
8787   attr_type = get_attr_type (insn);
8788   if (!is_madd_op (attr_type))
8789     return false;
8790
8791   prev = aarch64_prev_real_insn (insn);
8792   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8793      Restore recog state to INSN to avoid state corruption.  */
8794   extract_constrain_insn_cached (insn);
8795
8796   if (!prev || !has_memory_op (prev))
8797     return false;
8798
8799   body = single_set (prev);
8800
8801   /* If the previous insn is a memory op and there is no dependency between
8802      it and the DImode madd, emit a NOP between them.  If body is NULL then we
8803      have a complex memory operation, probably a load/store pair.
8804      Be conservative for now and emit a NOP.  */
8805   if (GET_MODE (recog_data.operand[0]) == DImode
8806       && (!body || !dep_between_memop_and_curr (body)))
8807     return true;
8808
8809   return false;
8810
8811 }
8812
8813
8814 /* Implement FINAL_PRESCAN_INSN.  */
8815
8816 void
8817 aarch64_final_prescan_insn (rtx_insn *insn)
8818 {
8819   if (aarch64_madd_needs_nop (insn))
8820     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8821 }
8822
8823
8824 /* Return the equivalent letter for size.  */
8825 static char
8826 sizetochar (int size)
8827 {
8828   switch (size)
8829     {
8830     case 64: return 'd';
8831     case 32: return 's';
8832     case 16: return 'h';
8833     case 8 : return 'b';
8834     default: gcc_unreachable ();
8835     }
8836 }
8837
8838 /* Return true iff x is a uniform vector of floating-point
8839    constants, and the constant can be represented in
8840    quarter-precision form.  Note, as aarch64_float_const_representable
8841    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
8842 static bool
8843 aarch64_vect_float_const_representable_p (rtx x)
8844 {
8845   int i = 0;
8846   REAL_VALUE_TYPE r0, ri;
8847   rtx x0, xi;
8848
8849   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8850     return false;
8851
8852   x0 = CONST_VECTOR_ELT (x, 0);
8853   if (!CONST_DOUBLE_P (x0))
8854     return false;
8855
8856   REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8857
8858   for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8859     {
8860       xi = CONST_VECTOR_ELT (x, i);
8861       if (!CONST_DOUBLE_P (xi))
8862         return false;
8863
8864       REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8865       if (!REAL_VALUES_EQUAL (r0, ri))
8866         return false;
8867     }
8868
8869   return aarch64_float_const_representable_p (x0);
8870 }
8871
8872 /* Return true for valid and false for invalid.  */
8873 bool
8874 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8875                               struct simd_immediate_info *info)
8876 {
8877 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
8878   matches = 1;                                          \
8879   for (i = 0; i < idx; i += (STRIDE))                   \
8880     if (!(TEST))                                        \
8881       matches = 0;                                      \
8882   if (matches)                                          \
8883     {                                                   \
8884       immtype = (CLASS);                                \
8885       elsize = (ELSIZE);                                \
8886       eshift = (SHIFT);                                 \
8887       emvn = (NEG);                                     \
8888       break;                                            \
8889     }
8890
8891   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8892   unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8893   unsigned char bytes[16];
8894   int immtype = -1, matches;
8895   unsigned int invmask = inverse ? 0xff : 0;
8896   int eshift, emvn;
8897
8898   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8899     {
8900       if (! (aarch64_simd_imm_zero_p (op, mode)
8901              || aarch64_vect_float_const_representable_p (op)))
8902         return false;
8903
8904       if (info)
8905         {
8906           info->value = CONST_VECTOR_ELT (op, 0);
8907           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8908           info->mvn = false;
8909           info->shift = 0;
8910         }
8911
8912       return true;
8913     }
8914
8915   /* Splat vector constant out into a byte vector.  */
8916   for (i = 0; i < n_elts; i++)
8917     {
8918       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
8919          it must be laid out in the vector register in reverse order.  */
8920       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8921       unsigned HOST_WIDE_INT elpart;
8922       unsigned int part, parts;
8923
8924       if (CONST_INT_P (el))
8925         {
8926           elpart = INTVAL (el);
8927           parts = 1;
8928         }
8929       else if (GET_CODE (el) == CONST_DOUBLE)
8930         {
8931           elpart = CONST_DOUBLE_LOW (el);
8932           parts = 2;
8933         }
8934       else
8935         gcc_unreachable ();
8936
8937       for (part = 0; part < parts; part++)
8938         {
8939           unsigned int byte;
8940           for (byte = 0; byte < innersize; byte++)
8941             {
8942               bytes[idx++] = (elpart & 0xff) ^ invmask;
8943               elpart >>= BITS_PER_UNIT;
8944             }
8945           if (GET_CODE (el) == CONST_DOUBLE)
8946             elpart = CONST_DOUBLE_HIGH (el);
8947         }
8948     }
8949
8950   /* Sanity check.  */
8951   gcc_assert (idx == GET_MODE_SIZE (mode));
8952
8953   do
8954     {
8955       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8956              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8957
8958       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8959              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8960
8961       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8962              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8963
8964       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8965              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8966
8967       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8968
8969       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8970
8971       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8972              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8973
8974       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8975              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8976
8977       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8978              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8979
8980       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8981              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8982
8983       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8984
8985       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8986
8987       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8988              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8989
8990       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8991              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8992
8993       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8994              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8995
8996       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8997              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8998
8999       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
9000
9001       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
9002              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
9003     }
9004   while (0);
9005
9006   if (immtype == -1)
9007     return false;
9008
9009   if (info)
9010     {
9011       info->element_width = elsize;
9012       info->mvn = emvn != 0;
9013       info->shift = eshift;
9014
9015       unsigned HOST_WIDE_INT imm = 0;
9016
9017       if (immtype >= 12 && immtype <= 15)
9018         info->msl = true;
9019
9020       /* Un-invert bytes of recognized vector, if necessary.  */
9021       if (invmask != 0)
9022         for (i = 0; i < idx; i++)
9023           bytes[i] ^= invmask;
9024
9025       if (immtype == 17)
9026         {
9027           /* FIXME: Broken on 32-bit H_W_I hosts.  */
9028           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
9029
9030           for (i = 0; i < 8; i++)
9031             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
9032               << (i * BITS_PER_UNIT);
9033
9034
9035           info->value = GEN_INT (imm);
9036         }
9037       else
9038         {
9039           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
9040             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
9041
9042           /* Construct 'abcdefgh' because the assembler cannot handle
9043              generic constants.  */
9044           if (info->mvn)
9045             imm = ~imm;
9046           imm = (imm >> info->shift) & 0xff;
9047           info->value = GEN_INT (imm);
9048         }
9049     }
9050
9051   return true;
9052 #undef CHECK
9053 }
9054
9055 /* Check of immediate shift constants are within range.  */
9056 bool
9057 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
9058 {
9059   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
9060   if (left)
9061     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
9062   else
9063     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
9064 }
9065
9066 /* Return true if X is a uniform vector where all elements
9067    are either the floating-point constant 0.0 or the
9068    integer constant 0.  */
9069 bool
9070 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
9071 {
9072   return x == CONST0_RTX (mode);
9073 }
9074
9075 bool
9076 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
9077 {
9078   HOST_WIDE_INT imm = INTVAL (x);
9079   int i;
9080
9081   for (i = 0; i < 8; i++)
9082     {
9083       unsigned int byte = imm & 0xff;
9084       if (byte != 0xff && byte != 0)
9085        return false;
9086       imm >>= 8;
9087     }
9088
9089   return true;
9090 }
9091
9092 bool
9093 aarch64_mov_operand_p (rtx x,
9094                        enum aarch64_symbol_context context,
9095                        machine_mode mode)
9096 {
9097   if (GET_CODE (x) == HIGH
9098       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9099     return true;
9100
9101   if (CONST_INT_P (x))
9102     return true;
9103
9104   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
9105     return true;
9106
9107   return aarch64_classify_symbolic_expression (x, context)
9108     == SYMBOL_TINY_ABSOLUTE;
9109 }
9110
9111 /* Return a const_int vector of VAL.  */
9112 rtx
9113 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
9114 {
9115   int nunits = GET_MODE_NUNITS (mode);
9116   rtvec v = rtvec_alloc (nunits);
9117   int i;
9118
9119   for (i=0; i < nunits; i++)
9120     RTVEC_ELT (v, i) = GEN_INT (val);
9121
9122   return gen_rtx_CONST_VECTOR (mode, v);
9123 }
9124
9125 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
9126
9127 bool
9128 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
9129 {
9130   machine_mode vmode;
9131
9132   gcc_assert (!VECTOR_MODE_P (mode));
9133   vmode = aarch64_preferred_simd_mode (mode);
9134   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
9135   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
9136 }
9137
9138 /* Construct and return a PARALLEL RTX vector with elements numbering the
9139    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
9140    the vector - from the perspective of the architecture.  This does not
9141    line up with GCC's perspective on lane numbers, so we end up with
9142    different masks depending on our target endian-ness.  The diagram
9143    below may help.  We must draw the distinction when building masks
9144    which select one half of the vector.  An instruction selecting
9145    architectural low-lanes for a big-endian target, must be described using
9146    a mask selecting GCC high-lanes.
9147
9148                  Big-Endian             Little-Endian
9149
9150 GCC             0   1   2   3           3   2   1   0
9151               | x | x | x | x |       | x | x | x | x |
9152 Architecture    3   2   1   0           3   2   1   0
9153
9154 Low Mask:         { 2, 3 }                { 0, 1 }
9155 High Mask:        { 0, 1 }                { 2, 3 }
9156 */
9157
9158 rtx
9159 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
9160 {
9161   int nunits = GET_MODE_NUNITS (mode);
9162   rtvec v = rtvec_alloc (nunits / 2);
9163   int high_base = nunits / 2;
9164   int low_base = 0;
9165   int base;
9166   rtx t1;
9167   int i;
9168
9169   if (BYTES_BIG_ENDIAN)
9170     base = high ? low_base : high_base;
9171   else
9172     base = high ? high_base : low_base;
9173
9174   for (i = 0; i < nunits / 2; i++)
9175     RTVEC_ELT (v, i) = GEN_INT (base + i);
9176
9177   t1 = gen_rtx_PARALLEL (mode, v);
9178   return t1;
9179 }
9180
9181 /* Check OP for validity as a PARALLEL RTX vector with elements
9182    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
9183    from the perspective of the architecture.  See the diagram above
9184    aarch64_simd_vect_par_cnst_half for more details.  */
9185
9186 bool
9187 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
9188                                        bool high)
9189 {
9190   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
9191   HOST_WIDE_INT count_op = XVECLEN (op, 0);
9192   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
9193   int i = 0;
9194
9195   if (!VECTOR_MODE_P (mode))
9196     return false;
9197
9198   if (count_op != count_ideal)
9199     return false;
9200
9201   for (i = 0; i < count_ideal; i++)
9202     {
9203       rtx elt_op = XVECEXP (op, 0, i);
9204       rtx elt_ideal = XVECEXP (ideal, 0, i);
9205
9206       if (!CONST_INT_P (elt_op)
9207           || INTVAL (elt_ideal) != INTVAL (elt_op))
9208         return false;
9209     }
9210   return true;
9211 }
9212
9213 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
9214    HIGH (exclusive).  */
9215 void
9216 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
9217                           const_tree exp)
9218 {
9219   HOST_WIDE_INT lane;
9220   gcc_assert (CONST_INT_P (operand));
9221   lane = INTVAL (operand);
9222
9223   if (lane < low || lane >= high)
9224   {
9225     if (exp)
9226       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
9227     else
9228       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
9229   }
9230 }
9231
9232 /* Return TRUE if OP is a valid vector addressing mode.  */
9233 bool
9234 aarch64_simd_mem_operand_p (rtx op)
9235 {
9236   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
9237                         || REG_P (XEXP (op, 0)));
9238 }
9239
9240 /* Emit a register copy from operand to operand, taking care not to
9241    early-clobber source registers in the process.
9242
9243    COUNT is the number of components into which the copy needs to be
9244    decomposed.  */
9245 void
9246 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
9247                                 unsigned int count)
9248 {
9249   unsigned int i;
9250   int rdest = REGNO (operands[0]);
9251   int rsrc = REGNO (operands[1]);
9252
9253   if (!reg_overlap_mentioned_p (operands[0], operands[1])
9254       || rdest < rsrc)
9255     for (i = 0; i < count; i++)
9256       emit_move_insn (gen_rtx_REG (mode, rdest + i),
9257                       gen_rtx_REG (mode, rsrc + i));
9258   else
9259     for (i = 0; i < count; i++)
9260       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
9261                       gen_rtx_REG (mode, rsrc + count - i - 1));
9262 }
9263
9264 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
9265    one of VSTRUCT modes: OI, CI or XI.  */
9266 int
9267 aarch64_simd_attr_length_move (rtx_insn *insn)
9268 {
9269   machine_mode mode;
9270
9271   extract_insn_cached (insn);
9272
9273   if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
9274     {
9275       mode = GET_MODE (recog_data.operand[0]);
9276       switch (mode)
9277         {
9278         case OImode:
9279           return 8;
9280         case CImode:
9281           return 12;
9282         case XImode:
9283           return 16;
9284         default:
9285           gcc_unreachable ();
9286         }
9287     }
9288   return 4;
9289 }
9290
9291 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
9292    one of VSTRUCT modes: OI, CI, EI, or XI.  */
9293 int
9294 aarch64_simd_attr_length_rglist (enum machine_mode mode)
9295 {
9296   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
9297 }
9298
9299 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
9300    alignment of a vector to 128 bits.  */
9301 static HOST_WIDE_INT
9302 aarch64_simd_vector_alignment (const_tree type)
9303 {
9304   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
9305   return MIN (align, 128);
9306 }
9307
9308 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
9309 static bool
9310 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
9311 {
9312   if (is_packed)
9313     return false;
9314
9315   /* We guarantee alignment for vectors up to 128-bits.  */
9316   if (tree_int_cst_compare (TYPE_SIZE (type),
9317                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
9318     return false;
9319
9320   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
9321   return true;
9322 }
9323
9324 /* If VALS is a vector constant that can be loaded into a register
9325    using DUP, generate instructions to do so and return an RTX to
9326    assign to the register.  Otherwise return NULL_RTX.  */
9327 static rtx
9328 aarch64_simd_dup_constant (rtx vals)
9329 {
9330   machine_mode mode = GET_MODE (vals);
9331   machine_mode inner_mode = GET_MODE_INNER (mode);
9332   int n_elts = GET_MODE_NUNITS (mode);
9333   bool all_same = true;
9334   rtx x;
9335   int i;
9336
9337   if (GET_CODE (vals) != CONST_VECTOR)
9338     return NULL_RTX;
9339
9340   for (i = 1; i < n_elts; ++i)
9341     {
9342       x = CONST_VECTOR_ELT (vals, i);
9343       if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9344         all_same = false;
9345     }
9346
9347   if (!all_same)
9348     return NULL_RTX;
9349
9350   /* We can load this constant by using DUP and a constant in a
9351      single ARM register.  This will be cheaper than a vector
9352      load.  */
9353   x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9354   return gen_rtx_VEC_DUPLICATE (mode, x);
9355 }
9356
9357
9358 /* Generate code to load VALS, which is a PARALLEL containing only
9359    constants (for vec_init) or CONST_VECTOR, efficiently into a
9360    register.  Returns an RTX to copy into the register, or NULL_RTX
9361    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
9362 static rtx
9363 aarch64_simd_make_constant (rtx vals)
9364 {
9365   machine_mode mode = GET_MODE (vals);
9366   rtx const_dup;
9367   rtx const_vec = NULL_RTX;
9368   int n_elts = GET_MODE_NUNITS (mode);
9369   int n_const = 0;
9370   int i;
9371
9372   if (GET_CODE (vals) == CONST_VECTOR)
9373     const_vec = vals;
9374   else if (GET_CODE (vals) == PARALLEL)
9375     {
9376       /* A CONST_VECTOR must contain only CONST_INTs and
9377          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9378          Only store valid constants in a CONST_VECTOR.  */
9379       for (i = 0; i < n_elts; ++i)
9380         {
9381           rtx x = XVECEXP (vals, 0, i);
9382           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9383             n_const++;
9384         }
9385       if (n_const == n_elts)
9386         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9387     }
9388   else
9389     gcc_unreachable ();
9390
9391   if (const_vec != NULL_RTX
9392       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9393     /* Load using MOVI/MVNI.  */
9394     return const_vec;
9395   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9396     /* Loaded using DUP.  */
9397     return const_dup;
9398   else if (const_vec != NULL_RTX)
9399     /* Load from constant pool. We can not take advantage of single-cycle
9400        LD1 because we need a PC-relative addressing mode.  */
9401     return const_vec;
9402   else
9403     /* A PARALLEL containing something not valid inside CONST_VECTOR.
9404        We can not construct an initializer.  */
9405     return NULL_RTX;
9406 }
9407
9408 void
9409 aarch64_expand_vector_init (rtx target, rtx vals)
9410 {
9411   machine_mode mode = GET_MODE (target);
9412   machine_mode inner_mode = GET_MODE_INNER (mode);
9413   int n_elts = GET_MODE_NUNITS (mode);
9414   int n_var = 0;
9415   rtx any_const = NULL_RTX;
9416   bool all_same = true;
9417
9418   for (int i = 0; i < n_elts; ++i)
9419     {
9420       rtx x = XVECEXP (vals, 0, i);
9421       if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9422         ++n_var;
9423       else
9424         any_const = x;
9425
9426       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9427         all_same = false;
9428     }
9429
9430   if (n_var == 0)
9431     {
9432       rtx constant = aarch64_simd_make_constant (vals);
9433       if (constant != NULL_RTX)
9434         {
9435           emit_move_insn (target, constant);
9436           return;
9437         }
9438     }
9439
9440   /* Splat a single non-constant element if we can.  */
9441   if (all_same)
9442     {
9443       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9444       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9445       return;
9446     }
9447
9448   /* Half the fields (or less) are non-constant.  Load constant then overwrite
9449      varying fields.  Hope that this is more efficient than using the stack.  */
9450   if (n_var <= n_elts/2)
9451     {
9452       rtx copy = copy_rtx (vals);
9453
9454       /* Load constant part of vector.  We really don't care what goes into the
9455          parts we will overwrite, but we're more likely to be able to load the
9456          constant efficiently if it has fewer, larger, repeating parts
9457          (see aarch64_simd_valid_immediate).  */
9458       for (int i = 0; i < n_elts; i++)
9459         {
9460           rtx x = XVECEXP (vals, 0, i);
9461           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9462             continue;
9463           rtx subst = any_const;
9464           for (int bit = n_elts / 2; bit > 0; bit /= 2)
9465             {
9466               /* Look in the copied vector, as more elements are const.  */
9467               rtx test = XVECEXP (copy, 0, i ^ bit);
9468               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9469                 {
9470                   subst = test;
9471                   break;
9472                 }
9473             }
9474           XVECEXP (copy, 0, i) = subst;
9475         }
9476       aarch64_expand_vector_init (target, copy);
9477
9478       /* Insert variables.  */
9479       enum insn_code icode = optab_handler (vec_set_optab, mode);
9480       gcc_assert (icode != CODE_FOR_nothing);
9481
9482       for (int i = 0; i < n_elts; i++)
9483         {
9484           rtx x = XVECEXP (vals, 0, i);
9485           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9486             continue;
9487           x = copy_to_mode_reg (inner_mode, x);
9488           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9489         }
9490       return;
9491     }
9492
9493   /* Construct the vector in memory one field at a time
9494      and load the whole vector.  */
9495   rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9496   for (int i = 0; i < n_elts; i++)
9497     emit_move_insn (adjust_address_nv (mem, inner_mode,
9498                                     i * GET_MODE_SIZE (inner_mode)),
9499                     XVECEXP (vals, 0, i));
9500   emit_move_insn (target, mem);
9501
9502 }
9503
9504 static unsigned HOST_WIDE_INT
9505 aarch64_shift_truncation_mask (machine_mode mode)
9506 {
9507   return
9508     (aarch64_vector_mode_supported_p (mode)
9509      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9510 }
9511
9512 #ifndef TLS_SECTION_ASM_FLAG
9513 #define TLS_SECTION_ASM_FLAG 'T'
9514 #endif
9515
9516 void
9517 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9518                                tree decl ATTRIBUTE_UNUSED)
9519 {
9520   char flagchars[10], *f = flagchars;
9521
9522   /* If we have already declared this section, we can use an
9523      abbreviated form to switch back to it -- unless this section is
9524      part of a COMDAT groups, in which case GAS requires the full
9525      declaration every time.  */
9526   if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9527       && (flags & SECTION_DECLARED))
9528     {
9529       fprintf (asm_out_file, "\t.section\t%s\n", name);
9530       return;
9531     }
9532
9533   if (!(flags & SECTION_DEBUG))
9534     *f++ = 'a';
9535   if (flags & SECTION_WRITE)
9536     *f++ = 'w';
9537   if (flags & SECTION_CODE)
9538     *f++ = 'x';
9539   if (flags & SECTION_SMALL)
9540     *f++ = 's';
9541   if (flags & SECTION_MERGE)
9542     *f++ = 'M';
9543   if (flags & SECTION_STRINGS)
9544     *f++ = 'S';
9545   if (flags & SECTION_TLS)
9546     *f++ = TLS_SECTION_ASM_FLAG;
9547   if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9548     *f++ = 'G';
9549   *f = '\0';
9550
9551   fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9552
9553   if (!(flags & SECTION_NOTYPE))
9554     {
9555       const char *type;
9556       const char *format;
9557
9558       if (flags & SECTION_BSS)
9559         type = "nobits";
9560       else
9561         type = "progbits";
9562
9563 #ifdef TYPE_OPERAND_FMT
9564       format = "," TYPE_OPERAND_FMT;
9565 #else
9566       format = ",@%s";
9567 #endif
9568
9569       fprintf (asm_out_file, format, type);
9570
9571       if (flags & SECTION_ENTSIZE)
9572         fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9573       if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9574         {
9575           if (TREE_CODE (decl) == IDENTIFIER_NODE)
9576             fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9577           else
9578             fprintf (asm_out_file, ",%s,comdat",
9579                      IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9580         }
9581     }
9582
9583   putc ('\n', asm_out_file);
9584 }
9585
9586 /* Select a format to encode pointers in exception handling data.  */
9587 int
9588 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9589 {
9590    int type;
9591    switch (aarch64_cmodel)
9592      {
9593      case AARCH64_CMODEL_TINY:
9594      case AARCH64_CMODEL_TINY_PIC:
9595      case AARCH64_CMODEL_SMALL:
9596      case AARCH64_CMODEL_SMALL_PIC:
9597      case AARCH64_CMODEL_SMALL_SPIC:
9598        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
9599           for everything.  */
9600        type = DW_EH_PE_sdata4;
9601        break;
9602      default:
9603        /* No assumptions here.  8-byte relocs required.  */
9604        type = DW_EH_PE_sdata8;
9605        break;
9606      }
9607    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9608 }
9609
9610 /* Emit load exclusive.  */
9611
9612 static void
9613 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9614                              rtx mem, rtx model_rtx)
9615 {
9616   rtx (*gen) (rtx, rtx, rtx);
9617
9618   switch (mode)
9619     {
9620     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9621     case HImode: gen = gen_aarch64_load_exclusivehi; break;
9622     case SImode: gen = gen_aarch64_load_exclusivesi; break;
9623     case DImode: gen = gen_aarch64_load_exclusivedi; break;
9624     default:
9625       gcc_unreachable ();
9626     }
9627
9628   emit_insn (gen (rval, mem, model_rtx));
9629 }
9630
9631 /* Emit store exclusive.  */
9632
9633 static void
9634 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9635                               rtx rval, rtx mem, rtx model_rtx)
9636 {
9637   rtx (*gen) (rtx, rtx, rtx, rtx);
9638
9639   switch (mode)
9640     {
9641     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9642     case HImode: gen = gen_aarch64_store_exclusivehi; break;
9643     case SImode: gen = gen_aarch64_store_exclusivesi; break;
9644     case DImode: gen = gen_aarch64_store_exclusivedi; break;
9645     default:
9646       gcc_unreachable ();
9647     }
9648
9649   emit_insn (gen (bval, rval, mem, model_rtx));
9650 }
9651
9652 /* Mark the previous jump instruction as unlikely.  */
9653
9654 static void
9655 aarch64_emit_unlikely_jump (rtx insn)
9656 {
9657   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9658
9659   insn = emit_jump_insn (insn);
9660   add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9661 }
9662
9663 /* Expand a compare and swap pattern.  */
9664
9665 void
9666 aarch64_expand_compare_and_swap (rtx operands[])
9667 {
9668   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9669   machine_mode mode, cmp_mode;
9670   rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9671
9672   bval = operands[0];
9673   rval = operands[1];
9674   mem = operands[2];
9675   oldval = operands[3];
9676   newval = operands[4];
9677   is_weak = operands[5];
9678   mod_s = operands[6];
9679   mod_f = operands[7];
9680   mode = GET_MODE (mem);
9681   cmp_mode = mode;
9682
9683   /* Normally the succ memory model must be stronger than fail, but in the
9684      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9685      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
9686
9687   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9688       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9689     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9690
9691   switch (mode)
9692     {
9693     case QImode:
9694     case HImode:
9695       /* For short modes, we're going to perform the comparison in SImode,
9696          so do the zero-extension now.  */
9697       cmp_mode = SImode;
9698       rval = gen_reg_rtx (SImode);
9699       oldval = convert_modes (SImode, mode, oldval, true);
9700       /* Fall through.  */
9701
9702     case SImode:
9703     case DImode:
9704       /* Force the value into a register if needed.  */
9705       if (!aarch64_plus_operand (oldval, mode))
9706         oldval = force_reg (cmp_mode, oldval);
9707       break;
9708
9709     default:
9710       gcc_unreachable ();
9711     }
9712
9713   switch (mode)
9714     {
9715     case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9716     case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9717     case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9718     case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9719     default:
9720       gcc_unreachable ();
9721     }
9722
9723   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9724
9725   if (mode == QImode || mode == HImode)
9726     emit_move_insn (operands[1], gen_lowpart (mode, rval));
9727
9728   x = gen_rtx_REG (CCmode, CC_REGNUM);
9729   x = gen_rtx_EQ (SImode, x, const0_rtx);
9730   emit_insn (gen_rtx_SET (bval, x));
9731 }
9732
9733 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
9734    sequence implementing an atomic operation.  */
9735
9736 static void
9737 aarch64_emit_post_barrier (enum memmodel model)
9738 {
9739   const enum memmodel base_model = memmodel_base (model);
9740
9741   if (is_mm_sync (model)
9742       && (base_model == MEMMODEL_ACQUIRE
9743           || base_model == MEMMODEL_ACQ_REL
9744           || base_model == MEMMODEL_SEQ_CST))
9745     {
9746       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9747     }
9748 }
9749
9750 /* Split a compare and swap pattern.  */
9751
9752 void
9753 aarch64_split_compare_and_swap (rtx operands[])
9754 {
9755   rtx rval, mem, oldval, newval, scratch;
9756   machine_mode mode;
9757   bool is_weak;
9758   rtx_code_label *label1, *label2;
9759   rtx x, cond;
9760   enum memmodel model;
9761   rtx model_rtx;
9762
9763   rval = operands[0];
9764   mem = operands[1];
9765   oldval = operands[2];
9766   newval = operands[3];
9767   is_weak = (operands[4] != const0_rtx);
9768   model_rtx = operands[5];
9769   scratch = operands[7];
9770   mode = GET_MODE (mem);
9771   model = memmodel_from_int (INTVAL (model_rtx));
9772
9773   label1 = NULL;
9774   if (!is_weak)
9775     {
9776       label1 = gen_label_rtx ();
9777       emit_label (label1);
9778     }
9779   label2 = gen_label_rtx ();
9780
9781   /* The initial load can be relaxed for a __sync operation since a final
9782      barrier will be emitted to stop code hoisting.  */
9783   if (is_mm_sync (model))
9784     aarch64_emit_load_exclusive (mode, rval, mem,
9785                                  GEN_INT (MEMMODEL_RELAXED));
9786   else
9787     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9788
9789   cond = aarch64_gen_compare_reg (NE, rval, oldval);
9790   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9791   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9792                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9793   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9794
9795   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9796
9797   if (!is_weak)
9798     {
9799       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9800       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9801                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9802       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9803     }
9804   else
9805     {
9806       cond = gen_rtx_REG (CCmode, CC_REGNUM);
9807       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9808       emit_insn (gen_rtx_SET (cond, x));
9809     }
9810
9811   emit_label (label2);
9812
9813   /* Emit any final barrier needed for a __sync operation.  */
9814   if (is_mm_sync (model))
9815     aarch64_emit_post_barrier (model);
9816 }
9817
9818 /* Split an atomic operation.  */
9819
9820 void
9821 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9822                      rtx value, rtx model_rtx, rtx cond)
9823 {
9824   machine_mode mode = GET_MODE (mem);
9825   machine_mode wmode = (mode == DImode ? DImode : SImode);
9826   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9827   const bool is_sync = is_mm_sync (model);
9828   rtx_code_label *label;
9829   rtx x;
9830
9831   label = gen_label_rtx ();
9832   emit_label (label);
9833
9834   if (new_out)
9835     new_out = gen_lowpart (wmode, new_out);
9836   if (old_out)
9837     old_out = gen_lowpart (wmode, old_out);
9838   else
9839     old_out = new_out;
9840   value = simplify_gen_subreg (wmode, value, mode, 0);
9841
9842   /* The initial load can be relaxed for a __sync operation since a final
9843      barrier will be emitted to stop code hoisting.  */
9844  if (is_sync)
9845     aarch64_emit_load_exclusive (mode, old_out, mem,
9846                                  GEN_INT (MEMMODEL_RELAXED));
9847   else
9848     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9849
9850   switch (code)
9851     {
9852     case SET:
9853       new_out = value;
9854       break;
9855
9856     case NOT:
9857       x = gen_rtx_AND (wmode, old_out, value);
9858       emit_insn (gen_rtx_SET (new_out, x));
9859       x = gen_rtx_NOT (wmode, new_out);
9860       emit_insn (gen_rtx_SET (new_out, x));
9861       break;
9862
9863     case MINUS:
9864       if (CONST_INT_P (value))
9865         {
9866           value = GEN_INT (-INTVAL (value));
9867           code = PLUS;
9868         }
9869       /* Fall through.  */
9870
9871     default:
9872       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9873       emit_insn (gen_rtx_SET (new_out, x));
9874       break;
9875     }
9876
9877   aarch64_emit_store_exclusive (mode, cond, mem,
9878                                 gen_lowpart (mode, new_out), model_rtx);
9879
9880   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9881   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9882                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9883   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9884
9885   /* Emit any final barrier needed for a __sync operation.  */
9886   if (is_sync)
9887     aarch64_emit_post_barrier (model);
9888 }
9889
9890 static void
9891 aarch64_print_extension (void)
9892 {
9893   const struct aarch64_option_extension *opt = NULL;
9894
9895   for (opt = all_extensions; opt->name != NULL; opt++)
9896     if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9897       asm_fprintf (asm_out_file, "+%s", opt->name);
9898
9899   asm_fprintf (asm_out_file, "\n");
9900 }
9901
9902 static void
9903 aarch64_start_file (void)
9904 {
9905   if (selected_arch)
9906     {
9907       asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9908       aarch64_print_extension ();
9909     }
9910   else if (selected_cpu)
9911     {
9912       const char *truncated_name
9913             = aarch64_rewrite_selected_cpu (selected_cpu->name);
9914       asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9915       aarch64_print_extension ();
9916     }
9917   default_file_start();
9918 }
9919
9920 /* Target hook for c_mode_for_suffix.  */
9921 static machine_mode
9922 aarch64_c_mode_for_suffix (char suffix)
9923 {
9924   if (suffix == 'q')
9925     return TFmode;
9926
9927   return VOIDmode;
9928 }
9929
9930 /* We can only represent floating point constants which will fit in
9931    "quarter-precision" values.  These values are characterised by
9932    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
9933    by:
9934
9935    (-1)^s * (n/16) * 2^r
9936
9937    Where:
9938      's' is the sign bit.
9939      'n' is an integer in the range 16 <= n <= 31.
9940      'r' is an integer in the range -3 <= r <= 4.  */
9941
9942 /* Return true iff X can be represented by a quarter-precision
9943    floating point immediate operand X.  Note, we cannot represent 0.0.  */
9944 bool
9945 aarch64_float_const_representable_p (rtx x)
9946 {
9947   /* This represents our current view of how many bits
9948      make up the mantissa.  */
9949   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9950   int exponent;
9951   unsigned HOST_WIDE_INT mantissa, mask;
9952   REAL_VALUE_TYPE r, m;
9953   bool fail;
9954
9955   if (!CONST_DOUBLE_P (x))
9956     return false;
9957
9958   if (GET_MODE (x) == VOIDmode)
9959     return false;
9960
9961   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9962
9963   /* We cannot represent infinities, NaNs or +/-zero.  We won't
9964      know if we have +zero until we analyse the mantissa, but we
9965      can reject the other invalid values.  */
9966   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9967       || REAL_VALUE_MINUS_ZERO (r))
9968     return false;
9969
9970   /* Extract exponent.  */
9971   r = real_value_abs (&r);
9972   exponent = REAL_EXP (&r);
9973
9974   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9975      highest (sign) bit, with a fixed binary point at bit point_pos.
9976      m1 holds the low part of the mantissa, m2 the high part.
9977      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9978      bits for the mantissa, this can fail (low bits will be lost).  */
9979   real_ldexp (&m, &r, point_pos - exponent);
9980   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9981
9982   /* If the low part of the mantissa has bits set we cannot represent
9983      the value.  */
9984   if (w.elt (0) != 0)
9985     return false;
9986   /* We have rejected the lower HOST_WIDE_INT, so update our
9987      understanding of how many bits lie in the mantissa and
9988      look only at the high HOST_WIDE_INT.  */
9989   mantissa = w.elt (1);
9990   point_pos -= HOST_BITS_PER_WIDE_INT;
9991
9992   /* We can only represent values with a mantissa of the form 1.xxxx.  */
9993   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9994   if ((mantissa & mask) != 0)
9995     return false;
9996
9997   /* Having filtered unrepresentable values, we may now remove all
9998      but the highest 5 bits.  */
9999   mantissa >>= point_pos - 5;
10000
10001   /* We cannot represent the value 0.0, so reject it.  This is handled
10002      elsewhere.  */
10003   if (mantissa == 0)
10004     return false;
10005
10006   /* Then, as bit 4 is always set, we can mask it off, leaving
10007      the mantissa in the range [0, 15].  */
10008   mantissa &= ~(1 << 4);
10009   gcc_assert (mantissa <= 15);
10010
10011   /* GCC internally does not use IEEE754-like encoding (where normalized
10012      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
10013      Our mantissa values are shifted 4 places to the left relative to
10014      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
10015      by 5 places to correct for GCC's representation.  */
10016   exponent = 5 - exponent;
10017
10018   return (exponent >= 0 && exponent <= 7);
10019 }
10020
10021 char*
10022 aarch64_output_simd_mov_immediate (rtx const_vector,
10023                                    machine_mode mode,
10024                                    unsigned width)
10025 {
10026   bool is_valid;
10027   static char templ[40];
10028   const char *mnemonic;
10029   const char *shift_op;
10030   unsigned int lane_count = 0;
10031   char element_char;
10032
10033   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
10034
10035   /* This will return true to show const_vector is legal for use as either
10036      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
10037      also update INFO to show how the immediate should be generated.  */
10038   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
10039   gcc_assert (is_valid);
10040
10041   element_char = sizetochar (info.element_width);
10042   lane_count = width / info.element_width;
10043
10044   mode = GET_MODE_INNER (mode);
10045   if (mode == SFmode || mode == DFmode)
10046     {
10047       gcc_assert (info.shift == 0 && ! info.mvn);
10048       if (aarch64_float_const_zero_rtx_p (info.value))
10049         info.value = GEN_INT (0);
10050       else
10051         {
10052 #define buf_size 20
10053           REAL_VALUE_TYPE r;
10054           REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
10055           char float_buf[buf_size] = {'\0'};
10056           real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
10057 #undef buf_size
10058
10059           if (lane_count == 1)
10060             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
10061           else
10062             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
10063                       lane_count, element_char, float_buf);
10064           return templ;
10065         }
10066     }
10067
10068   mnemonic = info.mvn ? "mvni" : "movi";
10069   shift_op = info.msl ? "msl" : "lsl";
10070
10071   if (lane_count == 1)
10072     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
10073               mnemonic, UINTVAL (info.value));
10074   else if (info.shift)
10075     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
10076               ", %s %d", mnemonic, lane_count, element_char,
10077               UINTVAL (info.value), shift_op, info.shift);
10078   else
10079     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
10080               mnemonic, lane_count, element_char, UINTVAL (info.value));
10081   return templ;
10082 }
10083
10084 char*
10085 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
10086                                           machine_mode mode)
10087 {
10088   machine_mode vmode;
10089
10090   gcc_assert (!VECTOR_MODE_P (mode));
10091   vmode = aarch64_simd_container_mode (mode, 64);
10092   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
10093   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
10094 }
10095
10096 /* Split operands into moves from op[1] + op[2] into op[0].  */
10097
10098 void
10099 aarch64_split_combinev16qi (rtx operands[3])
10100 {
10101   unsigned int dest = REGNO (operands[0]);
10102   unsigned int src1 = REGNO (operands[1]);
10103   unsigned int src2 = REGNO (operands[2]);
10104   machine_mode halfmode = GET_MODE (operands[1]);
10105   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
10106   rtx destlo, desthi;
10107
10108   gcc_assert (halfmode == V16QImode);
10109
10110   if (src1 == dest && src2 == dest + halfregs)
10111     {
10112       /* No-op move.  Can't split to nothing; emit something.  */
10113       emit_note (NOTE_INSN_DELETED);
10114       return;
10115     }
10116
10117   /* Preserve register attributes for variable tracking.  */
10118   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
10119   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
10120                                GET_MODE_SIZE (halfmode));
10121
10122   /* Special case of reversed high/low parts.  */
10123   if (reg_overlap_mentioned_p (operands[2], destlo)
10124       && reg_overlap_mentioned_p (operands[1], desthi))
10125     {
10126       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10127       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
10128       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10129     }
10130   else if (!reg_overlap_mentioned_p (operands[2], destlo))
10131     {
10132       /* Try to avoid unnecessary moves if part of the result
10133          is in the right place already.  */
10134       if (src1 != dest)
10135         emit_move_insn (destlo, operands[1]);
10136       if (src2 != dest + halfregs)
10137         emit_move_insn (desthi, operands[2]);
10138     }
10139   else
10140     {
10141       if (src2 != dest + halfregs)
10142         emit_move_insn (desthi, operands[2]);
10143       if (src1 != dest)
10144         emit_move_insn (destlo, operands[1]);
10145     }
10146 }
10147
10148 /* vec_perm support.  */
10149
10150 #define MAX_VECT_LEN 16
10151
10152 struct expand_vec_perm_d
10153 {
10154   rtx target, op0, op1;
10155   unsigned char perm[MAX_VECT_LEN];
10156   machine_mode vmode;
10157   unsigned char nelt;
10158   bool one_vector_p;
10159   bool testing_p;
10160 };
10161
10162 /* Generate a variable permutation.  */
10163
10164 static void
10165 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
10166 {
10167   machine_mode vmode = GET_MODE (target);
10168   bool one_vector_p = rtx_equal_p (op0, op1);
10169
10170   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
10171   gcc_checking_assert (GET_MODE (op0) == vmode);
10172   gcc_checking_assert (GET_MODE (op1) == vmode);
10173   gcc_checking_assert (GET_MODE (sel) == vmode);
10174   gcc_checking_assert (TARGET_SIMD);
10175
10176   if (one_vector_p)
10177     {
10178       if (vmode == V8QImode)
10179         {
10180           /* Expand the argument to a V16QI mode by duplicating it.  */
10181           rtx pair = gen_reg_rtx (V16QImode);
10182           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
10183           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10184         }
10185       else
10186         {
10187           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
10188         }
10189     }
10190   else
10191     {
10192       rtx pair;
10193
10194       if (vmode == V8QImode)
10195         {
10196           pair = gen_reg_rtx (V16QImode);
10197           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
10198           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10199         }
10200       else
10201         {
10202           pair = gen_reg_rtx (OImode);
10203           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
10204           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
10205         }
10206     }
10207 }
10208
10209 void
10210 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
10211 {
10212   machine_mode vmode = GET_MODE (target);
10213   unsigned int nelt = GET_MODE_NUNITS (vmode);
10214   bool one_vector_p = rtx_equal_p (op0, op1);
10215   rtx mask;
10216
10217   /* The TBL instruction does not use a modulo index, so we must take care
10218      of that ourselves.  */
10219   mask = aarch64_simd_gen_const_vector_dup (vmode,
10220       one_vector_p ? nelt - 1 : 2 * nelt - 1);
10221   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
10222
10223   /* For big-endian, we also need to reverse the index within the vector
10224      (but not which vector).  */
10225   if (BYTES_BIG_ENDIAN)
10226     {
10227       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
10228       if (!one_vector_p)
10229         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
10230       sel = expand_simple_binop (vmode, XOR, sel, mask,
10231                                  NULL, 0, OPTAB_LIB_WIDEN);
10232     }
10233   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
10234 }
10235
10236 /* Recognize patterns suitable for the TRN instructions.  */
10237 static bool
10238 aarch64_evpc_trn (struct expand_vec_perm_d *d)
10239 {
10240   unsigned int i, odd, mask, nelt = d->nelt;
10241   rtx out, in0, in1, x;
10242   rtx (*gen) (rtx, rtx, rtx);
10243   machine_mode vmode = d->vmode;
10244
10245   if (GET_MODE_UNIT_SIZE (vmode) > 8)
10246     return false;
10247
10248   /* Note that these are little-endian tests.
10249      We correct for big-endian later.  */
10250   if (d->perm[0] == 0)
10251     odd = 0;
10252   else if (d->perm[0] == 1)
10253     odd = 1;
10254   else
10255     return false;
10256   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10257
10258   for (i = 0; i < nelt; i += 2)
10259     {
10260       if (d->perm[i] != i + odd)
10261         return false;
10262       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
10263         return false;
10264     }
10265
10266   /* Success!  */
10267   if (d->testing_p)
10268     return true;
10269
10270   in0 = d->op0;
10271   in1 = d->op1;
10272   if (BYTES_BIG_ENDIAN)
10273     {
10274       x = in0, in0 = in1, in1 = x;
10275       odd = !odd;
10276     }
10277   out = d->target;
10278
10279   if (odd)
10280     {
10281       switch (vmode)
10282         {
10283         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
10284         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
10285         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
10286         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
10287         case V4SImode: gen = gen_aarch64_trn2v4si; break;
10288         case V2SImode: gen = gen_aarch64_trn2v2si; break;
10289         case V2DImode: gen = gen_aarch64_trn2v2di; break;
10290         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
10291         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
10292         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
10293         default:
10294           return false;
10295         }
10296     }
10297   else
10298     {
10299       switch (vmode)
10300         {
10301         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
10302         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
10303         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
10304         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
10305         case V4SImode: gen = gen_aarch64_trn1v4si; break;
10306         case V2SImode: gen = gen_aarch64_trn1v2si; break;
10307         case V2DImode: gen = gen_aarch64_trn1v2di; break;
10308         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
10309         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
10310         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
10311         default:
10312           return false;
10313         }
10314     }
10315
10316   emit_insn (gen (out, in0, in1));
10317   return true;
10318 }
10319
10320 /* Recognize patterns suitable for the UZP instructions.  */
10321 static bool
10322 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10323 {
10324   unsigned int i, odd, mask, nelt = d->nelt;
10325   rtx out, in0, in1, x;
10326   rtx (*gen) (rtx, rtx, rtx);
10327   machine_mode vmode = d->vmode;
10328
10329   if (GET_MODE_UNIT_SIZE (vmode) > 8)
10330     return false;
10331
10332   /* Note that these are little-endian tests.
10333      We correct for big-endian later.  */
10334   if (d->perm[0] == 0)
10335     odd = 0;
10336   else if (d->perm[0] == 1)
10337     odd = 1;
10338   else
10339     return false;
10340   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10341
10342   for (i = 0; i < nelt; i++)
10343     {
10344       unsigned elt = (i * 2 + odd) & mask;
10345       if (d->perm[i] != elt)
10346         return false;
10347     }
10348
10349   /* Success!  */
10350   if (d->testing_p)
10351     return true;
10352
10353   in0 = d->op0;
10354   in1 = d->op1;
10355   if (BYTES_BIG_ENDIAN)
10356     {
10357       x = in0, in0 = in1, in1 = x;
10358       odd = !odd;
10359     }
10360   out = d->target;
10361
10362   if (odd)
10363     {
10364       switch (vmode)
10365         {
10366         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10367         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10368         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10369         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10370         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10371         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10372         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10373         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10374         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10375         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10376         default:
10377           return false;
10378         }
10379     }
10380   else
10381     {
10382       switch (vmode)
10383         {
10384         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10385         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10386         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10387         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10388         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10389         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10390         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10391         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10392         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10393         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10394         default:
10395           return false;
10396         }
10397     }
10398
10399   emit_insn (gen (out, in0, in1));
10400   return true;
10401 }
10402
10403 /* Recognize patterns suitable for the ZIP instructions.  */
10404 static bool
10405 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10406 {
10407   unsigned int i, high, mask, nelt = d->nelt;
10408   rtx out, in0, in1, x;
10409   rtx (*gen) (rtx, rtx, rtx);
10410   machine_mode vmode = d->vmode;
10411
10412   if (GET_MODE_UNIT_SIZE (vmode) > 8)
10413     return false;
10414
10415   /* Note that these are little-endian tests.
10416      We correct for big-endian later.  */
10417   high = nelt / 2;
10418   if (d->perm[0] == high)
10419     /* Do Nothing.  */
10420     ;
10421   else if (d->perm[0] == 0)
10422     high = 0;
10423   else
10424     return false;
10425   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10426
10427   for (i = 0; i < nelt / 2; i++)
10428     {
10429       unsigned elt = (i + high) & mask;
10430       if (d->perm[i * 2] != elt)
10431         return false;
10432       elt = (elt + nelt) & mask;
10433       if (d->perm[i * 2 + 1] != elt)
10434         return false;
10435     }
10436
10437   /* Success!  */
10438   if (d->testing_p)
10439     return true;
10440
10441   in0 = d->op0;
10442   in1 = d->op1;
10443   if (BYTES_BIG_ENDIAN)
10444     {
10445       x = in0, in0 = in1, in1 = x;
10446       high = !high;
10447     }
10448   out = d->target;
10449
10450   if (high)
10451     {
10452       switch (vmode)
10453         {
10454         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10455         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10456         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10457         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10458         case V4SImode: gen = gen_aarch64_zip2v4si; break;
10459         case V2SImode: gen = gen_aarch64_zip2v2si; break;
10460         case V2DImode: gen = gen_aarch64_zip2v2di; break;
10461         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10462         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10463         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10464         default:
10465           return false;
10466         }
10467     }
10468   else
10469     {
10470       switch (vmode)
10471         {
10472         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10473         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10474         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10475         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10476         case V4SImode: gen = gen_aarch64_zip1v4si; break;
10477         case V2SImode: gen = gen_aarch64_zip1v2si; break;
10478         case V2DImode: gen = gen_aarch64_zip1v2di; break;
10479         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10480         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10481         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10482         default:
10483           return false;
10484         }
10485     }
10486
10487   emit_insn (gen (out, in0, in1));
10488   return true;
10489 }
10490
10491 /* Recognize patterns for the EXT insn.  */
10492
10493 static bool
10494 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10495 {
10496   unsigned int i, nelt = d->nelt;
10497   rtx (*gen) (rtx, rtx, rtx, rtx);
10498   rtx offset;
10499
10500   unsigned int location = d->perm[0]; /* Always < nelt.  */
10501
10502   /* Check if the extracted indices are increasing by one.  */
10503   for (i = 1; i < nelt; i++)
10504     {
10505       unsigned int required = location + i;
10506       if (d->one_vector_p)
10507         {
10508           /* We'll pass the same vector in twice, so allow indices to wrap.  */
10509           required &= (nelt - 1);
10510         }
10511       if (d->perm[i] != required)
10512         return false;
10513     }
10514
10515   switch (d->vmode)
10516     {
10517     case V16QImode: gen = gen_aarch64_extv16qi; break;
10518     case V8QImode: gen = gen_aarch64_extv8qi; break;
10519     case V4HImode: gen = gen_aarch64_extv4hi; break;
10520     case V8HImode: gen = gen_aarch64_extv8hi; break;
10521     case V2SImode: gen = gen_aarch64_extv2si; break;
10522     case V4SImode: gen = gen_aarch64_extv4si; break;
10523     case V2SFmode: gen = gen_aarch64_extv2sf; break;
10524     case V4SFmode: gen = gen_aarch64_extv4sf; break;
10525     case V2DImode: gen = gen_aarch64_extv2di; break;
10526     case V2DFmode: gen = gen_aarch64_extv2df; break;
10527     default:
10528       return false;
10529     }
10530
10531   /* Success! */
10532   if (d->testing_p)
10533     return true;
10534
10535   /* The case where (location == 0) is a no-op for both big- and little-endian,
10536      and is removed by the mid-end at optimization levels -O1 and higher.  */
10537
10538   if (BYTES_BIG_ENDIAN && (location != 0))
10539     {
10540       /* After setup, we want the high elements of the first vector (stored
10541          at the LSB end of the register), and the low elements of the second
10542          vector (stored at the MSB end of the register). So swap.  */
10543       std::swap (d->op0, d->op1);
10544       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
10545       location = nelt - location;
10546     }
10547
10548   offset = GEN_INT (location);
10549   emit_insn (gen (d->target, d->op0, d->op1, offset));
10550   return true;
10551 }
10552
10553 /* Recognize patterns for the REV insns.  */
10554
10555 static bool
10556 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10557 {
10558   unsigned int i, j, diff, nelt = d->nelt;
10559   rtx (*gen) (rtx, rtx);
10560
10561   if (!d->one_vector_p)
10562     return false;
10563
10564   diff = d->perm[0];
10565   switch (diff)
10566     {
10567     case 7:
10568       switch (d->vmode)
10569         {
10570         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10571         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
10572         default:
10573           return false;
10574         }
10575       break;
10576     case 3:
10577       switch (d->vmode)
10578         {
10579         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10580         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
10581         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
10582         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
10583         default:
10584           return false;
10585         }
10586       break;
10587     case 1:
10588       switch (d->vmode)
10589         {
10590         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10591         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
10592         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
10593         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
10594         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
10595         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
10596         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
10597         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
10598         default:
10599           return false;
10600         }
10601       break;
10602     default:
10603       return false;
10604     }
10605
10606   for (i = 0; i < nelt ; i += diff + 1)
10607     for (j = 0; j <= diff; j += 1)
10608       {
10609         /* This is guaranteed to be true as the value of diff
10610            is 7, 3, 1 and we should have enough elements in the
10611            queue to generate this.  Getting a vector mask with a
10612            value of diff other than these values implies that
10613            something is wrong by the time we get here.  */
10614         gcc_assert (i + j < nelt);
10615         if (d->perm[i + j] != i + diff - j)
10616           return false;
10617       }
10618
10619   /* Success! */
10620   if (d->testing_p)
10621     return true;
10622
10623   emit_insn (gen (d->target, d->op0));
10624   return true;
10625 }
10626
10627 static bool
10628 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10629 {
10630   rtx (*gen) (rtx, rtx, rtx);
10631   rtx out = d->target;
10632   rtx in0;
10633   machine_mode vmode = d->vmode;
10634   unsigned int i, elt, nelt = d->nelt;
10635   rtx lane;
10636
10637   elt = d->perm[0];
10638   for (i = 1; i < nelt; i++)
10639     {
10640       if (elt != d->perm[i])
10641         return false;
10642     }
10643
10644   /* The generic preparation in aarch64_expand_vec_perm_const_1
10645      swaps the operand order and the permute indices if it finds
10646      d->perm[0] to be in the second operand.  Thus, we can always
10647      use d->op0 and need not do any extra arithmetic to get the
10648      correct lane number.  */
10649   in0 = d->op0;
10650   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
10651
10652   switch (vmode)
10653     {
10654     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10655     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10656     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10657     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10658     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10659     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10660     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10661     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10662     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10663     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10664     default:
10665       return false;
10666     }
10667
10668   emit_insn (gen (out, in0, lane));
10669   return true;
10670 }
10671
10672 static bool
10673 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10674 {
10675   rtx rperm[MAX_VECT_LEN], sel;
10676   machine_mode vmode = d->vmode;
10677   unsigned int i, nelt = d->nelt;
10678
10679   if (d->testing_p)
10680     return true;
10681
10682   /* Generic code will try constant permutation twice.  Once with the
10683      original mode and again with the elements lowered to QImode.
10684      So wait and don't do the selector expansion ourselves.  */
10685   if (vmode != V8QImode && vmode != V16QImode)
10686     return false;
10687
10688   for (i = 0; i < nelt; ++i)
10689     {
10690       int nunits = GET_MODE_NUNITS (vmode);
10691
10692       /* If big-endian and two vectors we end up with a weird mixed-endian
10693          mode on NEON.  Reverse the index within each word but not the word
10694          itself.  */
10695       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10696                                            : d->perm[i]);
10697     }
10698   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10699   sel = force_reg (vmode, sel);
10700
10701   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10702   return true;
10703 }
10704
10705 static bool
10706 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10707 {
10708   /* The pattern matching functions above are written to look for a small
10709      number to begin the sequence (0, 1, N/2).  If we begin with an index
10710      from the second operand, we can swap the operands.  */
10711   if (d->perm[0] >= d->nelt)
10712     {
10713       unsigned i, nelt = d->nelt;
10714
10715       gcc_assert (nelt == (nelt & -nelt));
10716       for (i = 0; i < nelt; ++i)
10717         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
10718
10719       std::swap (d->op0, d->op1);
10720     }
10721
10722   if (TARGET_SIMD)
10723     {
10724       if (aarch64_evpc_rev (d))
10725         return true;
10726       else if (aarch64_evpc_ext (d))
10727         return true;
10728       else if (aarch64_evpc_dup (d))
10729         return true;
10730       else if (aarch64_evpc_zip (d))
10731         return true;
10732       else if (aarch64_evpc_uzp (d))
10733         return true;
10734       else if (aarch64_evpc_trn (d))
10735         return true;
10736       return aarch64_evpc_tbl (d);
10737     }
10738   return false;
10739 }
10740
10741 /* Expand a vec_perm_const pattern.  */
10742
10743 bool
10744 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10745 {
10746   struct expand_vec_perm_d d;
10747   int i, nelt, which;
10748
10749   d.target = target;
10750   d.op0 = op0;
10751   d.op1 = op1;
10752
10753   d.vmode = GET_MODE (target);
10754   gcc_assert (VECTOR_MODE_P (d.vmode));
10755   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10756   d.testing_p = false;
10757
10758   for (i = which = 0; i < nelt; ++i)
10759     {
10760       rtx e = XVECEXP (sel, 0, i);
10761       int ei = INTVAL (e) & (2 * nelt - 1);
10762       which |= (ei < nelt ? 1 : 2);
10763       d.perm[i] = ei;
10764     }
10765
10766   switch (which)
10767     {
10768     default:
10769       gcc_unreachable ();
10770
10771     case 3:
10772       d.one_vector_p = false;
10773       if (!rtx_equal_p (op0, op1))
10774         break;
10775
10776       /* The elements of PERM do not suggest that only the first operand
10777          is used, but both operands are identical.  Allow easier matching
10778          of the permutation by folding the permutation into the single
10779          input vector.  */
10780       /* Fall Through.  */
10781     case 2:
10782       for (i = 0; i < nelt; ++i)
10783         d.perm[i] &= nelt - 1;
10784       d.op0 = op1;
10785       d.one_vector_p = true;
10786       break;
10787
10788     case 1:
10789       d.op1 = op0;
10790       d.one_vector_p = true;
10791       break;
10792     }
10793
10794   return aarch64_expand_vec_perm_const_1 (&d);
10795 }
10796
10797 static bool
10798 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10799                                      const unsigned char *sel)
10800 {
10801   struct expand_vec_perm_d d;
10802   unsigned int i, nelt, which;
10803   bool ret;
10804
10805   d.vmode = vmode;
10806   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10807   d.testing_p = true;
10808   memcpy (d.perm, sel, nelt);
10809
10810   /* Calculate whether all elements are in one vector.  */
10811   for (i = which = 0; i < nelt; ++i)
10812     {
10813       unsigned char e = d.perm[i];
10814       gcc_assert (e < 2 * nelt);
10815       which |= (e < nelt ? 1 : 2);
10816     }
10817
10818   /* If all elements are from the second vector, reindex as if from the
10819      first vector.  */
10820   if (which == 2)
10821     for (i = 0; i < nelt; ++i)
10822       d.perm[i] -= nelt;
10823
10824   /* Check whether the mask can be applied to a single vector.  */
10825   d.one_vector_p = (which != 3);
10826
10827   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10828   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10829   if (!d.one_vector_p)
10830     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10831
10832   start_sequence ();
10833   ret = aarch64_expand_vec_perm_const_1 (&d);
10834   end_sequence ();
10835
10836   return ret;
10837 }
10838
10839 rtx
10840 aarch64_reverse_mask (enum machine_mode mode)
10841 {
10842   /* We have to reverse each vector because we dont have
10843      a permuted load that can reverse-load according to ABI rules.  */
10844   rtx mask;
10845   rtvec v = rtvec_alloc (16);
10846   int i, j;
10847   int nunits = GET_MODE_NUNITS (mode);
10848   int usize = GET_MODE_UNIT_SIZE (mode);
10849
10850   gcc_assert (BYTES_BIG_ENDIAN);
10851   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10852
10853   for (i = 0; i < nunits; i++)
10854     for (j = 0; j < usize; j++)
10855       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10856   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10857   return force_reg (V16QImode, mask);
10858 }
10859
10860 /* Implement MODES_TIEABLE_P.  */
10861
10862 bool
10863 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10864 {
10865   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10866     return true;
10867
10868   /* We specifically want to allow elements of "structure" modes to
10869      be tieable to the structure.  This more general condition allows
10870      other rarer situations too.  */
10871   if (TARGET_SIMD
10872       && aarch64_vector_mode_p (mode1)
10873       && aarch64_vector_mode_p (mode2))
10874     return true;
10875
10876   return false;
10877 }
10878
10879 /* Return a new RTX holding the result of moving POINTER forward by
10880    AMOUNT bytes.  */
10881
10882 static rtx
10883 aarch64_move_pointer (rtx pointer, int amount)
10884 {
10885   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10886
10887   return adjust_automodify_address (pointer, GET_MODE (pointer),
10888                                     next, amount);
10889 }
10890
10891 /* Return a new RTX holding the result of moving POINTER forward by the
10892    size of the mode it points to.  */
10893
10894 static rtx
10895 aarch64_progress_pointer (rtx pointer)
10896 {
10897   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10898
10899   return aarch64_move_pointer (pointer, amount);
10900 }
10901
10902 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10903    MODE bytes.  */
10904
10905 static void
10906 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10907                                               machine_mode mode)
10908 {
10909   rtx reg = gen_reg_rtx (mode);
10910
10911   /* "Cast" the pointers to the correct mode.  */
10912   *src = adjust_address (*src, mode, 0);
10913   *dst = adjust_address (*dst, mode, 0);
10914   /* Emit the memcpy.  */
10915   emit_move_insn (reg, *src);
10916   emit_move_insn (*dst, reg);
10917   /* Move the pointers forward.  */
10918   *src = aarch64_progress_pointer (*src);
10919   *dst = aarch64_progress_pointer (*dst);
10920 }
10921
10922 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
10923    we succeed, otherwise return false.  */
10924
10925 bool
10926 aarch64_expand_movmem (rtx *operands)
10927 {
10928   unsigned int n;
10929   rtx dst = operands[0];
10930   rtx src = operands[1];
10931   rtx base;
10932   bool speed_p = !optimize_function_for_size_p (cfun);
10933
10934   /* When optimizing for size, give a better estimate of the length of a
10935      memcpy call, but use the default otherwise.  */
10936   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10937
10938   /* We can't do anything smart if the amount to copy is not constant.  */
10939   if (!CONST_INT_P (operands[2]))
10940     return false;
10941
10942   n = UINTVAL (operands[2]);
10943
10944   /* Try to keep the number of instructions low.  For cases below 16 bytes we
10945      need to make at most two moves.  For cases above 16 bytes it will be one
10946      move for each 16 byte chunk, then at most two additional moves.  */
10947   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10948     return false;
10949
10950   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10951   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10952
10953   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10954   src = adjust_automodify_address (src, VOIDmode, base, 0);
10955
10956   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10957      1-byte chunk.  */
10958   if (n < 4)
10959     {
10960       if (n >= 2)
10961         {
10962           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10963           n -= 2;
10964         }
10965
10966       if (n == 1)
10967         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10968
10969       return true;
10970     }
10971
10972   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
10973      4-byte chunk, partially overlapping with the previously copied chunk.  */
10974   if (n < 8)
10975     {
10976       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10977       n -= 4;
10978       if (n > 0)
10979         {
10980           int move = n - 4;
10981
10982           src = aarch64_move_pointer (src, move);
10983           dst = aarch64_move_pointer (dst, move);
10984           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10985         }
10986       return true;
10987     }
10988
10989   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
10990      them, then (if applicable) an 8-byte chunk.  */
10991   while (n >= 8)
10992     {
10993       if (n / 16)
10994         {
10995           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10996           n -= 16;
10997         }
10998       else
10999         {
11000           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
11001           n -= 8;
11002         }
11003     }
11004
11005   /* Finish the final bytes of the copy.  We can always do this in one
11006      instruction.  We either copy the exact amount we need, or partially
11007      overlap with the previous chunk we copied and copy 8-bytes.  */
11008   if (n == 0)
11009     return true;
11010   else if (n == 1)
11011     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
11012   else if (n == 2)
11013     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
11014   else if (n == 4)
11015     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11016   else
11017     {
11018       if (n == 3)
11019         {
11020           src = aarch64_move_pointer (src, -1);
11021           dst = aarch64_move_pointer (dst, -1);
11022           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11023         }
11024       else
11025         {
11026           int move = n - 8;
11027
11028           src = aarch64_move_pointer (src, move);
11029           dst = aarch64_move_pointer (dst, move);
11030           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
11031         }
11032     }
11033
11034   return true;
11035 }
11036
11037 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
11038
11039 static unsigned HOST_WIDE_INT
11040 aarch64_asan_shadow_offset (void)
11041 {
11042   return (HOST_WIDE_INT_1 << 36);
11043 }
11044
11045 static bool
11046 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
11047                                         unsigned int align,
11048                                         enum by_pieces_operation op,
11049                                         bool speed_p)
11050 {
11051   /* STORE_BY_PIECES can be used when copying a constant string, but
11052      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
11053      For now we always fail this and let the move_by_pieces code copy
11054      the string from read-only memory.  */
11055   if (op == STORE_BY_PIECES)
11056     return false;
11057
11058   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
11059 }
11060
11061 static enum machine_mode
11062 aarch64_code_to_ccmode (enum rtx_code code)
11063 {
11064   switch (code)
11065     {
11066     case NE:
11067       return CC_DNEmode;
11068
11069     case EQ:
11070       return CC_DEQmode;
11071
11072     case LE:
11073       return CC_DLEmode;
11074
11075     case LT:
11076       return CC_DLTmode;
11077
11078     case GE:
11079       return CC_DGEmode;
11080
11081     case GT:
11082       return CC_DGTmode;
11083
11084     case LEU:
11085       return CC_DLEUmode;
11086
11087     case LTU:
11088       return CC_DLTUmode;
11089
11090     case GEU:
11091       return CC_DGEUmode;
11092
11093     case GTU:
11094       return CC_DGTUmode;
11095
11096     default:
11097       return CCmode;
11098     }
11099 }
11100
11101 static rtx
11102 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
11103                         int code, tree treeop0, tree treeop1)
11104 {
11105   enum machine_mode op_mode, cmp_mode, cc_mode;
11106   rtx op0, op1, cmp, target;
11107   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11108   enum insn_code icode;
11109   struct expand_operand ops[4];
11110
11111   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
11112   if (cc_mode == CCmode)
11113     return NULL_RTX;
11114
11115   start_sequence ();
11116   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11117
11118   op_mode = GET_MODE (op0);
11119   if (op_mode == VOIDmode)
11120     op_mode = GET_MODE (op1);
11121
11122   switch (op_mode)
11123     {
11124     case QImode:
11125     case HImode:
11126     case SImode:
11127       cmp_mode = SImode;
11128       icode = CODE_FOR_cmpsi;
11129       break;
11130
11131     case DImode:
11132       cmp_mode = DImode;
11133       icode = CODE_FOR_cmpdi;
11134       break;
11135
11136     default:
11137       end_sequence ();
11138       return NULL_RTX;
11139     }
11140
11141   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11142   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11143   if (!op0 || !op1)
11144     {
11145       end_sequence ();
11146       return NULL_RTX;
11147     }
11148   *prep_seq = get_insns ();
11149   end_sequence ();
11150
11151   cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
11152   target = gen_rtx_REG (CCmode, CC_REGNUM);
11153
11154   create_output_operand (&ops[0], target, CCmode);
11155   create_fixed_operand (&ops[1], cmp);
11156   create_fixed_operand (&ops[2], op0);
11157   create_fixed_operand (&ops[3], op1);
11158
11159   start_sequence ();
11160   if (!maybe_expand_insn (icode, 4, ops))
11161     {
11162       end_sequence ();
11163       return NULL_RTX;
11164     }
11165   *gen_seq = get_insns ();
11166   end_sequence ();
11167
11168   return gen_rtx_REG (cc_mode, CC_REGNUM);
11169 }
11170
11171 static rtx
11172 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
11173                        tree treeop0, tree treeop1, int bit_code)
11174 {
11175   rtx op0, op1, cmp0, cmp1, target;
11176   enum machine_mode op_mode, cmp_mode, cc_mode;
11177   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11178   enum insn_code icode = CODE_FOR_ccmp_andsi;
11179   struct expand_operand ops[6];
11180
11181   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
11182   if (cc_mode == CCmode)
11183     return NULL_RTX;
11184
11185   push_to_sequence ((rtx_insn*) *prep_seq);
11186   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11187
11188   op_mode = GET_MODE (op0);
11189   if (op_mode == VOIDmode)
11190     op_mode = GET_MODE (op1);
11191
11192   switch (op_mode)
11193     {
11194     case QImode:
11195     case HImode:
11196     case SImode:
11197       cmp_mode = SImode;
11198       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
11199                                                 : CODE_FOR_ccmp_iorsi;
11200       break;
11201
11202     case DImode:
11203       cmp_mode = DImode;
11204       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
11205                                                 : CODE_FOR_ccmp_iordi;
11206       break;
11207
11208     default:
11209       end_sequence ();
11210       return NULL_RTX;
11211     }
11212
11213   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11214   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11215   if (!op0 || !op1)
11216     {
11217       end_sequence ();
11218       return NULL_RTX;
11219     }
11220   *prep_seq = get_insns ();
11221   end_sequence ();
11222
11223   target = gen_rtx_REG (cc_mode, CC_REGNUM);
11224   cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
11225   cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
11226
11227   create_fixed_operand (&ops[0], prev);
11228   create_fixed_operand (&ops[1], target);
11229   create_fixed_operand (&ops[2], op0);
11230   create_fixed_operand (&ops[3], op1);
11231   create_fixed_operand (&ops[4], cmp0);
11232   create_fixed_operand (&ops[5], cmp1);
11233
11234   push_to_sequence ((rtx_insn*) *gen_seq);
11235   if (!maybe_expand_insn (icode, 6, ops))
11236     {
11237       end_sequence ();
11238       return NULL_RTX;
11239     }
11240
11241   *gen_seq = get_insns ();
11242   end_sequence ();
11243
11244   return target;
11245 }
11246
11247 #undef TARGET_GEN_CCMP_FIRST
11248 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
11249
11250 #undef TARGET_GEN_CCMP_NEXT
11251 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
11252
11253 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
11254    instruction fusion of some sort.  */
11255
11256 static bool
11257 aarch64_macro_fusion_p (void)
11258 {
11259   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
11260 }
11261
11262
11263 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
11264    should be kept together during scheduling.  */
11265
11266 static bool
11267 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
11268 {
11269   rtx set_dest;
11270   rtx prev_set = single_set (prev);
11271   rtx curr_set = single_set (curr);
11272   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
11273   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
11274
11275   if (!aarch64_macro_fusion_p ())
11276     return false;
11277
11278   if (simple_sets_p
11279       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
11280     {
11281       /* We are trying to match:
11282          prev (mov)  == (set (reg r0) (const_int imm16))
11283          curr (movk) == (set (zero_extract (reg r0)
11284                                            (const_int 16)
11285                                            (const_int 16))
11286                              (const_int imm16_1))  */
11287
11288       set_dest = SET_DEST (curr_set);
11289
11290       if (GET_CODE (set_dest) == ZERO_EXTRACT
11291           && CONST_INT_P (SET_SRC (curr_set))
11292           && CONST_INT_P (SET_SRC (prev_set))
11293           && CONST_INT_P (XEXP (set_dest, 2))
11294           && INTVAL (XEXP (set_dest, 2)) == 16
11295           && REG_P (XEXP (set_dest, 0))
11296           && REG_P (SET_DEST (prev_set))
11297           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
11298         {
11299           return true;
11300         }
11301     }
11302
11303   if (simple_sets_p
11304       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
11305     {
11306
11307       /*  We're trying to match:
11308           prev (adrp) == (set (reg r1)
11309                               (high (symbol_ref ("SYM"))))
11310           curr (add) == (set (reg r0)
11311                              (lo_sum (reg r1)
11312                                      (symbol_ref ("SYM"))))
11313           Note that r0 need not necessarily be the same as r1, especially
11314           during pre-regalloc scheduling.  */
11315
11316       if (satisfies_constraint_Ush (SET_SRC (prev_set))
11317           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11318         {
11319           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11320               && REG_P (XEXP (SET_SRC (curr_set), 0))
11321               && REGNO (XEXP (SET_SRC (curr_set), 0))
11322                  == REGNO (SET_DEST (prev_set))
11323               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11324                               XEXP (SET_SRC (curr_set), 1)))
11325             return true;
11326         }
11327     }
11328
11329   if (simple_sets_p
11330       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11331     {
11332
11333       /* We're trying to match:
11334          prev (movk) == (set (zero_extract (reg r0)
11335                                            (const_int 16)
11336                                            (const_int 32))
11337                              (const_int imm16_1))
11338          curr (movk) == (set (zero_extract (reg r0)
11339                                            (const_int 16)
11340                                            (const_int 48))
11341                              (const_int imm16_2))  */
11342
11343       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11344           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11345           && REG_P (XEXP (SET_DEST (prev_set), 0))
11346           && REG_P (XEXP (SET_DEST (curr_set), 0))
11347           && REGNO (XEXP (SET_DEST (prev_set), 0))
11348              == REGNO (XEXP (SET_DEST (curr_set), 0))
11349           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11350           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11351           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11352           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11353           && CONST_INT_P (SET_SRC (prev_set))
11354           && CONST_INT_P (SET_SRC (curr_set)))
11355         return true;
11356
11357     }
11358   if (simple_sets_p
11359       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
11360     {
11361       /* We're trying to match:
11362           prev (adrp) == (set (reg r0)
11363                               (high (symbol_ref ("SYM"))))
11364           curr (ldr) == (set (reg r1)
11365                              (mem (lo_sum (reg r0)
11366                                              (symbol_ref ("SYM")))))
11367                  or
11368           curr (ldr) == (set (reg r1)
11369                              (zero_extend (mem
11370                                            (lo_sum (reg r0)
11371                                                    (symbol_ref ("SYM"))))))  */
11372       if (satisfies_constraint_Ush (SET_SRC (prev_set))
11373           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11374         {
11375           rtx curr_src = SET_SRC (curr_set);
11376
11377           if (GET_CODE (curr_src) == ZERO_EXTEND)
11378             curr_src = XEXP (curr_src, 0);
11379
11380           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11381               && REG_P (XEXP (XEXP (curr_src, 0), 0))
11382               && REGNO (XEXP (XEXP (curr_src, 0), 0))
11383                  == REGNO (SET_DEST (prev_set))
11384               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11385                               XEXP (SET_SRC (prev_set), 0)))
11386               return true;
11387         }
11388     }
11389
11390   if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11391       && any_condjump_p (curr))
11392     {
11393       enum attr_type prev_type = get_attr_type (prev);
11394
11395       /* FIXME: this misses some which is considered simple arthematic
11396          instructions for ThunderX.  Simple shifts are missed here.  */
11397       if (prev_type == TYPE_ALUS_SREG
11398           || prev_type == TYPE_ALUS_IMM
11399           || prev_type == TYPE_LOGICS_REG
11400           || prev_type == TYPE_LOGICS_IMM)
11401         return true;
11402     }
11403
11404   return false;
11405 }
11406
11407 /* If MEM is in the form of [base+offset], extract the two parts
11408    of address and set to BASE and OFFSET, otherwise return false
11409    after clearing BASE and OFFSET.  */
11410
11411 bool
11412 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11413 {
11414   rtx addr;
11415
11416   gcc_assert (MEM_P (mem));
11417
11418   addr = XEXP (mem, 0);
11419
11420   if (REG_P (addr))
11421     {
11422       *base = addr;
11423       *offset = const0_rtx;
11424       return true;
11425     }
11426
11427   if (GET_CODE (addr) == PLUS
11428       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11429     {
11430       *base = XEXP (addr, 0);
11431       *offset = XEXP (addr, 1);
11432       return true;
11433     }
11434
11435   *base = NULL_RTX;
11436   *offset = NULL_RTX;
11437
11438   return false;
11439 }
11440
11441 /* Types for scheduling fusion.  */
11442 enum sched_fusion_type
11443 {
11444   SCHED_FUSION_NONE = 0,
11445   SCHED_FUSION_LD_SIGN_EXTEND,
11446   SCHED_FUSION_LD_ZERO_EXTEND,
11447   SCHED_FUSION_LD,
11448   SCHED_FUSION_ST,
11449   SCHED_FUSION_NUM
11450 };
11451
11452 /* If INSN is a load or store of address in the form of [base+offset],
11453    extract the two parts and set to BASE and OFFSET.  Return scheduling
11454    fusion type this INSN is.  */
11455
11456 static enum sched_fusion_type
11457 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11458 {
11459   rtx x, dest, src;
11460   enum sched_fusion_type fusion = SCHED_FUSION_LD;
11461
11462   gcc_assert (INSN_P (insn));
11463   x = PATTERN (insn);
11464   if (GET_CODE (x) != SET)
11465     return SCHED_FUSION_NONE;
11466
11467   src = SET_SRC (x);
11468   dest = SET_DEST (x);
11469
11470   if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11471       && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11472     return SCHED_FUSION_NONE;
11473
11474   if (GET_CODE (src) == SIGN_EXTEND)
11475     {
11476       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11477       src = XEXP (src, 0);
11478       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11479         return SCHED_FUSION_NONE;
11480     }
11481   else if (GET_CODE (src) == ZERO_EXTEND)
11482     {
11483       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11484       src = XEXP (src, 0);
11485       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11486         return SCHED_FUSION_NONE;
11487     }
11488
11489   if (GET_CODE (src) == MEM && REG_P (dest))
11490     extract_base_offset_in_addr (src, base, offset);
11491   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11492     {
11493       fusion = SCHED_FUSION_ST;
11494       extract_base_offset_in_addr (dest, base, offset);
11495     }
11496   else
11497     return SCHED_FUSION_NONE;
11498
11499   if (*base == NULL_RTX || *offset == NULL_RTX)
11500     fusion = SCHED_FUSION_NONE;
11501
11502   return fusion;
11503 }
11504
11505 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11506
11507    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
11508    and PRI are only calculated for these instructions.  For other instruction,
11509    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
11510    type instruction fusion can be added by returning different priorities.
11511
11512    It's important that irrelevant instructions get the largest FUSION_PRI.  */
11513
11514 static void
11515 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11516                                int *fusion_pri, int *pri)
11517 {
11518   int tmp, off_val;
11519   rtx base, offset;
11520   enum sched_fusion_type fusion;
11521
11522   gcc_assert (INSN_P (insn));
11523
11524   tmp = max_pri - 1;
11525   fusion = fusion_load_store (insn, &base, &offset);
11526   if (fusion == SCHED_FUSION_NONE)
11527     {
11528       *pri = tmp;
11529       *fusion_pri = tmp;
11530       return;
11531     }
11532
11533   /* Set FUSION_PRI according to fusion type and base register.  */
11534   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11535
11536   /* Calculate PRI.  */
11537   tmp /= 2;
11538
11539   /* INSN with smaller offset goes first.  */
11540   off_val = (int)(INTVAL (offset));
11541   if (off_val >= 0)
11542     tmp -= (off_val & 0xfffff);
11543   else
11544     tmp += ((- off_val) & 0xfffff);
11545
11546   *pri = tmp;
11547   return;
11548 }
11549
11550 /* Given OPERANDS of consecutive load/store, check if we can merge
11551    them into ldp/stp.  LOAD is true if they are load instructions.
11552    MODE is the mode of memory operands.  */
11553
11554 bool
11555 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11556                                 enum machine_mode mode)
11557 {
11558   HOST_WIDE_INT offval_1, offval_2, msize;
11559   enum reg_class rclass_1, rclass_2;
11560   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11561
11562   if (load)
11563     {
11564       mem_1 = operands[1];
11565       mem_2 = operands[3];
11566       reg_1 = operands[0];
11567       reg_2 = operands[2];
11568       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11569       if (REGNO (reg_1) == REGNO (reg_2))
11570         return false;
11571     }
11572   else
11573     {
11574       mem_1 = operands[0];
11575       mem_2 = operands[2];
11576       reg_1 = operands[1];
11577       reg_2 = operands[3];
11578     }
11579
11580   /* The mems cannot be volatile.  */
11581   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11582     return false;
11583
11584   /* Check if the addresses are in the form of [base+offset].  */
11585   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11586   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11587     return false;
11588   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11589   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11590     return false;
11591
11592   /* Check if the bases are same.  */
11593   if (!rtx_equal_p (base_1, base_2))
11594     return false;
11595
11596   offval_1 = INTVAL (offset_1);
11597   offval_2 = INTVAL (offset_2);
11598   msize = GET_MODE_SIZE (mode);
11599   /* Check if the offsets are consecutive.  */
11600   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11601     return false;
11602
11603   /* Check if the addresses are clobbered by load.  */
11604   if (load)
11605     {
11606       if (reg_mentioned_p (reg_1, mem_1))
11607         return false;
11608
11609       /* In increasing order, the last load can clobber the address.  */
11610       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11611       return false;
11612     }
11613
11614   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11615     rclass_1 = FP_REGS;
11616   else
11617     rclass_1 = GENERAL_REGS;
11618
11619   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11620     rclass_2 = FP_REGS;
11621   else
11622     rclass_2 = GENERAL_REGS;
11623
11624   /* Check if the registers are of same class.  */
11625   if (rclass_1 != rclass_2)
11626     return false;
11627
11628   return true;
11629 }
11630
11631 /* Given OPERANDS of consecutive load/store, check if we can merge
11632    them into ldp/stp by adjusting the offset.  LOAD is true if they
11633    are load instructions.  MODE is the mode of memory operands.
11634
11635    Given below consecutive stores:
11636
11637      str  w1, [xb, 0x100]
11638      str  w1, [xb, 0x104]
11639      str  w1, [xb, 0x108]
11640      str  w1, [xb, 0x10c]
11641
11642    Though the offsets are out of the range supported by stp, we can
11643    still pair them after adjusting the offset, like:
11644
11645      add  scratch, xb, 0x100
11646      stp  w1, w1, [scratch]
11647      stp  w1, w1, [scratch, 0x8]
11648
11649    The peephole patterns detecting this opportunity should guarantee
11650    the scratch register is avaliable.  */
11651
11652 bool
11653 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11654                                        enum machine_mode mode)
11655 {
11656   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11657   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11658   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11659   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11660
11661   if (load)
11662     {
11663       reg_1 = operands[0];
11664       mem_1 = operands[1];
11665       reg_2 = operands[2];
11666       mem_2 = operands[3];
11667       reg_3 = operands[4];
11668       mem_3 = operands[5];
11669       reg_4 = operands[6];
11670       mem_4 = operands[7];
11671       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11672                   && REG_P (reg_3) && REG_P (reg_4));
11673       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11674         return false;
11675     }
11676   else
11677     {
11678       mem_1 = operands[0];
11679       reg_1 = operands[1];
11680       mem_2 = operands[2];
11681       reg_2 = operands[3];
11682       mem_3 = operands[4];
11683       reg_3 = operands[5];
11684       mem_4 = operands[6];
11685       reg_4 = operands[7];
11686     }
11687   /* Skip if memory operand is by itslef valid for ldp/stp.  */
11688   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11689     return false;
11690
11691   /* The mems cannot be volatile.  */
11692   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11693       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
11694     return false;
11695
11696   /* Check if the addresses are in the form of [base+offset].  */
11697   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11698   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11699     return false;
11700   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11701   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11702     return false;
11703   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11704   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11705     return false;
11706   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11707   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11708     return false;
11709
11710   /* Check if the bases are same.  */
11711   if (!rtx_equal_p (base_1, base_2)
11712       || !rtx_equal_p (base_2, base_3)
11713       || !rtx_equal_p (base_3, base_4))
11714     return false;
11715
11716   offval_1 = INTVAL (offset_1);
11717   offval_2 = INTVAL (offset_2);
11718   offval_3 = INTVAL (offset_3);
11719   offval_4 = INTVAL (offset_4);
11720   msize = GET_MODE_SIZE (mode);
11721   /* Check if the offsets are consecutive.  */
11722   if ((offval_1 != (offval_2 + msize)
11723        || offval_1 != (offval_3 + msize * 2)
11724        || offval_1 != (offval_4 + msize * 3))
11725       && (offval_4 != (offval_3 + msize)
11726           || offval_4 != (offval_2 + msize * 2)
11727           || offval_4 != (offval_1 + msize * 3)))
11728     return false;
11729
11730   /* Check if the addresses are clobbered by load.  */
11731   if (load)
11732     {
11733       if (reg_mentioned_p (reg_1, mem_1)
11734           || reg_mentioned_p (reg_2, mem_2)
11735           || reg_mentioned_p (reg_3, mem_3))
11736         return false;
11737
11738       /* In increasing order, the last load can clobber the address.  */
11739       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11740         return false;
11741     }
11742
11743   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11744     rclass_1 = FP_REGS;
11745   else
11746     rclass_1 = GENERAL_REGS;
11747
11748   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11749     rclass_2 = FP_REGS;
11750   else
11751     rclass_2 = GENERAL_REGS;
11752
11753   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11754     rclass_3 = FP_REGS;
11755   else
11756     rclass_3 = GENERAL_REGS;
11757
11758   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11759     rclass_4 = FP_REGS;
11760   else
11761     rclass_4 = GENERAL_REGS;
11762
11763   /* Check if the registers are of same class.  */
11764   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11765     return false;
11766
11767   return true;
11768 }
11769
11770 /* Given OPERANDS of consecutive load/store, this function pairs them
11771    into ldp/stp after adjusting the offset.  It depends on the fact
11772    that addresses of load/store instructions are in increasing order.
11773    MODE is the mode of memory operands.  CODE is the rtl operator
11774    which should be applied to all memory operands, it's SIGN_EXTEND,
11775    ZERO_EXTEND or UNKNOWN.  */
11776
11777 bool
11778 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11779                              enum machine_mode mode, RTX_CODE code)
11780 {
11781   rtx base, offset, t1, t2;
11782   rtx mem_1, mem_2, mem_3, mem_4;
11783   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11784
11785   if (load)
11786     {
11787       mem_1 = operands[1];
11788       mem_2 = operands[3];
11789       mem_3 = operands[5];
11790       mem_4 = operands[7];
11791     }
11792   else
11793     {
11794       mem_1 = operands[0];
11795       mem_2 = operands[2];
11796       mem_3 = operands[4];
11797       mem_4 = operands[6];
11798       gcc_assert (code == UNKNOWN);
11799     }
11800
11801   extract_base_offset_in_addr (mem_1, &base, &offset);
11802   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11803
11804   /* Adjust offset thus it can fit in ldp/stp instruction.  */
11805   msize = GET_MODE_SIZE (mode);
11806   stp_off_limit = msize * 0x40;
11807   off_val = INTVAL (offset);
11808   abs_off = (off_val < 0) ? -off_val : off_val;
11809   new_off = abs_off % stp_off_limit;
11810   adj_off = abs_off - new_off;
11811
11812   /* Further adjust to make sure all offsets are OK.  */
11813   if ((new_off + msize * 2) >= stp_off_limit)
11814     {
11815       adj_off += stp_off_limit;
11816       new_off -= stp_off_limit;
11817     }
11818
11819   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
11820   if (adj_off >= 0x1000)
11821     return false;
11822
11823   if (off_val < 0)
11824     {
11825       adj_off = -adj_off;
11826       new_off = -new_off;
11827     }
11828
11829   /* Create new memory references.  */
11830   mem_1 = change_address (mem_1, VOIDmode,
11831                           plus_constant (DImode, operands[8], new_off));
11832
11833   /* Check if the adjusted address is OK for ldp/stp.  */
11834   if (!aarch64_mem_pair_operand (mem_1, mode))
11835     return false;
11836
11837   msize = GET_MODE_SIZE (mode);
11838   mem_2 = change_address (mem_2, VOIDmode,
11839                           plus_constant (DImode,
11840                                          operands[8],
11841                                          new_off + msize));
11842   mem_3 = change_address (mem_3, VOIDmode,
11843                           plus_constant (DImode,
11844                                          operands[8],
11845                                          new_off + msize * 2));
11846   mem_4 = change_address (mem_4, VOIDmode,
11847                           plus_constant (DImode,
11848                                          operands[8],
11849                                          new_off + msize * 3));
11850
11851   if (code == ZERO_EXTEND)
11852     {
11853       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11854       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11855       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11856       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11857     }
11858   else if (code == SIGN_EXTEND)
11859     {
11860       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11861       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11862       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11863       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11864     }
11865
11866   if (load)
11867     {
11868       operands[1] = mem_1;
11869       operands[3] = mem_2;
11870       operands[5] = mem_3;
11871       operands[7] = mem_4;
11872     }
11873   else
11874     {
11875       operands[0] = mem_1;
11876       operands[2] = mem_2;
11877       operands[4] = mem_3;
11878       operands[6] = mem_4;
11879     }
11880
11881   /* Emit adjusting instruction.  */
11882   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11883   /* Emit ldp/stp instructions.  */
11884   t1 = gen_rtx_SET (operands[0], operands[1]);
11885   t2 = gen_rtx_SET (operands[2], operands[3]);
11886   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11887   t1 = gen_rtx_SET (operands[4], operands[5]);
11888   t2 = gen_rtx_SET (operands[6], operands[7]);
11889   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11890   return true;
11891 }
11892
11893 /* Return 1 if pseudo register should be created and used to hold
11894    GOT address for PIC code.  */
11895
11896 bool
11897 aarch64_use_pseudo_pic_reg (void)
11898 {
11899   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
11900 }
11901
11902 #undef TARGET_ADDRESS_COST
11903 #define TARGET_ADDRESS_COST aarch64_address_cost
11904
11905 /* This hook will determines whether unnamed bitfields affect the alignment
11906    of the containing structure.  The hook returns true if the structure
11907    should inherit the alignment requirements of an unnamed bitfield's
11908    type.  */
11909 #undef TARGET_ALIGN_ANON_BITFIELD
11910 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11911
11912 #undef TARGET_ASM_ALIGNED_DI_OP
11913 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11914
11915 #undef TARGET_ASM_ALIGNED_HI_OP
11916 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11917
11918 #undef TARGET_ASM_ALIGNED_SI_OP
11919 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11920
11921 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11922 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11923   hook_bool_const_tree_hwi_hwi_const_tree_true
11924
11925 #undef TARGET_ASM_FILE_START
11926 #define TARGET_ASM_FILE_START aarch64_start_file
11927
11928 #undef TARGET_ASM_OUTPUT_MI_THUNK
11929 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11930
11931 #undef TARGET_ASM_SELECT_RTX_SECTION
11932 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11933
11934 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11935 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11936
11937 #undef TARGET_BUILD_BUILTIN_VA_LIST
11938 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11939
11940 #undef TARGET_CALLEE_COPIES
11941 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11942
11943 #undef TARGET_CAN_ELIMINATE
11944 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11945
11946 #undef TARGET_CANNOT_FORCE_CONST_MEM
11947 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11948
11949 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11950 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11951
11952 /* Only the least significant bit is used for initialization guard
11953    variables.  */
11954 #undef TARGET_CXX_GUARD_MASK_BIT
11955 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11956
11957 #undef TARGET_C_MODE_FOR_SUFFIX
11958 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11959
11960 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11961 #undef  TARGET_DEFAULT_TARGET_FLAGS
11962 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11963 #endif
11964
11965 #undef TARGET_CLASS_MAX_NREGS
11966 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11967
11968 #undef TARGET_BUILTIN_DECL
11969 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11970
11971 #undef  TARGET_EXPAND_BUILTIN
11972 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11973
11974 #undef TARGET_EXPAND_BUILTIN_VA_START
11975 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11976
11977 #undef TARGET_FOLD_BUILTIN
11978 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11979
11980 #undef TARGET_FUNCTION_ARG
11981 #define TARGET_FUNCTION_ARG aarch64_function_arg
11982
11983 #undef TARGET_FUNCTION_ARG_ADVANCE
11984 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11985
11986 #undef TARGET_FUNCTION_ARG_BOUNDARY
11987 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11988
11989 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11990 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11991
11992 #undef TARGET_FUNCTION_VALUE
11993 #define TARGET_FUNCTION_VALUE aarch64_function_value
11994
11995 #undef TARGET_FUNCTION_VALUE_REGNO_P
11996 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11997
11998 #undef TARGET_FRAME_POINTER_REQUIRED
11999 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
12000
12001 #undef TARGET_GIMPLE_FOLD_BUILTIN
12002 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
12003
12004 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
12005 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
12006
12007 #undef  TARGET_INIT_BUILTINS
12008 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
12009
12010 #undef TARGET_LEGITIMATE_ADDRESS_P
12011 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
12012
12013 #undef TARGET_LEGITIMATE_CONSTANT_P
12014 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
12015
12016 #undef TARGET_LIBGCC_CMP_RETURN_MODE
12017 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
12018
12019 #undef TARGET_LRA_P
12020 #define TARGET_LRA_P hook_bool_void_true
12021
12022 #undef TARGET_MANGLE_TYPE
12023 #define TARGET_MANGLE_TYPE aarch64_mangle_type
12024
12025 #undef TARGET_MEMORY_MOVE_COST
12026 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
12027
12028 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
12029 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
12030
12031 #undef TARGET_MUST_PASS_IN_STACK
12032 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
12033
12034 /* This target hook should return true if accesses to volatile bitfields
12035    should use the narrowest mode possible.  It should return false if these
12036    accesses should use the bitfield container type.  */
12037 #undef TARGET_NARROW_VOLATILE_BITFIELD
12038 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
12039
12040 #undef  TARGET_OPTION_OVERRIDE
12041 #define TARGET_OPTION_OVERRIDE aarch64_override_options
12042
12043 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
12044 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
12045   aarch64_override_options_after_change
12046
12047 #undef TARGET_PASS_BY_REFERENCE
12048 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
12049
12050 #undef TARGET_PREFERRED_RELOAD_CLASS
12051 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
12052
12053 #undef TARGET_SCHED_REASSOCIATION_WIDTH
12054 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
12055
12056 #undef TARGET_SECONDARY_RELOAD
12057 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
12058
12059 #undef TARGET_SHIFT_TRUNCATION_MASK
12060 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
12061
12062 #undef TARGET_SETUP_INCOMING_VARARGS
12063 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
12064
12065 #undef TARGET_STRUCT_VALUE_RTX
12066 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
12067
12068 #undef TARGET_REGISTER_MOVE_COST
12069 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
12070
12071 #undef TARGET_RETURN_IN_MEMORY
12072 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
12073
12074 #undef TARGET_RETURN_IN_MSB
12075 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
12076
12077 #undef TARGET_RTX_COSTS
12078 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
12079
12080 #undef TARGET_SCHED_ISSUE_RATE
12081 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
12082
12083 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
12084 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
12085   aarch64_sched_first_cycle_multipass_dfa_lookahead
12086
12087 #undef TARGET_TRAMPOLINE_INIT
12088 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
12089
12090 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
12091 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
12092
12093 #undef TARGET_VECTOR_MODE_SUPPORTED_P
12094 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
12095
12096 #undef TARGET_ARRAY_MODE_SUPPORTED_P
12097 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
12098
12099 #undef TARGET_VECTORIZE_ADD_STMT_COST
12100 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
12101
12102 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
12103 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
12104   aarch64_builtin_vectorization_cost
12105
12106 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
12107 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
12108
12109 #undef TARGET_VECTORIZE_BUILTINS
12110 #define TARGET_VECTORIZE_BUILTINS
12111
12112 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
12113 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
12114   aarch64_builtin_vectorized_function
12115
12116 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
12117 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
12118   aarch64_autovectorize_vector_sizes
12119
12120 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
12121 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
12122   aarch64_atomic_assign_expand_fenv
12123
12124 /* Section anchor support.  */
12125
12126 #undef TARGET_MIN_ANCHOR_OFFSET
12127 #define TARGET_MIN_ANCHOR_OFFSET -256
12128
12129 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
12130    byte offset; we can do much more for larger data types, but have no way
12131    to determine the size of the access.  We assume accesses are aligned.  */
12132 #undef TARGET_MAX_ANCHOR_OFFSET
12133 #define TARGET_MAX_ANCHOR_OFFSET 4095
12134
12135 #undef TARGET_VECTOR_ALIGNMENT
12136 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
12137
12138 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
12139 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
12140   aarch64_simd_vector_alignment_reachable
12141
12142 /* vec_perm support.  */
12143
12144 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
12145 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
12146   aarch64_vectorize_vec_perm_const_ok
12147
12148
12149 #undef TARGET_FIXED_CONDITION_CODE_REGS
12150 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
12151
12152 #undef TARGET_FLAGS_REGNUM
12153 #define TARGET_FLAGS_REGNUM CC_REGNUM
12154
12155 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
12156 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
12157
12158 #undef TARGET_ASAN_SHADOW_OFFSET
12159 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
12160
12161 #undef TARGET_LEGITIMIZE_ADDRESS
12162 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
12163
12164 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
12165 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
12166   aarch64_use_by_pieces_infrastructure_p
12167
12168 #undef TARGET_CAN_USE_DOLOOP_P
12169 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
12170
12171 #undef TARGET_SCHED_MACRO_FUSION_P
12172 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
12173
12174 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
12175 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
12176
12177 #undef TARGET_SCHED_FUSION_PRIORITY
12178 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
12179
12180 #undef TARGET_USE_PSEUDO_PIC_REG
12181 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
12182
12183 struct gcc_target targetm = TARGET_INITIALIZER;
12184
12185 #include "gt-aarch64.h"