gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2015 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "insn-codes.h"
  26 #include "rtl.h"
  27 #include "insn-attr.h"
  28 #include "hash-set.h"
  29 #include "machmode.h"
  30 #include "vec.h"
  31 #include "double-int.h"
  32 #include "input.h"
  33 #include "alias.h"
  34 #include "symtab.h"
  35 #include "wide-int.h"
  36 #include "inchash.h"
  37 #include "tree.h"
  38 #include "fold-const.h"
  39 #include "stringpool.h"
  40 #include "stor-layout.h"
  41 #include "calls.h"
  42 #include "varasm.h"
  43 #include "regs.h"
  44 #include "dominance.h"
  45 #include "cfg.h"
  46 #include "cfgrtl.h"
  47 #include "cfganal.h"
  48 #include "lcm.h"
  49 #include "cfgbuild.h"
  50 #include "cfgcleanup.h"
  51 #include "predict.h"
  52 #include "basic-block.h"
  53 #include "df.h"
  54 #include "hard-reg-set.h"
  55 #include "output.h"
  56 #include "hashtab.h"
  57 #include "function.h"
  58 #include "flags.h"
  59 #include "statistics.h"
  60 #include "real.h"
  61 #include "fixed-value.h"
  62 #include "insn-config.h"
  63 #include "expmed.h"
  64 #include "dojump.h"
  65 #include "explow.h"
  66 #include "emit-rtl.h"
  67 #include "stmt.h"
  68 #include "expr.h"
  69 #include "reload.h"
  70 #include "toplev.h"
  71 #include "target.h"
  72 #include "target-def.h"
  73 #include "targhooks.h"
  74 #include "ggc.h"
  75 #include "tm_p.h"
  76 #include "recog.h"
  77 #include "langhooks.h"
  78 #include "diagnostic-core.h"
  79 #include "hash-table.h"
  80 #include "tree-ssa-alias.h"
  81 #include "internal-fn.h"
  82 #include "gimple-fold.h"
  83 #include "tree-eh.h"
  84 #include "gimple-expr.h"
  85 #include "is-a.h"
  86 #include "gimple.h"
  87 #include "gimplify.h"
  88 #include "optabs.h"
  89 #include "dwarf2.h"
  90 #include "cfgloop.h"
  91 #include "tree-vectorizer.h"
  92 #include "aarch64-cost-tables.h"
  93 #include "dumpfile.h"
  94 #include "builtins.h"
  95 #include "rtl-iter.h"
  96 #include "tm-constrs.h"
  97 #include "sched-int.h"
  98
  99 /* Defined for convenience.  */
 100 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 101
 102 /* Classifies an address.
 103
 104    ADDRESS_REG_IMM
 105        A simple base register plus immediate offset.
 106
 107    ADDRESS_REG_WB
 108        A base register indexed by immediate offset with writeback.
 109
 110    ADDRESS_REG_REG
 111        A base register indexed by (optionally scaled) register.
 112
 113    ADDRESS_REG_UXTW
 114        A base register indexed by (optionally scaled) zero-extended register.
 115
 116    ADDRESS_REG_SXTW
 117        A base register indexed by (optionally scaled) sign-extended register.
 118
 119    ADDRESS_LO_SUM
 120        A LO_SUM rtx with a base register and "LO12" symbol relocation.
 121
 122    ADDRESS_SYMBOLIC:
 123        A constant symbolic address, in pc-relative literal pool.  */
 124
 125 enum aarch64_address_type {
 126   ADDRESS_REG_IMM,
 127   ADDRESS_REG_WB,
 128   ADDRESS_REG_REG,
 129   ADDRESS_REG_UXTW,
 130   ADDRESS_REG_SXTW,
 131   ADDRESS_LO_SUM,
 132   ADDRESS_SYMBOLIC
 133 };
 134
 135 struct aarch64_address_info {
 136   enum aarch64_address_type type;
 137   rtx base;
 138   rtx offset;
 139   int shift;
 140   enum aarch64_symbol_type symbol_type;
 141 };
 142
 143 struct simd_immediate_info
 144 {
 145   rtx value;
 146   int shift;
 147   int element_width;
 148   bool mvn;
 149   bool msl;
 150 };
 151
 152 /* The current code model.  */
 153 enum aarch64_code_model aarch64_cmodel;
 154
 155 #ifdef HAVE_AS_TLS
 156 #undef TARGET_HAVE_TLS
 157 #define TARGET_HAVE_TLS 1
 158 #endif
 159
 160 static bool aarch64_composite_type_p (const_tree, machine_mode);
 161 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 162                                                      const_tree,
 163                                                      machine_mode *, int *,
 164                                                      bool *);
 165 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 166 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 167 static void aarch64_override_options_after_change (void);
 168 static bool aarch64_vector_mode_supported_p (machine_mode);
 169 static unsigned bit_count (unsigned HOST_WIDE_INT);
 170 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 171                                                  const unsigned char *sel);
 172 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 173
 174 /* Major revision number of the ARM Architecture implemented by the target.  */
 175 unsigned aarch64_architecture_version;
 176
 177 /* The processor for which instructions should be scheduled.  */
 178 enum aarch64_processor aarch64_tune = cortexa53;
 179
 180 /* The current tuning set.  */
 181 const struct tune_params *aarch64_tune_params;
 182
 183 /* Mask to specify which instructions we are allowed to generate.  */
 184 unsigned long aarch64_isa_flags = 0;
 185
 186 /* Mask to specify which instruction scheduling options should be used.  */
 187 unsigned long aarch64_tune_flags = 0;
 188
 189 /* Tuning parameters.  */
 190
 191 static const struct cpu_addrcost_table generic_addrcost_table =
 192 {
 193     {
 194       0, /* hi  */
 195       0, /* si  */
 196       0, /* di  */
 197       0, /* ti  */
 198     },
 199   0, /* pre_modify  */
 200   0, /* post_modify  */
 201   0, /* register_offset  */
 202   0, /* register_extend  */
 203   0 /* imm_offset  */
 204 };
 205
 206 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 207 {
 208     {
 209       1, /* hi  */
 210       0, /* si  */
 211       0, /* di  */
 212       1, /* ti  */
 213     },
 214   0, /* pre_modify  */
 215   0, /* post_modify  */
 216   0, /* register_offset  */
 217   0, /* register_extend  */
 218   0, /* imm_offset  */
 219 };
 220
 221 static const struct cpu_addrcost_table xgene1_addrcost_table =
 222 {
 223     {
 224       1, /* hi  */
 225       0, /* si  */
 226       0, /* di  */
 227       1, /* ti  */
 228     },
 229   1, /* pre_modify  */
 230   0, /* post_modify  */
 231   0, /* register_offset  */
 232   1, /* register_extend  */
 233   0, /* imm_offset  */
 234 };
 235
 236 static const struct cpu_regmove_cost generic_regmove_cost =
 237 {
 238   1, /* GP2GP  */
 239   /* Avoid the use of slow int<->fp moves for spilling by setting
 240      their cost higher than memmov_cost.  */
 241   5, /* GP2FP  */
 242   5, /* FP2GP  */
 243   2 /* FP2FP  */
 244 };
 245
 246 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 247 {
 248   1, /* GP2GP  */
 249   /* Avoid the use of slow int<->fp moves for spilling by setting
 250      their cost higher than memmov_cost.  */
 251   5, /* GP2FP  */
 252   5, /* FP2GP  */
 253   2 /* FP2FP  */
 254 };
 255
 256 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 257 {
 258   1, /* GP2GP  */
 259   /* Avoid the use of slow int<->fp moves for spilling by setting
 260      their cost higher than memmov_cost.  */
 261   5, /* GP2FP  */
 262   5, /* FP2GP  */
 263   2 /* FP2FP  */
 264 };
 265
 266 static const struct cpu_regmove_cost thunderx_regmove_cost =
 267 {
 268   2, /* GP2GP  */
 269   2, /* GP2FP  */
 270   6, /* FP2GP  */
 271   4 /* FP2FP  */
 272 };
 273
 274 static const struct cpu_regmove_cost xgene1_regmove_cost =
 275 {
 276   1, /* GP2GP  */
 277   /* Avoid the use of slow int<->fp moves for spilling by setting
 278      their cost higher than memmov_cost.  */
 279   8, /* GP2FP  */
 280   8, /* FP2GP  */
 281   2 /* FP2FP  */
 282 };
 283
 284 /* Generic costs for vector insn classes.  */
 285 static const struct cpu_vector_cost generic_vector_cost =
 286 {
 287   1, /* scalar_stmt_cost  */
 288   1, /* scalar_load_cost  */
 289   1, /* scalar_store_cost  */
 290   1, /* vec_stmt_cost  */
 291   1, /* vec_to_scalar_cost  */
 292   1, /* scalar_to_vec_cost  */
 293   1, /* vec_align_load_cost  */
 294   1, /* vec_unalign_load_cost  */
 295   1, /* vec_unalign_store_cost  */
 296   1, /* vec_store_cost  */
 297   3, /* cond_taken_branch_cost  */
 298   1 /* cond_not_taken_branch_cost  */
 299 };
 300
 301 /* Generic costs for vector insn classes.  */
 302 static const struct cpu_vector_cost cortexa57_vector_cost =
 303 {
 304   1, /* scalar_stmt_cost  */
 305   4, /* scalar_load_cost  */
 306   1, /* scalar_store_cost  */
 307   3, /* vec_stmt_cost  */
 308   8, /* vec_to_scalar_cost  */
 309   8, /* scalar_to_vec_cost  */
 310   5, /* vec_align_load_cost  */
 311   5, /* vec_unalign_load_cost  */
 312   1, /* vec_unalign_store_cost  */
 313   1, /* vec_store_cost  */
 314   1, /* cond_taken_branch_cost  */
 315   1 /* cond_not_taken_branch_cost  */
 316 };
 317
 318 /* Generic costs for vector insn classes.  */
 319 static const struct cpu_vector_cost xgene1_vector_cost =
 320 {
 321   1, /* scalar_stmt_cost  */
 322   5, /* scalar_load_cost  */
 323   1, /* scalar_store_cost  */
 324   2, /* vec_stmt_cost  */
 325   4, /* vec_to_scalar_cost  */
 326   4, /* scalar_to_vec_cost  */
 327   10, /* vec_align_load_cost  */
 328   10, /* vec_unalign_load_cost  */
 329   2, /* vec_unalign_store_cost  */
 330   2, /* vec_store_cost  */
 331   2, /* cond_taken_branch_cost  */
 332   1 /* cond_not_taken_branch_cost  */
 333 };
 334
 335 #define AARCH64_FUSE_NOTHING    (0)
 336 #define AARCH64_FUSE_MOV_MOVK   (1 << 0)
 337 #define AARCH64_FUSE_ADRP_ADD   (1 << 1)
 338 #define AARCH64_FUSE_MOVK_MOVK  (1 << 2)
 339 #define AARCH64_FUSE_ADRP_LDR   (1 << 3)
 340 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
 341
 342 static const struct tune_params generic_tunings =
 343 {
 344   &cortexa57_extra_costs,
 345   &generic_addrcost_table,
 346   &generic_regmove_cost,
 347   &generic_vector_cost,
 348   4, /* memmov_cost  */
 349   2, /* issue_rate  */
 350   AARCH64_FUSE_NOTHING, /* fuseable_ops  */
 351   8,    /* function_align.  */
 352   8,    /* jump_align.  */
 353   4,    /* loop_align.  */
 354   2,    /* int_reassoc_width.  */
 355   4,    /* fp_reassoc_width.  */
 356   1     /* vec_reassoc_width.  */
 357 };
 358
 359 static const struct tune_params cortexa53_tunings =
 360 {
 361   &cortexa53_extra_costs,
 362   &generic_addrcost_table,
 363   &cortexa53_regmove_cost,
 364   &generic_vector_cost,
 365   4, /* memmov_cost  */
 366   2, /* issue_rate  */
 367   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 368    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops  */
 369   8,    /* function_align.  */
 370   8,    /* jump_align.  */
 371   4,    /* loop_align.  */
 372   2,    /* int_reassoc_width.  */
 373   4,    /* fp_reassoc_width.  */
 374   1     /* vec_reassoc_width.  */
 375 };
 376
 377 static const struct tune_params cortexa57_tunings =
 378 {
 379   &cortexa57_extra_costs,
 380   &cortexa57_addrcost_table,
 381   &cortexa57_regmove_cost,
 382   &cortexa57_vector_cost,
 383   4, /* memmov_cost  */
 384   3, /* issue_rate  */
 385   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 386    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 387   16,   /* function_align.  */
 388   8,    /* jump_align.  */
 389   4,    /* loop_align.  */
 390   2,    /* int_reassoc_width.  */
 391   4,    /* fp_reassoc_width.  */
 392   1     /* vec_reassoc_width.  */
 393 };
 394
 395 static const struct tune_params thunderx_tunings =
 396 {
 397   &thunderx_extra_costs,
 398   &generic_addrcost_table,
 399   &thunderx_regmove_cost,
 400   &generic_vector_cost,
 401   6, /* memmov_cost  */
 402   2, /* issue_rate  */
 403   AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops  */
 404   8,    /* function_align.  */
 405   8,    /* jump_align.  */
 406   8,    /* loop_align.  */
 407   2,    /* int_reassoc_width.  */
 408   4,    /* fp_reassoc_width.  */
 409   1     /* vec_reassoc_width.  */
 410 };
 411
 412 static const struct tune_params xgene1_tunings =
 413 {
 414   &xgene1_extra_costs,
 415   &xgene1_addrcost_table,
 416   &xgene1_regmove_cost,
 417   &xgene1_vector_cost,
 418   6, /* memmov_cost  */
 419   4, /* issue_rate  */
 420   AARCH64_FUSE_NOTHING, /* fuseable_ops  */
 421   16,   /* function_align.  */
 422   8,    /* jump_align.  */
 423   16,   /* loop_align.  */
 424   2,    /* int_reassoc_width.  */
 425   4,    /* fp_reassoc_width.  */
 426   1     /* vec_reassoc_width.  */
 427 };
 428
 429 /* A processor implementing AArch64.  */
 430 struct processor
 431 {
 432   const char *const name;
 433   enum aarch64_processor core;
 434   const char *arch;
 435   unsigned architecture_version;
 436   const unsigned long flags;
 437   const struct tune_params *const tune;
 438 };
 439
 440 /* Processor cores implementing AArch64.  */
 441 static const struct processor all_cores[] =
 442 {
 443 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
 444   {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
 445 #include "aarch64-cores.def"
 446 #undef AARCH64_CORE
 447   {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
 448   {NULL, aarch64_none, NULL, 0, 0, NULL}
 449 };
 450
 451 /* Architectures implementing AArch64.  */
 452 static const struct processor all_architectures[] =
 453 {
 454 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
 455   {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
 456 #include "aarch64-arches.def"
 457 #undef AARCH64_ARCH
 458   {NULL, aarch64_none, NULL, 0, 0, NULL}
 459 };
 460
 461 /* Target specification.  These are populated as commandline arguments
 462    are processed, or NULL if not specified.  */
 463 static const struct processor *selected_arch;
 464 static const struct processor *selected_cpu;
 465 static const struct processor *selected_tune;
 466
 467 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 468
 469 /* An ISA extension in the co-processor and main instruction set space.  */
 470 struct aarch64_option_extension
 471 {
 472   const char *const name;
 473   const unsigned long flags_on;
 474   const unsigned long flags_off;
 475 };
 476
 477 /* ISA extensions in AArch64.  */
 478 static const struct aarch64_option_extension all_extensions[] =
 479 {
 480 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
 481   {NAME, FLAGS_ON, FLAGS_OFF},
 482 #include "aarch64-option-extensions.def"
 483 #undef AARCH64_OPT_EXTENSION
 484   {NULL, 0, 0}
 485 };
 486
 487 /* Used to track the size of an address when generating a pre/post
 488    increment address.  */
 489 static machine_mode aarch64_memory_reference_mode;
 490
 491 /* A table of valid AArch64 "bitmask immediate" values for
 492    logical instructions.  */
 493
 494 #define AARCH64_NUM_BITMASKS  5334
 495 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
 496
 497 typedef enum aarch64_cond_code
 498 {
 499   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 500   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 501   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 502 }
 503 aarch64_cc;
 504
 505 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 506
 507 /* The condition codes of the processor, and the inverse function.  */
 508 static const char * const aarch64_condition_codes[] =
 509 {
 510   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 511   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 512 };
 513
 514 static unsigned int
 515 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
 516 {
 517   return 2;
 518 }
 519
 520 static int
 521 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 522                              enum machine_mode mode)
 523 {
 524   if (VECTOR_MODE_P (mode))
 525     return aarch64_tune_params->vec_reassoc_width;
 526   if (INTEGRAL_MODE_P (mode))
 527     return aarch64_tune_params->int_reassoc_width;
 528   if (FLOAT_MODE_P (mode))
 529     return aarch64_tune_params->fp_reassoc_width;
 530   return 1;
 531 }
 532
 533 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 534 unsigned
 535 aarch64_dbx_register_number (unsigned regno)
 536 {
 537    if (GP_REGNUM_P (regno))
 538      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 539    else if (regno == SP_REGNUM)
 540      return AARCH64_DWARF_SP;
 541    else if (FP_REGNUM_P (regno))
 542      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 543
 544    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 545       equivalent DWARF register.  */
 546    return DWARF_FRAME_REGISTERS;
 547 }
 548
 549 /* Return TRUE if MODE is any of the large INT modes.  */
 550 static bool
 551 aarch64_vect_struct_mode_p (machine_mode mode)
 552 {
 553   return mode == OImode || mode == CImode || mode == XImode;
 554 }
 555
 556 /* Return TRUE if MODE is any of the vector modes.  */
 557 static bool
 558 aarch64_vector_mode_p (machine_mode mode)
 559 {
 560   return aarch64_vector_mode_supported_p (mode)
 561          || aarch64_vect_struct_mode_p (mode);
 562 }
 563
 564 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 565 static bool
 566 aarch64_array_mode_supported_p (machine_mode mode,
 567                                 unsigned HOST_WIDE_INT nelems)
 568 {
 569   if (TARGET_SIMD
 570       && AARCH64_VALID_SIMD_QREG_MODE (mode)
 571       && (nelems >= 2 && nelems <= 4))
 572     return true;
 573
 574   return false;
 575 }
 576
 577 /* Implement HARD_REGNO_NREGS.  */
 578
 579 int
 580 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
 581 {
 582   switch (aarch64_regno_regclass (regno))
 583     {
 584     case FP_REGS:
 585     case FP_LO_REGS:
 586       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
 587     default:
 588       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 589     }
 590   gcc_unreachable ();
 591 }
 592
 593 /* Implement HARD_REGNO_MODE_OK.  */
 594
 595 int
 596 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
 597 {
 598   if (GET_MODE_CLASS (mode) == MODE_CC)
 599     return regno == CC_REGNUM;
 600
 601   if (regno == SP_REGNUM)
 602     /* The purpose of comparing with ptr_mode is to support the
 603        global register variable associated with the stack pointer
 604        register via the syntax of asm ("wsp") in ILP32.  */
 605     return mode == Pmode || mode == ptr_mode;
 606
 607   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 608     return mode == Pmode;
 609
 610   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
 611     return 1;
 612
 613   if (FP_REGNUM_P (regno))
 614     {
 615       if (aarch64_vect_struct_mode_p (mode))
 616         return
 617           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
 618       else
 619         return 1;
 620     }
 621
 622   return 0;
 623 }
 624
 625 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
 626 machine_mode
 627 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
 628                                      machine_mode mode)
 629 {
 630   /* Handle modes that fit within single registers.  */
 631   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
 632     {
 633       if (GET_MODE_SIZE (mode) >= 4)
 634         return mode;
 635       else
 636         return SImode;
 637     }
 638   /* Fall back to generic for multi-reg and very large modes.  */
 639   else
 640     return choose_hard_reg_mode (regno, nregs, false);
 641 }
 642
 643 /* Return true if calls to DECL should be treated as
 644    long-calls (ie called via a register).  */
 645 static bool
 646 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
 647 {
 648   return false;
 649 }
 650
 651 /* Return true if calls to symbol-ref SYM should be treated as
 652    long-calls (ie called via a register).  */
 653 bool
 654 aarch64_is_long_call_p (rtx sym)
 655 {
 656   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
 657 }
 658
 659 /* Return true if the offsets to a zero/sign-extract operation
 660    represent an expression that matches an extend operation.  The
 661    operands represent the paramters from
 662
 663    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 664 bool
 665 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
 666                                 rtx extract_imm)
 667 {
 668   HOST_WIDE_INT mult_val, extract_val;
 669
 670   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
 671     return false;
 672
 673   mult_val = INTVAL (mult_imm);
 674   extract_val = INTVAL (extract_imm);
 675
 676   if (extract_val > 8
 677       && extract_val < GET_MODE_BITSIZE (mode)
 678       && exact_log2 (extract_val & ~7) > 0
 679       && (extract_val & 7) <= 4
 680       && mult_val == (1 << (extract_val & 7)))
 681     return true;
 682
 683   return false;
 684 }
 685
 686 /* Emit an insn that's a simple single-set.  Both the operands must be
 687    known to be valid.  */
 688 inline static rtx
 689 emit_set_insn (rtx x, rtx y)
 690 {
 691   return emit_insn (gen_rtx_SET (VOIDmode, x, y));
 692 }
 693
 694 /* X and Y are two things to compare using CODE.  Emit the compare insn and
 695    return the rtx for register 0 in the proper mode.  */
 696 rtx
 697 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 698 {
 699   machine_mode mode = SELECT_CC_MODE (code, x, y);
 700   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
 701
 702   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
 703   return cc_reg;
 704 }
 705
 706 /* Build the SYMBOL_REF for __tls_get_addr.  */
 707
 708 static GTY(()) rtx tls_get_addr_libfunc;
 709
 710 rtx
 711 aarch64_tls_get_addr (void)
 712 {
 713   if (!tls_get_addr_libfunc)
 714     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
 715   return tls_get_addr_libfunc;
 716 }
 717
 718 /* Return the TLS model to use for ADDR.  */
 719
 720 static enum tls_model
 721 tls_symbolic_operand_type (rtx addr)
 722 {
 723   enum tls_model tls_kind = TLS_MODEL_NONE;
 724   rtx sym, addend;
 725
 726   if (GET_CODE (addr) == CONST)
 727     {
 728       split_const (addr, &sym, &addend);
 729       if (GET_CODE (sym) == SYMBOL_REF)
 730         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
 731     }
 732   else if (GET_CODE (addr) == SYMBOL_REF)
 733     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
 734
 735   return tls_kind;
 736 }
 737
 738 /* We'll allow lo_sum's in addresses in our legitimate addresses
 739    so that combine would take care of combining addresses where
 740    necessary, but for generation purposes, we'll generate the address
 741    as :
 742    RTL                               Absolute
 743    tmp = hi (symbol_ref);            adrp  x1, foo
 744    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
 745                                      nop
 746
 747    PIC                               TLS
 748    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
 749    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
 750                                      bl   __tls_get_addr
 751                                      nop
 752
 753    Load TLS symbol, depending on TLS mechanism and TLS access model.
 754
 755    Global Dynamic - Traditional TLS:
 756    adrp tmp, :tlsgd:imm
 757    add  dest, tmp, #:tlsgd_lo12:imm
 758    bl   __tls_get_addr
 759
 760    Global Dynamic - TLS Descriptors:
 761    adrp dest, :tlsdesc:imm
 762    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
 763    add  dest, dest, #:tlsdesc_lo12:imm
 764    blr  tmp
 765    mrs  tp, tpidr_el0
 766    add  dest, dest, tp
 767
 768    Initial Exec:
 769    mrs  tp, tpidr_el0
 770    adrp tmp, :gottprel:imm
 771    ldr  dest, [tmp, #:gottprel_lo12:imm]
 772    add  dest, dest, tp
 773
 774    Local Exec:
 775    mrs  tp, tpidr_el0
 776    add  t0, tp, #:tprel_hi12:imm, lsl #12
 777    add  t0, t0, #:tprel_lo12_nc:imm
 778 */
 779
 780 static void
 781 aarch64_load_symref_appropriately (rtx dest, rtx imm,
 782                                    enum aarch64_symbol_type type)
 783 {
 784   switch (type)
 785     {
 786     case SYMBOL_SMALL_ABSOLUTE:
 787       {
 788         /* In ILP32, the mode of dest can be either SImode or DImode.  */
 789         rtx tmp_reg = dest;
 790         machine_mode mode = GET_MODE (dest);
 791
 792         gcc_assert (mode == Pmode || mode == ptr_mode);
 793
 794         if (can_create_pseudo_p ())
 795           tmp_reg = gen_reg_rtx (mode);
 796
 797         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 798         emit_insn (gen_add_losym (dest, tmp_reg, imm));
 799         return;
 800       }
 801
 802     case SYMBOL_TINY_ABSOLUTE:
 803       emit_insn (gen_rtx_SET (Pmode, dest, imm));
 804       return;
 805
 806     case SYMBOL_SMALL_GOT:
 807       {
 808         /* In ILP32, the mode of dest can be either SImode or DImode,
 809            while the got entry is always of SImode size.  The mode of
 810            dest depends on how dest is used: if dest is assigned to a
 811            pointer (e.g. in the memory), it has SImode; it may have
 812            DImode if dest is dereferenced to access the memeory.
 813            This is why we have to handle three different ldr_got_small
 814            patterns here (two patterns for ILP32).  */
 815         rtx tmp_reg = dest;
 816         machine_mode mode = GET_MODE (dest);
 817
 818         if (can_create_pseudo_p ())
 819           tmp_reg = gen_reg_rtx (mode);
 820
 821         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 822         if (mode == ptr_mode)
 823           {
 824             if (mode == DImode)
 825               emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
 826             else
 827               emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
 828           }
 829         else
 830           {
 831             gcc_assert (mode == Pmode);
 832             emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
 833           }
 834
 835         return;
 836       }
 837
 838     case SYMBOL_SMALL_TLSGD:
 839       {
 840         rtx_insn *insns;
 841         rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
 842
 843         start_sequence ();
 844         aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
 845         insns = get_insns ();
 846         end_sequence ();
 847
 848         RTL_CONST_CALL_P (insns) = 1;
 849         emit_libcall_block (insns, dest, result, imm);
 850         return;
 851       }
 852
 853     case SYMBOL_SMALL_TLSDESC:
 854       {
 855         machine_mode mode = GET_MODE (dest);
 856         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
 857         rtx tp;
 858
 859         gcc_assert (mode == Pmode || mode == ptr_mode);
 860
 861         /* In ILP32, the got entry is always of SImode size.  Unlike
 862            small GOT, the dest is fixed at reg 0.  */
 863         if (TARGET_ILP32)
 864           emit_insn (gen_tlsdesc_small_si (imm));
 865         else
 866           emit_insn (gen_tlsdesc_small_di (imm));
 867         tp = aarch64_load_tp (NULL);
 868
 869         if (mode != Pmode)
 870           tp = gen_lowpart (mode, tp);
 871
 872         emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
 873         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 874         return;
 875       }
 876
 877     case SYMBOL_SMALL_GOTTPREL:
 878       {
 879         /* In ILP32, the mode of dest can be either SImode or DImode,
 880            while the got entry is always of SImode size.  The mode of
 881            dest depends on how dest is used: if dest is assigned to a
 882            pointer (e.g. in the memory), it has SImode; it may have
 883            DImode if dest is dereferenced to access the memeory.
 884            This is why we have to handle three different tlsie_small
 885            patterns here (two patterns for ILP32).  */
 886         machine_mode mode = GET_MODE (dest);
 887         rtx tmp_reg = gen_reg_rtx (mode);
 888         rtx tp = aarch64_load_tp (NULL);
 889
 890         if (mode == ptr_mode)
 891           {
 892             if (mode == DImode)
 893               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
 894             else
 895               {
 896                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
 897                 tp = gen_lowpart (mode, tp);
 898               }
 899           }
 900         else
 901           {
 902             gcc_assert (mode == Pmode);
 903             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
 904           }
 905
 906         emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
 907         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 908         return;
 909       }
 910
 911     case SYMBOL_SMALL_TPREL:
 912       {
 913         rtx tp = aarch64_load_tp (NULL);
 914
 915         if (GET_MODE (dest) != Pmode)
 916           tp = gen_lowpart (GET_MODE (dest), tp);
 917
 918         emit_insn (gen_tlsle_small (dest, tp, imm));
 919         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 920         return;
 921       }
 922
 923     case SYMBOL_TINY_GOT:
 924       emit_insn (gen_ldr_got_tiny (dest, imm));
 925       return;
 926
 927     default:
 928       gcc_unreachable ();
 929     }
 930 }
 931
 932 /* Emit a move from SRC to DEST.  Assume that the move expanders can
 933    handle all moves if !can_create_pseudo_p ().  The distinction is
 934    important because, unlike emit_move_insn, the move expanders know
 935    how to force Pmode objects into the constant pool even when the
 936    constant pool address is not itself legitimate.  */
 937 static rtx
 938 aarch64_emit_move (rtx dest, rtx src)
 939 {
 940   return (can_create_pseudo_p ()
 941           ? emit_move_insn (dest, src)
 942           : emit_move_insn_1 (dest, src));
 943 }
 944
 945 /* Split a 128-bit move operation into two 64-bit move operations,
 946    taking care to handle partial overlap of register to register
 947    copies.  Special cases are needed when moving between GP regs and
 948    FP regs.  SRC can be a register, constant or memory; DST a register
 949    or memory.  If either operand is memory it must not have any side
 950    effects.  */
 951 void
 952 aarch64_split_128bit_move (rtx dst, rtx src)
 953 {
 954   rtx dst_lo, dst_hi;
 955   rtx src_lo, src_hi;
 956
 957   machine_mode mode = GET_MODE (dst);
 958
 959   gcc_assert (mode == TImode || mode == TFmode);
 960   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
 961   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
 962
 963   if (REG_P (dst) && REG_P (src))
 964     {
 965       int src_regno = REGNO (src);
 966       int dst_regno = REGNO (dst);
 967
 968       /* Handle FP <-> GP regs.  */
 969       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
 970         {
 971           src_lo = gen_lowpart (word_mode, src);
 972           src_hi = gen_highpart (word_mode, src);
 973
 974           if (mode == TImode)
 975             {
 976               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
 977               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
 978             }
 979           else
 980             {
 981               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
 982               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
 983             }
 984           return;
 985         }
 986       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
 987         {
 988           dst_lo = gen_lowpart (word_mode, dst);
 989           dst_hi = gen_highpart (word_mode, dst);
 990
 991           if (mode == TImode)
 992             {
 993               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
 994               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
 995             }
 996           else
 997             {
 998               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
 999               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1000             }
1001           return;
1002         }
1003     }
1004
1005   dst_lo = gen_lowpart (word_mode, dst);
1006   dst_hi = gen_highpart (word_mode, dst);
1007   src_lo = gen_lowpart (word_mode, src);
1008   src_hi = gen_highpart_mode (word_mode, mode, src);
1009
1010   /* At most one pairing may overlap.  */
1011   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1012     {
1013       aarch64_emit_move (dst_hi, src_hi);
1014       aarch64_emit_move (dst_lo, src_lo);
1015     }
1016   else
1017     {
1018       aarch64_emit_move (dst_lo, src_lo);
1019       aarch64_emit_move (dst_hi, src_hi);
1020     }
1021 }
1022
1023 bool
1024 aarch64_split_128bit_move_p (rtx dst, rtx src)
1025 {
1026   return (! REG_P (src)
1027           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1028 }
1029
1030 /* Split a complex SIMD combine.  */
1031
1032 void
1033 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1034 {
1035   machine_mode src_mode = GET_MODE (src1);
1036   machine_mode dst_mode = GET_MODE (dst);
1037
1038   gcc_assert (VECTOR_MODE_P (dst_mode));
1039
1040   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1041     {
1042       rtx (*gen) (rtx, rtx, rtx);
1043
1044       switch (src_mode)
1045         {
1046         case V8QImode:
1047           gen = gen_aarch64_simd_combinev8qi;
1048           break;
1049         case V4HImode:
1050           gen = gen_aarch64_simd_combinev4hi;
1051           break;
1052         case V2SImode:
1053           gen = gen_aarch64_simd_combinev2si;
1054           break;
1055         case V2SFmode:
1056           gen = gen_aarch64_simd_combinev2sf;
1057           break;
1058         case DImode:
1059           gen = gen_aarch64_simd_combinedi;
1060           break;
1061         case DFmode:
1062           gen = gen_aarch64_simd_combinedf;
1063           break;
1064         default:
1065           gcc_unreachable ();
1066         }
1067
1068       emit_insn (gen (dst, src1, src2));
1069       return;
1070     }
1071 }
1072
1073 /* Split a complex SIMD move.  */
1074
1075 void
1076 aarch64_split_simd_move (rtx dst, rtx src)
1077 {
1078   machine_mode src_mode = GET_MODE (src);
1079   machine_mode dst_mode = GET_MODE (dst);
1080
1081   gcc_assert (VECTOR_MODE_P (dst_mode));
1082
1083   if (REG_P (dst) && REG_P (src))
1084     {
1085       rtx (*gen) (rtx, rtx);
1086
1087       gcc_assert (VECTOR_MODE_P (src_mode));
1088
1089       switch (src_mode)
1090         {
1091         case V16QImode:
1092           gen = gen_aarch64_split_simd_movv16qi;
1093           break;
1094         case V8HImode:
1095           gen = gen_aarch64_split_simd_movv8hi;
1096           break;
1097         case V4SImode:
1098           gen = gen_aarch64_split_simd_movv4si;
1099           break;
1100         case V2DImode:
1101           gen = gen_aarch64_split_simd_movv2di;
1102           break;
1103         case V4SFmode:
1104           gen = gen_aarch64_split_simd_movv4sf;
1105           break;
1106         case V2DFmode:
1107           gen = gen_aarch64_split_simd_movv2df;
1108           break;
1109         default:
1110           gcc_unreachable ();
1111         }
1112
1113       emit_insn (gen (dst, src));
1114       return;
1115     }
1116 }
1117
1118 static rtx
1119 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1120 {
1121   if (can_create_pseudo_p ())
1122     return force_reg (mode, value);
1123   else
1124     {
1125       x = aarch64_emit_move (x, value);
1126       return x;
1127     }
1128 }
1129
1130
1131 static rtx
1132 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1133 {
1134   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1135     {
1136       rtx high;
1137       /* Load the full offset into a register.  This
1138          might be improvable in the future.  */
1139       high = GEN_INT (offset);
1140       offset = 0;
1141       high = aarch64_force_temporary (mode, temp, high);
1142       reg = aarch64_force_temporary (mode, temp,
1143                                      gen_rtx_PLUS (mode, high, reg));
1144     }
1145   return plus_constant (mode, reg, offset);
1146 }
1147
1148 static int
1149 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1150                                 machine_mode mode)
1151 {
1152   unsigned HOST_WIDE_INT mask;
1153   int i;
1154   bool first;
1155   unsigned HOST_WIDE_INT val;
1156   bool subtargets;
1157   rtx subtarget;
1158   int one_match, zero_match, first_not_ffff_match;
1159   int num_insns = 0;
1160
1161   if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1162     {
1163       if (generate)
1164         emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1165       num_insns++;
1166       return num_insns;
1167     }
1168
1169   if (mode == SImode)
1170     {
1171       /* We know we can't do this in 1 insn, and we must be able to do it
1172          in two; so don't mess around looking for sequences that don't buy
1173          us anything.  */
1174       if (generate)
1175         {
1176           emit_insn (gen_rtx_SET (VOIDmode, dest,
1177                                   GEN_INT (INTVAL (imm) & 0xffff)));
1178           emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1179                                      GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1180         }
1181       num_insns += 2;
1182       return num_insns;
1183     }
1184
1185   /* Remaining cases are all for DImode.  */
1186
1187   val = INTVAL (imm);
1188   subtargets = optimize && can_create_pseudo_p ();
1189
1190   one_match = 0;
1191   zero_match = 0;
1192   mask = 0xffff;
1193   first_not_ffff_match = -1;
1194
1195   for (i = 0; i < 64; i += 16, mask <<= 16)
1196     {
1197       if ((val & mask) == mask)
1198         one_match++;
1199       else
1200         {
1201           if (first_not_ffff_match < 0)
1202             first_not_ffff_match = i;
1203           if ((val & mask) == 0)
1204             zero_match++;
1205         }
1206     }
1207
1208   if (one_match == 2)
1209     {
1210       /* Set one of the quarters and then insert back into result.  */
1211       mask = 0xffffll << first_not_ffff_match;
1212       if (generate)
1213         {
1214           emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1215           emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1216                                      GEN_INT ((val >> first_not_ffff_match)
1217                                               & 0xffff)));
1218         }
1219       num_insns += 2;
1220       return num_insns;
1221     }
1222
1223   if (zero_match == 2)
1224     goto simple_sequence;
1225
1226   mask = 0x0ffff0000UL;
1227   for (i = 16; i < 64; i += 16, mask <<= 16)
1228     {
1229       HOST_WIDE_INT comp = mask & ~(mask - 1);
1230
1231       if (aarch64_uimm12_shift (val - (val & mask)))
1232         {
1233           if (generate)
1234             {
1235               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1236               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1237                                       GEN_INT (val & mask)));
1238               emit_insn (gen_adddi3 (dest, subtarget,
1239                                      GEN_INT (val - (val & mask))));
1240             }
1241           num_insns += 2;
1242           return num_insns;
1243         }
1244       else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1245         {
1246           if (generate)
1247             {
1248               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1249               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1250                                       GEN_INT ((val + comp) & mask)));
1251               emit_insn (gen_adddi3 (dest, subtarget,
1252                                      GEN_INT (val - ((val + comp) & mask))));
1253             }
1254           num_insns += 2;
1255           return num_insns;
1256         }
1257       else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1258         {
1259           if (generate)
1260             {
1261               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1262               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1263                                       GEN_INT ((val - comp) | ~mask)));
1264               emit_insn (gen_adddi3 (dest, subtarget,
1265                                      GEN_INT (val - ((val - comp) | ~mask))));
1266             }
1267           num_insns += 2;
1268           return num_insns;
1269         }
1270       else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1271         {
1272           if (generate)
1273             {
1274               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1275               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1276                                       GEN_INT (val | ~mask)));
1277               emit_insn (gen_adddi3 (dest, subtarget,
1278                                      GEN_INT (val - (val | ~mask))));
1279             }
1280           num_insns += 2;
1281           return num_insns;
1282         }
1283     }
1284
1285   /* See if we can do it by arithmetically combining two
1286      immediates.  */
1287   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1288     {
1289       int j;
1290       mask = 0xffff;
1291
1292       if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1293           || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1294         {
1295           if (generate)
1296             {
1297               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1298               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1299                                       GEN_INT (aarch64_bitmasks[i])));
1300               emit_insn (gen_adddi3 (dest, subtarget,
1301                                      GEN_INT (val - aarch64_bitmasks[i])));
1302             }
1303           num_insns += 2;
1304           return num_insns;
1305         }
1306
1307       for (j = 0; j < 64; j += 16, mask <<= 16)
1308         {
1309           if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1310             {
1311               if (generate)
1312                 {
1313                   emit_insn (gen_rtx_SET (VOIDmode, dest,
1314                                           GEN_INT (aarch64_bitmasks[i])));
1315                   emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1316                                              GEN_INT ((val >> j) & 0xffff)));
1317                 }
1318               num_insns += 2;
1319               return num_insns;
1320             }
1321         }
1322     }
1323
1324   /* See if we can do it by logically combining two immediates.  */
1325   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1326     {
1327       if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1328         {
1329           int j;
1330
1331           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1332             if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1333               {
1334                 if (generate)
1335                   {
1336                     subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1337                     emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1338                                             GEN_INT (aarch64_bitmasks[i])));
1339                     emit_insn (gen_iordi3 (dest, subtarget,
1340                                            GEN_INT (aarch64_bitmasks[j])));
1341                   }
1342                 num_insns += 2;
1343                 return num_insns;
1344               }
1345         }
1346       else if ((val & aarch64_bitmasks[i]) == val)
1347         {
1348           int j;
1349
1350           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1351             if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1352               {
1353                 if (generate)
1354                   {
1355                     subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1356                     emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1357                                             GEN_INT (aarch64_bitmasks[j])));
1358                     emit_insn (gen_anddi3 (dest, subtarget,
1359                                            GEN_INT (aarch64_bitmasks[i])));
1360                   }
1361                 num_insns += 2;
1362                 return num_insns;
1363               }
1364         }
1365     }
1366
1367   if (one_match > zero_match)
1368     {
1369       /* Set either first three quarters or all but the third.   */
1370       mask = 0xffffll << (16 - first_not_ffff_match);
1371       if (generate)
1372         emit_insn (gen_rtx_SET (VOIDmode, dest,
1373                                 GEN_INT (val | mask | 0xffffffff00000000ull)));
1374       num_insns ++;
1375
1376       /* Now insert other two quarters.  */
1377       for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1378            i < 64; i += 16, mask <<= 16)
1379         {
1380           if ((val & mask) != mask)
1381             {
1382               if (generate)
1383                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1384                                            GEN_INT ((val >> i) & 0xffff)));
1385               num_insns ++;
1386             }
1387         }
1388       return num_insns;
1389     }
1390
1391  simple_sequence:
1392   first = true;
1393   mask = 0xffff;
1394   for (i = 0; i < 64; i += 16, mask <<= 16)
1395     {
1396       if ((val & mask) != 0)
1397         {
1398           if (first)
1399             {
1400               if (generate)
1401                 emit_insn (gen_rtx_SET (VOIDmode, dest,
1402                                         GEN_INT (val & mask)));
1403               num_insns ++;
1404               first = false;
1405             }
1406           else
1407             {
1408               if (generate)
1409                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1410                                            GEN_INT ((val >> i) & 0xffff)));
1411               num_insns ++;
1412             }
1413         }
1414     }
1415
1416   return num_insns;
1417 }
1418
1419
1420 void
1421 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1422 {
1423   machine_mode mode = GET_MODE (dest);
1424
1425   gcc_assert (mode == SImode || mode == DImode);
1426
1427   /* Check on what type of symbol it is.  */
1428   if (GET_CODE (imm) == SYMBOL_REF
1429       || GET_CODE (imm) == LABEL_REF
1430       || GET_CODE (imm) == CONST)
1431     {
1432       rtx mem, base, offset;
1433       enum aarch64_symbol_type sty;
1434
1435       /* If we have (const (plus symbol offset)), separate out the offset
1436          before we start classifying the symbol.  */
1437       split_const (imm, &base, &offset);
1438
1439       sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1440       switch (sty)
1441         {
1442         case SYMBOL_FORCE_TO_MEM:
1443           if (offset != const0_rtx
1444               && targetm.cannot_force_const_mem (mode, imm))
1445             {
1446               gcc_assert (can_create_pseudo_p ());
1447               base = aarch64_force_temporary (mode, dest, base);
1448               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1449               aarch64_emit_move (dest, base);
1450               return;
1451             }
1452           mem = force_const_mem (ptr_mode, imm);
1453           gcc_assert (mem);
1454           if (mode != ptr_mode)
1455             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1456           emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1457           return;
1458
1459         case SYMBOL_SMALL_TLSGD:
1460         case SYMBOL_SMALL_TLSDESC:
1461         case SYMBOL_SMALL_GOTTPREL:
1462         case SYMBOL_SMALL_GOT:
1463         case SYMBOL_TINY_GOT:
1464           if (offset != const0_rtx)
1465             {
1466               gcc_assert(can_create_pseudo_p ());
1467               base = aarch64_force_temporary (mode, dest, base);
1468               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1469               aarch64_emit_move (dest, base);
1470               return;
1471             }
1472           /* FALLTHRU */
1473
1474         case SYMBOL_SMALL_TPREL:
1475         case SYMBOL_SMALL_ABSOLUTE:
1476         case SYMBOL_TINY_ABSOLUTE:
1477           aarch64_load_symref_appropriately (dest, imm, sty);
1478           return;
1479
1480         default:
1481           gcc_unreachable ();
1482         }
1483     }
1484
1485   if (!CONST_INT_P (imm))
1486     {
1487       if (GET_CODE (imm) == HIGH)
1488         emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1489       else
1490         {
1491           rtx mem = force_const_mem (mode, imm);
1492           gcc_assert (mem);
1493           emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1494         }
1495
1496       return;
1497     }
1498
1499   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1500 }
1501
1502 static bool
1503 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1504                                  tree exp ATTRIBUTE_UNUSED)
1505 {
1506   /* Currently, always true.  */
1507   return true;
1508 }
1509
1510 /* Implement TARGET_PASS_BY_REFERENCE.  */
1511
1512 static bool
1513 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1514                            machine_mode mode,
1515                            const_tree type,
1516                            bool named ATTRIBUTE_UNUSED)
1517 {
1518   HOST_WIDE_INT size;
1519   machine_mode dummymode;
1520   int nregs;
1521
1522   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
1523   size = (mode == BLKmode && type)
1524     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1525
1526   /* Aggregates are passed by reference based on their size.  */
1527   if (type && AGGREGATE_TYPE_P (type))
1528     {
1529       size = int_size_in_bytes (type);
1530     }
1531
1532   /* Variable sized arguments are always returned by reference.  */
1533   if (size < 0)
1534     return true;
1535
1536   /* Can this be a candidate to be passed in fp/simd register(s)?  */
1537   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1538                                                &dummymode, &nregs,
1539                                                NULL))
1540     return false;
1541
1542   /* Arguments which are variable sized or larger than 2 registers are
1543      passed by reference unless they are a homogenous floating point
1544      aggregate.  */
1545   return size > 2 * UNITS_PER_WORD;
1546 }
1547
1548 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
1549 static bool
1550 aarch64_return_in_msb (const_tree valtype)
1551 {
1552   machine_mode dummy_mode;
1553   int dummy_int;
1554
1555   /* Never happens in little-endian mode.  */
1556   if (!BYTES_BIG_ENDIAN)
1557     return false;
1558
1559   /* Only composite types smaller than or equal to 16 bytes can
1560      be potentially returned in registers.  */
1561   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1562       || int_size_in_bytes (valtype) <= 0
1563       || int_size_in_bytes (valtype) > 16)
1564     return false;
1565
1566   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1567      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1568      is always passed/returned in the least significant bits of fp/simd
1569      register(s).  */
1570   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1571                                                &dummy_mode, &dummy_int, NULL))
1572     return false;
1573
1574   return true;
1575 }
1576
1577 /* Implement TARGET_FUNCTION_VALUE.
1578    Define how to find the value returned by a function.  */
1579
1580 static rtx
1581 aarch64_function_value (const_tree type, const_tree func,
1582                         bool outgoing ATTRIBUTE_UNUSED)
1583 {
1584   machine_mode mode;
1585   int unsignedp;
1586   int count;
1587   machine_mode ag_mode;
1588
1589   mode = TYPE_MODE (type);
1590   if (INTEGRAL_TYPE_P (type))
1591     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1592
1593   if (aarch64_return_in_msb (type))
1594     {
1595       HOST_WIDE_INT size = int_size_in_bytes (type);
1596
1597       if (size % UNITS_PER_WORD != 0)
1598         {
1599           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1600           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1601         }
1602     }
1603
1604   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1605                                                &ag_mode, &count, NULL))
1606     {
1607       if (!aarch64_composite_type_p (type, mode))
1608         {
1609           gcc_assert (count == 1 && mode == ag_mode);
1610           return gen_rtx_REG (mode, V0_REGNUM);
1611         }
1612       else
1613         {
1614           int i;
1615           rtx par;
1616
1617           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1618           for (i = 0; i < count; i++)
1619             {
1620               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1621               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1622                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1623               XVECEXP (par, 0, i) = tmp;
1624             }
1625           return par;
1626         }
1627     }
1628   else
1629     return gen_rtx_REG (mode, R0_REGNUM);
1630 }
1631
1632 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1633    Return true if REGNO is the number of a hard register in which the values
1634    of called function may come back.  */
1635
1636 static bool
1637 aarch64_function_value_regno_p (const unsigned int regno)
1638 {
1639   /* Maximum of 16 bytes can be returned in the general registers.  Examples
1640      of 16-byte return values are: 128-bit integers and 16-byte small
1641      structures (excluding homogeneous floating-point aggregates).  */
1642   if (regno == R0_REGNUM || regno == R1_REGNUM)
1643     return true;
1644
1645   /* Up to four fp/simd registers can return a function value, e.g. a
1646      homogeneous floating-point aggregate having four members.  */
1647   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1648     return !TARGET_GENERAL_REGS_ONLY;
1649
1650   return false;
1651 }
1652
1653 /* Implement TARGET_RETURN_IN_MEMORY.
1654
1655    If the type T of the result of a function is such that
1656      void func (T arg)
1657    would require that arg be passed as a value in a register (or set of
1658    registers) according to the parameter passing rules, then the result
1659    is returned in the same registers as would be used for such an
1660    argument.  */
1661
1662 static bool
1663 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1664 {
1665   HOST_WIDE_INT size;
1666   machine_mode ag_mode;
1667   int count;
1668
1669   if (!AGGREGATE_TYPE_P (type)
1670       && TREE_CODE (type) != COMPLEX_TYPE
1671       && TREE_CODE (type) != VECTOR_TYPE)
1672     /* Simple scalar types always returned in registers.  */
1673     return false;
1674
1675   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1676                                                type,
1677                                                &ag_mode,
1678                                                &count,
1679                                                NULL))
1680     return false;
1681
1682   /* Types larger than 2 registers returned in memory.  */
1683   size = int_size_in_bytes (type);
1684   return (size < 0 || size > 2 * UNITS_PER_WORD);
1685 }
1686
1687 static bool
1688 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1689                                const_tree type, int *nregs)
1690 {
1691   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1692   return aarch64_vfp_is_call_or_return_candidate (mode,
1693                                                   type,
1694                                                   &pcum->aapcs_vfp_rmode,
1695                                                   nregs,
1696                                                   NULL);
1697 }
1698
1699 /* Given MODE and TYPE of a function argument, return the alignment in
1700    bits.  The idea is to suppress any stronger alignment requested by
1701    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1702    This is a helper function for local use only.  */
1703
1704 static unsigned int
1705 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1706 {
1707   unsigned int alignment;
1708
1709   if (type)
1710     {
1711       if (!integer_zerop (TYPE_SIZE (type)))
1712         {
1713           if (TYPE_MODE (type) == mode)
1714             alignment = TYPE_ALIGN (type);
1715           else
1716             alignment = GET_MODE_ALIGNMENT (mode);
1717         }
1718       else
1719         alignment = 0;
1720     }
1721   else
1722     alignment = GET_MODE_ALIGNMENT (mode);
1723
1724   return alignment;
1725 }
1726
1727 /* Layout a function argument according to the AAPCS64 rules.  The rule
1728    numbers refer to the rule numbers in the AAPCS64.  */
1729
1730 static void
1731 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1732                     const_tree type,
1733                     bool named ATTRIBUTE_UNUSED)
1734 {
1735   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1736   int ncrn, nvrn, nregs;
1737   bool allocate_ncrn, allocate_nvrn;
1738   HOST_WIDE_INT size;
1739
1740   /* We need to do this once per argument.  */
1741   if (pcum->aapcs_arg_processed)
1742     return;
1743
1744   pcum->aapcs_arg_processed = true;
1745
1746   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
1747   size
1748     = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1749                         UNITS_PER_WORD);
1750
1751   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1752   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1753                                                  mode,
1754                                                  type,
1755                                                  &nregs);
1756
1757   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1758      The following code thus handles passing by SIMD/FP registers first.  */
1759
1760   nvrn = pcum->aapcs_nvrn;
1761
1762   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1763      and homogenous short-vector aggregates (HVA).  */
1764   if (allocate_nvrn)
1765     {
1766       if (nvrn + nregs <= NUM_FP_ARG_REGS)
1767         {
1768           pcum->aapcs_nextnvrn = nvrn + nregs;
1769           if (!aarch64_composite_type_p (type, mode))
1770             {
1771               gcc_assert (nregs == 1);
1772               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1773             }
1774           else
1775             {
1776               rtx par;
1777               int i;
1778               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1779               for (i = 0; i < nregs; i++)
1780                 {
1781                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1782                                          V0_REGNUM + nvrn + i);
1783                   tmp = gen_rtx_EXPR_LIST
1784                     (VOIDmode, tmp,
1785                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1786                   XVECEXP (par, 0, i) = tmp;
1787                 }
1788               pcum->aapcs_reg = par;
1789             }
1790           return;
1791         }
1792       else
1793         {
1794           /* C.3 NSRN is set to 8.  */
1795           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1796           goto on_stack;
1797         }
1798     }
1799
1800   ncrn = pcum->aapcs_ncrn;
1801   nregs = size / UNITS_PER_WORD;
1802
1803   /* C6 - C9.  though the sign and zero extension semantics are
1804      handled elsewhere.  This is the case where the argument fits
1805      entirely general registers.  */
1806   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1807     {
1808       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1809
1810       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1811
1812       /* C.8 if the argument has an alignment of 16 then the NGRN is
1813          rounded up to the next even number.  */
1814       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1815         {
1816           ++ncrn;
1817           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1818         }
1819       /* NREGS can be 0 when e.g. an empty structure is to be passed.
1820          A reg is still generated for it, but the caller should be smart
1821          enough not to use it.  */
1822       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1823         {
1824           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1825         }
1826       else
1827         {
1828           rtx par;
1829           int i;
1830
1831           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1832           for (i = 0; i < nregs; i++)
1833             {
1834               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1835               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1836                                        GEN_INT (i * UNITS_PER_WORD));
1837               XVECEXP (par, 0, i) = tmp;
1838             }
1839           pcum->aapcs_reg = par;
1840         }
1841
1842       pcum->aapcs_nextncrn = ncrn + nregs;
1843       return;
1844     }
1845
1846   /* C.11  */
1847   pcum->aapcs_nextncrn = NUM_ARG_REGS;
1848
1849   /* The argument is passed on stack; record the needed number of words for
1850      this argument and align the total size if necessary.  */
1851 on_stack:
1852   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1853   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1854     pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1855                                                16 / UNITS_PER_WORD);
1856   return;
1857 }
1858
1859 /* Implement TARGET_FUNCTION_ARG.  */
1860
1861 static rtx
1862 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1863                       const_tree type, bool named)
1864 {
1865   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1866   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1867
1868   if (mode == VOIDmode)
1869     return NULL_RTX;
1870
1871   aarch64_layout_arg (pcum_v, mode, type, named);
1872   return pcum->aapcs_reg;
1873 }
1874
1875 void
1876 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1877                            const_tree fntype ATTRIBUTE_UNUSED,
1878                            rtx libname ATTRIBUTE_UNUSED,
1879                            const_tree fndecl ATTRIBUTE_UNUSED,
1880                            unsigned n_named ATTRIBUTE_UNUSED)
1881 {
1882   pcum->aapcs_ncrn = 0;
1883   pcum->aapcs_nvrn = 0;
1884   pcum->aapcs_nextncrn = 0;
1885   pcum->aapcs_nextnvrn = 0;
1886   pcum->pcs_variant = ARM_PCS_AAPCS64;
1887   pcum->aapcs_reg = NULL_RTX;
1888   pcum->aapcs_arg_processed = false;
1889   pcum->aapcs_stack_words = 0;
1890   pcum->aapcs_stack_size = 0;
1891
1892   return;
1893 }
1894
1895 static void
1896 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1897                               machine_mode mode,
1898                               const_tree type,
1899                               bool named)
1900 {
1901   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1902   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1903     {
1904       aarch64_layout_arg (pcum_v, mode, type, named);
1905       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1906                   != (pcum->aapcs_stack_words != 0));
1907       pcum->aapcs_arg_processed = false;
1908       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1909       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1910       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1911       pcum->aapcs_stack_words = 0;
1912       pcum->aapcs_reg = NULL_RTX;
1913     }
1914 }
1915
1916 bool
1917 aarch64_function_arg_regno_p (unsigned regno)
1918 {
1919   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1920           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1921 }
1922
1923 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
1924    PARM_BOUNDARY bits of alignment, but will be given anything up
1925    to STACK_BOUNDARY bits if the type requires it.  This makes sure
1926    that both before and after the layout of each argument, the Next
1927    Stacked Argument Address (NSAA) will have a minimum alignment of
1928    8 bytes.  */
1929
1930 static unsigned int
1931 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1932 {
1933   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1934
1935   if (alignment < PARM_BOUNDARY)
1936     alignment = PARM_BOUNDARY;
1937   if (alignment > STACK_BOUNDARY)
1938     alignment = STACK_BOUNDARY;
1939   return alignment;
1940 }
1941
1942 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1943
1944    Return true if an argument passed on the stack should be padded upwards,
1945    i.e. if the least-significant byte of the stack slot has useful data.
1946
1947    Small aggregate types are placed in the lowest memory address.
1948
1949    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
1950
1951 bool
1952 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1953 {
1954   /* On little-endian targets, the least significant byte of every stack
1955      argument is passed at the lowest byte address of the stack slot.  */
1956   if (!BYTES_BIG_ENDIAN)
1957     return true;
1958
1959   /* Otherwise, integral, floating-point and pointer types are padded downward:
1960      the least significant byte of a stack argument is passed at the highest
1961      byte address of the stack slot.  */
1962   if (type
1963       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1964          || POINTER_TYPE_P (type))
1965       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1966     return false;
1967
1968   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
1969   return true;
1970 }
1971
1972 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1973
1974    It specifies padding for the last (may also be the only)
1975    element of a block move between registers and memory.  If
1976    assuming the block is in the memory, padding upward means that
1977    the last element is padded after its highest significant byte,
1978    while in downward padding, the last element is padded at the
1979    its least significant byte side.
1980
1981    Small aggregates and small complex types are always padded
1982    upwards.
1983
1984    We don't need to worry about homogeneous floating-point or
1985    short-vector aggregates; their move is not affected by the
1986    padding direction determined here.  Regardless of endianness,
1987    each element of such an aggregate is put in the least
1988    significant bits of a fp/simd register.
1989
1990    Return !BYTES_BIG_ENDIAN if the least significant byte of the
1991    register has useful data, and return the opposite if the most
1992    significant byte does.  */
1993
1994 bool
1995 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1996                      bool first ATTRIBUTE_UNUSED)
1997 {
1998
1999   /* Small composite types are always padded upward.  */
2000   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2001     {
2002       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2003                             : GET_MODE_SIZE (mode));
2004       if (size < 2 * UNITS_PER_WORD)
2005         return true;
2006     }
2007
2008   /* Otherwise, use the default padding.  */
2009   return !BYTES_BIG_ENDIAN;
2010 }
2011
2012 static machine_mode
2013 aarch64_libgcc_cmp_return_mode (void)
2014 {
2015   return SImode;
2016 }
2017
2018 static bool
2019 aarch64_frame_pointer_required (void)
2020 {
2021   /* In aarch64_override_options_after_change
2022      flag_omit_leaf_frame_pointer turns off the frame pointer by
2023      default.  Turn it back on now if we've not got a leaf
2024      function.  */
2025   if (flag_omit_leaf_frame_pointer
2026       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2027     return true;
2028
2029   return false;
2030 }
2031
2032 /* Mark the registers that need to be saved by the callee and calculate
2033    the size of the callee-saved registers area and frame record (both FP
2034    and LR may be omitted).  */
2035 static void
2036 aarch64_layout_frame (void)
2037 {
2038   HOST_WIDE_INT offset = 0;
2039   int regno;
2040
2041   if (reload_completed && cfun->machine->frame.laid_out)
2042     return;
2043
2044 #define SLOT_NOT_REQUIRED (-2)
2045 #define SLOT_REQUIRED     (-1)
2046
2047   cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2048   cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2049
2050   /* First mark all the registers that really need to be saved...  */
2051   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2052     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2053
2054   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2055     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2056
2057   /* ... that includes the eh data registers (if needed)...  */
2058   if (crtl->calls_eh_return)
2059     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2060       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2061         = SLOT_REQUIRED;
2062
2063   /* ... and any callee saved register that dataflow says is live.  */
2064   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2065     if (df_regs_ever_live_p (regno)
2066         && (regno == R30_REGNUM
2067             || !call_used_regs[regno]))
2068       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2069
2070   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2071     if (df_regs_ever_live_p (regno)
2072         && !call_used_regs[regno])
2073       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2074
2075   if (frame_pointer_needed)
2076     {
2077       /* FP and LR are placed in the linkage record.  */
2078       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2079       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2080       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2081       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2082       cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2083       offset += 2 * UNITS_PER_WORD;
2084     }
2085
2086   /* Now assign stack slots for them.  */
2087   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2088     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2089       {
2090         cfun->machine->frame.reg_offset[regno] = offset;
2091         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2092           cfun->machine->frame.wb_candidate1 = regno;
2093         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2094           cfun->machine->frame.wb_candidate2 = regno;
2095         offset += UNITS_PER_WORD;
2096       }
2097
2098   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2099     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2100       {
2101         cfun->machine->frame.reg_offset[regno] = offset;
2102         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2103           cfun->machine->frame.wb_candidate1 = regno;
2104         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2105                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2106           cfun->machine->frame.wb_candidate2 = regno;
2107         offset += UNITS_PER_WORD;
2108       }
2109
2110   cfun->machine->frame.padding0 =
2111     (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2112   offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2113
2114   cfun->machine->frame.saved_regs_size = offset;
2115
2116   cfun->machine->frame.hard_fp_offset
2117     = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2118                         + get_frame_size ()
2119                         + cfun->machine->frame.saved_regs_size,
2120                         STACK_BOUNDARY / BITS_PER_UNIT);
2121
2122   cfun->machine->frame.frame_size
2123     = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2124                         + crtl->outgoing_args_size,
2125                         STACK_BOUNDARY / BITS_PER_UNIT);
2126
2127   cfun->machine->frame.laid_out = true;
2128 }
2129
2130 static bool
2131 aarch64_register_saved_on_entry (int regno)
2132 {
2133   return cfun->machine->frame.reg_offset[regno] >= 0;
2134 }
2135
2136 static unsigned
2137 aarch64_next_callee_save (unsigned regno, unsigned limit)
2138 {
2139   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2140     regno ++;
2141   return regno;
2142 }
2143
2144 static void
2145 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2146                            HOST_WIDE_INT adjustment)
2147  {
2148   rtx base_rtx = stack_pointer_rtx;
2149   rtx insn, reg, mem;
2150
2151   reg = gen_rtx_REG (mode, regno);
2152   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2153                             plus_constant (Pmode, base_rtx, -adjustment));
2154   mem = gen_rtx_MEM (mode, mem);
2155
2156   insn = emit_move_insn (mem, reg);
2157   RTX_FRAME_RELATED_P (insn) = 1;
2158 }
2159
2160 static rtx
2161 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2162                           HOST_WIDE_INT adjustment)
2163 {
2164   switch (mode)
2165     {
2166     case DImode:
2167       return gen_storewb_pairdi_di (base, base, reg, reg2,
2168                                     GEN_INT (-adjustment),
2169                                     GEN_INT (UNITS_PER_WORD - adjustment));
2170     case DFmode:
2171       return gen_storewb_pairdf_di (base, base, reg, reg2,
2172                                     GEN_INT (-adjustment),
2173                                     GEN_INT (UNITS_PER_WORD - adjustment));
2174     default:
2175       gcc_unreachable ();
2176     }
2177 }
2178
2179 static void
2180 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2181                          unsigned regno2, HOST_WIDE_INT adjustment)
2182 {
2183   rtx_insn *insn;
2184   rtx reg1 = gen_rtx_REG (mode, regno1);
2185   rtx reg2 = gen_rtx_REG (mode, regno2);
2186
2187   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2188                                               reg2, adjustment));
2189   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2190   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2191   RTX_FRAME_RELATED_P (insn) = 1;
2192 }
2193
2194 static rtx
2195 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2196                          HOST_WIDE_INT adjustment)
2197 {
2198   switch (mode)
2199     {
2200     case DImode:
2201       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2202                                    GEN_INT (UNITS_PER_WORD));
2203     case DFmode:
2204       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2205                                    GEN_INT (UNITS_PER_WORD));
2206     default:
2207       gcc_unreachable ();
2208     }
2209 }
2210
2211 static rtx
2212 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2213                         rtx reg2)
2214 {
2215   switch (mode)
2216     {
2217     case DImode:
2218       return gen_store_pairdi (mem1, reg1, mem2, reg2);
2219
2220     case DFmode:
2221       return gen_store_pairdf (mem1, reg1, mem2, reg2);
2222
2223     default:
2224       gcc_unreachable ();
2225     }
2226 }
2227
2228 static rtx
2229 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2230                        rtx mem2)
2231 {
2232   switch (mode)
2233     {
2234     case DImode:
2235       return gen_load_pairdi (reg1, mem1, reg2, mem2);
2236
2237     case DFmode:
2238       return gen_load_pairdf (reg1, mem1, reg2, mem2);
2239
2240     default:
2241       gcc_unreachable ();
2242     }
2243 }
2244
2245
2246 static void
2247 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2248                            unsigned start, unsigned limit, bool skip_wb)
2249 {
2250   rtx_insn *insn;
2251   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2252                                                  ? gen_frame_mem : gen_rtx_MEM);
2253   unsigned regno;
2254   unsigned regno2;
2255
2256   for (regno = aarch64_next_callee_save (start, limit);
2257        regno <= limit;
2258        regno = aarch64_next_callee_save (regno + 1, limit))
2259     {
2260       rtx reg, mem;
2261       HOST_WIDE_INT offset;
2262
2263       if (skip_wb
2264           && (regno == cfun->machine->frame.wb_candidate1
2265               || regno == cfun->machine->frame.wb_candidate2))
2266         continue;
2267
2268       reg = gen_rtx_REG (mode, regno);
2269       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2270       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2271                                               offset));
2272
2273       regno2 = aarch64_next_callee_save (regno + 1, limit);
2274
2275       if (regno2 <= limit
2276           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2277               == cfun->machine->frame.reg_offset[regno2]))
2278
2279         {
2280           rtx reg2 = gen_rtx_REG (mode, regno2);
2281           rtx mem2;
2282
2283           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2284           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2285                                                    offset));
2286           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2287                                                     reg2));
2288
2289           /* The first part of a frame-related parallel insn is
2290              always assumed to be relevant to the frame
2291              calculations; subsequent parts, are only
2292              frame-related if explicitly marked.  */
2293           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2294           regno = regno2;
2295         }
2296       else
2297         insn = emit_move_insn (mem, reg);
2298
2299       RTX_FRAME_RELATED_P (insn) = 1;
2300     }
2301 }
2302
2303 static void
2304 aarch64_restore_callee_saves (machine_mode mode,
2305                               HOST_WIDE_INT start_offset, unsigned start,
2306                               unsigned limit, bool skip_wb, rtx *cfi_ops)
2307 {
2308   rtx base_rtx = stack_pointer_rtx;
2309   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2310                                                  ? gen_frame_mem : gen_rtx_MEM);
2311   unsigned regno;
2312   unsigned regno2;
2313   HOST_WIDE_INT offset;
2314
2315   for (regno = aarch64_next_callee_save (start, limit);
2316        regno <= limit;
2317        regno = aarch64_next_callee_save (regno + 1, limit))
2318     {
2319       rtx reg, mem;
2320
2321       if (skip_wb
2322           && (regno == cfun->machine->frame.wb_candidate1
2323               || regno == cfun->machine->frame.wb_candidate2))
2324         continue;
2325
2326       reg = gen_rtx_REG (mode, regno);
2327       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2328       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2329
2330       regno2 = aarch64_next_callee_save (regno + 1, limit);
2331
2332       if (regno2 <= limit
2333           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2334               == cfun->machine->frame.reg_offset[regno2]))
2335         {
2336           rtx reg2 = gen_rtx_REG (mode, regno2);
2337           rtx mem2;
2338
2339           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2340           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2341           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2342
2343           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2344           regno = regno2;
2345         }
2346       else
2347         emit_move_insn (reg, mem);
2348       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2349     }
2350 }
2351
2352 /* AArch64 stack frames generated by this compiler look like:
2353
2354         +-------------------------------+
2355         |                               |
2356         |  incoming stack arguments     |
2357         |                               |
2358         +-------------------------------+
2359         |                               | <-- incoming stack pointer (aligned)
2360         |  callee-allocated save area   |
2361         |  for register varargs         |
2362         |                               |
2363         +-------------------------------+
2364         |  local variables              | <-- frame_pointer_rtx
2365         |                               |
2366         +-------------------------------+
2367         |  padding0                     | \
2368         +-------------------------------+  |
2369         |  callee-saved registers       |  | frame.saved_regs_size
2370         +-------------------------------+  |
2371         |  LR'                          |  |
2372         +-------------------------------+  |
2373         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2374         +-------------------------------+
2375         |  dynamic allocation           |
2376         +-------------------------------+
2377         |  padding                      |
2378         +-------------------------------+
2379         |  outgoing stack arguments     | <-- arg_pointer
2380         |                               |
2381         +-------------------------------+
2382         |                               | <-- stack_pointer_rtx (aligned)
2383
2384    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2385    but leave frame_pointer_rtx and hard_frame_pointer_rtx
2386    unchanged.  */
2387
2388 /* Generate the prologue instructions for entry into a function.
2389    Establish the stack frame by decreasing the stack pointer with a
2390    properly calculated size and, if necessary, create a frame record
2391    filled with the values of LR and previous frame pointer.  The
2392    current FP is also set up if it is in use.  */
2393
2394 void
2395 aarch64_expand_prologue (void)
2396 {
2397   /* sub sp, sp, #<frame_size>
2398      stp {fp, lr}, [sp, #<frame_size> - 16]
2399      add fp, sp, #<frame_size> - hardfp_offset
2400      stp {cs_reg}, [fp, #-16] etc.
2401
2402      sub sp, sp, <final_adjustment_if_any>
2403   */
2404   HOST_WIDE_INT frame_size, offset;
2405   HOST_WIDE_INT fp_offset;              /* Offset from hard FP to SP.  */
2406   HOST_WIDE_INT hard_fp_offset;
2407   rtx_insn *insn;
2408
2409   aarch64_layout_frame ();
2410
2411   offset = frame_size = cfun->machine->frame.frame_size;
2412   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2413   fp_offset = frame_size - hard_fp_offset;
2414
2415   if (flag_stack_usage_info)
2416     current_function_static_stack_size = frame_size;
2417
2418   /* Store pairs and load pairs have a range only -512 to 504.  */
2419   if (offset >= 512)
2420     {
2421       /* When the frame has a large size, an initial decrease is done on
2422          the stack pointer to jump over the callee-allocated save area for
2423          register varargs, the local variable area and/or the callee-saved
2424          register area.  This will allow the pre-index write-back
2425          store pair instructions to be used for setting up the stack frame
2426          efficiently.  */
2427       offset = hard_fp_offset;
2428       if (offset >= 512)
2429         offset = cfun->machine->frame.saved_regs_size;
2430
2431       frame_size -= (offset + crtl->outgoing_args_size);
2432       fp_offset = 0;
2433
2434       if (frame_size >= 0x1000000)
2435         {
2436           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2437           emit_move_insn (op0, GEN_INT (-frame_size));
2438           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2439
2440           add_reg_note (insn, REG_CFA_ADJUST_CFA,
2441                         gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2442                                      plus_constant (Pmode, stack_pointer_rtx,
2443                                                     -frame_size)));
2444           RTX_FRAME_RELATED_P (insn) = 1;
2445         }
2446       else if (frame_size > 0)
2447         {
2448           int hi_ofs = frame_size & 0xfff000;
2449           int lo_ofs = frame_size & 0x000fff;
2450
2451           if (hi_ofs)
2452             {
2453               insn = emit_insn (gen_add2_insn
2454                                 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2455               RTX_FRAME_RELATED_P (insn) = 1;
2456             }
2457           if (lo_ofs)
2458             {
2459               insn = emit_insn (gen_add2_insn
2460                                 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2461               RTX_FRAME_RELATED_P (insn) = 1;
2462             }
2463         }
2464     }
2465   else
2466     frame_size = -1;
2467
2468   if (offset > 0)
2469     {
2470       bool skip_wb = false;
2471
2472       if (frame_pointer_needed)
2473         {
2474           skip_wb = true;
2475
2476           if (fp_offset)
2477             {
2478               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2479                                                GEN_INT (-offset)));
2480               RTX_FRAME_RELATED_P (insn) = 1;
2481
2482               aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2483                                          R30_REGNUM, false);
2484             }
2485           else
2486             aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2487
2488           /* Set up frame pointer to point to the location of the
2489              previous frame pointer on the stack.  */
2490           insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2491                                            stack_pointer_rtx,
2492                                            GEN_INT (fp_offset)));
2493           RTX_FRAME_RELATED_P (insn) = 1;
2494           emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2495         }
2496       else
2497         {
2498           unsigned reg1 = cfun->machine->frame.wb_candidate1;
2499           unsigned reg2 = cfun->machine->frame.wb_candidate2;
2500
2501           if (fp_offset
2502               || reg1 == FIRST_PSEUDO_REGISTER
2503               || (reg2 == FIRST_PSEUDO_REGISTER
2504                   && offset >= 256))
2505             {
2506               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2507                                                GEN_INT (-offset)));
2508               RTX_FRAME_RELATED_P (insn) = 1;
2509             }
2510           else
2511             {
2512               machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2513
2514               skip_wb = true;
2515
2516               if (reg2 == FIRST_PSEUDO_REGISTER)
2517                 aarch64_pushwb_single_reg (mode1, reg1, offset);
2518               else
2519                 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2520             }
2521         }
2522
2523       aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2524                                  skip_wb);
2525       aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2526                                  skip_wb);
2527     }
2528
2529   /* when offset >= 512,
2530      sub sp, sp, #<outgoing_args_size> */
2531   if (frame_size > -1)
2532     {
2533       if (crtl->outgoing_args_size > 0)
2534         {
2535           insn = emit_insn (gen_add2_insn
2536                             (stack_pointer_rtx,
2537                              GEN_INT (- crtl->outgoing_args_size)));
2538           RTX_FRAME_RELATED_P (insn) = 1;
2539         }
2540     }
2541 }
2542
2543 /* Return TRUE if we can use a simple_return insn.
2544
2545    This function checks whether the callee saved stack is empty, which
2546    means no restore actions are need. The pro_and_epilogue will use
2547    this to check whether shrink-wrapping opt is feasible.  */
2548
2549 bool
2550 aarch64_use_return_insn_p (void)
2551 {
2552   if (!reload_completed)
2553     return false;
2554
2555   if (crtl->profile)
2556     return false;
2557
2558   aarch64_layout_frame ();
2559
2560   return cfun->machine->frame.frame_size == 0;
2561 }
2562
2563 /* Generate the epilogue instructions for returning from a function.  */
2564 void
2565 aarch64_expand_epilogue (bool for_sibcall)
2566 {
2567   HOST_WIDE_INT frame_size, offset;
2568   HOST_WIDE_INT fp_offset;
2569   HOST_WIDE_INT hard_fp_offset;
2570   rtx_insn *insn;
2571   /* We need to add memory barrier to prevent read from deallocated stack.  */
2572   bool need_barrier_p = (get_frame_size () != 0
2573                          || cfun->machine->frame.saved_varargs_size);
2574
2575   aarch64_layout_frame ();
2576
2577   offset = frame_size = cfun->machine->frame.frame_size;
2578   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2579   fp_offset = frame_size - hard_fp_offset;
2580
2581   /* Store pairs and load pairs have a range only -512 to 504.  */
2582   if (offset >= 512)
2583     {
2584       offset = hard_fp_offset;
2585       if (offset >= 512)
2586         offset = cfun->machine->frame.saved_regs_size;
2587
2588       frame_size -= (offset + crtl->outgoing_args_size);
2589       fp_offset = 0;
2590       if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2591         {
2592           insn = emit_insn (gen_add2_insn
2593                             (stack_pointer_rtx,
2594                              GEN_INT (crtl->outgoing_args_size)));
2595           RTX_FRAME_RELATED_P (insn) = 1;
2596         }
2597     }
2598   else
2599     frame_size = -1;
2600
2601   /* If there were outgoing arguments or we've done dynamic stack
2602      allocation, then restore the stack pointer from the frame
2603      pointer.  This is at most one insn and more efficient than using
2604      GCC's internal mechanism.  */
2605   if (frame_pointer_needed
2606       && (crtl->outgoing_args_size || cfun->calls_alloca))
2607     {
2608       if (cfun->calls_alloca)
2609         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2610
2611       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2612                                        hard_frame_pointer_rtx,
2613                                        GEN_INT (0)));
2614       offset = offset - fp_offset;
2615     }
2616
2617   if (offset > 0)
2618     {
2619       unsigned reg1 = cfun->machine->frame.wb_candidate1;
2620       unsigned reg2 = cfun->machine->frame.wb_candidate2;
2621       bool skip_wb = true;
2622       rtx cfi_ops = NULL;
2623
2624       if (frame_pointer_needed)
2625         fp_offset = 0;
2626       else if (fp_offset
2627                || reg1 == FIRST_PSEUDO_REGISTER
2628                || (reg2 == FIRST_PSEUDO_REGISTER
2629                    && offset >= 256))
2630         skip_wb = false;
2631
2632       aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2633                                     skip_wb, &cfi_ops);
2634       aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2635                                     skip_wb, &cfi_ops);
2636
2637       if (need_barrier_p)
2638         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2639
2640       if (skip_wb)
2641         {
2642           machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2643           rtx rreg1 = gen_rtx_REG (mode1, reg1);
2644
2645           cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2646           if (reg2 == FIRST_PSEUDO_REGISTER)
2647             {
2648               rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2649               mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2650               mem = gen_rtx_MEM (mode1, mem);
2651               insn = emit_move_insn (rreg1, mem);
2652             }
2653           else
2654             {
2655               rtx rreg2 = gen_rtx_REG (mode1, reg2);
2656
2657               cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2658               insn = emit_insn (aarch64_gen_loadwb_pair
2659                                 (mode1, stack_pointer_rtx, rreg1,
2660                                  rreg2, offset));
2661             }
2662         }
2663       else
2664         {
2665           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2666                                            GEN_INT (offset)));
2667         }
2668
2669       /* Reset the CFA to be SP + FRAME_SIZE.  */
2670       rtx new_cfa = stack_pointer_rtx;
2671       if (frame_size > 0)
2672         new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2673       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2674       REG_NOTES (insn) = cfi_ops;
2675       RTX_FRAME_RELATED_P (insn) = 1;
2676     }
2677
2678   if (frame_size > 0)
2679     {
2680       if (need_barrier_p)
2681         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2682
2683       if (frame_size >= 0x1000000)
2684         {
2685           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2686           emit_move_insn (op0, GEN_INT (frame_size));
2687           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2688         }
2689       else
2690         {
2691           int hi_ofs = frame_size & 0xfff000;
2692           int lo_ofs = frame_size & 0x000fff;
2693
2694           if (hi_ofs && lo_ofs)
2695             {
2696               insn = emit_insn (gen_add2_insn
2697                                 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2698               RTX_FRAME_RELATED_P (insn) = 1;
2699               frame_size = lo_ofs;
2700             }
2701           insn = emit_insn (gen_add2_insn
2702                             (stack_pointer_rtx, GEN_INT (frame_size)));
2703         }
2704
2705       /* Reset the CFA to be SP + 0.  */
2706       add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2707       RTX_FRAME_RELATED_P (insn) = 1;
2708     }
2709
2710   /* Stack adjustment for exception handler.  */
2711   if (crtl->calls_eh_return)
2712     {
2713       /* We need to unwind the stack by the offset computed by
2714          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
2715          to be SP; letting the CFA move during this adjustment
2716          is just as correct as retaining the CFA from the body
2717          of the function.  Therefore, do nothing special.  */
2718       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2719     }
2720
2721   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2722   if (!for_sibcall)
2723     emit_jump_insn (ret_rtx);
2724 }
2725
2726 /* Return the place to copy the exception unwinding return address to.
2727    This will probably be a stack slot, but could (in theory be the
2728    return register).  */
2729 rtx
2730 aarch64_final_eh_return_addr (void)
2731 {
2732   HOST_WIDE_INT fp_offset;
2733
2734   aarch64_layout_frame ();
2735
2736   fp_offset = cfun->machine->frame.frame_size
2737               - cfun->machine->frame.hard_fp_offset;
2738
2739   if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2740     return gen_rtx_REG (DImode, LR_REGNUM);
2741
2742   /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
2743      result in a store to save LR introduced by builtin_eh_return () being
2744      incorrectly deleted because the alias is not detected.
2745      So in the calculation of the address to copy the exception unwinding
2746      return address to, we note 2 cases.
2747      If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2748      we return a SP-relative location since all the addresses are SP-relative
2749      in this case.  This prevents the store from being optimized away.
2750      If the fp_offset is not 0, then the addresses will be FP-relative and
2751      therefore we return a FP-relative location.  */
2752
2753   if (frame_pointer_needed)
2754     {
2755       if (fp_offset)
2756         return gen_frame_mem (DImode,
2757                               plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2758       else
2759         return gen_frame_mem (DImode,
2760                               plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2761     }
2762
2763   /* If FP is not needed, we calculate the location of LR, which would be
2764      at the top of the saved registers block.  */
2765
2766   return gen_frame_mem (DImode,
2767                         plus_constant (Pmode,
2768                                        stack_pointer_rtx,
2769                                        fp_offset
2770                                        + cfun->machine->frame.saved_regs_size
2771                                        - 2 * UNITS_PER_WORD));
2772 }
2773
2774 /* Possibly output code to build up a constant in a register.  For
2775    the benefit of the costs infrastructure, returns the number of
2776    instructions which would be emitted.  GENERATE inhibits or
2777    enables code generation.  */
2778
2779 static int
2780 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2781 {
2782   int insns = 0;
2783
2784   if (aarch64_bitmask_imm (val, DImode))
2785     {
2786       if (generate)
2787         emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2788       insns = 1;
2789     }
2790   else
2791     {
2792       int i;
2793       int ncount = 0;
2794       int zcount = 0;
2795       HOST_WIDE_INT valp = val >> 16;
2796       HOST_WIDE_INT valm;
2797       HOST_WIDE_INT tval;
2798
2799       for (i = 16; i < 64; i += 16)
2800         {
2801           valm = (valp & 0xffff);
2802
2803           if (valm != 0)
2804             ++ zcount;
2805
2806           if (valm != 0xffff)
2807             ++ ncount;
2808
2809           valp >>= 16;
2810         }
2811
2812       /* zcount contains the number of additional MOVK instructions
2813          required if the constant is built up with an initial MOVZ instruction,
2814          while ncount is the number of MOVK instructions required if starting
2815          with a MOVN instruction.  Choose the sequence that yields the fewest
2816          number of instructions, preferring MOVZ instructions when they are both
2817          the same.  */
2818       if (ncount < zcount)
2819         {
2820           if (generate)
2821             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2822                             GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2823           tval = 0xffff;
2824           insns++;
2825         }
2826       else
2827         {
2828           if (generate)
2829             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2830                             GEN_INT (val & 0xffff));
2831           tval = 0;
2832           insns++;
2833         }
2834
2835       val >>= 16;
2836
2837       for (i = 16; i < 64; i += 16)
2838         {
2839           if ((val & 0xffff) != tval)
2840             {
2841               if (generate)
2842                 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2843                                            GEN_INT (i),
2844                                            GEN_INT (val & 0xffff)));
2845               insns++;
2846             }
2847           val >>= 16;
2848         }
2849     }
2850   return insns;
2851 }
2852
2853 static void
2854 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2855 {
2856   HOST_WIDE_INT mdelta = delta;
2857   rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2858   rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2859
2860   if (mdelta < 0)
2861     mdelta = -mdelta;
2862
2863   if (mdelta >= 4096 * 4096)
2864     {
2865       (void) aarch64_build_constant (scratchreg, delta, true);
2866       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2867     }
2868   else if (mdelta > 0)
2869     {
2870       if (mdelta >= 4096)
2871         {
2872           emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2873           rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2874           if (delta < 0)
2875             emit_insn (gen_rtx_SET (Pmode, this_rtx,
2876                                     gen_rtx_MINUS (Pmode, this_rtx, shift)));
2877           else
2878             emit_insn (gen_rtx_SET (Pmode, this_rtx,
2879                                     gen_rtx_PLUS (Pmode, this_rtx, shift)));
2880         }
2881       if (mdelta % 4096 != 0)
2882         {
2883           scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2884           emit_insn (gen_rtx_SET (Pmode, this_rtx,
2885                                   gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2886         }
2887     }
2888 }
2889
2890 /* Output code to add DELTA to the first argument, and then jump
2891    to FUNCTION.  Used for C++ multiple inheritance.  */
2892 static void
2893 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2894                          HOST_WIDE_INT delta,
2895                          HOST_WIDE_INT vcall_offset,
2896                          tree function)
2897 {
2898   /* The this pointer is always in x0.  Note that this differs from
2899      Arm where the this pointer maybe bumped to r1 if r0 is required
2900      to return a pointer to an aggregate.  On AArch64 a result value
2901      pointer will be in x8.  */
2902   int this_regno = R0_REGNUM;
2903   rtx this_rtx, temp0, temp1, addr, funexp;
2904   rtx_insn *insn;
2905
2906   reload_completed = 1;
2907   emit_note (NOTE_INSN_PROLOGUE_END);
2908
2909   if (vcall_offset == 0)
2910     aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2911   else
2912     {
2913       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2914
2915       this_rtx = gen_rtx_REG (Pmode, this_regno);
2916       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2917       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2918
2919       addr = this_rtx;
2920       if (delta != 0)
2921         {
2922           if (delta >= -256 && delta < 256)
2923             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2924                                        plus_constant (Pmode, this_rtx, delta));
2925           else
2926             aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2927         }
2928
2929       if (Pmode == ptr_mode)
2930         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2931       else
2932         aarch64_emit_move (temp0,
2933                            gen_rtx_ZERO_EXTEND (Pmode,
2934                                                 gen_rtx_MEM (ptr_mode, addr)));
2935
2936       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2937           addr = plus_constant (Pmode, temp0, vcall_offset);
2938       else
2939         {
2940           (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2941           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2942         }
2943
2944       if (Pmode == ptr_mode)
2945         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2946       else
2947         aarch64_emit_move (temp1,
2948                            gen_rtx_SIGN_EXTEND (Pmode,
2949                                                 gen_rtx_MEM (ptr_mode, addr)));
2950
2951       emit_insn (gen_add2_insn (this_rtx, temp1));
2952     }
2953
2954   /* Generate a tail call to the target function.  */
2955   if (!TREE_USED (function))
2956     {
2957       assemble_external (function);
2958       TREE_USED (function) = 1;
2959     }
2960   funexp = XEXP (DECL_RTL (function), 0);
2961   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2962   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2963   SIBLING_CALL_P (insn) = 1;
2964
2965   insn = get_insns ();
2966   shorten_branches (insn);
2967   final_start_function (insn, file, 1);
2968   final (insn, file, 1);
2969   final_end_function ();
2970
2971   /* Stop pretending to be a post-reload pass.  */
2972   reload_completed = 0;
2973 }
2974
2975 static bool
2976 aarch64_tls_referenced_p (rtx x)
2977 {
2978   if (!TARGET_HAVE_TLS)
2979     return false;
2980   subrtx_iterator::array_type array;
2981   FOR_EACH_SUBRTX (iter, array, x, ALL)
2982     {
2983       const_rtx x = *iter;
2984       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2985         return true;
2986       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2987          TLS offsets, not real symbol references.  */
2988       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2989         iter.skip_subrtxes ();
2990     }
2991   return false;
2992 }
2993
2994
2995 static int
2996 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2997 {
2998   const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2999   const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3000
3001   if (*imm1 < *imm2)
3002     return -1;
3003   if (*imm1 > *imm2)
3004     return +1;
3005   return 0;
3006 }
3007
3008
3009 static void
3010 aarch64_build_bitmask_table (void)
3011 {
3012   unsigned HOST_WIDE_INT mask, imm;
3013   unsigned int log_e, e, s, r;
3014   unsigned int nimms = 0;
3015
3016   for (log_e = 1; log_e <= 6; log_e++)
3017     {
3018       e = 1 << log_e;
3019       if (e == 64)
3020         mask = ~(HOST_WIDE_INT) 0;
3021       else
3022         mask = ((HOST_WIDE_INT) 1 << e) - 1;
3023       for (s = 1; s < e; s++)
3024         {
3025           for (r = 0; r < e; r++)
3026             {
3027               /* set s consecutive bits to 1 (s < 64) */
3028               imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3029               /* rotate right by r */
3030               if (r != 0)
3031                 imm = ((imm >> r) | (imm << (e - r))) & mask;
3032               /* replicate the constant depending on SIMD size */
3033               switch (log_e) {
3034               case 1: imm |= (imm <<  2);
3035               case 2: imm |= (imm <<  4);
3036               case 3: imm |= (imm <<  8);
3037               case 4: imm |= (imm << 16);
3038               case 5: imm |= (imm << 32);
3039               case 6:
3040                 break;
3041               default:
3042                 gcc_unreachable ();
3043               }
3044               gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3045               aarch64_bitmasks[nimms++] = imm;
3046             }
3047         }
3048     }
3049
3050   gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3051   qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3052          aarch64_bitmasks_cmp);
3053 }
3054
3055
3056 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3057    a left shift of 0 or 12 bits.  */
3058 bool
3059 aarch64_uimm12_shift (HOST_WIDE_INT val)
3060 {
3061   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3062           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3063           );
3064 }
3065
3066
3067 /* Return true if val is an immediate that can be loaded into a
3068    register by a MOVZ instruction.  */
3069 static bool
3070 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3071 {
3072   if (GET_MODE_SIZE (mode) > 4)
3073     {
3074       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3075           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3076         return 1;
3077     }
3078   else
3079     {
3080       /* Ignore sign extension.  */
3081       val &= (HOST_WIDE_INT) 0xffffffff;
3082     }
3083   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3084           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3085 }
3086
3087
3088 /* Return true if val is a valid bitmask immediate.  */
3089 bool
3090 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3091 {
3092   if (GET_MODE_SIZE (mode) < 8)
3093     {
3094       /* Replicate bit pattern.  */
3095       val &= (HOST_WIDE_INT) 0xffffffff;
3096       val |= val << 32;
3097     }
3098   return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3099                   sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3100 }
3101
3102
3103 /* Return true if val is an immediate that can be loaded into a
3104    register in a single instruction.  */
3105 bool
3106 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3107 {
3108   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3109     return 1;
3110   return aarch64_bitmask_imm (val, mode);
3111 }
3112
3113 static bool
3114 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3115 {
3116   rtx base, offset;
3117
3118   if (GET_CODE (x) == HIGH)
3119     return true;
3120
3121   split_const (x, &base, &offset);
3122   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3123     {
3124       if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3125           != SYMBOL_FORCE_TO_MEM)
3126         return true;
3127       else
3128         /* Avoid generating a 64-bit relocation in ILP32; leave
3129            to aarch64_expand_mov_immediate to handle it properly.  */
3130         return mode != ptr_mode;
3131     }
3132
3133   return aarch64_tls_referenced_p (x);
3134 }
3135
3136 /* Return true if register REGNO is a valid index register.
3137    STRICT_P is true if REG_OK_STRICT is in effect.  */
3138
3139 bool
3140 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3141 {
3142   if (!HARD_REGISTER_NUM_P (regno))
3143     {
3144       if (!strict_p)
3145         return true;
3146
3147       if (!reg_renumber)
3148         return false;
3149
3150       regno = reg_renumber[regno];
3151     }
3152   return GP_REGNUM_P (regno);
3153 }
3154
3155 /* Return true if register REGNO is a valid base register for mode MODE.
3156    STRICT_P is true if REG_OK_STRICT is in effect.  */
3157
3158 bool
3159 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3160 {
3161   if (!HARD_REGISTER_NUM_P (regno))
3162     {
3163       if (!strict_p)
3164         return true;
3165
3166       if (!reg_renumber)
3167         return false;
3168
3169       regno = reg_renumber[regno];
3170     }
3171
3172   /* The fake registers will be eliminated to either the stack or
3173      hard frame pointer, both of which are usually valid base registers.
3174      Reload deals with the cases where the eliminated form isn't valid.  */
3175   return (GP_REGNUM_P (regno)
3176           || regno == SP_REGNUM
3177           || regno == FRAME_POINTER_REGNUM
3178           || regno == ARG_POINTER_REGNUM);
3179 }
3180
3181 /* Return true if X is a valid base register for mode MODE.
3182    STRICT_P is true if REG_OK_STRICT is in effect.  */
3183
3184 static bool
3185 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3186 {
3187   if (!strict_p && GET_CODE (x) == SUBREG)
3188     x = SUBREG_REG (x);
3189
3190   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3191 }
3192
3193 /* Return true if address offset is a valid index.  If it is, fill in INFO
3194    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
3195
3196 static bool
3197 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3198                         machine_mode mode, bool strict_p)
3199 {
3200   enum aarch64_address_type type;
3201   rtx index;
3202   int shift;
3203
3204   /* (reg:P) */
3205   if ((REG_P (x) || GET_CODE (x) == SUBREG)
3206       && GET_MODE (x) == Pmode)
3207     {
3208       type = ADDRESS_REG_REG;
3209       index = x;
3210       shift = 0;
3211     }
3212   /* (sign_extend:DI (reg:SI)) */
3213   else if ((GET_CODE (x) == SIGN_EXTEND
3214             || GET_CODE (x) == ZERO_EXTEND)
3215            && GET_MODE (x) == DImode
3216            && GET_MODE (XEXP (x, 0)) == SImode)
3217     {
3218       type = (GET_CODE (x) == SIGN_EXTEND)
3219         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3220       index = XEXP (x, 0);
3221       shift = 0;
3222     }
3223   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3224   else if (GET_CODE (x) == MULT
3225            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3226                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3227            && GET_MODE (XEXP (x, 0)) == DImode
3228            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3229            && CONST_INT_P (XEXP (x, 1)))
3230     {
3231       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3232         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3233       index = XEXP (XEXP (x, 0), 0);
3234       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3235     }
3236   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3237   else if (GET_CODE (x) == ASHIFT
3238            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3239                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3240            && GET_MODE (XEXP (x, 0)) == DImode
3241            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3242            && CONST_INT_P (XEXP (x, 1)))
3243     {
3244       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3245         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3246       index = XEXP (XEXP (x, 0), 0);
3247       shift = INTVAL (XEXP (x, 1));
3248     }
3249   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3250   else if ((GET_CODE (x) == SIGN_EXTRACT
3251             || GET_CODE (x) == ZERO_EXTRACT)
3252            && GET_MODE (x) == DImode
3253            && GET_CODE (XEXP (x, 0)) == MULT
3254            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3255            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3256     {
3257       type = (GET_CODE (x) == SIGN_EXTRACT)
3258         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3259       index = XEXP (XEXP (x, 0), 0);
3260       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3261       if (INTVAL (XEXP (x, 1)) != 32 + shift
3262           || INTVAL (XEXP (x, 2)) != 0)
3263         shift = -1;
3264     }
3265   /* (and:DI (mult:DI (reg:DI) (const_int scale))
3266      (const_int 0xffffffff<<shift)) */
3267   else if (GET_CODE (x) == AND
3268            && GET_MODE (x) == DImode
3269            && GET_CODE (XEXP (x, 0)) == MULT
3270            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3271            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3272            && CONST_INT_P (XEXP (x, 1)))
3273     {
3274       type = ADDRESS_REG_UXTW;
3275       index = XEXP (XEXP (x, 0), 0);
3276       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3277       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3278         shift = -1;
3279     }
3280   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3281   else if ((GET_CODE (x) == SIGN_EXTRACT
3282             || GET_CODE (x) == ZERO_EXTRACT)
3283            && GET_MODE (x) == DImode
3284            && GET_CODE (XEXP (x, 0)) == ASHIFT
3285            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3286            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3287     {
3288       type = (GET_CODE (x) == SIGN_EXTRACT)
3289         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3290       index = XEXP (XEXP (x, 0), 0);
3291       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3292       if (INTVAL (XEXP (x, 1)) != 32 + shift
3293           || INTVAL (XEXP (x, 2)) != 0)
3294         shift = -1;
3295     }
3296   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3297      (const_int 0xffffffff<<shift)) */
3298   else if (GET_CODE (x) == AND
3299            && GET_MODE (x) == DImode
3300            && GET_CODE (XEXP (x, 0)) == ASHIFT
3301            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3302            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3303            && CONST_INT_P (XEXP (x, 1)))
3304     {
3305       type = ADDRESS_REG_UXTW;
3306       index = XEXP (XEXP (x, 0), 0);
3307       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3308       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3309         shift = -1;
3310     }
3311   /* (mult:P (reg:P) (const_int scale)) */
3312   else if (GET_CODE (x) == MULT
3313            && GET_MODE (x) == Pmode
3314            && GET_MODE (XEXP (x, 0)) == Pmode
3315            && CONST_INT_P (XEXP (x, 1)))
3316     {
3317       type = ADDRESS_REG_REG;
3318       index = XEXP (x, 0);
3319       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3320     }
3321   /* (ashift:P (reg:P) (const_int shift)) */
3322   else if (GET_CODE (x) == ASHIFT
3323            && GET_MODE (x) == Pmode
3324            && GET_MODE (XEXP (x, 0)) == Pmode
3325            && CONST_INT_P (XEXP (x, 1)))
3326     {
3327       type = ADDRESS_REG_REG;
3328       index = XEXP (x, 0);
3329       shift = INTVAL (XEXP (x, 1));
3330     }
3331   else
3332     return false;
3333
3334   if (GET_CODE (index) == SUBREG)
3335     index = SUBREG_REG (index);
3336
3337   if ((shift == 0 ||
3338        (shift > 0 && shift <= 3
3339         && (1 << shift) == GET_MODE_SIZE (mode)))
3340       && REG_P (index)
3341       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3342     {
3343       info->type = type;
3344       info->offset = index;
3345       info->shift = shift;
3346       return true;
3347     }
3348
3349   return false;
3350 }
3351
3352 bool
3353 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3354 {
3355   return (offset >= -64 * GET_MODE_SIZE (mode)
3356           && offset < 64 * GET_MODE_SIZE (mode)
3357           && offset % GET_MODE_SIZE (mode) == 0);
3358 }
3359
3360 static inline bool
3361 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3362                                HOST_WIDE_INT offset)
3363 {
3364   return offset >= -256 && offset < 256;
3365 }
3366
3367 static inline bool
3368 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3369 {
3370   return (offset >= 0
3371           && offset < 4096 * GET_MODE_SIZE (mode)
3372           && offset % GET_MODE_SIZE (mode) == 0);
3373 }
3374
3375 /* Return true if X is a valid address for machine mode MODE.  If it is,
3376    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3377    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3378
3379 static bool
3380 aarch64_classify_address (struct aarch64_address_info *info,
3381                           rtx x, machine_mode mode,
3382                           RTX_CODE outer_code, bool strict_p)
3383 {
3384   enum rtx_code code = GET_CODE (x);
3385   rtx op0, op1;
3386
3387   /* On BE, we use load/store pair for all large int mode load/stores.  */
3388   bool load_store_pair_p = (outer_code == PARALLEL
3389                             || (BYTES_BIG_ENDIAN
3390                                 && aarch64_vect_struct_mode_p (mode)));
3391
3392   bool allow_reg_index_p =
3393     !load_store_pair_p
3394     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3395     && !aarch64_vect_struct_mode_p (mode);
3396
3397   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3398      REG addressing.  */
3399   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3400       && (code != POST_INC && code != REG))
3401     return false;
3402
3403   switch (code)
3404     {
3405     case REG:
3406     case SUBREG:
3407       info->type = ADDRESS_REG_IMM;
3408       info->base = x;
3409       info->offset = const0_rtx;
3410       return aarch64_base_register_rtx_p (x, strict_p);
3411
3412     case PLUS:
3413       op0 = XEXP (x, 0);
3414       op1 = XEXP (x, 1);
3415
3416       if (! strict_p
3417           && REG_P (op0)
3418           && (op0 == virtual_stack_vars_rtx
3419               || op0 == frame_pointer_rtx
3420               || op0 == arg_pointer_rtx)
3421           && CONST_INT_P (op1))
3422         {
3423           info->type = ADDRESS_REG_IMM;
3424           info->base = op0;
3425           info->offset = op1;
3426
3427           return true;
3428         }
3429
3430       if (GET_MODE_SIZE (mode) != 0
3431           && CONST_INT_P (op1)
3432           && aarch64_base_register_rtx_p (op0, strict_p))
3433         {
3434           HOST_WIDE_INT offset = INTVAL (op1);
3435
3436           info->type = ADDRESS_REG_IMM;
3437           info->base = op0;
3438           info->offset = op1;
3439
3440           /* TImode and TFmode values are allowed in both pairs of X
3441              registers and individual Q registers.  The available
3442              address modes are:
3443              X,X: 7-bit signed scaled offset
3444              Q:   9-bit signed offset
3445              We conservatively require an offset representable in either mode.
3446            */
3447           if (mode == TImode || mode == TFmode)
3448             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3449                     && offset_9bit_signed_unscaled_p (mode, offset));
3450
3451           /* A 7bit offset check because OImode will emit a ldp/stp
3452              instruction (only big endian will get here).
3453              For ldp/stp instructions, the offset is scaled for the size of a
3454              single element of the pair.  */
3455           if (mode == OImode)
3456             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3457
3458           /* Three 9/12 bit offsets checks because CImode will emit three
3459              ldr/str instructions (only big endian will get here).  */
3460           if (mode == CImode)
3461             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3462                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3463                         || offset_12bit_unsigned_scaled_p (V16QImode,
3464                                                            offset + 32)));
3465
3466           /* Two 7bit offsets checks because XImode will emit two ldp/stp
3467              instructions (only big endian will get here).  */
3468           if (mode == XImode)
3469             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3470                     && aarch64_offset_7bit_signed_scaled_p (TImode,
3471                                                             offset + 32));
3472
3473           if (load_store_pair_p)
3474             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3475                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3476           else
3477             return (offset_9bit_signed_unscaled_p (mode, offset)
3478                     || offset_12bit_unsigned_scaled_p (mode, offset));
3479         }
3480
3481       if (allow_reg_index_p)
3482         {
3483           /* Look for base + (scaled/extended) index register.  */
3484           if (aarch64_base_register_rtx_p (op0, strict_p)
3485               && aarch64_classify_index (info, op1, mode, strict_p))
3486             {
3487               info->base = op0;
3488               return true;
3489             }
3490           if (aarch64_base_register_rtx_p (op1, strict_p)
3491               && aarch64_classify_index (info, op0, mode, strict_p))
3492             {
3493               info->base = op1;
3494               return true;
3495             }
3496         }
3497
3498       return false;
3499
3500     case POST_INC:
3501     case POST_DEC:
3502     case PRE_INC:
3503     case PRE_DEC:
3504       info->type = ADDRESS_REG_WB;
3505       info->base = XEXP (x, 0);
3506       info->offset = NULL_RTX;
3507       return aarch64_base_register_rtx_p (info->base, strict_p);
3508
3509     case POST_MODIFY:
3510     case PRE_MODIFY:
3511       info->type = ADDRESS_REG_WB;
3512       info->base = XEXP (x, 0);
3513       if (GET_CODE (XEXP (x, 1)) == PLUS
3514           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3515           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3516           && aarch64_base_register_rtx_p (info->base, strict_p))
3517         {
3518           HOST_WIDE_INT offset;
3519           info->offset = XEXP (XEXP (x, 1), 1);
3520           offset = INTVAL (info->offset);
3521
3522           /* TImode and TFmode values are allowed in both pairs of X
3523              registers and individual Q registers.  The available
3524              address modes are:
3525              X,X: 7-bit signed scaled offset
3526              Q:   9-bit signed offset
3527              We conservatively require an offset representable in either mode.
3528            */
3529           if (mode == TImode || mode == TFmode)
3530             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3531                     && offset_9bit_signed_unscaled_p (mode, offset));
3532
3533           if (load_store_pair_p)
3534             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3535                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3536           else
3537             return offset_9bit_signed_unscaled_p (mode, offset);
3538         }
3539       return false;
3540
3541     case CONST:
3542     case SYMBOL_REF:
3543     case LABEL_REF:
3544       /* load literal: pc-relative constant pool entry.  Only supported
3545          for SI mode or larger.  */
3546       info->type = ADDRESS_SYMBOLIC;
3547
3548       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3549         {
3550           rtx sym, addend;
3551
3552           split_const (x, &sym, &addend);
3553           return (GET_CODE (sym) == LABEL_REF
3554                   || (GET_CODE (sym) == SYMBOL_REF
3555                       && CONSTANT_POOL_ADDRESS_P (sym)));
3556         }
3557       return false;
3558
3559     case LO_SUM:
3560       info->type = ADDRESS_LO_SUM;
3561       info->base = XEXP (x, 0);
3562       info->offset = XEXP (x, 1);
3563       if (allow_reg_index_p
3564           && aarch64_base_register_rtx_p (info->base, strict_p))
3565         {
3566           rtx sym, offs;
3567           split_const (info->offset, &sym, &offs);
3568           if (GET_CODE (sym) == SYMBOL_REF
3569               && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3570                   == SYMBOL_SMALL_ABSOLUTE))
3571             {
3572               /* The symbol and offset must be aligned to the access size.  */
3573               unsigned int align;
3574               unsigned int ref_size;
3575
3576               if (CONSTANT_POOL_ADDRESS_P (sym))
3577                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3578               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3579                 {
3580                   tree exp = SYMBOL_REF_DECL (sym);
3581                   align = TYPE_ALIGN (TREE_TYPE (exp));
3582                   align = CONSTANT_ALIGNMENT (exp, align);
3583                 }
3584               else if (SYMBOL_REF_DECL (sym))
3585                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3586               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3587                        && SYMBOL_REF_BLOCK (sym) != NULL)
3588                 align = SYMBOL_REF_BLOCK (sym)->alignment;
3589               else
3590                 align = BITS_PER_UNIT;
3591
3592               ref_size = GET_MODE_SIZE (mode);
3593               if (ref_size == 0)
3594                 ref_size = GET_MODE_SIZE (DImode);
3595
3596               return ((INTVAL (offs) & (ref_size - 1)) == 0
3597                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3598             }
3599         }
3600       return false;
3601
3602     default:
3603       return false;
3604     }
3605 }
3606
3607 bool
3608 aarch64_symbolic_address_p (rtx x)
3609 {
3610   rtx offset;
3611
3612   split_const (x, &x, &offset);
3613   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3614 }
3615
3616 /* Classify the base of symbolic expression X, given that X appears in
3617    context CONTEXT.  */
3618
3619 enum aarch64_symbol_type
3620 aarch64_classify_symbolic_expression (rtx x,
3621                                       enum aarch64_symbol_context context)
3622 {
3623   rtx offset;
3624
3625   split_const (x, &x, &offset);
3626   return aarch64_classify_symbol (x, offset, context);
3627 }
3628
3629
3630 /* Return TRUE if X is a legitimate address for accessing memory in
3631    mode MODE.  */
3632 static bool
3633 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3634 {
3635   struct aarch64_address_info addr;
3636
3637   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3638 }
3639
3640 /* Return TRUE if X is a legitimate address for accessing memory in
3641    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
3642    pair operation.  */
3643 bool
3644 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3645                               RTX_CODE outer_code, bool strict_p)
3646 {
3647   struct aarch64_address_info addr;
3648
3649   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3650 }
3651
3652 /* Return TRUE if rtx X is immediate constant 0.0 */
3653 bool
3654 aarch64_float_const_zero_rtx_p (rtx x)
3655 {
3656   REAL_VALUE_TYPE r;
3657
3658   if (GET_MODE (x) == VOIDmode)
3659     return false;
3660
3661   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3662   if (REAL_VALUE_MINUS_ZERO (r))
3663     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3664   return REAL_VALUES_EQUAL (r, dconst0);
3665 }
3666
3667 /* Return the fixed registers used for condition codes.  */
3668
3669 static bool
3670 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3671 {
3672   *p1 = CC_REGNUM;
3673   *p2 = INVALID_REGNUM;
3674   return true;
3675 }
3676
3677 /* Emit call insn with PAT and do aarch64-specific handling.  */
3678
3679 void
3680 aarch64_emit_call_insn (rtx pat)
3681 {
3682   rtx insn = emit_call_insn (pat);
3683
3684   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3685   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3686   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3687 }
3688
3689 machine_mode
3690 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3691 {
3692   /* All floating point compares return CCFP if it is an equality
3693      comparison, and CCFPE otherwise.  */
3694   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3695     {
3696       switch (code)
3697         {
3698         case EQ:
3699         case NE:
3700         case UNORDERED:
3701         case ORDERED:
3702         case UNLT:
3703         case UNLE:
3704         case UNGT:
3705         case UNGE:
3706         case UNEQ:
3707         case LTGT:
3708           return CCFPmode;
3709
3710         case LT:
3711         case LE:
3712         case GT:
3713         case GE:
3714           return CCFPEmode;
3715
3716         default:
3717           gcc_unreachable ();
3718         }
3719     }
3720
3721   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3722       && y == const0_rtx
3723       && (code == EQ || code == NE || code == LT || code == GE)
3724       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3725           || GET_CODE (x) == NEG))
3726     return CC_NZmode;
3727
3728   /* A compare with a shifted operand.  Because of canonicalization,
3729      the comparison will have to be swapped when we emit the assembly
3730      code.  */
3731   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3732       && (REG_P (y) || GET_CODE (y) == SUBREG)
3733       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3734           || GET_CODE (x) == LSHIFTRT
3735           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3736     return CC_SWPmode;
3737
3738   /* Similarly for a negated operand, but we can only do this for
3739      equalities.  */
3740   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3741       && (REG_P (y) || GET_CODE (y) == SUBREG)
3742       && (code == EQ || code == NE)
3743       && GET_CODE (x) == NEG)
3744     return CC_Zmode;
3745
3746   /* A compare of a mode narrower than SI mode against zero can be done
3747      by extending the value in the comparison.  */
3748   if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3749       && y == const0_rtx)
3750     /* Only use sign-extension if we really need it.  */
3751     return ((code == GT || code == GE || code == LE || code == LT)
3752             ? CC_SESWPmode : CC_ZESWPmode);
3753
3754   /* For everything else, return CCmode.  */
3755   return CCmode;
3756 }
3757
3758 static int
3759 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3760
3761 int
3762 aarch64_get_condition_code (rtx x)
3763 {
3764   machine_mode mode = GET_MODE (XEXP (x, 0));
3765   enum rtx_code comp_code = GET_CODE (x);
3766
3767   if (GET_MODE_CLASS (mode) != MODE_CC)
3768     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3769   return aarch64_get_condition_code_1 (mode, comp_code);
3770 }
3771
3772 static int
3773 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3774 {
3775   int ne = -1, eq = -1;
3776   switch (mode)
3777     {
3778     case CCFPmode:
3779     case CCFPEmode:
3780       switch (comp_code)
3781         {
3782         case GE: return AARCH64_GE;
3783         case GT: return AARCH64_GT;
3784         case LE: return AARCH64_LS;
3785         case LT: return AARCH64_MI;
3786         case NE: return AARCH64_NE;
3787         case EQ: return AARCH64_EQ;
3788         case ORDERED: return AARCH64_VC;
3789         case UNORDERED: return AARCH64_VS;
3790         case UNLT: return AARCH64_LT;
3791         case UNLE: return AARCH64_LE;
3792         case UNGT: return AARCH64_HI;
3793         case UNGE: return AARCH64_PL;
3794         default: return -1;
3795         }
3796       break;
3797
3798     case CC_DNEmode:
3799       ne = AARCH64_NE;
3800       eq = AARCH64_EQ;
3801       break;
3802
3803     case CC_DEQmode:
3804       ne = AARCH64_EQ;
3805       eq = AARCH64_NE;
3806       break;
3807
3808     case CC_DGEmode:
3809       ne = AARCH64_GE;
3810       eq = AARCH64_LT;
3811       break;
3812
3813     case CC_DLTmode:
3814       ne = AARCH64_LT;
3815       eq = AARCH64_GE;
3816       break;
3817
3818     case CC_DGTmode:
3819       ne = AARCH64_GT;
3820       eq = AARCH64_LE;
3821       break;
3822
3823     case CC_DLEmode:
3824       ne = AARCH64_LE;
3825       eq = AARCH64_GT;
3826       break;
3827
3828     case CC_DGEUmode:
3829       ne = AARCH64_CS;
3830       eq = AARCH64_CC;
3831       break;
3832
3833     case CC_DLTUmode:
3834       ne = AARCH64_CC;
3835       eq = AARCH64_CS;
3836       break;
3837
3838     case CC_DGTUmode:
3839       ne = AARCH64_HI;
3840       eq = AARCH64_LS;
3841       break;
3842
3843     case CC_DLEUmode:
3844       ne = AARCH64_LS;
3845       eq = AARCH64_HI;
3846       break;
3847
3848     case CCmode:
3849       switch (comp_code)
3850         {
3851         case NE: return AARCH64_NE;
3852         case EQ: return AARCH64_EQ;
3853         case GE: return AARCH64_GE;
3854         case GT: return AARCH64_GT;
3855         case LE: return AARCH64_LE;
3856         case LT: return AARCH64_LT;
3857         case GEU: return AARCH64_CS;
3858         case GTU: return AARCH64_HI;
3859         case LEU: return AARCH64_LS;
3860         case LTU: return AARCH64_CC;
3861         default: return -1;
3862         }
3863       break;
3864
3865     case CC_SWPmode:
3866     case CC_ZESWPmode:
3867     case CC_SESWPmode:
3868       switch (comp_code)
3869         {
3870         case NE: return AARCH64_NE;
3871         case EQ: return AARCH64_EQ;
3872         case GE: return AARCH64_LE;
3873         case GT: return AARCH64_LT;
3874         case LE: return AARCH64_GE;
3875         case LT: return AARCH64_GT;
3876         case GEU: return AARCH64_LS;
3877         case GTU: return AARCH64_CC;
3878         case LEU: return AARCH64_CS;
3879         case LTU: return AARCH64_HI;
3880         default: return -1;
3881         }
3882       break;
3883
3884     case CC_NZmode:
3885       switch (comp_code)
3886         {
3887         case NE: return AARCH64_NE;
3888         case EQ: return AARCH64_EQ;
3889         case GE: return AARCH64_PL;
3890         case LT: return AARCH64_MI;
3891         default: return -1;
3892         }
3893       break;
3894
3895     case CC_Zmode:
3896       switch (comp_code)
3897         {
3898         case NE: return AARCH64_NE;
3899         case EQ: return AARCH64_EQ;
3900         default: return -1;
3901         }
3902       break;
3903
3904     default:
3905       return -1;
3906       break;
3907     }
3908
3909   if (comp_code == NE)
3910     return ne;
3911
3912   if (comp_code == EQ)
3913     return eq;
3914
3915   return -1;
3916 }
3917
3918 bool
3919 aarch64_const_vec_all_same_in_range_p (rtx x,
3920                                   HOST_WIDE_INT minval,
3921                                   HOST_WIDE_INT maxval)
3922 {
3923   HOST_WIDE_INT firstval;
3924   int count, i;
3925
3926   if (GET_CODE (x) != CONST_VECTOR
3927       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3928     return false;
3929
3930   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3931   if (firstval < minval || firstval > maxval)
3932     return false;
3933
3934   count = CONST_VECTOR_NUNITS (x);
3935   for (i = 1; i < count; i++)
3936     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3937       return false;
3938
3939   return true;
3940 }
3941
3942 bool
3943 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3944 {
3945   return aarch64_const_vec_all_same_in_range_p (x, val, val);
3946 }
3947
3948 static unsigned
3949 bit_count (unsigned HOST_WIDE_INT value)
3950 {
3951   unsigned count = 0;
3952
3953   while (value)
3954     {
3955       count++;
3956       value &= value - 1;
3957     }
3958
3959   return count;
3960 }
3961
3962 /* N Z C V.  */
3963 #define AARCH64_CC_V 1
3964 #define AARCH64_CC_C (1 << 1)
3965 #define AARCH64_CC_Z (1 << 2)
3966 #define AARCH64_CC_N (1 << 3)
3967
3968 /* N Z C V flags for ccmp.  The first code is for AND op and the other
3969    is for IOR op.  Indexed by AARCH64_COND_CODE.  */
3970 static const int aarch64_nzcv_codes[][2] =
3971 {
3972   {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
3973   {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
3974   {AARCH64_CC_C, 0}, /* CS, C == 1.  */
3975   {0, AARCH64_CC_C}, /* CC, C == 0.  */
3976   {AARCH64_CC_N, 0}, /* MI, N == 1.  */
3977   {0, AARCH64_CC_N}, /* PL, N == 0.  */
3978   {AARCH64_CC_V, 0}, /* VS, V == 1.  */
3979   {0, AARCH64_CC_V}, /* VC, V == 0.  */
3980   {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
3981   {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
3982   {0, AARCH64_CC_V}, /* GE, N == V.  */
3983   {AARCH64_CC_V, 0}, /* LT, N != V.  */
3984   {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
3985   {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
3986   {0, 0}, /* AL, Any.  */
3987   {0, 0}, /* NV, Any.  */
3988 };
3989
3990 int
3991 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3992 {
3993   switch (mode)
3994     {
3995     case CC_DNEmode:
3996       return NE;
3997
3998     case CC_DEQmode:
3999       return EQ;
4000
4001     case CC_DLEmode:
4002       return LE;
4003
4004     case CC_DGTmode:
4005       return GT;
4006
4007     case CC_DLTmode:
4008       return LT;
4009
4010     case CC_DGEmode:
4011       return GE;
4012
4013     case CC_DLEUmode:
4014       return LEU;
4015
4016     case CC_DGTUmode:
4017       return GTU;
4018
4019     case CC_DLTUmode:
4020       return LTU;
4021
4022     case CC_DGEUmode:
4023       return GEU;
4024
4025     default:
4026       gcc_unreachable ();
4027     }
4028 }
4029
4030
4031 void
4032 aarch64_print_operand (FILE *f, rtx x, char code)
4033 {
4034   switch (code)
4035     {
4036     /* An integer or symbol address without a preceding # sign.  */
4037     case 'c':
4038       switch (GET_CODE (x))
4039         {
4040         case CONST_INT:
4041           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4042           break;
4043
4044         case SYMBOL_REF:
4045           output_addr_const (f, x);
4046           break;
4047
4048         case CONST:
4049           if (GET_CODE (XEXP (x, 0)) == PLUS
4050               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4051             {
4052               output_addr_const (f, x);
4053               break;
4054             }
4055           /* Fall through.  */
4056
4057         default:
4058           output_operand_lossage ("Unsupported operand for code '%c'", code);
4059         }
4060       break;
4061
4062     case 'e':
4063       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4064       {
4065         int n;
4066
4067         if (!CONST_INT_P (x)
4068             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4069           {
4070             output_operand_lossage ("invalid operand for '%%%c'", code);
4071             return;
4072           }
4073
4074         switch (n)
4075           {
4076           case 3:
4077             fputc ('b', f);
4078             break;
4079           case 4:
4080             fputc ('h', f);
4081             break;
4082           case 5:
4083             fputc ('w', f);
4084             break;
4085           default:
4086             output_operand_lossage ("invalid operand for '%%%c'", code);
4087             return;
4088           }
4089       }
4090       break;
4091
4092     case 'p':
4093       {
4094         int n;
4095
4096         /* Print N such that 2^N == X.  */
4097         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4098           {
4099             output_operand_lossage ("invalid operand for '%%%c'", code);
4100             return;
4101           }
4102
4103         asm_fprintf (f, "%d", n);
4104       }
4105       break;
4106
4107     case 'P':
4108       /* Print the number of non-zero bits in X (a const_int).  */
4109       if (!CONST_INT_P (x))
4110         {
4111           output_operand_lossage ("invalid operand for '%%%c'", code);
4112           return;
4113         }
4114
4115       asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4116       break;
4117
4118     case 'H':
4119       /* Print the higher numbered register of a pair (TImode) of regs.  */
4120       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4121         {
4122           output_operand_lossage ("invalid operand for '%%%c'", code);
4123           return;
4124         }
4125
4126       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4127       break;
4128
4129     case 'm':
4130       {
4131         int cond_code;
4132         /* Print a condition (eq, ne, etc).  */
4133
4134         /* CONST_TRUE_RTX means always -- that's the default.  */
4135         if (x == const_true_rtx)
4136           return;
4137
4138         if (!COMPARISON_P (x))
4139           {
4140             output_operand_lossage ("invalid operand for '%%%c'", code);
4141             return;
4142           }
4143
4144         cond_code = aarch64_get_condition_code (x);
4145         gcc_assert (cond_code >= 0);
4146         fputs (aarch64_condition_codes[cond_code], f);
4147       }
4148       break;
4149
4150     case 'M':
4151       {
4152         int cond_code;
4153         /* Print the inverse of a condition (eq <-> ne, etc).  */
4154
4155         /* CONST_TRUE_RTX means never -- that's the default.  */
4156         if (x == const_true_rtx)
4157           {
4158             fputs ("nv", f);
4159             return;
4160           }
4161
4162         if (!COMPARISON_P (x))
4163           {
4164             output_operand_lossage ("invalid operand for '%%%c'", code);
4165             return;
4166           }
4167         cond_code = aarch64_get_condition_code (x);
4168         gcc_assert (cond_code >= 0);
4169         fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4170                                        (cond_code)], f);
4171       }
4172       break;
4173
4174     case 'b':
4175     case 'h':
4176     case 's':
4177     case 'd':
4178     case 'q':
4179       /* Print a scalar FP/SIMD register name.  */
4180       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4181         {
4182           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4183           return;
4184         }
4185       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4186       break;
4187
4188     case 'S':
4189     case 'T':
4190     case 'U':
4191     case 'V':
4192       /* Print the first FP/SIMD register name in a list.  */
4193       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4194         {
4195           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4196           return;
4197         }
4198       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4199       break;
4200
4201     case 'R':
4202       /* Print a scalar FP/SIMD register name + 1.  */
4203       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4204         {
4205           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4206           return;
4207         }
4208       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4209       break;
4210
4211     case 'X':
4212       /* Print bottom 16 bits of integer constant in hex.  */
4213       if (!CONST_INT_P (x))
4214         {
4215           output_operand_lossage ("invalid operand for '%%%c'", code);
4216           return;
4217         }
4218       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4219       break;
4220
4221     case 'w':
4222     case 'x':
4223       /* Print a general register name or the zero register (32-bit or
4224          64-bit).  */
4225       if (x == const0_rtx
4226           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4227         {
4228           asm_fprintf (f, "%czr", code);
4229           break;
4230         }
4231
4232       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4233         {
4234           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4235           break;
4236         }
4237
4238       if (REG_P (x) && REGNO (x) == SP_REGNUM)
4239         {
4240           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4241           break;
4242         }
4243
4244       /* Fall through */
4245
4246     case 0:
4247       /* Print a normal operand, if it's a general register, then we
4248          assume DImode.  */
4249       if (x == NULL)
4250         {
4251           output_operand_lossage ("missing operand");
4252           return;
4253         }
4254
4255       switch (GET_CODE (x))
4256         {
4257         case REG:
4258           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4259           break;
4260
4261         case MEM:
4262           aarch64_memory_reference_mode = GET_MODE (x);
4263           output_address (XEXP (x, 0));
4264           break;
4265
4266         case LABEL_REF:
4267         case SYMBOL_REF:
4268           output_addr_const (asm_out_file, x);
4269           break;
4270
4271         case CONST_INT:
4272           asm_fprintf (f, "%wd", INTVAL (x));
4273           break;
4274
4275         case CONST_VECTOR:
4276           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4277             {
4278               gcc_assert (
4279                   aarch64_const_vec_all_same_in_range_p (x,
4280                                                          HOST_WIDE_INT_MIN,
4281                                                          HOST_WIDE_INT_MAX));
4282               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4283             }
4284           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4285             {
4286               fputc ('0', f);
4287             }
4288           else
4289             gcc_unreachable ();
4290           break;
4291
4292         case CONST_DOUBLE:
4293           /* CONST_DOUBLE can represent a double-width integer.
4294              In this case, the mode of x is VOIDmode.  */
4295           if (GET_MODE (x) == VOIDmode)
4296             ; /* Do Nothing.  */
4297           else if (aarch64_float_const_zero_rtx_p (x))
4298             {
4299               fputc ('0', f);
4300               break;
4301             }
4302           else if (aarch64_float_const_representable_p (x))
4303             {
4304 #define buf_size 20
4305               char float_buf[buf_size] = {'\0'};
4306               REAL_VALUE_TYPE r;
4307               REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4308               real_to_decimal_for_mode (float_buf, &r,
4309                                         buf_size, buf_size,
4310                                         1, GET_MODE (x));
4311               asm_fprintf (asm_out_file, "%s", float_buf);
4312               break;
4313 #undef buf_size
4314             }
4315           output_operand_lossage ("invalid constant");
4316           return;
4317         default:
4318           output_operand_lossage ("invalid operand");
4319           return;
4320         }
4321       break;
4322
4323     case 'A':
4324       if (GET_CODE (x) == HIGH)
4325         x = XEXP (x, 0);
4326
4327       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4328         {
4329         case SYMBOL_SMALL_GOT:
4330           asm_fprintf (asm_out_file, ":got:");
4331           break;
4332
4333         case SYMBOL_SMALL_TLSGD:
4334           asm_fprintf (asm_out_file, ":tlsgd:");
4335           break;
4336
4337         case SYMBOL_SMALL_TLSDESC:
4338           asm_fprintf (asm_out_file, ":tlsdesc:");
4339           break;
4340
4341         case SYMBOL_SMALL_GOTTPREL:
4342           asm_fprintf (asm_out_file, ":gottprel:");
4343           break;
4344
4345         case SYMBOL_SMALL_TPREL:
4346           asm_fprintf (asm_out_file, ":tprel:");
4347           break;
4348
4349         case SYMBOL_TINY_GOT:
4350           gcc_unreachable ();
4351           break;
4352
4353         default:
4354           break;
4355         }
4356       output_addr_const (asm_out_file, x);
4357       break;
4358
4359     case 'L':
4360       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4361         {
4362         case SYMBOL_SMALL_GOT:
4363           asm_fprintf (asm_out_file, ":lo12:");
4364           break;
4365
4366         case SYMBOL_SMALL_TLSGD:
4367           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4368           break;
4369
4370         case SYMBOL_SMALL_TLSDESC:
4371           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4372           break;
4373
4374         case SYMBOL_SMALL_GOTTPREL:
4375           asm_fprintf (asm_out_file, ":gottprel_lo12:");
4376           break;
4377
4378         case SYMBOL_SMALL_TPREL:
4379           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4380           break;
4381
4382         case SYMBOL_TINY_GOT:
4383           asm_fprintf (asm_out_file, ":got:");
4384           break;
4385
4386         default:
4387           break;
4388         }
4389       output_addr_const (asm_out_file, x);
4390       break;
4391
4392     case 'G':
4393
4394       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4395         {
4396         case SYMBOL_SMALL_TPREL:
4397           asm_fprintf (asm_out_file, ":tprel_hi12:");
4398           break;
4399         default:
4400           break;
4401         }
4402       output_addr_const (asm_out_file, x);
4403       break;
4404
4405     case 'K':
4406       {
4407         int cond_code;
4408         /* Print nzcv.  */
4409
4410         if (!COMPARISON_P (x))
4411           {
4412             output_operand_lossage ("invalid operand for '%%%c'", code);
4413             return;
4414           }
4415
4416         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4417         gcc_assert (cond_code >= 0);
4418         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4419       }
4420       break;
4421
4422     case 'k':
4423       {
4424         int cond_code;
4425         /* Print nzcv.  */
4426
4427         if (!COMPARISON_P (x))
4428           {
4429             output_operand_lossage ("invalid operand for '%%%c'", code);
4430             return;
4431           }
4432
4433         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4434         gcc_assert (cond_code >= 0);
4435         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4436       }
4437       break;
4438
4439     default:
4440       output_operand_lossage ("invalid operand prefix '%%%c'", code);
4441       return;
4442     }
4443 }
4444
4445 void
4446 aarch64_print_operand_address (FILE *f, rtx x)
4447 {
4448   struct aarch64_address_info addr;
4449
4450   if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4451                              MEM, true))
4452     switch (addr.type)
4453       {
4454       case ADDRESS_REG_IMM:
4455         if (addr.offset == const0_rtx)
4456           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4457         else
4458           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4459                        INTVAL (addr.offset));
4460         return;
4461
4462       case ADDRESS_REG_REG:
4463         if (addr.shift == 0)
4464           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4465                        reg_names [REGNO (addr.offset)]);
4466         else
4467           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4468                        reg_names [REGNO (addr.offset)], addr.shift);
4469         return;
4470
4471       case ADDRESS_REG_UXTW:
4472         if (addr.shift == 0)
4473           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4474                        REGNO (addr.offset) - R0_REGNUM);
4475         else
4476           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4477                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4478         return;
4479
4480       case ADDRESS_REG_SXTW:
4481         if (addr.shift == 0)
4482           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4483                        REGNO (addr.offset) - R0_REGNUM);
4484         else
4485           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4486                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4487         return;
4488
4489       case ADDRESS_REG_WB:
4490         switch (GET_CODE (x))
4491           {
4492           case PRE_INC:
4493             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4494                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4495             return;
4496           case POST_INC:
4497             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4498                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4499             return;
4500           case PRE_DEC:
4501             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4502                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4503             return;
4504           case POST_DEC:
4505             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4506                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4507             return;
4508           case PRE_MODIFY:
4509             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4510                          INTVAL (addr.offset));
4511             return;
4512           case POST_MODIFY:
4513             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4514                          INTVAL (addr.offset));
4515             return;
4516           default:
4517             break;
4518           }
4519         break;
4520
4521       case ADDRESS_LO_SUM:
4522         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4523         output_addr_const (f, addr.offset);
4524         asm_fprintf (f, "]");
4525         return;
4526
4527       case ADDRESS_SYMBOLIC:
4528         break;
4529       }
4530
4531   output_addr_const (f, x);
4532 }
4533
4534 bool
4535 aarch64_label_mentioned_p (rtx x)
4536 {
4537   const char *fmt;
4538   int i;
4539
4540   if (GET_CODE (x) == LABEL_REF)
4541     return true;
4542
4543   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4544      referencing instruction, but they are constant offsets, not
4545      symbols.  */
4546   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4547     return false;
4548
4549   fmt = GET_RTX_FORMAT (GET_CODE (x));
4550   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4551     {
4552       if (fmt[i] == 'E')
4553         {
4554           int j;
4555
4556           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4557             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4558               return 1;
4559         }
4560       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4561         return 1;
4562     }
4563
4564   return 0;
4565 }
4566
4567 /* Implement REGNO_REG_CLASS.  */
4568
4569 enum reg_class
4570 aarch64_regno_regclass (unsigned regno)
4571 {
4572   if (GP_REGNUM_P (regno))
4573     return GENERAL_REGS;
4574
4575   if (regno == SP_REGNUM)
4576     return STACK_REG;
4577
4578   if (regno == FRAME_POINTER_REGNUM
4579       || regno == ARG_POINTER_REGNUM)
4580     return POINTER_REGS;
4581
4582   if (FP_REGNUM_P (regno))
4583     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
4584
4585   return NO_REGS;
4586 }
4587
4588 static rtx
4589 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
4590 {
4591   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4592      where mask is selected by alignment and size of the offset.
4593      We try to pick as large a range for the offset as possible to
4594      maximize the chance of a CSE.  However, for aligned addresses
4595      we limit the range to 4k so that structures with different sized
4596      elements are likely to use the same base.  */
4597
4598   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4599     {
4600       HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4601       HOST_WIDE_INT base_offset;
4602
4603       /* Does it look like we'll need a load/store-pair operation?  */
4604       if (GET_MODE_SIZE (mode) > 16
4605           || mode == TImode)
4606         base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4607                        & ~((128 * GET_MODE_SIZE (mode)) - 1));
4608       /* For offsets aren't a multiple of the access size, the limit is
4609          -256...255.  */
4610       else if (offset & (GET_MODE_SIZE (mode) - 1))
4611         base_offset = (offset + 0x100) & ~0x1ff;
4612       else
4613         base_offset = offset & ~0xfff;
4614
4615       if (base_offset == 0)
4616         return x;
4617
4618       offset -= base_offset;
4619       rtx base_reg = gen_reg_rtx (Pmode);
4620       rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4621                            NULL_RTX);
4622       emit_move_insn (base_reg, val);
4623       x = plus_constant (Pmode, base_reg, offset);
4624     }
4625
4626   return x;
4627 }
4628
4629 /* Try a machine-dependent way of reloading an illegitimate address
4630    operand.  If we find one, push the reload and return the new rtx.  */
4631
4632 rtx
4633 aarch64_legitimize_reload_address (rtx *x_p,
4634                                    machine_mode mode,
4635                                    int opnum, int type,
4636                                    int ind_levels ATTRIBUTE_UNUSED)
4637 {
4638   rtx x = *x_p;
4639
4640   /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
4641   if (aarch64_vect_struct_mode_p (mode)
4642       && GET_CODE (x) == PLUS
4643       && REG_P (XEXP (x, 0))
4644       && CONST_INT_P (XEXP (x, 1)))
4645     {
4646       rtx orig_rtx = x;
4647       x = copy_rtx (x);
4648       push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4649                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4650                    opnum, (enum reload_type) type);
4651       return x;
4652     }
4653
4654   /* We must recognize output that we have already generated ourselves.  */
4655   if (GET_CODE (x) == PLUS
4656       && GET_CODE (XEXP (x, 0)) == PLUS
4657       && REG_P (XEXP (XEXP (x, 0), 0))
4658       && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4659       && CONST_INT_P (XEXP (x, 1)))
4660     {
4661       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4662                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4663                    opnum, (enum reload_type) type);
4664       return x;
4665     }
4666
4667   /* We wish to handle large displacements off a base register by splitting
4668      the addend across an add and the mem insn.  This can cut the number of
4669      extra insns needed from 3 to 1.  It is only useful for load/store of a
4670      single register with 12 bit offset field.  */
4671   if (GET_CODE (x) == PLUS
4672       && REG_P (XEXP (x, 0))
4673       && CONST_INT_P (XEXP (x, 1))
4674       && HARD_REGISTER_P (XEXP (x, 0))
4675       && mode != TImode
4676       && mode != TFmode
4677       && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4678     {
4679       HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4680       HOST_WIDE_INT low = val & 0xfff;
4681       HOST_WIDE_INT high = val - low;
4682       HOST_WIDE_INT offs;
4683       rtx cst;
4684       machine_mode xmode = GET_MODE (x);
4685
4686       /* In ILP32, xmode can be either DImode or SImode.  */
4687       gcc_assert (xmode == DImode || xmode == SImode);
4688
4689       /* Reload non-zero BLKmode offsets.  This is because we cannot ascertain
4690          BLKmode alignment.  */
4691       if (GET_MODE_SIZE (mode) == 0)
4692         return NULL_RTX;
4693
4694       offs = low % GET_MODE_SIZE (mode);
4695
4696       /* Align misaligned offset by adjusting high part to compensate.  */
4697       if (offs != 0)
4698         {
4699           if (aarch64_uimm12_shift (high + offs))
4700             {
4701               /* Align down.  */
4702               low = low - offs;
4703               high = high + offs;
4704             }
4705           else
4706             {
4707               /* Align up.  */
4708               offs = GET_MODE_SIZE (mode) - offs;
4709               low = low + offs;
4710               high = high + (low & 0x1000) - offs;
4711               low &= 0xfff;
4712             }
4713         }
4714
4715       /* Check for overflow.  */
4716       if (high + low != val)
4717         return NULL_RTX;
4718
4719       cst = GEN_INT (high);
4720       if (!aarch64_uimm12_shift (high))
4721         cst = force_const_mem (xmode, cst);
4722
4723       /* Reload high part into base reg, leaving the low part
4724          in the mem instruction.
4725          Note that replacing this gen_rtx_PLUS with plus_constant is
4726          wrong in this case because we rely on the
4727          (plus (plus reg c1) c2) structure being preserved so that
4728          XEXP (*p, 0) in push_reload below uses the correct term.  */
4729       x = gen_rtx_PLUS (xmode,
4730                         gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4731                         GEN_INT (low));
4732
4733       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4734                    BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4735                    opnum, (enum reload_type) type);
4736       return x;
4737     }
4738
4739   return NULL_RTX;
4740 }
4741
4742
4743 static reg_class_t
4744 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4745                           reg_class_t rclass,
4746                           machine_mode mode,
4747                           secondary_reload_info *sri)
4748 {
4749   /* Without the TARGET_SIMD instructions we cannot move a Q register
4750      to a Q register directly.  We need a scratch.  */
4751   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4752       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4753       && reg_class_subset_p (rclass, FP_REGS))
4754     {
4755       if (mode == TFmode)
4756         sri->icode = CODE_FOR_aarch64_reload_movtf;
4757       else if (mode == TImode)
4758         sri->icode = CODE_FOR_aarch64_reload_movti;
4759       return NO_REGS;
4760     }
4761
4762   /* A TFmode or TImode memory access should be handled via an FP_REGS
4763      because AArch64 has richer addressing modes for LDR/STR instructions
4764      than LDP/STP instructions.  */
4765   if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4766       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4767     return FP_REGS;
4768
4769   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4770       return GENERAL_REGS;
4771
4772   return NO_REGS;
4773 }
4774
4775 static bool
4776 aarch64_can_eliminate (const int from, const int to)
4777 {
4778   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4779      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
4780
4781   if (frame_pointer_needed)
4782     {
4783       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4784         return true;
4785       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4786         return false;
4787       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4788           && !cfun->calls_alloca)
4789         return true;
4790       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4791         return true;
4792
4793       return false;
4794     }
4795   else
4796     {
4797       /* If we decided that we didn't need a leaf frame pointer but then used
4798          LR in the function, then we'll want a frame pointer after all, so
4799          prevent this elimination to ensure a frame pointer is used.  */
4800       if (to == STACK_POINTER_REGNUM
4801           && flag_omit_leaf_frame_pointer
4802           && df_regs_ever_live_p (LR_REGNUM))
4803         return false;
4804     }
4805
4806   return true;
4807 }
4808
4809 HOST_WIDE_INT
4810 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4811 {
4812   aarch64_layout_frame ();
4813
4814   if (to == HARD_FRAME_POINTER_REGNUM)
4815     {
4816       if (from == ARG_POINTER_REGNUM)
4817         return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4818
4819       if (from == FRAME_POINTER_REGNUM)
4820         return (cfun->machine->frame.hard_fp_offset
4821                 - cfun->machine->frame.saved_varargs_size);
4822     }
4823
4824   if (to == STACK_POINTER_REGNUM)
4825     {
4826       if (from == FRAME_POINTER_REGNUM)
4827           return (cfun->machine->frame.frame_size
4828                   - cfun->machine->frame.saved_varargs_size);
4829     }
4830
4831   return cfun->machine->frame.frame_size;
4832 }
4833
4834 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
4835    previous frame.  */
4836
4837 rtx
4838 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4839 {
4840   if (count != 0)
4841     return const0_rtx;
4842   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4843 }
4844
4845
4846 static void
4847 aarch64_asm_trampoline_template (FILE *f)
4848 {
4849   if (TARGET_ILP32)
4850     {
4851       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4852       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4853     }
4854   else
4855     {
4856       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4857       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4858     }
4859   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4860   assemble_aligned_integer (4, const0_rtx);
4861   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4862   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 }
4864
4865 static void
4866 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4867 {
4868   rtx fnaddr, mem, a_tramp;
4869   const int tramp_code_sz = 16;
4870
4871   /* Don't need to copy the trailing D-words, we fill those in below.  */
4872   emit_block_move (m_tramp, assemble_trampoline_template (),
4873                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4874   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4875   fnaddr = XEXP (DECL_RTL (fndecl), 0);
4876   if (GET_MODE (fnaddr) != ptr_mode)
4877     fnaddr = convert_memory_address (ptr_mode, fnaddr);
4878   emit_move_insn (mem, fnaddr);
4879
4880   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4881   emit_move_insn (mem, chain_value);
4882
4883   /* XXX We should really define a "clear_cache" pattern and use
4884      gen_clear_cache().  */
4885   a_tramp = XEXP (m_tramp, 0);
4886   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4887                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4888                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4889                      ptr_mode);
4890 }
4891
4892 static unsigned char
4893 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4894 {
4895   switch (regclass)
4896     {
4897     case CALLER_SAVE_REGS:
4898     case POINTER_REGS:
4899     case GENERAL_REGS:
4900     case ALL_REGS:
4901     case FP_REGS:
4902     case FP_LO_REGS:
4903       return
4904         aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4905                                        (GET_MODE_SIZE (mode) + 7) / 8;
4906     case STACK_REG:
4907       return 1;
4908
4909     case NO_REGS:
4910       return 0;
4911
4912     default:
4913       break;
4914     }
4915   gcc_unreachable ();
4916 }
4917
4918 static reg_class_t
4919 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4920 {
4921   if (regclass == POINTER_REGS)
4922     return GENERAL_REGS;
4923
4924   if (regclass == STACK_REG)
4925     {
4926       if (REG_P(x)
4927           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4928           return regclass;
4929
4930       return NO_REGS;
4931     }
4932
4933   /* If it's an integer immediate that MOVI can't handle, then
4934      FP_REGS is not an option, so we return NO_REGS instead.  */
4935   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4936       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4937     return NO_REGS;
4938
4939   /* Register eliminiation can result in a request for
4940      SP+constant->FP_REGS.  We cannot support such operations which
4941      use SP as source and an FP_REG as destination, so reject out
4942      right now.  */
4943   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4944     {
4945       rtx lhs = XEXP (x, 0);
4946
4947       /* Look through a possible SUBREG introduced by ILP32.  */
4948       if (GET_CODE (lhs) == SUBREG)
4949         lhs = SUBREG_REG (lhs);
4950
4951       gcc_assert (REG_P (lhs));
4952       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4953                                       POINTER_REGS));
4954       return NO_REGS;
4955     }
4956
4957   return regclass;
4958 }
4959
4960 void
4961 aarch64_asm_output_labelref (FILE* f, const char *name)
4962 {
4963   asm_fprintf (f, "%U%s", name);
4964 }
4965
4966 static void
4967 aarch64_elf_asm_constructor (rtx symbol, int priority)
4968 {
4969   if (priority == DEFAULT_INIT_PRIORITY)
4970     default_ctor_section_asm_out_constructor (symbol, priority);
4971   else
4972     {
4973       section *s;
4974       char buf[18];
4975       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4976       s = get_section (buf, SECTION_WRITE, NULL);
4977       switch_to_section (s);
4978       assemble_align (POINTER_SIZE);
4979       assemble_aligned_integer (POINTER_BYTES, symbol);
4980     }
4981 }
4982
4983 static void
4984 aarch64_elf_asm_destructor (rtx symbol, int priority)
4985 {
4986   if (priority == DEFAULT_INIT_PRIORITY)
4987     default_dtor_section_asm_out_destructor (symbol, priority);
4988   else
4989     {
4990       section *s;
4991       char buf[18];
4992       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4993       s = get_section (buf, SECTION_WRITE, NULL);
4994       switch_to_section (s);
4995       assemble_align (POINTER_SIZE);
4996       assemble_aligned_integer (POINTER_BYTES, symbol);
4997     }
4998 }
4999
5000 const char*
5001 aarch64_output_casesi (rtx *operands)
5002 {
5003   char buf[100];
5004   char label[100];
5005   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5006   int index;
5007   static const char *const patterns[4][2] =
5008   {
5009     {
5010       "ldrb\t%w3, [%0,%w1,uxtw]",
5011       "add\t%3, %4, %w3, sxtb #2"
5012     },
5013     {
5014       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5015       "add\t%3, %4, %w3, sxth #2"
5016     },
5017     {
5018       "ldr\t%w3, [%0,%w1,uxtw #2]",
5019       "add\t%3, %4, %w3, sxtw #2"
5020     },
5021     /* We assume that DImode is only generated when not optimizing and
5022        that we don't really need 64-bit address offsets.  That would
5023        imply an object file with 8GB of code in a single function!  */
5024     {
5025       "ldr\t%w3, [%0,%w1,uxtw #2]",
5026       "add\t%3, %4, %w3, sxtw #2"
5027     }
5028   };
5029
5030   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5031
5032   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5033
5034   gcc_assert (index >= 0 && index <= 3);
5035
5036   /* Need to implement table size reduction, by chaning the code below.  */
5037   output_asm_insn (patterns[index][0], operands);
5038   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5039   snprintf (buf, sizeof (buf),
5040             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5041   output_asm_insn (buf, operands);
5042   output_asm_insn (patterns[index][1], operands);
5043   output_asm_insn ("br\t%3", operands);
5044   assemble_label (asm_out_file, label);
5045   return "";
5046 }
5047
5048
5049 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5050    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5051    operator.  */
5052
5053 int
5054 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5055 {
5056   if (shift >= 0 && shift <= 3)
5057     {
5058       int size;
5059       for (size = 8; size <= 32; size *= 2)
5060         {
5061           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5062           if (mask == bits << shift)
5063             return size;
5064         }
5065     }
5066   return 0;
5067 }
5068
5069 static bool
5070 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5071                                    const_rtx x ATTRIBUTE_UNUSED)
5072 {
5073   /* We can't use blocks for constants when we're using a per-function
5074      constant pool.  */
5075   return false;
5076 }
5077
5078 static section *
5079 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5080                             rtx x ATTRIBUTE_UNUSED,
5081                             unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5082 {
5083   /* Force all constant pool entries into the current function section.  */
5084   return function_section (current_function_decl);
5085 }
5086
5087
5088 /* Costs.  */
5089
5090 /* Helper function for rtx cost calculation.  Strip a shift expression
5091    from X.  Returns the inner operand if successful, or the original
5092    expression on failure.  */
5093 static rtx
5094 aarch64_strip_shift (rtx x)
5095 {
5096   rtx op = x;
5097
5098   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5099      we can convert both to ROR during final output.  */
5100   if ((GET_CODE (op) == ASHIFT
5101        || GET_CODE (op) == ASHIFTRT
5102        || GET_CODE (op) == LSHIFTRT
5103        || GET_CODE (op) == ROTATERT
5104        || GET_CODE (op) == ROTATE)
5105       && CONST_INT_P (XEXP (op, 1)))
5106     return XEXP (op, 0);
5107
5108   if (GET_CODE (op) == MULT
5109       && CONST_INT_P (XEXP (op, 1))
5110       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5111     return XEXP (op, 0);
5112
5113   return x;
5114 }
5115
5116 /* Helper function for rtx cost calculation.  Strip an extend
5117    expression from X.  Returns the inner operand if successful, or the
5118    original expression on failure.  We deal with a number of possible
5119    canonicalization variations here.  */
5120 static rtx
5121 aarch64_strip_extend (rtx x)
5122 {
5123   rtx op = x;
5124
5125   /* Zero and sign extraction of a widened value.  */
5126   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5127       && XEXP (op, 2) == const0_rtx
5128       && GET_CODE (XEXP (op, 0)) == MULT
5129       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5130                                          XEXP (op, 1)))
5131     return XEXP (XEXP (op, 0), 0);
5132
5133   /* It can also be represented (for zero-extend) as an AND with an
5134      immediate.  */
5135   if (GET_CODE (op) == AND
5136       && GET_CODE (XEXP (op, 0)) == MULT
5137       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5138       && CONST_INT_P (XEXP (op, 1))
5139       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5140                            INTVAL (XEXP (op, 1))) != 0)
5141     return XEXP (XEXP (op, 0), 0);
5142
5143   /* Now handle extended register, as this may also have an optional
5144      left shift by 1..4.  */
5145   if (GET_CODE (op) == ASHIFT
5146       && CONST_INT_P (XEXP (op, 1))
5147       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5148     op = XEXP (op, 0);
5149
5150   if (GET_CODE (op) == ZERO_EXTEND
5151       || GET_CODE (op) == SIGN_EXTEND)
5152     op = XEXP (op, 0);
5153
5154   if (op != x)
5155     return op;
5156
5157   return x;
5158 }
5159
5160 /* Helper function for rtx cost calculation.  Calculate the cost of
5161    a MULT, which may be part of a multiply-accumulate rtx.  Return
5162    the calculated cost of the expression, recursing manually in to
5163    operands where needed.  */
5164
5165 static int
5166 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5167 {
5168   rtx op0, op1;
5169   const struct cpu_cost_table *extra_cost
5170     = aarch64_tune_params->insn_extra_cost;
5171   int cost = 0;
5172   bool maybe_fma = (outer == PLUS || outer == MINUS);
5173   machine_mode mode = GET_MODE (x);
5174
5175   gcc_checking_assert (code == MULT);
5176
5177   op0 = XEXP (x, 0);
5178   op1 = XEXP (x, 1);
5179
5180   if (VECTOR_MODE_P (mode))
5181     mode = GET_MODE_INNER (mode);
5182
5183   /* Integer multiply/fma.  */
5184   if (GET_MODE_CLASS (mode) == MODE_INT)
5185     {
5186       /* The multiply will be canonicalized as a shift, cost it as such.  */
5187       if (CONST_INT_P (op1)
5188           && exact_log2 (INTVAL (op1)) > 0)
5189         {
5190           if (speed)
5191             {
5192               if (maybe_fma)
5193                 /* ADD (shifted register).  */
5194                 cost += extra_cost->alu.arith_shift;
5195               else
5196                 /* LSL (immediate).  */
5197                 cost += extra_cost->alu.shift;
5198             }
5199
5200           cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5201
5202           return cost;
5203         }
5204
5205       /* Integer multiplies or FMAs have zero/sign extending variants.  */
5206       if ((GET_CODE (op0) == ZERO_EXTEND
5207            && GET_CODE (op1) == ZERO_EXTEND)
5208           || (GET_CODE (op0) == SIGN_EXTEND
5209               && GET_CODE (op1) == SIGN_EXTEND))
5210         {
5211           cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5212                   + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5213
5214           if (speed)
5215             {
5216               if (maybe_fma)
5217                 /* MADD/SMADDL/UMADDL.  */
5218                 cost += extra_cost->mult[0].extend_add;
5219               else
5220                 /* MUL/SMULL/UMULL.  */
5221                 cost += extra_cost->mult[0].extend;
5222             }
5223
5224           return cost;
5225         }
5226
5227       /* This is either an integer multiply or an FMA.  In both cases
5228          we want to recurse and cost the operands.  */
5229       cost += rtx_cost (op0, MULT, 0, speed)
5230               + rtx_cost (op1, MULT, 1, speed);
5231
5232       if (speed)
5233         {
5234           if (maybe_fma)
5235             /* MADD.  */
5236             cost += extra_cost->mult[mode == DImode].add;
5237           else
5238             /* MUL.  */
5239             cost += extra_cost->mult[mode == DImode].simple;
5240         }
5241
5242       return cost;
5243     }
5244   else
5245     {
5246       if (speed)
5247         {
5248           /* Floating-point FMA/FMUL can also support negations of the
5249              operands.  */
5250           if (GET_CODE (op0) == NEG)
5251             op0 = XEXP (op0, 0);
5252           if (GET_CODE (op1) == NEG)
5253             op1 = XEXP (op1, 0);
5254
5255           if (maybe_fma)
5256             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
5257             cost += extra_cost->fp[mode == DFmode].fma;
5258           else
5259             /* FMUL/FNMUL.  */
5260             cost += extra_cost->fp[mode == DFmode].mult;
5261         }
5262
5263       cost += rtx_cost (op0, MULT, 0, speed)
5264               + rtx_cost (op1, MULT, 1, speed);
5265       return cost;
5266     }
5267 }
5268
5269 static int
5270 aarch64_address_cost (rtx x,
5271                       machine_mode mode,
5272                       addr_space_t as ATTRIBUTE_UNUSED,
5273                       bool speed)
5274 {
5275   enum rtx_code c = GET_CODE (x);
5276   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5277   struct aarch64_address_info info;
5278   int cost = 0;
5279   info.shift = 0;
5280
5281   if (!aarch64_classify_address (&info, x, mode, c, false))
5282     {
5283       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5284         {
5285           /* This is a CONST or SYMBOL ref which will be split
5286              in a different way depending on the code model in use.
5287              Cost it through the generic infrastructure.  */
5288           int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5289           /* Divide through by the cost of one instruction to
5290              bring it to the same units as the address costs.  */
5291           cost_symbol_ref /= COSTS_N_INSNS (1);
5292           /* The cost is then the cost of preparing the address,
5293              followed by an immediate (possibly 0) offset.  */
5294           return cost_symbol_ref + addr_cost->imm_offset;
5295         }
5296       else
5297         {
5298           /* This is most likely a jump table from a case
5299              statement.  */
5300           return addr_cost->register_offset;
5301         }
5302     }
5303
5304   switch (info.type)
5305     {
5306       case ADDRESS_LO_SUM:
5307       case ADDRESS_SYMBOLIC:
5308       case ADDRESS_REG_IMM:
5309         cost += addr_cost->imm_offset;
5310         break;
5311
5312       case ADDRESS_REG_WB:
5313         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5314           cost += addr_cost->pre_modify;
5315         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5316           cost += addr_cost->post_modify;
5317         else
5318           gcc_unreachable ();
5319
5320         break;
5321
5322       case ADDRESS_REG_REG:
5323         cost += addr_cost->register_offset;
5324         break;
5325
5326       case ADDRESS_REG_UXTW:
5327       case ADDRESS_REG_SXTW:
5328         cost += addr_cost->register_extend;
5329         break;
5330
5331       default:
5332         gcc_unreachable ();
5333     }
5334
5335
5336   if (info.shift > 0)
5337     {
5338       /* For the sake of calculating the cost of the shifted register
5339          component, we can treat same sized modes in the same way.  */
5340       switch (GET_MODE_BITSIZE (mode))
5341         {
5342           case 16:
5343             cost += addr_cost->addr_scale_costs.hi;
5344             break;
5345
5346           case 32:
5347             cost += addr_cost->addr_scale_costs.si;
5348             break;
5349
5350           case 64:
5351             cost += addr_cost->addr_scale_costs.di;
5352             break;
5353
5354           /* We can't tell, or this is a 128-bit vector.  */
5355           default:
5356             cost += addr_cost->addr_scale_costs.ti;
5357             break;
5358         }
5359     }
5360
5361   return cost;
5362 }
5363
5364 /* Return true if the RTX X in mode MODE is a zero or sign extract
5365    usable in an ADD or SUB (extended register) instruction.  */
5366 static bool
5367 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5368 {
5369   /* Catch add with a sign extract.
5370      This is add_<optab><mode>_multp2.  */
5371   if (GET_CODE (x) == SIGN_EXTRACT
5372       || GET_CODE (x) == ZERO_EXTRACT)
5373     {
5374       rtx op0 = XEXP (x, 0);
5375       rtx op1 = XEXP (x, 1);
5376       rtx op2 = XEXP (x, 2);
5377
5378       if (GET_CODE (op0) == MULT
5379           && CONST_INT_P (op1)
5380           && op2 == const0_rtx
5381           && CONST_INT_P (XEXP (op0, 1))
5382           && aarch64_is_extend_from_extract (mode,
5383                                              XEXP (op0, 1),
5384                                              op1))
5385         {
5386           return true;
5387         }
5388     }
5389
5390   return false;
5391 }
5392
5393 static bool
5394 aarch64_frint_unspec_p (unsigned int u)
5395 {
5396   switch (u)
5397     {
5398       case UNSPEC_FRINTZ:
5399       case UNSPEC_FRINTP:
5400       case UNSPEC_FRINTM:
5401       case UNSPEC_FRINTA:
5402       case UNSPEC_FRINTN:
5403       case UNSPEC_FRINTX:
5404       case UNSPEC_FRINTI:
5405         return true;
5406
5407       default:
5408         return false;
5409     }
5410 }
5411
5412 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5413    storing it in *COST.  Result is true if the total cost of the operation
5414    has now been calculated.  */
5415 static bool
5416 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5417 {
5418   rtx inner;
5419   rtx comparator;
5420   enum rtx_code cmpcode;
5421
5422   if (COMPARISON_P (op0))
5423     {
5424       inner = XEXP (op0, 0);
5425       comparator = XEXP (op0, 1);
5426       cmpcode = GET_CODE (op0);
5427     }
5428   else
5429     {
5430       inner = op0;
5431       comparator = const0_rtx;
5432       cmpcode = NE;
5433     }
5434
5435   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5436     {
5437       /* Conditional branch.  */
5438       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5439         return true;
5440       else
5441         {
5442           if (cmpcode == NE || cmpcode == EQ)
5443             {
5444               if (comparator == const0_rtx)
5445                 {
5446                   /* TBZ/TBNZ/CBZ/CBNZ.  */
5447                   if (GET_CODE (inner) == ZERO_EXTRACT)
5448                     /* TBZ/TBNZ.  */
5449                     *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5450                                        0, speed);
5451                 else
5452                   /* CBZ/CBNZ.  */
5453                   *cost += rtx_cost (inner, cmpcode, 0, speed);
5454
5455                 return true;
5456               }
5457             }
5458           else if (cmpcode == LT || cmpcode == GE)
5459             {
5460               /* TBZ/TBNZ.  */
5461               if (comparator == const0_rtx)
5462                 return true;
5463             }
5464         }
5465     }
5466   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5467     {
5468       /* It's a conditional operation based on the status flags,
5469          so it must be some flavor of CSEL.  */
5470
5471       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
5472       if (GET_CODE (op1) == NEG
5473           || GET_CODE (op1) == NOT
5474           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5475         op1 = XEXP (op1, 0);
5476
5477       *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5478       *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5479       return true;
5480     }
5481
5482   /* We don't know what this is, cost all operands.  */
5483   return false;
5484 }
5485
5486 /* Calculate the cost of calculating X, storing it in *COST.  Result
5487    is true if the total cost of the operation has now been calculated.  */
5488 static bool
5489 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5490                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5491 {
5492   rtx op0, op1, op2;
5493   const struct cpu_cost_table *extra_cost
5494     = aarch64_tune_params->insn_extra_cost;
5495   machine_mode mode = GET_MODE (x);
5496
5497   /* By default, assume that everything has equivalent cost to the
5498      cheapest instruction.  Any additional costs are applied as a delta
5499      above this default.  */
5500   *cost = COSTS_N_INSNS (1);
5501
5502   /* TODO: The cost infrastructure currently does not handle
5503      vector operations.  Assume that all vector operations
5504      are equally expensive.  */
5505   if (VECTOR_MODE_P (mode))
5506     {
5507       if (speed)
5508         *cost += extra_cost->vect.alu;
5509       return true;
5510     }
5511
5512   switch (code)
5513     {
5514     case SET:
5515       /* The cost depends entirely on the operands to SET.  */
5516       *cost = 0;
5517       op0 = SET_DEST (x);
5518       op1 = SET_SRC (x);
5519
5520       switch (GET_CODE (op0))
5521         {
5522         case MEM:
5523           if (speed)
5524             {
5525               rtx address = XEXP (op0, 0);
5526               if (GET_MODE_CLASS (mode) == MODE_INT)
5527                 *cost += extra_cost->ldst.store;
5528               else if (mode == SFmode)
5529                 *cost += extra_cost->ldst.storef;
5530               else if (mode == DFmode)
5531                 *cost += extra_cost->ldst.stored;
5532
5533               *cost +=
5534                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5535                                                      0, speed));
5536             }
5537
5538           *cost += rtx_cost (op1, SET, 1, speed);
5539           return true;
5540
5541         case SUBREG:
5542           if (! REG_P (SUBREG_REG (op0)))
5543             *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5544
5545           /* Fall through.  */
5546         case REG:
5547           /* const0_rtx is in general free, but we will use an
5548              instruction to set a register to 0.  */
5549           if (REG_P (op1) || op1 == const0_rtx)
5550             {
5551               /* The cost is 1 per register copied.  */
5552               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5553                               / UNITS_PER_WORD;
5554               *cost = COSTS_N_INSNS (n_minus_1 + 1);
5555             }
5556           else
5557             /* Cost is just the cost of the RHS of the set.  */
5558             *cost += rtx_cost (op1, SET, 1, speed);
5559           return true;
5560
5561         case ZERO_EXTRACT:
5562         case SIGN_EXTRACT:
5563           /* Bit-field insertion.  Strip any redundant widening of
5564              the RHS to meet the width of the target.  */
5565           if (GET_CODE (op1) == SUBREG)
5566             op1 = SUBREG_REG (op1);
5567           if ((GET_CODE (op1) == ZERO_EXTEND
5568                || GET_CODE (op1) == SIGN_EXTEND)
5569               && CONST_INT_P (XEXP (op0, 1))
5570               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5571                   >= INTVAL (XEXP (op0, 1))))
5572             op1 = XEXP (op1, 0);
5573
5574           if (CONST_INT_P (op1))
5575             {
5576               /* MOV immediate is assumed to always be cheap.  */
5577               *cost = COSTS_N_INSNS (1);
5578             }
5579           else
5580             {
5581               /* BFM.  */
5582               if (speed)
5583                 *cost += extra_cost->alu.bfi;
5584               *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5585             }
5586
5587           return true;
5588
5589         default:
5590           /* We can't make sense of this, assume default cost.  */
5591           *cost = COSTS_N_INSNS (1);
5592           return false;
5593         }
5594       return false;
5595
5596     case CONST_INT:
5597       /* If an instruction can incorporate a constant within the
5598          instruction, the instruction's expression avoids calling
5599          rtx_cost() on the constant.  If rtx_cost() is called on a
5600          constant, then it is usually because the constant must be
5601          moved into a register by one or more instructions.
5602
5603          The exception is constant 0, which can be expressed
5604          as XZR/WZR and is therefore free.  The exception to this is
5605          if we have (set (reg) (const0_rtx)) in which case we must cost
5606          the move.  However, we can catch that when we cost the SET, so
5607          we don't need to consider that here.  */
5608       if (x == const0_rtx)
5609         *cost = 0;
5610       else
5611         {
5612           /* To an approximation, building any other constant is
5613              proportionally expensive to the number of instructions
5614              required to build that constant.  This is true whether we
5615              are compiling for SPEED or otherwise.  */
5616           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5617                                  (NULL_RTX, x, false, mode));
5618         }
5619       return true;
5620
5621     case CONST_DOUBLE:
5622       if (speed)
5623         {
5624           /* mov[df,sf]_aarch64.  */
5625           if (aarch64_float_const_representable_p (x))
5626             /* FMOV (scalar immediate).  */
5627             *cost += extra_cost->fp[mode == DFmode].fpconst;
5628           else if (!aarch64_float_const_zero_rtx_p (x))
5629             {
5630               /* This will be a load from memory.  */
5631               if (mode == DFmode)
5632                 *cost += extra_cost->ldst.loadd;
5633               else
5634                 *cost += extra_cost->ldst.loadf;
5635             }
5636           else
5637             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
5638                or MOV v0.s[0], wzr - neither of which are modeled by the
5639                cost tables.  Just use the default cost.  */
5640             {
5641             }
5642         }
5643
5644       return true;
5645
5646     case MEM:
5647       if (speed)
5648         {
5649           /* For loads we want the base cost of a load, plus an
5650              approximation for the additional cost of the addressing
5651              mode.  */
5652           rtx address = XEXP (x, 0);
5653           if (GET_MODE_CLASS (mode) == MODE_INT)
5654             *cost += extra_cost->ldst.load;
5655           else if (mode == SFmode)
5656             *cost += extra_cost->ldst.loadf;
5657           else if (mode == DFmode)
5658             *cost += extra_cost->ldst.loadd;
5659
5660           *cost +=
5661                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5662                                                      0, speed));
5663         }
5664
5665       return true;
5666
5667     case NEG:
5668       op0 = XEXP (x, 0);
5669
5670       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5671        {
5672           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5673               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5674             {
5675               /* CSETM.  */
5676               *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5677               return true;
5678             }
5679
5680           /* Cost this as SUB wzr, X.  */
5681           op0 = CONST0_RTX (GET_MODE (x));
5682           op1 = XEXP (x, 0);
5683           goto cost_minus;
5684         }
5685
5686       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5687         {
5688           /* Support (neg(fma...)) as a single instruction only if
5689              sign of zeros is unimportant.  This matches the decision
5690              making in aarch64.md.  */
5691           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5692             {
5693               /* FNMADD.  */
5694               *cost = rtx_cost (op0, NEG, 0, speed);
5695               return true;
5696             }
5697           if (speed)
5698             /* FNEG.  */
5699             *cost += extra_cost->fp[mode == DFmode].neg;
5700           return false;
5701         }
5702
5703       return false;
5704
5705     case CLRSB:
5706     case CLZ:
5707       if (speed)
5708         *cost += extra_cost->alu.clz;
5709
5710       return false;
5711
5712     case COMPARE:
5713       op0 = XEXP (x, 0);
5714       op1 = XEXP (x, 1);
5715
5716       if (op1 == const0_rtx
5717           && GET_CODE (op0) == AND)
5718         {
5719           x = op0;
5720           goto cost_logic;
5721         }
5722
5723       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5724         {
5725           /* TODO: A write to the CC flags possibly costs extra, this
5726              needs encoding in the cost tables.  */
5727
5728           /* CC_ZESWPmode supports zero extend for free.  */
5729           if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5730             op0 = XEXP (op0, 0);
5731
5732           /* ANDS.  */
5733           if (GET_CODE (op0) == AND)
5734             {
5735               x = op0;
5736               goto cost_logic;
5737             }
5738
5739           if (GET_CODE (op0) == PLUS)
5740             {
5741               /* ADDS (and CMN alias).  */
5742               x = op0;
5743               goto cost_plus;
5744             }
5745
5746           if (GET_CODE (op0) == MINUS)
5747             {
5748               /* SUBS.  */
5749               x = op0;
5750               goto cost_minus;
5751             }
5752
5753           if (GET_CODE (op1) == NEG)
5754             {
5755               /* CMN.  */
5756               if (speed)
5757                 *cost += extra_cost->alu.arith;
5758
5759               *cost += rtx_cost (op0, COMPARE, 0, speed);
5760               *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5761               return true;
5762             }
5763
5764           /* CMP.
5765
5766              Compare can freely swap the order of operands, and
5767              canonicalization puts the more complex operation first.
5768              But the integer MINUS logic expects the shift/extend
5769              operation in op1.  */
5770           if (! (REG_P (op0)
5771                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5772           {
5773             op0 = XEXP (x, 1);
5774             op1 = XEXP (x, 0);
5775           }
5776           goto cost_minus;
5777         }
5778
5779       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5780         {
5781           /* FCMP.  */
5782           if (speed)
5783             *cost += extra_cost->fp[mode == DFmode].compare;
5784
5785           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5786             {
5787               /* FCMP supports constant 0.0 for no extra cost. */
5788               return true;
5789             }
5790           return false;
5791         }
5792
5793       return false;
5794
5795     case MINUS:
5796       {
5797         op0 = XEXP (x, 0);
5798         op1 = XEXP (x, 1);
5799
5800 cost_minus:
5801         /* Detect valid immediates.  */
5802         if ((GET_MODE_CLASS (mode) == MODE_INT
5803              || (GET_MODE_CLASS (mode) == MODE_CC
5804                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5805             && CONST_INT_P (op1)
5806             && aarch64_uimm12_shift (INTVAL (op1)))
5807           {
5808             *cost += rtx_cost (op0, MINUS, 0, speed);
5809
5810             if (speed)
5811               /* SUB(S) (immediate).  */
5812               *cost += extra_cost->alu.arith;
5813             return true;
5814
5815           }
5816
5817         /* Look for SUB (extended register).  */
5818         if (aarch64_rtx_arith_op_extract_p (op1, mode))
5819           {
5820             if (speed)
5821               *cost += extra_cost->alu.arith_shift;
5822
5823             *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5824                                (enum rtx_code) GET_CODE (op1),
5825                                0, speed);
5826             return true;
5827           }
5828
5829         rtx new_op1 = aarch64_strip_extend (op1);
5830
5831         /* Cost this as an FMA-alike operation.  */
5832         if ((GET_CODE (new_op1) == MULT
5833              || GET_CODE (new_op1) == ASHIFT)
5834             && code != COMPARE)
5835           {
5836             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5837                                             (enum rtx_code) code,
5838                                             speed);
5839             *cost += rtx_cost (op0, MINUS, 0, speed);
5840             return true;
5841           }
5842
5843         *cost += rtx_cost (new_op1, MINUS, 1, speed);
5844
5845         if (speed)
5846           {
5847             if (GET_MODE_CLASS (mode) == MODE_INT)
5848               /* SUB(S).  */
5849               *cost += extra_cost->alu.arith;
5850             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5851               /* FSUB.  */
5852               *cost += extra_cost->fp[mode == DFmode].addsub;
5853           }
5854         return true;
5855       }
5856
5857     case PLUS:
5858       {
5859         rtx new_op0;
5860
5861         op0 = XEXP (x, 0);
5862         op1 = XEXP (x, 1);
5863
5864 cost_plus:
5865         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5866             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5867           {
5868             /* CSINC.  */
5869             *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5870             *cost += rtx_cost (op1, PLUS, 1, speed);
5871             return true;
5872           }
5873
5874         if (GET_MODE_CLASS (mode) == MODE_INT
5875             && CONST_INT_P (op1)
5876             && aarch64_uimm12_shift (INTVAL (op1)))
5877           {
5878             *cost += rtx_cost (op0, PLUS, 0, speed);
5879
5880             if (speed)
5881               /* ADD (immediate).  */
5882               *cost += extra_cost->alu.arith;
5883             return true;
5884           }
5885
5886         /* Look for ADD (extended register).  */
5887         if (aarch64_rtx_arith_op_extract_p (op0, mode))
5888           {
5889             if (speed)
5890               *cost += extra_cost->alu.arith_shift;
5891
5892             *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5893                                (enum rtx_code) GET_CODE (op0),
5894                                0, speed);
5895             return true;
5896           }
5897
5898         /* Strip any extend, leave shifts behind as we will
5899            cost them through mult_cost.  */
5900         new_op0 = aarch64_strip_extend (op0);
5901
5902         if (GET_CODE (new_op0) == MULT
5903             || GET_CODE (new_op0) == ASHIFT)
5904           {
5905             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5906                                             speed);
5907             *cost += rtx_cost (op1, PLUS, 1, speed);
5908             return true;
5909           }
5910
5911         *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5912                   + rtx_cost (op1, PLUS, 1, speed));
5913
5914         if (speed)
5915           {
5916             if (GET_MODE_CLASS (mode) == MODE_INT)
5917               /* ADD.  */
5918               *cost += extra_cost->alu.arith;
5919             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5920               /* FADD.  */
5921               *cost += extra_cost->fp[mode == DFmode].addsub;
5922           }
5923         return true;
5924       }
5925
5926     case BSWAP:
5927       *cost = COSTS_N_INSNS (1);
5928
5929       if (speed)
5930         *cost += extra_cost->alu.rev;
5931
5932       return false;
5933
5934     case IOR:
5935       if (aarch_rev16_p (x))
5936         {
5937           *cost = COSTS_N_INSNS (1);
5938
5939           if (speed)
5940             *cost += extra_cost->alu.rev;
5941
5942           return true;
5943         }
5944     /* Fall through.  */
5945     case XOR:
5946     case AND:
5947     cost_logic:
5948       op0 = XEXP (x, 0);
5949       op1 = XEXP (x, 1);
5950
5951       if (code == AND
5952           && GET_CODE (op0) == MULT
5953           && CONST_INT_P (XEXP (op0, 1))
5954           && CONST_INT_P (op1)
5955           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5956                                INTVAL (op1)) != 0)
5957         {
5958           /* This is a UBFM/SBFM.  */
5959           *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5960           if (speed)
5961             *cost += extra_cost->alu.bfx;
5962           return true;
5963         }
5964
5965       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5966         {
5967           /* We possibly get the immediate for free, this is not
5968              modelled.  */
5969           if (CONST_INT_P (op1)
5970               && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5971             {
5972               *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5973
5974               if (speed)
5975                 *cost += extra_cost->alu.logical;
5976
5977               return true;
5978             }
5979           else
5980             {
5981               rtx new_op0 = op0;
5982
5983               /* Handle ORN, EON, or BIC.  */
5984               if (GET_CODE (op0) == NOT)
5985                 op0 = XEXP (op0, 0);
5986
5987               new_op0 = aarch64_strip_shift (op0);
5988
5989               /* If we had a shift on op0 then this is a logical-shift-
5990                  by-register/immediate operation.  Otherwise, this is just
5991                  a logical operation.  */
5992               if (speed)
5993                 {
5994                   if (new_op0 != op0)
5995                     {
5996                       /* Shift by immediate.  */
5997                       if (CONST_INT_P (XEXP (op0, 1)))
5998                         *cost += extra_cost->alu.log_shift;
5999                       else
6000                         *cost += extra_cost->alu.log_shift_reg;
6001                     }
6002                   else
6003                     *cost += extra_cost->alu.logical;
6004                 }
6005
6006               /* In both cases we want to cost both operands.  */
6007               *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6008                        + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6009
6010               return true;
6011             }
6012         }
6013       return false;
6014
6015     case NOT:
6016       /* MVN.  */
6017       if (speed)
6018         *cost += extra_cost->alu.logical;
6019
6020       /* The logical instruction could have the shifted register form,
6021          but the cost is the same if the shift is processed as a separate
6022          instruction, so we don't bother with it here.  */
6023       return false;
6024
6025     case ZERO_EXTEND:
6026
6027       op0 = XEXP (x, 0);
6028       /* If a value is written in SI mode, then zero extended to DI
6029          mode, the operation will in general be free as a write to
6030          a 'w' register implicitly zeroes the upper bits of an 'x'
6031          register.  However, if this is
6032
6033            (set (reg) (zero_extend (reg)))
6034
6035          we must cost the explicit register move.  */
6036       if (mode == DImode
6037           && GET_MODE (op0) == SImode
6038           && outer == SET)
6039         {
6040           int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6041
6042           if (!op_cost && speed)
6043             /* MOV.  */
6044             *cost += extra_cost->alu.extend;
6045           else
6046             /* Free, the cost is that of the SI mode operation.  */
6047             *cost = op_cost;
6048
6049           return true;
6050         }
6051       else if (MEM_P (XEXP (x, 0)))
6052         {
6053           /* All loads can zero extend to any size for free.  */
6054           *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6055           return true;
6056         }
6057
6058       /* UXTB/UXTH.  */
6059       if (speed)
6060         *cost += extra_cost->alu.extend;
6061
6062       return false;
6063
6064     case SIGN_EXTEND:
6065       if (MEM_P (XEXP (x, 0)))
6066         {
6067           /* LDRSH.  */
6068           if (speed)
6069             {
6070               rtx address = XEXP (XEXP (x, 0), 0);
6071               *cost += extra_cost->ldst.load_sign_extend;
6072
6073               *cost +=
6074                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6075                                                      0, speed));
6076             }
6077           return true;
6078         }
6079
6080       if (speed)
6081         *cost += extra_cost->alu.extend;
6082       return false;
6083
6084     case ASHIFT:
6085       op0 = XEXP (x, 0);
6086       op1 = XEXP (x, 1);
6087
6088       if (CONST_INT_P (op1))
6089         {
6090           /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
6091              aliases.  */
6092           if (speed)
6093             *cost += extra_cost->alu.shift;
6094
6095           /* We can incorporate zero/sign extend for free.  */
6096           if (GET_CODE (op0) == ZERO_EXTEND
6097               || GET_CODE (op0) == SIGN_EXTEND)
6098             op0 = XEXP (op0, 0);
6099
6100           *cost += rtx_cost (op0, ASHIFT, 0, speed);
6101           return true;
6102         }
6103       else
6104         {
6105           /* LSLV.  */
6106           if (speed)
6107             *cost += extra_cost->alu.shift_reg;
6108
6109           return false;  /* All arguments need to be in registers.  */
6110         }
6111
6112     case ROTATE:
6113     case ROTATERT:
6114     case LSHIFTRT:
6115     case ASHIFTRT:
6116       op0 = XEXP (x, 0);
6117       op1 = XEXP (x, 1);
6118
6119       if (CONST_INT_P (op1))
6120         {
6121           /* ASR (immediate) and friends.  */
6122           if (speed)
6123             *cost += extra_cost->alu.shift;
6124
6125           *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6126           return true;
6127         }
6128       else
6129         {
6130
6131           /* ASR (register) and friends.  */
6132           if (speed)
6133             *cost += extra_cost->alu.shift_reg;
6134
6135           return false;  /* All arguments need to be in registers.  */
6136         }
6137
6138     case SYMBOL_REF:
6139
6140       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6141         {
6142           /* LDR.  */
6143           if (speed)
6144             *cost += extra_cost->ldst.load;
6145         }
6146       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6147                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6148         {
6149           /* ADRP, followed by ADD.  */
6150           *cost += COSTS_N_INSNS (1);
6151           if (speed)
6152             *cost += 2 * extra_cost->alu.arith;
6153         }
6154       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6155                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6156         {
6157           /* ADR.  */
6158           if (speed)
6159             *cost += extra_cost->alu.arith;
6160         }
6161
6162       if (flag_pic)
6163         {
6164           /* One extra load instruction, after accessing the GOT.  */
6165           *cost += COSTS_N_INSNS (1);
6166           if (speed)
6167             *cost += extra_cost->ldst.load;
6168         }
6169       return true;
6170
6171     case HIGH:
6172     case LO_SUM:
6173       /* ADRP/ADD (immediate).  */
6174       if (speed)
6175         *cost += extra_cost->alu.arith;
6176       return true;
6177
6178     case ZERO_EXTRACT:
6179     case SIGN_EXTRACT:
6180       /* UBFX/SBFX.  */
6181       if (speed)
6182         *cost += extra_cost->alu.bfx;
6183
6184       /* We can trust that the immediates used will be correct (there
6185          are no by-register forms), so we need only cost op0.  */
6186       *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6187       return true;
6188
6189     case MULT:
6190       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6191       /* aarch64_rtx_mult_cost always handles recursion to its
6192          operands.  */
6193       return true;
6194
6195     case MOD:
6196     case UMOD:
6197       if (speed)
6198         {
6199           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6200             *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6201                       + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6202           else if (GET_MODE (x) == DFmode)
6203             *cost += (extra_cost->fp[1].mult
6204                       + extra_cost->fp[1].div);
6205           else if (GET_MODE (x) == SFmode)
6206             *cost += (extra_cost->fp[0].mult
6207                       + extra_cost->fp[0].div);
6208         }
6209       return false;  /* All arguments need to be in registers.  */
6210
6211     case DIV:
6212     case UDIV:
6213     case SQRT:
6214       if (speed)
6215         {
6216           if (GET_MODE_CLASS (mode) == MODE_INT)
6217             /* There is no integer SQRT, so only DIV and UDIV can get
6218                here.  */
6219             *cost += extra_cost->mult[mode == DImode].idiv;
6220           else
6221             *cost += extra_cost->fp[mode == DFmode].div;
6222         }
6223       return false;  /* All arguments need to be in registers.  */
6224
6225     case IF_THEN_ELSE:
6226       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6227                                          XEXP (x, 2), cost, speed);
6228
6229     case EQ:
6230     case NE:
6231     case GT:
6232     case GTU:
6233     case LT:
6234     case LTU:
6235     case GE:
6236     case GEU:
6237     case LE:
6238     case LEU:
6239
6240       return false; /* All arguments must be in registers.  */
6241
6242     case FMA:
6243       op0 = XEXP (x, 0);
6244       op1 = XEXP (x, 1);
6245       op2 = XEXP (x, 2);
6246
6247       if (speed)
6248         *cost += extra_cost->fp[mode == DFmode].fma;
6249
6250       /* FMSUB, FNMADD, and FNMSUB are free.  */
6251       if (GET_CODE (op0) == NEG)
6252         op0 = XEXP (op0, 0);
6253
6254       if (GET_CODE (op2) == NEG)
6255         op2 = XEXP (op2, 0);
6256
6257       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6258          and the by-element operand as operand 0.  */
6259       if (GET_CODE (op1) == NEG)
6260         op1 = XEXP (op1, 0);
6261
6262       /* Catch vector-by-element operations.  The by-element operand can
6263          either be (vec_duplicate (vec_select (x))) or just
6264          (vec_select (x)), depending on whether we are multiplying by
6265          a vector or a scalar.
6266
6267          Canonicalization is not very good in these cases, FMA4 will put the
6268          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
6269       if (GET_CODE (op0) == VEC_DUPLICATE)
6270         op0 = XEXP (op0, 0);
6271       else if (GET_CODE (op1) == VEC_DUPLICATE)
6272         op1 = XEXP (op1, 0);
6273
6274       if (GET_CODE (op0) == VEC_SELECT)
6275         op0 = XEXP (op0, 0);
6276       else if (GET_CODE (op1) == VEC_SELECT)
6277         op1 = XEXP (op1, 0);
6278
6279       /* If the remaining parameters are not registers,
6280          get the cost to put them into registers.  */
6281       *cost += rtx_cost (op0, FMA, 0, speed);
6282       *cost += rtx_cost (op1, FMA, 1, speed);
6283       *cost += rtx_cost (op2, FMA, 2, speed);
6284       return true;
6285
6286     case FLOAT_EXTEND:
6287       if (speed)
6288         *cost += extra_cost->fp[mode == DFmode].widen;
6289       return false;
6290
6291     case FLOAT_TRUNCATE:
6292       if (speed)
6293         *cost += extra_cost->fp[mode == DFmode].narrow;
6294       return false;
6295
6296     case FIX:
6297     case UNSIGNED_FIX:
6298       x = XEXP (x, 0);
6299       /* Strip the rounding part.  They will all be implemented
6300          by the fcvt* family of instructions anyway.  */
6301       if (GET_CODE (x) == UNSPEC)
6302         {
6303           unsigned int uns_code = XINT (x, 1);
6304
6305           if (uns_code == UNSPEC_FRINTA
6306               || uns_code == UNSPEC_FRINTM
6307               || uns_code == UNSPEC_FRINTN
6308               || uns_code == UNSPEC_FRINTP
6309               || uns_code == UNSPEC_FRINTZ)
6310             x = XVECEXP (x, 0, 0);
6311         }
6312
6313       if (speed)
6314         *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6315
6316       *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6317       return true;
6318
6319     case ABS:
6320       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6321         {
6322           /* FABS and FNEG are analogous.  */
6323           if (speed)
6324             *cost += extra_cost->fp[mode == DFmode].neg;
6325         }
6326       else
6327         {
6328           /* Integer ABS will either be split to
6329              two arithmetic instructions, or will be an ABS
6330              (scalar), which we don't model.  */
6331           *cost = COSTS_N_INSNS (2);
6332           if (speed)
6333             *cost += 2 * extra_cost->alu.arith;
6334         }
6335       return false;
6336
6337     case SMAX:
6338     case SMIN:
6339       if (speed)
6340         {
6341           /* FMAXNM/FMINNM/FMAX/FMIN.
6342              TODO: This may not be accurate for all implementations, but
6343              we do not model this in the cost tables.  */
6344           *cost += extra_cost->fp[mode == DFmode].addsub;
6345         }
6346       return false;
6347
6348     case UNSPEC:
6349       /* The floating point round to integer frint* instructions.  */
6350       if (aarch64_frint_unspec_p (XINT (x, 1)))
6351         {
6352           if (speed)
6353             *cost += extra_cost->fp[mode == DFmode].roundint;
6354
6355           return false;
6356         }
6357
6358       if (XINT (x, 1) == UNSPEC_RBIT)
6359         {
6360           if (speed)
6361             *cost += extra_cost->alu.rev;
6362
6363           return false;
6364         }
6365       break;
6366
6367     case TRUNCATE:
6368
6369       /* Decompose <su>muldi3_highpart.  */
6370       if (/* (truncate:DI  */
6371           mode == DImode
6372           /*   (lshiftrt:TI  */
6373           && GET_MODE (XEXP (x, 0)) == TImode
6374           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6375           /*      (mult:TI  */
6376           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6377           /*        (ANY_EXTEND:TI (reg:DI))
6378                     (ANY_EXTEND:TI (reg:DI)))  */
6379           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6380                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6381               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6382                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6383           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6384           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6385           /*     (const_int 64)  */
6386           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6387           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6388         {
6389           /* UMULH/SMULH.  */
6390           if (speed)
6391             *cost += extra_cost->mult[mode == DImode].extend;
6392           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6393                              MULT, 0, speed);
6394           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6395                              MULT, 1, speed);
6396           return true;
6397         }
6398
6399       /* Fall through.  */
6400     default:
6401       break;
6402     }
6403
6404   if (dump_file && (dump_flags & TDF_DETAILS))
6405     fprintf (dump_file,
6406       "\nFailed to cost RTX.  Assuming default cost.\n");
6407
6408   return true;
6409 }
6410
6411 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6412    calculated for X.  This cost is stored in *COST.  Returns true
6413    if the total cost of X was calculated.  */
6414 static bool
6415 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6416                    int param, int *cost, bool speed)
6417 {
6418   bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6419
6420   if (dump_file && (dump_flags & TDF_DETAILS))
6421     {
6422       print_rtl_single (dump_file, x);
6423       fprintf (dump_file, "\n%s cost: %d (%s)\n",
6424                speed ? "Hot" : "Cold",
6425                *cost, result ? "final" : "partial");
6426     }
6427
6428   return result;
6429 }
6430
6431 static int
6432 aarch64_register_move_cost (machine_mode mode,
6433                             reg_class_t from_i, reg_class_t to_i)
6434 {
6435   enum reg_class from = (enum reg_class) from_i;
6436   enum reg_class to = (enum reg_class) to_i;
6437   const struct cpu_regmove_cost *regmove_cost
6438     = aarch64_tune_params->regmove_cost;
6439
6440   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
6441   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6442     to = GENERAL_REGS;
6443
6444   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6445     from = GENERAL_REGS;
6446
6447   /* Moving between GPR and stack cost is the same as GP2GP.  */
6448   if ((from == GENERAL_REGS && to == STACK_REG)
6449       || (to == GENERAL_REGS && from == STACK_REG))
6450     return regmove_cost->GP2GP;
6451
6452   /* To/From the stack register, we move via the gprs.  */
6453   if (to == STACK_REG || from == STACK_REG)
6454     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6455             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6456
6457   if (GET_MODE_SIZE (mode) == 16)
6458     {
6459       /* 128-bit operations on general registers require 2 instructions.  */
6460       if (from == GENERAL_REGS && to == GENERAL_REGS)
6461         return regmove_cost->GP2GP * 2;
6462       else if (from == GENERAL_REGS)
6463         return regmove_cost->GP2FP * 2;
6464       else if (to == GENERAL_REGS)
6465         return regmove_cost->FP2GP * 2;
6466
6467       /* When AdvSIMD instructions are disabled it is not possible to move
6468          a 128-bit value directly between Q registers.  This is handled in
6469          secondary reload.  A general register is used as a scratch to move
6470          the upper DI value and the lower DI value is moved directly,
6471          hence the cost is the sum of three moves. */
6472       if (! TARGET_SIMD)
6473         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6474
6475       return regmove_cost->FP2FP;
6476     }
6477
6478   if (from == GENERAL_REGS && to == GENERAL_REGS)
6479     return regmove_cost->GP2GP;
6480   else if (from == GENERAL_REGS)
6481     return regmove_cost->GP2FP;
6482   else if (to == GENERAL_REGS)
6483     return regmove_cost->FP2GP;
6484
6485   return regmove_cost->FP2FP;
6486 }
6487
6488 static int
6489 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6490                           reg_class_t rclass ATTRIBUTE_UNUSED,
6491                           bool in ATTRIBUTE_UNUSED)
6492 {
6493   return aarch64_tune_params->memmov_cost;
6494 }
6495
6496 /* Return the number of instructions that can be issued per cycle.  */
6497 static int
6498 aarch64_sched_issue_rate (void)
6499 {
6500   return aarch64_tune_params->issue_rate;
6501 }
6502
6503 static int
6504 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6505 {
6506   int issue_rate = aarch64_sched_issue_rate ();
6507
6508   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6509 }
6510
6511 /* Vectorizer cost model target hooks.  */
6512
6513 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
6514 static int
6515 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6516                                     tree vectype,
6517                                     int misalign ATTRIBUTE_UNUSED)
6518 {
6519   unsigned elements;
6520
6521   switch (type_of_cost)
6522     {
6523       case scalar_stmt:
6524         return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6525
6526       case scalar_load:
6527         return aarch64_tune_params->vec_costs->scalar_load_cost;
6528
6529       case scalar_store:
6530         return aarch64_tune_params->vec_costs->scalar_store_cost;
6531
6532       case vector_stmt:
6533         return aarch64_tune_params->vec_costs->vec_stmt_cost;
6534
6535       case vector_load:
6536         return aarch64_tune_params->vec_costs->vec_align_load_cost;
6537
6538       case vector_store:
6539         return aarch64_tune_params->vec_costs->vec_store_cost;
6540
6541       case vec_to_scalar:
6542         return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6543
6544       case scalar_to_vec:
6545         return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6546
6547       case unaligned_load:
6548         return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6549
6550       case unaligned_store:
6551         return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6552
6553       case cond_branch_taken:
6554         return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6555
6556       case cond_branch_not_taken:
6557         return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6558
6559       case vec_perm:
6560       case vec_promote_demote:
6561         return aarch64_tune_params->vec_costs->vec_stmt_cost;
6562
6563       case vec_construct:
6564         elements = TYPE_VECTOR_SUBPARTS (vectype);
6565         return elements / 2 + 1;
6566
6567       default:
6568         gcc_unreachable ();
6569     }
6570 }
6571
6572 /* Implement targetm.vectorize.add_stmt_cost.  */
6573 static unsigned
6574 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6575                        struct _stmt_vec_info *stmt_info, int misalign,
6576                        enum vect_cost_model_location where)
6577 {
6578   unsigned *cost = (unsigned *) data;
6579   unsigned retval = 0;
6580
6581   if (flag_vect_cost_model)
6582     {
6583       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6584       int stmt_cost =
6585             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6586
6587       /* Statements in an inner loop relative to the loop being
6588          vectorized are weighted more heavily.  The value here is
6589          a function (linear for now) of the loop nest level.  */
6590       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6591         {
6592           loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6593           struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
6594           unsigned nest_level = loop_depth (loop);
6595
6596           count *= nest_level;
6597         }
6598
6599       retval = (unsigned) (count * stmt_cost);
6600       cost[where] += retval;
6601     }
6602
6603   return retval;
6604 }
6605
6606 static void initialize_aarch64_code_model (void);
6607
6608 /* Parse the architecture extension string.  */
6609
6610 static void
6611 aarch64_parse_extension (char *str)
6612 {
6613   /* The extension string is parsed left to right.  */
6614   const struct aarch64_option_extension *opt = NULL;
6615
6616   /* Flag to say whether we are adding or removing an extension.  */
6617   int adding_ext = -1;
6618
6619   while (str != NULL && *str != 0)
6620     {
6621       char *ext;
6622       size_t len;
6623
6624       str++;
6625       ext = strchr (str, '+');
6626
6627       if (ext != NULL)
6628         len = ext - str;
6629       else
6630         len = strlen (str);
6631
6632       if (len >= 2 && strncmp (str, "no", 2) == 0)
6633         {
6634           adding_ext = 0;
6635           len -= 2;
6636           str += 2;
6637         }
6638       else if (len > 0)
6639         adding_ext = 1;
6640
6641       if (len == 0)
6642         {
6643           error ("missing feature modifier after %qs", adding_ext ? "+"
6644                                                                   : "+no");
6645           return;
6646         }
6647
6648       /* Scan over the extensions table trying to find an exact match.  */
6649       for (opt = all_extensions; opt->name != NULL; opt++)
6650         {
6651           if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6652             {
6653               /* Add or remove the extension.  */
6654               if (adding_ext)
6655                 aarch64_isa_flags |= opt->flags_on;
6656               else
6657                 aarch64_isa_flags &= ~(opt->flags_off);
6658               break;
6659             }
6660         }
6661
6662       if (opt->name == NULL)
6663         {
6664           /* Extension not found in list.  */
6665           error ("unknown feature modifier %qs", str);
6666           return;
6667         }
6668
6669       str = ext;
6670     };
6671
6672   return;
6673 }
6674
6675 /* Parse the ARCH string.  */
6676
6677 static void
6678 aarch64_parse_arch (void)
6679 {
6680   char *ext;
6681   const struct processor *arch;
6682   char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6683   size_t len;
6684
6685   strcpy (str, aarch64_arch_string);
6686
6687   ext = strchr (str, '+');
6688
6689   if (ext != NULL)
6690     len = ext - str;
6691   else
6692     len = strlen (str);
6693
6694   if (len == 0)
6695     {
6696       error ("missing arch name in -march=%qs", str);
6697       return;
6698     }
6699
6700   /* Loop through the list of supported ARCHs to find a match.  */
6701   for (arch = all_architectures; arch->name != NULL; arch++)
6702     {
6703       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6704         {
6705           selected_arch = arch;
6706           aarch64_isa_flags = selected_arch->flags;
6707
6708           if (!selected_cpu)
6709             selected_cpu = &all_cores[selected_arch->core];
6710
6711           if (ext != NULL)
6712             {
6713               /* ARCH string contains at least one extension.  */
6714               aarch64_parse_extension (ext);
6715             }
6716
6717           if (strcmp (selected_arch->arch, selected_cpu->arch))
6718             {
6719               warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6720                        selected_cpu->name, selected_arch->name);
6721             }
6722
6723           return;
6724         }
6725     }
6726
6727   /* ARCH name not found in list.  */
6728   error ("unknown value %qs for -march", str);
6729   return;
6730 }
6731
6732 /* Parse the CPU string.  */
6733
6734 static void
6735 aarch64_parse_cpu (void)
6736 {
6737   char *ext;
6738   const struct processor *cpu;
6739   char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6740   size_t len;
6741
6742   strcpy (str, aarch64_cpu_string);
6743
6744   ext = strchr (str, '+');
6745
6746   if (ext != NULL)
6747     len = ext - str;
6748   else
6749     len = strlen (str);
6750
6751   if (len == 0)
6752     {
6753       error ("missing cpu name in -mcpu=%qs", str);
6754       return;
6755     }
6756
6757   /* Loop through the list of supported CPUs to find a match.  */
6758   for (cpu = all_cores; cpu->name != NULL; cpu++)
6759     {
6760       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6761         {
6762           selected_cpu = cpu;
6763           aarch64_isa_flags = selected_cpu->flags;
6764
6765           if (ext != NULL)
6766             {
6767               /* CPU string contains at least one extension.  */
6768               aarch64_parse_extension (ext);
6769             }
6770
6771           return;
6772         }
6773     }
6774
6775   /* CPU name not found in list.  */
6776   error ("unknown value %qs for -mcpu", str);
6777   return;
6778 }
6779
6780 /* Parse the TUNE string.  */
6781
6782 static void
6783 aarch64_parse_tune (void)
6784 {
6785   const struct processor *cpu;
6786   char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6787   strcpy (str, aarch64_tune_string);
6788
6789   /* Loop through the list of supported CPUs to find a match.  */
6790   for (cpu = all_cores; cpu->name != NULL; cpu++)
6791     {
6792       if (strcmp (cpu->name, str) == 0)
6793         {
6794           selected_tune = cpu;
6795           return;
6796         }
6797     }
6798
6799   /* CPU name not found in list.  */
6800   error ("unknown value %qs for -mtune", str);
6801   return;
6802 }
6803
6804
6805 /* Implement TARGET_OPTION_OVERRIDE.  */
6806
6807 static void
6808 aarch64_override_options (void)
6809 {
6810   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6811      If either of -march or -mtune is given, they override their
6812      respective component of -mcpu.
6813
6814      So, first parse AARCH64_CPU_STRING, then the others, be careful
6815      with -march as, if -mcpu is not present on the command line, march
6816      must set a sensible default CPU.  */
6817   if (aarch64_cpu_string)
6818     {
6819       aarch64_parse_cpu ();
6820     }
6821
6822   if (aarch64_arch_string)
6823     {
6824       aarch64_parse_arch ();
6825     }
6826
6827   if (aarch64_tune_string)
6828     {
6829       aarch64_parse_tune ();
6830     }
6831
6832 #ifndef HAVE_AS_MABI_OPTION
6833   /* The compiler may have been configured with 2.23.* binutils, which does
6834      not have support for ILP32.  */
6835   if (TARGET_ILP32)
6836     error ("Assembler does not support -mabi=ilp32");
6837 #endif
6838
6839   initialize_aarch64_code_model ();
6840
6841   aarch64_build_bitmask_table ();
6842
6843   /* This target defaults to strict volatile bitfields.  */
6844   if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6845     flag_strict_volatile_bitfields = 1;
6846
6847   /* If the user did not specify a processor, choose the default
6848      one for them.  This will be the CPU set during configuration using
6849      --with-cpu, otherwise it is "generic".  */
6850   if (!selected_cpu)
6851     {
6852       selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6853       aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6854     }
6855
6856   gcc_assert (selected_cpu);
6857
6858   if (!selected_tune)
6859     selected_tune = selected_cpu;
6860
6861   aarch64_tune_flags = selected_tune->flags;
6862   aarch64_tune = selected_tune->core;
6863   aarch64_tune_params = selected_tune->tune;
6864   aarch64_architecture_version = selected_cpu->architecture_version;
6865
6866   if (aarch64_fix_a53_err835769 == 2)
6867     {
6868 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6869       aarch64_fix_a53_err835769 = 1;
6870 #else
6871       aarch64_fix_a53_err835769 = 0;
6872 #endif
6873     }
6874
6875   /* If not opzimizing for size, set the default
6876      alignment to what the target wants */
6877   if (!optimize_size)
6878     {
6879       if (align_loops <= 0)
6880         align_loops = aarch64_tune_params->loop_align;
6881       if (align_jumps <= 0)
6882         align_jumps = aarch64_tune_params->jump_align;
6883       if (align_functions <= 0)
6884         align_functions = aarch64_tune_params->function_align;
6885     }
6886
6887   aarch64_override_options_after_change ();
6888 }
6889
6890 /* Implement targetm.override_options_after_change.  */
6891
6892 static void
6893 aarch64_override_options_after_change (void)
6894 {
6895   if (flag_omit_frame_pointer)
6896     flag_omit_leaf_frame_pointer = false;
6897   else if (flag_omit_leaf_frame_pointer)
6898     flag_omit_frame_pointer = true;
6899 }
6900
6901 static struct machine_function *
6902 aarch64_init_machine_status (void)
6903 {
6904   struct machine_function *machine;
6905   machine = ggc_cleared_alloc<machine_function> ();
6906   return machine;
6907 }
6908
6909 void
6910 aarch64_init_expanders (void)
6911 {
6912   init_machine_status = aarch64_init_machine_status;
6913 }
6914
6915 /* A checking mechanism for the implementation of the various code models.  */
6916 static void
6917 initialize_aarch64_code_model (void)
6918 {
6919    if (flag_pic)
6920      {
6921        switch (aarch64_cmodel_var)
6922          {
6923          case AARCH64_CMODEL_TINY:
6924            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6925            break;
6926          case AARCH64_CMODEL_SMALL:
6927            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6928            break;
6929          case AARCH64_CMODEL_LARGE:
6930            sorry ("code model %qs with -f%s", "large",
6931                   flag_pic > 1 ? "PIC" : "pic");
6932          default:
6933            gcc_unreachable ();
6934          }
6935      }
6936    else
6937      aarch64_cmodel = aarch64_cmodel_var;
6938 }
6939
6940 /* Return true if SYMBOL_REF X binds locally.  */
6941
6942 static bool
6943 aarch64_symbol_binds_local_p (const_rtx x)
6944 {
6945   return (SYMBOL_REF_DECL (x)
6946           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6947           : SYMBOL_REF_LOCAL_P (x));
6948 }
6949
6950 /* Return true if SYMBOL_REF X is thread local */
6951 static bool
6952 aarch64_tls_symbol_p (rtx x)
6953 {
6954   if (! TARGET_HAVE_TLS)
6955     return false;
6956
6957   if (GET_CODE (x) != SYMBOL_REF)
6958     return false;
6959
6960   return SYMBOL_REF_TLS_MODEL (x) != 0;
6961 }
6962
6963 /* Classify a TLS symbol into one of the TLS kinds.  */
6964 enum aarch64_symbol_type
6965 aarch64_classify_tls_symbol (rtx x)
6966 {
6967   enum tls_model tls_kind = tls_symbolic_operand_type (x);
6968
6969   switch (tls_kind)
6970     {
6971     case TLS_MODEL_GLOBAL_DYNAMIC:
6972     case TLS_MODEL_LOCAL_DYNAMIC:
6973       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6974
6975     case TLS_MODEL_INITIAL_EXEC:
6976       return SYMBOL_SMALL_GOTTPREL;
6977
6978     case TLS_MODEL_LOCAL_EXEC:
6979       return SYMBOL_SMALL_TPREL;
6980
6981     case TLS_MODEL_EMULATED:
6982     case TLS_MODEL_NONE:
6983       return SYMBOL_FORCE_TO_MEM;
6984
6985     default:
6986       gcc_unreachable ();
6987     }
6988 }
6989
6990 /* Return the method that should be used to access SYMBOL_REF or
6991    LABEL_REF X in context CONTEXT.  */
6992
6993 enum aarch64_symbol_type
6994 aarch64_classify_symbol (rtx x, rtx offset,
6995                          enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6996 {
6997   if (GET_CODE (x) == LABEL_REF)
6998     {
6999       switch (aarch64_cmodel)
7000         {
7001         case AARCH64_CMODEL_LARGE:
7002           return SYMBOL_FORCE_TO_MEM;
7003
7004         case AARCH64_CMODEL_TINY_PIC:
7005         case AARCH64_CMODEL_TINY:
7006           return SYMBOL_TINY_ABSOLUTE;
7007
7008         case AARCH64_CMODEL_SMALL_PIC:
7009         case AARCH64_CMODEL_SMALL:
7010           return SYMBOL_SMALL_ABSOLUTE;
7011
7012         default:
7013           gcc_unreachable ();
7014         }
7015     }
7016
7017   if (GET_CODE (x) == SYMBOL_REF)
7018     {
7019       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7020           return SYMBOL_FORCE_TO_MEM;
7021
7022       if (aarch64_tls_symbol_p (x))
7023         return aarch64_classify_tls_symbol (x);
7024
7025       switch (aarch64_cmodel)
7026         {
7027         case AARCH64_CMODEL_TINY:
7028           /* When we retreive symbol + offset address, we have to make sure
7029              the offset does not cause overflow of the final address.  But
7030              we have no way of knowing the address of symbol at compile time
7031              so we can't accurately say if the distance between the PC and
7032              symbol + offset is outside the addressible range of +/-1M in the
7033              TINY code model.  So we rely on images not being greater than
7034              1M and cap the offset at 1M and anything beyond 1M will have to
7035              be loaded using an alternative mechanism.  */
7036           if (SYMBOL_REF_WEAK (x)
7037               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7038             return SYMBOL_FORCE_TO_MEM;
7039           return SYMBOL_TINY_ABSOLUTE;
7040
7041         case AARCH64_CMODEL_SMALL:
7042           /* Same reasoning as the tiny code model, but the offset cap here is
7043              4G.  */
7044           if (SYMBOL_REF_WEAK (x)
7045               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7046                             HOST_WIDE_INT_C (4294967264)))
7047             return SYMBOL_FORCE_TO_MEM;
7048           return SYMBOL_SMALL_ABSOLUTE;
7049
7050         case AARCH64_CMODEL_TINY_PIC:
7051           if (!aarch64_symbol_binds_local_p (x))
7052             return SYMBOL_TINY_GOT;
7053           return SYMBOL_TINY_ABSOLUTE;
7054
7055         case AARCH64_CMODEL_SMALL_PIC:
7056           if (!aarch64_symbol_binds_local_p (x))
7057             return SYMBOL_SMALL_GOT;
7058           return SYMBOL_SMALL_ABSOLUTE;
7059
7060         default:
7061           gcc_unreachable ();
7062         }
7063     }
7064
7065   /* By default push everything into the constant pool.  */
7066   return SYMBOL_FORCE_TO_MEM;
7067 }
7068
7069 bool
7070 aarch64_constant_address_p (rtx x)
7071 {
7072   return (CONSTANT_P (x) && memory_address_p (DImode, x));
7073 }
7074
7075 bool
7076 aarch64_legitimate_pic_operand_p (rtx x)
7077 {
7078   if (GET_CODE (x) == SYMBOL_REF
7079       || (GET_CODE (x) == CONST
7080           && GET_CODE (XEXP (x, 0)) == PLUS
7081           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7082      return false;
7083
7084   return true;
7085 }
7086
7087 /* Return true if X holds either a quarter-precision or
7088      floating-point +0.0 constant.  */
7089 static bool
7090 aarch64_valid_floating_const (machine_mode mode, rtx x)
7091 {
7092   if (!CONST_DOUBLE_P (x))
7093     return false;
7094
7095   /* TODO: We could handle moving 0.0 to a TFmode register,
7096      but first we would like to refactor the movtf_aarch64
7097      to be more amicable to split moves properly and
7098      correctly gate on TARGET_SIMD.  For now - reject all
7099      constants which are not to SFmode or DFmode registers.  */
7100   if (!(mode == SFmode || mode == DFmode))
7101     return false;
7102
7103   if (aarch64_float_const_zero_rtx_p (x))
7104     return true;
7105   return aarch64_float_const_representable_p (x);
7106 }
7107
7108 static bool
7109 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7110 {
7111   /* Do not allow vector struct mode constants.  We could support
7112      0 and -1 easily, but they need support in aarch64-simd.md.  */
7113   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7114     return false;
7115
7116   /* This could probably go away because
7117      we now decompose CONST_INTs according to expand_mov_immediate.  */
7118   if ((GET_CODE (x) == CONST_VECTOR
7119        && aarch64_simd_valid_immediate (x, mode, false, NULL))
7120       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7121         return !targetm.cannot_force_const_mem (mode, x);
7122
7123   if (GET_CODE (x) == HIGH
7124       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7125     return true;
7126
7127   return aarch64_constant_address_p (x);
7128 }
7129
7130 rtx
7131 aarch64_load_tp (rtx target)
7132 {
7133   if (!target
7134       || GET_MODE (target) != Pmode
7135       || !register_operand (target, Pmode))
7136     target = gen_reg_rtx (Pmode);
7137
7138   /* Can return in any reg.  */
7139   emit_insn (gen_aarch64_load_tp_hard (target));
7140   return target;
7141 }
7142
7143 /* On AAPCS systems, this is the "struct __va_list".  */
7144 static GTY(()) tree va_list_type;
7145
7146 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7147    Return the type to use as __builtin_va_list.
7148
7149    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7150
7151    struct __va_list
7152    {
7153      void *__stack;
7154      void *__gr_top;
7155      void *__vr_top;
7156      int   __gr_offs;
7157      int   __vr_offs;
7158    };  */
7159
7160 static tree
7161 aarch64_build_builtin_va_list (void)
7162 {
7163   tree va_list_name;
7164   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7165
7166   /* Create the type.  */
7167   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7168   /* Give it the required name.  */
7169   va_list_name = build_decl (BUILTINS_LOCATION,
7170                              TYPE_DECL,
7171                              get_identifier ("__va_list"),
7172                              va_list_type);
7173   DECL_ARTIFICIAL (va_list_name) = 1;
7174   TYPE_NAME (va_list_type) = va_list_name;
7175   TYPE_STUB_DECL (va_list_type) = va_list_name;
7176
7177   /* Create the fields.  */
7178   f_stack = build_decl (BUILTINS_LOCATION,
7179                         FIELD_DECL, get_identifier ("__stack"),
7180                         ptr_type_node);
7181   f_grtop = build_decl (BUILTINS_LOCATION,
7182                         FIELD_DECL, get_identifier ("__gr_top"),
7183                         ptr_type_node);
7184   f_vrtop = build_decl (BUILTINS_LOCATION,
7185                         FIELD_DECL, get_identifier ("__vr_top"),
7186                         ptr_type_node);
7187   f_groff = build_decl (BUILTINS_LOCATION,
7188                         FIELD_DECL, get_identifier ("__gr_offs"),
7189                         integer_type_node);
7190   f_vroff = build_decl (BUILTINS_LOCATION,
7191                         FIELD_DECL, get_identifier ("__vr_offs"),
7192                         integer_type_node);
7193
7194   DECL_ARTIFICIAL (f_stack) = 1;
7195   DECL_ARTIFICIAL (f_grtop) = 1;
7196   DECL_ARTIFICIAL (f_vrtop) = 1;
7197   DECL_ARTIFICIAL (f_groff) = 1;
7198   DECL_ARTIFICIAL (f_vroff) = 1;
7199
7200   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7201   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7202   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7203   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7204   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7205
7206   TYPE_FIELDS (va_list_type) = f_stack;
7207   DECL_CHAIN (f_stack) = f_grtop;
7208   DECL_CHAIN (f_grtop) = f_vrtop;
7209   DECL_CHAIN (f_vrtop) = f_groff;
7210   DECL_CHAIN (f_groff) = f_vroff;
7211
7212   /* Compute its layout.  */
7213   layout_type (va_list_type);
7214
7215   return va_list_type;
7216 }
7217
7218 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
7219 static void
7220 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7221 {
7222   const CUMULATIVE_ARGS *cum;
7223   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7224   tree stack, grtop, vrtop, groff, vroff;
7225   tree t;
7226   int gr_save_area_size;
7227   int vr_save_area_size;
7228   int vr_offset;
7229
7230   cum = &crtl->args.info;
7231   gr_save_area_size
7232     = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7233   vr_save_area_size
7234     = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7235
7236   if (TARGET_GENERAL_REGS_ONLY)
7237     {
7238       if (cum->aapcs_nvrn > 0)
7239         sorry ("%qs and floating point or vector arguments",
7240                "-mgeneral-regs-only");
7241       vr_save_area_size = 0;
7242     }
7243
7244   f_stack = TYPE_FIELDS (va_list_type_node);
7245   f_grtop = DECL_CHAIN (f_stack);
7246   f_vrtop = DECL_CHAIN (f_grtop);
7247   f_groff = DECL_CHAIN (f_vrtop);
7248   f_vroff = DECL_CHAIN (f_groff);
7249
7250   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7251                   NULL_TREE);
7252   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7253                   NULL_TREE);
7254   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7255                   NULL_TREE);
7256   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7257                   NULL_TREE);
7258   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7259                   NULL_TREE);
7260
7261   /* Emit code to initialize STACK, which points to the next varargs stack
7262      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
7263      by named arguments.  STACK is 8-byte aligned.  */
7264   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7265   if (cum->aapcs_stack_size > 0)
7266     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7267   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7268   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7269
7270   /* Emit code to initialize GRTOP, the top of the GR save area.
7271      virtual_incoming_args_rtx should have been 16 byte aligned.  */
7272   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7273   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7274   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7275
7276   /* Emit code to initialize VRTOP, the top of the VR save area.
7277      This address is gr_save_area_bytes below GRTOP, rounded
7278      down to the next 16-byte boundary.  */
7279   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7280   vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7281                              STACK_BOUNDARY / BITS_PER_UNIT);
7282
7283   if (vr_offset)
7284     t = fold_build_pointer_plus_hwi (t, -vr_offset);
7285   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7286   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7287
7288   /* Emit code to initialize GROFF, the offset from GRTOP of the
7289      next GPR argument.  */
7290   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7291               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7292   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7293
7294   /* Likewise emit code to initialize VROFF, the offset from FTOP
7295      of the next VR argument.  */
7296   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7297               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7298   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7299 }
7300
7301 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
7302
7303 static tree
7304 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7305                               gimple_seq *post_p ATTRIBUTE_UNUSED)
7306 {
7307   tree addr;
7308   bool indirect_p;
7309   bool is_ha;           /* is HFA or HVA.  */
7310   bool dw_align;        /* double-word align.  */
7311   machine_mode ag_mode = VOIDmode;
7312   int nregs;
7313   machine_mode mode;
7314
7315   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7316   tree stack, f_top, f_off, off, arg, roundup, on_stack;
7317   HOST_WIDE_INT size, rsize, adjust, align;
7318   tree t, u, cond1, cond2;
7319
7320   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7321   if (indirect_p)
7322     type = build_pointer_type (type);
7323
7324   mode = TYPE_MODE (type);
7325
7326   f_stack = TYPE_FIELDS (va_list_type_node);
7327   f_grtop = DECL_CHAIN (f_stack);
7328   f_vrtop = DECL_CHAIN (f_grtop);
7329   f_groff = DECL_CHAIN (f_vrtop);
7330   f_vroff = DECL_CHAIN (f_groff);
7331
7332   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7333                   f_stack, NULL_TREE);
7334   size = int_size_in_bytes (type);
7335   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7336
7337   dw_align = false;
7338   adjust = 0;
7339   if (aarch64_vfp_is_call_or_return_candidate (mode,
7340                                                type,
7341                                                &ag_mode,
7342                                                &nregs,
7343                                                &is_ha))
7344     {
7345       /* TYPE passed in fp/simd registers.  */
7346       if (TARGET_GENERAL_REGS_ONLY)
7347         sorry ("%qs and floating point or vector arguments",
7348                "-mgeneral-regs-only");
7349
7350       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7351                       unshare_expr (valist), f_vrtop, NULL_TREE);
7352       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7353                       unshare_expr (valist), f_vroff, NULL_TREE);
7354
7355       rsize = nregs * UNITS_PER_VREG;
7356
7357       if (is_ha)
7358         {
7359           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7360             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7361         }
7362       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7363                && size < UNITS_PER_VREG)
7364         {
7365           adjust = UNITS_PER_VREG - size;
7366         }
7367     }
7368   else
7369     {
7370       /* TYPE passed in general registers.  */
7371       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7372                       unshare_expr (valist), f_grtop, NULL_TREE);
7373       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7374                       unshare_expr (valist), f_groff, NULL_TREE);
7375       rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7376       nregs = rsize / UNITS_PER_WORD;
7377
7378       if (align > 8)
7379         dw_align = true;
7380
7381       if (BLOCK_REG_PADDING (mode, type, 1) == downward
7382           && size < UNITS_PER_WORD)
7383         {
7384           adjust = UNITS_PER_WORD  - size;
7385         }
7386     }
7387
7388   /* Get a local temporary for the field value.  */
7389   off = get_initialized_tmp_var (f_off, pre_p, NULL);
7390
7391   /* Emit code to branch if off >= 0.  */
7392   t = build2 (GE_EXPR, boolean_type_node, off,
7393               build_int_cst (TREE_TYPE (off), 0));
7394   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7395
7396   if (dw_align)
7397     {
7398       /* Emit: offs = (offs + 15) & -16.  */
7399       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7400                   build_int_cst (TREE_TYPE (off), 15));
7401       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7402                   build_int_cst (TREE_TYPE (off), -16));
7403       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7404     }
7405   else
7406     roundup = NULL;
7407
7408   /* Update ap.__[g|v]r_offs  */
7409   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7410               build_int_cst (TREE_TYPE (off), rsize));
7411   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7412
7413   /* String up.  */
7414   if (roundup)
7415     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7416
7417   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
7418   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7419               build_int_cst (TREE_TYPE (f_off), 0));
7420   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7421
7422   /* String up: make sure the assignment happens before the use.  */
7423   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7424   COND_EXPR_ELSE (cond1) = t;
7425
7426   /* Prepare the trees handling the argument that is passed on the stack;
7427      the top level node will store in ON_STACK.  */
7428   arg = get_initialized_tmp_var (stack, pre_p, NULL);
7429   if (align > 8)
7430     {
7431       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
7432       t = fold_convert (intDI_type_node, arg);
7433       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7434                   build_int_cst (TREE_TYPE (t), 15));
7435       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7436                   build_int_cst (TREE_TYPE (t), -16));
7437       t = fold_convert (TREE_TYPE (arg), t);
7438       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7439     }
7440   else
7441     roundup = NULL;
7442   /* Advance ap.__stack  */
7443   t = fold_convert (intDI_type_node, arg);
7444   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7445               build_int_cst (TREE_TYPE (t), size + 7));
7446   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7447               build_int_cst (TREE_TYPE (t), -8));
7448   t = fold_convert (TREE_TYPE (arg), t);
7449   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7450   /* String up roundup and advance.  */
7451   if (roundup)
7452     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7453   /* String up with arg */
7454   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7455   /* Big-endianness related address adjustment.  */
7456   if (BLOCK_REG_PADDING (mode, type, 1) == downward
7457       && size < UNITS_PER_WORD)
7458   {
7459     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7460                 size_int (UNITS_PER_WORD - size));
7461     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7462   }
7463
7464   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7465   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7466
7467   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
7468   t = off;
7469   if (adjust)
7470     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7471                 build_int_cst (TREE_TYPE (off), adjust));
7472
7473   t = fold_convert (sizetype, t);
7474   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7475
7476   if (is_ha)
7477     {
7478       /* type ha; // treat as "struct {ftype field[n];}"
7479          ... [computing offs]
7480          for (i = 0; i <nregs; ++i, offs += 16)
7481            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7482          return ha;  */
7483       int i;
7484       tree tmp_ha, field_t, field_ptr_t;
7485
7486       /* Declare a local variable.  */
7487       tmp_ha = create_tmp_var_raw (type, "ha");
7488       gimple_add_tmp_var (tmp_ha);
7489
7490       /* Establish the base type.  */
7491       switch (ag_mode)
7492         {
7493         case SFmode:
7494           field_t = float_type_node;
7495           field_ptr_t = float_ptr_type_node;
7496           break;
7497         case DFmode:
7498           field_t = double_type_node;
7499           field_ptr_t = double_ptr_type_node;
7500           break;
7501         case TFmode:
7502           field_t = long_double_type_node;
7503           field_ptr_t = long_double_ptr_type_node;
7504           break;
7505 /* The half precision and quad precision are not fully supported yet.  Enable
7506    the following code after the support is complete.  Need to find the correct
7507    type node for __fp16 *.  */
7508 #if 0
7509         case HFmode:
7510           field_t = float_type_node;
7511           field_ptr_t = float_ptr_type_node;
7512           break;
7513 #endif
7514         case V2SImode:
7515         case V4SImode:
7516             {
7517               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7518               field_t = build_vector_type_for_mode (innertype, ag_mode);
7519               field_ptr_t = build_pointer_type (field_t);
7520             }
7521           break;
7522         default:
7523           gcc_assert (0);
7524         }
7525
7526       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
7527       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7528       addr = t;
7529       t = fold_convert (field_ptr_t, addr);
7530       t = build2 (MODIFY_EXPR, field_t,
7531                   build1 (INDIRECT_REF, field_t, tmp_ha),
7532                   build1 (INDIRECT_REF, field_t, t));
7533
7534       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
7535       for (i = 1; i < nregs; ++i)
7536         {
7537           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7538           u = fold_convert (field_ptr_t, addr);
7539           u = build2 (MODIFY_EXPR, field_t,
7540                       build2 (MEM_REF, field_t, tmp_ha,
7541                               build_int_cst (field_ptr_t,
7542                                              (i *
7543                                               int_size_in_bytes (field_t)))),
7544                       build1 (INDIRECT_REF, field_t, u));
7545           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7546         }
7547
7548       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7549       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7550     }
7551
7552   COND_EXPR_ELSE (cond2) = t;
7553   addr = fold_convert (build_pointer_type (type), cond1);
7554   addr = build_va_arg_indirect_ref (addr);
7555
7556   if (indirect_p)
7557     addr = build_va_arg_indirect_ref (addr);
7558
7559   return addr;
7560 }
7561
7562 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
7563
7564 static void
7565 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7566                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7567                                 int no_rtl)
7568 {
7569   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7570   CUMULATIVE_ARGS local_cum;
7571   int gr_saved, vr_saved;
7572
7573   /* The caller has advanced CUM up to, but not beyond, the last named
7574      argument.  Advance a local copy of CUM past the last "real" named
7575      argument, to find out how many registers are left over.  */
7576   local_cum = *cum;
7577   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7578
7579   /* Found out how many registers we need to save.  */
7580   gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7581   vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7582
7583   if (TARGET_GENERAL_REGS_ONLY)
7584     {
7585       if (local_cum.aapcs_nvrn > 0)
7586         sorry ("%qs and floating point or vector arguments",
7587                "-mgeneral-regs-only");
7588       vr_saved = 0;
7589     }
7590
7591   if (!no_rtl)
7592     {
7593       if (gr_saved > 0)
7594         {
7595           rtx ptr, mem;
7596
7597           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
7598           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7599                                - gr_saved * UNITS_PER_WORD);
7600           mem = gen_frame_mem (BLKmode, ptr);
7601           set_mem_alias_set (mem, get_varargs_alias_set ());
7602
7603           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7604                                mem, gr_saved);
7605         }
7606       if (vr_saved > 0)
7607         {
7608           /* We can't use move_block_from_reg, because it will use
7609              the wrong mode, storing D regs only.  */
7610           machine_mode mode = TImode;
7611           int off, i;
7612
7613           /* Set OFF to the offset from virtual_incoming_args_rtx of
7614              the first vector register.  The VR save area lies below
7615              the GR one, and is aligned to 16 bytes.  */
7616           off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7617                                    STACK_BOUNDARY / BITS_PER_UNIT);
7618           off -= vr_saved * UNITS_PER_VREG;
7619
7620           for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7621             {
7622               rtx ptr, mem;
7623
7624               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7625               mem = gen_frame_mem (mode, ptr);
7626               set_mem_alias_set (mem, get_varargs_alias_set ());
7627               aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7628               off += UNITS_PER_VREG;
7629             }
7630         }
7631     }
7632
7633   /* We don't save the size into *PRETEND_SIZE because we want to avoid
7634      any complication of having crtl->args.pretend_args_size changed.  */
7635   cfun->machine->frame.saved_varargs_size
7636     = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7637                       STACK_BOUNDARY / BITS_PER_UNIT)
7638        + vr_saved * UNITS_PER_VREG);
7639 }
7640
7641 static void
7642 aarch64_conditional_register_usage (void)
7643 {
7644   int i;
7645   if (!TARGET_FLOAT)
7646     {
7647       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7648         {
7649           fixed_regs[i] = 1;
7650           call_used_regs[i] = 1;
7651         }
7652     }
7653 }
7654
7655 /* Walk down the type tree of TYPE counting consecutive base elements.
7656    If *MODEP is VOIDmode, then set it to the first valid floating point
7657    type.  If a non-floating point type is found, or if a floating point
7658    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7659    otherwise return the count in the sub-tree.  */
7660 static int
7661 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7662 {
7663   machine_mode mode;
7664   HOST_WIDE_INT size;
7665
7666   switch (TREE_CODE (type))
7667     {
7668     case REAL_TYPE:
7669       mode = TYPE_MODE (type);
7670       if (mode != DFmode && mode != SFmode && mode != TFmode)
7671         return -1;
7672
7673       if (*modep == VOIDmode)
7674         *modep = mode;
7675
7676       if (*modep == mode)
7677         return 1;
7678
7679       break;
7680
7681     case COMPLEX_TYPE:
7682       mode = TYPE_MODE (TREE_TYPE (type));
7683       if (mode != DFmode && mode != SFmode && mode != TFmode)
7684         return -1;
7685
7686       if (*modep == VOIDmode)
7687         *modep = mode;
7688
7689       if (*modep == mode)
7690         return 2;
7691
7692       break;
7693
7694     case VECTOR_TYPE:
7695       /* Use V2SImode and V4SImode as representatives of all 64-bit
7696          and 128-bit vector types.  */
7697       size = int_size_in_bytes (type);
7698       switch (size)
7699         {
7700         case 8:
7701           mode = V2SImode;
7702           break;
7703         case 16:
7704           mode = V4SImode;
7705           break;
7706         default:
7707           return -1;
7708         }
7709
7710       if (*modep == VOIDmode)
7711         *modep = mode;
7712
7713       /* Vector modes are considered to be opaque: two vectors are
7714          equivalent for the purposes of being homogeneous aggregates
7715          if they are the same size.  */
7716       if (*modep == mode)
7717         return 1;
7718
7719       break;
7720
7721     case ARRAY_TYPE:
7722       {
7723         int count;
7724         tree index = TYPE_DOMAIN (type);
7725
7726         /* Can't handle incomplete types nor sizes that are not
7727            fixed.  */
7728         if (!COMPLETE_TYPE_P (type)
7729             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7730           return -1;
7731
7732         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7733         if (count == -1
7734             || !index
7735             || !TYPE_MAX_VALUE (index)
7736             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7737             || !TYPE_MIN_VALUE (index)
7738             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7739             || count < 0)
7740           return -1;
7741
7742         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7743                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7744
7745         /* There must be no padding.  */
7746         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7747           return -1;
7748
7749         return count;
7750       }
7751
7752     case RECORD_TYPE:
7753       {
7754         int count = 0;
7755         int sub_count;
7756         tree field;
7757
7758         /* Can't handle incomplete types nor sizes that are not
7759            fixed.  */
7760         if (!COMPLETE_TYPE_P (type)
7761             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7762           return -1;
7763
7764         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7765           {
7766             if (TREE_CODE (field) != FIELD_DECL)
7767               continue;
7768
7769             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7770             if (sub_count < 0)
7771               return -1;
7772             count += sub_count;
7773           }
7774
7775         /* There must be no padding.  */
7776         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7777           return -1;
7778
7779         return count;
7780       }
7781
7782     case UNION_TYPE:
7783     case QUAL_UNION_TYPE:
7784       {
7785         /* These aren't very interesting except in a degenerate case.  */
7786         int count = 0;
7787         int sub_count;
7788         tree field;
7789
7790         /* Can't handle incomplete types nor sizes that are not
7791            fixed.  */
7792         if (!COMPLETE_TYPE_P (type)
7793             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7794           return -1;
7795
7796         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7797           {
7798             if (TREE_CODE (field) != FIELD_DECL)
7799               continue;
7800
7801             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7802             if (sub_count < 0)
7803               return -1;
7804             count = count > sub_count ? count : sub_count;
7805           }
7806
7807         /* There must be no padding.  */
7808         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7809           return -1;
7810
7811         return count;
7812       }
7813
7814     default:
7815       break;
7816     }
7817
7818   return -1;
7819 }
7820
7821 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7822    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
7823    array types.  The C99 floating-point complex types are also considered
7824    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
7825    types, which are GCC extensions and out of the scope of AAPCS64, are
7826    treated as composite types here as well.
7827
7828    Note that MODE itself is not sufficient in determining whether a type
7829    is such a composite type or not.  This is because
7830    stor-layout.c:compute_record_mode may have already changed the MODE
7831    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
7832    structure with only one field may have its MODE set to the mode of the
7833    field.  Also an integer mode whose size matches the size of the
7834    RECORD_TYPE type may be used to substitute the original mode
7835    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
7836    solely relied on.  */
7837
7838 static bool
7839 aarch64_composite_type_p (const_tree type,
7840                           machine_mode mode)
7841 {
7842   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7843     return true;
7844
7845   if (mode == BLKmode
7846       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7847       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7848     return true;
7849
7850   return false;
7851 }
7852
7853 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7854    type as described in AAPCS64 \S 4.1.2.
7855
7856    See the comment above aarch64_composite_type_p for the notes on MODE.  */
7857
7858 static bool
7859 aarch64_short_vector_p (const_tree type,
7860                         machine_mode mode)
7861 {
7862   HOST_WIDE_INT size = -1;
7863
7864   if (type && TREE_CODE (type) == VECTOR_TYPE)
7865     size = int_size_in_bytes (type);
7866   else if (!aarch64_composite_type_p (type, mode)
7867            && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7868                || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7869     size = GET_MODE_SIZE (mode);
7870
7871   return (size == 8 || size == 16) ? true : false;
7872 }
7873
7874 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7875    shall be passed or returned in simd/fp register(s) (providing these
7876    parameter passing registers are available).
7877
7878    Upon successful return, *COUNT returns the number of needed registers,
7879    *BASE_MODE returns the mode of the individual register and when IS_HAF
7880    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7881    floating-point aggregate or a homogeneous short-vector aggregate.  */
7882
7883 static bool
7884 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7885                                          const_tree type,
7886                                          machine_mode *base_mode,
7887                                          int *count,
7888                                          bool *is_ha)
7889 {
7890   machine_mode new_mode = VOIDmode;
7891   bool composite_p = aarch64_composite_type_p (type, mode);
7892
7893   if (is_ha != NULL) *is_ha = false;
7894
7895   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7896       || aarch64_short_vector_p (type, mode))
7897     {
7898       *count = 1;
7899       new_mode = mode;
7900     }
7901   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7902     {
7903       if (is_ha != NULL) *is_ha = true;
7904       *count = 2;
7905       new_mode = GET_MODE_INNER (mode);
7906     }
7907   else if (type && composite_p)
7908     {
7909       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7910
7911       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7912         {
7913           if (is_ha != NULL) *is_ha = true;
7914           *count = ag_count;
7915         }
7916       else
7917         return false;
7918     }
7919   else
7920     return false;
7921
7922   *base_mode = new_mode;
7923   return true;
7924 }
7925
7926 /* Implement TARGET_STRUCT_VALUE_RTX.  */
7927
7928 static rtx
7929 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7930                           int incoming ATTRIBUTE_UNUSED)
7931 {
7932   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7933 }
7934
7935 /* Implements target hook vector_mode_supported_p.  */
7936 static bool
7937 aarch64_vector_mode_supported_p (machine_mode mode)
7938 {
7939   if (TARGET_SIMD
7940       && (mode == V4SImode  || mode == V8HImode
7941           || mode == V16QImode || mode == V2DImode
7942           || mode == V2SImode  || mode == V4HImode
7943           || mode == V8QImode || mode == V2SFmode
7944           || mode == V4SFmode || mode == V2DFmode
7945           || mode == V1DFmode))
7946     return true;
7947
7948   return false;
7949 }
7950
7951 /* Return appropriate SIMD container
7952    for MODE within a vector of WIDTH bits.  */
7953 static machine_mode
7954 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7955 {
7956   gcc_assert (width == 64 || width == 128);
7957   if (TARGET_SIMD)
7958     {
7959       if (width == 128)
7960         switch (mode)
7961           {
7962           case DFmode:
7963             return V2DFmode;
7964           case SFmode:
7965             return V4SFmode;
7966           case SImode:
7967             return V4SImode;
7968           case HImode:
7969             return V8HImode;
7970           case QImode:
7971             return V16QImode;
7972           case DImode:
7973             return V2DImode;
7974           default:
7975             break;
7976           }
7977       else
7978         switch (mode)
7979           {
7980           case SFmode:
7981             return V2SFmode;
7982           case SImode:
7983             return V2SImode;
7984           case HImode:
7985             return V4HImode;
7986           case QImode:
7987             return V8QImode;
7988           default:
7989             break;
7990           }
7991     }
7992   return word_mode;
7993 }
7994
7995 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
7996 static machine_mode
7997 aarch64_preferred_simd_mode (machine_mode mode)
7998 {
7999   return aarch64_simd_container_mode (mode, 128);
8000 }
8001
8002 /* Return the bitmask of possible vector sizes for the vectorizer
8003    to iterate over.  */
8004 static unsigned int
8005 aarch64_autovectorize_vector_sizes (void)
8006 {
8007   return (16 | 8);
8008 }
8009
8010 /* Implement TARGET_MANGLE_TYPE.  */
8011
8012 static const char *
8013 aarch64_mangle_type (const_tree type)
8014 {
8015   /* The AArch64 ABI documents say that "__va_list" has to be
8016      managled as if it is in the "std" namespace.  */
8017   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8018     return "St9__va_list";
8019
8020   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
8021      builtin types.  */
8022   if (TYPE_NAME (type) != NULL)
8023     return aarch64_mangle_builtin_type (type);
8024
8025   /* Use the default mangling.  */
8026   return NULL;
8027 }
8028
8029
8030 /* Return true if the rtx_insn contains a MEM RTX somewhere
8031    in it.  */
8032
8033 static bool
8034 has_memory_op (rtx_insn *mem_insn)
8035 {
8036   subrtx_iterator::array_type array;
8037   FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8038     if (MEM_P (*iter))
8039       return true;
8040
8041   return false;
8042 }
8043
8044 /* Find the first rtx_insn before insn that will generate an assembly
8045    instruction.  */
8046
8047 static rtx_insn *
8048 aarch64_prev_real_insn (rtx_insn *insn)
8049 {
8050   if (!insn)
8051     return NULL;
8052
8053   do
8054     {
8055       insn = prev_real_insn (insn);
8056     }
8057   while (insn && recog_memoized (insn) < 0);
8058
8059   return insn;
8060 }
8061
8062 static bool
8063 is_madd_op (enum attr_type t1)
8064 {
8065   unsigned int i;
8066   /* A number of these may be AArch32 only.  */
8067   enum attr_type mlatypes[] = {
8068     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8069     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8070     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8071   };
8072
8073   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8074     {
8075       if (t1 == mlatypes[i])
8076         return true;
8077     }
8078
8079   return false;
8080 }
8081
8082 /* Check if there is a register dependency between a load and the insn
8083    for which we hold recog_data.  */
8084
8085 static bool
8086 dep_between_memop_and_curr (rtx memop)
8087 {
8088   rtx load_reg;
8089   int opno;
8090
8091   gcc_assert (GET_CODE (memop) == SET);
8092
8093   if (!REG_P (SET_DEST (memop)))
8094     return false;
8095
8096   load_reg = SET_DEST (memop);
8097   for (opno = 1; opno < recog_data.n_operands; opno++)
8098     {
8099       rtx operand = recog_data.operand[opno];
8100       if (REG_P (operand)
8101           && reg_overlap_mentioned_p (load_reg, operand))
8102         return true;
8103
8104     }
8105   return false;
8106 }
8107
8108
8109 /* When working around the Cortex-A53 erratum 835769,
8110    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8111    instruction and has a preceding memory instruction such that a NOP
8112    should be inserted between them.  */
8113
8114 bool
8115 aarch64_madd_needs_nop (rtx_insn* insn)
8116 {
8117   enum attr_type attr_type;
8118   rtx_insn *prev;
8119   rtx body;
8120
8121   if (!aarch64_fix_a53_err835769)
8122     return false;
8123
8124   if (recog_memoized (insn) < 0)
8125     return false;
8126
8127   attr_type = get_attr_type (insn);
8128   if (!is_madd_op (attr_type))
8129     return false;
8130
8131   prev = aarch64_prev_real_insn (insn);
8132   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8133      Restore recog state to INSN to avoid state corruption.  */
8134   extract_constrain_insn_cached (insn);
8135
8136   if (!prev || !has_memory_op (prev))
8137     return false;
8138
8139   body = single_set (prev);
8140
8141   /* If the previous insn is a memory op and there is no dependency between
8142      it and the DImode madd, emit a NOP between them.  If body is NULL then we
8143      have a complex memory operation, probably a load/store pair.
8144      Be conservative for now and emit a NOP.  */
8145   if (GET_MODE (recog_data.operand[0]) == DImode
8146       && (!body || !dep_between_memop_and_curr (body)))
8147     return true;
8148
8149   return false;
8150
8151 }
8152
8153
8154 /* Implement FINAL_PRESCAN_INSN.  */
8155
8156 void
8157 aarch64_final_prescan_insn (rtx_insn *insn)
8158 {
8159   if (aarch64_madd_needs_nop (insn))
8160     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8161 }
8162
8163
8164 /* Return the equivalent letter for size.  */
8165 static char
8166 sizetochar (int size)
8167 {
8168   switch (size)
8169     {
8170     case 64: return 'd';
8171     case 32: return 's';
8172     case 16: return 'h';
8173     case 8 : return 'b';
8174     default: gcc_unreachable ();
8175     }
8176 }
8177
8178 /* Return true iff x is a uniform vector of floating-point
8179    constants, and the constant can be represented in
8180    quarter-precision form.  Note, as aarch64_float_const_representable
8181    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
8182 static bool
8183 aarch64_vect_float_const_representable_p (rtx x)
8184 {
8185   int i = 0;
8186   REAL_VALUE_TYPE r0, ri;
8187   rtx x0, xi;
8188
8189   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8190     return false;
8191
8192   x0 = CONST_VECTOR_ELT (x, 0);
8193   if (!CONST_DOUBLE_P (x0))
8194     return false;
8195
8196   REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8197
8198   for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8199     {
8200       xi = CONST_VECTOR_ELT (x, i);
8201       if (!CONST_DOUBLE_P (xi))
8202         return false;
8203
8204       REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8205       if (!REAL_VALUES_EQUAL (r0, ri))
8206         return false;
8207     }
8208
8209   return aarch64_float_const_representable_p (x0);
8210 }
8211
8212 /* Return true for valid and false for invalid.  */
8213 bool
8214 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8215                               struct simd_immediate_info *info)
8216 {
8217 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
8218   matches = 1;                                          \
8219   for (i = 0; i < idx; i += (STRIDE))                   \
8220     if (!(TEST))                                        \
8221       matches = 0;                                      \
8222   if (matches)                                          \
8223     {                                                   \
8224       immtype = (CLASS);                                \
8225       elsize = (ELSIZE);                                \
8226       eshift = (SHIFT);                                 \
8227       emvn = (NEG);                                     \
8228       break;                                            \
8229     }
8230
8231   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8232   unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8233   unsigned char bytes[16];
8234   int immtype = -1, matches;
8235   unsigned int invmask = inverse ? 0xff : 0;
8236   int eshift, emvn;
8237
8238   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8239     {
8240       if (! (aarch64_simd_imm_zero_p (op, mode)
8241              || aarch64_vect_float_const_representable_p (op)))
8242         return false;
8243
8244       if (info)
8245         {
8246           info->value = CONST_VECTOR_ELT (op, 0);
8247           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8248           info->mvn = false;
8249           info->shift = 0;
8250         }
8251
8252       return true;
8253     }
8254
8255   /* Splat vector constant out into a byte vector.  */
8256   for (i = 0; i < n_elts; i++)
8257     {
8258       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
8259          it must be laid out in the vector register in reverse order.  */
8260       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8261       unsigned HOST_WIDE_INT elpart;
8262       unsigned int part, parts;
8263
8264       if (CONST_INT_P (el))
8265         {
8266           elpart = INTVAL (el);
8267           parts = 1;
8268         }
8269       else if (GET_CODE (el) == CONST_DOUBLE)
8270         {
8271           elpart = CONST_DOUBLE_LOW (el);
8272           parts = 2;
8273         }
8274       else
8275         gcc_unreachable ();
8276
8277       for (part = 0; part < parts; part++)
8278         {
8279           unsigned int byte;
8280           for (byte = 0; byte < innersize; byte++)
8281             {
8282               bytes[idx++] = (elpart & 0xff) ^ invmask;
8283               elpart >>= BITS_PER_UNIT;
8284             }
8285           if (GET_CODE (el) == CONST_DOUBLE)
8286             elpart = CONST_DOUBLE_HIGH (el);
8287         }
8288     }
8289
8290   /* Sanity check.  */
8291   gcc_assert (idx == GET_MODE_SIZE (mode));
8292
8293   do
8294     {
8295       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8296              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8297
8298       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8299              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8300
8301       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8302              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8303
8304       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8305              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8306
8307       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8308
8309       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8310
8311       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8312              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8313
8314       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8315              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8316
8317       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8318              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8319
8320       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8321              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8322
8323       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8324
8325       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8326
8327       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8328              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8329
8330       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8331              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8332
8333       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8334              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8335
8336       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8337              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8338
8339       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8340
8341       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8342              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8343     }
8344   while (0);
8345
8346   if (immtype == -1)
8347     return false;
8348
8349   if (info)
8350     {
8351       info->element_width = elsize;
8352       info->mvn = emvn != 0;
8353       info->shift = eshift;
8354
8355       unsigned HOST_WIDE_INT imm = 0;
8356
8357       if (immtype >= 12 && immtype <= 15)
8358         info->msl = true;
8359
8360       /* Un-invert bytes of recognized vector, if necessary.  */
8361       if (invmask != 0)
8362         for (i = 0; i < idx; i++)
8363           bytes[i] ^= invmask;
8364
8365       if (immtype == 17)
8366         {
8367           /* FIXME: Broken on 32-bit H_W_I hosts.  */
8368           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8369
8370           for (i = 0; i < 8; i++)
8371             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8372               << (i * BITS_PER_UNIT);
8373
8374
8375           info->value = GEN_INT (imm);
8376         }
8377       else
8378         {
8379           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8380             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8381
8382           /* Construct 'abcdefgh' because the assembler cannot handle
8383              generic constants.  */
8384           if (info->mvn)
8385             imm = ~imm;
8386           imm = (imm >> info->shift) & 0xff;
8387           info->value = GEN_INT (imm);
8388         }
8389     }
8390
8391   return true;
8392 #undef CHECK
8393 }
8394
8395 /* Check of immediate shift constants are within range.  */
8396 bool
8397 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8398 {
8399   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8400   if (left)
8401     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8402   else
8403     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8404 }
8405
8406 /* Return true if X is a uniform vector where all elements
8407    are either the floating-point constant 0.0 or the
8408    integer constant 0.  */
8409 bool
8410 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8411 {
8412   return x == CONST0_RTX (mode);
8413 }
8414
8415 bool
8416 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8417 {
8418   HOST_WIDE_INT imm = INTVAL (x);
8419   int i;
8420
8421   for (i = 0; i < 8; i++)
8422     {
8423       unsigned int byte = imm & 0xff;
8424       if (byte != 0xff && byte != 0)
8425        return false;
8426       imm >>= 8;
8427     }
8428
8429   return true;
8430 }
8431
8432 bool
8433 aarch64_mov_operand_p (rtx x,
8434                        enum aarch64_symbol_context context,
8435                        machine_mode mode)
8436 {
8437   if (GET_CODE (x) == HIGH
8438       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8439     return true;
8440
8441   if (CONST_INT_P (x))
8442     return true;
8443
8444   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8445     return true;
8446
8447   return aarch64_classify_symbolic_expression (x, context)
8448     == SYMBOL_TINY_ABSOLUTE;
8449 }
8450
8451 /* Return a const_int vector of VAL.  */
8452 rtx
8453 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8454 {
8455   int nunits = GET_MODE_NUNITS (mode);
8456   rtvec v = rtvec_alloc (nunits);
8457   int i;
8458
8459   for (i=0; i < nunits; i++)
8460     RTVEC_ELT (v, i) = GEN_INT (val);
8461
8462   return gen_rtx_CONST_VECTOR (mode, v);
8463 }
8464
8465 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
8466
8467 bool
8468 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8469 {
8470   machine_mode vmode;
8471
8472   gcc_assert (!VECTOR_MODE_P (mode));
8473   vmode = aarch64_preferred_simd_mode (mode);
8474   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8475   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8476 }
8477
8478 /* Construct and return a PARALLEL RTX vector with elements numbering the
8479    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8480    the vector - from the perspective of the architecture.  This does not
8481    line up with GCC's perspective on lane numbers, so we end up with
8482    different masks depending on our target endian-ness.  The diagram
8483    below may help.  We must draw the distinction when building masks
8484    which select one half of the vector.  An instruction selecting
8485    architectural low-lanes for a big-endian target, must be described using
8486    a mask selecting GCC high-lanes.
8487
8488                  Big-Endian             Little-Endian
8489
8490 GCC             0   1   2   3           3   2   1   0
8491               | x | x | x | x |       | x | x | x | x |
8492 Architecture    3   2   1   0           3   2   1   0
8493
8494 Low Mask:         { 2, 3 }                { 0, 1 }
8495 High Mask:        { 0, 1 }                { 2, 3 }
8496 */
8497
8498 rtx
8499 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8500 {
8501   int nunits = GET_MODE_NUNITS (mode);
8502   rtvec v = rtvec_alloc (nunits / 2);
8503   int high_base = nunits / 2;
8504   int low_base = 0;
8505   int base;
8506   rtx t1;
8507   int i;
8508
8509   if (BYTES_BIG_ENDIAN)
8510     base = high ? low_base : high_base;
8511   else
8512     base = high ? high_base : low_base;
8513
8514   for (i = 0; i < nunits / 2; i++)
8515     RTVEC_ELT (v, i) = GEN_INT (base + i);
8516
8517   t1 = gen_rtx_PARALLEL (mode, v);
8518   return t1;
8519 }
8520
8521 /* Check OP for validity as a PARALLEL RTX vector with elements
8522    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8523    from the perspective of the architecture.  See the diagram above
8524    aarch64_simd_vect_par_cnst_half for more details.  */
8525
8526 bool
8527 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8528                                        bool high)
8529 {
8530   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8531   HOST_WIDE_INT count_op = XVECLEN (op, 0);
8532   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8533   int i = 0;
8534
8535   if (!VECTOR_MODE_P (mode))
8536     return false;
8537
8538   if (count_op != count_ideal)
8539     return false;
8540
8541   for (i = 0; i < count_ideal; i++)
8542     {
8543       rtx elt_op = XVECEXP (op, 0, i);
8544       rtx elt_ideal = XVECEXP (ideal, 0, i);
8545
8546       if (!CONST_INT_P (elt_op)
8547           || INTVAL (elt_ideal) != INTVAL (elt_op))
8548         return false;
8549     }
8550   return true;
8551 }
8552
8553 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
8554    HIGH (exclusive).  */
8555 void
8556 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8557                           const_tree exp)
8558 {
8559   HOST_WIDE_INT lane;
8560   gcc_assert (CONST_INT_P (operand));
8561   lane = INTVAL (operand);
8562
8563   if (lane < low || lane >= high)
8564   {
8565     if (exp)
8566       error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8567     else
8568       error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8569   }
8570 }
8571
8572 /* Emit code to place a AdvSIMD pair result in memory locations (with equal
8573    registers).  */
8574 void
8575 aarch64_simd_emit_pair_result_insn (machine_mode mode,
8576                             rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8577                             rtx op1)
8578 {
8579   rtx mem = gen_rtx_MEM (mode, destaddr);
8580   rtx tmp1 = gen_reg_rtx (mode);
8581   rtx tmp2 = gen_reg_rtx (mode);
8582
8583   emit_insn (intfn (tmp1, op1, tmp2));
8584
8585   emit_move_insn (mem, tmp1);
8586   mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8587   emit_move_insn (mem, tmp2);
8588 }
8589
8590 /* Return TRUE if OP is a valid vector addressing mode.  */
8591 bool
8592 aarch64_simd_mem_operand_p (rtx op)
8593 {
8594   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8595                         || REG_P (XEXP (op, 0)));
8596 }
8597
8598 /* Emit a register copy from operand to operand, taking care not to
8599    early-clobber source registers in the process.
8600
8601    COUNT is the number of components into which the copy needs to be
8602    decomposed.  */
8603 void
8604 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8605                                 unsigned int count)
8606 {
8607   unsigned int i;
8608   int rdest = REGNO (operands[0]);
8609   int rsrc = REGNO (operands[1]);
8610
8611   if (!reg_overlap_mentioned_p (operands[0], operands[1])
8612       || rdest < rsrc)
8613     for (i = 0; i < count; i++)
8614       emit_move_insn (gen_rtx_REG (mode, rdest + i),
8615                       gen_rtx_REG (mode, rsrc + i));
8616   else
8617     for (i = 0; i < count; i++)
8618       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8619                       gen_rtx_REG (mode, rsrc + count - i - 1));
8620 }
8621
8622 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8623    one of VSTRUCT modes: OI, CI or XI.  */
8624 int
8625 aarch64_simd_attr_length_move (rtx_insn *insn)
8626 {
8627   machine_mode mode;
8628
8629   extract_insn_cached (insn);
8630
8631   if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8632     {
8633       mode = GET_MODE (recog_data.operand[0]);
8634       switch (mode)
8635         {
8636         case OImode:
8637           return 8;
8638         case CImode:
8639           return 12;
8640         case XImode:
8641           return 16;
8642         default:
8643           gcc_unreachable ();
8644         }
8645     }
8646   return 4;
8647 }
8648
8649 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8650    one of VSTRUCT modes: OI, CI, EI, or XI.  */
8651 int
8652 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8653 {
8654   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8655 }
8656
8657 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
8658    alignment of a vector to 128 bits.  */
8659 static HOST_WIDE_INT
8660 aarch64_simd_vector_alignment (const_tree type)
8661 {
8662   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8663   return MIN (align, 128);
8664 }
8665
8666 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
8667 static bool
8668 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8669 {
8670   if (is_packed)
8671     return false;
8672
8673   /* We guarantee alignment for vectors up to 128-bits.  */
8674   if (tree_int_cst_compare (TYPE_SIZE (type),
8675                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8676     return false;
8677
8678   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
8679   return true;
8680 }
8681
8682 /* If VALS is a vector constant that can be loaded into a register
8683    using DUP, generate instructions to do so and return an RTX to
8684    assign to the register.  Otherwise return NULL_RTX.  */
8685 static rtx
8686 aarch64_simd_dup_constant (rtx vals)
8687 {
8688   machine_mode mode = GET_MODE (vals);
8689   machine_mode inner_mode = GET_MODE_INNER (mode);
8690   int n_elts = GET_MODE_NUNITS (mode);
8691   bool all_same = true;
8692   rtx x;
8693   int i;
8694
8695   if (GET_CODE (vals) != CONST_VECTOR)
8696     return NULL_RTX;
8697
8698   for (i = 1; i < n_elts; ++i)
8699     {
8700       x = CONST_VECTOR_ELT (vals, i);
8701       if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8702         all_same = false;
8703     }
8704
8705   if (!all_same)
8706     return NULL_RTX;
8707
8708   /* We can load this constant by using DUP and a constant in a
8709      single ARM register.  This will be cheaper than a vector
8710      load.  */
8711   x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8712   return gen_rtx_VEC_DUPLICATE (mode, x);
8713 }
8714
8715
8716 /* Generate code to load VALS, which is a PARALLEL containing only
8717    constants (for vec_init) or CONST_VECTOR, efficiently into a
8718    register.  Returns an RTX to copy into the register, or NULL_RTX
8719    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
8720 static rtx
8721 aarch64_simd_make_constant (rtx vals)
8722 {
8723   machine_mode mode = GET_MODE (vals);
8724   rtx const_dup;
8725   rtx const_vec = NULL_RTX;
8726   int n_elts = GET_MODE_NUNITS (mode);
8727   int n_const = 0;
8728   int i;
8729
8730   if (GET_CODE (vals) == CONST_VECTOR)
8731     const_vec = vals;
8732   else if (GET_CODE (vals) == PARALLEL)
8733     {
8734       /* A CONST_VECTOR must contain only CONST_INTs and
8735          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8736          Only store valid constants in a CONST_VECTOR.  */
8737       for (i = 0; i < n_elts; ++i)
8738         {
8739           rtx x = XVECEXP (vals, 0, i);
8740           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8741             n_const++;
8742         }
8743       if (n_const == n_elts)
8744         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8745     }
8746   else
8747     gcc_unreachable ();
8748
8749   if (const_vec != NULL_RTX
8750       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8751     /* Load using MOVI/MVNI.  */
8752     return const_vec;
8753   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8754     /* Loaded using DUP.  */
8755     return const_dup;
8756   else if (const_vec != NULL_RTX)
8757     /* Load from constant pool. We can not take advantage of single-cycle
8758        LD1 because we need a PC-relative addressing mode.  */
8759     return const_vec;
8760   else
8761     /* A PARALLEL containing something not valid inside CONST_VECTOR.
8762        We can not construct an initializer.  */
8763     return NULL_RTX;
8764 }
8765
8766 void
8767 aarch64_expand_vector_init (rtx target, rtx vals)
8768 {
8769   machine_mode mode = GET_MODE (target);
8770   machine_mode inner_mode = GET_MODE_INNER (mode);
8771   int n_elts = GET_MODE_NUNITS (mode);
8772   int n_var = 0, one_var = -1;
8773   bool all_same = true;
8774   rtx x, mem;
8775   int i;
8776
8777   x = XVECEXP (vals, 0, 0);
8778   if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8779     n_var = 1, one_var = 0;
8780
8781   for (i = 1; i < n_elts; ++i)
8782     {
8783       x = XVECEXP (vals, 0, i);
8784       if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8785         ++n_var, one_var = i;
8786
8787       if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8788         all_same = false;
8789     }
8790
8791   if (n_var == 0)
8792     {
8793       rtx constant = aarch64_simd_make_constant (vals);
8794       if (constant != NULL_RTX)
8795         {
8796           emit_move_insn (target, constant);
8797           return;
8798         }
8799     }
8800
8801   /* Splat a single non-constant element if we can.  */
8802   if (all_same)
8803     {
8804       x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8805       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8806       return;
8807     }
8808
8809   /* One field is non-constant.  Load constant then overwrite varying
8810      field.  This is more efficient than using the stack.  */
8811   if (n_var == 1)
8812     {
8813       rtx copy = copy_rtx (vals);
8814       rtx index = GEN_INT (one_var);
8815       enum insn_code icode;
8816
8817       /* Load constant part of vector, substitute neighboring value for
8818          varying element.  */
8819       XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8820       aarch64_expand_vector_init (target, copy);
8821
8822       /* Insert variable.  */
8823       x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8824       icode = optab_handler (vec_set_optab, mode);
8825       gcc_assert (icode != CODE_FOR_nothing);
8826       emit_insn (GEN_FCN (icode) (target, x, index));
8827       return;
8828     }
8829
8830   /* Construct the vector in memory one field at a time
8831      and load the whole vector.  */
8832   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8833   for (i = 0; i < n_elts; i++)
8834     emit_move_insn (adjust_address_nv (mem, inner_mode,
8835                                     i * GET_MODE_SIZE (inner_mode)),
8836                     XVECEXP (vals, 0, i));
8837   emit_move_insn (target, mem);
8838
8839 }
8840
8841 static unsigned HOST_WIDE_INT
8842 aarch64_shift_truncation_mask (machine_mode mode)
8843 {
8844   return
8845     (aarch64_vector_mode_supported_p (mode)
8846      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8847 }
8848
8849 #ifndef TLS_SECTION_ASM_FLAG
8850 #define TLS_SECTION_ASM_FLAG 'T'
8851 #endif
8852
8853 void
8854 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8855                                tree decl ATTRIBUTE_UNUSED)
8856 {
8857   char flagchars[10], *f = flagchars;
8858
8859   /* If we have already declared this section, we can use an
8860      abbreviated form to switch back to it -- unless this section is
8861      part of a COMDAT groups, in which case GAS requires the full
8862      declaration every time.  */
8863   if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8864       && (flags & SECTION_DECLARED))
8865     {
8866       fprintf (asm_out_file, "\t.section\t%s\n", name);
8867       return;
8868     }
8869
8870   if (!(flags & SECTION_DEBUG))
8871     *f++ = 'a';
8872   if (flags & SECTION_WRITE)
8873     *f++ = 'w';
8874   if (flags & SECTION_CODE)
8875     *f++ = 'x';
8876   if (flags & SECTION_SMALL)
8877     *f++ = 's';
8878   if (flags & SECTION_MERGE)
8879     *f++ = 'M';
8880   if (flags & SECTION_STRINGS)
8881     *f++ = 'S';
8882   if (flags & SECTION_TLS)
8883     *f++ = TLS_SECTION_ASM_FLAG;
8884   if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8885     *f++ = 'G';
8886   *f = '\0';
8887
8888   fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8889
8890   if (!(flags & SECTION_NOTYPE))
8891     {
8892       const char *type;
8893       const char *format;
8894
8895       if (flags & SECTION_BSS)
8896         type = "nobits";
8897       else
8898         type = "progbits";
8899
8900 #ifdef TYPE_OPERAND_FMT
8901       format = "," TYPE_OPERAND_FMT;
8902 #else
8903       format = ",@%s";
8904 #endif
8905
8906       fprintf (asm_out_file, format, type);
8907
8908       if (flags & SECTION_ENTSIZE)
8909         fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8910       if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8911         {
8912           if (TREE_CODE (decl) == IDENTIFIER_NODE)
8913             fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8914           else
8915             fprintf (asm_out_file, ",%s,comdat",
8916                      IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8917         }
8918     }
8919
8920   putc ('\n', asm_out_file);
8921 }
8922
8923 /* Select a format to encode pointers in exception handling data.  */
8924 int
8925 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8926 {
8927    int type;
8928    switch (aarch64_cmodel)
8929      {
8930      case AARCH64_CMODEL_TINY:
8931      case AARCH64_CMODEL_TINY_PIC:
8932      case AARCH64_CMODEL_SMALL:
8933      case AARCH64_CMODEL_SMALL_PIC:
8934        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
8935           for everything.  */
8936        type = DW_EH_PE_sdata4;
8937        break;
8938      default:
8939        /* No assumptions here.  8-byte relocs required.  */
8940        type = DW_EH_PE_sdata8;
8941        break;
8942      }
8943    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8944 }
8945
8946 /* Emit load exclusive.  */
8947
8948 static void
8949 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8950                              rtx mem, rtx model_rtx)
8951 {
8952   rtx (*gen) (rtx, rtx, rtx);
8953
8954   switch (mode)
8955     {
8956     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8957     case HImode: gen = gen_aarch64_load_exclusivehi; break;
8958     case SImode: gen = gen_aarch64_load_exclusivesi; break;
8959     case DImode: gen = gen_aarch64_load_exclusivedi; break;
8960     default:
8961       gcc_unreachable ();
8962     }
8963
8964   emit_insn (gen (rval, mem, model_rtx));
8965 }
8966
8967 /* Emit store exclusive.  */
8968
8969 static void
8970 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
8971                               rtx rval, rtx mem, rtx model_rtx)
8972 {
8973   rtx (*gen) (rtx, rtx, rtx, rtx);
8974
8975   switch (mode)
8976     {
8977     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8978     case HImode: gen = gen_aarch64_store_exclusivehi; break;
8979     case SImode: gen = gen_aarch64_store_exclusivesi; break;
8980     case DImode: gen = gen_aarch64_store_exclusivedi; break;
8981     default:
8982       gcc_unreachable ();
8983     }
8984
8985   emit_insn (gen (bval, rval, mem, model_rtx));
8986 }
8987
8988 /* Mark the previous jump instruction as unlikely.  */
8989
8990 static void
8991 aarch64_emit_unlikely_jump (rtx insn)
8992 {
8993   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8994
8995   insn = emit_jump_insn (insn);
8996   add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8997 }
8998
8999 /* Expand a compare and swap pattern.  */
9000
9001 void
9002 aarch64_expand_compare_and_swap (rtx operands[])
9003 {
9004   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9005   machine_mode mode, cmp_mode;
9006   rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9007
9008   bval = operands[0];
9009   rval = operands[1];
9010   mem = operands[2];
9011   oldval = operands[3];
9012   newval = operands[4];
9013   is_weak = operands[5];
9014   mod_s = operands[6];
9015   mod_f = operands[7];
9016   mode = GET_MODE (mem);
9017   cmp_mode = mode;
9018
9019   /* Normally the succ memory model must be stronger than fail, but in the
9020      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9021      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
9022
9023   if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9024       && INTVAL (mod_s) == MEMMODEL_RELEASE)
9025     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9026
9027   switch (mode)
9028     {
9029     case QImode:
9030     case HImode:
9031       /* For short modes, we're going to perform the comparison in SImode,
9032          so do the zero-extension now.  */
9033       cmp_mode = SImode;
9034       rval = gen_reg_rtx (SImode);
9035       oldval = convert_modes (SImode, mode, oldval, true);
9036       /* Fall through.  */
9037
9038     case SImode:
9039     case DImode:
9040       /* Force the value into a register if needed.  */
9041       if (!aarch64_plus_operand (oldval, mode))
9042         oldval = force_reg (cmp_mode, oldval);
9043       break;
9044
9045     default:
9046       gcc_unreachable ();
9047     }
9048
9049   switch (mode)
9050     {
9051     case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9052     case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9053     case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9054     case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9055     default:
9056       gcc_unreachable ();
9057     }
9058
9059   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9060
9061   if (mode == QImode || mode == HImode)
9062     emit_move_insn (operands[1], gen_lowpart (mode, rval));
9063
9064   x = gen_rtx_REG (CCmode, CC_REGNUM);
9065   x = gen_rtx_EQ (SImode, x, const0_rtx);
9066   emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9067 }
9068
9069 /* Split a compare and swap pattern.  */
9070
9071 void
9072 aarch64_split_compare_and_swap (rtx operands[])
9073 {
9074   rtx rval, mem, oldval, newval, scratch;
9075   machine_mode mode;
9076   bool is_weak;
9077   rtx_code_label *label1, *label2;
9078   rtx x, cond;
9079
9080   rval = operands[0];
9081   mem = operands[1];
9082   oldval = operands[2];
9083   newval = operands[3];
9084   is_weak = (operands[4] != const0_rtx);
9085   scratch = operands[7];
9086   mode = GET_MODE (mem);
9087
9088   label1 = NULL;
9089   if (!is_weak)
9090     {
9091       label1 = gen_label_rtx ();
9092       emit_label (label1);
9093     }
9094   label2 = gen_label_rtx ();
9095
9096   aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9097
9098   cond = aarch64_gen_compare_reg (NE, rval, oldval);
9099   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9100   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9101                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9102   aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9103
9104   aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9105
9106   if (!is_weak)
9107     {
9108       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9109       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9110                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9111       aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9112     }
9113   else
9114     {
9115       cond = gen_rtx_REG (CCmode, CC_REGNUM);
9116       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9117       emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9118     }
9119
9120   emit_label (label2);
9121 }
9122
9123 /* Split an atomic operation.  */
9124
9125 void
9126 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9127                      rtx value, rtx model_rtx, rtx cond)
9128 {
9129   machine_mode mode = GET_MODE (mem);
9130   machine_mode wmode = (mode == DImode ? DImode : SImode);
9131   rtx_code_label *label;
9132   rtx x;
9133
9134   label = gen_label_rtx ();
9135   emit_label (label);
9136
9137   if (new_out)
9138     new_out = gen_lowpart (wmode, new_out);
9139   if (old_out)
9140     old_out = gen_lowpart (wmode, old_out);
9141   else
9142     old_out = new_out;
9143   value = simplify_gen_subreg (wmode, value, mode, 0);
9144
9145   aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9146
9147   switch (code)
9148     {
9149     case SET:
9150       new_out = value;
9151       break;
9152
9153     case NOT:
9154       x = gen_rtx_AND (wmode, old_out, value);
9155       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9156       x = gen_rtx_NOT (wmode, new_out);
9157       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9158       break;
9159
9160     case MINUS:
9161       if (CONST_INT_P (value))
9162         {
9163           value = GEN_INT (-INTVAL (value));
9164           code = PLUS;
9165         }
9166       /* Fall through.  */
9167
9168     default:
9169       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9170       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9171       break;
9172     }
9173
9174   aarch64_emit_store_exclusive (mode, cond, mem,
9175                                 gen_lowpart (mode, new_out), model_rtx);
9176
9177   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9178   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9179                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9180   aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9181 }
9182
9183 static void
9184 aarch64_print_extension (void)
9185 {
9186   const struct aarch64_option_extension *opt = NULL;
9187
9188   for (opt = all_extensions; opt->name != NULL; opt++)
9189     if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9190       asm_fprintf (asm_out_file, "+%s", opt->name);
9191
9192   asm_fprintf (asm_out_file, "\n");
9193 }
9194
9195 static void
9196 aarch64_start_file (void)
9197 {
9198   if (selected_arch)
9199     {
9200       asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9201       aarch64_print_extension ();
9202     }
9203   else if (selected_cpu)
9204     {
9205       const char *truncated_name
9206             = aarch64_rewrite_selected_cpu (selected_cpu->name);
9207       asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9208       aarch64_print_extension ();
9209     }
9210   default_file_start();
9211 }
9212
9213 /* Target hook for c_mode_for_suffix.  */
9214 static machine_mode
9215 aarch64_c_mode_for_suffix (char suffix)
9216 {
9217   if (suffix == 'q')
9218     return TFmode;
9219
9220   return VOIDmode;
9221 }
9222
9223 /* We can only represent floating point constants which will fit in
9224    "quarter-precision" values.  These values are characterised by
9225    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
9226    by:
9227
9228    (-1)^s * (n/16) * 2^r
9229
9230    Where:
9231      's' is the sign bit.
9232      'n' is an integer in the range 16 <= n <= 31.
9233      'r' is an integer in the range -3 <= r <= 4.  */
9234
9235 /* Return true iff X can be represented by a quarter-precision
9236    floating point immediate operand X.  Note, we cannot represent 0.0.  */
9237 bool
9238 aarch64_float_const_representable_p (rtx x)
9239 {
9240   /* This represents our current view of how many bits
9241      make up the mantissa.  */
9242   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9243   int exponent;
9244   unsigned HOST_WIDE_INT mantissa, mask;
9245   REAL_VALUE_TYPE r, m;
9246   bool fail;
9247
9248   if (!CONST_DOUBLE_P (x))
9249     return false;
9250
9251   if (GET_MODE (x) == VOIDmode)
9252     return false;
9253
9254   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9255
9256   /* We cannot represent infinities, NaNs or +/-zero.  We won't
9257      know if we have +zero until we analyse the mantissa, but we
9258      can reject the other invalid values.  */
9259   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9260       || REAL_VALUE_MINUS_ZERO (r))
9261     return false;
9262
9263   /* Extract exponent.  */
9264   r = real_value_abs (&r);
9265   exponent = REAL_EXP (&r);
9266
9267   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9268      highest (sign) bit, with a fixed binary point at bit point_pos.
9269      m1 holds the low part of the mantissa, m2 the high part.
9270      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9271      bits for the mantissa, this can fail (low bits will be lost).  */
9272   real_ldexp (&m, &r, point_pos - exponent);
9273   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9274
9275   /* If the low part of the mantissa has bits set we cannot represent
9276      the value.  */
9277   if (w.elt (0) != 0)
9278     return false;
9279   /* We have rejected the lower HOST_WIDE_INT, so update our
9280      understanding of how many bits lie in the mantissa and
9281      look only at the high HOST_WIDE_INT.  */
9282   mantissa = w.elt (1);
9283   point_pos -= HOST_BITS_PER_WIDE_INT;
9284
9285   /* We can only represent values with a mantissa of the form 1.xxxx.  */
9286   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9287   if ((mantissa & mask) != 0)
9288     return false;
9289
9290   /* Having filtered unrepresentable values, we may now remove all
9291      but the highest 5 bits.  */
9292   mantissa >>= point_pos - 5;
9293
9294   /* We cannot represent the value 0.0, so reject it.  This is handled
9295      elsewhere.  */
9296   if (mantissa == 0)
9297     return false;
9298
9299   /* Then, as bit 4 is always set, we can mask it off, leaving
9300      the mantissa in the range [0, 15].  */
9301   mantissa &= ~(1 << 4);
9302   gcc_assert (mantissa <= 15);
9303
9304   /* GCC internally does not use IEEE754-like encoding (where normalized
9305      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
9306      Our mantissa values are shifted 4 places to the left relative to
9307      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9308      by 5 places to correct for GCC's representation.  */
9309   exponent = 5 - exponent;
9310
9311   return (exponent >= 0 && exponent <= 7);
9312 }
9313
9314 char*
9315 aarch64_output_simd_mov_immediate (rtx const_vector,
9316                                    machine_mode mode,
9317                                    unsigned width)
9318 {
9319   bool is_valid;
9320   static char templ[40];
9321   const char *mnemonic;
9322   const char *shift_op;
9323   unsigned int lane_count = 0;
9324   char element_char;
9325
9326   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9327
9328   /* This will return true to show const_vector is legal for use as either
9329      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
9330      also update INFO to show how the immediate should be generated.  */
9331   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9332   gcc_assert (is_valid);
9333
9334   element_char = sizetochar (info.element_width);
9335   lane_count = width / info.element_width;
9336
9337   mode = GET_MODE_INNER (mode);
9338   if (mode == SFmode || mode == DFmode)
9339     {
9340       gcc_assert (info.shift == 0 && ! info.mvn);
9341       if (aarch64_float_const_zero_rtx_p (info.value))
9342         info.value = GEN_INT (0);
9343       else
9344         {
9345 #define buf_size 20
9346           REAL_VALUE_TYPE r;
9347           REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9348           char float_buf[buf_size] = {'\0'};
9349           real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9350 #undef buf_size
9351
9352           if (lane_count == 1)
9353             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9354           else
9355             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9356                       lane_count, element_char, float_buf);
9357           return templ;
9358         }
9359     }
9360
9361   mnemonic = info.mvn ? "mvni" : "movi";
9362   shift_op = info.msl ? "msl" : "lsl";
9363
9364   if (lane_count == 1)
9365     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9366               mnemonic, UINTVAL (info.value));
9367   else if (info.shift)
9368     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9369               ", %s %d", mnemonic, lane_count, element_char,
9370               UINTVAL (info.value), shift_op, info.shift);
9371   else
9372     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9373               mnemonic, lane_count, element_char, UINTVAL (info.value));
9374   return templ;
9375 }
9376
9377 char*
9378 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9379                                           machine_mode mode)
9380 {
9381   machine_mode vmode;
9382
9383   gcc_assert (!VECTOR_MODE_P (mode));
9384   vmode = aarch64_simd_container_mode (mode, 64);
9385   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9386   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9387 }
9388
9389 /* Split operands into moves from op[1] + op[2] into op[0].  */
9390
9391 void
9392 aarch64_split_combinev16qi (rtx operands[3])
9393 {
9394   unsigned int dest = REGNO (operands[0]);
9395   unsigned int src1 = REGNO (operands[1]);
9396   unsigned int src2 = REGNO (operands[2]);
9397   machine_mode halfmode = GET_MODE (operands[1]);
9398   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9399   rtx destlo, desthi;
9400
9401   gcc_assert (halfmode == V16QImode);
9402
9403   if (src1 == dest && src2 == dest + halfregs)
9404     {
9405       /* No-op move.  Can't split to nothing; emit something.  */
9406       emit_note (NOTE_INSN_DELETED);
9407       return;
9408     }
9409
9410   /* Preserve register attributes for variable tracking.  */
9411   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9412   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9413                                GET_MODE_SIZE (halfmode));
9414
9415   /* Special case of reversed high/low parts.  */
9416   if (reg_overlap_mentioned_p (operands[2], destlo)
9417       && reg_overlap_mentioned_p (operands[1], desthi))
9418     {
9419       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9420       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9421       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9422     }
9423   else if (!reg_overlap_mentioned_p (operands[2], destlo))
9424     {
9425       /* Try to avoid unnecessary moves if part of the result
9426          is in the right place already.  */
9427       if (src1 != dest)
9428         emit_move_insn (destlo, operands[1]);
9429       if (src2 != dest + halfregs)
9430         emit_move_insn (desthi, operands[2]);
9431     }
9432   else
9433     {
9434       if (src2 != dest + halfregs)
9435         emit_move_insn (desthi, operands[2]);
9436       if (src1 != dest)
9437         emit_move_insn (destlo, operands[1]);
9438     }
9439 }
9440
9441 /* vec_perm support.  */
9442
9443 #define MAX_VECT_LEN 16
9444
9445 struct expand_vec_perm_d
9446 {
9447   rtx target, op0, op1;
9448   unsigned char perm[MAX_VECT_LEN];
9449   machine_mode vmode;
9450   unsigned char nelt;
9451   bool one_vector_p;
9452   bool testing_p;
9453 };
9454
9455 /* Generate a variable permutation.  */
9456
9457 static void
9458 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9459 {
9460   machine_mode vmode = GET_MODE (target);
9461   bool one_vector_p = rtx_equal_p (op0, op1);
9462
9463   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9464   gcc_checking_assert (GET_MODE (op0) == vmode);
9465   gcc_checking_assert (GET_MODE (op1) == vmode);
9466   gcc_checking_assert (GET_MODE (sel) == vmode);
9467   gcc_checking_assert (TARGET_SIMD);
9468
9469   if (one_vector_p)
9470     {
9471       if (vmode == V8QImode)
9472         {
9473           /* Expand the argument to a V16QI mode by duplicating it.  */
9474           rtx pair = gen_reg_rtx (V16QImode);
9475           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9476           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9477         }
9478       else
9479         {
9480           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9481         }
9482     }
9483   else
9484     {
9485       rtx pair;
9486
9487       if (vmode == V8QImode)
9488         {
9489           pair = gen_reg_rtx (V16QImode);
9490           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9491           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9492         }
9493       else
9494         {
9495           pair = gen_reg_rtx (OImode);
9496           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9497           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9498         }
9499     }
9500 }
9501
9502 void
9503 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9504 {
9505   machine_mode vmode = GET_MODE (target);
9506   unsigned int nelt = GET_MODE_NUNITS (vmode);
9507   bool one_vector_p = rtx_equal_p (op0, op1);
9508   rtx mask;
9509
9510   /* The TBL instruction does not use a modulo index, so we must take care
9511      of that ourselves.  */
9512   mask = aarch64_simd_gen_const_vector_dup (vmode,
9513       one_vector_p ? nelt - 1 : 2 * nelt - 1);
9514   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9515
9516   /* For big-endian, we also need to reverse the index within the vector
9517      (but not which vector).  */
9518   if (BYTES_BIG_ENDIAN)
9519     {
9520       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
9521       if (!one_vector_p)
9522         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9523       sel = expand_simple_binop (vmode, XOR, sel, mask,
9524                                  NULL, 0, OPTAB_LIB_WIDEN);
9525     }
9526   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9527 }
9528
9529 /* Recognize patterns suitable for the TRN instructions.  */
9530 static bool
9531 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9532 {
9533   unsigned int i, odd, mask, nelt = d->nelt;
9534   rtx out, in0, in1, x;
9535   rtx (*gen) (rtx, rtx, rtx);
9536   machine_mode vmode = d->vmode;
9537
9538   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9539     return false;
9540
9541   /* Note that these are little-endian tests.
9542      We correct for big-endian later.  */
9543   if (d->perm[0] == 0)
9544     odd = 0;
9545   else if (d->perm[0] == 1)
9546     odd = 1;
9547   else
9548     return false;
9549   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9550
9551   for (i = 0; i < nelt; i += 2)
9552     {
9553       if (d->perm[i] != i + odd)
9554         return false;
9555       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9556         return false;
9557     }
9558
9559   /* Success!  */
9560   if (d->testing_p)
9561     return true;
9562
9563   in0 = d->op0;
9564   in1 = d->op1;
9565   if (BYTES_BIG_ENDIAN)
9566     {
9567       x = in0, in0 = in1, in1 = x;
9568       odd = !odd;
9569     }
9570   out = d->target;
9571
9572   if (odd)
9573     {
9574       switch (vmode)
9575         {
9576         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9577         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9578         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9579         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9580         case V4SImode: gen = gen_aarch64_trn2v4si; break;
9581         case V2SImode: gen = gen_aarch64_trn2v2si; break;
9582         case V2DImode: gen = gen_aarch64_trn2v2di; break;
9583         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9584         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9585         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9586         default:
9587           return false;
9588         }
9589     }
9590   else
9591     {
9592       switch (vmode)
9593         {
9594         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9595         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9596         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9597         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9598         case V4SImode: gen = gen_aarch64_trn1v4si; break;
9599         case V2SImode: gen = gen_aarch64_trn1v2si; break;
9600         case V2DImode: gen = gen_aarch64_trn1v2di; break;
9601         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9602         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9603         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9604         default:
9605           return false;
9606         }
9607     }
9608
9609   emit_insn (gen (out, in0, in1));
9610   return true;
9611 }
9612
9613 /* Recognize patterns suitable for the UZP instructions.  */
9614 static bool
9615 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9616 {
9617   unsigned int i, odd, mask, nelt = d->nelt;
9618   rtx out, in0, in1, x;
9619   rtx (*gen) (rtx, rtx, rtx);
9620   machine_mode vmode = d->vmode;
9621
9622   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9623     return false;
9624
9625   /* Note that these are little-endian tests.
9626      We correct for big-endian later.  */
9627   if (d->perm[0] == 0)
9628     odd = 0;
9629   else if (d->perm[0] == 1)
9630     odd = 1;
9631   else
9632     return false;
9633   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9634
9635   for (i = 0; i < nelt; i++)
9636     {
9637       unsigned elt = (i * 2 + odd) & mask;
9638       if (d->perm[i] != elt)
9639         return false;
9640     }
9641
9642   /* Success!  */
9643   if (d->testing_p)
9644     return true;
9645
9646   in0 = d->op0;
9647   in1 = d->op1;
9648   if (BYTES_BIG_ENDIAN)
9649     {
9650       x = in0, in0 = in1, in1 = x;
9651       odd = !odd;
9652     }
9653   out = d->target;
9654
9655   if (odd)
9656     {
9657       switch (vmode)
9658         {
9659         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9660         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9661         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9662         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9663         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9664         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9665         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9666         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9667         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9668         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9669         default:
9670           return false;
9671         }
9672     }
9673   else
9674     {
9675       switch (vmode)
9676         {
9677         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9678         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9679         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9680         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9681         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9682         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9683         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9684         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9685         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9686         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9687         default:
9688           return false;
9689         }
9690     }
9691
9692   emit_insn (gen (out, in0, in1));
9693   return true;
9694 }
9695
9696 /* Recognize patterns suitable for the ZIP instructions.  */
9697 static bool
9698 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9699 {
9700   unsigned int i, high, mask, nelt = d->nelt;
9701   rtx out, in0, in1, x;
9702   rtx (*gen) (rtx, rtx, rtx);
9703   machine_mode vmode = d->vmode;
9704
9705   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9706     return false;
9707
9708   /* Note that these are little-endian tests.
9709      We correct for big-endian later.  */
9710   high = nelt / 2;
9711   if (d->perm[0] == high)
9712     /* Do Nothing.  */
9713     ;
9714   else if (d->perm[0] == 0)
9715     high = 0;
9716   else
9717     return false;
9718   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9719
9720   for (i = 0; i < nelt / 2; i++)
9721     {
9722       unsigned elt = (i + high) & mask;
9723       if (d->perm[i * 2] != elt)
9724         return false;
9725       elt = (elt + nelt) & mask;
9726       if (d->perm[i * 2 + 1] != elt)
9727         return false;
9728     }
9729
9730   /* Success!  */
9731   if (d->testing_p)
9732     return true;
9733
9734   in0 = d->op0;
9735   in1 = d->op1;
9736   if (BYTES_BIG_ENDIAN)
9737     {
9738       x = in0, in0 = in1, in1 = x;
9739       high = !high;
9740     }
9741   out = d->target;
9742
9743   if (high)
9744     {
9745       switch (vmode)
9746         {
9747         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9748         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9749         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9750         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9751         case V4SImode: gen = gen_aarch64_zip2v4si; break;
9752         case V2SImode: gen = gen_aarch64_zip2v2si; break;
9753         case V2DImode: gen = gen_aarch64_zip2v2di; break;
9754         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9755         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9756         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9757         default:
9758           return false;
9759         }
9760     }
9761   else
9762     {
9763       switch (vmode)
9764         {
9765         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9766         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9767         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9768         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9769         case V4SImode: gen = gen_aarch64_zip1v4si; break;
9770         case V2SImode: gen = gen_aarch64_zip1v2si; break;
9771         case V2DImode: gen = gen_aarch64_zip1v2di; break;
9772         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9773         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9774         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9775         default:
9776           return false;
9777         }
9778     }
9779
9780   emit_insn (gen (out, in0, in1));
9781   return true;
9782 }
9783
9784 /* Recognize patterns for the EXT insn.  */
9785
9786 static bool
9787 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9788 {
9789   unsigned int i, nelt = d->nelt;
9790   rtx (*gen) (rtx, rtx, rtx, rtx);
9791   rtx offset;
9792
9793   unsigned int location = d->perm[0]; /* Always < nelt.  */
9794
9795   /* Check if the extracted indices are increasing by one.  */
9796   for (i = 1; i < nelt; i++)
9797     {
9798       unsigned int required = location + i;
9799       if (d->one_vector_p)
9800         {
9801           /* We'll pass the same vector in twice, so allow indices to wrap.  */
9802           required &= (nelt - 1);
9803         }
9804       if (d->perm[i] != required)
9805         return false;
9806     }
9807
9808   switch (d->vmode)
9809     {
9810     case V16QImode: gen = gen_aarch64_extv16qi; break;
9811     case V8QImode: gen = gen_aarch64_extv8qi; break;
9812     case V4HImode: gen = gen_aarch64_extv4hi; break;
9813     case V8HImode: gen = gen_aarch64_extv8hi; break;
9814     case V2SImode: gen = gen_aarch64_extv2si; break;
9815     case V4SImode: gen = gen_aarch64_extv4si; break;
9816     case V2SFmode: gen = gen_aarch64_extv2sf; break;
9817     case V4SFmode: gen = gen_aarch64_extv4sf; break;
9818     case V2DImode: gen = gen_aarch64_extv2di; break;
9819     case V2DFmode: gen = gen_aarch64_extv2df; break;
9820     default:
9821       return false;
9822     }
9823
9824   /* Success! */
9825   if (d->testing_p)
9826     return true;
9827
9828   /* The case where (location == 0) is a no-op for both big- and little-endian,
9829      and is removed by the mid-end at optimization levels -O1 and higher.  */
9830
9831   if (BYTES_BIG_ENDIAN && (location != 0))
9832     {
9833       /* After setup, we want the high elements of the first vector (stored
9834          at the LSB end of the register), and the low elements of the second
9835          vector (stored at the MSB end of the register). So swap.  */
9836       std::swap (d->op0, d->op1);
9837       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
9838       location = nelt - location;
9839     }
9840
9841   offset = GEN_INT (location);
9842   emit_insn (gen (d->target, d->op0, d->op1, offset));
9843   return true;
9844 }
9845
9846 /* Recognize patterns for the REV insns.  */
9847
9848 static bool
9849 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9850 {
9851   unsigned int i, j, diff, nelt = d->nelt;
9852   rtx (*gen) (rtx, rtx);
9853
9854   if (!d->one_vector_p)
9855     return false;
9856
9857   diff = d->perm[0];
9858   switch (diff)
9859     {
9860     case 7:
9861       switch (d->vmode)
9862         {
9863         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9864         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
9865         default:
9866           return false;
9867         }
9868       break;
9869     case 3:
9870       switch (d->vmode)
9871         {
9872         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9873         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
9874         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
9875         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
9876         default:
9877           return false;
9878         }
9879       break;
9880     case 1:
9881       switch (d->vmode)
9882         {
9883         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9884         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
9885         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
9886         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
9887         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
9888         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
9889         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
9890         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
9891         default:
9892           return false;
9893         }
9894       break;
9895     default:
9896       return false;
9897     }
9898
9899   for (i = 0; i < nelt ; i += diff + 1)
9900     for (j = 0; j <= diff; j += 1)
9901       {
9902         /* This is guaranteed to be true as the value of diff
9903            is 7, 3, 1 and we should have enough elements in the
9904            queue to generate this.  Getting a vector mask with a
9905            value of diff other than these values implies that
9906            something is wrong by the time we get here.  */
9907         gcc_assert (i + j < nelt);
9908         if (d->perm[i + j] != i + diff - j)
9909           return false;
9910       }
9911
9912   /* Success! */
9913   if (d->testing_p)
9914     return true;
9915
9916   emit_insn (gen (d->target, d->op0));
9917   return true;
9918 }
9919
9920 static bool
9921 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9922 {
9923   rtx (*gen) (rtx, rtx, rtx);
9924   rtx out = d->target;
9925   rtx in0;
9926   machine_mode vmode = d->vmode;
9927   unsigned int i, elt, nelt = d->nelt;
9928   rtx lane;
9929
9930   elt = d->perm[0];
9931   for (i = 1; i < nelt; i++)
9932     {
9933       if (elt != d->perm[i])
9934         return false;
9935     }
9936
9937   /* The generic preparation in aarch64_expand_vec_perm_const_1
9938      swaps the operand order and the permute indices if it finds
9939      d->perm[0] to be in the second operand.  Thus, we can always
9940      use d->op0 and need not do any extra arithmetic to get the
9941      correct lane number.  */
9942   in0 = d->op0;
9943   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
9944
9945   switch (vmode)
9946     {
9947     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9948     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9949     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9950     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9951     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9952     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9953     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9954     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9955     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9956     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9957     default:
9958       return false;
9959     }
9960
9961   emit_insn (gen (out, in0, lane));
9962   return true;
9963 }
9964
9965 static bool
9966 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9967 {
9968   rtx rperm[MAX_VECT_LEN], sel;
9969   machine_mode vmode = d->vmode;
9970   unsigned int i, nelt = d->nelt;
9971
9972   if (d->testing_p)
9973     return true;
9974
9975   /* Generic code will try constant permutation twice.  Once with the
9976      original mode and again with the elements lowered to QImode.
9977      So wait and don't do the selector expansion ourselves.  */
9978   if (vmode != V8QImode && vmode != V16QImode)
9979     return false;
9980
9981   for (i = 0; i < nelt; ++i)
9982     {
9983       int nunits = GET_MODE_NUNITS (vmode);
9984
9985       /* If big-endian and two vectors we end up with a weird mixed-endian
9986          mode on NEON.  Reverse the index within each word but not the word
9987          itself.  */
9988       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9989                                            : d->perm[i]);
9990     }
9991   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9992   sel = force_reg (vmode, sel);
9993
9994   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9995   return true;
9996 }
9997
9998 static bool
9999 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10000 {
10001   /* The pattern matching functions above are written to look for a small
10002      number to begin the sequence (0, 1, N/2).  If we begin with an index
10003      from the second operand, we can swap the operands.  */
10004   if (d->perm[0] >= d->nelt)
10005     {
10006       unsigned i, nelt = d->nelt;
10007
10008       gcc_assert (nelt == (nelt & -nelt));
10009       for (i = 0; i < nelt; ++i)
10010         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
10011
10012       std::swap (d->op0, d->op1);
10013     }
10014
10015   if (TARGET_SIMD)
10016     {
10017       if (aarch64_evpc_rev (d))
10018         return true;
10019       else if (aarch64_evpc_ext (d))
10020         return true;
10021       else if (aarch64_evpc_dup (d))
10022         return true;
10023       else if (aarch64_evpc_zip (d))
10024         return true;
10025       else if (aarch64_evpc_uzp (d))
10026         return true;
10027       else if (aarch64_evpc_trn (d))
10028         return true;
10029       return aarch64_evpc_tbl (d);
10030     }
10031   return false;
10032 }
10033
10034 /* Expand a vec_perm_const pattern.  */
10035
10036 bool
10037 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10038 {
10039   struct expand_vec_perm_d d;
10040   int i, nelt, which;
10041
10042   d.target = target;
10043   d.op0 = op0;
10044   d.op1 = op1;
10045
10046   d.vmode = GET_MODE (target);
10047   gcc_assert (VECTOR_MODE_P (d.vmode));
10048   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10049   d.testing_p = false;
10050
10051   for (i = which = 0; i < nelt; ++i)
10052     {
10053       rtx e = XVECEXP (sel, 0, i);
10054       int ei = INTVAL (e) & (2 * nelt - 1);
10055       which |= (ei < nelt ? 1 : 2);
10056       d.perm[i] = ei;
10057     }
10058
10059   switch (which)
10060     {
10061     default:
10062       gcc_unreachable ();
10063
10064     case 3:
10065       d.one_vector_p = false;
10066       if (!rtx_equal_p (op0, op1))
10067         break;
10068
10069       /* The elements of PERM do not suggest that only the first operand
10070          is used, but both operands are identical.  Allow easier matching
10071          of the permutation by folding the permutation into the single
10072          input vector.  */
10073       /* Fall Through.  */
10074     case 2:
10075       for (i = 0; i < nelt; ++i)
10076         d.perm[i] &= nelt - 1;
10077       d.op0 = op1;
10078       d.one_vector_p = true;
10079       break;
10080
10081     case 1:
10082       d.op1 = op0;
10083       d.one_vector_p = true;
10084       break;
10085     }
10086
10087   return aarch64_expand_vec_perm_const_1 (&d);
10088 }
10089
10090 static bool
10091 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10092                                      const unsigned char *sel)
10093 {
10094   struct expand_vec_perm_d d;
10095   unsigned int i, nelt, which;
10096   bool ret;
10097
10098   d.vmode = vmode;
10099   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10100   d.testing_p = true;
10101   memcpy (d.perm, sel, nelt);
10102
10103   /* Calculate whether all elements are in one vector.  */
10104   for (i = which = 0; i < nelt; ++i)
10105     {
10106       unsigned char e = d.perm[i];
10107       gcc_assert (e < 2 * nelt);
10108       which |= (e < nelt ? 1 : 2);
10109     }
10110
10111   /* If all elements are from the second vector, reindex as if from the
10112      first vector.  */
10113   if (which == 2)
10114     for (i = 0; i < nelt; ++i)
10115       d.perm[i] -= nelt;
10116
10117   /* Check whether the mask can be applied to a single vector.  */
10118   d.one_vector_p = (which != 3);
10119
10120   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10121   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10122   if (!d.one_vector_p)
10123     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10124
10125   start_sequence ();
10126   ret = aarch64_expand_vec_perm_const_1 (&d);
10127   end_sequence ();
10128
10129   return ret;
10130 }
10131
10132 rtx
10133 aarch64_reverse_mask (enum machine_mode mode)
10134 {
10135   /* We have to reverse each vector because we dont have
10136      a permuted load that can reverse-load according to ABI rules.  */
10137   rtx mask;
10138   rtvec v = rtvec_alloc (16);
10139   int i, j;
10140   int nunits = GET_MODE_NUNITS (mode);
10141   int usize = GET_MODE_UNIT_SIZE (mode);
10142
10143   gcc_assert (BYTES_BIG_ENDIAN);
10144   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10145
10146   for (i = 0; i < nunits; i++)
10147     for (j = 0; j < usize; j++)
10148       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10149   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10150   return force_reg (V16QImode, mask);
10151 }
10152
10153 /* Implement MODES_TIEABLE_P.  */
10154
10155 bool
10156 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10157 {
10158   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10159     return true;
10160
10161   /* We specifically want to allow elements of "structure" modes to
10162      be tieable to the structure.  This more general condition allows
10163      other rarer situations too.  */
10164   if (TARGET_SIMD
10165       && aarch64_vector_mode_p (mode1)
10166       && aarch64_vector_mode_p (mode2))
10167     return true;
10168
10169   return false;
10170 }
10171
10172 /* Return a new RTX holding the result of moving POINTER forward by
10173    AMOUNT bytes.  */
10174
10175 static rtx
10176 aarch64_move_pointer (rtx pointer, int amount)
10177 {
10178   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10179
10180   return adjust_automodify_address (pointer, GET_MODE (pointer),
10181                                     next, amount);
10182 }
10183
10184 /* Return a new RTX holding the result of moving POINTER forward by the
10185    size of the mode it points to.  */
10186
10187 static rtx
10188 aarch64_progress_pointer (rtx pointer)
10189 {
10190   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10191
10192   return aarch64_move_pointer (pointer, amount);
10193 }
10194
10195 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10196    MODE bytes.  */
10197
10198 static void
10199 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10200                                               machine_mode mode)
10201 {
10202   rtx reg = gen_reg_rtx (mode);
10203
10204   /* "Cast" the pointers to the correct mode.  */
10205   *src = adjust_address (*src, mode, 0);
10206   *dst = adjust_address (*dst, mode, 0);
10207   /* Emit the memcpy.  */
10208   emit_move_insn (reg, *src);
10209   emit_move_insn (*dst, reg);
10210   /* Move the pointers forward.  */
10211   *src = aarch64_progress_pointer (*src);
10212   *dst = aarch64_progress_pointer (*dst);
10213 }
10214
10215 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
10216    we succeed, otherwise return false.  */
10217
10218 bool
10219 aarch64_expand_movmem (rtx *operands)
10220 {
10221   unsigned int n;
10222   rtx dst = operands[0];
10223   rtx src = operands[1];
10224   rtx base;
10225   bool speed_p = !optimize_function_for_size_p (cfun);
10226
10227   /* When optimizing for size, give a better estimate of the length of a
10228      memcpy call, but use the default otherwise.  */
10229   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10230
10231   /* We can't do anything smart if the amount to copy is not constant.  */
10232   if (!CONST_INT_P (operands[2]))
10233     return false;
10234
10235   n = UINTVAL (operands[2]);
10236
10237   /* Try to keep the number of instructions low.  For cases below 16 bytes we
10238      need to make at most two moves.  For cases above 16 bytes it will be one
10239      move for each 16 byte chunk, then at most two additional moves.  */
10240   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10241     return false;
10242
10243   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10244   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10245
10246   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10247   src = adjust_automodify_address (src, VOIDmode, base, 0);
10248
10249   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10250      1-byte chunk.  */
10251   if (n < 4)
10252     {
10253       if (n >= 2)
10254         {
10255           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10256           n -= 2;
10257         }
10258
10259       if (n == 1)
10260         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10261
10262       return true;
10263     }
10264
10265   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
10266      4-byte chunk, partially overlapping with the previously copied chunk.  */
10267   if (n < 8)
10268     {
10269       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10270       n -= 4;
10271       if (n > 0)
10272         {
10273           int move = n - 4;
10274
10275           src = aarch64_move_pointer (src, move);
10276           dst = aarch64_move_pointer (dst, move);
10277           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10278         }
10279       return true;
10280     }
10281
10282   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
10283      them, then (if applicable) an 8-byte chunk.  */
10284   while (n >= 8)
10285     {
10286       if (n / 16)
10287         {
10288           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10289           n -= 16;
10290         }
10291       else
10292         {
10293           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10294           n -= 8;
10295         }
10296     }
10297
10298   /* Finish the final bytes of the copy.  We can always do this in one
10299      instruction.  We either copy the exact amount we need, or partially
10300      overlap with the previous chunk we copied and copy 8-bytes.  */
10301   if (n == 0)
10302     return true;
10303   else if (n == 1)
10304     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10305   else if (n == 2)
10306     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10307   else if (n == 4)
10308     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10309   else
10310     {
10311       if (n == 3)
10312         {
10313           src = aarch64_move_pointer (src, -1);
10314           dst = aarch64_move_pointer (dst, -1);
10315           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10316         }
10317       else
10318         {
10319           int move = n - 8;
10320
10321           src = aarch64_move_pointer (src, move);
10322           dst = aarch64_move_pointer (dst, move);
10323           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10324         }
10325     }
10326
10327   return true;
10328 }
10329
10330 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
10331
10332 static unsigned HOST_WIDE_INT
10333 aarch64_asan_shadow_offset (void)
10334 {
10335   return (HOST_WIDE_INT_1 << 36);
10336 }
10337
10338 static bool
10339 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10340                                         unsigned int align,
10341                                         enum by_pieces_operation op,
10342                                         bool speed_p)
10343 {
10344   /* STORE_BY_PIECES can be used when copying a constant string, but
10345      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10346      For now we always fail this and let the move_by_pieces code copy
10347      the string from read-only memory.  */
10348   if (op == STORE_BY_PIECES)
10349     return false;
10350
10351   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10352 }
10353
10354 static enum machine_mode
10355 aarch64_code_to_ccmode (enum rtx_code code)
10356 {
10357   switch (code)
10358     {
10359     case NE:
10360       return CC_DNEmode;
10361
10362     case EQ:
10363       return CC_DEQmode;
10364
10365     case LE:
10366       return CC_DLEmode;
10367
10368     case LT:
10369       return CC_DLTmode;
10370
10371     case GE:
10372       return CC_DGEmode;
10373
10374     case GT:
10375       return CC_DGTmode;
10376
10377     case LEU:
10378       return CC_DLEUmode;
10379
10380     case LTU:
10381       return CC_DLTUmode;
10382
10383     case GEU:
10384       return CC_DGEUmode;
10385
10386     case GTU:
10387       return CC_DGTUmode;
10388
10389     default:
10390       return CCmode;
10391     }
10392 }
10393
10394 static rtx
10395 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10396                         int code, tree treeop0, tree treeop1)
10397 {
10398   enum machine_mode op_mode, cmp_mode, cc_mode;
10399   rtx op0, op1, cmp, target;
10400   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10401   enum insn_code icode;
10402   struct expand_operand ops[4];
10403
10404   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10405   if (cc_mode == CCmode)
10406     return NULL_RTX;
10407
10408   start_sequence ();
10409   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10410
10411   op_mode = GET_MODE (op0);
10412   if (op_mode == VOIDmode)
10413     op_mode = GET_MODE (op1);
10414
10415   switch (op_mode)
10416     {
10417     case QImode:
10418     case HImode:
10419     case SImode:
10420       cmp_mode = SImode;
10421       icode = CODE_FOR_cmpsi;
10422       break;
10423
10424     case DImode:
10425       cmp_mode = DImode;
10426       icode = CODE_FOR_cmpdi;
10427       break;
10428
10429     default:
10430       end_sequence ();
10431       return NULL_RTX;
10432     }
10433
10434   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10435   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10436   if (!op0 || !op1)
10437     {
10438       end_sequence ();
10439       return NULL_RTX;
10440     }
10441   *prep_seq = get_insns ();
10442   end_sequence ();
10443
10444   cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10445   target = gen_rtx_REG (CCmode, CC_REGNUM);
10446
10447   create_output_operand (&ops[0], target, CCmode);
10448   create_fixed_operand (&ops[1], cmp);
10449   create_fixed_operand (&ops[2], op0);
10450   create_fixed_operand (&ops[3], op1);
10451
10452   start_sequence ();
10453   if (!maybe_expand_insn (icode, 4, ops))
10454     {
10455       end_sequence ();
10456       return NULL_RTX;
10457     }
10458   *gen_seq = get_insns ();
10459   end_sequence ();
10460
10461   return gen_rtx_REG (cc_mode, CC_REGNUM);
10462 }
10463
10464 static rtx
10465 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10466                        tree treeop0, tree treeop1, int bit_code)
10467 {
10468   rtx op0, op1, cmp0, cmp1, target;
10469   enum machine_mode op_mode, cmp_mode, cc_mode;
10470   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10471   enum insn_code icode = CODE_FOR_ccmp_andsi;
10472   struct expand_operand ops[6];
10473
10474   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10475   if (cc_mode == CCmode)
10476     return NULL_RTX;
10477
10478   push_to_sequence ((rtx_insn*) *prep_seq);
10479   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10480
10481   op_mode = GET_MODE (op0);
10482   if (op_mode == VOIDmode)
10483     op_mode = GET_MODE (op1);
10484
10485   switch (op_mode)
10486     {
10487     case QImode:
10488     case HImode:
10489     case SImode:
10490       cmp_mode = SImode;
10491       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10492                                                 : CODE_FOR_ccmp_iorsi;
10493       break;
10494
10495     case DImode:
10496       cmp_mode = DImode;
10497       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10498                                                 : CODE_FOR_ccmp_iordi;
10499       break;
10500
10501     default:
10502       end_sequence ();
10503       return NULL_RTX;
10504     }
10505
10506   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10507   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10508   if (!op0 || !op1)
10509     {
10510       end_sequence ();
10511       return NULL_RTX;
10512     }
10513   *prep_seq = get_insns ();
10514   end_sequence ();
10515
10516   target = gen_rtx_REG (cc_mode, CC_REGNUM);
10517   cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10518   cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10519
10520   create_fixed_operand (&ops[0], prev);
10521   create_fixed_operand (&ops[1], target);
10522   create_fixed_operand (&ops[2], op0);
10523   create_fixed_operand (&ops[3], op1);
10524   create_fixed_operand (&ops[4], cmp0);
10525   create_fixed_operand (&ops[5], cmp1);
10526
10527   push_to_sequence ((rtx_insn*) *gen_seq);
10528   if (!maybe_expand_insn (icode, 6, ops))
10529     {
10530       end_sequence ();
10531       return NULL_RTX;
10532     }
10533
10534   *gen_seq = get_insns ();
10535   end_sequence ();
10536
10537   return target;
10538 }
10539
10540 #undef TARGET_GEN_CCMP_FIRST
10541 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10542
10543 #undef TARGET_GEN_CCMP_NEXT
10544 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10545
10546 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
10547    instruction fusion of some sort.  */
10548
10549 static bool
10550 aarch64_macro_fusion_p (void)
10551 {
10552   return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10553 }
10554
10555
10556 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
10557    should be kept together during scheduling.  */
10558
10559 static bool
10560 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10561 {
10562   rtx set_dest;
10563   rtx prev_set = single_set (prev);
10564   rtx curr_set = single_set (curr);
10565   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
10566   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10567
10568   if (!aarch64_macro_fusion_p ())
10569     return false;
10570
10571   if (simple_sets_p
10572       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10573     {
10574       /* We are trying to match:
10575          prev (mov)  == (set (reg r0) (const_int imm16))
10576          curr (movk) == (set (zero_extract (reg r0)
10577                                            (const_int 16)
10578                                            (const_int 16))
10579                              (const_int imm16_1))  */
10580
10581       set_dest = SET_DEST (curr_set);
10582
10583       if (GET_CODE (set_dest) == ZERO_EXTRACT
10584           && CONST_INT_P (SET_SRC (curr_set))
10585           && CONST_INT_P (SET_SRC (prev_set))
10586           && CONST_INT_P (XEXP (set_dest, 2))
10587           && INTVAL (XEXP (set_dest, 2)) == 16
10588           && REG_P (XEXP (set_dest, 0))
10589           && REG_P (SET_DEST (prev_set))
10590           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10591         {
10592           return true;
10593         }
10594     }
10595
10596   if (simple_sets_p
10597       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10598     {
10599
10600       /*  We're trying to match:
10601           prev (adrp) == (set (reg r1)
10602                               (high (symbol_ref ("SYM"))))
10603           curr (add) == (set (reg r0)
10604                              (lo_sum (reg r1)
10605                                      (symbol_ref ("SYM"))))
10606           Note that r0 need not necessarily be the same as r1, especially
10607           during pre-regalloc scheduling.  */
10608
10609       if (satisfies_constraint_Ush (SET_SRC (prev_set))
10610           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10611         {
10612           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10613               && REG_P (XEXP (SET_SRC (curr_set), 0))
10614               && REGNO (XEXP (SET_SRC (curr_set), 0))
10615                  == REGNO (SET_DEST (prev_set))
10616               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10617                               XEXP (SET_SRC (curr_set), 1)))
10618             return true;
10619         }
10620     }
10621
10622   if (simple_sets_p
10623       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10624     {
10625
10626       /* We're trying to match:
10627          prev (movk) == (set (zero_extract (reg r0)
10628                                            (const_int 16)
10629                                            (const_int 32))
10630                              (const_int imm16_1))
10631          curr (movk) == (set (zero_extract (reg r0)
10632                                            (const_int 16)
10633                                            (const_int 48))
10634                              (const_int imm16_2))  */
10635
10636       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10637           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10638           && REG_P (XEXP (SET_DEST (prev_set), 0))
10639           && REG_P (XEXP (SET_DEST (curr_set), 0))
10640           && REGNO (XEXP (SET_DEST (prev_set), 0))
10641              == REGNO (XEXP (SET_DEST (curr_set), 0))
10642           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10643           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10644           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10645           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10646           && CONST_INT_P (SET_SRC (prev_set))
10647           && CONST_INT_P (SET_SRC (curr_set)))
10648         return true;
10649
10650     }
10651   if (simple_sets_p
10652       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10653     {
10654       /* We're trying to match:
10655           prev (adrp) == (set (reg r0)
10656                               (high (symbol_ref ("SYM"))))
10657           curr (ldr) == (set (reg r1)
10658                              (mem (lo_sum (reg r0)
10659                                              (symbol_ref ("SYM")))))
10660                  or
10661           curr (ldr) == (set (reg r1)
10662                              (zero_extend (mem
10663                                            (lo_sum (reg r0)
10664                                                    (symbol_ref ("SYM"))))))  */
10665       if (satisfies_constraint_Ush (SET_SRC (prev_set))
10666           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10667         {
10668           rtx curr_src = SET_SRC (curr_set);
10669
10670           if (GET_CODE (curr_src) == ZERO_EXTEND)
10671             curr_src = XEXP (curr_src, 0);
10672
10673           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10674               && REG_P (XEXP (XEXP (curr_src, 0), 0))
10675               && REGNO (XEXP (XEXP (curr_src, 0), 0))
10676                  == REGNO (SET_DEST (prev_set))
10677               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10678                               XEXP (SET_SRC (prev_set), 0)))
10679               return true;
10680         }
10681     }
10682
10683   if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10684       && any_condjump_p (curr))
10685     {
10686       enum attr_type prev_type = get_attr_type (prev);
10687
10688       /* FIXME: this misses some which is considered simple arthematic
10689          instructions for ThunderX.  Simple shifts are missed here.  */
10690       if (prev_type == TYPE_ALUS_SREG
10691           || prev_type == TYPE_ALUS_IMM
10692           || prev_type == TYPE_LOGICS_REG
10693           || prev_type == TYPE_LOGICS_IMM)
10694         return true;
10695     }
10696
10697   return false;
10698 }
10699
10700 /* If MEM is in the form of [base+offset], extract the two parts
10701    of address and set to BASE and OFFSET, otherwise return false
10702    after clearing BASE and OFFSET.  */
10703
10704 bool
10705 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10706 {
10707   rtx addr;
10708
10709   gcc_assert (MEM_P (mem));
10710
10711   addr = XEXP (mem, 0);
10712
10713   if (REG_P (addr))
10714     {
10715       *base = addr;
10716       *offset = const0_rtx;
10717       return true;
10718     }
10719
10720   if (GET_CODE (addr) == PLUS
10721       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10722     {
10723       *base = XEXP (addr, 0);
10724       *offset = XEXP (addr, 1);
10725       return true;
10726     }
10727
10728   *base = NULL_RTX;
10729   *offset = NULL_RTX;
10730
10731   return false;
10732 }
10733
10734 /* Types for scheduling fusion.  */
10735 enum sched_fusion_type
10736 {
10737   SCHED_FUSION_NONE = 0,
10738   SCHED_FUSION_LD_SIGN_EXTEND,
10739   SCHED_FUSION_LD_ZERO_EXTEND,
10740   SCHED_FUSION_LD,
10741   SCHED_FUSION_ST,
10742   SCHED_FUSION_NUM
10743 };
10744
10745 /* If INSN is a load or store of address in the form of [base+offset],
10746    extract the two parts and set to BASE and OFFSET.  Return scheduling
10747    fusion type this INSN is.  */
10748
10749 static enum sched_fusion_type
10750 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10751 {
10752   rtx x, dest, src;
10753   enum sched_fusion_type fusion = SCHED_FUSION_LD;
10754
10755   gcc_assert (INSN_P (insn));
10756   x = PATTERN (insn);
10757   if (GET_CODE (x) != SET)
10758     return SCHED_FUSION_NONE;
10759
10760   src = SET_SRC (x);
10761   dest = SET_DEST (x);
10762
10763   if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10764       && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10765     return SCHED_FUSION_NONE;
10766
10767   if (GET_CODE (src) == SIGN_EXTEND)
10768     {
10769       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10770       src = XEXP (src, 0);
10771       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10772         return SCHED_FUSION_NONE;
10773     }
10774   else if (GET_CODE (src) == ZERO_EXTEND)
10775     {
10776       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10777       src = XEXP (src, 0);
10778       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10779         return SCHED_FUSION_NONE;
10780     }
10781
10782   if (GET_CODE (src) == MEM && REG_P (dest))
10783     extract_base_offset_in_addr (src, base, offset);
10784   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10785     {
10786       fusion = SCHED_FUSION_ST;
10787       extract_base_offset_in_addr (dest, base, offset);
10788     }
10789   else
10790     return SCHED_FUSION_NONE;
10791
10792   if (*base == NULL_RTX || *offset == NULL_RTX)
10793     fusion = SCHED_FUSION_NONE;
10794
10795   return fusion;
10796 }
10797
10798 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10799
10800    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
10801    and PRI are only calculated for these instructions.  For other instruction,
10802    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
10803    type instruction fusion can be added by returning different priorities.
10804
10805    It's important that irrelevant instructions get the largest FUSION_PRI.  */
10806
10807 static void
10808 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10809                                int *fusion_pri, int *pri)
10810 {
10811   int tmp, off_val;
10812   rtx base, offset;
10813   enum sched_fusion_type fusion;
10814
10815   gcc_assert (INSN_P (insn));
10816
10817   tmp = max_pri - 1;
10818   fusion = fusion_load_store (insn, &base, &offset);
10819   if (fusion == SCHED_FUSION_NONE)
10820     {
10821       *pri = tmp;
10822       *fusion_pri = tmp;
10823       return;
10824     }
10825
10826   /* Set FUSION_PRI according to fusion type and base register.  */
10827   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10828
10829   /* Calculate PRI.  */
10830   tmp /= 2;
10831
10832   /* INSN with smaller offset goes first.  */
10833   off_val = (int)(INTVAL (offset));
10834   if (off_val >= 0)
10835     tmp -= (off_val & 0xfffff);
10836   else
10837     tmp += ((- off_val) & 0xfffff);
10838
10839   *pri = tmp;
10840   return;
10841 }
10842
10843 /* Given OPERANDS of consecutive load/store, check if we can merge
10844    them into ldp/stp.  LOAD is true if they are load instructions.
10845    MODE is the mode of memory operands.  */
10846
10847 bool
10848 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10849                                 enum machine_mode mode)
10850 {
10851   HOST_WIDE_INT offval_1, offval_2, msize;
10852   enum reg_class rclass_1, rclass_2;
10853   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10854
10855   if (load)
10856     {
10857       mem_1 = operands[1];
10858       mem_2 = operands[3];
10859       reg_1 = operands[0];
10860       reg_2 = operands[2];
10861       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10862       if (REGNO (reg_1) == REGNO (reg_2))
10863         return false;
10864     }
10865   else
10866     {
10867       mem_1 = operands[0];
10868       mem_2 = operands[2];
10869       reg_1 = operands[1];
10870       reg_2 = operands[3];
10871     }
10872
10873   /* The mems cannot be volatile.  */
10874   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10875     return false;
10876
10877   /* Check if the addresses are in the form of [base+offset].  */
10878   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10879   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10880     return false;
10881   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10882   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10883     return false;
10884
10885   /* Check if the bases are same.  */
10886   if (!rtx_equal_p (base_1, base_2))
10887     return false;
10888
10889   offval_1 = INTVAL (offset_1);
10890   offval_2 = INTVAL (offset_2);
10891   msize = GET_MODE_SIZE (mode);
10892   /* Check if the offsets are consecutive.  */
10893   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10894     return false;
10895
10896   /* Check if the addresses are clobbered by load.  */
10897   if (load)
10898     {
10899       if (reg_mentioned_p (reg_1, mem_1))
10900         return false;
10901
10902       /* In increasing order, the last load can clobber the address.  */
10903       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10904       return false;
10905     }
10906
10907   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10908     rclass_1 = FP_REGS;
10909   else
10910     rclass_1 = GENERAL_REGS;
10911
10912   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10913     rclass_2 = FP_REGS;
10914   else
10915     rclass_2 = GENERAL_REGS;
10916
10917   /* Check if the registers are of same class.  */
10918   if (rclass_1 != rclass_2)
10919     return false;
10920
10921   return true;
10922 }
10923
10924 /* Given OPERANDS of consecutive load/store, check if we can merge
10925    them into ldp/stp by adjusting the offset.  LOAD is true if they
10926    are load instructions.  MODE is the mode of memory operands.
10927
10928    Given below consecutive stores:
10929
10930      str  w1, [xb, 0x100]
10931      str  w1, [xb, 0x104]
10932      str  w1, [xb, 0x108]
10933      str  w1, [xb, 0x10c]
10934
10935    Though the offsets are out of the range supported by stp, we can
10936    still pair them after adjusting the offset, like:
10937
10938      add  scratch, xb, 0x100
10939      stp  w1, w1, [scratch]
10940      stp  w1, w1, [scratch, 0x8]
10941
10942    The peephole patterns detecting this opportunity should guarantee
10943    the scratch register is avaliable.  */
10944
10945 bool
10946 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
10947                                        enum machine_mode mode)
10948 {
10949   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
10950   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
10951   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
10952   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
10953
10954   if (load)
10955     {
10956       reg_1 = operands[0];
10957       mem_1 = operands[1];
10958       reg_2 = operands[2];
10959       mem_2 = operands[3];
10960       reg_3 = operands[4];
10961       mem_3 = operands[5];
10962       reg_4 = operands[6];
10963       mem_4 = operands[7];
10964       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
10965                   && REG_P (reg_3) && REG_P (reg_4));
10966       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
10967         return false;
10968     }
10969   else
10970     {
10971       mem_1 = operands[0];
10972       reg_1 = operands[1];
10973       mem_2 = operands[2];
10974       reg_2 = operands[3];
10975       mem_3 = operands[4];
10976       reg_3 = operands[5];
10977       mem_4 = operands[6];
10978       reg_4 = operands[7];
10979     }
10980   /* Skip if memory operand is by itslef valid for ldp/stp.  */
10981   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
10982     return false;
10983
10984   /* The mems cannot be volatile.  */
10985   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
10986       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
10987     return false;
10988
10989   /* Check if the addresses are in the form of [base+offset].  */
10990   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10991   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10992     return false;
10993   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10994   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10995     return false;
10996   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
10997   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
10998     return false;
10999   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11000   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11001     return false;
11002
11003   /* Check if the bases are same.  */
11004   if (!rtx_equal_p (base_1, base_2)
11005       || !rtx_equal_p (base_2, base_3)
11006       || !rtx_equal_p (base_3, base_4))
11007     return false;
11008
11009   offval_1 = INTVAL (offset_1);
11010   offval_2 = INTVAL (offset_2);
11011   offval_3 = INTVAL (offset_3);
11012   offval_4 = INTVAL (offset_4);
11013   msize = GET_MODE_SIZE (mode);
11014   /* Check if the offsets are consecutive.  */
11015   if ((offval_1 != (offval_2 + msize)
11016        || offval_1 != (offval_3 + msize * 2)
11017        || offval_1 != (offval_4 + msize * 3))
11018       && (offval_4 != (offval_3 + msize)
11019           || offval_4 != (offval_2 + msize * 2)
11020           || offval_4 != (offval_1 + msize * 3)))
11021     return false;
11022
11023   /* Check if the addresses are clobbered by load.  */
11024   if (load)
11025     {
11026       if (reg_mentioned_p (reg_1, mem_1)
11027           || reg_mentioned_p (reg_2, mem_2)
11028           || reg_mentioned_p (reg_3, mem_3))
11029         return false;
11030
11031       /* In increasing order, the last load can clobber the address.  */
11032       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11033         return false;
11034     }
11035
11036   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11037     rclass_1 = FP_REGS;
11038   else
11039     rclass_1 = GENERAL_REGS;
11040
11041   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11042     rclass_2 = FP_REGS;
11043   else
11044     rclass_2 = GENERAL_REGS;
11045
11046   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11047     rclass_3 = FP_REGS;
11048   else
11049     rclass_3 = GENERAL_REGS;
11050
11051   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11052     rclass_4 = FP_REGS;
11053   else
11054     rclass_4 = GENERAL_REGS;
11055
11056   /* Check if the registers are of same class.  */
11057   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11058     return false;
11059
11060   return true;
11061 }
11062
11063 /* Given OPERANDS of consecutive load/store, this function pairs them
11064    into ldp/stp after adjusting the offset.  It depends on the fact
11065    that addresses of load/store instructions are in increasing order.
11066    MODE is the mode of memory operands.  CODE is the rtl operator
11067    which should be applied to all memory operands, it's SIGN_EXTEND,
11068    ZERO_EXTEND or UNKNOWN.  */
11069
11070 bool
11071 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11072                              enum machine_mode mode, RTX_CODE code)
11073 {
11074   rtx base, offset, t1, t2;
11075   rtx mem_1, mem_2, mem_3, mem_4;
11076   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11077
11078   if (load)
11079     {
11080       mem_1 = operands[1];
11081       mem_2 = operands[3];
11082       mem_3 = operands[5];
11083       mem_4 = operands[7];
11084     }
11085   else
11086     {
11087       mem_1 = operands[0];
11088       mem_2 = operands[2];
11089       mem_3 = operands[4];
11090       mem_4 = operands[6];
11091       gcc_assert (code == UNKNOWN);
11092     }
11093
11094   extract_base_offset_in_addr (mem_1, &base, &offset);
11095   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11096
11097   /* Adjust offset thus it can fit in ldp/stp instruction.  */
11098   msize = GET_MODE_SIZE (mode);
11099   stp_off_limit = msize * 0x40;
11100   off_val = INTVAL (offset);
11101   abs_off = (off_val < 0) ? -off_val : off_val;
11102   new_off = abs_off % stp_off_limit;
11103   adj_off = abs_off - new_off;
11104
11105   /* Further adjust to make sure all offsets are OK.  */
11106   if ((new_off + msize * 2) >= stp_off_limit)
11107     {
11108       adj_off += stp_off_limit;
11109       new_off -= stp_off_limit;
11110     }
11111
11112   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
11113   if (adj_off >= 0x1000)
11114     return false;
11115
11116   if (off_val < 0)
11117     {
11118       adj_off = -adj_off;
11119       new_off = -new_off;
11120     }
11121
11122   /* Create new memory references.  */
11123   mem_1 = change_address (mem_1, VOIDmode,
11124                           plus_constant (DImode, operands[8], new_off));
11125
11126   /* Check if the adjusted address is OK for ldp/stp.  */
11127   if (!aarch64_mem_pair_operand (mem_1, mode))
11128     return false;
11129
11130   msize = GET_MODE_SIZE (mode);
11131   mem_2 = change_address (mem_2, VOIDmode,
11132                           plus_constant (DImode,
11133                                          operands[8],
11134                                          new_off + msize));
11135   mem_3 = change_address (mem_3, VOIDmode,
11136                           plus_constant (DImode,
11137                                          operands[8],
11138                                          new_off + msize * 2));
11139   mem_4 = change_address (mem_4, VOIDmode,
11140                           plus_constant (DImode,
11141                                          operands[8],
11142                                          new_off + msize * 3));
11143
11144   if (code == ZERO_EXTEND)
11145     {
11146       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11147       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11148       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11149       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11150     }
11151   else if (code == SIGN_EXTEND)
11152     {
11153       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11154       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11155       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11156       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11157     }
11158
11159   if (load)
11160     {
11161       operands[1] = mem_1;
11162       operands[3] = mem_2;
11163       operands[5] = mem_3;
11164       operands[7] = mem_4;
11165     }
11166   else
11167     {
11168       operands[0] = mem_1;
11169       operands[2] = mem_2;
11170       operands[4] = mem_3;
11171       operands[6] = mem_4;
11172     }
11173
11174   /* Emit adjusting instruction.  */
11175   emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11176                           plus_constant (DImode, base, adj_off)));
11177   /* Emit ldp/stp instructions.  */
11178   t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11179   t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11180   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11181   t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11182   t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11183   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11184   return true;
11185 }
11186
11187 #undef TARGET_ADDRESS_COST
11188 #define TARGET_ADDRESS_COST aarch64_address_cost
11189
11190 /* This hook will determines whether unnamed bitfields affect the alignment
11191    of the containing structure.  The hook returns true if the structure
11192    should inherit the alignment requirements of an unnamed bitfield's
11193    type.  */
11194 #undef TARGET_ALIGN_ANON_BITFIELD
11195 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11196
11197 #undef TARGET_ASM_ALIGNED_DI_OP
11198 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11199
11200 #undef TARGET_ASM_ALIGNED_HI_OP
11201 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11202
11203 #undef TARGET_ASM_ALIGNED_SI_OP
11204 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11205
11206 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11207 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11208   hook_bool_const_tree_hwi_hwi_const_tree_true
11209
11210 #undef TARGET_ASM_FILE_START
11211 #define TARGET_ASM_FILE_START aarch64_start_file
11212
11213 #undef TARGET_ASM_OUTPUT_MI_THUNK
11214 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11215
11216 #undef TARGET_ASM_SELECT_RTX_SECTION
11217 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11218
11219 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11220 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11221
11222 #undef TARGET_BUILD_BUILTIN_VA_LIST
11223 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11224
11225 #undef TARGET_CALLEE_COPIES
11226 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11227
11228 #undef TARGET_CAN_ELIMINATE
11229 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11230
11231 #undef TARGET_CANNOT_FORCE_CONST_MEM
11232 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11233
11234 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11235 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11236
11237 /* Only the least significant bit is used for initialization guard
11238    variables.  */
11239 #undef TARGET_CXX_GUARD_MASK_BIT
11240 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11241
11242 #undef TARGET_C_MODE_FOR_SUFFIX
11243 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11244
11245 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11246 #undef  TARGET_DEFAULT_TARGET_FLAGS
11247 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11248 #endif
11249
11250 #undef TARGET_CLASS_MAX_NREGS
11251 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11252
11253 #undef TARGET_BUILTIN_DECL
11254 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11255
11256 #undef  TARGET_EXPAND_BUILTIN
11257 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11258
11259 #undef TARGET_EXPAND_BUILTIN_VA_START
11260 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11261
11262 #undef TARGET_FOLD_BUILTIN
11263 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11264
11265 #undef TARGET_FUNCTION_ARG
11266 #define TARGET_FUNCTION_ARG aarch64_function_arg
11267
11268 #undef TARGET_FUNCTION_ARG_ADVANCE
11269 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11270
11271 #undef TARGET_FUNCTION_ARG_BOUNDARY
11272 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11273
11274 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11275 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11276
11277 #undef TARGET_FUNCTION_VALUE
11278 #define TARGET_FUNCTION_VALUE aarch64_function_value
11279
11280 #undef TARGET_FUNCTION_VALUE_REGNO_P
11281 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11282
11283 #undef TARGET_FRAME_POINTER_REQUIRED
11284 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11285
11286 #undef TARGET_GIMPLE_FOLD_BUILTIN
11287 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11288
11289 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11290 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11291
11292 #undef  TARGET_INIT_BUILTINS
11293 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
11294
11295 #undef TARGET_LEGITIMATE_ADDRESS_P
11296 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11297
11298 #undef TARGET_LEGITIMATE_CONSTANT_P
11299 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11300
11301 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11302 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11303
11304 #undef TARGET_LRA_P
11305 #define TARGET_LRA_P hook_bool_void_true
11306
11307 #undef TARGET_MANGLE_TYPE
11308 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11309
11310 #undef TARGET_MEMORY_MOVE_COST
11311 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11312
11313 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11314 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11315
11316 #undef TARGET_MUST_PASS_IN_STACK
11317 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11318
11319 /* This target hook should return true if accesses to volatile bitfields
11320    should use the narrowest mode possible.  It should return false if these
11321    accesses should use the bitfield container type.  */
11322 #undef TARGET_NARROW_VOLATILE_BITFIELD
11323 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11324
11325 #undef  TARGET_OPTION_OVERRIDE
11326 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11327
11328 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11329 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11330   aarch64_override_options_after_change
11331
11332 #undef TARGET_PASS_BY_REFERENCE
11333 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11334
11335 #undef TARGET_PREFERRED_RELOAD_CLASS
11336 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11337
11338 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11339 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11340
11341 #undef TARGET_SECONDARY_RELOAD
11342 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11343
11344 #undef TARGET_SHIFT_TRUNCATION_MASK
11345 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11346
11347 #undef TARGET_SETUP_INCOMING_VARARGS
11348 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11349
11350 #undef TARGET_STRUCT_VALUE_RTX
11351 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
11352
11353 #undef TARGET_REGISTER_MOVE_COST
11354 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11355
11356 #undef TARGET_RETURN_IN_MEMORY
11357 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11358
11359 #undef TARGET_RETURN_IN_MSB
11360 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11361
11362 #undef TARGET_RTX_COSTS
11363 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11364
11365 #undef TARGET_SCHED_ISSUE_RATE
11366 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11367
11368 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11369 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11370   aarch64_sched_first_cycle_multipass_dfa_lookahead
11371
11372 #undef TARGET_TRAMPOLINE_INIT
11373 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11374
11375 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11376 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11377
11378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11379 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11380
11381 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11382 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11383
11384 #undef TARGET_VECTORIZE_ADD_STMT_COST
11385 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11386
11387 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11388 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11389   aarch64_builtin_vectorization_cost
11390
11391 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11392 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11393
11394 #undef TARGET_VECTORIZE_BUILTINS
11395 #define TARGET_VECTORIZE_BUILTINS
11396
11397 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11398 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11399   aarch64_builtin_vectorized_function
11400
11401 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11402 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11403   aarch64_autovectorize_vector_sizes
11404
11405 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11406 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11407   aarch64_atomic_assign_expand_fenv
11408
11409 /* Section anchor support.  */
11410
11411 #undef TARGET_MIN_ANCHOR_OFFSET
11412 #define TARGET_MIN_ANCHOR_OFFSET -256
11413
11414 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11415    byte offset; we can do much more for larger data types, but have no way
11416    to determine the size of the access.  We assume accesses are aligned.  */
11417 #undef TARGET_MAX_ANCHOR_OFFSET
11418 #define TARGET_MAX_ANCHOR_OFFSET 4095
11419
11420 #undef TARGET_VECTOR_ALIGNMENT
11421 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11422
11423 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11424 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11425   aarch64_simd_vector_alignment_reachable
11426
11427 /* vec_perm support.  */
11428
11429 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11430 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11431   aarch64_vectorize_vec_perm_const_ok
11432
11433
11434 #undef TARGET_FIXED_CONDITION_CODE_REGS
11435 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11436
11437 #undef TARGET_FLAGS_REGNUM
11438 #define TARGET_FLAGS_REGNUM CC_REGNUM
11439
11440 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11441 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11442
11443 #undef TARGET_ASAN_SHADOW_OFFSET
11444 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11445
11446 #undef TARGET_LEGITIMIZE_ADDRESS
11447 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11448
11449 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11450 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11451   aarch64_use_by_pieces_infrastructure_p
11452
11453 #undef TARGET_CAN_USE_DOLOOP_P
11454 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11455
11456 #undef TARGET_SCHED_MACRO_FUSION_P
11457 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11458
11459 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11460 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11461
11462 #undef TARGET_SCHED_FUSION_PRIORITY
11463 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11464
11465 struct gcc_target targetm = TARGET_INITIALIZER;
11466
11467 #include "gt-aarch64.h"