gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "toplev.h"
59 #include "target.h"
60 #include "target-def.h"
61 #include "targhooks.h"
62 #include "ggc.h"
63 #include "input.h"
64 #include "function.h"
65 #include "tm_p.h"
66 #include "recog.h"
67 #include "langhooks.h"
68 #include "diagnostic-core.h"
69 #include "hash-table.h"
70 #include "tree-ssa-alias.h"
71 #include "internal-fn.h"
72 #include "gimple-fold.h"
73 #include "tree-eh.h"
74 #include "gimple-expr.h"
75 #include "is-a.h"
76 #include "gimple.h"
77 #include "gimplify.h"
78 #include "optabs.h"
79 #include "dwarf2.h"
80 #include "cfgloop.h"
81 #include "tree-vectorizer.h"
82 #include "aarch64-cost-tables.h"
83 #include "dumpfile.h"
84 #include "builtins.h"
85 #include "rtl-iter.h"
86 #include "tm-constrs.h"
88 /* Defined for convenience. */
89 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
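/* For example, POINTER_BYTES is 8 under LP64 and 4 under ILP32, since
   POINTER_SIZE is 64 or 32 bits respectively.  */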
91 /* Classifies an address.
93 ADDRESS_REG_IMM
94 A simple base register plus immediate offset.
96 ADDRESS_REG_WB
97 A base register indexed by immediate offset with writeback.
99 ADDRESS_REG_REG
100 A base register indexed by (optionally scaled) register.
102 ADDRESS_REG_UXTW
103 A base register indexed by (optionally scaled) zero-extended register.
105 ADDRESS_REG_SXTW
106 A base register indexed by (optionally scaled) sign-extended register.
108 ADDRESS_LO_SUM
109 A LO_SUM rtx with a base register and "LO12" symbol relocation.
111 ADDRESS_SYMBOLIC:
112 A constant symbolic address, in pc-relative literal pool. */
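/* As a rough illustration (the assembly shapes below are indicative only),
   the classes above correspond to operand forms such as:

     ADDRESS_REG_IMM    [x1, #16]
     ADDRESS_REG_WB     [x1, #16]!  or  [x1], #16
     ADDRESS_REG_REG    [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW   [x1, w2, sxtw #2]
     ADDRESS_LO_SUM     [x1, #:lo12:symbol]
     ADDRESS_SYMBOLIC   a PC-relative literal-pool reference  */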
114 enum aarch64_address_type {
115 ADDRESS_REG_IMM,
116 ADDRESS_REG_WB,
117 ADDRESS_REG_REG,
118 ADDRESS_REG_UXTW,
119 ADDRESS_REG_SXTW,
120 ADDRESS_LO_SUM,
121 ADDRESS_SYMBOLIC
124 struct aarch64_address_info {
125 enum aarch64_address_type type;
126 rtx base;
127 rtx offset;
128 int shift;
129 enum aarch64_symbol_type symbol_type;
132 struct simd_immediate_info
134 rtx value;
135 int shift;
136 int element_width;
137 bool mvn;
138 bool msl;
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
144 #ifdef HAVE_AS_TLS
145 #undef TARGET_HAVE_TLS
146 #define TARGET_HAVE_TLS 1
147 #endif
149 static bool aarch64_lra_p (void);
150 static bool aarch64_composite_type_p (const_tree, machine_mode);
151 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
152 const_tree,
153 machine_mode *, int *,
154 bool *);
155 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
156 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
157 static void aarch64_override_options_after_change (void);
158 static bool aarch64_vector_mode_supported_p (machine_mode);
159 static unsigned bit_count (unsigned HOST_WIDE_INT);
160 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
161 const unsigned char *sel);
162 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 /* Major revision number of the ARM Architecture implemented by the target. */
165 unsigned aarch64_architecture_version;
167 /* The processor for which instructions should be scheduled. */
168 enum aarch64_processor aarch64_tune = cortexa53;
170 /* The current tuning set. */
171 const struct tune_params *aarch64_tune_params;
173 /* Mask to specify which instructions we are allowed to generate. */
174 unsigned long aarch64_isa_flags = 0;
176 /* Mask to specify which instruction scheduling options should be used. */
177 unsigned long aarch64_tune_flags = 0;
179 /* Tuning parameters. */
181 #if HAVE_DESIGNATED_INITIALIZERS
182 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
183 #else
184 #define NAMED_PARAM(NAME, VAL) (VAL)
185 #endif
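/* For example, NAMED_PARAM (hi, 1) expands to ".hi = (1)" when designated
   initializers are available, and to plain "(1)" otherwise, in which case
   the initializers below must be kept in field-declaration order.  */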
187 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
188 __extension__
189 #endif
191 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
192 __extension__
193 #endif
194 static const struct cpu_addrcost_table generic_addrcost_table =
196 #if HAVE_DESIGNATED_INITIALIZERS
197 .addr_scale_costs =
198 #endif
200 NAMED_PARAM (hi, 0),
201 NAMED_PARAM (si, 0),
202 NAMED_PARAM (di, 0),
203 NAMED_PARAM (ti, 0),
205 NAMED_PARAM (pre_modify, 0),
206 NAMED_PARAM (post_modify, 0),
207 NAMED_PARAM (register_offset, 0),
208 NAMED_PARAM (register_extend, 0),
209 NAMED_PARAM (imm_offset, 0)
212 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
213 __extension__
214 #endif
215 static const struct cpu_addrcost_table cortexa57_addrcost_table =
217 #if HAVE_DESIGNATED_INITIALIZERS
218 .addr_scale_costs =
219 #endif
221 NAMED_PARAM (hi, 1),
222 NAMED_PARAM (si, 0),
223 NAMED_PARAM (di, 0),
224 NAMED_PARAM (ti, 1),
226 NAMED_PARAM (pre_modify, 0),
227 NAMED_PARAM (post_modify, 0),
228 NAMED_PARAM (register_offset, 0),
229 NAMED_PARAM (register_extend, 0),
230 NAMED_PARAM (imm_offset, 0),
233 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
234 __extension__
235 #endif
236 static const struct cpu_regmove_cost generic_regmove_cost =
238 NAMED_PARAM (GP2GP, 1),
239 /* Avoid the use of slow int<->fp moves for spilling by setting
240 their cost higher than memmov_cost. */
241 NAMED_PARAM (GP2FP, 5),
242 NAMED_PARAM (FP2GP, 5),
243 NAMED_PARAM (FP2FP, 2)
246 static const struct cpu_regmove_cost cortexa57_regmove_cost =
248 NAMED_PARAM (GP2GP, 1),
249 /* Avoid the use of slow int<->fp moves for spilling by setting
250 their cost higher than memmov_cost. */
251 NAMED_PARAM (GP2FP, 5),
252 NAMED_PARAM (FP2GP, 5),
253 NAMED_PARAM (FP2FP, 2)
256 static const struct cpu_regmove_cost cortexa53_regmove_cost =
258 NAMED_PARAM (GP2GP, 1),
259 /* Avoid the use of slow int<->fp moves for spilling by setting
260 their cost higher than memmov_cost. */
261 NAMED_PARAM (GP2FP, 5),
262 NAMED_PARAM (FP2GP, 5),
263 NAMED_PARAM (FP2FP, 2)
266 static const struct cpu_regmove_cost thunderx_regmove_cost =
268 NAMED_PARAM (GP2GP, 2),
269 NAMED_PARAM (GP2FP, 2),
270 NAMED_PARAM (FP2GP, 6),
271 NAMED_PARAM (FP2FP, 4)
274 /* Generic costs for vector insn classes. */
275 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
276 __extension__
277 #endif
278 static const struct cpu_vector_cost generic_vector_cost =
280 NAMED_PARAM (scalar_stmt_cost, 1),
281 NAMED_PARAM (scalar_load_cost, 1),
282 NAMED_PARAM (scalar_store_cost, 1),
283 NAMED_PARAM (vec_stmt_cost, 1),
284 NAMED_PARAM (vec_to_scalar_cost, 1),
285 NAMED_PARAM (scalar_to_vec_cost, 1),
286 NAMED_PARAM (vec_align_load_cost, 1),
287 NAMED_PARAM (vec_unalign_load_cost, 1),
288 NAMED_PARAM (vec_unalign_store_cost, 1),
289 NAMED_PARAM (vec_store_cost, 1),
290 NAMED_PARAM (cond_taken_branch_cost, 3),
291 NAMED_PARAM (cond_not_taken_branch_cost, 1)
294 /* Costs for vector insn classes for Cortex-A57. */
295 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
296 __extension__
297 #endif
298 static const struct cpu_vector_cost cortexa57_vector_cost =
300 NAMED_PARAM (scalar_stmt_cost, 1),
301 NAMED_PARAM (scalar_load_cost, 4),
302 NAMED_PARAM (scalar_store_cost, 1),
303 NAMED_PARAM (vec_stmt_cost, 3),
304 NAMED_PARAM (vec_to_scalar_cost, 8),
305 NAMED_PARAM (scalar_to_vec_cost, 8),
306 NAMED_PARAM (vec_align_load_cost, 5),
307 NAMED_PARAM (vec_unalign_load_cost, 5),
308 NAMED_PARAM (vec_unalign_store_cost, 1),
309 NAMED_PARAM (vec_store_cost, 1),
310 NAMED_PARAM (cond_taken_branch_cost, 1),
311 NAMED_PARAM (cond_not_taken_branch_cost, 1)
314 #define AARCH64_FUSE_NOTHING (0)
315 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
316 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
317 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
318 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
319 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
321 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
322 __extension__
323 #endif
324 static const struct tune_params generic_tunings =
326 &cortexa57_extra_costs,
327 &generic_addrcost_table,
328 &generic_regmove_cost,
329 &generic_vector_cost,
330 NAMED_PARAM (memmov_cost, 4),
331 NAMED_PARAM (issue_rate, 2),
332 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_NOTHING),
333 8, /* function_align. */
334 8, /* jump_align. */
335 4, /* loop_align. */
336 2, /* int_reassoc_width. */
337 4, /* fp_reassoc_width. */
338 1 /* vec_reassoc_width. */
341 static const struct tune_params cortexa53_tunings =
343 &cortexa53_extra_costs,
344 &generic_addrcost_table,
345 &cortexa53_regmove_cost,
346 &generic_vector_cost,
347 NAMED_PARAM (memmov_cost, 4),
348 NAMED_PARAM (issue_rate, 2),
349 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
350 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR)),
351 8, /* function_align. */
352 8, /* jump_align. */
353 4, /* loop_align. */
354 2, /* int_reassoc_width. */
355 4, /* fp_reassoc_width. */
356 1 /* vec_reassoc_width. */
359 static const struct tune_params cortexa57_tunings =
361 &cortexa57_extra_costs,
362 &cortexa57_addrcost_table,
363 &cortexa57_regmove_cost,
364 &cortexa57_vector_cost,
365 NAMED_PARAM (memmov_cost, 4),
366 NAMED_PARAM (issue_rate, 3),
367 NAMED_PARAM (fuseable_ops, (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK)),
368 16, /* function_align. */
369 8, /* jump_align. */
370 4, /* loop_align. */
371 2, /* int_reassoc_width. */
372 4, /* fp_reassoc_width. */
373 1 /* vec_reassoc_width. */
376 static const struct tune_params thunderx_tunings =
378 &thunderx_extra_costs,
379 &generic_addrcost_table,
380 &thunderx_regmove_cost,
381 &generic_vector_cost,
382 NAMED_PARAM (memmov_cost, 6),
383 NAMED_PARAM (issue_rate, 2),
384 NAMED_PARAM (fuseable_ops, AARCH64_FUSE_CMP_BRANCH),
385 8, /* function_align. */
386 8, /* jump_align. */
387 8, /* loop_align. */
388 2, /* int_reassoc_width. */
389 4, /* fp_reassoc_width. */
390 1 /* vec_reassoc_width. */
393 /* A processor implementing AArch64. */
394 struct processor
396 const char *const name;
397 enum aarch64_processor core;
398 const char *arch;
399 unsigned architecture_version;
400 const unsigned long flags;
401 const struct tune_params *const tune;
404 /* Processor cores implementing AArch64. */
405 static const struct processor all_cores[] =
407 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS) \
408 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
409 #include "aarch64-cores.def"
410 #undef AARCH64_CORE
411 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
412 {NULL, aarch64_none, NULL, 0, 0, NULL}
415 /* Architectures implementing AArch64. */
416 static const struct processor all_architectures[] =
418 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
419 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
420 #include "aarch64-arches.def"
421 #undef AARCH64_ARCH
422 {NULL, aarch64_none, NULL, 0, 0, NULL}
425 /* Target specification. These are populated as command-line arguments
426 are processed, or NULL if not specified. */
427 static const struct processor *selected_arch;
428 static const struct processor *selected_cpu;
429 static const struct processor *selected_tune;
431 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
433 /* An ISA extension in the co-processor and main instruction set space. */
434 struct aarch64_option_extension
436 const char *const name;
437 const unsigned long flags_on;
438 const unsigned long flags_off;
441 /* ISA extensions in AArch64. */
442 static const struct aarch64_option_extension all_extensions[] =
444 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
445 {NAME, FLAGS_ON, FLAGS_OFF},
446 #include "aarch64-option-extensions.def"
447 #undef AARCH64_OPT_EXTENSION
448 {NULL, 0, 0}
451 /* Used to track the size of an address when generating a pre/post
452 increment address. */
453 static machine_mode aarch64_memory_reference_mode;
455 /* Used to force GTY into this file. */
456 static GTY(()) int gty_dummy;
458 /* A table of valid AArch64 "bitmask immediate" values for
459 logical instructions. */
461 #define AARCH64_NUM_BITMASKS 5334
462 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
464 typedef enum aarch64_cond_code
466 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
467 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
468 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
470 aarch64_cc;
472 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
474 /* The condition codes of the processor, and the inverse function. */
475 static const char * const aarch64_condition_codes[] =
477 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
478 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
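/* AARCH64_INVERSE_CONDITION_CODE flips the low bit of the encoding; for
   example AARCH64_EQ ("eq") inverts to AARCH64_NE ("ne") and AARCH64_GE
   ("ge") to AARCH64_LT ("lt").  */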
481 static unsigned int
482 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
484 return 2;
487 static int
488 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
489 enum machine_mode mode)
491 if (VECTOR_MODE_P (mode))
492 return aarch64_tune_params->vec_reassoc_width;
493 if (INTEGRAL_MODE_P (mode))
494 return aarch64_tune_params->int_reassoc_width;
495 if (FLOAT_MODE_P (mode))
496 return aarch64_tune_params->fp_reassoc_width;
497 return 1;
500 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
501 unsigned
502 aarch64_dbx_register_number (unsigned regno)
504 if (GP_REGNUM_P (regno))
505 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
506 else if (regno == SP_REGNUM)
507 return AARCH64_DWARF_SP;
508 else if (FP_REGNUM_P (regno))
509 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
511 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
512 equivalent DWARF register. */
513 return DWARF_FRAME_REGISTERS;
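/* For example, x5 maps to DWARF register 5 (AARCH64_DWARF_R0 is 0), the
   stack pointer to 31, and v3 to DWARF register 67 (AARCH64_DWARF_V0 is 64),
   following the AArch64 DWARF register numbering.  */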
516 /* Return TRUE if MODE is any of the large INT modes. */
517 static bool
518 aarch64_vect_struct_mode_p (machine_mode mode)
520 return mode == OImode || mode == CImode || mode == XImode;
523 /* Return TRUE if MODE is any of the vector modes. */
524 static bool
525 aarch64_vector_mode_p (machine_mode mode)
527 return aarch64_vector_mode_supported_p (mode)
528 || aarch64_vect_struct_mode_p (mode);
531 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
532 static bool
533 aarch64_array_mode_supported_p (machine_mode mode,
534 unsigned HOST_WIDE_INT nelems)
536 if (TARGET_SIMD
537 && AARCH64_VALID_SIMD_QREG_MODE (mode)
538 && (nelems >= 2 && nelems <= 4))
539 return true;
541 return false;
544 /* Implement HARD_REGNO_NREGS. */
546 int
547 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
549 switch (aarch64_regno_regclass (regno))
551 case FP_REGS:
552 case FP_LO_REGS:
553 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
554 default:
555 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
557 gcc_unreachable ();
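/* For example, a 16-byte V4SImode value occupies a single FP/SIMD register
   (UNITS_PER_VREG is 16), while the same 16 bytes in TImode need two general
   registers (UNITS_PER_WORD is 8).  */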
560 /* Implement HARD_REGNO_MODE_OK. */
562 int
563 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
565 if (GET_MODE_CLASS (mode) == MODE_CC)
566 return regno == CC_REGNUM;
568 if (regno == SP_REGNUM)
569 /* The purpose of comparing with ptr_mode is to support the
570 global register variable associated with the stack pointer
571 register via the syntax of asm ("wsp") in ILP32. */
572 return mode == Pmode || mode == ptr_mode;
574 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
575 return mode == Pmode;
577 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
578 return 1;
580 if (FP_REGNUM_P (regno))
582 if (aarch64_vect_struct_mode_p (mode))
583 return
584 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
585 else
586 return 1;
589 return 0;
592 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
593 machine_mode
594 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
595 machine_mode mode)
597 /* Handle modes that fit within single registers. */
598 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
600 if (GET_MODE_SIZE (mode) >= 4)
601 return mode;
602 else
603 return SImode;
605 /* Fall back to generic for multi-reg and very large modes. */
606 else
607 return choose_hard_reg_mode (regno, nregs, false);
610 /* Return true if calls to DECL should be treated as
611 long-calls (i.e. called via a register). */
612 static bool
613 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
615 return false;
618 /* Return true if calls to symbol-ref SYM should be treated as
619 long-calls (i.e. called via a register). */
620 bool
621 aarch64_is_long_call_p (rtx sym)
623 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
626 /* Return true if the offsets to a zero/sign-extract operation
627 represent an expression that matches an extend operation. The
628 operands represent the parameters from
630 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
631 bool
632 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
633 rtx extract_imm)
635 HOST_WIDE_INT mult_val, extract_val;
637 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
638 return false;
640 mult_val = INTVAL (mult_imm);
641 extract_val = INTVAL (extract_imm);
643 if (extract_val > 8
644 && extract_val < GET_MODE_BITSIZE (mode)
645 && exact_log2 (extract_val & ~7) > 0
646 && (extract_val & 7) <= 4
647 && mult_val == (1 << (extract_val & 7)))
648 return true;
650 return false;
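/* As a worked example of the check above: mult_imm == 4 with
   extract_imm == 34 is accepted in DImode, since 34 & ~7 == 32 is a power of
   two, 34 & 7 == 2 is at most 4, and 1 << 2 == 4; this is the shape produced
   for a 32-bit value that is extended and then shifted left by two.  */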
653 /* Emit an insn that's a simple single-set. Both the operands must be
654 known to be valid. */
655 inline static rtx
656 emit_set_insn (rtx x, rtx y)
658 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
661 /* X and Y are two things to compare using CODE. Emit the compare insn and
662 return the rtx for register 0 in the proper mode. */
663 rtx
664 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
666 machine_mode mode = SELECT_CC_MODE (code, x, y);
667 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
669 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
670 return cc_reg;
673 /* Build the SYMBOL_REF for __tls_get_addr. */
675 static GTY(()) rtx tls_get_addr_libfunc;
678 aarch64_tls_get_addr (void)
680 if (!tls_get_addr_libfunc)
681 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
682 return tls_get_addr_libfunc;
685 /* Return the TLS model to use for ADDR. */
687 static enum tls_model
688 tls_symbolic_operand_type (rtx addr)
690 enum tls_model tls_kind = TLS_MODEL_NONE;
691 rtx sym, addend;
693 if (GET_CODE (addr) == CONST)
695 split_const (addr, &sym, &addend);
696 if (GET_CODE (sym) == SYMBOL_REF)
697 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
699 else if (GET_CODE (addr) == SYMBOL_REF)
700 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
702 return tls_kind;
705 /* We allow lo_sum expressions in our legitimate addresses
706 so that combine can take care of combining addresses where
707 necessary, but for generation purposes we generate the address
708 as follows:
709 RTL Absolute
710 tmp = hi (symbol_ref); adrp x1, foo
711 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
714 PIC TLS
715 adrp x1, :got:foo adrp tmp, :tlsgd:foo
716 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
717 bl __tls_get_addr
720 Load TLS symbol, depending on TLS mechanism and TLS access model.
722 Global Dynamic - Traditional TLS:
723 adrp tmp, :tlsgd:imm
724 add dest, tmp, #:tlsgd_lo12:imm
725 bl __tls_get_addr
727 Global Dynamic - TLS Descriptors:
728 adrp dest, :tlsdesc:imm
729 ldr tmp, [dest, #:tlsdesc_lo12:imm]
730 add dest, dest, #:tlsdesc_lo12:imm
731 blr tmp
732 mrs tp, tpidr_el0
733 add dest, dest, tp
735 Initial Exec:
736 mrs tp, tpidr_el0
737 adrp tmp, :gottprel:imm
738 ldr dest, [tmp, #:gottprel_lo12:imm]
739 add dest, dest, tp
741 Local Exec:
742 mrs tp, tpidr_el0
743 add t0, tp, #:tprel_hi12:imm
744 add t0, #:tprel_lo12_nc:imm
747 static void
748 aarch64_load_symref_appropriately (rtx dest, rtx imm,
749 enum aarch64_symbol_type type)
751 switch (type)
753 case SYMBOL_SMALL_ABSOLUTE:
755 /* In ILP32, the mode of dest can be either SImode or DImode. */
756 rtx tmp_reg = dest;
757 machine_mode mode = GET_MODE (dest);
759 gcc_assert (mode == Pmode || mode == ptr_mode);
761 if (can_create_pseudo_p ())
762 tmp_reg = gen_reg_rtx (mode);
764 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
765 emit_insn (gen_add_losym (dest, tmp_reg, imm));
766 return;
769 case SYMBOL_TINY_ABSOLUTE:
770 emit_insn (gen_rtx_SET (Pmode, dest, imm));
771 return;
773 case SYMBOL_SMALL_GOT:
775 /* In ILP32, the mode of dest can be either SImode or DImode,
776 while the got entry is always of SImode size. The mode of
777 dest depends on how dest is used: if dest is assigned to a
778 pointer (e.g. in the memory), it has SImode; it may have
779 DImode if dest is dereferenced to access the memory.
780 This is why we have to handle three different ldr_got_small
781 patterns here (two patterns for ILP32). */
782 rtx tmp_reg = dest;
783 machine_mode mode = GET_MODE (dest);
785 if (can_create_pseudo_p ())
786 tmp_reg = gen_reg_rtx (mode);
788 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
789 if (mode == ptr_mode)
791 if (mode == DImode)
792 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
793 else
794 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
796 else
798 gcc_assert (mode == Pmode);
799 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
802 return;
805 case SYMBOL_SMALL_TLSGD:
807 rtx_insn *insns;
808 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
810 start_sequence ();
811 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
812 insns = get_insns ();
813 end_sequence ();
815 RTL_CONST_CALL_P (insns) = 1;
816 emit_libcall_block (insns, dest, result, imm);
817 return;
820 case SYMBOL_SMALL_TLSDESC:
822 machine_mode mode = GET_MODE (dest);
823 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
824 rtx tp;
826 gcc_assert (mode == Pmode || mode == ptr_mode);
828 /* In ILP32, the got entry is always of SImode size. Unlike
829 small GOT, the dest is fixed at reg 0. */
830 if (TARGET_ILP32)
831 emit_insn (gen_tlsdesc_small_si (imm));
832 else
833 emit_insn (gen_tlsdesc_small_di (imm));
834 tp = aarch64_load_tp (NULL);
836 if (mode != Pmode)
837 tp = gen_lowpart (mode, tp);
839 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
840 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
841 return;
844 case SYMBOL_SMALL_GOTTPREL:
846 /* In ILP32, the mode of dest can be either SImode or DImode,
847 while the got entry is always of SImode size. The mode of
848 dest depends on how dest is used: if dest is assigned to a
849 pointer (e.g. in the memory), it has SImode; it may have
850 DImode if dest is dereferenced to access the memory.
851 This is why we have to handle three different tlsie_small
852 patterns here (two patterns for ILP32). */
853 machine_mode mode = GET_MODE (dest);
854 rtx tmp_reg = gen_reg_rtx (mode);
855 rtx tp = aarch64_load_tp (NULL);
857 if (mode == ptr_mode)
859 if (mode == DImode)
860 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
861 else
863 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
864 tp = gen_lowpart (mode, tp);
867 else
869 gcc_assert (mode == Pmode);
870 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_TPREL:
880 rtx tp = aarch64_load_tp (NULL);
881 emit_insn (gen_tlsle_small (dest, tp, imm));
882 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
883 return;
886 case SYMBOL_TINY_GOT:
887 emit_insn (gen_ldr_got_tiny (dest, imm));
888 return;
890 default:
891 gcc_unreachable ();
895 /* Emit a move from SRC to DEST. Assume that the move expanders can
896 handle all moves if !can_create_pseudo_p (). The distinction is
897 important because, unlike emit_move_insn, the move expanders know
898 how to force Pmode objects into the constant pool even when the
899 constant pool address is not itself legitimate. */
900 static rtx
901 aarch64_emit_move (rtx dest, rtx src)
903 return (can_create_pseudo_p ()
904 ? emit_move_insn (dest, src)
905 : emit_move_insn_1 (dest, src));
908 /* Split a 128-bit move operation into two 64-bit move operations,
909 taking care to handle partial overlap of register to register
910 copies. Special cases are needed when moving between GP regs and
911 FP regs. SRC can be a register, constant or memory; DST a register
912 or memory. If either operand is memory it must not have any side
913 effects. */
914 void
915 aarch64_split_128bit_move (rtx dst, rtx src)
917 rtx dst_lo, dst_hi;
918 rtx src_lo, src_hi;
920 machine_mode mode = GET_MODE (dst);
922 gcc_assert (mode == TImode || mode == TFmode);
923 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
924 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
926 if (REG_P (dst) && REG_P (src))
928 int src_regno = REGNO (src);
929 int dst_regno = REGNO (dst);
931 /* Handle FP <-> GP regs. */
932 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
934 src_lo = gen_lowpart (word_mode, src);
935 src_hi = gen_highpart (word_mode, src);
937 if (mode == TImode)
939 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
940 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
942 else
944 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
945 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
947 return;
949 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
951 dst_lo = gen_lowpart (word_mode, dst);
952 dst_hi = gen_highpart (word_mode, dst);
954 if (mode == TImode)
956 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
957 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
959 else
961 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
962 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
964 return;
968 dst_lo = gen_lowpart (word_mode, dst);
969 dst_hi = gen_highpart (word_mode, dst);
970 src_lo = gen_lowpart (word_mode, src);
971 src_hi = gen_highpart_mode (word_mode, mode, src);
973 /* At most one pairing may overlap. */
974 if (reg_overlap_mentioned_p (dst_lo, src_hi))
976 aarch64_emit_move (dst_hi, src_hi);
977 aarch64_emit_move (dst_lo, src_lo);
979 else
981 aarch64_emit_move (dst_lo, src_lo);
982 aarch64_emit_move (dst_hi, src_hi);
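/* For example, splitting a TImode copy from x0/x1 into x1/x2 must move the
   high half first: dst_lo (x1) overlaps src_hi (x1), so writing the low half
   first would clobber the upper word of the source.  */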
986 bool
987 aarch64_split_128bit_move_p (rtx dst, rtx src)
989 return (! REG_P (src)
990 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
993 /* Split a complex SIMD combine. */
995 void
996 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
998 machine_mode src_mode = GET_MODE (src1);
999 machine_mode dst_mode = GET_MODE (dst);
1001 gcc_assert (VECTOR_MODE_P (dst_mode));
1003 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1005 rtx (*gen) (rtx, rtx, rtx);
1007 switch (src_mode)
1009 case V8QImode:
1010 gen = gen_aarch64_simd_combinev8qi;
1011 break;
1012 case V4HImode:
1013 gen = gen_aarch64_simd_combinev4hi;
1014 break;
1015 case V2SImode:
1016 gen = gen_aarch64_simd_combinev2si;
1017 break;
1018 case V2SFmode:
1019 gen = gen_aarch64_simd_combinev2sf;
1020 break;
1021 case DImode:
1022 gen = gen_aarch64_simd_combinedi;
1023 break;
1024 case DFmode:
1025 gen = gen_aarch64_simd_combinedf;
1026 break;
1027 default:
1028 gcc_unreachable ();
1031 emit_insn (gen (dst, src1, src2));
1032 return;
1036 /* Split a complex SIMD move. */
1038 void
1039 aarch64_split_simd_move (rtx dst, rtx src)
1041 machine_mode src_mode = GET_MODE (src);
1042 machine_mode dst_mode = GET_MODE (dst);
1044 gcc_assert (VECTOR_MODE_P (dst_mode));
1046 if (REG_P (dst) && REG_P (src))
1048 rtx (*gen) (rtx, rtx);
1050 gcc_assert (VECTOR_MODE_P (src_mode));
1052 switch (src_mode)
1054 case V16QImode:
1055 gen = gen_aarch64_split_simd_movv16qi;
1056 break;
1057 case V8HImode:
1058 gen = gen_aarch64_split_simd_movv8hi;
1059 break;
1060 case V4SImode:
1061 gen = gen_aarch64_split_simd_movv4si;
1062 break;
1063 case V2DImode:
1064 gen = gen_aarch64_split_simd_movv2di;
1065 break;
1066 case V4SFmode:
1067 gen = gen_aarch64_split_simd_movv4sf;
1068 break;
1069 case V2DFmode:
1070 gen = gen_aarch64_split_simd_movv2df;
1071 break;
1072 default:
1073 gcc_unreachable ();
1076 emit_insn (gen (dst, src));
1077 return;
1081 static rtx
1082 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1084 if (can_create_pseudo_p ())
1085 return force_reg (mode, value);
1086 else
1088 x = aarch64_emit_move (x, value);
1089 return x;
1094 static rtx
1095 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1097 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1099 rtx high;
1100 /* Load the full offset into a register. This
1101 might be improvable in the future. */
1102 high = GEN_INT (offset);
1103 offset = 0;
1104 high = aarch64_force_temporary (mode, temp, high);
1105 reg = aarch64_force_temporary (mode, temp,
1106 gen_rtx_PLUS (mode, high, reg));
1108 return plus_constant (mode, reg, offset);
1111 static int
1112 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1113 machine_mode mode)
1115 unsigned HOST_WIDE_INT mask;
1116 int i;
1117 bool first;
1118 unsigned HOST_WIDE_INT val;
1119 bool subtargets;
1120 rtx subtarget;
1121 int one_match, zero_match, first_not_ffff_match;
1122 int num_insns = 0;
1124 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1126 if (generate)
1127 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1128 num_insns++;
1129 return num_insns;
1132 if (mode == SImode)
1134 /* We know we can't do this in 1 insn, and we must be able to do it
1135 in two; so don't mess around looking for sequences that don't buy
1136 us anything. */
1137 if (generate)
1139 emit_insn (gen_rtx_SET (VOIDmode, dest,
1140 GEN_INT (INTVAL (imm) & 0xffff)));
1141 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1142 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1144 num_insns += 2;
1145 return num_insns;
1148 /* Remaining cases are all for DImode. */
1150 val = INTVAL (imm);
1151 subtargets = optimize && can_create_pseudo_p ();
1153 one_match = 0;
1154 zero_match = 0;
1155 mask = 0xffff;
1156 first_not_ffff_match = -1;
1158 for (i = 0; i < 64; i += 16, mask <<= 16)
1160 if ((val & mask) == mask)
1161 one_match++;
1162 else
1164 if (first_not_ffff_match < 0)
1165 first_not_ffff_match = i;
1166 if ((val & mask) == 0)
1167 zero_match++;
1171 if (one_match == 2)
1173 /* Set one of the quarters and then insert back into result. */
1174 mask = 0xffffll << first_not_ffff_match;
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1178 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1179 GEN_INT ((val >> first_not_ffff_match)
1180 & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 if (zero_match == 2)
1187 goto simple_sequence;
1189 mask = 0x0ffff0000UL;
1190 for (i = 16; i < 64; i += 16, mask <<= 16)
1192 HOST_WIDE_INT comp = mask & ~(mask - 1);
1194 if (aarch64_uimm12_shift (val - (val & mask)))
1196 if (generate)
1198 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1199 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1200 GEN_INT (val & mask)));
1201 emit_insn (gen_adddi3 (dest, subtarget,
1202 GEN_INT (val - (val & mask))));
1204 num_insns += 2;
1205 return num_insns;
1207 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1209 if (generate)
1211 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1212 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1213 GEN_INT ((val + comp) & mask)));
1214 emit_insn (gen_adddi3 (dest, subtarget,
1215 GEN_INT (val - ((val + comp) & mask))));
1217 num_insns += 2;
1218 return num_insns;
1220 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1222 if (generate)
1224 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1225 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1226 GEN_INT ((val - comp) | ~mask)));
1227 emit_insn (gen_adddi3 (dest, subtarget,
1228 GEN_INT (val - ((val - comp) | ~mask))));
1230 num_insns += 2;
1231 return num_insns;
1233 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1235 if (generate)
1237 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1238 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1239 GEN_INT (val | ~mask)));
1240 emit_insn (gen_adddi3 (dest, subtarget,
1241 GEN_INT (val - (val | ~mask))));
1243 num_insns += 2;
1244 return num_insns;
1248 /* See if we can do it by arithmetically combining two
1249 immediates. */
1250 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1252 int j;
1253 mask = 0xffff;
1255 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1256 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1258 if (generate)
1260 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1261 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1262 GEN_INT (aarch64_bitmasks[i])));
1263 emit_insn (gen_adddi3 (dest, subtarget,
1264 GEN_INT (val - aarch64_bitmasks[i])));
1266 num_insns += 2;
1267 return num_insns;
1270 for (j = 0; j < 64; j += 16, mask <<= 16)
1272 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1274 if (generate)
1276 emit_insn (gen_rtx_SET (VOIDmode, dest,
1277 GEN_INT (aarch64_bitmasks[i])));
1278 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1279 GEN_INT ((val >> j) & 0xffff)));
1281 num_insns += 2;
1282 return num_insns;
1287 /* See if we can do it by logically combining two immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1292 int j;
1294 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1295 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1297 if (generate)
1299 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1300 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1301 GEN_INT (aarch64_bitmasks[i])));
1302 emit_insn (gen_iordi3 (dest, subtarget,
1303 GEN_INT (aarch64_bitmasks[j])));
1305 num_insns += 2;
1306 return num_insns;
1309 else if ((val & aarch64_bitmasks[i]) == val)
1311 int j;
1313 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1314 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1316 if (generate)
1318 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1319 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1320 GEN_INT (aarch64_bitmasks[j])));
1321 emit_insn (gen_anddi3 (dest, subtarget,
1322 GEN_INT (aarch64_bitmasks[i])));
1324 num_insns += 2;
1325 return num_insns;
1330 if (one_match > zero_match)
1332 /* Set either first three quarters or all but the third. */
1333 mask = 0xffffll << (16 - first_not_ffff_match);
1334 if (generate)
1335 emit_insn (gen_rtx_SET (VOIDmode, dest,
1336 GEN_INT (val | mask | 0xffffffff00000000ull)));
1337 num_insns ++;
1339 /* Now insert other two quarters. */
1340 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1341 i < 64; i += 16, mask <<= 16)
1343 if ((val & mask) != mask)
1345 if (generate)
1346 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1347 GEN_INT ((val >> i) & 0xffff)));
1348 num_insns ++;
1351 return num_insns;
1354 simple_sequence:
1355 first = true;
1356 mask = 0xffff;
1357 for (i = 0; i < 64; i += 16, mask <<= 16)
1359 if ((val & mask) != 0)
1361 if (first)
1363 if (generate)
1364 emit_insn (gen_rtx_SET (VOIDmode, dest,
1365 GEN_INT (val & mask)));
1366 num_insns ++;
1367 first = false;
1369 else
1371 if (generate)
1372 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1373 GEN_INT ((val >> i) & 0xffff)));
1374 num_insns ++;
1379 return num_insns;
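/* For example, the DImode constant 0x1234000000005678 has two all-zero
   16-bit quarters (zero_match == 2), so the simple sequence above emits two
   instructions: a move of 0x5678 followed by an insertion of 0x1234 at bit
   position 48 (a movz/movk pair).  */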
1383 void
1384 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1386 machine_mode mode = GET_MODE (dest);
1388 gcc_assert (mode == SImode || mode == DImode);
1390 /* Check on what type of symbol it is. */
1391 if (GET_CODE (imm) == SYMBOL_REF
1392 || GET_CODE (imm) == LABEL_REF
1393 || GET_CODE (imm) == CONST)
1395 rtx mem, base, offset;
1396 enum aarch64_symbol_type sty;
1398 /* If we have (const (plus symbol offset)), separate out the offset
1399 before we start classifying the symbol. */
1400 split_const (imm, &base, &offset);
1402 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1403 switch (sty)
1405 case SYMBOL_FORCE_TO_MEM:
1406 if (offset != const0_rtx
1407 && targetm.cannot_force_const_mem (mode, imm))
1409 gcc_assert (can_create_pseudo_p ());
1410 base = aarch64_force_temporary (mode, dest, base);
1411 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1412 aarch64_emit_move (dest, base);
1413 return;
1415 mem = force_const_mem (ptr_mode, imm);
1416 gcc_assert (mem);
1417 if (mode != ptr_mode)
1418 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1419 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1420 return;
1422 case SYMBOL_SMALL_TLSGD:
1423 case SYMBOL_SMALL_TLSDESC:
1424 case SYMBOL_SMALL_GOTTPREL:
1425 case SYMBOL_SMALL_GOT:
1426 case SYMBOL_TINY_GOT:
1427 if (offset != const0_rtx)
1429 gcc_assert(can_create_pseudo_p ());
1430 base = aarch64_force_temporary (mode, dest, base);
1431 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1432 aarch64_emit_move (dest, base);
1433 return;
1435 /* FALLTHRU */
1437 case SYMBOL_SMALL_TPREL:
1438 case SYMBOL_SMALL_ABSOLUTE:
1439 case SYMBOL_TINY_ABSOLUTE:
1440 aarch64_load_symref_appropriately (dest, imm, sty);
1441 return;
1443 default:
1444 gcc_unreachable ();
1448 if (!CONST_INT_P (imm))
1450 if (GET_CODE (imm) == HIGH)
1451 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1452 else
1454 rtx mem = force_const_mem (mode, imm);
1455 gcc_assert (mem);
1456 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1459 return;
1462 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1465 static bool
1466 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1467 tree exp ATTRIBUTE_UNUSED)
1469 /* Currently, always true. */
1470 return true;
1473 /* Implement TARGET_PASS_BY_REFERENCE. */
1475 static bool
1476 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1477 machine_mode mode,
1478 const_tree type,
1479 bool named ATTRIBUTE_UNUSED)
1481 HOST_WIDE_INT size;
1482 machine_mode dummymode;
1483 int nregs;
1485 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1486 size = (mode == BLKmode && type)
1487 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1489 /* Aggregates are passed by reference based on their size. */
1490 if (type && AGGREGATE_TYPE_P (type))
1492 size = int_size_in_bytes (type);
1495 /* Variable sized arguments are always passed by reference. */
1496 if (size < 0)
1497 return true;
1499 /* Can this be a candidate to be passed in fp/simd register(s)? */
1500 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1501 &dummymode, &nregs,
1502 NULL))
1503 return false;
1505 /* Arguments which are variable sized or larger than 2 registers are
1506 passed by reference unless they are a homogeneous floating-point
1507 aggregate. */
1508 return size > 2 * UNITS_PER_WORD;
1511 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1512 static bool
1513 aarch64_return_in_msb (const_tree valtype)
1515 machine_mode dummy_mode;
1516 int dummy_int;
1518 /* Never happens in little-endian mode. */
1519 if (!BYTES_BIG_ENDIAN)
1520 return false;
1522 /* Only composite types smaller than or equal to 16 bytes can
1523 be potentially returned in registers. */
1524 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1525 || int_size_in_bytes (valtype) <= 0
1526 || int_size_in_bytes (valtype) > 16)
1527 return false;
1529 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1530 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1531 is always passed/returned in the least significant bits of fp/simd
1532 register(s). */
1533 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1534 &dummy_mode, &dummy_int, NULL))
1535 return false;
1537 return true;
1540 /* Implement TARGET_FUNCTION_VALUE.
1541 Define how to find the value returned by a function. */
1543 static rtx
1544 aarch64_function_value (const_tree type, const_tree func,
1545 bool outgoing ATTRIBUTE_UNUSED)
1547 machine_mode mode;
1548 int unsignedp;
1549 int count;
1550 machine_mode ag_mode;
1552 mode = TYPE_MODE (type);
1553 if (INTEGRAL_TYPE_P (type))
1554 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1556 if (aarch64_return_in_msb (type))
1558 HOST_WIDE_INT size = int_size_in_bytes (type);
1560 if (size % UNITS_PER_WORD != 0)
1562 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1563 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1567 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1568 &ag_mode, &count, NULL))
1570 if (!aarch64_composite_type_p (type, mode))
1572 gcc_assert (count == 1 && mode == ag_mode);
1573 return gen_rtx_REG (mode, V0_REGNUM);
1575 else
1577 int i;
1578 rtx par;
1580 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1581 for (i = 0; i < count; i++)
1583 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1584 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1585 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1586 XVECEXP (par, 0, i) = tmp;
1588 return par;
1591 else
1592 return gen_rtx_REG (mode, R0_REGNUM);
1595 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1596 Return true if REGNO is the number of a hard register in which the values
1597 of called function may come back. */
1599 static bool
1600 aarch64_function_value_regno_p (const unsigned int regno)
1602 /* Maximum of 16 bytes can be returned in the general registers. Examples
1603 of 16-byte return values are: 128-bit integers and 16-byte small
1604 structures (excluding homogeneous floating-point aggregates). */
1605 if (regno == R0_REGNUM || regno == R1_REGNUM)
1606 return true;
1608 /* Up to four fp/simd registers can return a function value, e.g. a
1609 homogeneous floating-point aggregate having four members. */
1610 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1611 return !TARGET_GENERAL_REGS_ONLY;
1613 return false;
1616 /* Implement TARGET_RETURN_IN_MEMORY.
1618 If the type T of the result of a function is such that
1619 void func (T arg)
1620 would require that arg be passed as a value in a register (or set of
1621 registers) according to the parameter passing rules, then the result
1622 is returned in the same registers as would be used for such an
1623 argument. */
1625 static bool
1626 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1628 HOST_WIDE_INT size;
1629 machine_mode ag_mode;
1630 int count;
1632 if (!AGGREGATE_TYPE_P (type)
1633 && TREE_CODE (type) != COMPLEX_TYPE
1634 && TREE_CODE (type) != VECTOR_TYPE)
1635 /* Simple scalar types are always returned in registers. */
1636 return false;
1638 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1639 type,
1640 &ag_mode,
1641 &count,
1642 NULL))
1643 return false;
1645 /* Types larger than 2 registers are returned in memory. */
1646 size = int_size_in_bytes (type);
1647 return (size < 0 || size > 2 * UNITS_PER_WORD);
1650 static bool
1651 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1652 const_tree type, int *nregs)
1654 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1655 return aarch64_vfp_is_call_or_return_candidate (mode,
1656 type,
1657 &pcum->aapcs_vfp_rmode,
1658 nregs,
1659 NULL);
1662 /* Given MODE and TYPE of a function argument, return the alignment in
1663 bits. The idea is to suppress any stronger alignment requested by
1664 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1665 This is a helper function for local use only. */
1667 static unsigned int
1668 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1670 unsigned int alignment;
1672 if (type)
1674 if (!integer_zerop (TYPE_SIZE (type)))
1676 if (TYPE_MODE (type) == mode)
1677 alignment = TYPE_ALIGN (type);
1678 else
1679 alignment = GET_MODE_ALIGNMENT (mode);
1681 else
1682 alignment = 0;
1684 else
1685 alignment = GET_MODE_ALIGNMENT (mode);
1687 return alignment;
1690 /* Layout a function argument according to the AAPCS64 rules. The rule
1691 numbers refer to the rule numbers in the AAPCS64. */
1693 static void
1694 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1695 const_tree type,
1696 bool named ATTRIBUTE_UNUSED)
1698 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1699 int ncrn, nvrn, nregs;
1700 bool allocate_ncrn, allocate_nvrn;
1701 HOST_WIDE_INT size;
1703 /* We need to do this once per argument. */
1704 if (pcum->aapcs_arg_processed)
1705 return;
1707 pcum->aapcs_arg_processed = true;
1709 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1710 size
1711 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1712 UNITS_PER_WORD);
1714 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1715 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1716 mode,
1717 type,
1718 &nregs);
1720 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1721 The following code thus handles passing by SIMD/FP registers first. */
1723 nvrn = pcum->aapcs_nvrn;
1725 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1726 and homogeneous short-vector aggregates (HVA). */
1727 if (allocate_nvrn)
1729 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1731 pcum->aapcs_nextnvrn = nvrn + nregs;
1732 if (!aarch64_composite_type_p (type, mode))
1734 gcc_assert (nregs == 1);
1735 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1737 else
1739 rtx par;
1740 int i;
1741 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1742 for (i = 0; i < nregs; i++)
1744 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1745 V0_REGNUM + nvrn + i);
1746 tmp = gen_rtx_EXPR_LIST
1747 (VOIDmode, tmp,
1748 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1749 XVECEXP (par, 0, i) = tmp;
1751 pcum->aapcs_reg = par;
1753 return;
1755 else
1757 /* C.3 NSRN is set to 8. */
1758 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1759 goto on_stack;
1763 ncrn = pcum->aapcs_ncrn;
1764 nregs = size / UNITS_PER_WORD;
1766 /* C6 - C9, though the sign and zero extension semantics are
1767 handled elsewhere. This is the case where the argument fits
1768 entirely in general registers. */
1769 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1771 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1773 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1775 /* C.8 if the argument has an alignment of 16 then the NGRN is
1776 rounded up to the next even number. */
1777 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1779 ++ncrn;
1780 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1782 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1783 A reg is still generated for it, but the caller should be smart
1784 enough not to use it. */
1785 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1787 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1789 else
1791 rtx par;
1792 int i;
1794 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1795 for (i = 0; i < nregs; i++)
1797 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1798 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1799 GEN_INT (i * UNITS_PER_WORD));
1800 XVECEXP (par, 0, i) = tmp;
1802 pcum->aapcs_reg = par;
1805 pcum->aapcs_nextncrn = ncrn + nregs;
1806 return;
1809 /* C.11 */
1810 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1812 /* The argument is passed on stack; record the needed number of words for
1813 this argument and align the total size if necessary. */
1814 on_stack:
1815 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1816 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1817 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1818 16 / UNITS_PER_WORD);
1819 return;
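/* For example, after one int argument (so ncrn is 1), a 16-byte struct with
   16-byte alignment is bumped to the next even-numbered register by rule C.8
   above and is therefore passed in x2/x3 rather than x1/x2.  */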
1822 /* Implement TARGET_FUNCTION_ARG. */
1824 static rtx
1825 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1826 const_tree type, bool named)
1828 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1829 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1831 if (mode == VOIDmode)
1832 return NULL_RTX;
1834 aarch64_layout_arg (pcum_v, mode, type, named);
1835 return pcum->aapcs_reg;
1838 void
1839 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1840 const_tree fntype ATTRIBUTE_UNUSED,
1841 rtx libname ATTRIBUTE_UNUSED,
1842 const_tree fndecl ATTRIBUTE_UNUSED,
1843 unsigned n_named ATTRIBUTE_UNUSED)
1845 pcum->aapcs_ncrn = 0;
1846 pcum->aapcs_nvrn = 0;
1847 pcum->aapcs_nextncrn = 0;
1848 pcum->aapcs_nextnvrn = 0;
1849 pcum->pcs_variant = ARM_PCS_AAPCS64;
1850 pcum->aapcs_reg = NULL_RTX;
1851 pcum->aapcs_arg_processed = false;
1852 pcum->aapcs_stack_words = 0;
1853 pcum->aapcs_stack_size = 0;
1855 return;
1858 static void
1859 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1860 machine_mode mode,
1861 const_tree type,
1862 bool named)
1864 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1865 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1867 aarch64_layout_arg (pcum_v, mode, type, named);
1868 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1869 != (pcum->aapcs_stack_words != 0));
1870 pcum->aapcs_arg_processed = false;
1871 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1872 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1873 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1874 pcum->aapcs_stack_words = 0;
1875 pcum->aapcs_reg = NULL_RTX;
1879 bool
1880 aarch64_function_arg_regno_p (unsigned regno)
1882 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1883 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1886 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1887 PARM_BOUNDARY bits of alignment, but will be given anything up
1888 to STACK_BOUNDARY bits if the type requires it. This makes sure
1889 that both before and after the layout of each argument, the Next
1890 Stacked Argument Address (NSAA) will have a minimum alignment of
1891 8 bytes. */
1893 static unsigned int
1894 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1896 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1898 if (alignment < PARM_BOUNDARY)
1899 alignment = PARM_BOUNDARY;
1900 if (alignment > STACK_BOUNDARY)
1901 alignment = STACK_BOUNDARY;
1902 return alignment;
1905 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1907 Return true if an argument passed on the stack should be padded upwards,
1908 i.e. if the least-significant byte of the stack slot has useful data.
1910 Small aggregate types are placed in the lowest memory address.
1912 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1914 bool
1915 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1917 /* On little-endian targets, the least significant byte of every stack
1918 argument is passed at the lowest byte address of the stack slot. */
1919 if (!BYTES_BIG_ENDIAN)
1920 return true;
1922 /* Otherwise, integral, floating-point and pointer types are padded downward:
1923 the least significant byte of a stack argument is passed at the highest
1924 byte address of the stack slot. */
1925 if (type
1926 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1927 || POINTER_TYPE_P (type))
1928 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1929 return false;
1931 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1932 return true;
1935 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1937 It specifies padding for the last (may also be the only)
1938 element of a block move between registers and memory. Assuming
1939 the block is in memory, padding upward means that
1940 the last element is padded after its most significant byte,
1941 while in downward padding, the last element is padded on
1942 its least significant byte side.
1944 Small aggregates and small complex types are always padded
1945 upwards.
1947 We don't need to worry about homogeneous floating-point or
1948 short-vector aggregates; their move is not affected by the
1949 padding direction determined here. Regardless of endianness,
1950 each element of such an aggregate is put in the least
1951 significant bits of a fp/simd register.
1953 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1954 register has useful data, and return the opposite if the most
1955 significant byte does. */
1957 bool
1958 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1959 bool first ATTRIBUTE_UNUSED)
1962 /* Small composite types are always padded upward. */
1963 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1965 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1966 : GET_MODE_SIZE (mode));
1967 if (size < 2 * UNITS_PER_WORD)
1968 return true;
1971 /* Otherwise, use the default padding. */
1972 return !BYTES_BIG_ENDIAN;
1975 static machine_mode
1976 aarch64_libgcc_cmp_return_mode (void)
1978 return SImode;
1981 static bool
1982 aarch64_frame_pointer_required (void)
1984 /* In aarch64_override_options_after_change
1985 flag_omit_leaf_frame_pointer turns off the frame pointer by
1986 default. Turn it back on now if we've not got a leaf
1987 function. */
1988 if (flag_omit_leaf_frame_pointer
1989 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1990 return true;
1992 return false;
1995 /* Mark the registers that need to be saved by the callee and calculate
1996 the size of the callee-saved registers area and frame record (both FP
1997 and LR may be omitted). */
1998 static void
1999 aarch64_layout_frame (void)
2001 HOST_WIDE_INT offset = 0;
2002 int regno;
2004 if (reload_completed && cfun->machine->frame.laid_out)
2005 return;
2007 #define SLOT_NOT_REQUIRED (-2)
2008 #define SLOT_REQUIRED (-1)
2010 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2011 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2013 /* First mark all the registers that really need to be saved... */
2014 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2015 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2017 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2018 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2020 /* ... that includes the eh data registers (if needed)... */
2021 if (crtl->calls_eh_return)
2022 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2023 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2024 = SLOT_REQUIRED;
2026 /* ... and any callee saved register that dataflow says is live. */
2027 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2028 if (df_regs_ever_live_p (regno)
2029 && (regno == R30_REGNUM
2030 || !call_used_regs[regno]))
2031 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2033 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2034 if (df_regs_ever_live_p (regno)
2035 && !call_used_regs[regno])
2036 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2038 if (frame_pointer_needed)
2040 /* FP and LR are placed in the linkage record. */
2041 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2042 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2043 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2044 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2045 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2046 offset += 2 * UNITS_PER_WORD;
2049 /* Now assign stack slots for them. */
2050 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2051 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2053 cfun->machine->frame.reg_offset[regno] = offset;
2054 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2055 cfun->machine->frame.wb_candidate1 = regno;
2056 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2057 cfun->machine->frame.wb_candidate2 = regno;
2058 offset += UNITS_PER_WORD;
2061 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2062 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2064 cfun->machine->frame.reg_offset[regno] = offset;
2065 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2066 cfun->machine->frame.wb_candidate1 = regno;
2067 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2068 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2069 cfun->machine->frame.wb_candidate2 = regno;
2070 offset += UNITS_PER_WORD;
2073 cfun->machine->frame.padding0 =
2074 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2075 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2077 cfun->machine->frame.saved_regs_size = offset;
2079 cfun->machine->frame.hard_fp_offset
2080 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2081 + get_frame_size ()
2082 + cfun->machine->frame.saved_regs_size,
2083 STACK_BOUNDARY / BITS_PER_UNIT);
2085 cfun->machine->frame.frame_size
2086 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2087 + crtl->outgoing_args_size,
2088 STACK_BOUNDARY / BITS_PER_UNIT);
2090 cfun->machine->frame.laid_out = true;
2093 static bool
2094 aarch64_register_saved_on_entry (int regno)
2096 return cfun->machine->frame.reg_offset[regno] >= 0;
2099 static unsigned
2100 aarch64_next_callee_save (unsigned regno, unsigned limit)
2102 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2103 regno ++;
2104 return regno;
2107 static void
2108 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2109 HOST_WIDE_INT adjustment)
2111 rtx base_rtx = stack_pointer_rtx;
2112 rtx insn, reg, mem;
2114 reg = gen_rtx_REG (mode, regno);
2115 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2116 plus_constant (Pmode, base_rtx, -adjustment));
2117 mem = gen_rtx_MEM (mode, mem);
2119 insn = emit_move_insn (mem, reg);
2120 RTX_FRAME_RELATED_P (insn) = 1;
2123 static rtx
2124 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2125 HOST_WIDE_INT adjustment)
2127 switch (mode)
2129 case DImode:
2130 return gen_storewb_pairdi_di (base, base, reg, reg2,
2131 GEN_INT (-adjustment),
2132 GEN_INT (UNITS_PER_WORD - adjustment));
2133 case DFmode:
2134 return gen_storewb_pairdf_di (base, base, reg, reg2,
2135 GEN_INT (-adjustment),
2136 GEN_INT (UNITS_PER_WORD - adjustment));
2137 default:
2138 gcc_unreachable ();
2142 static void
2143 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2144 unsigned regno2, HOST_WIDE_INT adjustment)
2146 rtx_insn *insn;
2147 rtx reg1 = gen_rtx_REG (mode, regno1);
2148 rtx reg2 = gen_rtx_REG (mode, regno2);
2150 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2151 reg2, adjustment));
2152 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2153 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2154 RTX_FRAME_RELATED_P (insn) = 1;
2157 static rtx
2158 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2159 HOST_WIDE_INT adjustment)
2161 switch (mode)
2163 case DImode:
2164 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2165 GEN_INT (UNITS_PER_WORD));
2166 case DFmode:
2167 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2168 GEN_INT (UNITS_PER_WORD));
2169 default:
2170 gcc_unreachable ();
2174 static rtx
2175 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2176 rtx reg2)
2178 switch (mode)
2180 case DImode:
2181 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2183 case DFmode:
2184 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2186 default:
2187 gcc_unreachable ();
2191 static rtx
2192 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2193 rtx mem2)
2195 switch (mode)
2197 case DImode:
2198 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2200 case DFmode:
2201 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2203 default:
2204 gcc_unreachable ();
2209 static void
2210 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2211 unsigned start, unsigned limit, bool skip_wb)
2213 rtx_insn *insn;
2214 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2215 ? gen_frame_mem : gen_rtx_MEM);
2216 unsigned regno;
2217 unsigned regno2;
2219 for (regno = aarch64_next_callee_save (start, limit);
2220 regno <= limit;
2221 regno = aarch64_next_callee_save (regno + 1, limit))
2223 rtx reg, mem;
2224 HOST_WIDE_INT offset;
2226 if (skip_wb
2227 && (regno == cfun->machine->frame.wb_candidate1
2228 || regno == cfun->machine->frame.wb_candidate2))
2229 continue;
2231 reg = gen_rtx_REG (mode, regno);
2232 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2233 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2234 offset));
2236 regno2 = aarch64_next_callee_save (regno + 1, limit);
2238 if (regno2 <= limit
2239 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2240 == cfun->machine->frame.reg_offset[regno2]))
2243 rtx reg2 = gen_rtx_REG (mode, regno2);
2244 rtx mem2;
2246 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2247 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2248 offset));
2249 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2250 reg2));
2252 /* The first part of a frame-related parallel insn is
2253 always assumed to be relevant to the frame
2254 	     calculations; subsequent parts are only
2255 frame-related if explicitly marked. */
2256 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2257 regno = regno2;
2259 else
2260 insn = emit_move_insn (mem, reg);
2262 RTX_FRAME_RELATED_P (insn) = 1;
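/* Illustrative sketch (added commentary, not part of the original
   source): the adjacency test above lets two consecutive callee saves
   share one store-pair instruction.  The register numbers and slot
   offsets below are made-up example values.  */
#if 0
#include <assert.h>

int
main (void)
{
  long long reg_offset_x19 = 16, reg_offset_x20 = 24;	/* hypothetical slots */
  int units_per_word = 8;		/* UNITS_PER_WORD on AArch64 */

  /* Consecutive slots, so the two saves can be merged into a single
     store pair (roughly "stp x19, x20, [sp, #16]").  */
  assert (reg_offset_x19 + units_per_word == reg_offset_x20);
  return 0;
}
#endif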
2266 static void
2267 aarch64_restore_callee_saves (machine_mode mode,
2268 HOST_WIDE_INT start_offset, unsigned start,
2269 unsigned limit, bool skip_wb, rtx *cfi_ops)
2271 rtx base_rtx = stack_pointer_rtx;
2272 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2273 ? gen_frame_mem : gen_rtx_MEM);
2274 unsigned regno;
2275 unsigned regno2;
2276 HOST_WIDE_INT offset;
2278 for (regno = aarch64_next_callee_save (start, limit);
2279 regno <= limit;
2280 regno = aarch64_next_callee_save (regno + 1, limit))
2282 rtx reg, mem;
2284 if (skip_wb
2285 && (regno == cfun->machine->frame.wb_candidate1
2286 || regno == cfun->machine->frame.wb_candidate2))
2287 continue;
2289 reg = gen_rtx_REG (mode, regno);
2290 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2291 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2293 regno2 = aarch64_next_callee_save (regno + 1, limit);
2295 if (regno2 <= limit
2296 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2297 == cfun->machine->frame.reg_offset[regno2]))
2299 rtx reg2 = gen_rtx_REG (mode, regno2);
2300 rtx mem2;
2302 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2303 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2304 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2306 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2307 regno = regno2;
2309 else
2310 emit_move_insn (reg, mem);
2311 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2315 /* AArch64 stack frames generated by this compiler look like:
2317 +-------------------------------+
2319 | incoming stack arguments |
2321 +-------------------------------+
2322 | | <-- incoming stack pointer (aligned)
2323 | callee-allocated save area |
2324 | for register varargs |
2326 +-------------------------------+
2327 | local variables | <-- frame_pointer_rtx
2329 +-------------------------------+
2330 | padding0 | \
2331 +-------------------------------+ |
2332 | callee-saved registers | | frame.saved_regs_size
2333 +-------------------------------+ |
2334 | LR' | |
2335 +-------------------------------+ |
2336 | FP' | / <- hard_frame_pointer_rtx (aligned)
2337 +-------------------------------+
2338 | dynamic allocation |
2339 +-------------------------------+
2340 | padding |
2341 +-------------------------------+
2342 | outgoing stack arguments | <-- arg_pointer
2344 +-------------------------------+
2345 | | <-- stack_pointer_rtx (aligned)
2347 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2348 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2349 unchanged. */
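/* Illustrative worked example (added commentary, not part of the
   original source): with a hypothetical 16 bytes of varargs save area,
   24 bytes of locals, 48 bytes of callee saves (including the FP/LR
   pair) and 32 bytes of outgoing arguments, the two key offsets
   computed by aarch64_layout_frame come out as follows.  round_up
   stands in for AARCH64_ROUND_UP with a 16-byte stack alignment
   (STACK_BOUNDARY / BITS_PER_UNIT).  */
#if 0
#include <stdio.h>

static long long
round_up (long long x, long long align)
{
  return (x + align - 1) & -align;
}

int
main (void)
{
  long long saved_varargs_size = 16;	/* hypothetical */
  long long locals_size = 24;		/* hypothetical get_frame_size () */
  long long saved_regs_size = 48;	/* hypothetical callee saves */
  long long outgoing_args_size = 32;	/* hypothetical */

  long long hard_fp_offset
    = round_up (saved_varargs_size + locals_size + saved_regs_size, 16);
  long long frame_size = round_up (hard_fp_offset + outgoing_args_size, 16);

  /* Prints "hard_fp_offset = 96, frame_size = 128".  */
  printf ("hard_fp_offset = %lld, frame_size = %lld\n",
	  hard_fp_offset, frame_size);
  return 0;
}
#endif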
2351 /* Generate the prologue instructions for entry into a function.
2352 Establish the stack frame by decreasing the stack pointer with a
2353 properly calculated size and, if necessary, create a frame record
2354 filled with the values of LR and previous frame pointer. The
2355 current FP is also set up if it is in use. */
2357 void
2358 aarch64_expand_prologue (void)
2360 /* sub sp, sp, #<frame_size>
2361 stp {fp, lr}, [sp, #<frame_size> - 16]
2362 add fp, sp, #<frame_size> - hardfp_offset
2363 stp {cs_reg}, [fp, #-16] etc.
2365 sub sp, sp, <final_adjustment_if_any>
2367 HOST_WIDE_INT frame_size, offset;
2368 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2369 HOST_WIDE_INT hard_fp_offset;
2370 rtx_insn *insn;
2372 aarch64_layout_frame ();
2374 offset = frame_size = cfun->machine->frame.frame_size;
2375 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2376 fp_offset = frame_size - hard_fp_offset;
2378 if (flag_stack_usage_info)
2379 current_function_static_stack_size = frame_size;
2381   /* Store pairs and load pairs have a range of only -512 to 504.  */
2382 if (offset >= 512)
2384       /* When the frame has a large size, the stack pointer is decreased
2385 	 first to skip over the callee-allocated save area for
2386 	 register varargs, the local variable area and/or the callee-saved
2387 	 register area.  This allows the pre-index write-back
2388 	 store pair instructions to be used for setting up the stack frame
2389 	 efficiently.
2390 offset = hard_fp_offset;
2391 if (offset >= 512)
2392 offset = cfun->machine->frame.saved_regs_size;
2394 frame_size -= (offset + crtl->outgoing_args_size);
2395 fp_offset = 0;
2397 if (frame_size >= 0x1000000)
2399 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2400 emit_move_insn (op0, GEN_INT (-frame_size));
2401 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2403 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2404 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2405 plus_constant (Pmode, stack_pointer_rtx,
2406 -frame_size)));
2407 RTX_FRAME_RELATED_P (insn) = 1;
2409 else if (frame_size > 0)
2411 int hi_ofs = frame_size & 0xfff000;
2412 int lo_ofs = frame_size & 0x000fff;
2414 if (hi_ofs)
2416 insn = emit_insn (gen_add2_insn
2417 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2418 RTX_FRAME_RELATED_P (insn) = 1;
2420 if (lo_ofs)
2422 insn = emit_insn (gen_add2_insn
2423 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2424 RTX_FRAME_RELATED_P (insn) = 1;
2428 else
2429 frame_size = -1;
2431 if (offset > 0)
2433 bool skip_wb = false;
2435 if (frame_pointer_needed)
2437 skip_wb = true;
2439 if (fp_offset)
2441 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2442 GEN_INT (-offset)));
2443 RTX_FRAME_RELATED_P (insn) = 1;
2445 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2446 R30_REGNUM, false);
2448 else
2449 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2451 /* Set up frame pointer to point to the location of the
2452 previous frame pointer on the stack. */
2453 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2454 stack_pointer_rtx,
2455 GEN_INT (fp_offset)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2457 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2459 else
2461 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2462 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2464 if (fp_offset
2465 || reg1 == FIRST_PSEUDO_REGISTER
2466 || (reg2 == FIRST_PSEUDO_REGISTER
2467 && offset >= 256))
2469 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2470 GEN_INT (-offset)));
2471 RTX_FRAME_RELATED_P (insn) = 1;
2473 else
2475 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2477 skip_wb = true;
2479 if (reg2 == FIRST_PSEUDO_REGISTER)
2480 aarch64_pushwb_single_reg (mode1, reg1, offset);
2481 else
2482 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2486 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2487 skip_wb);
2488 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2489 skip_wb);
2492 /* when offset >= 512,
2493 sub sp, sp, #<outgoing_args_size> */
2494 if (frame_size > -1)
2496 if (crtl->outgoing_args_size > 0)
2498 insn = emit_insn (gen_add2_insn
2499 (stack_pointer_rtx,
2500 GEN_INT (- crtl->outgoing_args_size)));
2501 RTX_FRAME_RELATED_P (insn) = 1;
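/* Illustrative sketch (added commentary, not part of the original
   source): the hi_ofs/lo_ofs split used above when the initial stack
   decrement does not fit a single 12-bit immediate.  0x12345 is an
   arbitrary example size.  */
#if 0
#include <assert.h>

int
main (void)
{
  long long frame_size = 0x12345;		/* hypothetical */
  long long hi_ofs = frame_size & 0xfff000;	/* one sub with "lsl #12" */
  long long lo_ofs = frame_size & 0x000fff;	/* one plain 12-bit sub */

  assert (hi_ofs == 0x12000 && lo_ofs == 0x345);
  assert (hi_ofs + lo_ofs == frame_size);
  return 0;
}
#endif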
2506 /* Return TRUE if we can use a simple_return insn.
2508 This function checks whether the callee saved stack is empty, which
2509    means no restore actions are needed.  The pro_and_epilogue pass will use
2510    this to check whether the shrink-wrapping optimization is feasible.  */
2512 bool
2513 aarch64_use_return_insn_p (void)
2515 if (!reload_completed)
2516 return false;
2518 if (crtl->profile)
2519 return false;
2521 aarch64_layout_frame ();
2523 return cfun->machine->frame.frame_size == 0;
2526 /* Generate the epilogue instructions for returning from a function. */
2527 void
2528 aarch64_expand_epilogue (bool for_sibcall)
2530 HOST_WIDE_INT frame_size, offset;
2531 HOST_WIDE_INT fp_offset;
2532 HOST_WIDE_INT hard_fp_offset;
2533 rtx_insn *insn;
2534   /* We need a memory barrier to prevent reads from the deallocated stack.  */
2535 bool need_barrier_p = (get_frame_size () != 0
2536 || cfun->machine->frame.saved_varargs_size);
2538 aarch64_layout_frame ();
2540 offset = frame_size = cfun->machine->frame.frame_size;
2541 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2542 fp_offset = frame_size - hard_fp_offset;
2544   /* Store pairs and load pairs have a range of only -512 to 504.  */
2545 if (offset >= 512)
2547 offset = hard_fp_offset;
2548 if (offset >= 512)
2549 offset = cfun->machine->frame.saved_regs_size;
2551 frame_size -= (offset + crtl->outgoing_args_size);
2552 fp_offset = 0;
2553 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2555 insn = emit_insn (gen_add2_insn
2556 (stack_pointer_rtx,
2557 GEN_INT (crtl->outgoing_args_size)));
2558 RTX_FRAME_RELATED_P (insn) = 1;
2561 else
2562 frame_size = -1;
2564 /* If there were outgoing arguments or we've done dynamic stack
2565 allocation, then restore the stack pointer from the frame
2566 pointer. This is at most one insn and more efficient than using
2567 GCC's internal mechanism. */
2568 if (frame_pointer_needed
2569 && (crtl->outgoing_args_size || cfun->calls_alloca))
2571 if (cfun->calls_alloca)
2572 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2574 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2575 hard_frame_pointer_rtx,
2576 GEN_INT (0)));
2577 offset = offset - fp_offset;
2580 if (offset > 0)
2582 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2583 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2584 bool skip_wb = true;
2585 rtx cfi_ops = NULL;
2587 if (frame_pointer_needed)
2588 fp_offset = 0;
2589 else if (fp_offset
2590 || reg1 == FIRST_PSEUDO_REGISTER
2591 || (reg2 == FIRST_PSEUDO_REGISTER
2592 && offset >= 256))
2593 skip_wb = false;
2595 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2596 skip_wb, &cfi_ops);
2597 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2598 skip_wb, &cfi_ops);
2600 if (need_barrier_p)
2601 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2603 if (skip_wb)
2605 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2606 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2608 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2609 if (reg2 == FIRST_PSEUDO_REGISTER)
2611 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2612 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2613 mem = gen_rtx_MEM (mode1, mem);
2614 insn = emit_move_insn (rreg1, mem);
2616 else
2618 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2620 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2621 insn = emit_insn (aarch64_gen_loadwb_pair
2622 (mode1, stack_pointer_rtx, rreg1,
2623 rreg2, offset));
2626 else
2628 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2629 GEN_INT (offset)));
2632 /* Reset the CFA to be SP + FRAME_SIZE. */
2633 rtx new_cfa = stack_pointer_rtx;
2634 if (frame_size > 0)
2635 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2636 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2637 REG_NOTES (insn) = cfi_ops;
2638 RTX_FRAME_RELATED_P (insn) = 1;
2641 if (frame_size > 0)
2643 if (need_barrier_p)
2644 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2646 if (frame_size >= 0x1000000)
2648 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2649 emit_move_insn (op0, GEN_INT (frame_size));
2650 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2652 else
2654 int hi_ofs = frame_size & 0xfff000;
2655 int lo_ofs = frame_size & 0x000fff;
2657 if (hi_ofs && lo_ofs)
2659 insn = emit_insn (gen_add2_insn
2660 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2661 RTX_FRAME_RELATED_P (insn) = 1;
2662 frame_size = lo_ofs;
2664 insn = emit_insn (gen_add2_insn
2665 (stack_pointer_rtx, GEN_INT (frame_size)));
2668 /* Reset the CFA to be SP + 0. */
2669 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2670 RTX_FRAME_RELATED_P (insn) = 1;
2673 /* Stack adjustment for exception handler. */
2674 if (crtl->calls_eh_return)
2676 /* We need to unwind the stack by the offset computed by
2677 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2678 to be SP; letting the CFA move during this adjustment
2679 is just as correct as retaining the CFA from the body
2680 of the function. Therefore, do nothing special. */
2681 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2684 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2685 if (!for_sibcall)
2686 emit_jump_insn (ret_rtx);
2689 /* Return the place to copy the exception unwinding return address to.
2690    This will probably be a stack slot, but could (in theory) be the
2691    return register.  */
2693 aarch64_final_eh_return_addr (void)
2695 HOST_WIDE_INT fp_offset;
2697 aarch64_layout_frame ();
2699 fp_offset = cfun->machine->frame.frame_size
2700 - cfun->machine->frame.hard_fp_offset;
2702 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2703 return gen_rtx_REG (DImode, LR_REGNUM);
2705 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2706 result in a store to save LR introduced by builtin_eh_return () being
2707 incorrectly deleted because the alias is not detected.
2708 So in the calculation of the address to copy the exception unwinding
2709    return address to, we distinguish two cases.
2710 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2711 we return a SP-relative location since all the addresses are SP-relative
2712 in this case. This prevents the store from being optimized away.
2713 If the fp_offset is not 0, then the addresses will be FP-relative and
2714 therefore we return a FP-relative location. */
2716 if (frame_pointer_needed)
2718 if (fp_offset)
2719 return gen_frame_mem (DImode,
2720 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2721 else
2722 return gen_frame_mem (DImode,
2723 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2726 /* If FP is not needed, we calculate the location of LR, which would be
2727 at the top of the saved registers block. */
2729 return gen_frame_mem (DImode,
2730 plus_constant (Pmode,
2731 stack_pointer_rtx,
2732 fp_offset
2733 + cfun->machine->frame.saved_regs_size
2734 - 2 * UNITS_PER_WORD));
2737 /* Possibly output code to build up a constant in a register. For
2738 the benefit of the costs infrastructure, returns the number of
2739 instructions which would be emitted. GENERATE inhibits or
2740 enables code generation. */
2742 static int
2743 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2745 int insns = 0;
2747 if (aarch64_bitmask_imm (val, DImode))
2749 if (generate)
2750 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2751 insns = 1;
2753 else
2755 int i;
2756 int ncount = 0;
2757 int zcount = 0;
2758 HOST_WIDE_INT valp = val >> 16;
2759 HOST_WIDE_INT valm;
2760 HOST_WIDE_INT tval;
2762 for (i = 16; i < 64; i += 16)
2764 valm = (valp & 0xffff);
2766 if (valm != 0)
2767 ++ zcount;
2769 if (valm != 0xffff)
2770 ++ ncount;
2772 valp >>= 16;
2775 /* zcount contains the number of additional MOVK instructions
2776 required if the constant is built up with an initial MOVZ instruction,
2777 while ncount is the number of MOVK instructions required if starting
2778 with a MOVN instruction. Choose the sequence that yields the fewest
2779 number of instructions, preferring MOVZ instructions when they are both
2780 the same. */
2781 if (ncount < zcount)
2783 if (generate)
2784 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2785 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2786 tval = 0xffff;
2787 insns++;
2789 else
2791 if (generate)
2792 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2793 GEN_INT (val & 0xffff));
2794 tval = 0;
2795 insns++;
2798 val >>= 16;
2800 for (i = 16; i < 64; i += 16)
2802 if ((val & 0xffff) != tval)
2804 if (generate)
2805 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2806 GEN_INT (i),
2807 GEN_INT (val & 0xffff)));
2808 insns++;
2810 val >>= 16;
2813 return insns;
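/* Illustrative sketch (added commentary, not part of the original
   source): a standalone restatement of the MOVZ/MOVN choice made
   above.  Only the three upper 16-bit chunks are counted, since the
   initial MOVZ/MOVN always covers the bottom chunk; the bitmask
   immediate shortcut checked earlier is ignored here.  */
#if 0
#include <stdio.h>

static int
mov_sequence_length (unsigned long long val)
{
  int zcount = 0, ncount = 0;

  for (int i = 16; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;
      if (chunk != 0)
	zcount++;	/* needs a MOVK after an initial MOVZ */
      if (chunk != 0xffff)
	ncount++;	/* needs a MOVK after an initial MOVN */
    }

  return 1 + (ncount < zcount ? ncount : zcount);
}

int
main (void)
{
  /* Prints 2: MOVZ x0, #0x5678; MOVK x0, #0x1234, lsl #16.  */
  printf ("%d\n", mov_sequence_length (0x12345678ull));
  /* Prints 1: MOVN x0, #0x5432 yields 0xffffffffffffabcd.  */
  printf ("%d\n", mov_sequence_length (0xffffffffffffabcdull));
  return 0;
}
#endif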
2816 static void
2817 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2819 HOST_WIDE_INT mdelta = delta;
2820 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2821 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2823 if (mdelta < 0)
2824 mdelta = -mdelta;
2826 if (mdelta >= 4096 * 4096)
2828 (void) aarch64_build_constant (scratchreg, delta, true);
2829 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2831 else if (mdelta > 0)
2833 if (mdelta >= 4096)
2835 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2836 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2837 if (delta < 0)
2838 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2839 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2840 else
2841 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2842 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2844 if (mdelta % 4096 != 0)
2846 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2847 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2848 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
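/* Illustrative sketch (added commentary, not part of the original
   source): how a DELTA between 4096 and 4096 * 4096 is split above
   into a shifted-by-12 part plus a 12-bit remainder.  20000 is an
   arbitrary example value.  */
#if 0
#include <assert.h>

int
main (void)
{
  long long delta = 20000;		  /* hypothetical adjustment */
  long long high = (delta / 4096) << 12;  /* added via the scratch register */
  long long low = delta % 4096;		  /* added as a plain immediate */

  assert (high == 16384 && low == 3616);
  assert (high + low == delta);
  return 0;
}
#endif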
2853 /* Output code to add DELTA to the first argument, and then jump
2854 to FUNCTION. Used for C++ multiple inheritance. */
2855 static void
2856 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2857 HOST_WIDE_INT delta,
2858 HOST_WIDE_INT vcall_offset,
2859 tree function)
2861 /* The this pointer is always in x0. Note that this differs from
2862      Arm where the this pointer may be bumped to r1 if r0 is required
2863 to return a pointer to an aggregate. On AArch64 a result value
2864 pointer will be in x8. */
2865 int this_regno = R0_REGNUM;
2866 rtx this_rtx, temp0, temp1, addr, funexp;
2867 rtx_insn *insn;
2869 reload_completed = 1;
2870 emit_note (NOTE_INSN_PROLOGUE_END);
2872 if (vcall_offset == 0)
2873 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2874 else
2876 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2878 this_rtx = gen_rtx_REG (Pmode, this_regno);
2879 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2880 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2882 addr = this_rtx;
2883 if (delta != 0)
2885 if (delta >= -256 && delta < 256)
2886 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2887 plus_constant (Pmode, this_rtx, delta));
2888 else
2889 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2892 if (Pmode == ptr_mode)
2893 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2894 else
2895 aarch64_emit_move (temp0,
2896 gen_rtx_ZERO_EXTEND (Pmode,
2897 gen_rtx_MEM (ptr_mode, addr)));
2899 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2900 addr = plus_constant (Pmode, temp0, vcall_offset);
2901 else
2903 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2904 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2907 if (Pmode == ptr_mode)
2908 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2909 else
2910 aarch64_emit_move (temp1,
2911 gen_rtx_SIGN_EXTEND (Pmode,
2912 gen_rtx_MEM (ptr_mode, addr)));
2914 emit_insn (gen_add2_insn (this_rtx, temp1));
2917 /* Generate a tail call to the target function. */
2918 if (!TREE_USED (function))
2920 assemble_external (function);
2921 TREE_USED (function) = 1;
2923 funexp = XEXP (DECL_RTL (function), 0);
2924 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2925 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2926 SIBLING_CALL_P (insn) = 1;
2928 insn = get_insns ();
2929 shorten_branches (insn);
2930 final_start_function (insn, file, 1);
2931 final (insn, file, 1);
2932 final_end_function ();
2934 /* Stop pretending to be a post-reload pass. */
2935 reload_completed = 0;
2938 static bool
2939 aarch64_tls_referenced_p (rtx x)
2941 if (!TARGET_HAVE_TLS)
2942 return false;
2943 subrtx_iterator::array_type array;
2944 FOR_EACH_SUBRTX (iter, array, x, ALL)
2946 const_rtx x = *iter;
2947 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2948 return true;
2949 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2950 TLS offsets, not real symbol references. */
2951 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2952 iter.skip_subrtxes ();
2954 return false;
2958 static int
2959 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2961 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2962 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2964 if (*imm1 < *imm2)
2965 return -1;
2966 if (*imm1 > *imm2)
2967 return +1;
2968 return 0;
2972 static void
2973 aarch64_build_bitmask_table (void)
2975 unsigned HOST_WIDE_INT mask, imm;
2976 unsigned int log_e, e, s, r;
2977 unsigned int nimms = 0;
2979 for (log_e = 1; log_e <= 6; log_e++)
2981 e = 1 << log_e;
2982 if (e == 64)
2983 mask = ~(HOST_WIDE_INT) 0;
2984 else
2985 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2986 for (s = 1; s < e; s++)
2988 for (r = 0; r < e; r++)
2990 /* set s consecutive bits to 1 (s < 64) */
2991 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2992 /* rotate right by r */
2993 if (r != 0)
2994 imm = ((imm >> r) | (imm << (e - r))) & mask;
2995 /* replicate the constant depending on SIMD size */
2996 switch (log_e) {
2997 case 1: imm |= (imm << 2);
2998 case 2: imm |= (imm << 4);
2999 case 3: imm |= (imm << 8);
3000 case 4: imm |= (imm << 16);
3001 case 5: imm |= (imm << 32);
3002 case 6:
3003 break;
3004 default:
3005 gcc_unreachable ();
3007 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3008 aarch64_bitmasks[nimms++] = imm;
3013 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3014 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3015 aarch64_bitmasks_cmp);
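/* Illustrative sketch (added commentary, not part of the original
   source): expanding a single (element size E, run length S, rotation
   R) triple into the 64-bit value the table above would record.  The
   triple (8, 3, 1) is an arbitrary example; the doubling loop is
   equivalent to the fall-through switch above for power-of-two element
   sizes.  */
#if 0
#include <stdio.h>

static unsigned long long
expand_bitmask (unsigned int e, unsigned int s, unsigned int r)
{
  unsigned long long mask = (e == 64) ? ~0ull : (1ull << e) - 1;
  unsigned long long imm = (1ull << s) - 1;	/* s consecutive ones */

  if (r != 0)	/* rotate right by r within the element */
    imm = ((imm >> r) | (imm << (e - r))) & mask;

  for (unsigned int w = e; w < 64; w *= 2)	/* replicate across 64 bits */
    imm |= imm << w;

  return imm;
}

int
main (void)
{
  /* Prints 0x8383838383838383: 00000111 rotated right by one becomes
     10000011, then replicated into every byte.  */
  printf ("%#llx\n", expand_bitmask (8, 3, 1));
  return 0;
}
#endif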
3019 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3020 a left shift of 0 or 12 bits. */
3021 bool
3022 aarch64_uimm12_shift (HOST_WIDE_INT val)
3024 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3025 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3030 /* Return true if val is an immediate that can be loaded into a
3031 register by a MOVZ instruction. */
3032 static bool
3033 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3035 if (GET_MODE_SIZE (mode) > 4)
3037 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3038 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3039 return 1;
3041 else
3043 /* Ignore sign extension. */
3044 val &= (HOST_WIDE_INT) 0xffffffff;
3046 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3047 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3051 /* Return true if val is a valid bitmask immediate. */
3052 bool
3053 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3055 if (GET_MODE_SIZE (mode) < 8)
3057 /* Replicate bit pattern. */
3058 val &= (HOST_WIDE_INT) 0xffffffff;
3059 val |= val << 32;
3061 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3062 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3066 /* Return true if val is an immediate that can be loaded into a
3067 register in a single instruction. */
3068 bool
3069 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3071 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3072 return 1;
3073 return aarch64_bitmask_imm (val, mode);
3076 static bool
3077 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3079 rtx base, offset;
3081 if (GET_CODE (x) == HIGH)
3082 return true;
3084 split_const (x, &base, &offset);
3085 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3087 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3088 != SYMBOL_FORCE_TO_MEM)
3089 return true;
3090 else
3091 /* Avoid generating a 64-bit relocation in ILP32; leave
3092 	 it to aarch64_expand_mov_immediate to handle properly.  */
3093 return mode != ptr_mode;
3096 return aarch64_tls_referenced_p (x);
3099 /* Return true if register REGNO is a valid index register.
3100 STRICT_P is true if REG_OK_STRICT is in effect. */
3102 bool
3103 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3105 if (!HARD_REGISTER_NUM_P (regno))
3107 if (!strict_p)
3108 return true;
3110 if (!reg_renumber)
3111 return false;
3113 regno = reg_renumber[regno];
3115 return GP_REGNUM_P (regno);
3118 /* Return true if register REGNO is a valid base register for mode MODE.
3119 STRICT_P is true if REG_OK_STRICT is in effect. */
3121 bool
3122 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3124 if (!HARD_REGISTER_NUM_P (regno))
3126 if (!strict_p)
3127 return true;
3129 if (!reg_renumber)
3130 return false;
3132 regno = reg_renumber[regno];
3135 /* The fake registers will be eliminated to either the stack or
3136 hard frame pointer, both of which are usually valid base registers.
3137 Reload deals with the cases where the eliminated form isn't valid. */
3138 return (GP_REGNUM_P (regno)
3139 || regno == SP_REGNUM
3140 || regno == FRAME_POINTER_REGNUM
3141 || regno == ARG_POINTER_REGNUM);
3144 /* Return true if X is a valid base register for mode MODE.
3145 STRICT_P is true if REG_OK_STRICT is in effect. */
3147 static bool
3148 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3150 if (!strict_p && GET_CODE (x) == SUBREG)
3151 x = SUBREG_REG (x);
3153 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3156 /* Return true if address offset is a valid index. If it is, fill in INFO
3157 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3159 static bool
3160 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3161 machine_mode mode, bool strict_p)
3163 enum aarch64_address_type type;
3164 rtx index;
3165 int shift;
3167 /* (reg:P) */
3168 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3169 && GET_MODE (x) == Pmode)
3171 type = ADDRESS_REG_REG;
3172 index = x;
3173 shift = 0;
3175 /* (sign_extend:DI (reg:SI)) */
3176 else if ((GET_CODE (x) == SIGN_EXTEND
3177 || GET_CODE (x) == ZERO_EXTEND)
3178 && GET_MODE (x) == DImode
3179 && GET_MODE (XEXP (x, 0)) == SImode)
3181 type = (GET_CODE (x) == SIGN_EXTEND)
3182 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3183 index = XEXP (x, 0);
3184 shift = 0;
3186 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3187 else if (GET_CODE (x) == MULT
3188 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3189 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3190 && GET_MODE (XEXP (x, 0)) == DImode
3191 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3192 && CONST_INT_P (XEXP (x, 1)))
3194 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3195 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3196 index = XEXP (XEXP (x, 0), 0);
3197 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3199 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3200 else if (GET_CODE (x) == ASHIFT
3201 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3202 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3203 && GET_MODE (XEXP (x, 0)) == DImode
3204 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3205 && CONST_INT_P (XEXP (x, 1)))
3207 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3208 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3209 index = XEXP (XEXP (x, 0), 0);
3210 shift = INTVAL (XEXP (x, 1));
3212 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3213 else if ((GET_CODE (x) == SIGN_EXTRACT
3214 || GET_CODE (x) == ZERO_EXTRACT)
3215 && GET_MODE (x) == DImode
3216 && GET_CODE (XEXP (x, 0)) == MULT
3217 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3218 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3220 type = (GET_CODE (x) == SIGN_EXTRACT)
3221 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3222 index = XEXP (XEXP (x, 0), 0);
3223 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3224 if (INTVAL (XEXP (x, 1)) != 32 + shift
3225 || INTVAL (XEXP (x, 2)) != 0)
3226 shift = -1;
3228 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3229 (const_int 0xffffffff<<shift)) */
3230 else if (GET_CODE (x) == AND
3231 && GET_MODE (x) == DImode
3232 && GET_CODE (XEXP (x, 0)) == MULT
3233 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3234 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3235 && CONST_INT_P (XEXP (x, 1)))
3237 type = ADDRESS_REG_UXTW;
3238 index = XEXP (XEXP (x, 0), 0);
3239 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3240 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3241 shift = -1;
3243 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3244 else if ((GET_CODE (x) == SIGN_EXTRACT
3245 || GET_CODE (x) == ZERO_EXTRACT)
3246 && GET_MODE (x) == DImode
3247 && GET_CODE (XEXP (x, 0)) == ASHIFT
3248 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3249 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3251 type = (GET_CODE (x) == SIGN_EXTRACT)
3252 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3253 index = XEXP (XEXP (x, 0), 0);
3254 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3255 if (INTVAL (XEXP (x, 1)) != 32 + shift
3256 || INTVAL (XEXP (x, 2)) != 0)
3257 shift = -1;
3259 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3260 (const_int 0xffffffff<<shift)) */
3261 else if (GET_CODE (x) == AND
3262 && GET_MODE (x) == DImode
3263 && GET_CODE (XEXP (x, 0)) == ASHIFT
3264 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3265 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3266 && CONST_INT_P (XEXP (x, 1)))
3268 type = ADDRESS_REG_UXTW;
3269 index = XEXP (XEXP (x, 0), 0);
3270 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3271 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3272 shift = -1;
3274 /* (mult:P (reg:P) (const_int scale)) */
3275 else if (GET_CODE (x) == MULT
3276 && GET_MODE (x) == Pmode
3277 && GET_MODE (XEXP (x, 0)) == Pmode
3278 && CONST_INT_P (XEXP (x, 1)))
3280 type = ADDRESS_REG_REG;
3281 index = XEXP (x, 0);
3282 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3284 /* (ashift:P (reg:P) (const_int shift)) */
3285 else if (GET_CODE (x) == ASHIFT
3286 && GET_MODE (x) == Pmode
3287 && GET_MODE (XEXP (x, 0)) == Pmode
3288 && CONST_INT_P (XEXP (x, 1)))
3290 type = ADDRESS_REG_REG;
3291 index = XEXP (x, 0);
3292 shift = INTVAL (XEXP (x, 1));
3294 else
3295 return false;
3297 if (GET_CODE (index) == SUBREG)
3298 index = SUBREG_REG (index);
3300 if ((shift == 0 ||
3301 (shift > 0 && shift <= 3
3302 && (1 << shift) == GET_MODE_SIZE (mode)))
3303 && REG_P (index)
3304 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3306 info->type = type;
3307 info->offset = index;
3308 info->shift = shift;
3309 return true;
3312 return false;
3315 bool
3316 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3318 return (offset >= -64 * GET_MODE_SIZE (mode)
3319 && offset < 64 * GET_MODE_SIZE (mode)
3320 && offset % GET_MODE_SIZE (mode) == 0);
3323 static inline bool
3324 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3325 HOST_WIDE_INT offset)
3327 return offset >= -256 && offset < 256;
3330 static inline bool
3331 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3333 return (offset >= 0
3334 && offset < 4096 * GET_MODE_SIZE (mode)
3335 && offset % GET_MODE_SIZE (mode) == 0);
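/* Illustrative sketch (added commentary, not part of the original
   source): the concrete ranges the three predicates above accept for
   an 8-byte (DImode) access, i.e. roughly the LDP/STP, unscaled
   LDUR/STUR and scaled LDR/STR immediate ranges.  */
#if 0
#include <assert.h>

static int seven_bit_signed_scaled (long long ofs, int size)
{ return ofs >= -64 * size && ofs < 64 * size && ofs % size == 0; }

static int nine_bit_signed_unscaled (long long ofs)
{ return ofs >= -256 && ofs < 256; }

static int twelve_bit_unsigned_scaled (long long ofs, int size)
{ return ofs >= 0 && ofs < 4096 * size && ofs % size == 0; }

int
main (void)
{
  assert (seven_bit_signed_scaled (-512, 8) && !seven_bit_signed_scaled (512, 8));
  assert (nine_bit_signed_unscaled (255) && !nine_bit_signed_unscaled (256));
  assert (twelve_bit_unsigned_scaled (32760, 8)
	  && !twelve_bit_unsigned_scaled (32768, 8));
  return 0;
}
#endif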
3338 /* Return true if X is a valid address for machine mode MODE. If it is,
3339 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3340 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3342 static bool
3343 aarch64_classify_address (struct aarch64_address_info *info,
3344 rtx x, machine_mode mode,
3345 RTX_CODE outer_code, bool strict_p)
3347 enum rtx_code code = GET_CODE (x);
3348 rtx op0, op1;
3349 bool allow_reg_index_p =
3350 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3351 || aarch64_vector_mode_supported_p (mode));
3352 /* Don't support anything other than POST_INC or REG addressing for
3353 AdvSIMD. */
3354 if (aarch64_vect_struct_mode_p (mode)
3355 && (code != POST_INC && code != REG))
3356 return false;
3358 switch (code)
3360 case REG:
3361 case SUBREG:
3362 info->type = ADDRESS_REG_IMM;
3363 info->base = x;
3364 info->offset = const0_rtx;
3365 return aarch64_base_register_rtx_p (x, strict_p);
3367 case PLUS:
3368 op0 = XEXP (x, 0);
3369 op1 = XEXP (x, 1);
3371 if (! strict_p
3372 && REG_P (op0)
3373 && (op0 == virtual_stack_vars_rtx
3374 || op0 == frame_pointer_rtx
3375 || op0 == arg_pointer_rtx)
3376 && CONST_INT_P (op1))
3378 info->type = ADDRESS_REG_IMM;
3379 info->base = op0;
3380 info->offset = op1;
3382 return true;
3385 if (GET_MODE_SIZE (mode) != 0
3386 && CONST_INT_P (op1)
3387 && aarch64_base_register_rtx_p (op0, strict_p))
3389 HOST_WIDE_INT offset = INTVAL (op1);
3391 info->type = ADDRESS_REG_IMM;
3392 info->base = op0;
3393 info->offset = op1;
3395 /* TImode and TFmode values are allowed in both pairs of X
3396 registers and individual Q registers. The available
3397 address modes are:
3398 X,X: 7-bit signed scaled offset
3399 Q: 9-bit signed offset
3400 We conservatively require an offset representable in either mode.
3402 if (mode == TImode || mode == TFmode)
3403 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3404 && offset_9bit_signed_unscaled_p (mode, offset));
3406 if (outer_code == PARALLEL)
3407 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3408 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3409 else
3410 return (offset_9bit_signed_unscaled_p (mode, offset)
3411 || offset_12bit_unsigned_scaled_p (mode, offset));
3414 if (allow_reg_index_p)
3416 /* Look for base + (scaled/extended) index register. */
3417 if (aarch64_base_register_rtx_p (op0, strict_p)
3418 && aarch64_classify_index (info, op1, mode, strict_p))
3420 info->base = op0;
3421 return true;
3423 if (aarch64_base_register_rtx_p (op1, strict_p)
3424 && aarch64_classify_index (info, op0, mode, strict_p))
3426 info->base = op1;
3427 return true;
3431 return false;
3433 case POST_INC:
3434 case POST_DEC:
3435 case PRE_INC:
3436 case PRE_DEC:
3437 info->type = ADDRESS_REG_WB;
3438 info->base = XEXP (x, 0);
3439 info->offset = NULL_RTX;
3440 return aarch64_base_register_rtx_p (info->base, strict_p);
3442 case POST_MODIFY:
3443 case PRE_MODIFY:
3444 info->type = ADDRESS_REG_WB;
3445 info->base = XEXP (x, 0);
3446 if (GET_CODE (XEXP (x, 1)) == PLUS
3447 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3448 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3449 && aarch64_base_register_rtx_p (info->base, strict_p))
3451 HOST_WIDE_INT offset;
3452 info->offset = XEXP (XEXP (x, 1), 1);
3453 offset = INTVAL (info->offset);
3455 /* TImode and TFmode values are allowed in both pairs of X
3456 registers and individual Q registers. The available
3457 address modes are:
3458 X,X: 7-bit signed scaled offset
3459 Q: 9-bit signed offset
3460 We conservatively require an offset representable in either mode.
3462 if (mode == TImode || mode == TFmode)
3463 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3464 && offset_9bit_signed_unscaled_p (mode, offset));
3466 if (outer_code == PARALLEL)
3467 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3468 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3469 else
3470 return offset_9bit_signed_unscaled_p (mode, offset);
3472 return false;
3474 case CONST:
3475 case SYMBOL_REF:
3476 case LABEL_REF:
3477 /* load literal: pc-relative constant pool entry. Only supported
3478 for SI mode or larger. */
3479 info->type = ADDRESS_SYMBOLIC;
3480 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3482 rtx sym, addend;
3484 split_const (x, &sym, &addend);
3485 return (GET_CODE (sym) == LABEL_REF
3486 || (GET_CODE (sym) == SYMBOL_REF
3487 && CONSTANT_POOL_ADDRESS_P (sym)));
3489 return false;
3491 case LO_SUM:
3492 info->type = ADDRESS_LO_SUM;
3493 info->base = XEXP (x, 0);
3494 info->offset = XEXP (x, 1);
3495 if (allow_reg_index_p
3496 && aarch64_base_register_rtx_p (info->base, strict_p))
3498 rtx sym, offs;
3499 split_const (info->offset, &sym, &offs);
3500 if (GET_CODE (sym) == SYMBOL_REF
3501 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3502 == SYMBOL_SMALL_ABSOLUTE))
3504 /* The symbol and offset must be aligned to the access size. */
3505 unsigned int align;
3506 unsigned int ref_size;
3508 if (CONSTANT_POOL_ADDRESS_P (sym))
3509 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3510 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3512 tree exp = SYMBOL_REF_DECL (sym);
3513 align = TYPE_ALIGN (TREE_TYPE (exp));
3514 align = CONSTANT_ALIGNMENT (exp, align);
3516 else if (SYMBOL_REF_DECL (sym))
3517 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3518 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3519 && SYMBOL_REF_BLOCK (sym) != NULL)
3520 align = SYMBOL_REF_BLOCK (sym)->alignment;
3521 else
3522 align = BITS_PER_UNIT;
3524 ref_size = GET_MODE_SIZE (mode);
3525 if (ref_size == 0)
3526 ref_size = GET_MODE_SIZE (DImode);
3528 return ((INTVAL (offs) & (ref_size - 1)) == 0
3529 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3532 return false;
3534 default:
3535 return false;
3539 bool
3540 aarch64_symbolic_address_p (rtx x)
3542 rtx offset;
3544 split_const (x, &x, &offset);
3545 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3548 /* Classify the base of symbolic expression X, given that X appears in
3549 context CONTEXT. */
3551 enum aarch64_symbol_type
3552 aarch64_classify_symbolic_expression (rtx x,
3553 enum aarch64_symbol_context context)
3555 rtx offset;
3557 split_const (x, &x, &offset);
3558 return aarch64_classify_symbol (x, offset, context);
3562 /* Return TRUE if X is a legitimate address for accessing memory in
3563 mode MODE. */
3564 static bool
3565 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3567 struct aarch64_address_info addr;
3569 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3572 /* Return TRUE if X is a legitimate address for accessing memory in
3573 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3574 pair operation. */
3575 bool
3576 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3577 RTX_CODE outer_code, bool strict_p)
3579 struct aarch64_address_info addr;
3581 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3584 /* Return TRUE if rtx X is immediate constant 0.0 */
3585 bool
3586 aarch64_float_const_zero_rtx_p (rtx x)
3588 REAL_VALUE_TYPE r;
3590 if (GET_MODE (x) == VOIDmode)
3591 return false;
3593 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3594 if (REAL_VALUE_MINUS_ZERO (r))
3595 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3596 return REAL_VALUES_EQUAL (r, dconst0);
3599 /* Return the fixed registers used for condition codes. */
3601 static bool
3602 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3604 *p1 = CC_REGNUM;
3605 *p2 = INVALID_REGNUM;
3606 return true;
3609 /* Emit call insn with PAT and do aarch64-specific handling. */
3611 void
3612 aarch64_emit_call_insn (rtx pat)
3614 rtx insn = emit_call_insn (pat);
3616 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3617 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3618 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3621 machine_mode
3622 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3624 /* All floating point compares return CCFP if it is an equality
3625 comparison, and CCFPE otherwise. */
3626 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3628 switch (code)
3630 case EQ:
3631 case NE:
3632 case UNORDERED:
3633 case ORDERED:
3634 case UNLT:
3635 case UNLE:
3636 case UNGT:
3637 case UNGE:
3638 case UNEQ:
3639 case LTGT:
3640 return CCFPmode;
3642 case LT:
3643 case LE:
3644 case GT:
3645 case GE:
3646 return CCFPEmode;
3648 default:
3649 gcc_unreachable ();
3653 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3654 && y == const0_rtx
3655 && (code == EQ || code == NE || code == LT || code == GE)
3656 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3657 || GET_CODE (x) == NEG))
3658 return CC_NZmode;
3660 /* A compare with a shifted operand. Because of canonicalization,
3661 the comparison will have to be swapped when we emit the assembly
3662 code. */
3663 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3664 && (REG_P (y) || GET_CODE (y) == SUBREG)
3665 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3666 || GET_CODE (x) == LSHIFTRT
3667 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3668 return CC_SWPmode;
3670 /* Similarly for a negated operand, but we can only do this for
3671 equalities. */
3672 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3673 && (REG_P (y) || GET_CODE (y) == SUBREG)
3674 && (code == EQ || code == NE)
3675 && GET_CODE (x) == NEG)
3676 return CC_Zmode;
3678 /* A compare of a mode narrower than SI mode against zero can be done
3679 by extending the value in the comparison. */
3680 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3681 && y == const0_rtx)
3682 /* Only use sign-extension if we really need it. */
3683 return ((code == GT || code == GE || code == LE || code == LT)
3684 ? CC_SESWPmode : CC_ZESWPmode);
3686 /* For everything else, return CCmode. */
3687 return CCmode;
3690 static int
3691 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3694 aarch64_get_condition_code (rtx x)
3696 machine_mode mode = GET_MODE (XEXP (x, 0));
3697 enum rtx_code comp_code = GET_CODE (x);
3699 if (GET_MODE_CLASS (mode) != MODE_CC)
3700 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3701 return aarch64_get_condition_code_1 (mode, comp_code);
3704 static int
3705 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3707 int ne = -1, eq = -1;
3708 switch (mode)
3710 case CCFPmode:
3711 case CCFPEmode:
3712 switch (comp_code)
3714 case GE: return AARCH64_GE;
3715 case GT: return AARCH64_GT;
3716 case LE: return AARCH64_LS;
3717 case LT: return AARCH64_MI;
3718 case NE: return AARCH64_NE;
3719 case EQ: return AARCH64_EQ;
3720 case ORDERED: return AARCH64_VC;
3721 case UNORDERED: return AARCH64_VS;
3722 case UNLT: return AARCH64_LT;
3723 case UNLE: return AARCH64_LE;
3724 case UNGT: return AARCH64_HI;
3725 case UNGE: return AARCH64_PL;
3726 default: return -1;
3728 break;
3730 case CC_DNEmode:
3731 ne = AARCH64_NE;
3732 eq = AARCH64_EQ;
3733 break;
3735 case CC_DEQmode:
3736 ne = AARCH64_EQ;
3737 eq = AARCH64_NE;
3738 break;
3740 case CC_DGEmode:
3741 ne = AARCH64_GE;
3742 eq = AARCH64_LT;
3743 break;
3745 case CC_DLTmode:
3746 ne = AARCH64_LT;
3747 eq = AARCH64_GE;
3748 break;
3750 case CC_DGTmode:
3751 ne = AARCH64_GT;
3752 eq = AARCH64_LE;
3753 break;
3755 case CC_DLEmode:
3756 ne = AARCH64_LE;
3757 eq = AARCH64_GT;
3758 break;
3760 case CC_DGEUmode:
3761 ne = AARCH64_CS;
3762 eq = AARCH64_CC;
3763 break;
3765 case CC_DLTUmode:
3766 ne = AARCH64_CC;
3767 eq = AARCH64_CS;
3768 break;
3770 case CC_DGTUmode:
3771 ne = AARCH64_HI;
3772 eq = AARCH64_LS;
3773 break;
3775 case CC_DLEUmode:
3776 ne = AARCH64_LS;
3777 eq = AARCH64_HI;
3778 break;
3780 case CCmode:
3781 switch (comp_code)
3783 case NE: return AARCH64_NE;
3784 case EQ: return AARCH64_EQ;
3785 case GE: return AARCH64_GE;
3786 case GT: return AARCH64_GT;
3787 case LE: return AARCH64_LE;
3788 case LT: return AARCH64_LT;
3789 case GEU: return AARCH64_CS;
3790 case GTU: return AARCH64_HI;
3791 case LEU: return AARCH64_LS;
3792 case LTU: return AARCH64_CC;
3793 default: return -1;
3795 break;
3797 case CC_SWPmode:
3798 case CC_ZESWPmode:
3799 case CC_SESWPmode:
3800 switch (comp_code)
3802 case NE: return AARCH64_NE;
3803 case EQ: return AARCH64_EQ;
3804 case GE: return AARCH64_LE;
3805 case GT: return AARCH64_LT;
3806 case LE: return AARCH64_GE;
3807 case LT: return AARCH64_GT;
3808 case GEU: return AARCH64_LS;
3809 case GTU: return AARCH64_CC;
3810 case LEU: return AARCH64_CS;
3811 case LTU: return AARCH64_HI;
3812 default: return -1;
3814 break;
3816 case CC_NZmode:
3817 switch (comp_code)
3819 case NE: return AARCH64_NE;
3820 case EQ: return AARCH64_EQ;
3821 case GE: return AARCH64_PL;
3822 case LT: return AARCH64_MI;
3823 default: return -1;
3825 break;
3827 case CC_Zmode:
3828 switch (comp_code)
3830 case NE: return AARCH64_NE;
3831 case EQ: return AARCH64_EQ;
3832 default: return -1;
3834 break;
3836 default:
3837 return -1;
3838 break;
3841 if (comp_code == NE)
3842 return ne;
3844 if (comp_code == EQ)
3845 return eq;
3847 return -1;
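/* Illustrative note (added commentary, not part of the original
   source): the CC_SWPmode rows above encode the condition that must be
   used once the compare operands have been swapped.  The standalone
   table below simply restates that mapping.  */
#if 0
#include <stdio.h>

int
main (void)
{
  static const char *const swapped[][2] = {
    { "eq", "eq" }, { "ne", "ne" },
    { "ge", "le" }, { "gt", "lt" }, { "le", "ge" }, { "lt", "gt" },
    { "geu", "ls" }, { "gtu", "cc" }, { "leu", "cs" }, { "ltu", "hi" },
  };

  for (unsigned int i = 0; i < sizeof swapped / sizeof swapped[0]; i++)
    printf ("%-4s -> %s\n", swapped[i][0], swapped[i][1]);
  return 0;
}
#endif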
3850 bool
3851 aarch64_const_vec_all_same_in_range_p (rtx x,
3852 HOST_WIDE_INT minval,
3853 HOST_WIDE_INT maxval)
3855 HOST_WIDE_INT firstval;
3856 int count, i;
3858 if (GET_CODE (x) != CONST_VECTOR
3859 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3860 return false;
3862 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3863 if (firstval < minval || firstval > maxval)
3864 return false;
3866 count = CONST_VECTOR_NUNITS (x);
3867 for (i = 1; i < count; i++)
3868 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3869 return false;
3871 return true;
3874 bool
3875 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3877 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3880 static unsigned
3881 bit_count (unsigned HOST_WIDE_INT value)
3883 unsigned count = 0;
3885 while (value)
3887 count++;
3888 value &= value - 1;
3891 return count;
3894 /* N Z C V. */
3895 #define AARCH64_CC_V 1
3896 #define AARCH64_CC_C (1 << 1)
3897 #define AARCH64_CC_Z (1 << 2)
3898 #define AARCH64_CC_N (1 << 3)
3900 /* N Z C V flags for ccmp.  The first value is for the AND case and the
3901    other is for the IOR case.  Indexed by AARCH64_COND_CODE.  */
3902 static const int aarch64_nzcv_codes[][2] =
3904 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3905 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3906 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3907 {0, AARCH64_CC_C}, /* CC, C == 0. */
3908 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3909 {0, AARCH64_CC_N}, /* PL, N == 0. */
3910 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3911 {0, AARCH64_CC_V}, /* VC, V == 0. */
3912   {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0.  */
3913 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3914 {0, AARCH64_CC_V}, /* GE, N == V. */
3915 {AARCH64_CC_V, 0}, /* LT, N != V. */
3916 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3917 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3918 {0, 0}, /* AL, Any. */
3919 {0, 0}, /* NV, Any. */
3923 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3925 switch (mode)
3927 case CC_DNEmode:
3928 return NE;
3930 case CC_DEQmode:
3931 return EQ;
3933 case CC_DLEmode:
3934 return LE;
3936 case CC_DGTmode:
3937 return GT;
3939 case CC_DLTmode:
3940 return LT;
3942 case CC_DGEmode:
3943 return GE;
3945 case CC_DLEUmode:
3946 return LEU;
3948 case CC_DGTUmode:
3949 return GTU;
3951 case CC_DLTUmode:
3952 return LTU;
3954 case CC_DGEUmode:
3955 return GEU;
3957 default:
3958 gcc_unreachable ();
3963 void
3964 aarch64_print_operand (FILE *f, rtx x, char code)
3966 switch (code)
3968 /* An integer or symbol address without a preceding # sign. */
3969 case 'c':
3970 switch (GET_CODE (x))
3972 case CONST_INT:
3973 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3974 break;
3976 case SYMBOL_REF:
3977 output_addr_const (f, x);
3978 break;
3980 case CONST:
3981 if (GET_CODE (XEXP (x, 0)) == PLUS
3982 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3984 output_addr_const (f, x);
3985 break;
3987 /* Fall through. */
3989 default:
3990 output_operand_lossage ("Unsupported operand for code '%c'", code);
3992 break;
3994 case 'e':
3995 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3997 int n;
3999 if (!CONST_INT_P (x)
4000 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4002 output_operand_lossage ("invalid operand for '%%%c'", code);
4003 return;
4006 switch (n)
4008 case 3:
4009 fputc ('b', f);
4010 break;
4011 case 4:
4012 fputc ('h', f);
4013 break;
4014 case 5:
4015 fputc ('w', f);
4016 break;
4017 default:
4018 output_operand_lossage ("invalid operand for '%%%c'", code);
4019 return;
4022 break;
4024 case 'p':
4026 int n;
4028 /* Print N such that 2^N == X. */
4029 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4031 output_operand_lossage ("invalid operand for '%%%c'", code);
4032 return;
4035 asm_fprintf (f, "%d", n);
4037 break;
4039 case 'P':
4040 /* Print the number of non-zero bits in X (a const_int). */
4041 if (!CONST_INT_P (x))
4043 output_operand_lossage ("invalid operand for '%%%c'", code);
4044 return;
4047 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4048 break;
4050 case 'H':
4051 /* Print the higher numbered register of a pair (TImode) of regs. */
4052 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4054 output_operand_lossage ("invalid operand for '%%%c'", code);
4055 return;
4058 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4059 break;
4061 case 'm':
4063 int cond_code;
4064 /* Print a condition (eq, ne, etc). */
4066 /* CONST_TRUE_RTX means always -- that's the default. */
4067 if (x == const_true_rtx)
4068 return;
4070 if (!COMPARISON_P (x))
4072 output_operand_lossage ("invalid operand for '%%%c'", code);
4073 return;
4076 cond_code = aarch64_get_condition_code (x);
4077 gcc_assert (cond_code >= 0);
4078 fputs (aarch64_condition_codes[cond_code], f);
4080 break;
4082 case 'M':
4084 int cond_code;
4085 /* Print the inverse of a condition (eq <-> ne, etc). */
4087 /* CONST_TRUE_RTX means never -- that's the default. */
4088 if (x == const_true_rtx)
4090 fputs ("nv", f);
4091 return;
4094 if (!COMPARISON_P (x))
4096 output_operand_lossage ("invalid operand for '%%%c'", code);
4097 return;
4099 cond_code = aarch64_get_condition_code (x);
4100 gcc_assert (cond_code >= 0);
4101 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4102 (cond_code)], f);
4104 break;
4106 case 'b':
4107 case 'h':
4108 case 's':
4109 case 'd':
4110 case 'q':
4111 /* Print a scalar FP/SIMD register name. */
4112 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4114 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4115 return;
4117 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4118 break;
4120 case 'S':
4121 case 'T':
4122 case 'U':
4123 case 'V':
4124 /* Print the first FP/SIMD register name in a list. */
4125 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4127 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4128 return;
4130 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4131 break;
4133 case 'X':
4134 /* Print bottom 16 bits of integer constant in hex. */
4135 if (!CONST_INT_P (x))
4137 output_operand_lossage ("invalid operand for '%%%c'", code);
4138 return;
4140 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4141 break;
4143 case 'w':
4144 case 'x':
4145 /* Print a general register name or the zero register (32-bit or
4146 64-bit). */
4147 if (x == const0_rtx
4148 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4150 asm_fprintf (f, "%czr", code);
4151 break;
4154 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4156 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4157 break;
4160 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4162 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4163 break;
4166 /* Fall through */
4168 case 0:
4169       /* Print a normal operand.  If it's a general register, then we
4170 assume DImode. */
4171 if (x == NULL)
4173 output_operand_lossage ("missing operand");
4174 return;
4177 switch (GET_CODE (x))
4179 case REG:
4180 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4181 break;
4183 case MEM:
4184 aarch64_memory_reference_mode = GET_MODE (x);
4185 output_address (XEXP (x, 0));
4186 break;
4188 case LABEL_REF:
4189 case SYMBOL_REF:
4190 output_addr_const (asm_out_file, x);
4191 break;
4193 case CONST_INT:
4194 asm_fprintf (f, "%wd", INTVAL (x));
4195 break;
4197 case CONST_VECTOR:
4198 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4200 gcc_assert (
4201 aarch64_const_vec_all_same_in_range_p (x,
4202 HOST_WIDE_INT_MIN,
4203 HOST_WIDE_INT_MAX));
4204 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4206 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4208 fputc ('0', f);
4210 else
4211 gcc_unreachable ();
4212 break;
4214 case CONST_DOUBLE:
4215 /* CONST_DOUBLE can represent a double-width integer.
4216 In this case, the mode of x is VOIDmode. */
4217 if (GET_MODE (x) == VOIDmode)
4218 ; /* Do Nothing. */
4219 else if (aarch64_float_const_zero_rtx_p (x))
4221 fputc ('0', f);
4222 break;
4224 else if (aarch64_float_const_representable_p (x))
4226 #define buf_size 20
4227 char float_buf[buf_size] = {'\0'};
4228 REAL_VALUE_TYPE r;
4229 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4230 real_to_decimal_for_mode (float_buf, &r,
4231 buf_size, buf_size,
4232 1, GET_MODE (x));
4233 asm_fprintf (asm_out_file, "%s", float_buf);
4234 break;
4235 #undef buf_size
4237 output_operand_lossage ("invalid constant");
4238 return;
4239 default:
4240 output_operand_lossage ("invalid operand");
4241 return;
4243 break;
4245 case 'A':
4246 if (GET_CODE (x) == HIGH)
4247 x = XEXP (x, 0);
4249 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4251 case SYMBOL_SMALL_GOT:
4252 asm_fprintf (asm_out_file, ":got:");
4253 break;
4255 case SYMBOL_SMALL_TLSGD:
4256 asm_fprintf (asm_out_file, ":tlsgd:");
4257 break;
4259 case SYMBOL_SMALL_TLSDESC:
4260 asm_fprintf (asm_out_file, ":tlsdesc:");
4261 break;
4263 case SYMBOL_SMALL_GOTTPREL:
4264 asm_fprintf (asm_out_file, ":gottprel:");
4265 break;
4267 case SYMBOL_SMALL_TPREL:
4268 asm_fprintf (asm_out_file, ":tprel:");
4269 break;
4271 case SYMBOL_TINY_GOT:
4272 gcc_unreachable ();
4273 break;
4275 default:
4276 break;
4278 output_addr_const (asm_out_file, x);
4279 break;
4281 case 'L':
4282 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4284 case SYMBOL_SMALL_GOT:
4285 asm_fprintf (asm_out_file, ":lo12:");
4286 break;
4288 case SYMBOL_SMALL_TLSGD:
4289 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4290 break;
4292 case SYMBOL_SMALL_TLSDESC:
4293 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4294 break;
4296 case SYMBOL_SMALL_GOTTPREL:
4297 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4298 break;
4300 case SYMBOL_SMALL_TPREL:
4301 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4302 break;
4304 case SYMBOL_TINY_GOT:
4305 asm_fprintf (asm_out_file, ":got:");
4306 break;
4308 default:
4309 break;
4311 output_addr_const (asm_out_file, x);
4312 break;
4314 case 'G':
4316 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4318 case SYMBOL_SMALL_TPREL:
4319 asm_fprintf (asm_out_file, ":tprel_hi12:");
4320 break;
4321 default:
4322 break;
4324 output_addr_const (asm_out_file, x);
4325 break;
4327 case 'K':
4329 int cond_code;
4330 /* Print nzcv. */
4332 if (!COMPARISON_P (x))
4334 output_operand_lossage ("invalid operand for '%%%c'", code);
4335 return;
4338 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4339 gcc_assert (cond_code >= 0);
4340 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4342 break;
4344 case 'k':
4346 int cond_code;
4347 /* Print nzcv. */
4349 if (!COMPARISON_P (x))
4351 output_operand_lossage ("invalid operand for '%%%c'", code);
4352 return;
4355 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4356 gcc_assert (cond_code >= 0);
4357 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4359 break;
4361 default:
4362 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4363 return;
4367 void
4368 aarch64_print_operand_address (FILE *f, rtx x)
4370 struct aarch64_address_info addr;
4372 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4373 MEM, true))
4374 switch (addr.type)
4376 case ADDRESS_REG_IMM:
4377 if (addr.offset == const0_rtx)
4378 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4379 else
4380 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4381 INTVAL (addr.offset));
4382 return;
4384 case ADDRESS_REG_REG:
4385 if (addr.shift == 0)
4386 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4387 reg_names [REGNO (addr.offset)]);
4388 else
4389 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4390 reg_names [REGNO (addr.offset)], addr.shift);
4391 return;
4393 case ADDRESS_REG_UXTW:
4394 if (addr.shift == 0)
4395 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4396 REGNO (addr.offset) - R0_REGNUM);
4397 else
4398 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4399 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4400 return;
4402 case ADDRESS_REG_SXTW:
4403 if (addr.shift == 0)
4404 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4405 REGNO (addr.offset) - R0_REGNUM);
4406 else
4407 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4408 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4409 return;
4411 case ADDRESS_REG_WB:
4412 switch (GET_CODE (x))
4414 case PRE_INC:
4415 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4416 GET_MODE_SIZE (aarch64_memory_reference_mode));
4417 return;
4418 case POST_INC:
4419 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4420 GET_MODE_SIZE (aarch64_memory_reference_mode));
4421 return;
4422 case PRE_DEC:
4423 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4424 GET_MODE_SIZE (aarch64_memory_reference_mode));
4425 return;
4426 case POST_DEC:
4427 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4428 GET_MODE_SIZE (aarch64_memory_reference_mode));
4429 return;
4430 case PRE_MODIFY:
4431 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4432 INTVAL (addr.offset));
4433 return;
4434 case POST_MODIFY:
4435 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4436 INTVAL (addr.offset));
4437 return;
4438 default:
4439 break;
4441 break;
4443 case ADDRESS_LO_SUM:
4444 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4445 output_addr_const (f, addr.offset);
4446 asm_fprintf (f, "]");
4447 return;
4449 case ADDRESS_SYMBOLIC:
4450 break;
4453 output_addr_const (f, x);
4456 bool
4457 aarch64_label_mentioned_p (rtx x)
4459 const char *fmt;
4460 int i;
4462 if (GET_CODE (x) == LABEL_REF)
4463 return true;
4465 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4466 referencing instruction, but they are constant offsets, not
4467 symbols. */
4468 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4469 return false;
4471 fmt = GET_RTX_FORMAT (GET_CODE (x));
4472 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4474 if (fmt[i] == 'E')
4476 int j;
4478 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4479 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4480 return true;
4482 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4483 return true;
4486 return false;
4489 /* Implement REGNO_REG_CLASS. */
4491 enum reg_class
4492 aarch64_regno_regclass (unsigned regno)
4494 if (GP_REGNUM_P (regno))
4495 return GENERAL_REGS;
4497 if (regno == SP_REGNUM)
4498 return STACK_REG;
4500 if (regno == FRAME_POINTER_REGNUM
4501 || regno == ARG_POINTER_REGNUM)
4502 return POINTER_REGS;
4504 if (FP_REGNUM_P (regno))
4505 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4507 return NO_REGS;
4510 static rtx
4511 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4513 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4514 where mask is selected by alignment and size of the offset.
4515 We try to pick as large a range for the offset as possible to
4516 maximize the chance of a CSE. However, for aligned addresses
4517 we limit the range to 4k so that structures with different sized
4518 elements are likely to use the same base. */
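/* Illustrative example (values chosen for this note, not taken from the
   original sources): for an SImode access at X + 0x13450 the offset is a
   multiple of the access size, so base_offset = 0x13450 & ~0xfff = 0x13000;
   we emit Y = X + 0x13000 and rewrite the address as (plus Y 0x450), so
   that neighbouring accesses can CSE the add. */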
4520 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4522 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4523 HOST_WIDE_INT base_offset;
4525 /* Does it look like we'll need a load/store-pair operation? */
4526 if (GET_MODE_SIZE (mode) > 16
4527 || mode == TImode)
4528 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4529 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4530 /* For offsets that aren't a multiple of the access size, the limit is
4531 -256...255. */
4532 else if (offset & (GET_MODE_SIZE (mode) - 1))
4533 base_offset = (offset + 0x100) & ~0x1ff;
4534 else
4535 base_offset = offset & ~0xfff;
4537 if (base_offset == 0)
4538 return x;
4540 offset -= base_offset;
4541 rtx base_reg = gen_reg_rtx (Pmode);
4542 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4543 NULL_RTX);
4544 emit_move_insn (base_reg, val);
4545 x = plus_constant (Pmode, base_reg, offset);
4548 return x;
4551 /* Try a machine-dependent way of reloading an illegitimate address
4552 operand. If we find one, push the reload and return the new rtx. */
4555 aarch64_legitimize_reload_address (rtx *x_p,
4556 machine_mode mode,
4557 int opnum, int type,
4558 int ind_levels ATTRIBUTE_UNUSED)
4560 rtx x = *x_p;
4562 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4563 if (aarch64_vect_struct_mode_p (mode)
4564 && GET_CODE (x) == PLUS
4565 && REG_P (XEXP (x, 0))
4566 && CONST_INT_P (XEXP (x, 1)))
4568 rtx orig_rtx = x;
4569 x = copy_rtx (x);
4570 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4571 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4572 opnum, (enum reload_type) type);
4573 return x;
4576 /* We must recognize output that we have already generated ourselves. */
4577 if (GET_CODE (x) == PLUS
4578 && GET_CODE (XEXP (x, 0)) == PLUS
4579 && REG_P (XEXP (XEXP (x, 0), 0))
4580 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4581 && CONST_INT_P (XEXP (x, 1)))
4583 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4584 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4585 opnum, (enum reload_type) type);
4586 return x;
4589 /* We wish to handle large displacements off a base register by splitting
4590 the addend across an add and the mem insn. This can cut the number of
4591 extra insns needed from 3 to 1. It is only useful for load/store of a
4592 single register with 12 bit offset field. */
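/* Illustrative example (not from the original sources): a DImode load at
   base + 0x13008 splits into high = 0x13000 and low = 0x8, so the reload
   becomes "add xN, base, #0x13, lsl #12" with the memory reference using
   [xN, #8], instead of materializing the whole constant separately. */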
4593 if (GET_CODE (x) == PLUS
4594 && REG_P (XEXP (x, 0))
4595 && CONST_INT_P (XEXP (x, 1))
4596 && HARD_REGISTER_P (XEXP (x, 0))
4597 && mode != TImode
4598 && mode != TFmode
4599 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4601 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT low = val & 0xfff;
4603 HOST_WIDE_INT high = val - low;
4604 HOST_WIDE_INT offs;
4605 rtx cst;
4606 machine_mode xmode = GET_MODE (x);
4608 /* In ILP32, xmode can be either DImode or SImode. */
4609 gcc_assert (xmode == DImode || xmode == SImode);
4611 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4612 BLKmode alignment. */
4613 if (GET_MODE_SIZE (mode) == 0)
4614 return NULL_RTX;
4616 offs = low % GET_MODE_SIZE (mode);
4618 /* Align misaligned offset by adjusting high part to compensate. */
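/* Worked example of this adjustment (illustrative values): for an SImode
   access with val == 0x13002 we get low == 0x2 and offs == 2; since
   0x13000 + 2 is not a valid shifted 12-bit immediate we align up, giving
   low == 4 and high == 0x12ffe, so high + low == val still holds. */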
4619 if (offs != 0)
4621 if (aarch64_uimm12_shift (high + offs))
4623 /* Align down. */
4624 low = low - offs;
4625 high = high + offs;
4627 else
4629 /* Align up. */
4630 offs = GET_MODE_SIZE (mode) - offs;
4631 low = low + offs;
4632 high = high + (low & 0x1000) - offs;
4633 low &= 0xfff;
4637 /* Check for overflow. */
4638 if (high + low != val)
4639 return NULL_RTX;
4641 cst = GEN_INT (high);
4642 if (!aarch64_uimm12_shift (high))
4643 cst = force_const_mem (xmode, cst);
4645 /* Reload high part into base reg, leaving the low part
4646 in the mem instruction.
4647 Note that replacing this gen_rtx_PLUS with plus_constant is
4648 wrong in this case because we rely on the
4649 (plus (plus reg c1) c2) structure being preserved so that
4650 XEXP (*p, 0) in push_reload below uses the correct term. */
4651 x = gen_rtx_PLUS (xmode,
4652 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4653 GEN_INT (low));
4655 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4656 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4657 opnum, (enum reload_type) type);
4658 return x;
4661 return NULL_RTX;
4665 static reg_class_t
4666 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4667 reg_class_t rclass,
4668 machine_mode mode,
4669 secondary_reload_info *sri)
4671 /* Without the TARGET_SIMD instructions we cannot move a Q register
4672 to a Q register directly. We need a scratch. */
4673 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4674 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4675 && reg_class_subset_p (rclass, FP_REGS))
4677 if (mode == TFmode)
4678 sri->icode = CODE_FOR_aarch64_reload_movtf;
4679 else if (mode == TImode)
4680 sri->icode = CODE_FOR_aarch64_reload_movti;
4681 return NO_REGS;
4684 /* A TFmode or TImode memory access should be handled via FP_REGS
4685 because AArch64 has richer addressing modes for LDR/STR instructions
4686 than for LDP/STP instructions. */
4687 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4688 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4689 return FP_REGS;
4691 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4692 return GENERAL_REGS;
4694 return NO_REGS;
4697 static bool
4698 aarch64_can_eliminate (const int from, const int to)
4700 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4701 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4703 if (frame_pointer_needed)
4705 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4706 return true;
4707 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4708 return false;
4709 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4710 && !cfun->calls_alloca)
4711 return true;
4712 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4713 return true;
4715 return false;
4717 else
4719 /* If we decided that we didn't need a leaf frame pointer but then used
4720 LR in the function, then we'll want a frame pointer after all, so
4721 prevent this elimination to ensure a frame pointer is used. */
4722 if (to == STACK_POINTER_REGNUM
4723 && flag_omit_leaf_frame_pointer
4724 && df_regs_ever_live_p (LR_REGNUM))
4725 return false;
4728 return true;
4731 HOST_WIDE_INT
4732 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4734 aarch64_layout_frame ();
4736 if (to == HARD_FRAME_POINTER_REGNUM)
4738 if (from == ARG_POINTER_REGNUM)
4739 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4741 if (from == FRAME_POINTER_REGNUM)
4742 return (cfun->machine->frame.hard_fp_offset
4743 - cfun->machine->frame.saved_varargs_size);
4746 if (to == STACK_POINTER_REGNUM)
4748 if (from == FRAME_POINTER_REGNUM)
4749 return (cfun->machine->frame.frame_size
4750 - cfun->machine->frame.saved_varargs_size);
4753 return cfun->machine->frame.frame_size;
4756 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4757 previous frame. */
4760 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4762 if (count != 0)
4763 return const0_rtx;
4764 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4768 static void
4769 aarch64_asm_trampoline_template (FILE *f)
4771 if (TARGET_ILP32)
4773 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4774 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4776 else
4778 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4779 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4781 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4782 assemble_aligned_integer (4, const0_rtx);
4783 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4784 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4787 static void
4788 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4790 rtx fnaddr, mem, a_tramp;
4791 const int tramp_code_sz = 16;
4793 /* Don't need to copy the trailing D-words; we fill those in below. */
4794 emit_block_move (m_tramp, assemble_trampoline_template (),
4795 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4796 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4797 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4798 if (GET_MODE (fnaddr) != ptr_mode)
4799 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4800 emit_move_insn (mem, fnaddr);
4802 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4803 emit_move_insn (mem, chain_value);
4805 /* XXX We should really define a "clear_cache" pattern and use
4806 gen_clear_cache(). */
4807 a_tramp = XEXP (m_tramp, 0);
4808 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4809 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4810 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4811 ptr_mode);
4814 static unsigned char
4815 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4817 switch (regclass)
4819 case CALLER_SAVE_REGS:
4820 case POINTER_REGS:
4821 case GENERAL_REGS:
4822 case ALL_REGS:
4823 case FP_REGS:
4824 case FP_LO_REGS:
4825 return
4826 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4827 (GET_MODE_SIZE (mode) + 7) / 8;
4828 case STACK_REG:
4829 return 1;
4831 case NO_REGS:
4832 return 0;
4834 default:
4835 break;
4837 gcc_unreachable ();
4840 static reg_class_t
4841 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4843 if (regclass == POINTER_REGS)
4844 return GENERAL_REGS;
4846 if (regclass == STACK_REG)
4848 if (REG_P(x)
4849 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4850 return regclass;
4852 return NO_REGS;
4855 /* If it's an integer immediate that MOVI can't handle, then
4856 FP_REGS is not an option, so we return NO_REGS instead. */
4857 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4858 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4859 return NO_REGS;
4861 /* Register elimination can result in a request for
4862 SP+constant->FP_REGS. We cannot support such operations, which
4863 use SP as source and an FP_REG as destination, so reject them
4864 right now. */
4865 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4867 rtx lhs = XEXP (x, 0);
4869 /* Look through a possible SUBREG introduced by ILP32. */
4870 if (GET_CODE (lhs) == SUBREG)
4871 lhs = SUBREG_REG (lhs);
4873 gcc_assert (REG_P (lhs));
4874 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4875 POINTER_REGS));
4876 return NO_REGS;
4879 return regclass;
4882 void
4883 aarch64_asm_output_labelref (FILE* f, const char *name)
4885 asm_fprintf (f, "%U%s", name);
4888 static void
4889 aarch64_elf_asm_constructor (rtx symbol, int priority)
4891 if (priority == DEFAULT_INIT_PRIORITY)
4892 default_ctor_section_asm_out_constructor (symbol, priority);
4893 else
4895 section *s;
4896 char buf[18];
4897 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4898 s = get_section (buf, SECTION_WRITE, NULL);
4899 switch_to_section (s);
4900 assemble_align (POINTER_SIZE);
4901 assemble_aligned_integer (POINTER_BYTES, symbol);
4905 static void
4906 aarch64_elf_asm_destructor (rtx symbol, int priority)
4908 if (priority == DEFAULT_INIT_PRIORITY)
4909 default_dtor_section_asm_out_destructor (symbol, priority);
4910 else
4912 section *s;
4913 char buf[18];
4914 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4915 s = get_section (buf, SECTION_WRITE, NULL);
4916 switch_to_section (s);
4917 assemble_align (POINTER_SIZE);
4918 assemble_aligned_integer (POINTER_BYTES, symbol);
4922 const char*
4923 aarch64_output_casesi (rtx *operands)
4925 char buf[100];
4926 char label[100];
4927 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
4928 int index;
4929 static const char *const patterns[4][2] =
4932 "ldrb\t%w3, [%0,%w1,uxtw]",
4933 "add\t%3, %4, %w3, sxtb #2"
4936 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4937 "add\t%3, %4, %w3, sxth #2"
4940 "ldr\t%w3, [%0,%w1,uxtw #2]",
4941 "add\t%3, %4, %w3, sxtw #2"
4943 /* We assume that DImode is only generated when not optimizing and
4944 that we don't really need 64-bit address offsets. That would
4945 imply an object file with 8GB of code in a single function! */
4947 "ldr\t%w3, [%0,%w1,uxtw #2]",
4948 "add\t%3, %4, %w3, sxtw #2"
4952 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4954 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4956 gcc_assert (index >= 0 && index <= 3);
4958 /* Need to implement table size reduction, by changing the code below. */
4959 output_asm_insn (patterns[index][0], operands);
4960 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4961 snprintf (buf, sizeof (buf),
4962 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4963 output_asm_insn (buf, operands);
4964 output_asm_insn (patterns[index][1], operands);
4965 output_asm_insn ("br\t%3", operands);
4966 assemble_label (asm_out_file, label);
4967 return "";
4971 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4972 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4973 operator. */
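/* For example, shift == 1 with mask == 0x1fe describes a byte operand
   scaled by 2 and so returns 8 (UXTB), while shift == 2 with
   mask == 0x3fffc returns 16 (UXTH).  Combinations that do not match a
   UXTB/UXTH/UXTW pattern return 0. */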
4976 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4978 if (shift >= 0 && shift <= 3)
4980 int size;
4981 for (size = 8; size <= 32; size *= 2)
4983 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4984 if (mask == bits << shift)
4985 return size;
4988 return 0;
4991 static bool
4992 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
4993 const_rtx x ATTRIBUTE_UNUSED)
4995 /* We can't use blocks for constants when we're using a per-function
4996 constant pool. */
4997 return false;
5000 static section *
5001 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5002 rtx x ATTRIBUTE_UNUSED,
5003 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5005 /* Force all constant pool entries into the current function section. */
5006 return function_section (current_function_decl);
5010 /* Costs. */
5012 /* Helper function for rtx cost calculation. Strip a shift expression
5013 from X. Returns the inner operand if successful, or the original
5014 expression on failure. */
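/* For example (illustrative), both (ashift (reg) (const_int 3)) and
   (mult (reg) (const_int 8)) strip to (reg), since a multiply by a power
   of two is printed as a shift. */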
5015 static rtx
5016 aarch64_strip_shift (rtx x)
5018 rtx op = x;
5020 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5021 we can convert both to ROR during final output. */
5022 if ((GET_CODE (op) == ASHIFT
5023 || GET_CODE (op) == ASHIFTRT
5024 || GET_CODE (op) == LSHIFTRT
5025 || GET_CODE (op) == ROTATERT
5026 || GET_CODE (op) == ROTATE)
5027 && CONST_INT_P (XEXP (op, 1)))
5028 return XEXP (op, 0);
5030 if (GET_CODE (op) == MULT
5031 && CONST_INT_P (XEXP (op, 1))
5032 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5033 return XEXP (op, 0);
5035 return x;
5038 /* Helper function for rtx cost calculation. Strip an extend
5039 expression from X. Returns the inner operand if successful, or the
5040 original expression on failure. We deal with a number of possible
5041 canonicalization variations here. */
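/* For example (illustrative), (and (mult (reg) (const_int 4))
   (const_int 0x3fc)) -- a zero-extended byte scaled by 4 -- strips to
   (reg), as does (ashift (zero_extend (reg)) (const_int 2)). */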
5042 static rtx
5043 aarch64_strip_extend (rtx x)
5045 rtx op = x;
5047 /* Zero and sign extraction of a widened value. */
5048 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5049 && XEXP (op, 2) == const0_rtx
5050 && GET_CODE (XEXP (op, 0)) == MULT
5051 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5052 XEXP (op, 1)))
5053 return XEXP (XEXP (op, 0), 0);
5055 /* It can also be represented (for zero-extend) as an AND with an
5056 immediate. */
5057 if (GET_CODE (op) == AND
5058 && GET_CODE (XEXP (op, 0)) == MULT
5059 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5060 && CONST_INT_P (XEXP (op, 1))
5061 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5062 INTVAL (XEXP (op, 1))) != 0)
5063 return XEXP (XEXP (op, 0), 0);
5065 /* Now handle extended register, as this may also have an optional
5066 left shift by 1..4. */
5067 if (GET_CODE (op) == ASHIFT
5068 && CONST_INT_P (XEXP (op, 1))
5069 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5070 op = XEXP (op, 0);
5072 if (GET_CODE (op) == ZERO_EXTEND
5073 || GET_CODE (op) == SIGN_EXTEND)
5074 op = XEXP (op, 0);
5076 if (op != x)
5077 return op;
5079 return x;
5082 /* Helper function for rtx cost calculation. Calculate the cost of
5083 a MULT, which may be part of a multiply-accumulate rtx. Return
5084 the calculated cost of the expression, recursing manually in to
5085 operands where needed. */
5087 static int
5088 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5090 rtx op0, op1;
5091 const struct cpu_cost_table *extra_cost
5092 = aarch64_tune_params->insn_extra_cost;
5093 int cost = 0;
5094 bool maybe_fma = (outer == PLUS || outer == MINUS);
5095 machine_mode mode = GET_MODE (x);
5097 gcc_checking_assert (code == MULT);
5099 op0 = XEXP (x, 0);
5100 op1 = XEXP (x, 1);
5102 if (VECTOR_MODE_P (mode))
5103 mode = GET_MODE_INNER (mode);
5105 /* Integer multiply/fma. */
5106 if (GET_MODE_CLASS (mode) == MODE_INT)
5108 /* The multiply will be canonicalized as a shift, cost it as such. */
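/* For example, a multiply by 8 is emitted as "lsl x0, x1, #3", or is
   folded into "add x0, x2, x1, lsl #3" when it feeds an addition
   (operands illustrative). */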
5109 if (CONST_INT_P (op1)
5110 && exact_log2 (INTVAL (op1)) > 0)
5112 if (speed)
5114 if (maybe_fma)
5115 /* ADD (shifted register). */
5116 cost += extra_cost->alu.arith_shift;
5117 else
5118 /* LSL (immediate). */
5119 cost += extra_cost->alu.shift;
5122 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5124 return cost;
5127 /* Integer multiplies or FMAs have zero/sign extending variants. */
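/* For example (illustrative), (mult:DI (sign_extend:DI (reg:SI))
   (sign_extend:DI (reg:SI))) maps onto a single SMULL, or SMADDL when it
   feeds an accumulate. */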
5128 if ((GET_CODE (op0) == ZERO_EXTEND
5129 && GET_CODE (op1) == ZERO_EXTEND)
5130 || (GET_CODE (op0) == SIGN_EXTEND
5131 && GET_CODE (op1) == SIGN_EXTEND))
5133 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5134 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5136 if (speed)
5138 if (maybe_fma)
5139 /* MADD/SMADDL/UMADDL. */
5140 cost += extra_cost->mult[0].extend_add;
5141 else
5142 /* MUL/SMULL/UMULL. */
5143 cost += extra_cost->mult[0].extend;
5146 return cost;
5149 /* This is either an integer multiply or an FMA. In both cases
5150 we want to recurse and cost the operands. */
5151 cost += rtx_cost (op0, MULT, 0, speed)
5152 + rtx_cost (op1, MULT, 1, speed);
5154 if (speed)
5156 if (maybe_fma)
5157 /* MADD. */
5158 cost += extra_cost->mult[mode == DImode].add;
5159 else
5160 /* MUL. */
5161 cost += extra_cost->mult[mode == DImode].simple;
5164 return cost;
5166 else
5168 if (speed)
5170 /* Floating-point FMA/FMUL can also support negations of the
5171 operands. */
5172 if (GET_CODE (op0) == NEG)
5173 op0 = XEXP (op0, 0);
5174 if (GET_CODE (op1) == NEG)
5175 op1 = XEXP (op1, 0);
5177 if (maybe_fma)
5178 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5179 cost += extra_cost->fp[mode == DFmode].fma;
5180 else
5181 /* FMUL/FNMUL. */
5182 cost += extra_cost->fp[mode == DFmode].mult;
5185 cost += rtx_cost (op0, MULT, 0, speed)
5186 + rtx_cost (op1, MULT, 1, speed);
5187 return cost;
5191 static int
5192 aarch64_address_cost (rtx x,
5193 machine_mode mode,
5194 addr_space_t as ATTRIBUTE_UNUSED,
5195 bool speed)
5197 enum rtx_code c = GET_CODE (x);
5198 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5199 struct aarch64_address_info info;
5200 int cost = 0;
5201 info.shift = 0;
5203 if (!aarch64_classify_address (&info, x, mode, c, false))
5205 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5207 /* This is a CONST or SYMBOL ref which will be split
5208 in a different way depending on the code model in use.
5209 Cost it through the generic infrastructure. */
5210 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5211 /* Divide through by the cost of one instruction to
5212 bring it to the same units as the address costs. */
5213 cost_symbol_ref /= COSTS_N_INSNS (1);
5214 /* The cost is then the cost of preparing the address,
5215 followed by an immediate (possibly 0) offset. */
5216 return cost_symbol_ref + addr_cost->imm_offset;
5218 else
5220 /* This is most likely a jump table from a case
5221 statement. */
5222 return addr_cost->register_offset;
5226 switch (info.type)
5228 case ADDRESS_LO_SUM:
5229 case ADDRESS_SYMBOLIC:
5230 case ADDRESS_REG_IMM:
5231 cost += addr_cost->imm_offset;
5232 break;
5234 case ADDRESS_REG_WB:
5235 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5236 cost += addr_cost->pre_modify;
5237 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5238 cost += addr_cost->post_modify;
5239 else
5240 gcc_unreachable ();
5242 break;
5244 case ADDRESS_REG_REG:
5245 cost += addr_cost->register_offset;
5246 break;
5248 case ADDRESS_REG_UXTW:
5249 case ADDRESS_REG_SXTW:
5250 cost += addr_cost->register_extend;
5251 break;
5253 default:
5254 gcc_unreachable ();
5258 if (info.shift > 0)
5260 /* For the sake of calculating the cost of the shifted register
5261 component, we can treat same sized modes in the same way. */
5262 switch (GET_MODE_BITSIZE (mode))
5264 case 16:
5265 cost += addr_cost->addr_scale_costs.hi;
5266 break;
5268 case 32:
5269 cost += addr_cost->addr_scale_costs.si;
5270 break;
5272 case 64:
5273 cost += addr_cost->addr_scale_costs.di;
5274 break;
5276 /* We can't tell, or this is a 128-bit vector. */
5277 default:
5278 cost += addr_cost->addr_scale_costs.ti;
5279 break;
5283 return cost;
5286 /* Return true if the RTX X in mode MODE is a zero or sign extract
5287 usable in an ADD or SUB (extended register) instruction. */
5288 static bool
5289 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5291 /* Catch add with a sign extract.
5292 This is add_<optab><mode>_multp2. */
5293 if (GET_CODE (x) == SIGN_EXTRACT
5294 || GET_CODE (x) == ZERO_EXTRACT)
5296 rtx op0 = XEXP (x, 0);
5297 rtx op1 = XEXP (x, 1);
5298 rtx op2 = XEXP (x, 2);
5300 if (GET_CODE (op0) == MULT
5301 && CONST_INT_P (op1)
5302 && op2 == const0_rtx
5303 && CONST_INT_P (XEXP (op0, 1))
5304 && aarch64_is_extend_from_extract (mode,
5305 XEXP (op0, 1),
5306 op1))
5308 return true;
5312 return false;
5315 static bool
5316 aarch64_frint_unspec_p (unsigned int u)
5318 switch (u)
5320 case UNSPEC_FRINTZ:
5321 case UNSPEC_FRINTP:
5322 case UNSPEC_FRINTM:
5323 case UNSPEC_FRINTA:
5324 case UNSPEC_FRINTN:
5325 case UNSPEC_FRINTX:
5326 case UNSPEC_FRINTI:
5327 return true;
5329 default:
5330 return false;
5334 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5335 storing it in *COST. Result is true if the total cost of the operation
5336 has now been calculated. */
5337 static bool
5338 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5340 rtx inner;
5341 rtx comparator;
5342 enum rtx_code cmpcode;
5344 if (COMPARISON_P (op0))
5346 inner = XEXP (op0, 0);
5347 comparator = XEXP (op0, 1);
5348 cmpcode = GET_CODE (op0);
5350 else
5352 inner = op0;
5353 comparator = const0_rtx;
5354 cmpcode = NE;
5357 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5359 /* Conditional branch. */
5360 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5361 return true;
5362 else
5364 if (cmpcode == NE || cmpcode == EQ)
5366 if (comparator == const0_rtx)
5368 /* TBZ/TBNZ/CBZ/CBNZ. */
5369 if (GET_CODE (inner) == ZERO_EXTRACT)
5370 /* TBZ/TBNZ. */
5371 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5372 0, speed);
5373 else
5374 /* CBZ/CBNZ. */
5375 *cost += rtx_cost (inner, cmpcode, 0, speed);
5377 return true;
5380 else if (cmpcode == LT || cmpcode == GE)
5382 /* TBZ/TBNZ. */
5383 if (comparator == const0_rtx)
5384 return true;
5388 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5390 /* It's a conditional operation based on the status flags,
5391 so it must be some flavor of CSEL. */
5393 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5394 if (GET_CODE (op1) == NEG
5395 || GET_CODE (op1) == NOT
5396 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5397 op1 = XEXP (op1, 0);
5399 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5400 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5401 return true;
5404 /* We don't know what this is, cost all operands. */
5405 return false;
5408 /* Calculate the cost of calculating X, storing it in *COST. Result
5409 is true if the total cost of the operation has now been calculated. */
5410 static bool
5411 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5412 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5414 rtx op0, op1, op2;
5415 const struct cpu_cost_table *extra_cost
5416 = aarch64_tune_params->insn_extra_cost;
5417 machine_mode mode = GET_MODE (x);
5419 /* By default, assume that everything has equivalent cost to the
5420 cheapest instruction. Any additional costs are applied as a delta
5421 above this default. */
5422 *cost = COSTS_N_INSNS (1);
5424 /* TODO: The cost infrastructure currently does not handle
5425 vector operations. Assume that all vector operations
5426 are equally expensive. */
5427 if (VECTOR_MODE_P (mode))
5429 if (speed)
5430 *cost += extra_cost->vect.alu;
5431 return true;
5434 switch (code)
5436 case SET:
5437 /* The cost depends entirely on the operands to SET. */
5438 *cost = 0;
5439 op0 = SET_DEST (x);
5440 op1 = SET_SRC (x);
5442 switch (GET_CODE (op0))
5444 case MEM:
5445 if (speed)
5447 rtx address = XEXP (op0, 0);
5448 if (GET_MODE_CLASS (mode) == MODE_INT)
5449 *cost += extra_cost->ldst.store;
5450 else if (mode == SFmode)
5451 *cost += extra_cost->ldst.storef;
5452 else if (mode == DFmode)
5453 *cost += extra_cost->ldst.stored;
5455 *cost +=
5456 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5457 0, speed));
5460 *cost += rtx_cost (op1, SET, 1, speed);
5461 return true;
5463 case SUBREG:
5464 if (! REG_P (SUBREG_REG (op0)))
5465 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5467 /* Fall through. */
5468 case REG:
5469 /* const0_rtx is in general free, but we will use an
5470 instruction to set a register to 0. */
5471 if (REG_P (op1) || op1 == const0_rtx)
5473 /* The cost is 1 per register copied. */
5474 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5475 / UNITS_PER_WORD;
5476 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5478 else
5479 /* Cost is just the cost of the RHS of the set. */
5480 *cost += rtx_cost (op1, SET, 1, speed);
5481 return true;
5483 case ZERO_EXTRACT:
5484 case SIGN_EXTRACT:
5485 /* Bit-field insertion. Strip any redundant widening of
5486 the RHS to meet the width of the target. */
5487 if (GET_CODE (op1) == SUBREG)
5488 op1 = SUBREG_REG (op1);
5489 if ((GET_CODE (op1) == ZERO_EXTEND
5490 || GET_CODE (op1) == SIGN_EXTEND)
5491 && CONST_INT_P (XEXP (op0, 1))
5492 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5493 >= INTVAL (XEXP (op0, 1))))
5494 op1 = XEXP (op1, 0);
5496 if (CONST_INT_P (op1))
5498 /* MOV immediate is assumed to always be cheap. */
5499 *cost = COSTS_N_INSNS (1);
5501 else
5503 /* BFM. */
5504 if (speed)
5505 *cost += extra_cost->alu.bfi;
5506 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5509 return true;
5511 default:
5512 /* We can't make sense of this, assume default cost. */
5513 *cost = COSTS_N_INSNS (1);
5514 return false;
5516 return false;
5518 case CONST_INT:
5519 /* If an instruction can incorporate a constant within the
5520 instruction, the instruction's expression avoids calling
5521 rtx_cost() on the constant. If rtx_cost() is called on a
5522 constant, then it is usually because the constant must be
5523 moved into a register by one or more instructions.
5525 The exception is constant 0, which can be expressed
5526 as XZR/WZR and is therefore free. The exception to this is
5527 if we have (set (reg) (const0_rtx)) in which case we must cost
5528 the move. However, we can catch that when we cost the SET, so
5529 we don't need to consider that here. */
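/* As a rough illustration, a full 64-bit constant such as
   0x1234567812345678 is built with one MOVZ and three MOVKs, so it is
   costed as four instructions. */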
5530 if (x == const0_rtx)
5531 *cost = 0;
5532 else
5534 /* To an approximation, the cost of building any other constant is
5535 proportional to the number of instructions required to build
5536 that constant. This is true whether we are compiling for SPEED
5537 or otherwise. */
5538 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5539 (NULL_RTX, x, false, mode));
5541 return true;
5543 case CONST_DOUBLE:
5544 if (speed)
5546 /* mov[df,sf]_aarch64. */
5547 if (aarch64_float_const_representable_p (x))
5548 /* FMOV (scalar immediate). */
5549 *cost += extra_cost->fp[mode == DFmode].fpconst;
5550 else if (!aarch64_float_const_zero_rtx_p (x))
5552 /* This will be a load from memory. */
5553 if (mode == DFmode)
5554 *cost += extra_cost->ldst.loadd;
5555 else
5556 *cost += extra_cost->ldst.loadf;
5558 else
5559 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5560 or MOV v0.s[0], wzr - neither of which is modeled by the
5561 cost tables. Just use the default cost. */
5566 return true;
5568 case MEM:
5569 if (speed)
5571 /* For loads we want the base cost of a load, plus an
5572 approximation for the additional cost of the addressing
5573 mode. */
5574 rtx address = XEXP (x, 0);
5575 if (GET_MODE_CLASS (mode) == MODE_INT)
5576 *cost += extra_cost->ldst.load;
5577 else if (mode == SFmode)
5578 *cost += extra_cost->ldst.loadf;
5579 else if (mode == DFmode)
5580 *cost += extra_cost->ldst.loadd;
5582 *cost +=
5583 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5584 0, speed));
5587 return true;
5589 case NEG:
5590 op0 = XEXP (x, 0);
5592 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5594 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5595 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5597 /* CSETM. */
5598 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5599 return true;
5602 /* Cost this as SUB wzr, X. */
5603 op0 = CONST0_RTX (GET_MODE (x));
5604 op1 = XEXP (x, 0);
5605 goto cost_minus;
5608 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5610 /* Support (neg(fma...)) as a single instruction only if
5611 sign of zeros is unimportant. This matches the decision
5612 making in aarch64.md. */
5613 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5615 /* FNMADD. */
5616 *cost = rtx_cost (op0, NEG, 0, speed);
5617 return true;
5619 if (speed)
5620 /* FNEG. */
5621 *cost += extra_cost->fp[mode == DFmode].neg;
5622 return false;
5625 return false;
5627 case CLRSB:
5628 case CLZ:
5629 if (speed)
5630 *cost += extra_cost->alu.clz;
5632 return false;
5634 case COMPARE:
5635 op0 = XEXP (x, 0);
5636 op1 = XEXP (x, 1);
5638 if (op1 == const0_rtx
5639 && GET_CODE (op0) == AND)
5641 x = op0;
5642 goto cost_logic;
5645 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5647 /* TODO: A write to the CC flags possibly costs extra, this
5648 needs encoding in the cost tables. */
5650 /* CC_ZESWPmode supports zero extend for free. */
5651 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5652 op0 = XEXP (op0, 0);
5654 /* ANDS. */
5655 if (GET_CODE (op0) == AND)
5657 x = op0;
5658 goto cost_logic;
5661 if (GET_CODE (op0) == PLUS)
5663 /* ADDS (and CMN alias). */
5664 x = op0;
5665 goto cost_plus;
5668 if (GET_CODE (op0) == MINUS)
5670 /* SUBS. */
5671 x = op0;
5672 goto cost_minus;
5675 if (GET_CODE (op1) == NEG)
5677 /* CMN. */
5678 if (speed)
5679 *cost += extra_cost->alu.arith;
5681 *cost += rtx_cost (op0, COMPARE, 0, speed);
5682 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5683 return true;
5686 /* CMP.
5688 Compare can freely swap the order of operands, and
5689 canonicalization puts the more complex operation first.
5690 But the integer MINUS logic expects the shift/extend
5691 operation in op1. */
5692 if (! (REG_P (op0)
5693 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5695 op0 = XEXP (x, 1);
5696 op1 = XEXP (x, 0);
5698 goto cost_minus;
5701 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5703 /* FCMP. */
5704 if (speed)
5705 *cost += extra_cost->fp[mode == DFmode].compare;
5707 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5709 /* FCMP supports constant 0.0 for no extra cost. */
5710 return true;
5712 return false;
5715 return false;
5717 case MINUS:
5719 op0 = XEXP (x, 0);
5720 op1 = XEXP (x, 1);
5722 cost_minus:
5723 /* Detect valid immediates. */
5724 if ((GET_MODE_CLASS (mode) == MODE_INT
5725 || (GET_MODE_CLASS (mode) == MODE_CC
5726 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5727 && CONST_INT_P (op1)
5728 && aarch64_uimm12_shift (INTVAL (op1)))
5730 *cost += rtx_cost (op0, MINUS, 0, speed);
5732 if (speed)
5733 /* SUB(S) (immediate). */
5734 *cost += extra_cost->alu.arith;
5735 return true;
5739 /* Look for SUB (extended register). */
5740 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5742 if (speed)
5743 *cost += extra_cost->alu.arith_shift;
5745 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5746 (enum rtx_code) GET_CODE (op1),
5747 0, speed);
5748 return true;
5751 rtx new_op1 = aarch64_strip_extend (op1);
5753 /* Cost this as an FMA-alike operation. */
5754 if ((GET_CODE (new_op1) == MULT
5755 || GET_CODE (new_op1) == ASHIFT)
5756 && code != COMPARE)
5758 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5759 (enum rtx_code) code,
5760 speed);
5761 *cost += rtx_cost (op0, MINUS, 0, speed);
5762 return true;
5765 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5767 if (speed)
5769 if (GET_MODE_CLASS (mode) == MODE_INT)
5770 /* SUB(S). */
5771 *cost += extra_cost->alu.arith;
5772 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5773 /* FSUB. */
5774 *cost += extra_cost->fp[mode == DFmode].addsub;
5776 return true;
5779 case PLUS:
5781 rtx new_op0;
5783 op0 = XEXP (x, 0);
5784 op1 = XEXP (x, 1);
5786 cost_plus:
5787 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5788 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5790 /* CSINC. */
5791 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5792 *cost += rtx_cost (op1, PLUS, 1, speed);
5793 return true;
5796 if (GET_MODE_CLASS (mode) == MODE_INT
5797 && CONST_INT_P (op1)
5798 && aarch64_uimm12_shift (INTVAL (op1)))
5800 *cost += rtx_cost (op0, PLUS, 0, speed);
5802 if (speed)
5803 /* ADD (immediate). */
5804 *cost += extra_cost->alu.arith;
5805 return true;
5808 /* Look for ADD (extended register). */
5809 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5811 if (speed)
5812 *cost += extra_cost->alu.arith_shift;
5814 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5815 (enum rtx_code) GET_CODE (op0),
5816 0, speed);
5817 return true;
5820 /* Strip any extend, leave shifts behind as we will
5821 cost them through mult_cost. */
5822 new_op0 = aarch64_strip_extend (op0);
5824 if (GET_CODE (new_op0) == MULT
5825 || GET_CODE (new_op0) == ASHIFT)
5827 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5828 speed);
5829 *cost += rtx_cost (op1, PLUS, 1, speed);
5830 return true;
5833 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5834 + rtx_cost (op1, PLUS, 1, speed));
5836 if (speed)
5838 if (GET_MODE_CLASS (mode) == MODE_INT)
5839 /* ADD. */
5840 *cost += extra_cost->alu.arith;
5841 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5842 /* FADD. */
5843 *cost += extra_cost->fp[mode == DFmode].addsub;
5845 return true;
5848 case BSWAP:
5849 *cost = COSTS_N_INSNS (1);
5851 if (speed)
5852 *cost += extra_cost->alu.rev;
5854 return false;
5856 case IOR:
5857 if (aarch_rev16_p (x))
5859 *cost = COSTS_N_INSNS (1);
5861 if (speed)
5862 *cost += extra_cost->alu.rev;
5864 return true;
5866 /* Fall through. */
5867 case XOR:
5868 case AND:
5869 cost_logic:
5870 op0 = XEXP (x, 0);
5871 op1 = XEXP (x, 1);
5873 if (code == AND
5874 && GET_CODE (op0) == MULT
5875 && CONST_INT_P (XEXP (op0, 1))
5876 && CONST_INT_P (op1)
5877 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5878 INTVAL (op1)) != 0)
5880 /* This is a UBFM/SBFM. */
5881 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5882 if (speed)
5883 *cost += extra_cost->alu.bfx;
5884 return true;
5887 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5889 /* We possibly get the immediate for free, but this is not
5890 modelled. */
5891 if (CONST_INT_P (op1)
5892 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5894 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5896 if (speed)
5897 *cost += extra_cost->alu.logical;
5899 return true;
5901 else
5903 rtx new_op0 = op0;
5905 /* Handle ORN, EON, or BIC. */
5906 if (GET_CODE (op0) == NOT)
5907 op0 = XEXP (op0, 0);
5909 new_op0 = aarch64_strip_shift (op0);
5911 /* If we had a shift on op0 then this is a logical-shift-
5912 by-register/immediate operation. Otherwise, this is just
5913 a logical operation. */
5914 if (speed)
5916 if (new_op0 != op0)
5918 /* Shift by immediate. */
5919 if (CONST_INT_P (XEXP (op0, 1)))
5920 *cost += extra_cost->alu.log_shift;
5921 else
5922 *cost += extra_cost->alu.log_shift_reg;
5924 else
5925 *cost += extra_cost->alu.logical;
5928 /* In both cases we want to cost both operands. */
5929 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5930 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5932 return true;
5935 return false;
5937 case NOT:
5938 /* MVN. */
5939 if (speed)
5940 *cost += extra_cost->alu.logical;
5942 /* The logical instruction could have the shifted register form,
5943 but the cost is the same if the shift is processed as a separate
5944 instruction, so we don't bother with it here. */
5945 return false;
5947 case ZERO_EXTEND:
5949 op0 = XEXP (x, 0);
5950 /* If a value is written in SI mode, then zero extended to DI
5951 mode, the operation will in general be free as a write to
5952 a 'w' register implicitly zeroes the upper bits of an 'x'
5953 register. However, if this is
5955 (set (reg) (zero_extend (reg)))
5957 we must cost the explicit register move. */
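/* For example, (set (reg:DI x0) (zero_extend:DI (reg:SI w1))) is emitted
   as "mov w0, w1", so it is costed as a move rather than being free
   (register names illustrative). */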
5958 if (mode == DImode
5959 && GET_MODE (op0) == SImode
5960 && outer == SET)
5962 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5964 if (!op_cost && speed)
5965 /* MOV. */
5966 *cost += extra_cost->alu.extend;
5967 else
5968 /* Free, the cost is that of the SI mode operation. */
5969 *cost = op_cost;
5971 return true;
5973 else if (MEM_P (XEXP (x, 0)))
5975 /* All loads can zero extend to any size for free. */
5976 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5977 return true;
5980 /* UXTB/UXTH. */
5981 if (speed)
5982 *cost += extra_cost->alu.extend;
5984 return false;
5986 case SIGN_EXTEND:
5987 if (MEM_P (XEXP (x, 0)))
5989 /* LDRSH. */
5990 if (speed)
5992 rtx address = XEXP (XEXP (x, 0), 0);
5993 *cost += extra_cost->ldst.load_sign_extend;
5995 *cost +=
5996 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5997 0, speed));
5999 return true;
6002 if (speed)
6003 *cost += extra_cost->alu.extend;
6004 return false;
6006 case ASHIFT:
6007 op0 = XEXP (x, 0);
6008 op1 = XEXP (x, 1);
6010 if (CONST_INT_P (op1))
6012 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6013 aliases. */
6014 if (speed)
6015 *cost += extra_cost->alu.shift;
6017 /* We can incorporate zero/sign extend for free. */
6018 if (GET_CODE (op0) == ZERO_EXTEND
6019 || GET_CODE (op0) == SIGN_EXTEND)
6020 op0 = XEXP (op0, 0);
6022 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6023 return true;
6025 else
6027 /* LSLV. */
6028 if (speed)
6029 *cost += extra_cost->alu.shift_reg;
6031 return false; /* All arguments need to be in registers. */
6034 case ROTATE:
6035 case ROTATERT:
6036 case LSHIFTRT:
6037 case ASHIFTRT:
6038 op0 = XEXP (x, 0);
6039 op1 = XEXP (x, 1);
6041 if (CONST_INT_P (op1))
6043 /* ASR (immediate) and friends. */
6044 if (speed)
6045 *cost += extra_cost->alu.shift;
6047 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6048 return true;
6050 else
6053 /* ASR (register) and friends. */
6054 if (speed)
6055 *cost += extra_cost->alu.shift_reg;
6057 return false; /* All arguments need to be in registers. */
6060 case SYMBOL_REF:
6062 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6064 /* LDR. */
6065 if (speed)
6066 *cost += extra_cost->ldst.load;
6068 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6069 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6071 /* ADRP, followed by ADD. */
6072 *cost += COSTS_N_INSNS (1);
6073 if (speed)
6074 *cost += 2 * extra_cost->alu.arith;
6076 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6077 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6079 /* ADR. */
6080 if (speed)
6081 *cost += extra_cost->alu.arith;
6084 if (flag_pic)
6086 /* One extra load instruction, after accessing the GOT. */
6087 *cost += COSTS_N_INSNS (1);
6088 if (speed)
6089 *cost += extra_cost->ldst.load;
6091 return true;
6093 case HIGH:
6094 case LO_SUM:
6095 /* ADRP/ADD (immediate). */
6096 if (speed)
6097 *cost += extra_cost->alu.arith;
6098 return true;
6100 case ZERO_EXTRACT:
6101 case SIGN_EXTRACT:
6102 /* UBFX/SBFX. */
6103 if (speed)
6104 *cost += extra_cost->alu.bfx;
6106 /* We can trust that the immediates used will be correct (there
6107 are no by-register forms), so we need only cost op0. */
6108 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6109 return true;
6111 case MULT:
6112 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6113 /* aarch64_rtx_mult_cost always handles recursion to its
6114 operands. */
6115 return true;
6117 case MOD:
6118 case UMOD:
6119 if (speed)
6121 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6122 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6123 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6124 else if (GET_MODE (x) == DFmode)
6125 *cost += (extra_cost->fp[1].mult
6126 + extra_cost->fp[1].div);
6127 else if (GET_MODE (x) == SFmode)
6128 *cost += (extra_cost->fp[0].mult
6129 + extra_cost->fp[0].div);
6131 return false; /* All arguments need to be in registers. */
6133 case DIV:
6134 case UDIV:
6135 case SQRT:
6136 if (speed)
6138 if (GET_MODE_CLASS (mode) == MODE_INT)
6139 /* There is no integer SQRT, so only DIV and UDIV can get
6140 here. */
6141 *cost += extra_cost->mult[mode == DImode].idiv;
6142 else
6143 *cost += extra_cost->fp[mode == DFmode].div;
6145 return false; /* All arguments need to be in registers. */
6147 case IF_THEN_ELSE:
6148 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6149 XEXP (x, 2), cost, speed);
6151 case EQ:
6152 case NE:
6153 case GT:
6154 case GTU:
6155 case LT:
6156 case LTU:
6157 case GE:
6158 case GEU:
6159 case LE:
6160 case LEU:
6162 return false; /* All arguments must be in registers. */
6164 case FMA:
6165 op0 = XEXP (x, 0);
6166 op1 = XEXP (x, 1);
6167 op2 = XEXP (x, 2);
6169 if (speed)
6170 *cost += extra_cost->fp[mode == DFmode].fma;
6172 /* FMSUB, FNMADD, and FNMSUB are free. */
6173 if (GET_CODE (op0) == NEG)
6174 op0 = XEXP (op0, 0);
6176 if (GET_CODE (op2) == NEG)
6177 op2 = XEXP (op2, 0);
6179 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6180 and the by-element operand as operand 0. */
6181 if (GET_CODE (op1) == NEG)
6182 op1 = XEXP (op1, 0);
6184 /* Catch vector-by-element operations. The by-element operand can
6185 either be (vec_duplicate (vec_select (x))) or just
6186 (vec_select (x)), depending on whether we are multiplying by
6187 a vector or a scalar.
6189 Canonicalization is not very good in these cases: FMA4 will put the
6190 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6191 if (GET_CODE (op0) == VEC_DUPLICATE)
6192 op0 = XEXP (op0, 0);
6193 else if (GET_CODE (op1) == VEC_DUPLICATE)
6194 op1 = XEXP (op1, 0);
6196 if (GET_CODE (op0) == VEC_SELECT)
6197 op0 = XEXP (op0, 0);
6198 else if (GET_CODE (op1) == VEC_SELECT)
6199 op1 = XEXP (op1, 0);
6201 /* If the remaining parameters are not registers,
6202 get the cost to put them into registers. */
6203 *cost += rtx_cost (op0, FMA, 0, speed);
6204 *cost += rtx_cost (op1, FMA, 1, speed);
6205 *cost += rtx_cost (op2, FMA, 2, speed);
6206 return true;
6208 case FLOAT_EXTEND:
6209 if (speed)
6210 *cost += extra_cost->fp[mode == DFmode].widen;
6211 return false;
6213 case FLOAT_TRUNCATE:
6214 if (speed)
6215 *cost += extra_cost->fp[mode == DFmode].narrow;
6216 return false;
6218 case FIX:
6219 case UNSIGNED_FIX:
6220 x = XEXP (x, 0);
6221 /* Strip the rounding part. They will all be implemented
6222 by the fcvt* family of instructions anyway. */
6223 if (GET_CODE (x) == UNSPEC)
6225 unsigned int uns_code = XINT (x, 1);
6227 if (uns_code == UNSPEC_FRINTA
6228 || uns_code == UNSPEC_FRINTM
6229 || uns_code == UNSPEC_FRINTN
6230 || uns_code == UNSPEC_FRINTP
6231 || uns_code == UNSPEC_FRINTZ)
6232 x = XVECEXP (x, 0, 0);
6235 if (speed)
6236 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6238 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6239 return true;
6241 case ABS:
6242 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6244 /* FABS and FNEG are analogous. */
6245 if (speed)
6246 *cost += extra_cost->fp[mode == DFmode].neg;
6248 else
6250 /* Integer ABS will either be split to
6251 two arithmetic instructions, or will be an ABS
6252 (scalar), which we don't model. */
6253 *cost = COSTS_N_INSNS (2);
6254 if (speed)
6255 *cost += 2 * extra_cost->alu.arith;
6257 return false;
6259 case SMAX:
6260 case SMIN:
6261 if (speed)
6263 /* FMAXNM/FMINNM/FMAX/FMIN.
6264 TODO: This may not be accurate for all implementations, but
6265 we do not model this in the cost tables. */
6266 *cost += extra_cost->fp[mode == DFmode].addsub;
6268 return false;
6270 case UNSPEC:
6271 /* The floating point round to integer frint* instructions. */
6272 if (aarch64_frint_unspec_p (XINT (x, 1)))
6274 if (speed)
6275 *cost += extra_cost->fp[mode == DFmode].roundint;
6277 return false;
6280 if (XINT (x, 1) == UNSPEC_RBIT)
6282 if (speed)
6283 *cost += extra_cost->alu.rev;
6285 return false;
6287 break;
6289 case TRUNCATE:
6291 /* Decompose <su>muldi3_highpart. */
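/* That is, (truncate:DI (lshiftrt:TI (mult:TI (sign_extend:TI x)
   (sign_extend:TI y)) (const_int 64))) -- or the zero_extend equivalent --
   is a single SMULH/UMULH instruction. */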
6292 if (/* (truncate:DI */
6293 mode == DImode
6294 /* (lshiftrt:TI */
6295 && GET_MODE (XEXP (x, 0)) == TImode
6296 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6297 /* (mult:TI */
6298 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6299 /* (ANY_EXTEND:TI (reg:DI))
6300 (ANY_EXTEND:TI (reg:DI))) */
6301 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6302 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6303 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6304 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6305 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6306 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6307 /* (const_int 64) */
6308 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6309 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6311 /* UMULH/SMULH. */
6312 if (speed)
6313 *cost += extra_cost->mult[mode == DImode].extend;
6314 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6315 MULT, 0, speed);
6316 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6317 MULT, 1, speed);
6318 return true;
6321 /* Fall through. */
6322 default:
6323 break;
6326 if (dump_file && (dump_flags & TDF_DETAILS))
6327 fprintf (dump_file,
6328 "\nFailed to cost RTX. Assuming default cost.\n");
6330 return true;
6333 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6334 calculated for X. This cost is stored in *COST. Returns true
6335 if the total cost of X was calculated. */
6336 static bool
6337 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6338 int param, int *cost, bool speed)
6340 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6342 if (dump_file && (dump_flags & TDF_DETAILS))
6344 print_rtl_single (dump_file, x);
6345 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6346 speed ? "Hot" : "Cold",
6347 *cost, result ? "final" : "partial");
6350 return result;
6353 static int
6354 aarch64_register_move_cost (machine_mode mode,
6355 reg_class_t from_i, reg_class_t to_i)
6357 enum reg_class from = (enum reg_class) from_i;
6358 enum reg_class to = (enum reg_class) to_i;
6359 const struct cpu_regmove_cost *regmove_cost
6360 = aarch64_tune_params->regmove_cost;
6362 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6363 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6364 to = GENERAL_REGS;
6366 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6367 from = GENERAL_REGS;
6369 /* Moving between GPR and stack cost is the same as GP2GP. */
6370 if ((from == GENERAL_REGS && to == STACK_REG)
6371 || (to == GENERAL_REGS && from == STACK_REG))
6372 return regmove_cost->GP2GP;
6374 /* To/From the stack register, we move via the gprs. */
6375 if (to == STACK_REG || from == STACK_REG)
6376 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6377 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6379 if (GET_MODE_SIZE (mode) == 16)
6381 /* 128-bit operations on general registers require 2 instructions. */
6382 if (from == GENERAL_REGS && to == GENERAL_REGS)
6383 return regmove_cost->GP2GP * 2;
6384 else if (from == GENERAL_REGS)
6385 return regmove_cost->GP2FP * 2;
6386 else if (to == GENERAL_REGS)
6387 return regmove_cost->FP2GP * 2;
6389 /* When AdvSIMD instructions are disabled it is not possible to move
6390 a 128-bit value directly between Q registers. This is handled in
6391 secondary reload. A general register is used as a scratch to move
6392 the upper DI value and the lower DI value is moved directly,
6393 hence the cost is the sum of three moves. */
6394 if (! TARGET_SIMD)
6395 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6397 return regmove_cost->FP2FP;
6400 if (from == GENERAL_REGS && to == GENERAL_REGS)
6401 return regmove_cost->GP2GP;
6402 else if (from == GENERAL_REGS)
6403 return regmove_cost->GP2FP;
6404 else if (to == GENERAL_REGS)
6405 return regmove_cost->FP2GP;
6407 return regmove_cost->FP2FP;
6410 static int
6411 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6412 reg_class_t rclass ATTRIBUTE_UNUSED,
6413 bool in ATTRIBUTE_UNUSED)
6415 return aarch64_tune_params->memmov_cost;
6418 /* Return the number of instructions that can be issued per cycle. */
6419 static int
6420 aarch64_sched_issue_rate (void)
6422 return aarch64_tune_params->issue_rate;
6425 /* Vectorizer cost model target hooks. */
6427 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6428 static int
6429 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6430 tree vectype,
6431 int misalign ATTRIBUTE_UNUSED)
6433 unsigned elements;
6435 switch (type_of_cost)
6437 case scalar_stmt:
6438 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6440 case scalar_load:
6441 return aarch64_tune_params->vec_costs->scalar_load_cost;
6443 case scalar_store:
6444 return aarch64_tune_params->vec_costs->scalar_store_cost;
6446 case vector_stmt:
6447 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6449 case vector_load:
6450 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6452 case vector_store:
6453 return aarch64_tune_params->vec_costs->vec_store_cost;
6455 case vec_to_scalar:
6456 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6458 case scalar_to_vec:
6459 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6461 case unaligned_load:
6462 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6464 case unaligned_store:
6465 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6467 case cond_branch_taken:
6468 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6470 case cond_branch_not_taken:
6471 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6473 case vec_perm:
6474 case vec_promote_demote:
6475 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6477 case vec_construct:
6478 elements = TYPE_VECTOR_SUBPARTS (vectype);
6479 return elements / 2 + 1;
6481 default:
6482 gcc_unreachable ();
6486 /* Implement targetm.vectorize.add_stmt_cost. */
6487 static unsigned
6488 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6489 struct _stmt_vec_info *stmt_info, int misalign,
6490 enum vect_cost_model_location where)
6492 unsigned *cost = (unsigned *) data;
6493 unsigned retval = 0;
6495 if (flag_vect_cost_model)
6497 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6498 int stmt_cost =
6499 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6501 /* Statements in an inner loop relative to the loop being
6502 vectorized are weighted more heavily. The value here is
6503 a function (linear for now) of the loop nest level. */
6504 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6506 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6507 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6508 unsigned nest_level = loop_depth (loop);
6510 count *= nest_level;
6513 retval = (unsigned) (count * stmt_cost);
6514 cost[where] += retval;
6517 return retval;
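/* Illustrative note (editorial addition, not from the original sources):
   with the linear weighting above, a vector statement of cost 3 counted
   once in an inner loop whose nest level is 2 is accumulated as
   count * nest_level * stmt_cost = 1 * 2 * 3 = 6 into the vect_body slot
   of the cost array.  */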
6520 static void initialize_aarch64_code_model (void);
6522 /* Parse the architecture extension string. */
6524 static void
6525 aarch64_parse_extension (char *str)
6527 /* The extension string is parsed left to right. */
6528 const struct aarch64_option_extension *opt = NULL;
6530 /* Flag to say whether we are adding or removing an extension. */
6531 int adding_ext = -1;
6533 while (str != NULL && *str != 0)
6535 char *ext;
6536 size_t len;
6538 str++;
6539 ext = strchr (str, '+');
6541 if (ext != NULL)
6542 len = ext - str;
6543 else
6544 len = strlen (str);
6546 if (len >= 2 && strncmp (str, "no", 2) == 0)
6548 adding_ext = 0;
6549 len -= 2;
6550 str += 2;
6552 else if (len > 0)
6553 adding_ext = 1;
6555 if (len == 0)
6557 error ("missing feature modifier after %qs", adding_ext ? "+"
6558 : "+no");
6559 return;
6562 /* Scan over the extensions table trying to find an exact match. */
6563 for (opt = all_extensions; opt->name != NULL; opt++)
6565 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6567 /* Add or remove the extension. */
6568 if (adding_ext)
6569 aarch64_isa_flags |= opt->flags_on;
6570 else
6571 aarch64_isa_flags &= ~(opt->flags_off);
6572 break;
6576 if (opt->name == NULL)
6578 /* Extension not found in list. */
6579 error ("unknown feature modifier %qs", str);
6580 return;
6583 str = ext;
6586 return;
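/* Illustrative note (editorial addition, not from the original sources):
   given a hypothetical option such as -march=armv8-a+crc+nofp, the parser
   above receives the substring "+crc+nofp".  It first consumes "crc"
   (adding_ext = 1, so the extension's flags_on bits are OR-ed into
   aarch64_isa_flags) and then "nofp" (the leading "no" sets adding_ext = 0,
   so the extension's flags_off bits are cleared).  */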
6589 /* Parse the ARCH string. */
6591 static void
6592 aarch64_parse_arch (void)
6594 char *ext;
6595 const struct processor *arch;
6596 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6597 size_t len;
6599 strcpy (str, aarch64_arch_string);
6601 ext = strchr (str, '+');
6603 if (ext != NULL)
6604 len = ext - str;
6605 else
6606 len = strlen (str);
6608 if (len == 0)
6610 error ("missing arch name in -march=%qs", str);
6611 return;
6614 /* Loop through the list of supported ARCHs to find a match. */
6615 for (arch = all_architectures; arch->name != NULL; arch++)
6617 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6619 selected_arch = arch;
6620 aarch64_isa_flags = selected_arch->flags;
6622 if (!selected_cpu)
6623 selected_cpu = &all_cores[selected_arch->core];
6625 if (ext != NULL)
6627 /* ARCH string contains at least one extension. */
6628 aarch64_parse_extension (ext);
6631 if (strcmp (selected_arch->arch, selected_cpu->arch))
6633 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6634 selected_cpu->name, selected_arch->name);
6637 return;
6641 /* ARCH name not found in list. */
6642 error ("unknown value %qs for -march", str);
6643 return;
6646 /* Parse the CPU string. */
6648 static void
6649 aarch64_parse_cpu (void)
6651 char *ext;
6652 const struct processor *cpu;
6653 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6654 size_t len;
6656 strcpy (str, aarch64_cpu_string);
6658 ext = strchr (str, '+');
6660 if (ext != NULL)
6661 len = ext - str;
6662 else
6663 len = strlen (str);
6665 if (len == 0)
6667 error ("missing cpu name in -mcpu=%qs", str);
6668 return;
6671 /* Loop through the list of supported CPUs to find a match. */
6672 for (cpu = all_cores; cpu->name != NULL; cpu++)
6674 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6676 selected_cpu = cpu;
6677 aarch64_isa_flags = selected_cpu->flags;
6679 if (ext != NULL)
6681 /* CPU string contains at least one extension. */
6682 aarch64_parse_extension (ext);
6685 return;
6689 /* CPU name not found in list. */
6690 error ("unknown value %qs for -mcpu", str);
6691 return;
6694 /* Parse the TUNE string. */
6696 static void
6697 aarch64_parse_tune (void)
6699 const struct processor *cpu;
6700 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6701 strcpy (str, aarch64_tune_string);
6703 /* Loop through the list of supported CPUs to find a match. */
6704 for (cpu = all_cores; cpu->name != NULL; cpu++)
6706 if (strcmp (cpu->name, str) == 0)
6708 selected_tune = cpu;
6709 return;
6713 /* CPU name not found in list. */
6714 error ("unknown value %qs for -mtune", str);
6715 return;
6719 /* Implement TARGET_OPTION_OVERRIDE. */
6721 static void
6722 aarch64_override_options (void)
6724 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6725 If either of -march or -mtune is given, they override their
6726 respective component of -mcpu.
6728 So, first parse AARCH64_CPU_STRING, then the others.  Be careful
6729 with -march: if -mcpu is not present on the command line, -march
6730 must set a sensible default CPU.  */
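/* Illustrative note (editorial addition, not from the original sources):
   for example, -mcpu=cortex-a57 on its own behaves like
   -march=armv8-a -mtune=cortex-a57, while -mcpu=cortex-a57 combined with
   -mtune=cortex-a53 keeps the cortex-a57 ISA selection but uses the
   cortex-a53 tuning tables.  */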
6731 if (aarch64_cpu_string)
6733 aarch64_parse_cpu ();
6736 if (aarch64_arch_string)
6738 aarch64_parse_arch ();
6741 if (aarch64_tune_string)
6743 aarch64_parse_tune ();
6746 #ifndef HAVE_AS_MABI_OPTION
6747 /* The compiler may have been configured with 2.23.* binutils, which does
6748 not have support for ILP32. */
6749 if (TARGET_ILP32)
6750 error ("Assembler does not support -mabi=ilp32");
6751 #endif
6753 initialize_aarch64_code_model ();
6755 aarch64_build_bitmask_table ();
6757 /* This target defaults to strict volatile bitfields. */
6758 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6759 flag_strict_volatile_bitfields = 1;
6761 /* If the user did not specify a processor, choose the default
6762 one for them. This will be the CPU set during configuration using
6763 --with-cpu, otherwise it is "generic". */
6764 if (!selected_cpu)
6766 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6767 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6770 gcc_assert (selected_cpu);
6772 if (!selected_tune)
6773 selected_tune = selected_cpu;
6775 aarch64_tune_flags = selected_tune->flags;
6776 aarch64_tune = selected_tune->core;
6777 aarch64_tune_params = selected_tune->tune;
6778 aarch64_architecture_version = selected_cpu->architecture_version;
6780 if (aarch64_fix_a53_err835769 == 2)
6782 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6783 aarch64_fix_a53_err835769 = 1;
6784 #else
6785 aarch64_fix_a53_err835769 = 0;
6786 #endif
6789 /* If not optimizing for size, set the default
6790 alignment to what the target wants.  */
6791 if (!optimize_size)
6793 if (align_loops <= 0)
6794 align_loops = aarch64_tune_params->loop_align;
6795 if (align_jumps <= 0)
6796 align_jumps = aarch64_tune_params->jump_align;
6797 if (align_functions <= 0)
6798 align_functions = aarch64_tune_params->function_align;
6801 aarch64_override_options_after_change ();
6804 /* Implement targetm.override_options_after_change. */
6806 static void
6807 aarch64_override_options_after_change (void)
6809 if (flag_omit_frame_pointer)
6810 flag_omit_leaf_frame_pointer = false;
6811 else if (flag_omit_leaf_frame_pointer)
6812 flag_omit_frame_pointer = true;
6815 static struct machine_function *
6816 aarch64_init_machine_status (void)
6818 struct machine_function *machine;
6819 machine = ggc_cleared_alloc<machine_function> ();
6820 return machine;
6823 void
6824 aarch64_init_expanders (void)
6826 init_machine_status = aarch64_init_machine_status;
6829 /* A checking mechanism for the implementation of the various code models. */
6830 static void
6831 initialize_aarch64_code_model (void)
6833 if (flag_pic)
6835 switch (aarch64_cmodel_var)
6837 case AARCH64_CMODEL_TINY:
6838 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6839 break;
6840 case AARCH64_CMODEL_SMALL:
6841 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6842 break;
6843 case AARCH64_CMODEL_LARGE:
6844 sorry ("code model %qs with -f%s", "large",
6845 flag_pic > 1 ? "PIC" : "pic");
6846 default:
6847 gcc_unreachable ();
6850 else
6851 aarch64_cmodel = aarch64_cmodel_var;
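/* Illustrative note (editorial addition, not from the original sources):
   for example, -mcmodel=small together with -fPIC selects
   AARCH64_CMODEL_SMALL_PIC above, whereas -mcmodel=large with -fPIC is
   rejected via sorry () because no PIC variant of the large model is
   implemented.  */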
6854 /* Return true if SYMBOL_REF X binds locally. */
6856 static bool
6857 aarch64_symbol_binds_local_p (const_rtx x)
6859 return (SYMBOL_REF_DECL (x)
6860 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6861 : SYMBOL_REF_LOCAL_P (x));
6864 /* Return true if SYMBOL_REF X is thread local.  */
6865 static bool
6866 aarch64_tls_symbol_p (rtx x)
6868 if (! TARGET_HAVE_TLS)
6869 return false;
6871 if (GET_CODE (x) != SYMBOL_REF)
6872 return false;
6874 return SYMBOL_REF_TLS_MODEL (x) != 0;
6877 /* Classify a TLS symbol into one of the TLS kinds. */
6878 enum aarch64_symbol_type
6879 aarch64_classify_tls_symbol (rtx x)
6881 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6883 switch (tls_kind)
6885 case TLS_MODEL_GLOBAL_DYNAMIC:
6886 case TLS_MODEL_LOCAL_DYNAMIC:
6887 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6889 case TLS_MODEL_INITIAL_EXEC:
6890 return SYMBOL_SMALL_GOTTPREL;
6892 case TLS_MODEL_LOCAL_EXEC:
6893 return SYMBOL_SMALL_TPREL;
6895 case TLS_MODEL_EMULATED:
6896 case TLS_MODEL_NONE:
6897 return SYMBOL_FORCE_TO_MEM;
6899 default:
6900 gcc_unreachable ();
6904 /* Return the method that should be used to access SYMBOL_REF or
6905 LABEL_REF X in context CONTEXT. */
6907 enum aarch64_symbol_type
6908 aarch64_classify_symbol (rtx x, rtx offset,
6909 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6911 if (GET_CODE (x) == LABEL_REF)
6913 switch (aarch64_cmodel)
6915 case AARCH64_CMODEL_LARGE:
6916 return SYMBOL_FORCE_TO_MEM;
6918 case AARCH64_CMODEL_TINY_PIC:
6919 case AARCH64_CMODEL_TINY:
6920 return SYMBOL_TINY_ABSOLUTE;
6922 case AARCH64_CMODEL_SMALL_PIC:
6923 case AARCH64_CMODEL_SMALL:
6924 return SYMBOL_SMALL_ABSOLUTE;
6926 default:
6927 gcc_unreachable ();
6931 if (GET_CODE (x) == SYMBOL_REF)
6933 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6934 return SYMBOL_FORCE_TO_MEM;
6936 if (aarch64_tls_symbol_p (x))
6937 return aarch64_classify_tls_symbol (x);
6939 switch (aarch64_cmodel)
6941 case AARCH64_CMODEL_TINY:
6942 /* When we retrieve a symbol + offset address, we have to make sure
6943 the offset does not cause overflow of the final address. But
6944 we have no way of knowing the address of the symbol at compile time,
6945 so we can't accurately say whether the distance between the PC and
6946 symbol + offset is outside the addressable range of +/-1M in the
6947 TINY code model. So we rely on images not being greater than
6948 1M, cap the offset at 1M, and require anything beyond 1M to
6949 be loaded using an alternative mechanism. */
6950 if (SYMBOL_REF_WEAK (x)
6951 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
6952 return SYMBOL_FORCE_TO_MEM;
6953 return SYMBOL_TINY_ABSOLUTE;
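/* Illustrative note (editorial addition, not from the original sources):
   for instance, a hypothetical reference to "sym + 0x200000" exceeds the
   1M cap above, so it is classified as SYMBOL_FORCE_TO_MEM and the
   address is loaded from the literal pool rather than being formed with
   a PC-relative ADR.  */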
6955 case AARCH64_CMODEL_SMALL:
6956 /* Same reasoning as the tiny code model, but the offset cap here is
6957 4G. */
6958 if (SYMBOL_REF_WEAK (x)
6959 || INTVAL (offset) < (HOST_WIDE_INT) -4294967263
6960 || INTVAL (offset) > (HOST_WIDE_INT) 4294967264)
6961 return SYMBOL_FORCE_TO_MEM;
6962 return SYMBOL_SMALL_ABSOLUTE;
6964 case AARCH64_CMODEL_TINY_PIC:
6965 if (!aarch64_symbol_binds_local_p (x))
6966 return SYMBOL_TINY_GOT;
6967 return SYMBOL_TINY_ABSOLUTE;
6969 case AARCH64_CMODEL_SMALL_PIC:
6970 if (!aarch64_symbol_binds_local_p (x))
6971 return SYMBOL_SMALL_GOT;
6972 return SYMBOL_SMALL_ABSOLUTE;
6974 default:
6975 gcc_unreachable ();
6979 /* By default push everything into the constant pool. */
6980 return SYMBOL_FORCE_TO_MEM;
6983 bool
6984 aarch64_constant_address_p (rtx x)
6986 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6989 bool
6990 aarch64_legitimate_pic_operand_p (rtx x)
6992 if (GET_CODE (x) == SYMBOL_REF
6993 || (GET_CODE (x) == CONST
6994 && GET_CODE (XEXP (x, 0)) == PLUS
6995 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6996 return false;
6998 return true;
7001 /* Return true if X holds either a quarter-precision or
7002 floating-point +0.0 constant. */
7003 static bool
7004 aarch64_valid_floating_const (machine_mode mode, rtx x)
7006 if (!CONST_DOUBLE_P (x))
7007 return false;
7009 /* TODO: We could handle moving 0.0 to a TFmode register,
7010 but first we would like to refactor the movtf_aarch64
7011 to be more amicable to split moves properly and
7012 correctly gate on TARGET_SIMD. For now, reject all
7013 constants which are not destined for SFmode or DFmode registers. */
7014 if (!(mode == SFmode || mode == DFmode))
7015 return false;
7017 if (aarch64_float_const_zero_rtx_p (x))
7018 return true;
7019 return aarch64_float_const_representable_p (x);
7022 static bool
7023 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7025 /* Do not allow vector struct mode constants. We could support
7026 0 and -1 easily, but they need support in aarch64-simd.md. */
7027 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7028 return false;
7030 /* This could probably go away because
7031 we now decompose CONST_INTs according to expand_mov_immediate. */
7032 if ((GET_CODE (x) == CONST_VECTOR
7033 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7034 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7035 return !targetm.cannot_force_const_mem (mode, x);
7037 if (GET_CODE (x) == HIGH
7038 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7039 return true;
7041 return aarch64_constant_address_p (x);
7045 aarch64_load_tp (rtx target)
7047 if (!target
7048 || GET_MODE (target) != Pmode
7049 || !register_operand (target, Pmode))
7050 target = gen_reg_rtx (Pmode);
7052 /* Can return in any reg. */
7053 emit_insn (gen_aarch64_load_tp_hard (target));
7054 return target;
7057 /* On AAPCS systems, this is the "struct __va_list". */
7058 static GTY(()) tree va_list_type;
7060 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7061 Return the type to use as __builtin_va_list.
7063 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7065 struct __va_list
7067 void *__stack;
7068 void *__gr_top;
7069 void *__vr_top;
7070 int __gr_offs;
7071 int __vr_offs;
7072 }; */
7074 static tree
7075 aarch64_build_builtin_va_list (void)
7077 tree va_list_name;
7078 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7080 /* Create the type. */
7081 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7082 /* Give it the required name. */
7083 va_list_name = build_decl (BUILTINS_LOCATION,
7084 TYPE_DECL,
7085 get_identifier ("__va_list"),
7086 va_list_type);
7087 DECL_ARTIFICIAL (va_list_name) = 1;
7088 TYPE_NAME (va_list_type) = va_list_name;
7089 TYPE_STUB_DECL (va_list_type) = va_list_name;
7091 /* Create the fields. */
7092 f_stack = build_decl (BUILTINS_LOCATION,
7093 FIELD_DECL, get_identifier ("__stack"),
7094 ptr_type_node);
7095 f_grtop = build_decl (BUILTINS_LOCATION,
7096 FIELD_DECL, get_identifier ("__gr_top"),
7097 ptr_type_node);
7098 f_vrtop = build_decl (BUILTINS_LOCATION,
7099 FIELD_DECL, get_identifier ("__vr_top"),
7100 ptr_type_node);
7101 f_groff = build_decl (BUILTINS_LOCATION,
7102 FIELD_DECL, get_identifier ("__gr_offs"),
7103 integer_type_node);
7104 f_vroff = build_decl (BUILTINS_LOCATION,
7105 FIELD_DECL, get_identifier ("__vr_offs"),
7106 integer_type_node);
7108 DECL_ARTIFICIAL (f_stack) = 1;
7109 DECL_ARTIFICIAL (f_grtop) = 1;
7110 DECL_ARTIFICIAL (f_vrtop) = 1;
7111 DECL_ARTIFICIAL (f_groff) = 1;
7112 DECL_ARTIFICIAL (f_vroff) = 1;
7114 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7115 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7116 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7117 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7118 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7120 TYPE_FIELDS (va_list_type) = f_stack;
7121 DECL_CHAIN (f_stack) = f_grtop;
7122 DECL_CHAIN (f_grtop) = f_vrtop;
7123 DECL_CHAIN (f_vrtop) = f_groff;
7124 DECL_CHAIN (f_groff) = f_vroff;
7126 /* Compute its layout. */
7127 layout_type (va_list_type);
7129 return va_list_type;
7132 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7133 static void
7134 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7136 const CUMULATIVE_ARGS *cum;
7137 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7138 tree stack, grtop, vrtop, groff, vroff;
7139 tree t;
7140 int gr_save_area_size;
7141 int vr_save_area_size;
7142 int vr_offset;
7144 cum = &crtl->args.info;
7145 gr_save_area_size
7146 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7147 vr_save_area_size
7148 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7150 if (TARGET_GENERAL_REGS_ONLY)
7152 if (cum->aapcs_nvrn > 0)
7153 sorry ("%qs and floating point or vector arguments",
7154 "-mgeneral-regs-only");
7155 vr_save_area_size = 0;
7158 f_stack = TYPE_FIELDS (va_list_type_node);
7159 f_grtop = DECL_CHAIN (f_stack);
7160 f_vrtop = DECL_CHAIN (f_grtop);
7161 f_groff = DECL_CHAIN (f_vrtop);
7162 f_vroff = DECL_CHAIN (f_groff);
7164 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7165 NULL_TREE);
7166 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7167 NULL_TREE);
7168 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7169 NULL_TREE);
7170 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7171 NULL_TREE);
7172 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7173 NULL_TREE);
7175 /* Emit code to initialize STACK, which points to the next varargs stack
7176 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7177 by named arguments. STACK is 8-byte aligned. */
7178 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7179 if (cum->aapcs_stack_size > 0)
7180 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7181 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7182 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7184 /* Emit code to initialize GRTOP, the top of the GR save area.
7185 virtual_incoming_args_rtx should have been 16 byte aligned. */
7186 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7187 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7188 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7190 /* Emit code to initialize VRTOP, the top of the VR save area.
7191 This address is gr_save_area_size bytes below GRTOP, rounded
7192 down to the next 16-byte boundary. */
7193 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7194 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7195 STACK_BOUNDARY / BITS_PER_UNIT);
7197 if (vr_offset)
7198 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7199 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7200 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7202 /* Emit code to initialize GROFF, the offset from GRTOP of the
7203 next GPR argument. */
7204 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7205 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7206 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7208 /* Likewise emit code to initialize VROFF, the offset from FTOP
7209 of the next VR argument. */
7210 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7211 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7212 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
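/* Illustrative note (editorial addition, not from the original sources):
   for a hypothetical variadic callee with two named integer arguments and
   one named FP argument, the code above records
     __gr_offs = -(8 - 2) * 8  = -48
     __vr_offs = -(8 - 1) * 16 = -112
   with __gr_top pointing just past the GR save area and __vr_top just past
   the VR save area that lies immediately below it.  */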
7215 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7217 static tree
7218 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7219 gimple_seq *post_p ATTRIBUTE_UNUSED)
7221 tree addr;
7222 bool indirect_p;
7223 bool is_ha; /* is HFA or HVA. */
7224 bool dw_align; /* double-word align. */
7225 machine_mode ag_mode = VOIDmode;
7226 int nregs;
7227 machine_mode mode;
7229 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7230 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7231 HOST_WIDE_INT size, rsize, adjust, align;
7232 tree t, u, cond1, cond2;
7234 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7235 if (indirect_p)
7236 type = build_pointer_type (type);
7238 mode = TYPE_MODE (type);
7240 f_stack = TYPE_FIELDS (va_list_type_node);
7241 f_grtop = DECL_CHAIN (f_stack);
7242 f_vrtop = DECL_CHAIN (f_grtop);
7243 f_groff = DECL_CHAIN (f_vrtop);
7244 f_vroff = DECL_CHAIN (f_groff);
7246 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7247 f_stack, NULL_TREE);
7248 size = int_size_in_bytes (type);
7249 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7251 dw_align = false;
7252 adjust = 0;
7253 if (aarch64_vfp_is_call_or_return_candidate (mode,
7254 type,
7255 &ag_mode,
7256 &nregs,
7257 &is_ha))
7259 /* TYPE passed in fp/simd registers. */
7260 if (TARGET_GENERAL_REGS_ONLY)
7261 sorry ("%qs and floating point or vector arguments",
7262 "-mgeneral-regs-only");
7264 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7265 unshare_expr (valist), f_vrtop, NULL_TREE);
7266 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7267 unshare_expr (valist), f_vroff, NULL_TREE);
7269 rsize = nregs * UNITS_PER_VREG;
7271 if (is_ha)
7273 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7274 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7276 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7277 && size < UNITS_PER_VREG)
7279 adjust = UNITS_PER_VREG - size;
7282 else
7284 /* TYPE passed in general registers. */
7285 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7286 unshare_expr (valist), f_grtop, NULL_TREE);
7287 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7288 unshare_expr (valist), f_groff, NULL_TREE);
7289 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7290 nregs = rsize / UNITS_PER_WORD;
7292 if (align > 8)
7293 dw_align = true;
7295 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7296 && size < UNITS_PER_WORD)
7298 adjust = UNITS_PER_WORD - size;
7302 /* Get a local temporary for the field value. */
7303 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7305 /* Emit code to branch if off >= 0. */
7306 t = build2 (GE_EXPR, boolean_type_node, off,
7307 build_int_cst (TREE_TYPE (off), 0));
7308 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7310 if (dw_align)
7312 /* Emit: offs = (offs + 15) & -16. */
7313 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7314 build_int_cst (TREE_TYPE (off), 15));
7315 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7316 build_int_cst (TREE_TYPE (off), -16));
7317 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7319 else
7320 roundup = NULL;
7322 /* Update ap.__[g|v]r_offs */
7323 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7324 build_int_cst (TREE_TYPE (off), rsize));
7325 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7327 /* String up. */
7328 if (roundup)
7329 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7331 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7332 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7333 build_int_cst (TREE_TYPE (f_off), 0));
7334 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7336 /* String up: make sure the assignment happens before the use. */
7337 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7338 COND_EXPR_ELSE (cond1) = t;
7340 /* Prepare the trees handling the argument that is passed on the stack;
7341 the top-level node will be stored in ON_STACK. */
7342 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7343 if (align > 8)
7345 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7346 t = fold_convert (intDI_type_node, arg);
7347 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7348 build_int_cst (TREE_TYPE (t), 15));
7349 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7350 build_int_cst (TREE_TYPE (t), -16));
7351 t = fold_convert (TREE_TYPE (arg), t);
7352 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7354 else
7355 roundup = NULL;
7356 /* Advance ap.__stack */
7357 t = fold_convert (intDI_type_node, arg);
7358 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7359 build_int_cst (TREE_TYPE (t), size + 7));
7360 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7361 build_int_cst (TREE_TYPE (t), -8));
7362 t = fold_convert (TREE_TYPE (arg), t);
7363 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7364 /* String up roundup and advance. */
7365 if (roundup)
7366 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7367 /* String up with arg */
7368 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7369 /* Big-endianness related address adjustment. */
7370 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7371 && size < UNITS_PER_WORD)
7373 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7374 size_int (UNITS_PER_WORD - size));
7375 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7378 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7379 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7381 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7382 t = off;
7383 if (adjust)
7384 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7385 build_int_cst (TREE_TYPE (off), adjust));
7387 t = fold_convert (sizetype, t);
7388 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7390 if (is_ha)
7392 /* type ha; // treat as "struct {ftype field[n];}"
7393 ... [computing offs]
7394 for (i = 0; i <nregs; ++i, offs += 16)
7395 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7396 return ha; */
7397 int i;
7398 tree tmp_ha, field_t, field_ptr_t;
7400 /* Declare a local variable. */
7401 tmp_ha = create_tmp_var_raw (type, "ha");
7402 gimple_add_tmp_var (tmp_ha);
7404 /* Establish the base type. */
7405 switch (ag_mode)
7407 case SFmode:
7408 field_t = float_type_node;
7409 field_ptr_t = float_ptr_type_node;
7410 break;
7411 case DFmode:
7412 field_t = double_type_node;
7413 field_ptr_t = double_ptr_type_node;
7414 break;
7415 case TFmode:
7416 field_t = long_double_type_node;
7417 field_ptr_t = long_double_ptr_type_node;
7418 break;
7419 /* Half-precision and quad-precision floats are not fully supported yet.
7420 Enable the following code once that support is complete; we still need
7421 to find the correct type node for __fp16 *. */
7422 #if 0
7423 case HFmode:
7424 field_t = float_type_node;
7425 field_ptr_t = float_ptr_type_node;
7426 break;
7427 #endif
7428 case V2SImode:
7429 case V4SImode:
7431 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7432 field_t = build_vector_type_for_mode (innertype, ag_mode);
7433 field_ptr_t = build_pointer_type (field_t);
7435 break;
7436 default:
7437 gcc_assert (0);
7440 /* *(field_ptr_t)&ha = *((field_ptr_t) vr_saved_area).  */
7441 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7442 addr = t;
7443 t = fold_convert (field_ptr_t, addr);
7444 t = build2 (MODIFY_EXPR, field_t,
7445 build1 (INDIRECT_REF, field_t, tmp_ha),
7446 build1 (INDIRECT_REF, field_t, t));
7448 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7449 for (i = 1; i < nregs; ++i)
7451 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7452 u = fold_convert (field_ptr_t, addr);
7453 u = build2 (MODIFY_EXPR, field_t,
7454 build2 (MEM_REF, field_t, tmp_ha,
7455 build_int_cst (field_ptr_t,
7456 (i *
7457 int_size_in_bytes (field_t)))),
7458 build1 (INDIRECT_REF, field_t, u));
7459 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7462 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7463 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7466 COND_EXPR_ELSE (cond2) = t;
7467 addr = fold_convert (build_pointer_type (type), cond1);
7468 addr = build_va_arg_indirect_ref (addr);
7470 if (indirect_p)
7471 addr = build_va_arg_indirect_ref (addr);
7473 return addr;
7476 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7478 static void
7479 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7480 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7481 int no_rtl)
7483 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7484 CUMULATIVE_ARGS local_cum;
7485 int gr_saved, vr_saved;
7487 /* The caller has advanced CUM up to, but not beyond, the last named
7488 argument. Advance a local copy of CUM past the last "real" named
7489 argument, to find out how many registers are left over. */
7490 local_cum = *cum;
7491 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7493 /* Find out how many registers we need to save. */
7494 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7495 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7497 if (TARGET_GENERAL_REGS_ONLY)
7499 if (local_cum.aapcs_nvrn > 0)
7500 sorry ("%qs and floating point or vector arguments",
7501 "-mgeneral-regs-only");
7502 vr_saved = 0;
7505 if (!no_rtl)
7507 if (gr_saved > 0)
7509 rtx ptr, mem;
7511 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7512 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7513 - gr_saved * UNITS_PER_WORD);
7514 mem = gen_frame_mem (BLKmode, ptr);
7515 set_mem_alias_set (mem, get_varargs_alias_set ());
7517 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7518 mem, gr_saved);
7520 if (vr_saved > 0)
7522 /* We can't use move_block_from_reg, because it will use
7523 the wrong mode, storing D regs only. */
7524 machine_mode mode = TImode;
7525 int off, i;
7527 /* Set OFF to the offset from virtual_incoming_args_rtx of
7528 the first vector register. The VR save area lies below
7529 the GR one, and is aligned to 16 bytes. */
7530 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7531 STACK_BOUNDARY / BITS_PER_UNIT);
7532 off -= vr_saved * UNITS_PER_VREG;
7534 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7536 rtx ptr, mem;
7538 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7539 mem = gen_frame_mem (mode, ptr);
7540 set_mem_alias_set (mem, get_varargs_alias_set ());
7541 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7542 off += UNITS_PER_VREG;
7547 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7548 any complication of having crtl->args.pretend_args_size changed. */
7549 cfun->machine->frame.saved_varargs_size
7550 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7551 STACK_BOUNDARY / BITS_PER_UNIT)
7552 + vr_saved * UNITS_PER_VREG);
7555 static void
7556 aarch64_conditional_register_usage (void)
7558 int i;
7559 if (!TARGET_FLOAT)
7561 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7563 fixed_regs[i] = 1;
7564 call_used_regs[i] = 1;
7569 /* Walk down the type tree of TYPE counting consecutive base elements.
7570 If *MODEP is VOIDmode, then set it to the first valid floating point
7571 type. If a non-floating point type is found, or if a floating point
7572 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7573 otherwise return the count in the sub-tree. */
7574 static int
7575 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7577 machine_mode mode;
7578 HOST_WIDE_INT size;
7580 switch (TREE_CODE (type))
7582 case REAL_TYPE:
7583 mode = TYPE_MODE (type);
7584 if (mode != DFmode && mode != SFmode && mode != TFmode)
7585 return -1;
7587 if (*modep == VOIDmode)
7588 *modep = mode;
7590 if (*modep == mode)
7591 return 1;
7593 break;
7595 case COMPLEX_TYPE:
7596 mode = TYPE_MODE (TREE_TYPE (type));
7597 if (mode != DFmode && mode != SFmode && mode != TFmode)
7598 return -1;
7600 if (*modep == VOIDmode)
7601 *modep = mode;
7603 if (*modep == mode)
7604 return 2;
7606 break;
7608 case VECTOR_TYPE:
7609 /* Use V2SImode and V4SImode as representatives of all 64-bit
7610 and 128-bit vector types. */
7611 size = int_size_in_bytes (type);
7612 switch (size)
7614 case 8:
7615 mode = V2SImode;
7616 break;
7617 case 16:
7618 mode = V4SImode;
7619 break;
7620 default:
7621 return -1;
7624 if (*modep == VOIDmode)
7625 *modep = mode;
7627 /* Vector modes are considered to be opaque: two vectors are
7628 equivalent for the purposes of being homogeneous aggregates
7629 if they are the same size. */
7630 if (*modep == mode)
7631 return 1;
7633 break;
7635 case ARRAY_TYPE:
7637 int count;
7638 tree index = TYPE_DOMAIN (type);
7640 /* Can't handle incomplete types nor sizes that are not
7641 fixed. */
7642 if (!COMPLETE_TYPE_P (type)
7643 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7644 return -1;
7646 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7647 if (count == -1
7648 || !index
7649 || !TYPE_MAX_VALUE (index)
7650 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7651 || !TYPE_MIN_VALUE (index)
7652 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7653 || count < 0)
7654 return -1;
7656 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7657 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7659 /* There must be no padding. */
7660 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7661 return -1;
7663 return count;
7666 case RECORD_TYPE:
7668 int count = 0;
7669 int sub_count;
7670 tree field;
7672 /* Can't handle incomplete types nor sizes that are not
7673 fixed. */
7674 if (!COMPLETE_TYPE_P (type)
7675 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7676 return -1;
7678 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7680 if (TREE_CODE (field) != FIELD_DECL)
7681 continue;
7683 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7684 if (sub_count < 0)
7685 return -1;
7686 count += sub_count;
7689 /* There must be no padding. */
7690 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7691 return -1;
7693 return count;
7696 case UNION_TYPE:
7697 case QUAL_UNION_TYPE:
7699 /* These aren't very interesting except in a degenerate case. */
7700 int count = 0;
7701 int sub_count;
7702 tree field;
7704 /* Can't handle incomplete types nor sizes that are not
7705 fixed. */
7706 if (!COMPLETE_TYPE_P (type)
7707 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7708 return -1;
7710 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7712 if (TREE_CODE (field) != FIELD_DECL)
7713 continue;
7715 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7716 if (sub_count < 0)
7717 return -1;
7718 count = count > sub_count ? count : sub_count;
7721 /* There must be no padding. */
7722 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7723 return -1;
7725 return count;
7728 default:
7729 break;
7732 return -1;
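/* Illustrative note (editorial addition, not from the original sources):
   for a hypothetical type "struct { double x; double y[3]; }" the walk
   above returns 4 with *modep set to DFmode, so the type can qualify as a
   homogeneous floating-point aggregate.  A struct mixing float and double
   fields returns -1 because the second field's mode no longer matches
   *modep.  */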
7735 /* Return true if we use LRA instead of the reload pass. */
7736 static bool
7737 aarch64_lra_p (void)
7739 return aarch64_lra_flag;
7742 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7743 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7744 array types. The C99 floating-point complex types are also considered
7745 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7746 types, which are GCC extensions and outside the scope of AAPCS64, are
7747 treated as composite types here as well.
7749 Note that MODE itself is not sufficient in determining whether a type
7750 is such a composite type or not. This is because
7751 stor-layout.c:compute_record_mode may have already changed the MODE
7752 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7753 structure with only one field may have its MODE set to the mode of the
7754 field. Also an integer mode whose size matches the size of the
7755 RECORD_TYPE type may be used to substitute the original mode
7756 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7757 solely relied on. */
7759 static bool
7760 aarch64_composite_type_p (const_tree type,
7761 machine_mode mode)
7763 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7764 return true;
7766 if (mode == BLKmode
7767 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7768 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7769 return true;
7771 return false;
7774 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7775 type as described in AAPCS64 \S 4.1.2.
7777 See the comment above aarch64_composite_type_p for the notes on MODE. */
7779 static bool
7780 aarch64_short_vector_p (const_tree type,
7781 machine_mode mode)
7783 HOST_WIDE_INT size = -1;
7785 if (type && TREE_CODE (type) == VECTOR_TYPE)
7786 size = int_size_in_bytes (type);
7787 else if (!aarch64_composite_type_p (type, mode)
7788 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7789 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7790 size = GET_MODE_SIZE (mode);
7792 return (size == 8 || size == 16) ? true : false;
7795 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7796 shall be passed or returned in simd/fp register(s) (providing these
7797 parameter passing registers are available).
7799 Upon successful return, *COUNT returns the number of needed registers,
7800 *BASE_MODE returns the mode of the individual register and, when IS_HA
7801 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7802 floating-point aggregate or a homogeneous short-vector aggregate. */
7804 static bool
7805 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7806 const_tree type,
7807 machine_mode *base_mode,
7808 int *count,
7809 bool *is_ha)
7811 machine_mode new_mode = VOIDmode;
7812 bool composite_p = aarch64_composite_type_p (type, mode);
7814 if (is_ha != NULL) *is_ha = false;
7816 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7817 || aarch64_short_vector_p (type, mode))
7819 *count = 1;
7820 new_mode = mode;
7822 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7824 if (is_ha != NULL) *is_ha = true;
7825 *count = 2;
7826 new_mode = GET_MODE_INNER (mode);
7828 else if (type && composite_p)
7830 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7832 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7834 if (is_ha != NULL) *is_ha = true;
7835 *count = ag_count;
7837 else
7838 return false;
7840 else
7841 return false;
7843 *base_mode = new_mode;
7844 return true;
7847 /* Implement TARGET_STRUCT_VALUE_RTX. */
7849 static rtx
7850 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7851 int incoming ATTRIBUTE_UNUSED)
7853 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7856 /* Implements target hook vector_mode_supported_p. */
7857 static bool
7858 aarch64_vector_mode_supported_p (machine_mode mode)
7860 if (TARGET_SIMD
7861 && (mode == V4SImode || mode == V8HImode
7862 || mode == V16QImode || mode == V2DImode
7863 || mode == V2SImode || mode == V4HImode
7864 || mode == V8QImode || mode == V2SFmode
7865 || mode == V4SFmode || mode == V2DFmode
7866 || mode == V1DFmode))
7867 return true;
7869 return false;
7872 /* Return appropriate SIMD container
7873 for MODE within a vector of WIDTH bits. */
7874 static machine_mode
7875 aarch64_simd_container_mode (machine_mode mode, unsigned width)
7877 gcc_assert (width == 64 || width == 128);
7878 if (TARGET_SIMD)
7880 if (width == 128)
7881 switch (mode)
7883 case DFmode:
7884 return V2DFmode;
7885 case SFmode:
7886 return V4SFmode;
7887 case SImode:
7888 return V4SImode;
7889 case HImode:
7890 return V8HImode;
7891 case QImode:
7892 return V16QImode;
7893 case DImode:
7894 return V2DImode;
7895 default:
7896 break;
7898 else
7899 switch (mode)
7901 case SFmode:
7902 return V2SFmode;
7903 case SImode:
7904 return V2SImode;
7905 case HImode:
7906 return V4HImode;
7907 case QImode:
7908 return V8QImode;
7909 default:
7910 break;
7913 return word_mode;
7916 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7917 static machine_mode
7918 aarch64_preferred_simd_mode (machine_mode mode)
7920 return aarch64_simd_container_mode (mode, 128);
7923 /* Return the bitmask of possible vector sizes for the vectorizer
7924 to iterate over. */
7925 static unsigned int
7926 aarch64_autovectorize_vector_sizes (void)
7928 return (16 | 8);
7931 /* Implement TARGET_MANGLE_TYPE. */
7933 static const char *
7934 aarch64_mangle_type (const_tree type)
7936 /* The AArch64 ABI documents say that "__va_list" has to be
7937 mangled as if it is in the "std" namespace. */
7938 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7939 return "St9__va_list";
7941 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
7942 builtin types. */
7943 if (TYPE_NAME (type) != NULL)
7944 return aarch64_mangle_builtin_type (type);
7946 /* Use the default mangling. */
7947 return NULL;
7951 /* Return true if the rtx_insn contains a MEM RTX somewhere
7952 in it. */
7954 static bool
7955 has_memory_op (rtx_insn *mem_insn)
7957 subrtx_iterator::array_type array;
7958 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
7959 if (MEM_P (*iter))
7960 return true;
7962 return false;
7965 /* Find the first rtx_insn before insn that will generate an assembly
7966 instruction. */
7968 static rtx_insn *
7969 aarch64_prev_real_insn (rtx_insn *insn)
7971 if (!insn)
7972 return NULL;
7976 insn = prev_real_insn (insn);
7978 while (insn && recog_memoized (insn) < 0);
7980 return insn;
7983 static bool
7984 is_madd_op (enum attr_type t1)
7986 unsigned int i;
7987 /* A number of these may be AArch32 only. */
7988 enum attr_type mlatypes[] = {
7989 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
7990 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
7991 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
7994 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
7996 if (t1 == mlatypes[i])
7997 return true;
8000 return false;
8003 /* Check if there is a register dependency between a load and the insn
8004 for which we hold recog_data. */
8006 static bool
8007 dep_between_memop_and_curr (rtx memop)
8009 rtx load_reg;
8010 int opno;
8012 gcc_assert (GET_CODE (memop) == SET);
8014 if (!REG_P (SET_DEST (memop)))
8015 return false;
8017 load_reg = SET_DEST (memop);
8018 for (opno = 1; opno < recog_data.n_operands; opno++)
8020 rtx operand = recog_data.operand[opno];
8021 if (REG_P (operand)
8022 && reg_overlap_mentioned_p (load_reg, operand))
8023 return true;
8026 return false;
8030 /* When working around the Cortex-A53 erratum 835769,
8031 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8032 instruction and has a preceding memory instruction such that a NOP
8033 should be inserted between them. */
8035 bool
8036 aarch64_madd_needs_nop (rtx_insn* insn)
8038 enum attr_type attr_type;
8039 rtx_insn *prev;
8040 rtx body;
8042 if (!aarch64_fix_a53_err835769)
8043 return false;
8045 if (recog_memoized (insn) < 0)
8046 return false;
8048 attr_type = get_attr_type (insn);
8049 if (!is_madd_op (attr_type))
8050 return false;
8052 prev = aarch64_prev_real_insn (insn);
8053 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8054 Restore recog state to INSN to avoid state corruption. */
8055 extract_constrain_insn_cached (insn);
8057 if (!prev || !has_memory_op (prev))
8058 return false;
8060 body = single_set (prev);
8062 /* If the previous insn is a memory op and there is no dependency between
8063 it and the DImode madd, emit a NOP between them. If body is NULL then we
8064 have a complex memory operation, probably a load/store pair.
8065 Be conservative for now and emit a NOP. */
8066 if (GET_MODE (recog_data.operand[0]) == DImode
8067 && (!body || !dep_between_memop_and_curr (body)))
8068 return true;
8070 return false;
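/* Illustrative note (editorial addition, not from the original sources):
   the erratum 835769 workaround above targets sequences along the lines of
       ldr  x3, [x2]
       madd x0, x1, x4, x5
   where a 64-bit multiply-accumulate directly follows a memory operation;
   when such a pair is found and the loaded register does not feed the
   multiply-accumulate, FINAL_PRESCAN_INSN below emits a NOP between the
   two instructions.  */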
8075 /* Implement FINAL_PRESCAN_INSN. */
8077 void
8078 aarch64_final_prescan_insn (rtx_insn *insn)
8080 if (aarch64_madd_needs_nop (insn))
8081 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8085 /* Return the equivalent letter for size. */
8086 static char
8087 sizetochar (int size)
8089 switch (size)
8091 case 64: return 'd';
8092 case 32: return 's';
8093 case 16: return 'h';
8094 case 8 : return 'b';
8095 default: gcc_unreachable ();
8099 /* Return true iff x is a uniform vector of floating-point
8100 constants, and the constant can be represented in
8101 quarter-precision form. Note, as aarch64_float_const_representable_p
8102 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8103 static bool
8104 aarch64_vect_float_const_representable_p (rtx x)
8106 int i = 0;
8107 REAL_VALUE_TYPE r0, ri;
8108 rtx x0, xi;
8110 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8111 return false;
8113 x0 = CONST_VECTOR_ELT (x, 0);
8114 if (!CONST_DOUBLE_P (x0))
8115 return false;
8117 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8119 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8121 xi = CONST_VECTOR_ELT (x, i);
8122 if (!CONST_DOUBLE_P (xi))
8123 return false;
8125 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8126 if (!REAL_VALUES_EQUAL (r0, ri))
8127 return false;
8130 return aarch64_float_const_representable_p (x0);
8133 /* Return true for valid and false for invalid. */
8134 bool
8135 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8136 struct simd_immediate_info *info)
8138 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8139 matches = 1; \
8140 for (i = 0; i < idx; i += (STRIDE)) \
8141 if (!(TEST)) \
8142 matches = 0; \
8143 if (matches) \
8145 immtype = (CLASS); \
8146 elsize = (ELSIZE); \
8147 eshift = (SHIFT); \
8148 emvn = (NEG); \
8149 break; \
8152 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8153 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8154 unsigned char bytes[16];
8155 int immtype = -1, matches;
8156 unsigned int invmask = inverse ? 0xff : 0;
8157 int eshift, emvn;
8159 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8161 if (! (aarch64_simd_imm_zero_p (op, mode)
8162 || aarch64_vect_float_const_representable_p (op)))
8163 return false;
8165 if (info)
8167 info->value = CONST_VECTOR_ELT (op, 0);
8168 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8169 info->mvn = false;
8170 info->shift = 0;
8173 return true;
8176 /* Splat vector constant out into a byte vector. */
8177 for (i = 0; i < n_elts; i++)
8179 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8180 it must be laid out in the vector register in reverse order. */
8181 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8182 unsigned HOST_WIDE_INT elpart;
8183 unsigned int part, parts;
8185 if (CONST_INT_P (el))
8187 elpart = INTVAL (el);
8188 parts = 1;
8190 else if (GET_CODE (el) == CONST_DOUBLE)
8192 elpart = CONST_DOUBLE_LOW (el);
8193 parts = 2;
8195 else
8196 gcc_unreachable ();
8198 for (part = 0; part < parts; part++)
8200 unsigned int byte;
8201 for (byte = 0; byte < innersize; byte++)
8203 bytes[idx++] = (elpart & 0xff) ^ invmask;
8204 elpart >>= BITS_PER_UNIT;
8206 if (GET_CODE (el) == CONST_DOUBLE)
8207 elpart = CONST_DOUBLE_HIGH (el);
8211 /* Sanity check. */
8212 gcc_assert (idx == GET_MODE_SIZE (mode));
8216 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8217 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8219 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8220 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8222 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8223 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8225 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8226 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8228 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8230 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8232 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8233 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8235 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8236 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8238 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8239 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8241 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8242 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8244 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8246 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8248 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8249 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8251 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8252 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8254 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8255 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8257 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8258 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8260 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8262 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8263 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8265 while (0);
8267 if (immtype == -1)
8268 return false;
8270 if (info)
8272 info->element_width = elsize;
8273 info->mvn = emvn != 0;
8274 info->shift = eshift;
8276 unsigned HOST_WIDE_INT imm = 0;
8278 if (immtype >= 12 && immtype <= 15)
8279 info->msl = true;
8281 /* Un-invert bytes of recognized vector, if necessary. */
8282 if (invmask != 0)
8283 for (i = 0; i < idx; i++)
8284 bytes[i] ^= invmask;
8286 if (immtype == 17)
8288 /* FIXME: Broken on 32-bit H_W_I hosts. */
8289 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8291 for (i = 0; i < 8; i++)
8292 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8293 << (i * BITS_PER_UNIT);
8296 info->value = GEN_INT (imm);
8298 else
8300 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8301 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8303 /* Construct 'abcdefgh' because the assembler cannot handle
8304 generic constants. */
8305 if (info->mvn)
8306 imm = ~imm;
8307 imm = (imm >> info->shift) & 0xff;
8308 info->value = GEN_INT (imm);
8312 return true;
8313 #undef CHECK
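/* Illustrative note (editorial addition, not from the original sources):
   as an example of the matching above, a hypothetical V4SImode constant
   whose elements are all 0x00ab0000 splats to the byte pattern
   { 00, 00, ab, 00, ... } and matches immtype 2 (element size 32, shift 16,
   no inversion), so the returned info describes the 8-bit value 0xab
   shifted left by 16, as used by MOVI.  */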
8316 /* Check if immediate shift constants are within range. */
8317 bool
8318 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8320 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8321 if (left)
8322 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8323 else
8324 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8327 /* Return true if X is a uniform vector where all elements
8328 are either the floating-point constant 0.0 or the
8329 integer constant 0. */
8330 bool
8331 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8333 return x == CONST0_RTX (mode);
8336 bool
8337 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8339 HOST_WIDE_INT imm = INTVAL (x);
8340 int i;
8342 for (i = 0; i < 8; i++)
8344 unsigned int byte = imm & 0xff;
8345 if (byte != 0xff && byte != 0)
8346 return false;
8347 imm >>= 8;
8350 return true;
8353 bool
8354 aarch64_mov_operand_p (rtx x,
8355 enum aarch64_symbol_context context,
8356 machine_mode mode)
8358 if (GET_CODE (x) == HIGH
8359 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8360 return true;
8362 if (CONST_INT_P (x))
8363 return true;
8365 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8366 return true;
8368 return aarch64_classify_symbolic_expression (x, context)
8369 == SYMBOL_TINY_ABSOLUTE;
8372 /* Return a const_int vector of VAL. */
8374 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8376 int nunits = GET_MODE_NUNITS (mode);
8377 rtvec v = rtvec_alloc (nunits);
8378 int i;
8380 for (i=0; i < nunits; i++)
8381 RTVEC_ELT (v, i) = GEN_INT (val);
8383 return gen_rtx_CONST_VECTOR (mode, v);
8386 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8388 bool
8389 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8391 machine_mode vmode;
8393 gcc_assert (!VECTOR_MODE_P (mode));
8394 vmode = aarch64_preferred_simd_mode (mode);
8395 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8396 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8399 /* Construct and return a PARALLEL RTX vector with elements numbering the
8400 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8401 the vector - from the perspective of the architecture. This does not
8402 line up with GCC's perspective on lane numbers, so we end up with
8403 different masks depending on our target endian-ness. The diagram
8404 below may help. We must draw the distinction when building masks
8405 which select one half of the vector. An instruction selecting
8406 architectural low-lanes for a big-endian target, must be described using
8407 a mask selecting GCC high-lanes.
8409 Big-Endian Little-Endian
8411 GCC 0 1 2 3 3 2 1 0
8412 | x | x | x | x | | x | x | x | x |
8413 Architecture 3 2 1 0 3 2 1 0
8415 Low Mask: { 2, 3 } { 0, 1 }
8416 High Mask: { 0, 1 } { 2, 3 }
8420 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8422 int nunits = GET_MODE_NUNITS (mode);
8423 rtvec v = rtvec_alloc (nunits / 2);
8424 int high_base = nunits / 2;
8425 int low_base = 0;
8426 int base;
8427 rtx t1;
8428 int i;
8430 if (BYTES_BIG_ENDIAN)
8431 base = high ? low_base : high_base;
8432 else
8433 base = high ? high_base : low_base;
8435 for (i = 0; i < nunits / 2; i++)
8436 RTVEC_ELT (v, i) = GEN_INT (base + i);
8438 t1 = gen_rtx_PARALLEL (mode, v);
8439 return t1;
8442 /* Check OP for validity as a PARALLEL RTX vector with elements
8443 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8444 from the perspective of the architecture. See the diagram above
8445 aarch64_simd_vect_par_cnst_half for more details. */
8447 bool
8448 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8449 bool high)
8451 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8452 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8453 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8454 int i = 0;
8456 if (!VECTOR_MODE_P (mode))
8457 return false;
8459 if (count_op != count_ideal)
8460 return false;
8462 for (i = 0; i < count_ideal; i++)
8464 rtx elt_op = XVECEXP (op, 0, i);
8465 rtx elt_ideal = XVECEXP (ideal, 0, i);
8467 if (!CONST_INT_P (elt_op)
8468 || INTVAL (elt_ideal) != INTVAL (elt_op))
8469 return false;
8471 return true;
8474 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8475 HIGH (exclusive). */
8476 void
8477 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8478 const_tree exp)
8480 HOST_WIDE_INT lane;
8481 gcc_assert (CONST_INT_P (operand));
8482 lane = INTVAL (operand);
8484 if (lane < low || lane >= high)
8486 if (exp)
8487 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8488 else
8489 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8493 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
8494 registers). */
8495 void
8496 aarch64_simd_emit_pair_result_insn (machine_mode mode,
8497 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8498 rtx op1)
8500 rtx mem = gen_rtx_MEM (mode, destaddr);
8501 rtx tmp1 = gen_reg_rtx (mode);
8502 rtx tmp2 = gen_reg_rtx (mode);
8504 emit_insn (intfn (tmp1, op1, tmp2));
8506 emit_move_insn (mem, tmp1);
8507 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8508 emit_move_insn (mem, tmp2);
8511 /* Return TRUE if OP is a valid vector addressing mode. */
8512 bool
8513 aarch64_simd_mem_operand_p (rtx op)
8515 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8516 || REG_P (XEXP (op, 0)));
8519 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8520 not to early-clobber SRC registers in the process.
8522 We assume that the operands described by SRC and DEST represent a
8523 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
8524 number of components into which the copy has been decomposed. */
8525 void
8526 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8527 rtx *src, unsigned int count)
8529 unsigned int i;
8531 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8532 || REGNO (operands[0]) < REGNO (operands[1]))
8534 for (i = 0; i < count; i++)
8536 operands[2 * i] = dest[i];
8537 operands[2 * i + 1] = src[i];
8540 else
8542 for (i = 0; i < count; i++)
8544 operands[2 * i] = dest[count - i - 1];
8545 operands[2 * i + 1] = src[count - i - 1];
8550 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8551 one of VSTRUCT modes: OI, CI or XI. */
8552 int
8553 aarch64_simd_attr_length_move (rtx_insn *insn)
8555 machine_mode mode;
8557 extract_insn_cached (insn);
8559 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8561 mode = GET_MODE (recog_data.operand[0]);
8562 switch (mode)
8564 case OImode:
8565 return 8;
8566 case CImode:
8567 return 12;
8568 case XImode:
8569 return 16;
8570 default:
8571 gcc_unreachable ();
8574 return 4;
8577 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8578 alignment of a vector to 128 bits. */
8579 static HOST_WIDE_INT
8580 aarch64_simd_vector_alignment (const_tree type)
8582 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8583 return MIN (align, 128);
8586 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8587 static bool
8588 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8590 if (is_packed)
8591 return false;
8593 /* We guarantee alignment for vectors up to 128-bits. */
8594 if (tree_int_cst_compare (TYPE_SIZE (type),
8595 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8596 return false;
8598 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8599 return true;
8602 /* If VALS is a vector constant that can be loaded into a register
8603 using DUP, generate instructions to do so and return an RTX to
8604 assign to the register. Otherwise return NULL_RTX. */
8605 static rtx
8606 aarch64_simd_dup_constant (rtx vals)
8608 machine_mode mode = GET_MODE (vals);
8609 machine_mode inner_mode = GET_MODE_INNER (mode);
8610 int n_elts = GET_MODE_NUNITS (mode);
8611 bool all_same = true;
8612 rtx x;
8613 int i;
8615 if (GET_CODE (vals) != CONST_VECTOR)
8616 return NULL_RTX;
8618 for (i = 1; i < n_elts; ++i)
8620 x = CONST_VECTOR_ELT (vals, i);
8621 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8622 all_same = false;
8625 if (!all_same)
8626 return NULL_RTX;
8628 /* We can load this constant by using DUP and a constant in a
8629 single ARM register. This will be cheaper than a vector
8630 load. */
8631 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8632 return gen_rtx_VEC_DUPLICATE (mode, x);
8636 /* Generate code to load VALS, which is a PARALLEL containing only
8637 constants (for vec_init) or CONST_VECTOR, efficiently into a
8638 register. Returns an RTX to copy into the register, or NULL_RTX
8639 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8640 static rtx
8641 aarch64_simd_make_constant (rtx vals)
8643 machine_mode mode = GET_MODE (vals);
8644 rtx const_dup;
8645 rtx const_vec = NULL_RTX;
8646 int n_elts = GET_MODE_NUNITS (mode);
8647 int n_const = 0;
8648 int i;
8650 if (GET_CODE (vals) == CONST_VECTOR)
8651 const_vec = vals;
8652 else if (GET_CODE (vals) == PARALLEL)
8654 /* A CONST_VECTOR must contain only CONST_INTs and
8655 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8656 Only store valid constants in a CONST_VECTOR. */
8657 for (i = 0; i < n_elts; ++i)
8659 rtx x = XVECEXP (vals, 0, i);
8660 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8661 n_const++;
8663 if (n_const == n_elts)
8664 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8666 else
8667 gcc_unreachable ();
8669 if (const_vec != NULL_RTX
8670 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8671 /* Load using MOVI/MVNI. */
8672 return const_vec;
8673 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8674 /* Loaded using DUP. */
8675 return const_dup;
8676 else if (const_vec != NULL_RTX)
8677 /* Load from constant pool. We can not take advantage of single-cycle
8678 LD1 because we need a PC-relative addressing mode. */
8679 return const_vec;
8680 else
8681 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8682 We can not construct an initializer. */
8683 return NULL_RTX;
8686 void
8687 aarch64_expand_vector_init (rtx target, rtx vals)
8689 machine_mode mode = GET_MODE (target);
8690 machine_mode inner_mode = GET_MODE_INNER (mode);
8691 int n_elts = GET_MODE_NUNITS (mode);
8692 int n_var = 0, one_var = -1;
8693 bool all_same = true;
8694 rtx x, mem;
8695 int i;
8697 x = XVECEXP (vals, 0, 0);
8698 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8699 n_var = 1, one_var = 0;
8701 for (i = 1; i < n_elts; ++i)
8703 x = XVECEXP (vals, 0, i);
8704 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8705 ++n_var, one_var = i;
8707 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8708 all_same = false;
8711 if (n_var == 0)
8713 rtx constant = aarch64_simd_make_constant (vals);
8714 if (constant != NULL_RTX)
8716 emit_move_insn (target, constant);
8717 return;
8721 /* Splat a single non-constant element if we can. */
8722 if (all_same)
8724 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8725 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8726 return;
8729 /* One field is non-constant. Load constant then overwrite varying
8730 field. This is more efficient than using the stack. */
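/* For example, a V4SImode init from { x, 1, 2, 3 } first loads the
   constant vector { 1, 1, 2, 3 } (the varying lane temporarily takes its
   neighbour's value) and then inserts x into lane 0 via the vec_set
   pattern. */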
8731 if (n_var == 1)
8733 rtx copy = copy_rtx (vals);
8734 rtx index = GEN_INT (one_var);
8735 enum insn_code icode;
8737 /* Load constant part of vector, substitute neighboring value for
8738 varying element. */
8739 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8740 aarch64_expand_vector_init (target, copy);
8742 /* Insert variable. */
8743 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8744 icode = optab_handler (vec_set_optab, mode);
8745 gcc_assert (icode != CODE_FOR_nothing);
8746 emit_insn (GEN_FCN (icode) (target, x, index));
8747 return;
8750 /* Construct the vector in memory one field at a time
8751 and load the whole vector. */
8752 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8753 for (i = 0; i < n_elts; i++)
8754 emit_move_insn (adjust_address_nv (mem, inner_mode,
8755 i * GET_MODE_SIZE (inner_mode)),
8756 XVECEXP (vals, 0, i));
8757 emit_move_insn (target, mem);
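/* Implement TARGET_SHIFT_TRUNCATION_MASK.  Return 0 (no truncation
   guaranteed) for vector and vector-structure modes, otherwise the mode
   bitsize minus one. */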
8761 static unsigned HOST_WIDE_INT
8762 aarch64_shift_truncation_mask (machine_mode mode)
8764 return
8765 (aarch64_vector_mode_supported_p (mode)
8766 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8769 #ifndef TLS_SECTION_ASM_FLAG
8770 #define TLS_SECTION_ASM_FLAG 'T'
8771 #endif
8773 void
8774 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8775 tree decl ATTRIBUTE_UNUSED)
8777 char flagchars[10], *f = flagchars;
8779 /* If we have already declared this section, we can use an
8780 abbreviated form to switch back to it -- unless this section is
8781 part of a COMDAT group, in which case GAS requires the full
8782 declaration every time. */
8783 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8784 && (flags & SECTION_DECLARED))
8786 fprintf (asm_out_file, "\t.section\t%s\n", name);
8787 return;
8790 if (!(flags & SECTION_DEBUG))
8791 *f++ = 'a';
8792 if (flags & SECTION_WRITE)
8793 *f++ = 'w';
8794 if (flags & SECTION_CODE)
8795 *f++ = 'x';
8796 if (flags & SECTION_SMALL)
8797 *f++ = 's';
8798 if (flags & SECTION_MERGE)
8799 *f++ = 'M';
8800 if (flags & SECTION_STRINGS)
8801 *f++ = 'S';
8802 if (flags & SECTION_TLS)
8803 *f++ = TLS_SECTION_ASM_FLAG;
8804 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8805 *f++ = 'G';
8806 *f = '\0';
8808 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8810 if (!(flags & SECTION_NOTYPE))
8812 const char *type;
8813 const char *format;
8815 if (flags & SECTION_BSS)
8816 type = "nobits";
8817 else
8818 type = "progbits";
8820 #ifdef TYPE_OPERAND_FMT
8821 format = "," TYPE_OPERAND_FMT;
8822 #else
8823 format = ",@%s";
8824 #endif
8826 fprintf (asm_out_file, format, type);
8828 if (flags & SECTION_ENTSIZE)
8829 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8830 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8832 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8833 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8834 else
8835 fprintf (asm_out_file, ",%s,comdat",
8836 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8840 putc ('\n', asm_out_file);
8843 /* Select a format to encode pointers in exception handling data. */
8844 int
8845 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8847 int type;
8848 switch (aarch64_cmodel)
8850 case AARCH64_CMODEL_TINY:
8851 case AARCH64_CMODEL_TINY_PIC:
8852 case AARCH64_CMODEL_SMALL:
8853 case AARCH64_CMODEL_SMALL_PIC:
8854 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8855 for everything. */
8856 type = DW_EH_PE_sdata4;
8857 break;
8858 default:
8859 /* No assumptions here. 8-byte relocs required. */
8860 type = DW_EH_PE_sdata8;
8861 break;
8863 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8866 /* Emit load exclusive. */
8868 static void
8869 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
8870 rtx mem, rtx model_rtx)
8872 rtx (*gen) (rtx, rtx, rtx);
8874 switch (mode)
8876 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8877 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8878 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8879 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8880 default:
8881 gcc_unreachable ();
8884 emit_insn (gen (rval, mem, model_rtx));
8887 /* Emit store exclusive. */
8889 static void
8890 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
8891 rtx rval, rtx mem, rtx model_rtx)
8893 rtx (*gen) (rtx, rtx, rtx, rtx);
8895 switch (mode)
8897 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8898 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8899 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8900 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8901 default:
8902 gcc_unreachable ();
8905 emit_insn (gen (bval, rval, mem, model_rtx));
8908 /* Emit jump pattern INSN and mark the branch as very unlikely to be taken. */
8910 static void
8911 aarch64_emit_unlikely_jump (rtx insn)
8913 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8915 insn = emit_jump_insn (insn);
8916 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8919 /* Expand a compare and swap pattern. */
8921 void
8922 aarch64_expand_compare_and_swap (rtx operands[])
8924 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8925 machine_mode mode, cmp_mode;
8926 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8928 bval = operands[0];
8929 rval = operands[1];
8930 mem = operands[2];
8931 oldval = operands[3];
8932 newval = operands[4];
8933 is_weak = operands[5];
8934 mod_s = operands[6];
8935 mod_f = operands[7];
8936 mode = GET_MODE (mem);
8937 cmp_mode = mode;
8939 /* Normally the succ memory model must be stronger than fail, but in the
8940 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8941 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8943 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8944 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8945 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8947 switch (mode)
8949 case QImode:
8950 case HImode:
8951 /* For short modes, we're going to perform the comparison in SImode,
8952 so do the zero-extension now. */
8953 cmp_mode = SImode;
8954 rval = gen_reg_rtx (SImode);
8955 oldval = convert_modes (SImode, mode, oldval, true);
8956 /* Fall through. */
8958 case SImode:
8959 case DImode:
8960 /* Force the value into a register if needed. */
8961 if (!aarch64_plus_operand (oldval, mode))
8962 oldval = force_reg (cmp_mode, oldval);
8963 break;
8965 default:
8966 gcc_unreachable ();
8969 switch (mode)
8971 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8972 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8973 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8974 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8975 default:
8976 gcc_unreachable ();
8979 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8981 if (mode == QImode || mode == HImode)
8982 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8984 x = gen_rtx_REG (CCmode, CC_REGNUM);
8985 x = gen_rtx_EQ (SImode, x, const0_rtx);
8986 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8989 /* Split a compare and swap pattern. */
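/* A rough sketch of the sequence emitted below, assuming the strong
   (looping) variant; the exact exclusive/acquire/release forms depend on
   the memory model operand:

	1:	ldxr	rval, [mem]
		cmp	rval, oldval
		b.ne	2f
		stxr	scratch, newval, [mem]
		cbnz	scratch, 1b
	2:

   The weak variant omits the retry branch and instead leaves the
   store-exclusive status in the condition flags. */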
8991 void
8992 aarch64_split_compare_and_swap (rtx operands[])
8994 rtx rval, mem, oldval, newval, scratch;
8995 machine_mode mode;
8996 bool is_weak;
8997 rtx_code_label *label1, *label2;
8998 rtx x, cond;
9000 rval = operands[0];
9001 mem = operands[1];
9002 oldval = operands[2];
9003 newval = operands[3];
9004 is_weak = (operands[4] != const0_rtx);
9005 scratch = operands[7];
9006 mode = GET_MODE (mem);
9008 label1 = NULL;
9009 if (!is_weak)
9011 label1 = gen_label_rtx ();
9012 emit_label (label1);
9014 label2 = gen_label_rtx ();
9016 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9018 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9019 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9020 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9021 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9022 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9024 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9026 if (!is_weak)
9028 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9029 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9030 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9031 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9033 else
9035 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9036 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9037 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9040 emit_label (label2);
9043 /* Split an atomic operation. */
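/* A rough sketch of the emitted retry loop; the exact load/store-exclusive
   variants depend on MODEL_RTX:

	1:	ldxr	old, [mem]
		<op>	new, old, value
		stxr	cond, new, [mem]
		cbnz	cond, 1b  */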
9045 void
9046 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9047 rtx value, rtx model_rtx, rtx cond)
9049 machine_mode mode = GET_MODE (mem);
9050 machine_mode wmode = (mode == DImode ? DImode : SImode);
9051 rtx_code_label *label;
9052 rtx x;
9054 label = gen_label_rtx ();
9055 emit_label (label);
9057 if (new_out)
9058 new_out = gen_lowpart (wmode, new_out);
9059 if (old_out)
9060 old_out = gen_lowpart (wmode, old_out);
9061 else
9062 old_out = new_out;
9063 value = simplify_gen_subreg (wmode, value, mode, 0);
9065 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9067 switch (code)
9069 case SET:
9070 new_out = value;
9071 break;
9073 case NOT:
9074 x = gen_rtx_AND (wmode, old_out, value);
9075 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9076 x = gen_rtx_NOT (wmode, new_out);
9077 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9078 break;
9080 case MINUS:
9081 if (CONST_INT_P (value))
9083 value = GEN_INT (-INTVAL (value));
9084 code = PLUS;
9086 /* Fall through. */
9088 default:
9089 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9090 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9091 break;
9094 aarch64_emit_store_exclusive (mode, cond, mem,
9095 gen_lowpart (mode, new_out), model_rtx);
9097 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9098 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9099 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9100 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
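/* Print "+<ext>" for every architecture extension whose flags are all
   enabled in aarch64_isa_flags, then a newline.  Used as a suffix to the
   .arch/.cpu directives emitted below. */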
9103 static void
9104 aarch64_print_extension (void)
9106 const struct aarch64_option_extension *opt = NULL;
9108 for (opt = all_extensions; opt->name != NULL; opt++)
9109 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9110 asm_fprintf (asm_out_file, "+%s", opt->name);
9112 asm_fprintf (asm_out_file, "\n");
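/* Implement TARGET_ASM_FILE_START.  Emit a .arch or .cpu directive
   describing the selected architecture or CPU (plus any extensions),
   followed by the default file prologue. */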
9115 static void
9116 aarch64_start_file (void)
9118 if (selected_arch)
9120 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9121 aarch64_print_extension ();
9123 else if (selected_cpu)
9125 const char *truncated_name
9126 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9127 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9128 aarch64_print_extension ();
9130 default_file_start();
9133 /* Target hook for c_mode_for_suffix. */
9134 static machine_mode
9135 aarch64_c_mode_for_suffix (char suffix)
9137 if (suffix == 'q')
9138 return TFmode;
9140 return VOIDmode;
9143 /* We can only represent floating point constants which will fit in
9144 "quarter-precision" values. These values are characterised by
9145 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9148 (-1)^s * (n/16) * 2^r
9150 Where:
9151 's' is the sign bit.
9152 'n' is an integer in the range 16 <= n <= 31.
9153 'r' is an integer in the range -3 <= r <= 4. */
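/* For example, 2.5 = (-1)^0 * (20/16) * 2^1 is representable (s = 0,
   n = 20, r = 1), whereas 0.1 has no encoding of this form.  The
   representable magnitudes therefore range from (16/16) * 2^-3 = 0.125
   up to (31/16) * 2^4 = 31.0. */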
9155 /* Return true iff X can be represented by a quarter-precision
9156 floating point immediate operand. Note, we cannot represent 0.0. */
9157 bool
9158 aarch64_float_const_representable_p (rtx x)
9160 /* This represents our current view of how many bits
9161 make up the mantissa. */
9162 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9163 int exponent;
9164 unsigned HOST_WIDE_INT mantissa, mask;
9165 REAL_VALUE_TYPE r, m;
9166 bool fail;
9168 if (!CONST_DOUBLE_P (x))
9169 return false;
9171 if (GET_MODE (x) == VOIDmode)
9172 return false;
9174 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9176 /* We cannot represent infinities, NaNs or +/-zero. We won't
9177 know if we have +zero until we analyse the mantissa, but we
9178 can reject the other invalid values. */
9179 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9180 || REAL_VALUE_MINUS_ZERO (r))
9181 return false;
9183 /* Extract exponent. */
9184 r = real_value_abs (&r);
9185 exponent = REAL_EXP (&r);
9187 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9188 highest (sign) bit, with a fixed binary point at bit point_pos.
9189 m1 holds the low part of the mantissa, m2 the high part.
9190 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9191 bits for the mantissa, this can fail (low bits will be lost). */
9192 real_ldexp (&m, &r, point_pos - exponent);
9193 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9195 /* If the low part of the mantissa has bits set we cannot represent
9196 the value. */
9197 if (w.elt (0) != 0)
9198 return false;
9199 /* We have rejected the lower HOST_WIDE_INT, so update our
9200 understanding of how many bits lie in the mantissa and
9201 look only at the high HOST_WIDE_INT. */
9202 mantissa = w.elt (1);
9203 point_pos -= HOST_BITS_PER_WIDE_INT;
9205 /* We can only represent values with a mantissa of the form 1.xxxx. */
9206 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9207 if ((mantissa & mask) != 0)
9208 return false;
9210 /* Having filtered unrepresentable values, we may now remove all
9211 but the highest 5 bits. */
9212 mantissa >>= point_pos - 5;
9214 /* We cannot represent the value 0.0, so reject it. This is handled
9215 elsewhere. */
9216 if (mantissa == 0)
9217 return false;
9219 /* Then, as bit 4 is always set, we can mask it off, leaving
9220 the mantissa in the range [0, 15]. */
9221 mantissa &= ~(1 << 4);
9222 gcc_assert (mantissa <= 15);
9224 /* GCC internally does not use IEEE754-like encoding (where normalized
9225 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9226 Our mantissa values are shifted 4 places to the left relative to
9227 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9228 by 5 places to correct for GCC's representation. */
9229 exponent = 5 - exponent;
9231 return (exponent >= 0 && exponent <= 7);
9234 char*
9235 aarch64_output_simd_mov_immediate (rtx const_vector,
9236 machine_mode mode,
9237 unsigned width)
9239 bool is_valid;
9240 static char templ[40];
9241 const char *mnemonic;
9242 const char *shift_op;
9243 unsigned int lane_count = 0;
9244 char element_char;
9246 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9248 /* This will return true to show const_vector is legal for use as either
9249 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9250 also update INFO to show how the immediate should be generated. */
9251 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9252 gcc_assert (is_valid);
9254 element_char = sizetochar (info.element_width);
9255 lane_count = width / info.element_width;
9257 mode = GET_MODE_INNER (mode);
9258 if (mode == SFmode || mode == DFmode)
9260 gcc_assert (info.shift == 0 && ! info.mvn);
9261 if (aarch64_float_const_zero_rtx_p (info.value))
9262 info.value = GEN_INT (0);
9263 else
9265 #define buf_size 20
9266 REAL_VALUE_TYPE r;
9267 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9268 char float_buf[buf_size] = {'\0'};
9269 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9270 #undef buf_size
9272 if (lane_count == 1)
9273 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9274 else
9275 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9276 lane_count, element_char, float_buf);
9277 return templ;
9281 mnemonic = info.mvn ? "mvni" : "movi";
9282 shift_op = info.msl ? "msl" : "lsl";
9284 if (lane_count == 1)
9285 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9286 mnemonic, UINTVAL (info.value));
9287 else if (info.shift)
9288 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9289 ", %s %d", mnemonic, lane_count, element_char,
9290 UINTVAL (info.value), shift_op, info.shift);
9291 else
9292 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9293 mnemonic, lane_count, element_char, UINTVAL (info.value));
9294 return templ;
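/* The templates returned above look like, e.g., "movi\t%d0, 0x1" for a
   single lane or "movi\t%0.4s, 0x1, lsl 8" for a shifted vector form;
   operand 0 is substituted later by the usual output machinery. */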
9297 char*
9298 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9299 machine_mode mode)
9301 machine_mode vmode;
9303 gcc_assert (!VECTOR_MODE_P (mode));
9304 vmode = aarch64_simd_container_mode (mode, 64);
9305 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9306 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9309 /* Split operands into moves from op[1] + op[2] into op[0]. */
9311 void
9312 aarch64_split_combinev16qi (rtx operands[3])
9314 unsigned int dest = REGNO (operands[0]);
9315 unsigned int src1 = REGNO (operands[1]);
9316 unsigned int src2 = REGNO (operands[2]);
9317 machine_mode halfmode = GET_MODE (operands[1]);
9318 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9319 rtx destlo, desthi;
9321 gcc_assert (halfmode == V16QImode);
9323 if (src1 == dest && src2 == dest + halfregs)
9325 /* No-op move. Can't split to nothing; emit something. */
9326 emit_note (NOTE_INSN_DELETED);
9327 return;
9330 /* Preserve register attributes for variable tracking. */
9331 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9332 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9333 GET_MODE_SIZE (halfmode));
9335 /* Special case of reversed high/low parts. */
9336 if (reg_overlap_mentioned_p (operands[2], destlo)
9337 && reg_overlap_mentioned_p (operands[1], desthi))
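/* Swap the two halves in place with three EORs, avoiding the need
   for a scratch register. */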
9339 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9340 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9341 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9343 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9345 /* Try to avoid unnecessary moves if part of the result
9346 is in the right place already. */
9347 if (src1 != dest)
9348 emit_move_insn (destlo, operands[1]);
9349 if (src2 != dest + halfregs)
9350 emit_move_insn (desthi, operands[2]);
9352 else
9354 if (src2 != dest + halfregs)
9355 emit_move_insn (desthi, operands[2]);
9356 if (src1 != dest)
9357 emit_move_insn (destlo, operands[1]);
9361 /* vec_perm support. */
9363 #define MAX_VECT_LEN 16
9365 struct expand_vec_perm_d
9367 rtx target, op0, op1;
9368 unsigned char perm[MAX_VECT_LEN];
9369 machine_mode vmode;
9370 unsigned char nelt;
9371 bool one_vector_p;
9372 bool testing_p;
9375 /* Generate a variable permutation. */
9377 static void
9378 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9380 machine_mode vmode = GET_MODE (target);
9381 bool one_vector_p = rtx_equal_p (op0, op1);
9383 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9384 gcc_checking_assert (GET_MODE (op0) == vmode);
9385 gcc_checking_assert (GET_MODE (op1) == vmode);
9386 gcc_checking_assert (GET_MODE (sel) == vmode);
9387 gcc_checking_assert (TARGET_SIMD);
9389 if (one_vector_p)
9391 if (vmode == V8QImode)
9393 /* Expand the argument to a V16QI mode by duplicating it. */
9394 rtx pair = gen_reg_rtx (V16QImode);
9395 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9396 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9398 else
9400 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9403 else
9405 rtx pair;
9407 if (vmode == V8QImode)
9409 pair = gen_reg_rtx (V16QImode);
9410 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9411 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9413 else
9415 pair = gen_reg_rtx (OImode);
9416 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9417 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9422 void
9423 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9425 machine_mode vmode = GET_MODE (target);
9426 unsigned int nelt = GET_MODE_NUNITS (vmode);
9427 bool one_vector_p = rtx_equal_p (op0, op1);
9428 rtx mask;
9430 /* The TBL instruction does not use a modulo index, so we must take care
9431 of that ourselves. */
9432 mask = aarch64_simd_gen_const_vector_dup (vmode,
9433 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9434 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9436 /* For big-endian, we also need to reverse the index within the vector
9437 (but not which vector). */
9438 if (BYTES_BIG_ENDIAN)
9440 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9441 if (!one_vector_p)
9442 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9443 sel = expand_simple_binop (vmode, XOR, sel, mask,
9444 NULL, 0, OPTAB_LIB_WIDEN);
9446 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9449 /* Recognize patterns suitable for the TRN instructions. */
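/* For example, on V4SI with two input vectors, TRN1 corresponds to the
   permutation { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 } (little-endian
   lane numbering). */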
9450 static bool
9451 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9453 unsigned int i, odd, mask, nelt = d->nelt;
9454 rtx out, in0, in1, x;
9455 rtx (*gen) (rtx, rtx, rtx);
9456 machine_mode vmode = d->vmode;
9458 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9459 return false;
9461 /* Note that these are little-endian tests.
9462 We correct for big-endian later. */
9463 if (d->perm[0] == 0)
9464 odd = 0;
9465 else if (d->perm[0] == 1)
9466 odd = 1;
9467 else
9468 return false;
9469 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9471 for (i = 0; i < nelt; i += 2)
9473 if (d->perm[i] != i + odd)
9474 return false;
9475 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9476 return false;
9479 /* Success! */
9480 if (d->testing_p)
9481 return true;
9483 in0 = d->op0;
9484 in1 = d->op1;
9485 if (BYTES_BIG_ENDIAN)
9487 x = in0, in0 = in1, in1 = x;
9488 odd = !odd;
9490 out = d->target;
9492 if (odd)
9494 switch (vmode)
9496 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9497 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9498 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9499 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9500 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9501 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9502 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9503 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9504 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9505 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9506 default:
9507 return false;
9510 else
9512 switch (vmode)
9514 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9515 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9516 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9517 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9518 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9519 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9520 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9521 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9522 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9523 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9524 default:
9525 return false;
9529 emit_insn (gen (out, in0, in1));
9530 return true;
9533 /* Recognize patterns suitable for the UZP instructions. */
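/* For example, on V4SI with two input vectors, UZP1 corresponds to the
   permutation { 0, 2, 4, 6 } and UZP2 to { 1, 3, 5, 7 } (little-endian
   lane numbering). */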
9534 static bool
9535 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9537 unsigned int i, odd, mask, nelt = d->nelt;
9538 rtx out, in0, in1, x;
9539 rtx (*gen) (rtx, rtx, rtx);
9540 machine_mode vmode = d->vmode;
9542 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9543 return false;
9545 /* Note that these are little-endian tests.
9546 We correct for big-endian later. */
9547 if (d->perm[0] == 0)
9548 odd = 0;
9549 else if (d->perm[0] == 1)
9550 odd = 1;
9551 else
9552 return false;
9553 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9555 for (i = 0; i < nelt; i++)
9557 unsigned elt = (i * 2 + odd) & mask;
9558 if (d->perm[i] != elt)
9559 return false;
9562 /* Success! */
9563 if (d->testing_p)
9564 return true;
9566 in0 = d->op0;
9567 in1 = d->op1;
9568 if (BYTES_BIG_ENDIAN)
9570 x = in0, in0 = in1, in1 = x;
9571 odd = !odd;
9573 out = d->target;
9575 if (odd)
9577 switch (vmode)
9579 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9580 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9581 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9582 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9583 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9584 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9585 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9586 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9587 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9588 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9589 default:
9590 return false;
9593 else
9595 switch (vmode)
9597 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9598 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9599 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9600 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9601 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9602 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9603 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9604 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9605 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9606 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9607 default:
9608 return false;
9612 emit_insn (gen (out, in0, in1));
9613 return true;
9616 /* Recognize patterns suitable for the ZIP instructions. */
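/* For example, on V4SI with two input vectors, ZIP1 corresponds to the
   permutation { 0, 4, 1, 5 } and ZIP2 to { 2, 6, 3, 7 } (little-endian
   lane numbering). */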
9617 static bool
9618 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9620 unsigned int i, high, mask, nelt = d->nelt;
9621 rtx out, in0, in1, x;
9622 rtx (*gen) (rtx, rtx, rtx);
9623 machine_mode vmode = d->vmode;
9625 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9626 return false;
9628 /* Note that these are little-endian tests.
9629 We correct for big-endian later. */
9630 high = nelt / 2;
9631 if (d->perm[0] == high)
9632 /* Do Nothing. */
9634 else if (d->perm[0] == 0)
9635 high = 0;
9636 else
9637 return false;
9638 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9640 for (i = 0; i < nelt / 2; i++)
9642 unsigned elt = (i + high) & mask;
9643 if (d->perm[i * 2] != elt)
9644 return false;
9645 elt = (elt + nelt) & mask;
9646 if (d->perm[i * 2 + 1] != elt)
9647 return false;
9650 /* Success! */
9651 if (d->testing_p)
9652 return true;
9654 in0 = d->op0;
9655 in1 = d->op1;
9656 if (BYTES_BIG_ENDIAN)
9658 x = in0, in0 = in1, in1 = x;
9659 high = !high;
9661 out = d->target;
9663 if (high)
9665 switch (vmode)
9667 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9668 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9669 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9670 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9671 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9672 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9673 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9674 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9675 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9676 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9677 default:
9678 return false;
9681 else
9683 switch (vmode)
9685 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9686 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9687 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9688 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9689 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9690 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9691 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9692 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9693 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9694 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9695 default:
9696 return false;
9700 emit_insn (gen (out, in0, in1));
9701 return true;
9704 /* Recognize patterns for the EXT insn. */
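/* The permutation must select consecutive elements starting at d->perm[0];
   for example { 3, 4, 5, 6 } on V4SI with two input vectors maps to EXT
   with an offset of 3. */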
9706 static bool
9707 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9709 unsigned int i, nelt = d->nelt;
9710 rtx (*gen) (rtx, rtx, rtx, rtx);
9711 rtx offset;
9713 unsigned int location = d->perm[0]; /* Always < nelt. */
9715 /* Check if the extracted indices are increasing by one. */
9716 for (i = 1; i < nelt; i++)
9718 unsigned int required = location + i;
9719 if (d->one_vector_p)
9721 /* We'll pass the same vector in twice, so allow indices to wrap. */
9722 required &= (nelt - 1);
9724 if (d->perm[i] != required)
9725 return false;
9728 switch (d->vmode)
9730 case V16QImode: gen = gen_aarch64_extv16qi; break;
9731 case V8QImode: gen = gen_aarch64_extv8qi; break;
9732 case V4HImode: gen = gen_aarch64_extv4hi; break;
9733 case V8HImode: gen = gen_aarch64_extv8hi; break;
9734 case V2SImode: gen = gen_aarch64_extv2si; break;
9735 case V4SImode: gen = gen_aarch64_extv4si; break;
9736 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9737 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9738 case V2DImode: gen = gen_aarch64_extv2di; break;
9739 case V2DFmode: gen = gen_aarch64_extv2df; break;
9740 default:
9741 return false;
9744 /* Success! */
9745 if (d->testing_p)
9746 return true;
9748 /* The case where (location == 0) is a no-op for both big- and little-endian,
9749 and is removed by the mid-end at optimization levels -O1 and higher. */
9751 if (BYTES_BIG_ENDIAN && (location != 0))
9753 /* After setup, we want the high elements of the first vector (stored
9754 at the LSB end of the register), and the low elements of the second
9755 vector (stored at the MSB end of the register). So swap. */
9756 rtx temp = d->op0;
9757 d->op0 = d->op1;
9758 d->op1 = temp;
9759 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9760 location = nelt - location;
9763 offset = GEN_INT (location);
9764 emit_insn (gen (d->target, d->op0, d->op1, offset));
9765 return true;
9768 /* Recognize patterns for the REV insns. */
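/* For example, on V8QI the permutation { 7, 6, 5, 4, 3, 2, 1, 0 } selects
   REV64, { 3, 2, 1, 0, 7, 6, 5, 4 } selects REV32 and
   { 1, 0, 3, 2, 5, 4, 7, 6 } selects REV16. */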
9770 static bool
9771 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9773 unsigned int i, j, diff, nelt = d->nelt;
9774 rtx (*gen) (rtx, rtx);
9776 if (!d->one_vector_p)
9777 return false;
9779 diff = d->perm[0];
9780 switch (diff)
9782 case 7:
9783 switch (d->vmode)
9785 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9786 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9787 default:
9788 return false;
9790 break;
9791 case 3:
9792 switch (d->vmode)
9794 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9795 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9796 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9797 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9798 default:
9799 return false;
9801 break;
9802 case 1:
9803 switch (d->vmode)
9805 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9806 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9807 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9808 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9809 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9810 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9811 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9812 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9813 default:
9814 return false;
9816 break;
9817 default:
9818 return false;
9821 for (i = 0; i < nelt ; i += diff + 1)
9822 for (j = 0; j <= diff; j += 1)
9824 /* This is guaranteed to be true as the value of diff
9825 is 7, 3 or 1 and we should have enough elements in the
9826 queue to generate this. Getting a vector mask with a
9827 value of diff other than these values implies that
9828 something is wrong by the time we get here. */
9829 gcc_assert (i + j < nelt);
9830 if (d->perm[i + j] != i + diff - j)
9831 return false;
9834 /* Success! */
9835 if (d->testing_p)
9836 return true;
9838 emit_insn (gen (d->target, d->op0));
9839 return true;
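/* Recognize the permutation that broadcasts a single lane across the whole
   vector, e.g. { 2, 2, 2, 2 } on V4SI, which maps onto DUP (element). */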
9842 static bool
9843 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9845 rtx (*gen) (rtx, rtx, rtx);
9846 rtx out = d->target;
9847 rtx in0;
9848 machine_mode vmode = d->vmode;
9849 unsigned int i, elt, nelt = d->nelt;
9850 rtx lane;
9852 elt = d->perm[0];
9853 for (i = 1; i < nelt; i++)
9855 if (elt != d->perm[i])
9856 return false;
9859 /* The generic preparation in aarch64_expand_vec_perm_const_1
9860 swaps the operand order and the permute indices if it finds
9861 d->perm[0] to be in the second operand. Thus, we can always
9862 use d->op0 and need not do any extra arithmetic to get the
9863 correct lane number. */
9864 in0 = d->op0;
9865 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9867 switch (vmode)
9869 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9870 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9871 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9872 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9873 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9874 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9875 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9876 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9877 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9878 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9879 default:
9880 return false;
9883 emit_insn (gen (out, in0, lane));
9884 return true;
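/* Fall back to a general permutation using the TBL instructions, with the
   permutation indices materialised as a constant vector of byte indices. */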
9887 static bool
9888 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9890 rtx rperm[MAX_VECT_LEN], sel;
9891 machine_mode vmode = d->vmode;
9892 unsigned int i, nelt = d->nelt;
9894 if (d->testing_p)
9895 return true;
9897 /* Generic code will try constant permutation twice. Once with the
9898 original mode and again with the elements lowered to QImode.
9899 So wait and don't do the selector expansion ourselves. */
9900 if (vmode != V8QImode && vmode != V16QImode)
9901 return false;
9903 for (i = 0; i < nelt; ++i)
9905 int nunits = GET_MODE_NUNITS (vmode);
9907 /* If big-endian and two vectors we end up with a weird mixed-endian
9908 mode on NEON. Reverse the index within each word but not the word
9909 itself. */
9910 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9911 : d->perm[i]);
9913 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9914 sel = force_reg (vmode, sel);
9916 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9917 return true;
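/* Try each of the single-instruction expanders above in turn (REV, EXT,
   DUP, ZIP, UZP, TRN), falling back to a TBL-based permute. */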
9920 static bool
9921 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9923 /* The pattern matching functions above are written to look for a small
9924 number to begin the sequence (0, 1, N/2). If we begin with an index
9925 from the second operand, we can swap the operands. */
9926 if (d->perm[0] >= d->nelt)
9928 unsigned i, nelt = d->nelt;
9929 rtx x;
9931 gcc_assert (nelt == (nelt & -nelt));
9932 for (i = 0; i < nelt; ++i)
9933 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9935 x = d->op0;
9936 d->op0 = d->op1;
9937 d->op1 = x;
9940 if (TARGET_SIMD)
9942 if (aarch64_evpc_rev (d))
9943 return true;
9944 else if (aarch64_evpc_ext (d))
9945 return true;
9946 else if (aarch64_evpc_dup (d))
9947 return true;
9948 else if (aarch64_evpc_zip (d))
9949 return true;
9950 else if (aarch64_evpc_uzp (d))
9951 return true;
9952 else if (aarch64_evpc_trn (d))
9953 return true;
9954 return aarch64_evpc_tbl (d);
9956 return false;
9959 /* Expand a vec_perm_const pattern. */
9961 bool
9962 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9964 struct expand_vec_perm_d d;
9965 int i, nelt, which;
9967 d.target = target;
9968 d.op0 = op0;
9969 d.op1 = op1;
9971 d.vmode = GET_MODE (target);
9972 gcc_assert (VECTOR_MODE_P (d.vmode));
9973 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9974 d.testing_p = false;
9976 for (i = which = 0; i < nelt; ++i)
9978 rtx e = XVECEXP (sel, 0, i);
9979 int ei = INTVAL (e) & (2 * nelt - 1);
9980 which |= (ei < nelt ? 1 : 2);
9981 d.perm[i] = ei;
9984 switch (which)
9986 default:
9987 gcc_unreachable ();
9989 case 3:
9990 d.one_vector_p = false;
9991 if (!rtx_equal_p (op0, op1))
9992 break;
9994 /* The elements of PERM do not suggest that only the first operand
9995 is used, but both operands are identical. Allow easier matching
9996 of the permutation by folding the permutation into the single
9997 input vector. */
9998 /* Fall Through. */
9999 case 2:
10000 for (i = 0; i < nelt; ++i)
10001 d.perm[i] &= nelt - 1;
10002 d.op0 = op1;
10003 d.one_vector_p = true;
10004 break;
10006 case 1:
10007 d.op1 = op0;
10008 d.one_vector_p = true;
10009 break;
10012 return aarch64_expand_vec_perm_const_1 (&d);
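/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  Decide, without emitting
   any code, whether the constant permutation SEL is supported for VMODE by
   running the expanders in testing mode inside a discarded sequence. */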
10015 static bool
10016 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10017 const unsigned char *sel)
10019 struct expand_vec_perm_d d;
10020 unsigned int i, nelt, which;
10021 bool ret;
10023 d.vmode = vmode;
10024 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10025 d.testing_p = true;
10026 memcpy (d.perm, sel, nelt);
10028 /* Calculate whether all elements are in one vector. */
10029 for (i = which = 0; i < nelt; ++i)
10031 unsigned char e = d.perm[i];
10032 gcc_assert (e < 2 * nelt);
10033 which |= (e < nelt ? 1 : 2);
10036 /* If all elements are from the second vector, reindex as if from the
10037 first vector. */
10038 if (which == 2)
10039 for (i = 0; i < nelt; ++i)
10040 d.perm[i] -= nelt;
10042 /* Check whether the mask can be applied to a single vector. */
10043 d.one_vector_p = (which != 3);
10045 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10046 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10047 if (!d.one_vector_p)
10048 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10050 start_sequence ();
10051 ret = aarch64_expand_vec_perm_const_1 (&d);
10052 end_sequence ();
10054 return ret;
10057 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
10058 bool
10059 aarch64_cannot_change_mode_class (machine_mode from,
10060 machine_mode to,
10061 enum reg_class rclass)
10063 /* Full-reg subregs are allowed on general regs or any class if they are
10064 the same size. */
10065 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
10066 || !reg_classes_intersect_p (FP_REGS, rclass))
10067 return false;
10069 /* Limited combinations of subregs are safe on FPREGs. Particularly,
10070 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
10071 2. Scalar to Scalar for integer modes or same size float modes.
10072 3. Vector to Vector modes.
10073 4. On little-endian only, Vector-Structure to Vector modes. */
10074 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
10076 if (aarch64_vector_mode_supported_p (from)
10077 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
10078 return false;
10080 if (GET_MODE_NUNITS (from) == 1
10081 && GET_MODE_NUNITS (to) == 1
10082 && (GET_MODE_CLASS (from) == MODE_INT
10083 || from == to))
10084 return false;
10086 if (aarch64_vector_mode_supported_p (from)
10087 && aarch64_vector_mode_supported_p (to))
10088 return false;
10090 /* Within a vector structure straddling multiple vector registers
10091 we are in a mixed-endian representation. As such, we can't
10092 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
10093 switch between vectors and vector structures cheaply. */
10094 if (!BYTES_BIG_ENDIAN)
10095 if ((aarch64_vector_mode_supported_p (from)
10096 && aarch64_vect_struct_mode_p (to))
10097 || (aarch64_vector_mode_supported_p (to)
10098 && aarch64_vect_struct_mode_p (from)))
10099 return false;
10102 return true;
10105 /* Implement MODES_TIEABLE_P. */
10107 bool
10108 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10110 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10111 return true;
10113 /* We specifically want to allow elements of "structure" modes to
10114 be tieable to the structure. This more general condition allows
10115 other rarer situations too. */
10116 if (TARGET_SIMD
10117 && aarch64_vector_mode_p (mode1)
10118 && aarch64_vector_mode_p (mode2))
10119 return true;
10121 return false;
10124 /* Return a new RTX holding the result of moving POINTER forward by
10125 AMOUNT bytes. */
10127 static rtx
10128 aarch64_move_pointer (rtx pointer, int amount)
10130 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10132 return adjust_automodify_address (pointer, GET_MODE (pointer),
10133 next, amount);
10136 /* Return a new RTX holding the result of moving POINTER forward by the
10137 size of the mode it points to. */
10139 static rtx
10140 aarch64_progress_pointer (rtx pointer)
10142 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10144 return aarch64_move_pointer (pointer, amount);
10147 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10148 MODE bytes. */
10150 static void
10151 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10152 machine_mode mode)
10154 rtx reg = gen_reg_rtx (mode);
10156 /* "Cast" the pointers to the correct mode. */
10157 *src = adjust_address (*src, mode, 0);
10158 *dst = adjust_address (*dst, mode, 0);
10159 /* Emit the memcpy. */
10160 emit_move_insn (reg, *src);
10161 emit_move_insn (*dst, reg);
10162 /* Move the pointers forward. */
10163 *src = aarch64_progress_pointer (*src);
10164 *dst = aarch64_progress_pointer (*dst);
10167 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10168 we succeed, otherwise return false. */
10170 bool
10171 aarch64_expand_movmem (rtx *operands)
10173 unsigned int n;
10174 rtx dst = operands[0];
10175 rtx src = operands[1];
10176 rtx base;
10177 bool speed_p = !optimize_function_for_size_p (cfun);
10179 /* When optimizing for size, give a better estimate of the length of a
10180 memcpy call, but use the default otherwise. */
10181 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10183 /* We can't do anything smart if the amount to copy is not constant. */
10184 if (!CONST_INT_P (operands[2]))
10185 return false;
10187 n = UINTVAL (operands[2]);
10189 /* Try to keep the number of instructions low. For cases below 16 bytes we
10190 need to make at most two moves. For cases above 16 bytes it will be one
10191 move for each 16 byte chunk, then at most two additional moves. */
10192 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10193 return false;
10195 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10196 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10198 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10199 src = adjust_automodify_address (src, VOIDmode, base, 0);
10201 /* Simple cases. Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
10202 then a 1-byte chunk. */
10203 if (n < 4)
10205 if (n >= 2)
10207 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10208 n -= 2;
10211 if (n == 1)
10212 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10214 return true;
10217 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10218 4-byte chunk, partially overlapping with the previously copied chunk. */
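/* For instance, a 7-byte copy becomes a copy of bytes 0-3 followed by a
   copy of bytes 3-6, the second access overlapping the first by one byte
   rather than falling back to narrower moves. */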
10219 if (n < 8)
10221 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10222 n -= 4;
10223 if (n > 0)
10225 int move = n - 4;
10227 src = aarch64_move_pointer (src, move);
10228 dst = aarch64_move_pointer (dst, move);
10229 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10231 return true;
10234 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10235 them, then (if applicable) an 8-byte chunk. */
10236 while (n >= 8)
10238 if (n / 16)
10240 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10241 n -= 16;
10243 else
10245 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10246 n -= 8;
10250 /* Finish the final bytes of the copy. We can always do this in one
10251 instruction. We either copy the exact amount we need, or partially
10252 overlap with the previous chunk we copied and copy 8 bytes. */
10253 if (n == 0)
10254 return true;
10255 else if (n == 1)
10256 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10257 else if (n == 2)
10258 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10259 else if (n == 4)
10260 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10261 else
10263 if (n == 3)
10265 src = aarch64_move_pointer (src, -1);
10266 dst = aarch64_move_pointer (dst, -1);
10267 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10269 else
10271 int move = n - 8;
10273 src = aarch64_move_pointer (src, move);
10274 dst = aarch64_move_pointer (dst, move);
10275 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10279 return true;
10282 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10284 static unsigned HOST_WIDE_INT
10285 aarch64_asan_shadow_offset (void)
10287 return (HOST_WIDE_INT_1 << 36);
10290 static bool
10291 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10292 unsigned int align,
10293 enum by_pieces_operation op,
10294 bool speed_p)
10296 /* STORE_BY_PIECES can be used when copying a constant string, but
10297 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10298 For now we always fail this and let the move_by_pieces code copy
10299 the string from read-only memory. */
10300 if (op == STORE_BY_PIECES)
10301 return false;
10303 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10306 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10307 instruction fusion of some sort. */
10309 static bool
10310 aarch64_macro_fusion_p (void)
10312 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10316 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10317 should be kept together during scheduling. */
10319 static bool
10320 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10322 rtx set_dest;
10323 rtx prev_set = single_set (prev);
10324 rtx curr_set = single_set (curr);
10325 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10326 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10328 if (!aarch64_macro_fusion_p ())
10329 return false;
10331 if (simple_sets_p
10332 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10334 /* We are trying to match:
10335 prev (mov) == (set (reg r0) (const_int imm16))
10336 curr (movk) == (set (zero_extract (reg r0)
10337 (const_int 16)
10338 (const_int 16))
10339 (const_int imm16_1)) */
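/* In assembly terms this is, for example (register and immediate values
   are illustrative):
	mov	w0, #0x1234
	movk	w0, #0x5678, lsl #16  */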
10341 set_dest = SET_DEST (curr_set);
10343 if (GET_CODE (set_dest) == ZERO_EXTRACT
10344 && CONST_INT_P (SET_SRC (curr_set))
10345 && CONST_INT_P (SET_SRC (prev_set))
10346 && CONST_INT_P (XEXP (set_dest, 2))
10347 && INTVAL (XEXP (set_dest, 2)) == 16
10348 && REG_P (XEXP (set_dest, 0))
10349 && REG_P (SET_DEST (prev_set))
10350 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10352 return true;
10356 if (simple_sets_p
10357 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10360 /* We're trying to match:
10361 prev (adrp) == (set (reg r1)
10362 (high (symbol_ref ("SYM"))))
10363 curr (add) == (set (reg r0)
10364 (lo_sum (reg r1)
10365 (symbol_ref ("SYM"))))
10366 Note that r0 need not necessarily be the same as r1, especially
10367 during pre-regalloc scheduling. */
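/* In assembly terms this is, for example (names are illustrative):
	adrp	x1, sym
	add	x0, x1, :lo12:sym  */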
10369 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10370 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10372 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10373 && REG_P (XEXP (SET_SRC (curr_set), 0))
10374 && REGNO (XEXP (SET_SRC (curr_set), 0))
10375 == REGNO (SET_DEST (prev_set))
10376 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10377 XEXP (SET_SRC (curr_set), 1)))
10378 return true;
10382 if (simple_sets_p
10383 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10386 /* We're trying to match:
10387 prev (movk) == (set (zero_extract (reg r0)
10388 (const_int 16)
10389 (const_int 32))
10390 (const_int imm16_1))
10391 curr (movk) == (set (zero_extract (reg r0)
10392 (const_int 16)
10393 (const_int 48))
10394 (const_int imm16_2)) */
10396 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10397 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10398 && REG_P (XEXP (SET_DEST (prev_set), 0))
10399 && REG_P (XEXP (SET_DEST (curr_set), 0))
10400 && REGNO (XEXP (SET_DEST (prev_set), 0))
10401 == REGNO (XEXP (SET_DEST (curr_set), 0))
10402 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10403 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10404 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10405 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10406 && CONST_INT_P (SET_SRC (prev_set))
10407 && CONST_INT_P (SET_SRC (curr_set)))
10408 return true;
10411 if (simple_sets_p
10412 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10414 /* We're trying to match:
10415 prev (adrp) == (set (reg r0)
10416 (high (symbol_ref ("SYM"))))
10417 curr (ldr) == (set (reg r1)
10418 (mem (lo_sum (reg r0)
10419 (symbol_ref ("SYM")))))
10421 curr (ldr) == (set (reg r1)
10422 (zero_extend (mem
10423 (lo_sum (reg r0)
10424 (symbol_ref ("SYM")))))) */
10425 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10426 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10428 rtx curr_src = SET_SRC (curr_set);
10430 if (GET_CODE (curr_src) == ZERO_EXTEND)
10431 curr_src = XEXP (curr_src, 0);
10433 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10434 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10435 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10436 == REGNO (SET_DEST (prev_set))
10437 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10438 XEXP (SET_SRC (prev_set), 0)))
10439 return true;
10443 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10444 && any_condjump_p (curr))
10446 enum attr_type prev_type = get_attr_type (prev);
10448 /* FIXME: this misses some instructions which are considered simple
10449 arithmetic for ThunderX. Simple shifts are missed here. */
10450 if (prev_type == TYPE_ALUS_SREG
10451 || prev_type == TYPE_ALUS_IMM
10452 || prev_type == TYPE_LOGICS_REG
10453 || prev_type == TYPE_LOGICS_IMM)
10454 return true;
10457 return false;
10460 /* If MEM is in the form of [base+offset], extract the two parts
10461 of address and set to BASE and OFFSET, otherwise return false
10462 after clearing BASE and OFFSET. */
10464 bool
10465 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10467 rtx addr;
10469 gcc_assert (MEM_P (mem));
10471 addr = XEXP (mem, 0);
10473 if (REG_P (addr))
10475 *base = addr;
10476 *offset = const0_rtx;
10477 return true;
10480 if (GET_CODE (addr) == PLUS
10481 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10483 *base = XEXP (addr, 0);
10484 *offset = XEXP (addr, 1);
10485 return true;
10488 *base = NULL_RTX;
10489 *offset = NULL_RTX;
10491 return false;
10494 /* Types for scheduling fusion. */
10495 enum sched_fusion_type
10497 SCHED_FUSION_NONE = 0,
10498 SCHED_FUSION_LD_SIGN_EXTEND,
10499 SCHED_FUSION_LD_ZERO_EXTEND,
10500 SCHED_FUSION_LD,
10501 SCHED_FUSION_ST,
10502 SCHED_FUSION_NUM
10505 /* If INSN is a load or store with an address in the form of [base+offset],
10506 extract the two parts into BASE and OFFSET.  Return the scheduling
10507 fusion type of this INSN.  */
10509 static enum sched_fusion_type
10510 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10512 rtx x, dest, src;
10513 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10515 gcc_assert (INSN_P (insn));
10516 x = PATTERN (insn);
10517 if (GET_CODE (x) != SET)
10518 return SCHED_FUSION_NONE;
10520 src = SET_SRC (x);
10521 dest = SET_DEST (x);
10523 if (GET_MODE (src) != SImode && GET_MODE (src) != DImode
10524 && GET_MODE (src) != SFmode && GET_MODE (src) != DFmode)
10525 return SCHED_FUSION_NONE;
10527 if (GET_CODE (src) == SIGN_EXTEND)
10529 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10530 src = XEXP (src, 0);
10531 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10532 return SCHED_FUSION_NONE;
10534 else if (GET_CODE (src) == ZERO_EXTEND)
10536 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10537 src = XEXP (src, 0);
10538 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10539 return SCHED_FUSION_NONE;
10542 if (GET_CODE (src) == MEM && REG_P (dest))
10543 extract_base_offset_in_addr (src, base, offset);
10544 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10546 fusion = SCHED_FUSION_ST;
10547 extract_base_offset_in_addr (dest, base, offset);
10549 else
10550 return SCHED_FUSION_NONE;
10552 if (*base == NULL_RTX || *offset == NULL_RTX)
10553 fusion = SCHED_FUSION_NONE;
10555 return fusion;
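/* For example (illustrative): "ldr w0, [x1, 8]" classifies as SCHED_FUSION_LD,
   "ldrsw x0, [x1, 8]" as SCHED_FUSION_LD_SIGN_EXTEND and "str wzr, [x1, 8]"
   as SCHED_FUSION_ST, while an insn whose address is not of the form
   [base+offset] comes back as SCHED_FUSION_NONE.  */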
10558 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10560 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10561 and PRI are only calculated for these instructions.  For other instructions,
10562 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
10563 types of instruction fusion can be added by returning different priorities.
10565 It's important that irrelevant instructions get the largest FUSION_PRI. */
10567 static void
10568 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10569 int *fusion_pri, int *pri)
10571 int tmp, off_val;
10572 rtx base, offset;
10573 enum sched_fusion_type fusion;
10575 gcc_assert (INSN_P (insn));
10577 tmp = max_pri - 1;
10578 fusion = fusion_load_store (insn, &base, &offset);
10579 if (fusion == SCHED_FUSION_NONE)
10581 *pri = tmp;
10582 *fusion_pri = tmp;
10583 return;
10586 /* Set FUSION_PRI according to fusion type and base register. */
10587 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10589 /* Calculate PRI. */
10590 tmp /= 2;
10592 /* The insn with the smaller offset goes first.  */
10593 off_val = (int)(INTVAL (offset));
10594 if (off_val >= 0)
10595 tmp -= (off_val & 0xfffff);
10596 else
10597 tmp += ((- off_val) & 0xfffff);
10599 *pri = tmp;
10600 return;
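/* As a worked example (values are illustrative): with MAX_PRI of 100,
   two SImode loads from the same base register with offsets 4 and 8 both
   get the same FUSION_PRI (99 - SCHED_FUSION_LD * FIRST_PSEUDO_REGISTER
   - REGNO (base)), while their PRI values are 49 - 4 and 49 - 8
   respectively, so the load with the smaller offset is scheduled first
   and the two can later be combined into an ldp.  */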
10603 /* Given OPERANDS of two consecutive load/store instructions, check if we
10604 can merge them into an ldp/stp.  LOAD is true if they are load
10605 instructions.  MODE is the mode of the memory operands.  */
10607 bool
10608 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10609 enum machine_mode mode)
10611 HOST_WIDE_INT offval_1, offval_2, msize;
10612 enum reg_class rclass_1, rclass_2;
10613 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10615 if (load)
10617 mem_1 = operands[1];
10618 mem_2 = operands[3];
10619 reg_1 = operands[0];
10620 reg_2 = operands[2];
10621 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10622 if (REGNO (reg_1) == REGNO (reg_2))
10623 return false;
10625 else
10627 mem_1 = operands[0];
10628 mem_2 = operands[2];
10629 reg_1 = operands[1];
10630 reg_2 = operands[3];
10633 /* The mems cannot be volatile. */
10634 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10635 return false;
10637 /* Check if the addresses are in the form of [base+offset]. */
10638 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10639 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10640 return false;
10641 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10642 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10643 return false;
10645 /* Check if the bases are the same.  */
10646 if (!rtx_equal_p (base_1, base_2))
10647 return false;
10649 offval_1 = INTVAL (offset_1);
10650 offval_2 = INTVAL (offset_2);
10651 msize = GET_MODE_SIZE (mode);
10652 /* Check if the offsets are consecutive. */
10653 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10654 return false;
10656 /* Check if the addresses are clobbered by the loads.  */
10657 if (load)
10659 if (reg_mentioned_p (reg_1, mem_1))
10660 return false;
10662 /* In increasing order, the last load can clobber the address. */
10663 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10664 return false;
10667 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10668 rclass_1 = FP_REGS;
10669 else
10670 rclass_1 = GENERAL_REGS;
10672 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10673 rclass_2 = FP_REGS;
10674 else
10675 rclass_2 = GENERAL_REGS;
10677 /* Check if the registers are of the same class.  */
10678 if (rclass_1 != rclass_2)
10679 return false;
10681 return true;
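/* For example (illustrative), the checks above allow

     ldr	w0, [x2]
     ldr	w1, [x2, 4]

   to be merged into "ldp w0, w1, [x2]", but reject pairs that use
   different base registers, mix general and FP destination registers,
   or load the same destination register twice.  */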
10684 /* Given OPERANDS of consecutive load/store instructions, check if we can
10685 merge them into an ldp/stp by adjusting the offset.  LOAD is true if
10686 they are load instructions.  MODE is the mode of the memory operands.
10688 Given the following consecutive stores:
10690 str w1, [xb, 0x100]
10691 str w1, [xb, 0x104]
10692 str w1, [xb, 0x108]
10693 str w1, [xb, 0x10c]
10695 Though the offsets are out of the range supported by stp, we can
10696 still pair them after adjusting the offset, like:
10698 add scratch, xb, 0x100
10699 stp w1, w1, [scratch]
10700 stp w1, w1, [scratch, 0x8]
10702 The peephole patterns that detect this opportunity should guarantee
10703 that the scratch register is available.  */
10705 bool
10706 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
10707 enum machine_mode mode)
10709 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
10710 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
10711 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
10712 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
10714 if (load)
10716 reg_1 = operands[0];
10717 mem_1 = operands[1];
10718 reg_2 = operands[2];
10719 mem_2 = operands[3];
10720 reg_3 = operands[4];
10721 mem_3 = operands[5];
10722 reg_4 = operands[6];
10723 mem_4 = operands[7];
10724 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
10725 && REG_P (reg_3) && REG_P (reg_4));
10726 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
10727 return false;
10729 else
10731 mem_1 = operands[0];
10732 reg_1 = operands[1];
10733 mem_2 = operands[2];
10734 reg_2 = operands[3];
10735 mem_3 = operands[4];
10736 reg_3 = operands[5];
10737 mem_4 = operands[6];
10738 reg_4 = operands[7];
10740 /* Skip if the memory operand is by itself valid for ldp/stp.  */
10741 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
10742 return false;
10744 /* The mems cannot be volatile. */
10745 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
10746 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
10747 return false;
10749 /* Check if the addresses are in the form of [base+offset]. */
10750 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10751 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10752 return false;
10753 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10754 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10755 return false;
10756 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
10757 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
10758 return false;
10759 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
10760 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
10761 return false;
10763 /* Check if the bases are the same.  */
10764 if (!rtx_equal_p (base_1, base_2)
10765 || !rtx_equal_p (base_2, base_3)
10766 || !rtx_equal_p (base_3, base_4))
10767 return false;
10769 offval_1 = INTVAL (offset_1);
10770 offval_2 = INTVAL (offset_2);
10771 offval_3 = INTVAL (offset_3);
10772 offval_4 = INTVAL (offset_4);
10773 msize = GET_MODE_SIZE (mode);
10774 /* Check if the offsets are consecutive. */
10775 if ((offval_1 != (offval_2 + msize)
10776 || offval_1 != (offval_3 + msize * 2)
10777 || offval_1 != (offval_4 + msize * 3))
10778 && (offval_4 != (offval_3 + msize)
10779 || offval_4 != (offval_2 + msize * 2)
10780 || offval_4 != (offval_1 + msize * 3)))
10781 return false;
10783 /* Check if the addresses are clobbered by the loads.  */
10784 if (load)
10786 if (reg_mentioned_p (reg_1, mem_1)
10787 || reg_mentioned_p (reg_2, mem_2)
10788 || reg_mentioned_p (reg_3, mem_3))
10789 return false;
10791 /* In increasing order, the last load can clobber the address. */
10792 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
10793 return false;
10796 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10797 rclass_1 = FP_REGS;
10798 else
10799 rclass_1 = GENERAL_REGS;
10801 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10802 rclass_2 = FP_REGS;
10803 else
10804 rclass_2 = GENERAL_REGS;
10806 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
10807 rclass_3 = FP_REGS;
10808 else
10809 rclass_3 = GENERAL_REGS;
10811 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
10812 rclass_4 = FP_REGS;
10813 else
10814 rclass_4 = GENERAL_REGS;
10816 /* Check if the registers are of the same class.  */
10817 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
10818 return false;
10820 return true;
10823 /* Given OPERANDS of consecutive load/store instructions, this function
10824 pairs them into ldp/stp after adjusting the offset.  It relies on the
10825 fact that the addresses of the load/store instructions are in
10826 increasing order.  MODE is the mode of the memory operands.  CODE is
10827 the rtl operator that should be applied to all memory operands; it is
10828 SIGN_EXTEND, ZERO_EXTEND or UNKNOWN.  */
10830 bool
10831 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
10832 enum machine_mode mode, RTX_CODE code)
10834 rtx base, offset, t1, t2;
10835 rtx mem_1, mem_2, mem_3, mem_4;
10836 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
10838 if (load)
10840 mem_1 = operands[1];
10841 mem_2 = operands[3];
10842 mem_3 = operands[5];
10843 mem_4 = operands[7];
10845 else
10847 mem_1 = operands[0];
10848 mem_2 = operands[2];
10849 mem_3 = operands[4];
10850 mem_4 = operands[6];
10851 gcc_assert (code == UNKNOWN);
10854 extract_base_offset_in_addr (mem_1, &base, &offset);
10855 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
10857 /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
10858 msize = GET_MODE_SIZE (mode);
10859 stp_off_limit = msize * 0x40;
10860 off_val = INTVAL (offset);
10861 abs_off = (off_val < 0) ? -off_val : off_val;
10862 new_off = abs_off % stp_off_limit;
10863 adj_off = abs_off - new_off;
10865 /* Further adjust to make sure all offsets are OK. */
10866 if ((new_off + msize * 2) >= stp_off_limit)
10868 adj_off += stp_off_limit;
10869 new_off -= stp_off_limit;
10872 /* Make sure the adjustment can be done with ADD/SUB instructions. */
10873 if (adj_off >= 0x1000)
10874 return false;
10876 if (off_val < 0)
10878 adj_off = -adj_off;
10879 new_off = -new_off;
10882 /* Create new memory references. */
10883 mem_1 = change_address (mem_1, VOIDmode,
10884 plus_constant (DImode, operands[8], new_off));
10886 /* Check if the adjusted address is OK for ldp/stp. */
10887 if (!aarch64_mem_pair_operand (mem_1, mode))
10888 return false;
10890 msize = GET_MODE_SIZE (mode);
10891 mem_2 = change_address (mem_2, VOIDmode,
10892 plus_constant (DImode,
10893 operands[8],
10894 new_off + msize));
10895 mem_3 = change_address (mem_3, VOIDmode,
10896 plus_constant (DImode,
10897 operands[8],
10898 new_off + msize * 2));
10899 mem_4 = change_address (mem_4, VOIDmode,
10900 plus_constant (DImode,
10901 operands[8],
10902 new_off + msize * 3));
10904 if (code == ZERO_EXTEND)
10906 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
10907 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
10908 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
10909 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
10911 else if (code == SIGN_EXTEND)
10913 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
10914 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
10915 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
10916 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
10919 if (load)
10921 operands[1] = mem_1;
10922 operands[3] = mem_2;
10923 operands[5] = mem_3;
10924 operands[7] = mem_4;
10926 else
10928 operands[0] = mem_1;
10929 operands[2] = mem_2;
10930 operands[4] = mem_3;
10931 operands[6] = mem_4;
10934 /* Emit the adjusting instruction.  */
10935 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
10936 plus_constant (DImode, base, adj_off)));
10937 /* Emit ldp/stp instructions. */
10938 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
10939 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
10940 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
10941 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
10942 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
10943 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
10944 return true;
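/* As a worked example of the adjustment above (SImode, so msize == 4 and
   stp_off_limit == 0x100): for the stores at xb + 0x100 .. xb + 0x10c,
   off_val == 0x100, so new_off == 0 and adj_off == 0x100, giving

     add	scratch, xb, 0x100
     stp	w1, w1, [scratch]
     stp	w1, w1, [scratch, 8]

   exactly as described in the comment before
   aarch64_operands_adjust_ok_for_ldpstp.  */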
10947 #undef TARGET_ADDRESS_COST
10948 #define TARGET_ADDRESS_COST aarch64_address_cost
10950 /* This hook determines whether unnamed bitfields affect the alignment
10951 of the containing structure. The hook returns true if the structure
10952 should inherit the alignment requirements of an unnamed bitfield's
10953 type. */
10954 #undef TARGET_ALIGN_ANON_BITFIELD
10955 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
10957 #undef TARGET_ASM_ALIGNED_DI_OP
10958 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
10960 #undef TARGET_ASM_ALIGNED_HI_OP
10961 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
10963 #undef TARGET_ASM_ALIGNED_SI_OP
10964 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
10966 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
10967 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
10968 hook_bool_const_tree_hwi_hwi_const_tree_true
10970 #undef TARGET_ASM_FILE_START
10971 #define TARGET_ASM_FILE_START aarch64_start_file
10973 #undef TARGET_ASM_OUTPUT_MI_THUNK
10974 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
10976 #undef TARGET_ASM_SELECT_RTX_SECTION
10977 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
10979 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
10980 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
10982 #undef TARGET_BUILD_BUILTIN_VA_LIST
10983 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
10985 #undef TARGET_CALLEE_COPIES
10986 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
10988 #undef TARGET_CAN_ELIMINATE
10989 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
10991 #undef TARGET_CANNOT_FORCE_CONST_MEM
10992 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
10994 #undef TARGET_CONDITIONAL_REGISTER_USAGE
10995 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
10997 /* Only the least significant bit is used for initialization guard
10998 variables. */
10999 #undef TARGET_CXX_GUARD_MASK_BIT
11000 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11002 #undef TARGET_C_MODE_FOR_SUFFIX
11003 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11005 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11006 #undef TARGET_DEFAULT_TARGET_FLAGS
11007 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11008 #endif
11010 #undef TARGET_CLASS_MAX_NREGS
11011 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11013 #undef TARGET_BUILTIN_DECL
11014 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11016 #undef TARGET_EXPAND_BUILTIN
11017 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11019 #undef TARGET_EXPAND_BUILTIN_VA_START
11020 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11022 #undef TARGET_FOLD_BUILTIN
11023 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11025 #undef TARGET_FUNCTION_ARG
11026 #define TARGET_FUNCTION_ARG aarch64_function_arg
11028 #undef TARGET_FUNCTION_ARG_ADVANCE
11029 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11031 #undef TARGET_FUNCTION_ARG_BOUNDARY
11032 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11034 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11035 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11037 #undef TARGET_FUNCTION_VALUE
11038 #define TARGET_FUNCTION_VALUE aarch64_function_value
11040 #undef TARGET_FUNCTION_VALUE_REGNO_P
11041 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11043 #undef TARGET_FRAME_POINTER_REQUIRED
11044 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11046 #undef TARGET_GIMPLE_FOLD_BUILTIN
11047 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11049 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11050 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11052 #undef TARGET_INIT_BUILTINS
11053 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11055 #undef TARGET_LEGITIMATE_ADDRESS_P
11056 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11058 #undef TARGET_LEGITIMATE_CONSTANT_P
11059 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11061 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11062 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11064 #undef TARGET_LRA_P
11065 #define TARGET_LRA_P aarch64_lra_p
11067 #undef TARGET_MANGLE_TYPE
11068 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11070 #undef TARGET_MEMORY_MOVE_COST
11071 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11073 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11074 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11076 #undef TARGET_MUST_PASS_IN_STACK
11077 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11079 /* This target hook should return true if accesses to volatile bitfields
11080 should use the narrowest mode possible. It should return false if these
11081 accesses should use the bitfield container type. */
11082 #undef TARGET_NARROW_VOLATILE_BITFIELD
11083 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11085 #undef TARGET_OPTION_OVERRIDE
11086 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11088 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11089 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11090 aarch64_override_options_after_change
11092 #undef TARGET_PASS_BY_REFERENCE
11093 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11095 #undef TARGET_PREFERRED_RELOAD_CLASS
11096 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11098 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11099 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11101 #undef TARGET_SECONDARY_RELOAD
11102 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11104 #undef TARGET_SHIFT_TRUNCATION_MASK
11105 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11107 #undef TARGET_SETUP_INCOMING_VARARGS
11108 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11110 #undef TARGET_STRUCT_VALUE_RTX
11111 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11113 #undef TARGET_REGISTER_MOVE_COST
11114 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11116 #undef TARGET_RETURN_IN_MEMORY
11117 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11119 #undef TARGET_RETURN_IN_MSB
11120 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11122 #undef TARGET_RTX_COSTS
11123 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11125 #undef TARGET_SCHED_ISSUE_RATE
11126 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11128 #undef TARGET_TRAMPOLINE_INIT
11129 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11131 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11132 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11134 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11135 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11137 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11138 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11140 #undef TARGET_VECTORIZE_ADD_STMT_COST
11141 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11143 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11144 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11145 aarch64_builtin_vectorization_cost
11147 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11148 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11150 #undef TARGET_VECTORIZE_BUILTINS
11151 #define TARGET_VECTORIZE_BUILTINS
11153 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11154 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11155 aarch64_builtin_vectorized_function
11157 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11158 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11159 aarch64_autovectorize_vector_sizes
11161 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11162 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11163 aarch64_atomic_assign_expand_fenv
11165 /* Section anchor support. */
11167 #undef TARGET_MIN_ANCHOR_OFFSET
11168 #define TARGET_MIN_ANCHOR_OFFSET -256
11170 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11171 byte offset; we can do much more for larger data types, but have no way
11172 to determine the size of the access. We assume accesses are aligned. */
11173 #undef TARGET_MAX_ANCHOR_OFFSET
11174 #define TARGET_MAX_ANCHOR_OFFSET 4095
11176 #undef TARGET_VECTOR_ALIGNMENT
11177 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11179 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11180 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11181 aarch64_simd_vector_alignment_reachable
11183 /* vec_perm support. */
11185 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11186 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11187 aarch64_vectorize_vec_perm_const_ok
11190 #undef TARGET_FIXED_CONDITION_CODE_REGS
11191 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11193 #undef TARGET_FLAGS_REGNUM
11194 #define TARGET_FLAGS_REGNUM CC_REGNUM
11196 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11197 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11199 #undef TARGET_ASAN_SHADOW_OFFSET
11200 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11202 #undef TARGET_LEGITIMIZE_ADDRESS
11203 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11205 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11206 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11207 aarch64_use_by_pieces_infrastructure_p
11209 #undef TARGET_CAN_USE_DOLOOP_P
11210 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11212 #undef TARGET_SCHED_MACRO_FUSION_P
11213 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11215 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11216 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11218 #undef TARGET_SCHED_FUSION_PRIORITY
11219 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11221 struct gcc_target targetm = TARGET_INITIALIZER;
11223 #include "gt-aarch64.h"