1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "df.h"
35 #include "hard-reg-set.h"
36 #include "output.h"
37 #include "expr.h"
38 #include "reload.h"
39 #include "toplev.h"
40 #include "target.h"
41 #include "target-def.h"
42 #include "targhooks.h"
43 #include "ggc.h"
44 #include "function.h"
45 #include "tm_p.h"
46 #include "recog.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "pointer-set.h"
50 #include "hash-table.h"
51 #include "vec.h"
52 #include "basic-block.h"
53 #include "tree-ssa-alias.h"
54 #include "internal-fn.h"
55 #include "gimple-fold.h"
56 #include "tree-eh.h"
57 #include "gimple-expr.h"
58 #include "is-a.h"
59 #include "gimple.h"
60 #include "gimplify.h"
61 #include "optabs.h"
62 #include "dwarf2.h"
63 #include "cfgloop.h"
64 #include "tree-vectorizer.h"
65 #include "config/arm/aarch-cost-tables.h"
66 #include "dumpfile.h"
68 /* Defined for convenience. */
69 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
71 /* Classifies an address.
73 ADDRESS_REG_IMM
74 A simple base register plus immediate offset.
76 ADDRESS_REG_WB
77 A base register indexed by immediate offset with writeback.
79 ADDRESS_REG_REG
80 A base register indexed by (optionally scaled) register.
82 ADDRESS_REG_UXTW
83 A base register indexed by (optionally scaled) zero-extended register.
85 ADDRESS_REG_SXTW
86 A base register indexed by (optionally scaled) sign-extended register.
88 ADDRESS_LO_SUM
89 A LO_SUM rtx with a base register and "LO12" symbol relocation.
91 ADDRESS_SYMBOLIC
92 A constant symbolic address, in pc-relative literal pool. */
94 enum aarch64_address_type {
95 ADDRESS_REG_IMM,
96 ADDRESS_REG_WB,
97 ADDRESS_REG_REG,
98 ADDRESS_REG_UXTW,
99 ADDRESS_REG_SXTW,
100 ADDRESS_LO_SUM,
101 ADDRESS_SYMBOLIC
104 struct aarch64_address_info {
105 enum aarch64_address_type type;
106 rtx base;
107 rtx offset;
108 int shift;
109 enum aarch64_symbol_type symbol_type;
112 struct simd_immediate_info
114 rtx value;
115 int shift;
116 int element_width;
117 bool mvn;
118 bool msl;
121 /* The current code model. */
122 enum aarch64_code_model aarch64_cmodel;
124 #ifdef HAVE_AS_TLS
125 #undef TARGET_HAVE_TLS
126 #define TARGET_HAVE_TLS 1
127 #endif
129 static bool aarch64_lra_p (void);
130 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
131 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
132 const_tree,
133 enum machine_mode *, int *,
134 bool *);
135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_override_options_after_change (void);
138 static bool aarch64_vector_mode_supported_p (enum machine_mode);
139 static unsigned bit_count (unsigned HOST_WIDE_INT);
140 static bool aarch64_const_vec_all_same_int_p (rtx,
141 HOST_WIDE_INT, HOST_WIDE_INT);
143 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
147 /* The processor for which instructions should be scheduled. */
148 enum aarch64_processor aarch64_tune = cortexa53;
150 /* The current tuning set. */
151 const struct tune_params *aarch64_tune_params;
153 /* Mask to specify which instructions we are allowed to generate. */
154 unsigned long aarch64_isa_flags = 0;
156 /* Mask to specify which instruction scheduling options should be used. */
157 unsigned long aarch64_tune_flags = 0;
159 /* Tuning parameters. */
161 #if HAVE_DESIGNATED_INITIALIZERS
162 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
163 #else
164 #define NAMED_PARAM(NAME, VAL) (VAL)
165 #endif
167 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
168 __extension__
169 #endif
171 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
172 __extension__
173 #endif
174 static const struct cpu_addrcost_table generic_addrcost_table =
176 #if HAVE_DESIGNATED_INITIALIZERS
177 .addr_scale_costs =
178 #endif
180 NAMED_PARAM (qi, 0),
181 NAMED_PARAM (hi, 0),
182 NAMED_PARAM (si, 0),
183 NAMED_PARAM (ti, 0),
185 NAMED_PARAM (pre_modify, 0),
186 NAMED_PARAM (post_modify, 0),
187 NAMED_PARAM (register_offset, 0),
188 NAMED_PARAM (register_extend, 0),
189 NAMED_PARAM (imm_offset, 0)
192 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
193 __extension__
194 #endif
195 static const struct cpu_addrcost_table cortexa57_addrcost_table =
197 #if HAVE_DESIGNATED_INITIALIZERS
198 .addr_scale_costs =
199 #endif
201 NAMED_PARAM (qi, 0),
202 NAMED_PARAM (hi, 1),
203 NAMED_PARAM (si, 0),
204 NAMED_PARAM (ti, 1),
206 NAMED_PARAM (pre_modify, 0),
207 NAMED_PARAM (post_modify, 0),
208 NAMED_PARAM (register_offset, 0),
209 NAMED_PARAM (register_extend, 0),
210 NAMED_PARAM (imm_offset, 0),
213 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
214 __extension__
215 #endif
216 static const struct cpu_regmove_cost generic_regmove_cost =
218 NAMED_PARAM (GP2GP, 1),
219 NAMED_PARAM (GP2FP, 2),
220 NAMED_PARAM (FP2GP, 2),
221 /* We currently do not provide direct support for TFmode Q->Q move.
222 Therefore we need to raise the cost above 2 in order to have
223 reload handle the situation. */
224 NAMED_PARAM (FP2FP, 4)
227 /* Generic costs for vector insn classes. */
228 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
229 __extension__
230 #endif
231 static const struct cpu_vector_cost generic_vector_cost =
233 NAMED_PARAM (scalar_stmt_cost, 1),
234 NAMED_PARAM (scalar_load_cost, 1),
235 NAMED_PARAM (scalar_store_cost, 1),
236 NAMED_PARAM (vec_stmt_cost, 1),
237 NAMED_PARAM (vec_to_scalar_cost, 1),
238 NAMED_PARAM (scalar_to_vec_cost, 1),
239 NAMED_PARAM (vec_align_load_cost, 1),
240 NAMED_PARAM (vec_unalign_load_cost, 1),
241 NAMED_PARAM (vec_unalign_store_cost, 1),
242 NAMED_PARAM (vec_store_cost, 1),
243 NAMED_PARAM (cond_taken_branch_cost, 3),
244 NAMED_PARAM (cond_not_taken_branch_cost, 1)
247 /* Generic costs for vector insn classes. */
248 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
249 __extension__
250 #endif
251 static const struct cpu_vector_cost cortexa57_vector_cost =
253 NAMED_PARAM (scalar_stmt_cost, 1),
254 NAMED_PARAM (scalar_load_cost, 4),
255 NAMED_PARAM (scalar_store_cost, 1),
256 NAMED_PARAM (vec_stmt_cost, 3),
257 NAMED_PARAM (vec_to_scalar_cost, 8),
258 NAMED_PARAM (scalar_to_vec_cost, 8),
259 NAMED_PARAM (vec_align_load_cost, 5),
260 NAMED_PARAM (vec_unalign_load_cost, 5),
261 NAMED_PARAM (vec_unalign_store_cost, 1),
262 NAMED_PARAM (vec_store_cost, 1),
263 NAMED_PARAM (cond_taken_branch_cost, 1),
264 NAMED_PARAM (cond_not_taken_branch_cost, 1)
267 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
268 __extension__
269 #endif
270 static const struct tune_params generic_tunings =
272 &cortexa57_extra_costs,
273 &generic_addrcost_table,
274 &generic_regmove_cost,
275 &generic_vector_cost,
276 NAMED_PARAM (memmov_cost, 4),
277 NAMED_PARAM (issue_rate, 2)
280 static const struct tune_params cortexa53_tunings =
282 &cortexa53_extra_costs,
283 &generic_addrcost_table,
284 &generic_regmove_cost,
285 &generic_vector_cost,
286 NAMED_PARAM (memmov_cost, 4),
287 NAMED_PARAM (issue_rate, 2)
290 static const struct tune_params cortexa57_tunings =
292 &cortexa57_extra_costs,
293 &cortexa57_addrcost_table,
294 &generic_regmove_cost,
295 &cortexa57_vector_cost,
296 NAMED_PARAM (memmov_cost, 4),
297 NAMED_PARAM (issue_rate, 3)
300 /* A processor implementing AArch64. */
301 struct processor
303 const char *const name;
304 enum aarch64_processor core;
305 const char *arch;
306 const unsigned long flags;
307 const struct tune_params *const tune;
310 /* Processor cores implementing AArch64. */
311 static const struct processor all_cores[] =
313 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
314 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
315 #include "aarch64-cores.def"
316 #undef AARCH64_CORE
317 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
318 {NULL, aarch64_none, NULL, 0, NULL}
321 /* Architectures implementing AArch64. */
322 static const struct processor all_architectures[] =
324 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
325 {NAME, CORE, #ARCH, FLAGS, NULL},
326 #include "aarch64-arches.def"
327 #undef AARCH64_ARCH
328 {NULL, aarch64_none, NULL, 0, NULL}
331 /* Target specification.  These are populated as command-line arguments
332 are processed, or NULL if not specified. */
333 static const struct processor *selected_arch;
334 static const struct processor *selected_cpu;
335 static const struct processor *selected_tune;
337 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
339 /* An ISA extension in the co-processor and main instruction set space. */
340 struct aarch64_option_extension
342 const char *const name;
343 const unsigned long flags_on;
344 const unsigned long flags_off;
347 /* ISA extensions in AArch64. */
348 static const struct aarch64_option_extension all_extensions[] =
350 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
351 {NAME, FLAGS_ON, FLAGS_OFF},
352 #include "aarch64-option-extensions.def"
353 #undef AARCH64_OPT_EXTENSION
354 {NULL, 0, 0}
357 /* Used to track the size of an address when generating a pre/post
358 increment address. */
359 static enum machine_mode aarch64_memory_reference_mode;
361 /* Used to force GTY into this file. */
362 static GTY(()) int gty_dummy;
364 /* A table of valid AArch64 "bitmask immediate" values for
365 logical instructions. */
367 #define AARCH64_NUM_BITMASKS 5334
368 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
370 typedef enum aarch64_cond_code
372 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
373 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
374 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
376 aarch64_cc;
378 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
380 /* The condition codes of the processor, and the inverse function. */
381 static const char * const aarch64_condition_codes[] =
383 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
384 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
387 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
388 unsigned
389 aarch64_dbx_register_number (unsigned regno)
391 if (GP_REGNUM_P (regno))
392 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
393 else if (regno == SP_REGNUM)
394 return AARCH64_DWARF_SP;
395 else if (FP_REGNUM_P (regno))
396 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
398 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
399 equivalent DWARF register. */
400 return DWARF_FRAME_REGISTERS;
403 /* Return TRUE if MODE is any of the large INT modes. */
404 static bool
405 aarch64_vect_struct_mode_p (enum machine_mode mode)
407 return mode == OImode || mode == CImode || mode == XImode;
410 /* Return TRUE if MODE is any of the vector modes. */
411 static bool
412 aarch64_vector_mode_p (enum machine_mode mode)
414 return aarch64_vector_mode_supported_p (mode)
415 || aarch64_vect_struct_mode_p (mode);
418 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
419 static bool
420 aarch64_array_mode_supported_p (enum machine_mode mode,
421 unsigned HOST_WIDE_INT nelems)
423 if (TARGET_SIMD
424 && AARCH64_VALID_SIMD_QREG_MODE (mode)
425 && (nelems >= 2 && nelems <= 4))
426 return true;
428 return false;
431 /* Implement HARD_REGNO_NREGS. */
434 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
436 switch (aarch64_regno_regclass (regno))
438 case FP_REGS:
439 case FP_LO_REGS:
440 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
441 default:
442 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
444 gcc_unreachable ();
447 /* Implement HARD_REGNO_MODE_OK. */
450 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
452 if (GET_MODE_CLASS (mode) == MODE_CC)
453 return regno == CC_REGNUM;
455 if (regno == SP_REGNUM)
456 /* The purpose of comparing with ptr_mode is to support the
457 global register variable associated with the stack pointer
458 register via the syntax of asm ("wsp") in ILP32. */
459 return mode == Pmode || mode == ptr_mode;
461 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
462 return mode == Pmode;
464 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
465 return 1;
467 if (FP_REGNUM_P (regno))
469 if (aarch64_vect_struct_mode_p (mode))
470 return
471 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
472 else
473 return 1;
476 return 0;
479 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
480 enum machine_mode
481 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
482 enum machine_mode mode)
484 /* Handle modes that fit within single registers. */
485 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
487 if (GET_MODE_SIZE (mode) >= 4)
488 return mode;
489 else
490 return SImode;
492 /* Fall back to generic for multi-reg and very large modes. */
493 else
494 return choose_hard_reg_mode (regno, nregs, false);
497 /* Return true if calls to DECL should be treated as
498 long-calls (i.e. called via a register).  */
499 static bool
500 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
502 return false;
505 /* Return true if calls to symbol-ref SYM should be treated as
506 long-calls (i.e. called via a register).  */
507 bool
508 aarch64_is_long_call_p (rtx sym)
510 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
513 /* Return true if the offsets to a zero/sign-extract operation
514 represent an expression that matches an extend operation. The
515 operands represent the parameters from
517 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
518 bool
519 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
520 rtx extract_imm)
522 HOST_WIDE_INT mult_val, extract_val;
524 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
525 return false;
527 mult_val = INTVAL (mult_imm);
528 extract_val = INTVAL (extract_imm);
530 if (extract_val > 8
531 && extract_val < GET_MODE_BITSIZE (mode)
532 && exact_log2 (extract_val & ~7) > 0
533 && (extract_val & 7) <= 4
534 && mult_val == (1 << (extract_val & 7)))
535 return true;
537 return false;
540 /* Emit an insn that's a simple single-set. Both the operands must be
541 known to be valid. */
542 inline static rtx
543 emit_set_insn (rtx x, rtx y)
545 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
548 /* X and Y are two things to compare using CODE. Emit the compare insn and
549 return the rtx for register 0 in the proper mode. */
551 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
553 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
554 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
556 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
557 return cc_reg;
560 /* Build the SYMBOL_REF for __tls_get_addr. */
562 static GTY(()) rtx tls_get_addr_libfunc;
565 aarch64_tls_get_addr (void)
567 if (!tls_get_addr_libfunc)
568 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
569 return tls_get_addr_libfunc;
572 /* Return the TLS model to use for ADDR. */
574 static enum tls_model
575 tls_symbolic_operand_type (rtx addr)
577 enum tls_model tls_kind = TLS_MODEL_NONE;
578 rtx sym, addend;
580 if (GET_CODE (addr) == CONST)
582 split_const (addr, &sym, &addend);
583 if (GET_CODE (sym) == SYMBOL_REF)
584 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
586 else if (GET_CODE (addr) == SYMBOL_REF)
587 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
589 return tls_kind;
592 /* We'll allow lo_sums in our legitimate addresses so that combine
593    can take care of combining addresses where necessary, but for
594    generation purposes we'll generate the address
595    as:
596    RTL                               Absolute
597    tmp = hi (symbol_ref);            adrp  x1, foo
598    dest = lo_sum (tmp, symbol_ref);  add   dest, x1, :lo_12:foo
601    PIC                               TLS
602    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
603    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
604                                      bl   __tls_get_addr
607 Load TLS symbol, depending on TLS mechanism and TLS access model.
609 Global Dynamic - Traditional TLS:
610 adrp tmp, :tlsgd:imm
611 add dest, tmp, #:tlsgd_lo12:imm
612 bl __tls_get_addr
614 Global Dynamic - TLS Descriptors:
615 adrp dest, :tlsdesc:imm
616 ldr tmp, [dest, #:tlsdesc_lo12:imm]
617 add dest, dest, #:tlsdesc_lo12:imm
618 blr tmp
619 mrs tp, tpidr_el0
620 add dest, dest, tp
622 Initial Exec:
623 mrs tp, tpidr_el0
624 adrp tmp, :gottprel:imm
625 ldr dest, [tmp, #:gottprel_lo12:imm]
626 add dest, dest, tp
628 Local Exec:
629 mrs tp, tpidr_el0
630 add t0, tp, #:tprel_hi12:imm
631 add t0, #:tprel_lo12_nc:imm
634 static void
635 aarch64_load_symref_appropriately (rtx dest, rtx imm,
636 enum aarch64_symbol_type type)
638 switch (type)
640 case SYMBOL_SMALL_ABSOLUTE:
642 /* In ILP32, the mode of dest can be either SImode or DImode. */
643 rtx tmp_reg = dest;
644 enum machine_mode mode = GET_MODE (dest);
646 gcc_assert (mode == Pmode || mode == ptr_mode);
648 if (can_create_pseudo_p ())
649 tmp_reg = gen_reg_rtx (mode);
651 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
652 emit_insn (gen_add_losym (dest, tmp_reg, imm));
653 return;
656 case SYMBOL_TINY_ABSOLUTE:
657 emit_insn (gen_rtx_SET (Pmode, dest, imm));
658 return;
660 case SYMBOL_SMALL_GOT:
662 /* In ILP32, the mode of dest can be either SImode or DImode,
663 while the got entry is always of SImode size. The mode of
664 dest depends on how dest is used: if dest is assigned to a
665 pointer (e.g. in the memory), it has SImode; it may have
666 DImode if dest is dereferenced to access the memory.
667 This is why we have to handle three different ldr_got_small
668 patterns here (two patterns for ILP32). */
669 rtx tmp_reg = dest;
670 enum machine_mode mode = GET_MODE (dest);
672 if (can_create_pseudo_p ())
673 tmp_reg = gen_reg_rtx (mode);
675 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
676 if (mode == ptr_mode)
678 if (mode == DImode)
679 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
680 else
681 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
683 else
685 gcc_assert (mode == Pmode);
686 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
689 return;
692 case SYMBOL_SMALL_TLSGD:
694 rtx insns;
695 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
697 start_sequence ();
698 emit_call_insn (gen_tlsgd_small (result, imm));
699 insns = get_insns ();
700 end_sequence ();
702 RTL_CONST_CALL_P (insns) = 1;
703 emit_libcall_block (insns, dest, result, imm);
704 return;
707 case SYMBOL_SMALL_TLSDESC:
709 enum machine_mode mode = GET_MODE (dest);
710 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
711 rtx tp;
713 gcc_assert (mode == Pmode || mode == ptr_mode);
715 /* In ILP32, the got entry is always of SImode size. Unlike
716 small GOT, the dest is fixed at reg 0. */
717 if (TARGET_ILP32)
718 emit_insn (gen_tlsdesc_small_si (imm));
719 else
720 emit_insn (gen_tlsdesc_small_di (imm));
721 tp = aarch64_load_tp (NULL);
723 if (mode != Pmode)
724 tp = gen_lowpart (mode, tp);
726 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
727 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
728 return;
731 case SYMBOL_SMALL_GOTTPREL:
733 /* In ILP32, the mode of dest can be either SImode or DImode,
734 while the got entry is always of SImode size. The mode of
735 dest depends on how dest is used: if dest is assigned to a
736 pointer (e.g. in the memory), it has SImode; it may have
737 DImode if dest is dereferenced to access the memory.
738 This is why we have to handle three different tlsie_small
739 patterns here (two patterns for ILP32). */
740 enum machine_mode mode = GET_MODE (dest);
741 rtx tmp_reg = gen_reg_rtx (mode);
742 rtx tp = aarch64_load_tp (NULL);
744 if (mode == ptr_mode)
746 if (mode == DImode)
747 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
748 else
750 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
751 tp = gen_lowpart (mode, tp);
754 else
756 gcc_assert (mode == Pmode);
757 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
760 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
761 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
762 return;
765 case SYMBOL_SMALL_TPREL:
767 rtx tp = aarch64_load_tp (NULL);
768 emit_insn (gen_tlsle_small (dest, tp, imm));
769 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
770 return;
773 case SYMBOL_TINY_GOT:
774 emit_insn (gen_ldr_got_tiny (dest, imm));
775 return;
777 default:
778 gcc_unreachable ();
782 /* Emit a move from SRC to DEST. Assume that the move expanders can
783 handle all moves if !can_create_pseudo_p (). The distinction is
784 important because, unlike emit_move_insn, the move expanders know
785 how to force Pmode objects into the constant pool even when the
786 constant pool address is not itself legitimate. */
787 static rtx
788 aarch64_emit_move (rtx dest, rtx src)
790 return (can_create_pseudo_p ()
791 ? emit_move_insn (dest, src)
792 : emit_move_insn_1 (dest, src));
795 /* Split a 128-bit move operation into two 64-bit move operations,
796 taking care to handle partial overlap of register to register
797 copies. Special cases are needed when moving between GP regs and
798 FP regs. SRC can be a register, constant or memory; DST a register
799 or memory. If either operand is memory it must not have any side
800 effects. */
801 void
802 aarch64_split_128bit_move (rtx dst, rtx src)
804 rtx dst_lo, dst_hi;
805 rtx src_lo, src_hi;
807 enum machine_mode mode = GET_MODE (dst);
809 gcc_assert (mode == TImode || mode == TFmode);
810 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
811 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
813 if (REG_P (dst) && REG_P (src))
815 int src_regno = REGNO (src);
816 int dst_regno = REGNO (dst);
818 /* Handle FP <-> GP regs. */
819 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
821 src_lo = gen_lowpart (word_mode, src);
822 src_hi = gen_highpart (word_mode, src);
824 if (mode == TImode)
826 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
827 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
829 else
831 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
832 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
834 return;
836 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
838 dst_lo = gen_lowpart (word_mode, dst);
839 dst_hi = gen_highpart (word_mode, dst);
841 if (mode == TImode)
843 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
844 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
846 else
848 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
849 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
851 return;
855 dst_lo = gen_lowpart (word_mode, dst);
856 dst_hi = gen_highpart (word_mode, dst);
857 src_lo = gen_lowpart (word_mode, src);
858 src_hi = gen_highpart_mode (word_mode, mode, src);
860 /* At most one pairing may overlap. */
861 if (reg_overlap_mentioned_p (dst_lo, src_hi))
863 aarch64_emit_move (dst_hi, src_hi);
864 aarch64_emit_move (dst_lo, src_lo);
866 else
868 aarch64_emit_move (dst_lo, src_lo);
869 aarch64_emit_move (dst_hi, src_hi);
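/* Return true if a 128-bit move from SRC to DST needs to be split into
   two 64-bit moves, i.e. unless it is a plain FP-to-FP register copy.  */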
873 bool
874 aarch64_split_128bit_move_p (rtx dst, rtx src)
876 return (! REG_P (src)
877 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
880 /* Split a complex SIMD combine. */
882 void
883 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
885 enum machine_mode src_mode = GET_MODE (src1);
886 enum machine_mode dst_mode = GET_MODE (dst);
888 gcc_assert (VECTOR_MODE_P (dst_mode));
890 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
892 rtx (*gen) (rtx, rtx, rtx);
894 switch (src_mode)
896 case V8QImode:
897 gen = gen_aarch64_simd_combinev8qi;
898 break;
899 case V4HImode:
900 gen = gen_aarch64_simd_combinev4hi;
901 break;
902 case V2SImode:
903 gen = gen_aarch64_simd_combinev2si;
904 break;
905 case V2SFmode:
906 gen = gen_aarch64_simd_combinev2sf;
907 break;
908 case DImode:
909 gen = gen_aarch64_simd_combinedi;
910 break;
911 case DFmode:
912 gen = gen_aarch64_simd_combinedf;
913 break;
914 default:
915 gcc_unreachable ();
918 emit_insn (gen (dst, src1, src2));
919 return;
923 /* Split a complex SIMD move. */
925 void
926 aarch64_split_simd_move (rtx dst, rtx src)
928 enum machine_mode src_mode = GET_MODE (src);
929 enum machine_mode dst_mode = GET_MODE (dst);
931 gcc_assert (VECTOR_MODE_P (dst_mode));
933 if (REG_P (dst) && REG_P (src))
935 rtx (*gen) (rtx, rtx);
937 gcc_assert (VECTOR_MODE_P (src_mode));
939 switch (src_mode)
941 case V16QImode:
942 gen = gen_aarch64_split_simd_movv16qi;
943 break;
944 case V8HImode:
945 gen = gen_aarch64_split_simd_movv8hi;
946 break;
947 case V4SImode:
948 gen = gen_aarch64_split_simd_movv4si;
949 break;
950 case V2DImode:
951 gen = gen_aarch64_split_simd_movv2di;
952 break;
953 case V4SFmode:
954 gen = gen_aarch64_split_simd_movv4sf;
955 break;
956 case V2DFmode:
957 gen = gen_aarch64_split_simd_movv2df;
958 break;
959 default:
960 gcc_unreachable ();
963 emit_insn (gen (dst, src));
964 return;
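/* Force VALUE into a register of MODE: use a fresh pseudo when one can
   be created, otherwise reuse X as a scratch register and return it.  */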
968 static rtx
969 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
971 if (can_create_pseudo_p ())
972 return force_reg (mode, value);
973 else
975 x = aarch64_emit_move (x, value);
976 return x;
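/* Return a legitimate expression for REG plus OFFSET in MODE, first
   materialising the offset in TEMP when it is not a valid "add"
   immediate.  */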
981 static rtx
982 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
984 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
986 rtx high;
987 /* Load the full offset into a register. This
988 might be improvable in the future. */
989 high = GEN_INT (offset);
990 offset = 0;
991 high = aarch64_force_temporary (mode, temp, high);
992 reg = aarch64_force_temporary (mode, temp,
993 gen_rtx_PLUS (mode, high, reg));
995 return plus_constant (mode, reg, offset);
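/* Expand a move of immediate IMM into register DEST, using symbol loads
   for symbolic constants and MOV/MOVK sequences, arithmetic or logical
   combinations of bitmask immediates for integer constants.  */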
998 void
999 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1001 enum machine_mode mode = GET_MODE (dest);
1002 unsigned HOST_WIDE_INT mask;
1003 int i;
1004 bool first;
1005 unsigned HOST_WIDE_INT val;
1006 bool subtargets;
1007 rtx subtarget;
1008 int one_match, zero_match;
1010 gcc_assert (mode == SImode || mode == DImode);
1012 /* Check what type of symbol it is.  */
1013 if (GET_CODE (imm) == SYMBOL_REF
1014 || GET_CODE (imm) == LABEL_REF
1015 || GET_CODE (imm) == CONST)
1017 rtx mem, base, offset;
1018 enum aarch64_symbol_type sty;
1020 /* If we have (const (plus symbol offset)), separate out the offset
1021 before we start classifying the symbol. */
1022 split_const (imm, &base, &offset);
1024 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1025 switch (sty)
1027 case SYMBOL_FORCE_TO_MEM:
1028 if (offset != const0_rtx
1029 && targetm.cannot_force_const_mem (mode, imm))
1031 gcc_assert (can_create_pseudo_p ());
1032 base = aarch64_force_temporary (mode, dest, base);
1033 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1034 aarch64_emit_move (dest, base);
1035 return;
1037 mem = force_const_mem (ptr_mode, imm);
1038 gcc_assert (mem);
1039 if (mode != ptr_mode)
1040 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1041 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1042 return;
1044 case SYMBOL_SMALL_TLSGD:
1045 case SYMBOL_SMALL_TLSDESC:
1046 case SYMBOL_SMALL_GOTTPREL:
1047 case SYMBOL_SMALL_GOT:
1048 case SYMBOL_TINY_GOT:
1049 if (offset != const0_rtx)
1051 gcc_assert(can_create_pseudo_p ());
1052 base = aarch64_force_temporary (mode, dest, base);
1053 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1054 aarch64_emit_move (dest, base);
1055 return;
1057 /* FALLTHRU */
1059 case SYMBOL_SMALL_TPREL:
1060 case SYMBOL_SMALL_ABSOLUTE:
1061 case SYMBOL_TINY_ABSOLUTE:
1062 aarch64_load_symref_appropriately (dest, imm, sty);
1063 return;
1065 default:
1066 gcc_unreachable ();
1070 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1072 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1073 return;
1076 if (!CONST_INT_P (imm))
1078 if (GET_CODE (imm) == HIGH)
1079 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1080 else
1082 rtx mem = force_const_mem (mode, imm);
1083 gcc_assert (mem);
1084 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1087 return;
1090 if (mode == SImode)
1092 /* We know we can't do this in 1 insn, and we must be able to do it
1093 in two; so don't mess around looking for sequences that don't buy
1094 us anything. */
1095 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1096 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1097 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1098 return;
1101 /* Remaining cases are all for DImode. */
1103 val = INTVAL (imm);
1104 subtargets = optimize && can_create_pseudo_p ();
1106 one_match = 0;
1107 zero_match = 0;
1108 mask = 0xffff;
1110 for (i = 0; i < 64; i += 16, mask <<= 16)
1112 if ((val & mask) == 0)
1113 zero_match++;
1114 else if ((val & mask) == mask)
1115 one_match++;
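/* Two 16-bit quarters of the value are 0xffff: force one of the
   remaining quarters to all-ones so the constant can be built with a
   single MOVN-class move, then restore that quarter with one MOVK.  */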
1118 if (one_match == 2)
1120 mask = 0xffff;
1121 for (i = 0; i < 64; i += 16, mask <<= 16)
1123 if ((val & mask) != mask)
1125 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1126 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1127 GEN_INT ((val >> i) & 0xffff)));
1128 return;
1131 gcc_unreachable ();
1134 if (zero_match == 2)
1135 goto simple_sequence;
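/* Try building the constant from a single MOVZ/MOVN-encodable value
   plus or minus one (possibly shifted) 12-bit add immediate.  */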
1137 mask = 0x0ffff0000UL;
1138 for (i = 16; i < 64; i += 16, mask <<= 16)
1140 HOST_WIDE_INT comp = mask & ~(mask - 1);
1142 if (aarch64_uimm12_shift (val - (val & mask)))
1144 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1146 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1147 emit_insn (gen_adddi3 (dest, subtarget,
1148 GEN_INT (val - (val & mask))));
1149 return;
1151 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1153 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1155 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1156 GEN_INT ((val + comp) & mask)));
1157 emit_insn (gen_adddi3 (dest, subtarget,
1158 GEN_INT (val - ((val + comp) & mask))));
1159 return;
1161 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1163 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1165 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1166 GEN_INT ((val - comp) | ~mask)));
1167 emit_insn (gen_adddi3 (dest, subtarget,
1168 GEN_INT (val - ((val - comp) | ~mask))));
1169 return;
1171 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1173 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1175 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1176 GEN_INT (val | ~mask)));
1177 emit_insn (gen_adddi3 (dest, subtarget,
1178 GEN_INT (val - (val | ~mask))));
1179 return;
1183 /* See if we can do it by arithmetically combining two
1184 immediates. */
1185 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1187 int j;
1188 mask = 0xffff;
1190 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1191 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1193 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1194 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1195 GEN_INT (aarch64_bitmasks[i])));
1196 emit_insn (gen_adddi3 (dest, subtarget,
1197 GEN_INT (val - aarch64_bitmasks[i])));
1198 return;
1201 for (j = 0; j < 64; j += 16, mask <<= 16)
1203 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1205 emit_insn (gen_rtx_SET (VOIDmode, dest,
1206 GEN_INT (aarch64_bitmasks[i])));
1207 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1208 GEN_INT ((val >> j) & 0xffff)));
1209 return;
1214 /* See if we can do it by logically combining two immediates. */
1215 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1217 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1219 int j;
1221 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1222 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1224 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1225 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1226 GEN_INT (aarch64_bitmasks[i])));
1227 emit_insn (gen_iordi3 (dest, subtarget,
1228 GEN_INT (aarch64_bitmasks[j])));
1229 return;
1232 else if ((val & aarch64_bitmasks[i]) == val)
1234 int j;
1236 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1237 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1240 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1241 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1242 GEN_INT (aarch64_bitmasks[j])));
1243 emit_insn (gen_anddi3 (dest, subtarget,
1244 GEN_INT (aarch64_bitmasks[i])));
1245 return;
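/* Fall back to a MOVZ of the first non-zero 16-bit quarter followed by
   a MOVK for each remaining non-zero quarter.  */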
1250 simple_sequence:
1251 first = true;
1252 mask = 0xffff;
1253 for (i = 0; i < 64; i += 16, mask <<= 16)
1255 if ((val & mask) != 0)
1257 if (first)
1259 emit_insn (gen_rtx_SET (VOIDmode, dest,
1260 GEN_INT (val & mask)));
1261 first = false;
1263 else
1264 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1265 GEN_INT ((val >> i) & 0xffff)));
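/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */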
1270 static bool
1271 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1272 tree exp ATTRIBUTE_UNUSED)
1274 /* Currently, always true. */
1275 return true;
1278 /* Implement TARGET_PASS_BY_REFERENCE. */
1280 static bool
1281 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1282 enum machine_mode mode,
1283 const_tree type,
1284 bool named ATTRIBUTE_UNUSED)
1286 HOST_WIDE_INT size;
1287 enum machine_mode dummymode;
1288 int nregs;
1290 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1291 size = (mode == BLKmode && type)
1292 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1294 /* Aggregates are passed by reference based on their size. */
1295 if (type && AGGREGATE_TYPE_P (type))
1297 size = int_size_in_bytes (type);
1300 /* Variable sized arguments are always passed by reference.  */
1301 if (size < 0)
1302 return true;
1304 /* Can this be a candidate to be passed in fp/simd register(s)? */
1305 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1306 &dummymode, &nregs,
1307 NULL))
1308 return false;
1310 /* Arguments which are variable sized or larger than 2 registers are
1311 passed by reference unless they are a homogeneous floating-point
1312 aggregate. */
1313 return size > 2 * UNITS_PER_WORD;
1316 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1317 static bool
1318 aarch64_return_in_msb (const_tree valtype)
1320 enum machine_mode dummy_mode;
1321 int dummy_int;
1323 /* Never happens in little-endian mode. */
1324 if (!BYTES_BIG_ENDIAN)
1325 return false;
1327 /* Only composite types smaller than or equal to 16 bytes can
1328 be potentially returned in registers. */
1329 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1330 || int_size_in_bytes (valtype) <= 0
1331 || int_size_in_bytes (valtype) > 16)
1332 return false;
1334 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1335 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1336 is always passed/returned in the least significant bits of fp/simd
1337 register(s). */
1338 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1339 &dummy_mode, &dummy_int, NULL))
1340 return false;
1342 return true;
1345 /* Implement TARGET_FUNCTION_VALUE.
1346 Define how to find the value returned by a function. */
1348 static rtx
1349 aarch64_function_value (const_tree type, const_tree func,
1350 bool outgoing ATTRIBUTE_UNUSED)
1352 enum machine_mode mode;
1353 int unsignedp;
1354 int count;
1355 enum machine_mode ag_mode;
1357 mode = TYPE_MODE (type);
1358 if (INTEGRAL_TYPE_P (type))
1359 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1361 if (aarch64_return_in_msb (type))
1363 HOST_WIDE_INT size = int_size_in_bytes (type);
1365 if (size % UNITS_PER_WORD != 0)
1367 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1368 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1372 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1373 &ag_mode, &count, NULL))
1375 if (!aarch64_composite_type_p (type, mode))
1377 gcc_assert (count == 1 && mode == ag_mode);
1378 return gen_rtx_REG (mode, V0_REGNUM);
1380 else
1382 int i;
1383 rtx par;
1385 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1386 for (i = 0; i < count; i++)
1388 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1389 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1390 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1391 XVECEXP (par, 0, i) = tmp;
1393 return par;
1396 else
1397 return gen_rtx_REG (mode, R0_REGNUM);
1400 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1401 Return true if REGNO is the number of a hard register in which the values
1402 of a called function may come back.  */
1404 static bool
1405 aarch64_function_value_regno_p (const unsigned int regno)
1407 /* Maximum of 16 bytes can be returned in the general registers. Examples
1408 of 16-byte return values are: 128-bit integers and 16-byte small
1409 structures (excluding homogeneous floating-point aggregates). */
1410 if (regno == R0_REGNUM || regno == R1_REGNUM)
1411 return true;
1413 /* Up to four fp/simd registers can return a function value, e.g. a
1414 homogeneous floating-point aggregate having four members. */
1415 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1416 return !TARGET_GENERAL_REGS_ONLY;
1418 return false;
1421 /* Implement TARGET_RETURN_IN_MEMORY.
1423 If the type T of the result of a function is such that
1424 void func (T arg)
1425 would require that arg be passed as a value in a register (or set of
1426 registers) according to the parameter passing rules, then the result
1427 is returned in the same registers as would be used for such an
1428 argument. */
1430 static bool
1431 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1433 HOST_WIDE_INT size;
1434 enum machine_mode ag_mode;
1435 int count;
1437 if (!AGGREGATE_TYPE_P (type)
1438 && TREE_CODE (type) != COMPLEX_TYPE
1439 && TREE_CODE (type) != VECTOR_TYPE)
1440 /* Simple scalar types are always returned in registers.  */
1441 return false;
1443 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1444 type,
1445 &ag_mode,
1446 &count,
1447 NULL))
1448 return false;
1450 /* Types larger than 2 registers are returned in memory.  */
1451 size = int_size_in_bytes (type);
1452 return (size < 0 || size > 2 * UNITS_PER_WORD);
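/* Return true if an argument of MODE and TYPE is a candidate for being
   passed in SIMD/FP registers, setting *NREGS to the number of
   registers required and recording the element mode in PCUM.  */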
1455 static bool
1456 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1457 const_tree type, int *nregs)
1459 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1460 return aarch64_vfp_is_call_or_return_candidate (mode,
1461 type,
1462 &pcum->aapcs_vfp_rmode,
1463 nregs,
1464 NULL);
1467 /* Given MODE and TYPE of a function argument, return the alignment in
1468 bits. The idea is to suppress any stronger alignment requested by
1469 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1470 This is a helper function for local use only. */
1472 static unsigned int
1473 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1475 unsigned int alignment;
1477 if (type)
1479 if (!integer_zerop (TYPE_SIZE (type)))
1481 if (TYPE_MODE (type) == mode)
1482 alignment = TYPE_ALIGN (type);
1483 else
1484 alignment = GET_MODE_ALIGNMENT (mode);
1486 else
1487 alignment = 0;
1489 else
1490 alignment = GET_MODE_ALIGNMENT (mode);
1492 return alignment;
1495 /* Layout a function argument according to the AAPCS64 rules. The rule
1496 numbers refer to the rule numbers in the AAPCS64. */
1498 static void
1499 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1500 const_tree type,
1501 bool named ATTRIBUTE_UNUSED)
1503 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1504 int ncrn, nvrn, nregs;
1505 bool allocate_ncrn, allocate_nvrn;
1506 HOST_WIDE_INT size;
1508 /* We need to do this once per argument. */
1509 if (pcum->aapcs_arg_processed)
1510 return;
1512 pcum->aapcs_arg_processed = true;
1514 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1515 size
1516 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1517 UNITS_PER_WORD);
1519 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1520 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1521 mode,
1522 type,
1523 &nregs);
1525 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1526 The following code thus handles passing by SIMD/FP registers first. */
1528 nvrn = pcum->aapcs_nvrn;
1530 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1531 and homogeneous short-vector aggregates (HVA).  */
1532 if (allocate_nvrn)
1534 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1536 pcum->aapcs_nextnvrn = nvrn + nregs;
1537 if (!aarch64_composite_type_p (type, mode))
1539 gcc_assert (nregs == 1);
1540 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1542 else
1544 rtx par;
1545 int i;
1546 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1547 for (i = 0; i < nregs; i++)
1549 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1550 V0_REGNUM + nvrn + i);
1551 tmp = gen_rtx_EXPR_LIST
1552 (VOIDmode, tmp,
1553 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1554 XVECEXP (par, 0, i) = tmp;
1556 pcum->aapcs_reg = par;
1558 return;
1560 else
1562 /* C.3 NSRN is set to 8. */
1563 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1564 goto on_stack;
1568 ncrn = pcum->aapcs_ncrn;
1569 nregs = size / UNITS_PER_WORD;
1571 /* C6 - C9, though the sign and zero extension semantics are
1572 handled elsewhere.  This is the case where the argument fits
1573 entirely in general registers.  */
1574 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1576 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1578 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1580 /* C.8 if the argument has an alignment of 16 then the NGRN is
1581 rounded up to the next even number. */
1582 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1584 ++ncrn;
1585 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1587 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1588 A reg is still generated for it, but the caller should be smart
1589 enough not to use it. */
1590 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1592 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1594 else
1596 rtx par;
1597 int i;
1599 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1600 for (i = 0; i < nregs; i++)
1602 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1603 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1604 GEN_INT (i * UNITS_PER_WORD));
1605 XVECEXP (par, 0, i) = tmp;
1607 pcum->aapcs_reg = par;
1610 pcum->aapcs_nextncrn = ncrn + nregs;
1611 return;
1614 /* C.11 */
1615 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1617 /* The argument is passed on the stack; record the needed number of words for
1618 this argument and align the total size if necessary. */
1619 on_stack:
1620 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1621 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1622 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1623 16 / UNITS_PER_WORD);
1624 return;
1627 /* Implement TARGET_FUNCTION_ARG. */
1629 static rtx
1630 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1631 const_tree type, bool named)
1633 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1634 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1636 if (mode == VOIDmode)
1637 return NULL_RTX;
1639 aarch64_layout_arg (pcum_v, mode, type, named);
1640 return pcum->aapcs_reg;
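/* Initialise the cumulative argument state in PCUM before scanning the
   arguments of a call.  */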
1643 void
1644 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1645 const_tree fntype ATTRIBUTE_UNUSED,
1646 rtx libname ATTRIBUTE_UNUSED,
1647 const_tree fndecl ATTRIBUTE_UNUSED,
1648 unsigned n_named ATTRIBUTE_UNUSED)
1650 pcum->aapcs_ncrn = 0;
1651 pcum->aapcs_nvrn = 0;
1652 pcum->aapcs_nextncrn = 0;
1653 pcum->aapcs_nextnvrn = 0;
1654 pcum->pcs_variant = ARM_PCS_AAPCS64;
1655 pcum->aapcs_reg = NULL_RTX;
1656 pcum->aapcs_arg_processed = false;
1657 pcum->aapcs_stack_words = 0;
1658 pcum->aapcs_stack_size = 0;
1660 return;
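/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */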
1663 static void
1664 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1665 enum machine_mode mode,
1666 const_tree type,
1667 bool named)
1669 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1670 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1672 aarch64_layout_arg (pcum_v, mode, type, named);
1673 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1674 != (pcum->aapcs_stack_words != 0));
1675 pcum->aapcs_arg_processed = false;
1676 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1677 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1678 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1679 pcum->aapcs_stack_words = 0;
1680 pcum->aapcs_reg = NULL_RTX;
1684 bool
1685 aarch64_function_arg_regno_p (unsigned regno)
1687 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1688 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1691 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1692 PARM_BOUNDARY bits of alignment, but will be given anything up
1693 to STACK_BOUNDARY bits if the type requires it. This makes sure
1694 that both before and after the layout of each argument, the Next
1695 Stacked Argument Address (NSAA) will have a minimum alignment of
1696 8 bytes. */
1698 static unsigned int
1699 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1701 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1703 if (alignment < PARM_BOUNDARY)
1704 alignment = PARM_BOUNDARY;
1705 if (alignment > STACK_BOUNDARY)
1706 alignment = STACK_BOUNDARY;
1707 return alignment;
1710 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1712 Return true if an argument passed on the stack should be padded upwards,
1713 i.e. if the least-significant byte of the stack slot has useful data.
1715 Small aggregate types are placed in the lowest memory address.
1717 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1719 bool
1720 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1722 /* On little-endian targets, the least significant byte of every stack
1723 argument is passed at the lowest byte address of the stack slot. */
1724 if (!BYTES_BIG_ENDIAN)
1725 return true;
1727 /* Otherwise, integral, floating-point and pointer types are padded downward:
1728 the least significant byte of a stack argument is passed at the highest
1729 byte address of the stack slot. */
1730 if (type
1731 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1732 || POINTER_TYPE_P (type))
1733 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1734 return false;
1736 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1737 return true;
1740 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1742 It specifies padding for the last (may also be the only)
1743 element of a block move between registers and memory. If
1744 assuming the block is in memory, padding upward means that
1745 the last element is padded after its most significant byte,
1746 while in downward padding, the last element is padded at
1747 its least significant byte side.
1749 Small aggregates and small complex types are always padded
1750 upwards.
1752 We don't need to worry about homogeneous floating-point or
1753 short-vector aggregates; their move is not affected by the
1754 padding direction determined here. Regardless of endianness,
1755 each element of such an aggregate is put in the least
1756 significant bits of a fp/simd register.
1758 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1759 register has useful data, and return the opposite if the most
1760 significant byte does. */
1762 bool
1763 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1764 bool first ATTRIBUTE_UNUSED)
1767 /* Small composite types are always padded upward. */
1768 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1770 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1771 : GET_MODE_SIZE (mode));
1772 if (size < 2 * UNITS_PER_WORD)
1773 return true;
1776 /* Otherwise, use the default padding. */
1777 return !BYTES_BIG_ENDIAN;
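/* Implement TARGET_LIBGCC_CMP_RETURN_MODE.  */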
1780 static enum machine_mode
1781 aarch64_libgcc_cmp_return_mode (void)
1783 return SImode;
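/* Implement TARGET_FRAME_POINTER_REQUIRED.  */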
1786 static bool
1787 aarch64_frame_pointer_required (void)
1789 /* If the function contains dynamic stack allocations, we need to
1790 use the frame pointer to access the static parts of the frame. */
1791 if (cfun->calls_alloca)
1792 return true;
1794 /* In aarch64_override_options_after_change
1795 flag_omit_leaf_frame_pointer turns off the frame pointer by
1796 default. Turn it back on now if we've not got a leaf
1797 function. */
1798 if (flag_omit_leaf_frame_pointer
1799 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1800 return true;
1802 return false;
1805 /* Mark the registers that need to be saved by the callee and calculate
1806 the size of the callee-saved registers area and frame record (both FP
1807 and LR may be omitted). */
1808 static void
1809 aarch64_layout_frame (void)
1811 HOST_WIDE_INT offset = 0;
1812 int regno;
1814 if (reload_completed && cfun->machine->frame.laid_out)
1815 return;
1817 #define SLOT_NOT_REQUIRED (-2)
1818 #define SLOT_REQUIRED (-1)
1820 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
1821 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
1823 /* First mark all the registers that really need to be saved... */
1824 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1825 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1827 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1828 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1830 /* ... that includes the eh data registers (if needed)... */
1831 if (crtl->calls_eh_return)
1832 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1833 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1834 = SLOT_REQUIRED;
1836 /* ... and any callee saved register that dataflow says is live. */
1837 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1838 if (df_regs_ever_live_p (regno)
1839 && !call_used_regs[regno])
1840 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1842 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1843 if (df_regs_ever_live_p (regno)
1844 && !call_used_regs[regno])
1845 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1847 if (frame_pointer_needed)
1849 /* FP and LR are placed in the linkage record. */
1850 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1851 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
1852 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1853 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
1854 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1855 offset += 2 * UNITS_PER_WORD;
1858 /* Now assign stack slots for them. */
1859 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1860 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1862 cfun->machine->frame.reg_offset[regno] = offset;
1863 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1864 cfun->machine->frame.wb_candidate1 = regno;
1865 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
1866 cfun->machine->frame.wb_candidate2 = regno;
1867 offset += UNITS_PER_WORD;
1870 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1871 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1873 cfun->machine->frame.reg_offset[regno] = offset;
1874 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1875 cfun->machine->frame.wb_candidate1 = regno;
1876 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
1877 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
1878 cfun->machine->frame.wb_candidate2 = regno;
1879 offset += UNITS_PER_WORD;
1882 cfun->machine->frame.padding0 =
1883 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1884 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1886 cfun->machine->frame.saved_regs_size = offset;
1888 cfun->machine->frame.hard_fp_offset
1889 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1890 + get_frame_size ()
1891 + cfun->machine->frame.saved_regs_size,
1892 STACK_BOUNDARY / BITS_PER_UNIT);
1894 cfun->machine->frame.frame_size
1895 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1896 + crtl->outgoing_args_size,
1897 STACK_BOUNDARY / BITS_PER_UNIT);
1899 cfun->machine->frame.laid_out = true;
1902 /* Make the last instruction frame-related and note that it performs
1903 the operation described by FRAME_PATTERN. */
1905 static void
1906 aarch64_set_frame_expr (rtx frame_pattern)
1908 rtx insn;
1910 insn = get_last_insn ();
1911 RTX_FRAME_RELATED_P (insn) = 1;
1912 RTX_FRAME_RELATED_P (frame_pattern) = 1;
1913 REG_NOTES (insn) = alloc_EXPR_LIST (REG_FRAME_RELATED_EXPR,
1914 frame_pattern,
1915 REG_NOTES (insn));
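/* Return true if REGNO has been allocated a callee-save slot by
   aarch64_layout_frame and so must be saved on entry.  */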
1918 static bool
1919 aarch64_register_saved_on_entry (int regno)
1921 return cfun->machine->frame.reg_offset[regno] >= 0;
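/* Starting at REGNO, return the first register up to LIMIT that needs
   to be saved on entry to the current function.  */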
1924 static unsigned
1925 aarch64_next_callee_save (unsigned regno, unsigned limit)
1927 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1928 regno ++;
1929 return regno;
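/* Push register REGNO of MODE onto the stack, pre-decrementing the
   stack pointer by ADJUSTMENT bytes, and mark the insn frame-related.  */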
1932 static void
1933 aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
1934 HOST_WIDE_INT adjustment)
1936 rtx base_rtx = stack_pointer_rtx;
1937 rtx insn, reg, mem;
1939 reg = gen_rtx_REG (mode, regno);
1940 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
1941 plus_constant (Pmode, base_rtx, -adjustment));
1942 mem = gen_rtx_MEM (mode, mem);
1944 insn = emit_move_insn (mem, reg);
1945 RTX_FRAME_RELATED_P (insn) = 1;
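/* Pop register REGNO of MODE from the stack, post-incrementing the
   stack pointer by ADJUSTMENT bytes, and attach a CFA restore note.  */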
1948 static void
1949 aarch64_popwb_single_reg (enum machine_mode mode, unsigned regno,
1950 HOST_WIDE_INT adjustment)
1952 rtx base_rtx = stack_pointer_rtx;
1953 rtx insn, reg, mem;
1955 reg = gen_rtx_REG (mode, regno);
1956 mem = gen_rtx_POST_MODIFY (Pmode, base_rtx,
1957 plus_constant (Pmode, base_rtx, adjustment));
1958 mem = gen_rtx_MEM (mode, mem);
1960 insn = emit_move_insn (reg, mem);
1961 add_reg_note (insn, REG_CFA_RESTORE, reg);
1962 RTX_FRAME_RELATED_P (insn) = 1;
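/* Generate a store-pair-with-writeback pattern that stores REG and REG2
   at BASE, pre-decrementing BASE by ADJUSTMENT bytes.  */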
1965 static rtx
1966 aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1967 HOST_WIDE_INT adjustment)
1969 switch (mode)
1971 case DImode:
1972 return gen_storewb_pairdi_di (base, base, reg, reg2,
1973 GEN_INT (-adjustment),
1974 GEN_INT (UNITS_PER_WORD - adjustment));
1975 case DFmode:
1976 return gen_storewb_pairdf_di (base, base, reg, reg2,
1977 GEN_INT (-adjustment),
1978 GEN_INT (UNITS_PER_WORD - adjustment));
1979 default:
1980 gcc_unreachable ();
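/* Push the register pair REGNO1/REGNO2 of MODE onto the stack,
   pre-decrementing the stack pointer by ADJUSTMENT bytes and marking
   the constituent sets frame-related.  */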
1984 static void
1985 aarch64_pushwb_pair_reg (enum machine_mode mode, unsigned regno1,
1986 unsigned regno2, HOST_WIDE_INT adjustment)
1988 rtx insn;
1989 rtx reg1 = gen_rtx_REG (mode, regno1);
1990 rtx reg2 = gen_rtx_REG (mode, regno2);
1992 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
1993 reg2, adjustment));
1994 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
1996 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1997 RTX_FRAME_RELATED_P (insn) = 1;
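/* Generate a load-pair-with-writeback pattern that loads REG and REG2
   from BASE, post-incrementing BASE by ADJUSTMENT bytes.  */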
2000 static rtx
2001 aarch64_gen_loadwb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
2002 HOST_WIDE_INT adjustment)
2004 switch (mode)
2006 case DImode:
2007 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2008 GEN_INT (adjustment + UNITS_PER_WORD));
2009 case DFmode:
2010 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2011 GEN_INT (adjustment + UNITS_PER_WORD));
2012 default:
2013 gcc_unreachable ();
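/* Pop the register pair REGNO1/REGNO2 of MODE from the stack,
   post-incrementing the stack pointer by ADJUSTMENT bytes; if CFA is
   given, record the CFA adjustment and register restores for the
   unwinder.  */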
2017 static void
2018 aarch64_popwb_pair_reg (enum machine_mode mode, unsigned regno1,
2019 unsigned regno2, HOST_WIDE_INT adjustment, rtx cfa)
2021 rtx insn;
2022 rtx reg1 = gen_rtx_REG (mode, regno1);
2023 rtx reg2 = gen_rtx_REG (mode, regno2);
2025 insn = emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
2026 reg2, adjustment));
2027 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2028 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2029 RTX_FRAME_RELATED_P (insn) = 1;
2031 if (cfa)
2032 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2033 (gen_rtx_SET (Pmode, stack_pointer_rtx,
2034 plus_constant (Pmode, cfa, adjustment))));
2036 add_reg_note (insn, REG_CFA_RESTORE, reg1);
2037 add_reg_note (insn, REG_CFA_RESTORE, reg2);
2040 static rtx
2041 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2042 rtx reg2)
2044 switch (mode)
2046 case DImode:
2047 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2049 case DFmode:
2050 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2052 default:
2053 gcc_unreachable ();
2057 static rtx
2058 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2059 rtx mem2)
2061 switch (mode)
2063 case DImode:
2064 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2066 case DFmode:
2067 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2069 default:
2070 gcc_unreachable ();
2075 static void
2076 aarch64_save_callee_saves (enum machine_mode mode, HOST_WIDE_INT start_offset,
2077 unsigned start, unsigned limit, bool skip_wb)
2079 rtx insn;
2080 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2081 ? gen_frame_mem : gen_rtx_MEM);
2082 unsigned regno;
2083 unsigned regno2;
2085 for (regno = aarch64_next_callee_save (start, limit);
2086 regno <= limit;
2087 regno = aarch64_next_callee_save (regno + 1, limit))
2089 rtx reg, mem;
2090 HOST_WIDE_INT offset;
2092 if (skip_wb
2093 && (regno == cfun->machine->frame.wb_candidate1
2094 || regno == cfun->machine->frame.wb_candidate2))
2095 continue;
2097 reg = gen_rtx_REG (mode, regno);
2098 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2099 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2100 offset));
2102 regno2 = aarch64_next_callee_save (regno + 1, limit);
2104 if (regno2 <= limit
2105 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2106 == cfun->machine->frame.reg_offset[regno2]))
2109 rtx reg2 = gen_rtx_REG (mode, regno2);
2110 rtx mem2;
2112 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2113 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2114 offset));
2115 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2116 reg2));
2118 /* The first part of a frame-related parallel insn is
2119 always assumed to be relevant to the frame
2120 calculations; subsequent parts are only
2121 frame-related if explicitly marked. */
2122 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2123 regno = regno2;
2125 else
2126 insn = emit_move_insn (mem, reg);
2128 RTX_FRAME_RELATED_P (insn) = 1;
2132 static void
2133 aarch64_restore_callee_saves (enum machine_mode mode,
2134 HOST_WIDE_INT start_offset, unsigned start,
2135 unsigned limit, bool skip_wb)
2137 rtx insn;
2138 rtx base_rtx = stack_pointer_rtx;
2139 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2140 ? gen_frame_mem : gen_rtx_MEM);
2141 unsigned regno;
2142 unsigned regno2;
2143 HOST_WIDE_INT offset;
2145 for (regno = aarch64_next_callee_save (start, limit);
2146 regno <= limit;
2147 regno = aarch64_next_callee_save (regno + 1, limit))
2149 rtx reg, mem;
2151 if (skip_wb
2152 && (regno == cfun->machine->frame.wb_candidate1
2153 || regno == cfun->machine->frame.wb_candidate2))
2154 continue;
2156 reg = gen_rtx_REG (mode, regno);
2157 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2158 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2160 regno2 = aarch64_next_callee_save (regno + 1, limit);
2162 if (regno2 <= limit
2163 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2164 == cfun->machine->frame.reg_offset[regno2]))
2166 rtx reg2 = gen_rtx_REG (mode, regno2);
2167 rtx mem2;
2169 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2170 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2171 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2,
2172 mem2));
2173 add_reg_note (insn, REG_CFA_RESTORE, reg);
2174 add_reg_note (insn, REG_CFA_RESTORE, reg2);
2176 /* The first part of a frame-related parallel insn is
2177 always assumed to be relevant to the frame
2178 calculations; subsequent parts are only
2179 frame-related if explicitly marked. */
2180 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2181 regno = regno2;
2183 else
2185 insn = emit_move_insn (reg, mem);
2186 add_reg_note (insn, REG_CFA_RESTORE, reg);
2189 RTX_FRAME_RELATED_P (insn) = 1;
2193 /* AArch64 stack frames generated by this compiler look like:
2195 +-------------------------------+
2197 | incoming stack arguments |
2199 +-------------------------------+
2200 | | <-- incoming stack pointer (aligned)
2201 | callee-allocated save area |
2202 | for register varargs |
2204 +-------------------------------+
2205 | local variables | <-- frame_pointer_rtx
2207 +-------------------------------+
2208 | padding0 | \
2209 +-------------------------------+ |
2210 | callee-saved registers | | frame.saved_regs_size
2211 +-------------------------------+ |
2212 | LR' | |
2213 +-------------------------------+ |
2214 | FP' | / <- hard_frame_pointer_rtx (aligned)
2215 +-------------------------------+
2216 | dynamic allocation |
2217 +-------------------------------+
2218 | padding |
2219 +-------------------------------+
2220 | outgoing stack arguments | <-- arg_pointer
2222 +-------------------------------+
2223 | | <-- stack_pointer_rtx (aligned)
2225 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2226 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2227 unchanged. */
2229 /* Generate the prologue instructions for entry into a function.
2230 Establish the stack frame by decreasing the stack pointer with a
2231 properly calculated size and, if necessary, create a frame record
2232 filled with the values of LR and previous frame pointer. The
2233 current FP is also set up if it is in use. */
2235 void
2236 aarch64_expand_prologue (void)
2238 /* sub sp, sp, #<frame_size>
2239 stp {fp, lr}, [sp, #<frame_size> - 16]
2240 add fp, sp, #<frame_size> - hardfp_offset
2241 stp {cs_reg}, [fp, #-16] etc.
2243 sub sp, sp, <final_adjustment_if_any>
2244 */
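/* Illustrative sketch (not part of the original source): for a small frame
   where OFFSET < 512, a frame pointer is needed and fp_offset is zero, the
   code below typically emits something like

       stp  x29, x30, [sp, #-<frame_size>]!
       add  x29, sp, #0
       stp  x19, x20, [sp, #16]        // remaining callee-saves, if any

   i.e. the whole allocation is folded into the write-back STP and no final
   SP adjustment is required.  The exact registers depend on the callee-save
   layout computed by aarch64_layout_frame.  */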
2245 HOST_WIDE_INT frame_size, offset;
2246 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2247 rtx insn;
2249 aarch64_layout_frame ();
2251 if (flag_stack_usage_info)
2252 current_function_static_stack_size = cfun->machine->frame.frame_size;
2254 frame_size = cfun->machine->frame.frame_size;
2255 offset = cfun->machine->frame.frame_size;
2257 fp_offset = cfun->machine->frame.frame_size
2258 - cfun->machine->frame.hard_fp_offset;
2260 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2261 if (offset >= 512)
2263 /* When the frame has a large size, an initial decrease is done on
2264 the stack pointer to jump over the callee-allocated save area for
2265 register varargs, the local variable area and/or the callee-saved
2266 register area. This will allow the pre-index write-back
2267 store pair instructions to be used for setting up the stack frame
2268 efficiently. */
2269 offset = cfun->machine->frame.hard_fp_offset;
2270 if (offset >= 512)
2271 offset = cfun->machine->frame.saved_regs_size;
2273 frame_size -= (offset + crtl->outgoing_args_size);
2274 fp_offset = 0;
2276 if (frame_size >= 0x1000000)
2278 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2279 emit_move_insn (op0, GEN_INT (-frame_size));
2280 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2281 aarch64_set_frame_expr (gen_rtx_SET
2282 (Pmode, stack_pointer_rtx,
2283 plus_constant (Pmode,
2284 stack_pointer_rtx,
2285 -frame_size)));
2287 else if (frame_size > 0)
2289 if ((frame_size & 0xfff) != frame_size)
2291 insn = emit_insn (gen_add2_insn
2292 (stack_pointer_rtx,
2293 GEN_INT (-(frame_size
2294 & ~(HOST_WIDE_INT)0xfff))));
2295 RTX_FRAME_RELATED_P (insn) = 1;
2297 if ((frame_size & 0xfff) != 0)
2299 insn = emit_insn (gen_add2_insn
2300 (stack_pointer_rtx,
2301 GEN_INT (-(frame_size
2302 & (HOST_WIDE_INT)0xfff))));
2303 RTX_FRAME_RELATED_P (insn) = 1;
2307 else
2308 frame_size = -1;
2310 if (offset > 0)
2312 bool skip_wb = false;
2314 if (frame_pointer_needed)
2316 skip_wb = true;
2318 if (fp_offset)
2320 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2321 GEN_INT (-offset)));
2322 RTX_FRAME_RELATED_P (insn) = 1;
2323 aarch64_set_frame_expr (gen_rtx_SET
2324 (Pmode, stack_pointer_rtx,
2325 gen_rtx_MINUS (Pmode, stack_pointer_rtx,
2326 GEN_INT (offset))));
2328 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2329 R30_REGNUM, false);
2331 else
2332 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2334 /* Set up frame pointer to point to the location of the
2335 previous frame pointer on the stack. */
2336 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2337 stack_pointer_rtx,
2338 GEN_INT (fp_offset)));
2339 aarch64_set_frame_expr (gen_rtx_SET
2340 (Pmode, hard_frame_pointer_rtx,
2341 plus_constant (Pmode,
2342 stack_pointer_rtx,
2343 fp_offset)));
2344 RTX_FRAME_RELATED_P (insn) = 1;
2345 insn = emit_insn (gen_stack_tie (stack_pointer_rtx,
2346 hard_frame_pointer_rtx));
2348 else
2350 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2351 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2353 if (fp_offset
2354 || reg1 == FIRST_PSEUDO_REGISTER
2355 || (reg2 == FIRST_PSEUDO_REGISTER
2356 && offset >= 256))
2358 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2359 GEN_INT (-offset)));
2360 RTX_FRAME_RELATED_P (insn) = 1;
2362 else
2364 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2366 skip_wb = true;
2368 if (reg2 == FIRST_PSEUDO_REGISTER)
2369 aarch64_pushwb_single_reg (mode1, reg1, offset);
2370 else
2371 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2375 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2376 skip_wb);
2377 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2378 skip_wb);
2381 /* When offset >= 512,
2382 sub sp, sp, #<outgoing_args_size> */
2383 if (frame_size > -1)
2385 if (crtl->outgoing_args_size > 0)
2387 insn = emit_insn (gen_add2_insn
2388 (stack_pointer_rtx,
2389 GEN_INT (- crtl->outgoing_args_size)));
2390 RTX_FRAME_RELATED_P (insn) = 1;
2395 /* Generate the epilogue instructions for returning from a function. */
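/* Illustrative sketch (not from the original source): mirroring the prologue
   comment above, a small frame is typically torn down with something like

       ldp  x19, x20, [sp, #16]        // restore remaining callee-saves
       ldp  x29, x30, [sp], #<frame_size>
       ret

   while large frames first restore SP from the frame pointer and then add
   the residual frame size back with one or two immediates.  */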
2396 void
2397 aarch64_expand_epilogue (bool for_sibcall)
2399 HOST_WIDE_INT frame_size, offset;
2400 HOST_WIDE_INT fp_offset;
2401 rtx insn;
2402 rtx cfa_reg;
2404 aarch64_layout_frame ();
2406 offset = frame_size = cfun->machine->frame.frame_size;
2407 fp_offset = cfun->machine->frame.frame_size
2408 - cfun->machine->frame.hard_fp_offset;
2410 cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx;
2412 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2413 if (offset >= 512)
2415 offset = cfun->machine->frame.hard_fp_offset;
2416 if (offset >= 512)
2417 offset = cfun->machine->frame.saved_regs_size;
2419 frame_size -= (offset + crtl->outgoing_args_size);
2420 fp_offset = 0;
2421 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2423 insn = emit_insn (gen_add2_insn
2424 (stack_pointer_rtx,
2425 GEN_INT (crtl->outgoing_args_size)));
2426 RTX_FRAME_RELATED_P (insn) = 1;
2429 else
2430 frame_size = -1;
2432 /* If there were outgoing arguments or we've done dynamic stack
2433 allocation, then restore the stack pointer from the frame
2434 pointer. This is at most one insn and more efficient than using
2435 GCC's internal mechanism. */
2436 if (frame_pointer_needed
2437 && (crtl->outgoing_args_size || cfun->calls_alloca))
2439 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2440 hard_frame_pointer_rtx,
2441 GEN_INT (0)));
2442 offset = offset - fp_offset;
2443 RTX_FRAME_RELATED_P (insn) = 1;
2444 /* As SP is set to (FP - fp_offset), according to the rules in
2445 dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
2446 from the value of SP from now on. */
2447 cfa_reg = stack_pointer_rtx;
2450 if (offset > 0)
2452 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2453 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2454 bool skip_wb = true;
2456 if (frame_pointer_needed)
2457 fp_offset = 0;
2458 else if (fp_offset
2459 || reg1 == FIRST_PSEUDO_REGISTER
2460 || (reg2 == FIRST_PSEUDO_REGISTER
2461 && offset >= 256))
2462 skip_wb = false;
2464 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2465 skip_wb);
2466 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2467 skip_wb);
2469 if (skip_wb)
2471 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2473 if (reg2 == FIRST_PSEUDO_REGISTER)
2474 aarch64_popwb_single_reg (mode1, reg1, offset);
2475 else
2477 if (reg1 != HARD_FRAME_POINTER_REGNUM)
2478 cfa_reg = NULL;
2480 aarch64_popwb_pair_reg (mode1, reg1, reg2, offset, cfa_reg);
2483 else
2485 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2486 GEN_INT (offset)));
2487 RTX_FRAME_RELATED_P (insn) = 1;
2491 /* Stack adjustment for exception handler. */
2492 if (crtl->calls_eh_return)
2494 /* We need to unwind the stack by the offset computed by
2495 EH_RETURN_STACKADJ_RTX. However, at this point the CFA is
2496 based on SP. Ideally we would update the SP and define the
2497 CFA along the lines of:
2499 SP = SP + EH_RETURN_STACKADJ_RTX
2500 (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
2502 However the dwarf emitter only understands a constant
2503 register offset.
2505 The solution chosen here is to use the otherwise unused IP0
2506 as a temporary register to hold the current SP value. The
2507 CFA is described using IP0 then SP is modified. */
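/* For illustration only (not emitted verbatim here), the sequence below is
   roughly

       mov  x16, sp                       // IP0 copy of SP; CFA now via x16
       add  sp, sp, <EH_RETURN_STACKADJ_RTX>

   followed by a USE of x16 so that the copy is not deleted.  */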
2509 rtx ip0 = gen_rtx_REG (DImode, IP0_REGNUM);
2511 insn = emit_move_insn (ip0, stack_pointer_rtx);
2512 add_reg_note (insn, REG_CFA_DEF_CFA, ip0);
2513 RTX_FRAME_RELATED_P (insn) = 1;
2515 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2517 /* Ensure the assignment to IP0 does not get optimized away. */
2518 emit_use (ip0);
2521 if (frame_size > -1)
2523 if (frame_size >= 0x1000000)
2525 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2526 emit_move_insn (op0, GEN_INT (frame_size));
2527 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2528 aarch64_set_frame_expr (gen_rtx_SET
2529 (Pmode, stack_pointer_rtx,
2530 plus_constant (Pmode,
2531 stack_pointer_rtx,
2532 frame_size)));
2534 else if (frame_size > 0)
2536 if ((frame_size & 0xfff) != 0)
2538 insn = emit_insn (gen_add2_insn
2539 (stack_pointer_rtx,
2540 GEN_INT ((frame_size
2541 & (HOST_WIDE_INT) 0xfff))));
2542 RTX_FRAME_RELATED_P (insn) = 1;
2544 if ((frame_size & 0xfff) != frame_size)
2546 insn = emit_insn (gen_add2_insn
2547 (stack_pointer_rtx,
2548 GEN_INT ((frame_size
2549 & ~ (HOST_WIDE_INT) 0xfff))));
2550 RTX_FRAME_RELATED_P (insn) = 1;
2554 aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx,
2555 plus_constant (Pmode,
2556 stack_pointer_rtx,
2557 offset)));
2560 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2561 if (!for_sibcall)
2562 emit_jump_insn (ret_rtx);
2565 /* Return the place to copy the exception unwinding return address to.
2566 This will probably be a stack slot, but could (in theory) be the
2567 return register. */
2568 rtx
2569 aarch64_final_eh_return_addr (void)
2571 HOST_WIDE_INT fp_offset;
2573 aarch64_layout_frame ();
2575 fp_offset = cfun->machine->frame.frame_size
2576 - cfun->machine->frame.hard_fp_offset;
2578 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2579 return gen_rtx_REG (DImode, LR_REGNUM);
2581 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2582 result in a store to save LR introduced by builtin_eh_return () being
2583 incorrectly deleted because the alias is not detected.
2584 So in the calculation of the address to copy the exception unwinding
2585 return address to, we note 2 cases.
2586 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2587 we return a SP-relative location since all the addresses are SP-relative
2588 in this case. This prevents the store from being optimized away.
2589 If the fp_offset is not 0, then the addresses will be FP-relative and
2590 therefore we return a FP-relative location. */
2592 if (frame_pointer_needed)
2594 if (fp_offset)
2595 return gen_frame_mem (DImode,
2596 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2597 else
2598 return gen_frame_mem (DImode,
2599 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2602 /* If FP is not needed, we calculate the location of LR, which would be
2603 at the top of the saved registers block. */
2605 return gen_frame_mem (DImode,
2606 plus_constant (Pmode,
2607 stack_pointer_rtx,
2608 fp_offset
2609 + cfun->machine->frame.saved_regs_size
2610 - 2 * UNITS_PER_WORD));
2613 /* Possibly output code to build up a constant in a register. For
2614 the benefit of the costs infrastructure, returns the number of
2615 instructions which would be emitted. GENERATE inhibits or
2616 enables code generation. */
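/* Worked example (illustrative, not part of the original comment): for
   VAL == 0xffffffff00001234 the upper 16-bit chunks are 0x0000, 0xffff and
   0xffff, so ncount (1) < zcount (2) and the MOVN-based sequence wins:

       movn  xN, #0xedcb            // xN = 0xffffffffffff1234
       movk  xN, #0x0000, lsl #16   // clear the second chunk

   giving a return value of 2 instructions.  */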
2618 static int
2619 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2621 int insns = 0;
2623 if (aarch64_bitmask_imm (val, DImode))
2625 if (generate)
2626 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2627 insns = 1;
2629 else
2631 int i;
2632 int ncount = 0;
2633 int zcount = 0;
2634 HOST_WIDE_INT valp = val >> 16;
2635 HOST_WIDE_INT valm;
2636 HOST_WIDE_INT tval;
2638 for (i = 16; i < 64; i += 16)
2640 valm = (valp & 0xffff);
2642 if (valm != 0)
2643 ++ zcount;
2645 if (valm != 0xffff)
2646 ++ ncount;
2648 valp >>= 16;
2651 /* zcount contains the number of additional MOVK instructions
2652 required if the constant is built up with an initial MOVZ instruction,
2653 while ncount is the number of MOVK instructions required if starting
2654 with a MOVN instruction. Choose the sequence that yields the fewest
2655 number of instructions, preferring MOVZ instructions when they are both
2656 the same. */
2657 if (ncount < zcount)
2659 if (generate)
2660 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2661 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2662 tval = 0xffff;
2663 insns++;
2665 else
2667 if (generate)
2668 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2669 GEN_INT (val & 0xffff));
2670 tval = 0;
2671 insns++;
2674 val >>= 16;
2676 for (i = 16; i < 64; i += 16)
2678 if ((val & 0xffff) != tval)
2680 if (generate)
2681 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2682 GEN_INT (i),
2683 GEN_INT (val & 0xffff)));
2684 insns++;
2686 val >>= 16;
2689 return insns;
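/* Add DELTA to the register REGNUM, using SCRATCHREG as a temporary when the
   constant cannot be expressed with (shifted) 12-bit immediates.  Worked
   example (illustrative, hypothetical register names): for DELTA == 9029
   the code below emits roughly

       mov  x<scratch>, #2
       add  x<regnum>, x<regnum>, x<scratch>, lsl #12   // + 8192
       add  x<regnum>, x<regnum>, #837                  // + 837 = 9029  */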
2692 static void
2693 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2695 HOST_WIDE_INT mdelta = delta;
2696 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2697 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2699 if (mdelta < 0)
2700 mdelta = -mdelta;
2702 if (mdelta >= 4096 * 4096)
2704 (void) aarch64_build_constant (scratchreg, delta, true);
2705 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2707 else if (mdelta > 0)
2709 if (mdelta >= 4096)
2711 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2712 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2713 if (delta < 0)
2714 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2715 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2716 else
2717 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2718 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2720 if (mdelta % 4096 != 0)
2722 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2723 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2724 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2729 /* Output code to add DELTA to the first argument, and then jump
2730 to FUNCTION. Used for C++ multiple inheritance. */
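/* Illustrative example (hypothetical): for a thunk with DELTA == 8 and
   VCALL_OFFSET == 0 the code below produces roughly

       add  x0, x0, #8
       b    <function>

   i.e. the this pointer is adjusted in place and control tail-calls the
   target.  */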
2731 static void
2732 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2733 HOST_WIDE_INT delta,
2734 HOST_WIDE_INT vcall_offset,
2735 tree function)
2737 /* The this pointer is always in x0. Note that this differs from
2738 Arm where the this pointer may be bumped to r1 if r0 is required
2739 to return a pointer to an aggregate. On AArch64 a result value
2740 pointer will be in x8. */
2741 int this_regno = R0_REGNUM;
2742 rtx this_rtx, temp0, temp1, addr, insn, funexp;
2744 reload_completed = 1;
2745 emit_note (NOTE_INSN_PROLOGUE_END);
2747 if (vcall_offset == 0)
2748 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2749 else
2751 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2753 this_rtx = gen_rtx_REG (Pmode, this_regno);
2754 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2755 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2757 addr = this_rtx;
2758 if (delta != 0)
2760 if (delta >= -256 && delta < 256)
2761 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2762 plus_constant (Pmode, this_rtx, delta));
2763 else
2764 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2767 if (Pmode == ptr_mode)
2768 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2769 else
2770 aarch64_emit_move (temp0,
2771 gen_rtx_ZERO_EXTEND (Pmode,
2772 gen_rtx_MEM (ptr_mode, addr)));
2774 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2775 addr = plus_constant (Pmode, temp0, vcall_offset);
2776 else
2778 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2779 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2782 if (Pmode == ptr_mode)
2783 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2784 else
2785 aarch64_emit_move (temp1,
2786 gen_rtx_SIGN_EXTEND (Pmode,
2787 gen_rtx_MEM (ptr_mode, addr)));
2789 emit_insn (gen_add2_insn (this_rtx, temp1));
2792 /* Generate a tail call to the target function. */
2793 if (!TREE_USED (function))
2795 assemble_external (function);
2796 TREE_USED (function) = 1;
2798 funexp = XEXP (DECL_RTL (function), 0);
2799 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2800 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2801 SIBLING_CALL_P (insn) = 1;
2803 insn = get_insns ();
2804 shorten_branches (insn);
2805 final_start_function (insn, file, 1);
2806 final (insn, file, 1);
2807 final_end_function ();
2809 /* Stop pretending to be a post-reload pass. */
2810 reload_completed = 0;
2813 static int
2814 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2816 if (GET_CODE (*x) == SYMBOL_REF)
2817 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2819 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2820 TLS offsets, not real symbol references. */
2821 if (GET_CODE (*x) == UNSPEC
2822 && XINT (*x, 1) == UNSPEC_TLS)
2823 return -1;
2825 return 0;
2828 static bool
2829 aarch64_tls_referenced_p (rtx x)
2831 if (!TARGET_HAVE_TLS)
2832 return false;
2834 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2838 static int
2839 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2841 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2842 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2844 if (*imm1 < *imm2)
2845 return -1;
2846 if (*imm1 > *imm2)
2847 return +1;
2848 return 0;
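/* Build the table of valid "bitmask" (logical) immediates.  A valid value is
   a run of S consecutive set bits, rotated by R, replicated across the
   register in elements of size E (2, 4, ..., 64 bits).  Worked example
   (illustrative): 0x00ff00ff00ff00ff is eight set bits in a 16-bit element
   replicated four times, so it appears in the table; 0x0000000000001234 has
   no such structure and does not.  */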
2852 static void
2853 aarch64_build_bitmask_table (void)
2855 unsigned HOST_WIDE_INT mask, imm;
2856 unsigned int log_e, e, s, r;
2857 unsigned int nimms = 0;
2859 for (log_e = 1; log_e <= 6; log_e++)
2861 e = 1 << log_e;
2862 if (e == 64)
2863 mask = ~(HOST_WIDE_INT) 0;
2864 else
2865 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2866 for (s = 1; s < e; s++)
2868 for (r = 0; r < e; r++)
2870 /* set s consecutive bits to 1 (s < 64) */
2871 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2872 /* rotate right by r */
2873 if (r != 0)
2874 imm = ((imm >> r) | (imm << (e - r))) & mask;
2875 /* replicate the constant depending on SIMD size */
2876 switch (log_e) {
2877 case 1: imm |= (imm << 2);
2878 case 2: imm |= (imm << 4);
2879 case 3: imm |= (imm << 8);
2880 case 4: imm |= (imm << 16);
2881 case 5: imm |= (imm << 32);
2882 case 6:
2883 break;
2884 default:
2885 gcc_unreachable ();
2887 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2888 aarch64_bitmasks[nimms++] = imm;
2893 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2894 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2895 aarch64_bitmasks_cmp);
2899 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2900 a left shift of 0 or 12 bits. */
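/* For example (illustrative): 0xabc and 0xabc000 both satisfy this test,
   while 0x1001 does not, because its set bits straddle the two 12-bit
   windows.  */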
2901 bool
2902 aarch64_uimm12_shift (HOST_WIDE_INT val)
2904 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2905 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
2906 );
2910 /* Return true if val is an immediate that can be loaded into a
2911 register by a MOVZ instruction. */
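/* Illustrative examples: for DImode, 0x12340000 (a 16-bit chunk at bit 16)
   and 0xffff000000000000 (a chunk at bit 48) are accepted, whereas
   0x12340001 is not, since it needs more than one 16-bit chunk.  */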
2912 static bool
2913 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2915 if (GET_MODE_SIZE (mode) > 4)
2917 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2918 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2919 return 1;
2921 else
2923 /* Ignore sign extension. */
2924 val &= (HOST_WIDE_INT) 0xffffffff;
2926 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2927 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2931 /* Return true if val is a valid bitmask immediate. */
2932 bool
2933 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2935 if (GET_MODE_SIZE (mode) < 8)
2937 /* Replicate bit pattern. */
2938 val &= (HOST_WIDE_INT) 0xffffffff;
2939 val |= val << 32;
2941 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2942 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2946 /* Return true if val is an immediate that can be loaded into a
2947 register in a single instruction. */
2948 bool
2949 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2951 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2952 return 1;
2953 return aarch64_bitmask_imm (val, mode);
2956 static bool
2957 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2959 rtx base, offset;
2961 if (GET_CODE (x) == HIGH)
2962 return true;
2964 split_const (x, &base, &offset);
2965 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2967 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2968 != SYMBOL_FORCE_TO_MEM)
2969 return true;
2970 else
2971 /* Avoid generating a 64-bit relocation in ILP32; leave
2972 to aarch64_expand_mov_immediate to handle it properly. */
2973 return mode != ptr_mode;
2976 return aarch64_tls_referenced_p (x);
2979 /* Return true if register REGNO is a valid index register.
2980 STRICT_P is true if REG_OK_STRICT is in effect. */
2982 bool
2983 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2985 if (!HARD_REGISTER_NUM_P (regno))
2987 if (!strict_p)
2988 return true;
2990 if (!reg_renumber)
2991 return false;
2993 regno = reg_renumber[regno];
2995 return GP_REGNUM_P (regno);
2998 /* Return true if register REGNO is a valid base register.
2999 STRICT_P is true if REG_OK_STRICT is in effect. */
3001 bool
3002 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3004 if (!HARD_REGISTER_NUM_P (regno))
3006 if (!strict_p)
3007 return true;
3009 if (!reg_renumber)
3010 return false;
3012 regno = reg_renumber[regno];
3015 /* The fake registers will be eliminated to either the stack or
3016 hard frame pointer, both of which are usually valid base registers.
3017 Reload deals with the cases where the eliminated form isn't valid. */
3018 return (GP_REGNUM_P (regno)
3019 || regno == SP_REGNUM
3020 || regno == FRAME_POINTER_REGNUM
3021 || regno == ARG_POINTER_REGNUM);
3024 /* Return true if X is a valid base register.
3025 STRICT_P is true if REG_OK_STRICT is in effect. */
3027 static bool
3028 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3030 if (!strict_p && GET_CODE (x) == SUBREG)
3031 x = SUBREG_REG (x);
3033 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3036 /* Return true if address offset is a valid index. If it is, fill in INFO
3037 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
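/* The recognised forms correspond (illustratively) to index operands such as
   [base, x1], [base, x1, lsl #3], [base, w1, sxtw] and [base, w1, uxtw #2];
   the shift amount, if any, must match the access size of MODE.  */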
3039 static bool
3040 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3041 enum machine_mode mode, bool strict_p)
3043 enum aarch64_address_type type;
3044 rtx index;
3045 int shift;
3047 /* (reg:P) */
3048 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3049 && GET_MODE (x) == Pmode)
3051 type = ADDRESS_REG_REG;
3052 index = x;
3053 shift = 0;
3055 /* (sign_extend:DI (reg:SI)) */
3056 else if ((GET_CODE (x) == SIGN_EXTEND
3057 || GET_CODE (x) == ZERO_EXTEND)
3058 && GET_MODE (x) == DImode
3059 && GET_MODE (XEXP (x, 0)) == SImode)
3061 type = (GET_CODE (x) == SIGN_EXTEND)
3062 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3063 index = XEXP (x, 0);
3064 shift = 0;
3066 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3067 else if (GET_CODE (x) == MULT
3068 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3069 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3070 && GET_MODE (XEXP (x, 0)) == DImode
3071 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3072 && CONST_INT_P (XEXP (x, 1)))
3074 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3075 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3076 index = XEXP (XEXP (x, 0), 0);
3077 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3079 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3080 else if (GET_CODE (x) == ASHIFT
3081 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3082 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3083 && GET_MODE (XEXP (x, 0)) == DImode
3084 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3085 && CONST_INT_P (XEXP (x, 1)))
3087 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3088 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3089 index = XEXP (XEXP (x, 0), 0);
3090 shift = INTVAL (XEXP (x, 1));
3092 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3093 else if ((GET_CODE (x) == SIGN_EXTRACT
3094 || GET_CODE (x) == ZERO_EXTRACT)
3095 && GET_MODE (x) == DImode
3096 && GET_CODE (XEXP (x, 0)) == MULT
3097 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3098 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3100 type = (GET_CODE (x) == SIGN_EXTRACT)
3101 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3102 index = XEXP (XEXP (x, 0), 0);
3103 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3104 if (INTVAL (XEXP (x, 1)) != 32 + shift
3105 || INTVAL (XEXP (x, 2)) != 0)
3106 shift = -1;
3108 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3109 (const_int 0xffffffff<<shift)) */
3110 else if (GET_CODE (x) == AND
3111 && GET_MODE (x) == DImode
3112 && GET_CODE (XEXP (x, 0)) == MULT
3113 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3114 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3115 && CONST_INT_P (XEXP (x, 1)))
3117 type = ADDRESS_REG_UXTW;
3118 index = XEXP (XEXP (x, 0), 0);
3119 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3120 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3121 shift = -1;
3123 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3124 else if ((GET_CODE (x) == SIGN_EXTRACT
3125 || GET_CODE (x) == ZERO_EXTRACT)
3126 && GET_MODE (x) == DImode
3127 && GET_CODE (XEXP (x, 0)) == ASHIFT
3128 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3129 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3131 type = (GET_CODE (x) == SIGN_EXTRACT)
3132 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3133 index = XEXP (XEXP (x, 0), 0);
3134 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3135 if (INTVAL (XEXP (x, 1)) != 32 + shift
3136 || INTVAL (XEXP (x, 2)) != 0)
3137 shift = -1;
3139 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3140 (const_int 0xffffffff<<shift)) */
3141 else if (GET_CODE (x) == AND
3142 && GET_MODE (x) == DImode
3143 && GET_CODE (XEXP (x, 0)) == ASHIFT
3144 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3145 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3146 && CONST_INT_P (XEXP (x, 1)))
3148 type = ADDRESS_REG_UXTW;
3149 index = XEXP (XEXP (x, 0), 0);
3150 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3151 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3152 shift = -1;
3154 /* (mult:P (reg:P) (const_int scale)) */
3155 else if (GET_CODE (x) == MULT
3156 && GET_MODE (x) == Pmode
3157 && GET_MODE (XEXP (x, 0)) == Pmode
3158 && CONST_INT_P (XEXP (x, 1)))
3160 type = ADDRESS_REG_REG;
3161 index = XEXP (x, 0);
3162 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3164 /* (ashift:P (reg:P) (const_int shift)) */
3165 else if (GET_CODE (x) == ASHIFT
3166 && GET_MODE (x) == Pmode
3167 && GET_MODE (XEXP (x, 0)) == Pmode
3168 && CONST_INT_P (XEXP (x, 1)))
3170 type = ADDRESS_REG_REG;
3171 index = XEXP (x, 0);
3172 shift = INTVAL (XEXP (x, 1));
3174 else
3175 return false;
3177 if (GET_CODE (index) == SUBREG)
3178 index = SUBREG_REG (index);
3180 if ((shift == 0 ||
3181 (shift > 0 && shift <= 3
3182 && (1 << shift) == GET_MODE_SIZE (mode)))
3183 && REG_P (index)
3184 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3186 info->type = type;
3187 info->offset = index;
3188 info->shift = shift;
3189 return true;
3192 return false;
3195 static inline bool
3196 offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3198 return (offset >= -64 * GET_MODE_SIZE (mode)
3199 && offset < 64 * GET_MODE_SIZE (mode)
3200 && offset % GET_MODE_SIZE (mode) == 0);
3203 static inline bool
3204 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3205 HOST_WIDE_INT offset)
3207 return offset >= -256 && offset < 256;
3210 static inline bool
3211 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3213 return (offset >= 0
3214 && offset < 4096 * GET_MODE_SIZE (mode)
3215 && offset % GET_MODE_SIZE (mode) == 0);
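/* For DImode accesses (8 bytes) these predicates correspond, illustratively,
   to offsets of -512..504 in steps of 8 (7-bit signed, scaled, as used by
   LDP/STP), -256..255 (9-bit signed, unscaled) and 0..32760 in steps of 8
   (12-bit unsigned, scaled).  */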
3218 /* Return true if X is a valid address for machine mode MODE. If it is,
3219 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3220 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3222 static bool
3223 aarch64_classify_address (struct aarch64_address_info *info,
3224 rtx x, enum machine_mode mode,
3225 RTX_CODE outer_code, bool strict_p)
3227 enum rtx_code code = GET_CODE (x);
3228 rtx op0, op1;
3229 bool allow_reg_index_p =
3230 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3231 || aarch64_vector_mode_supported_p (mode));
3232 /* Don't support anything other than POST_INC or REG addressing for
3233 AdvSIMD. */
3234 if (aarch64_vect_struct_mode_p (mode)
3235 && (code != POST_INC && code != REG))
3236 return false;
3238 switch (code)
3240 case REG:
3241 case SUBREG:
3242 info->type = ADDRESS_REG_IMM;
3243 info->base = x;
3244 info->offset = const0_rtx;
3245 return aarch64_base_register_rtx_p (x, strict_p);
3247 case PLUS:
3248 op0 = XEXP (x, 0);
3249 op1 = XEXP (x, 1);
3250 if (GET_MODE_SIZE (mode) != 0
3251 && CONST_INT_P (op1)
3252 && aarch64_base_register_rtx_p (op0, strict_p))
3254 HOST_WIDE_INT offset = INTVAL (op1);
3256 info->type = ADDRESS_REG_IMM;
3257 info->base = op0;
3258 info->offset = op1;
3260 /* TImode and TFmode values are allowed in both pairs of X
3261 registers and individual Q registers. The available
3262 address modes are:
3263 X,X: 7-bit signed scaled offset
3264 Q: 9-bit signed offset
3265 We conservatively require an offset representable in either mode.
3266 */
3267 if (mode == TImode || mode == TFmode)
3268 return (offset_7bit_signed_scaled_p (mode, offset)
3269 && offset_9bit_signed_unscaled_p (mode, offset));
3271 if (outer_code == PARALLEL)
3272 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3273 && offset_7bit_signed_scaled_p (mode, offset));
3274 else
3275 return (offset_9bit_signed_unscaled_p (mode, offset)
3276 || offset_12bit_unsigned_scaled_p (mode, offset));
3279 if (allow_reg_index_p)
3281 /* Look for base + (scaled/extended) index register. */
3282 if (aarch64_base_register_rtx_p (op0, strict_p)
3283 && aarch64_classify_index (info, op1, mode, strict_p))
3285 info->base = op0;
3286 return true;
3288 if (aarch64_base_register_rtx_p (op1, strict_p)
3289 && aarch64_classify_index (info, op0, mode, strict_p))
3291 info->base = op1;
3292 return true;
3296 return false;
3298 case POST_INC:
3299 case POST_DEC:
3300 case PRE_INC:
3301 case PRE_DEC:
3302 info->type = ADDRESS_REG_WB;
3303 info->base = XEXP (x, 0);
3304 info->offset = NULL_RTX;
3305 return aarch64_base_register_rtx_p (info->base, strict_p);
3307 case POST_MODIFY:
3308 case PRE_MODIFY:
3309 info->type = ADDRESS_REG_WB;
3310 info->base = XEXP (x, 0);
3311 if (GET_CODE (XEXP (x, 1)) == PLUS
3312 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3313 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3314 && aarch64_base_register_rtx_p (info->base, strict_p))
3316 HOST_WIDE_INT offset;
3317 info->offset = XEXP (XEXP (x, 1), 1);
3318 offset = INTVAL (info->offset);
3320 /* TImode and TFmode values are allowed in both pairs of X
3321 registers and individual Q registers. The available
3322 address modes are:
3323 X,X: 7-bit signed scaled offset
3324 Q: 9-bit signed offset
3325 We conservatively require an offset representable in either mode.
3326 */
3327 if (mode == TImode || mode == TFmode)
3328 return (offset_7bit_signed_scaled_p (mode, offset)
3329 && offset_9bit_signed_unscaled_p (mode, offset));
3331 if (outer_code == PARALLEL)
3332 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3333 && offset_7bit_signed_scaled_p (mode, offset));
3334 else
3335 return offset_9bit_signed_unscaled_p (mode, offset);
3337 return false;
3339 case CONST:
3340 case SYMBOL_REF:
3341 case LABEL_REF:
3342 /* load literal: pc-relative constant pool entry. Only supported
3343 for SI mode or larger. */
3344 info->type = ADDRESS_SYMBOLIC;
3345 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3347 rtx sym, addend;
3349 split_const (x, &sym, &addend);
3350 return (GET_CODE (sym) == LABEL_REF
3351 || (GET_CODE (sym) == SYMBOL_REF
3352 && CONSTANT_POOL_ADDRESS_P (sym)));
3354 return false;
3356 case LO_SUM:
3357 info->type = ADDRESS_LO_SUM;
3358 info->base = XEXP (x, 0);
3359 info->offset = XEXP (x, 1);
3360 if (allow_reg_index_p
3361 && aarch64_base_register_rtx_p (info->base, strict_p))
3363 rtx sym, offs;
3364 split_const (info->offset, &sym, &offs);
3365 if (GET_CODE (sym) == SYMBOL_REF
3366 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3367 == SYMBOL_SMALL_ABSOLUTE))
3369 /* The symbol and offset must be aligned to the access size. */
3370 unsigned int align;
3371 unsigned int ref_size;
3373 if (CONSTANT_POOL_ADDRESS_P (sym))
3374 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3375 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3377 tree exp = SYMBOL_REF_DECL (sym);
3378 align = TYPE_ALIGN (TREE_TYPE (exp));
3379 align = CONSTANT_ALIGNMENT (exp, align);
3381 else if (SYMBOL_REF_DECL (sym))
3382 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3383 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3384 && SYMBOL_REF_BLOCK (sym) != NULL)
3385 align = SYMBOL_REF_BLOCK (sym)->alignment;
3386 else
3387 align = BITS_PER_UNIT;
3389 ref_size = GET_MODE_SIZE (mode);
3390 if (ref_size == 0)
3391 ref_size = GET_MODE_SIZE (DImode);
3393 return ((INTVAL (offs) & (ref_size - 1)) == 0
3394 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3397 return false;
3399 default:
3400 return false;
3404 bool
3405 aarch64_symbolic_address_p (rtx x)
3407 rtx offset;
3409 split_const (x, &x, &offset);
3410 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3413 /* Classify the base of symbolic expression X, given that X appears in
3414 context CONTEXT. */
3416 enum aarch64_symbol_type
3417 aarch64_classify_symbolic_expression (rtx x,
3418 enum aarch64_symbol_context context)
3420 rtx offset;
3422 split_const (x, &x, &offset);
3423 return aarch64_classify_symbol (x, context);
3427 /* Return TRUE if X is a legitimate address for accessing memory in
3428 mode MODE. */
3429 static bool
3430 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3432 struct aarch64_address_info addr;
3434 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3437 /* Return TRUE if X is a legitimate address for accessing memory in
3438 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3439 pair operation. */
3440 bool
3441 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3442 RTX_CODE outer_code, bool strict_p)
3444 struct aarch64_address_info addr;
3446 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3449 /* Return TRUE if rtx X is immediate constant 0.0 */
3450 bool
3451 aarch64_float_const_zero_rtx_p (rtx x)
3453 REAL_VALUE_TYPE r;
3455 if (GET_MODE (x) == VOIDmode)
3456 return false;
3458 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3459 if (REAL_VALUE_MINUS_ZERO (r))
3460 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3461 return REAL_VALUES_EQUAL (r, dconst0);
3464 /* Return the fixed registers used for condition codes. */
3466 static bool
3467 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3469 *p1 = CC_REGNUM;
3470 *p2 = INVALID_REGNUM;
3471 return true;
3474 enum machine_mode
3475 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3477 /* Floating point comparisons return CCFPmode, except for the ordered
3478 comparisons LT, LE, GT and GE, which return CCFPEmode. */
3479 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3481 switch (code)
3483 case EQ:
3484 case NE:
3485 case UNORDERED:
3486 case ORDERED:
3487 case UNLT:
3488 case UNLE:
3489 case UNGT:
3490 case UNGE:
3491 case UNEQ:
3492 case LTGT:
3493 return CCFPmode;
3495 case LT:
3496 case LE:
3497 case GT:
3498 case GE:
3499 return CCFPEmode;
3501 default:
3502 gcc_unreachable ();
3506 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3507 && y == const0_rtx
3508 && (code == EQ || code == NE || code == LT || code == GE)
3509 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3510 || GET_CODE (x) == NEG))
3511 return CC_NZmode;
3513 /* A compare with a shifted operand. Because of canonicalization,
3514 the comparison will have to be swapped when we emit the assembly
3515 code. */
3516 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3517 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3518 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3519 || GET_CODE (x) == LSHIFTRT
3520 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3521 return CC_SWPmode;
3523 /* Similarly for a negated operand, but we can only do this for
3524 equalities. */
3525 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3526 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3527 && (code == EQ || code == NE)
3528 && GET_CODE (x) == NEG)
3529 return CC_Zmode;
3531 /* A compare of a mode narrower than SI mode against zero can be done
3532 by extending the value in the comparison. */
3533 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3534 && y == const0_rtx)
3535 /* Only use sign-extension if we really need it. */
3536 return ((code == GT || code == GE || code == LE || code == LT)
3537 ? CC_SESWPmode : CC_ZESWPmode);
3539 /* For everything else, return CCmode. */
3540 return CCmode;
3543 static unsigned
3544 aarch64_get_condition_code (rtx x)
3546 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3547 enum rtx_code comp_code = GET_CODE (x);
3549 if (GET_MODE_CLASS (mode) != MODE_CC)
3550 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3552 switch (mode)
3554 case CCFPmode:
3555 case CCFPEmode:
3556 switch (comp_code)
3558 case GE: return AARCH64_GE;
3559 case GT: return AARCH64_GT;
3560 case LE: return AARCH64_LS;
3561 case LT: return AARCH64_MI;
3562 case NE: return AARCH64_NE;
3563 case EQ: return AARCH64_EQ;
3564 case ORDERED: return AARCH64_VC;
3565 case UNORDERED: return AARCH64_VS;
3566 case UNLT: return AARCH64_LT;
3567 case UNLE: return AARCH64_LE;
3568 case UNGT: return AARCH64_HI;
3569 case UNGE: return AARCH64_PL;
3570 default: gcc_unreachable ();
3572 break;
3574 case CCmode:
3575 switch (comp_code)
3577 case NE: return AARCH64_NE;
3578 case EQ: return AARCH64_EQ;
3579 case GE: return AARCH64_GE;
3580 case GT: return AARCH64_GT;
3581 case LE: return AARCH64_LE;
3582 case LT: return AARCH64_LT;
3583 case GEU: return AARCH64_CS;
3584 case GTU: return AARCH64_HI;
3585 case LEU: return AARCH64_LS;
3586 case LTU: return AARCH64_CC;
3587 default: gcc_unreachable ();
3589 break;
3591 case CC_SWPmode:
3592 case CC_ZESWPmode:
3593 case CC_SESWPmode:
3594 switch (comp_code)
3596 case NE: return AARCH64_NE;
3597 case EQ: return AARCH64_EQ;
3598 case GE: return AARCH64_LE;
3599 case GT: return AARCH64_LT;
3600 case LE: return AARCH64_GE;
3601 case LT: return AARCH64_GT;
3602 case GEU: return AARCH64_LS;
3603 case GTU: return AARCH64_CC;
3604 case LEU: return AARCH64_CS;
3605 case LTU: return AARCH64_HI;
3606 default: gcc_unreachable ();
3608 break;
3610 case CC_NZmode:
3611 switch (comp_code)
3613 case NE: return AARCH64_NE;
3614 case EQ: return AARCH64_EQ;
3615 case GE: return AARCH64_PL;
3616 case LT: return AARCH64_MI;
3617 default: gcc_unreachable ();
3619 break;
3621 case CC_Zmode:
3622 switch (comp_code)
3624 case NE: return AARCH64_NE;
3625 case EQ: return AARCH64_EQ;
3626 default: gcc_unreachable ();
3628 break;
3630 default:
3631 gcc_unreachable ();
3632 break;
3636 static unsigned
3637 bit_count (unsigned HOST_WIDE_INT value)
3639 unsigned count = 0;
3641 while (value)
3643 count++;
3644 value &= value - 1;
3647 return count;
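/* Print operand X to file F under the control of the output modifier CODE.
   Summarising the cases handled below (for reference only): 'c' prints an
   integer or symbolic address without a leading '#'; 'w'/'x' print a 32- or
   64-bit general register (or wzr/xzr for zero); 'b', 'h', 's', 'd', 'q'
   print a scalar FP/SIMD register; 'm'/'M' print a condition or its inverse;
   'A', 'L' and 'G' print relocation prefixes such as :got: or :lo12:.  */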
3650 void
3651 aarch64_print_operand (FILE *f, rtx x, char code)
3653 switch (code)
3655 /* An integer or symbol address without a preceding # sign. */
3656 case 'c':
3657 switch (GET_CODE (x))
3659 case CONST_INT:
3660 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3661 break;
3663 case SYMBOL_REF:
3664 output_addr_const (f, x);
3665 break;
3667 case CONST:
3668 if (GET_CODE (XEXP (x, 0)) == PLUS
3669 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3671 output_addr_const (f, x);
3672 break;
3674 /* Fall through. */
3676 default:
3677 output_operand_lossage ("Unsupported operand for code '%c'", code);
3679 break;
3681 case 'e':
3682 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3684 int n;
3686 if (GET_CODE (x) != CONST_INT
3687 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3689 output_operand_lossage ("invalid operand for '%%%c'", code);
3690 return;
3693 switch (n)
3695 case 3:
3696 fputc ('b', f);
3697 break;
3698 case 4:
3699 fputc ('h', f);
3700 break;
3701 case 5:
3702 fputc ('w', f);
3703 break;
3704 default:
3705 output_operand_lossage ("invalid operand for '%%%c'", code);
3706 return;
3709 break;
3711 case 'p':
3713 int n;
3715 /* Print N such that 2^N == X. */
3716 if (GET_CODE (x) != CONST_INT || (n = exact_log2 (INTVAL (x))) < 0)
3718 output_operand_lossage ("invalid operand for '%%%c'", code);
3719 return;
3722 asm_fprintf (f, "%d", n);
3724 break;
3726 case 'P':
3727 /* Print the number of non-zero bits in X (a const_int). */
3728 if (GET_CODE (x) != CONST_INT)
3730 output_operand_lossage ("invalid operand for '%%%c'", code);
3731 return;
3734 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3735 break;
3737 case 'H':
3738 /* Print the higher numbered register of a pair (TImode) of regs. */
3739 if (GET_CODE (x) != REG || !GP_REGNUM_P (REGNO (x) + 1))
3741 output_operand_lossage ("invalid operand for '%%%c'", code);
3742 return;
3745 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3746 break;
3748 case 'm':
3749 /* Print a condition (eq, ne, etc). */
3751 /* CONST_TRUE_RTX means always -- that's the default. */
3752 if (x == const_true_rtx)
3753 return;
3755 if (!COMPARISON_P (x))
3757 output_operand_lossage ("invalid operand for '%%%c'", code);
3758 return;
3761 fputs (aarch64_condition_codes[aarch64_get_condition_code (x)], f);
3762 break;
3764 case 'M':
3765 /* Print the inverse of a condition (eq <-> ne, etc). */
3767 /* CONST_TRUE_RTX means never -- that's the default. */
3768 if (x == const_true_rtx)
3770 fputs ("nv", f);
3771 return;
3774 if (!COMPARISON_P (x))
3776 output_operand_lossage ("invalid operand for '%%%c'", code);
3777 return;
3780 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3781 (aarch64_get_condition_code (x))], f);
3782 break;
3784 case 'b':
3785 case 'h':
3786 case 's':
3787 case 'd':
3788 case 'q':
3789 /* Print a scalar FP/SIMD register name. */
3790 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3792 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3793 return;
3795 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3796 break;
3798 case 'S':
3799 case 'T':
3800 case 'U':
3801 case 'V':
3802 /* Print the first FP/SIMD register name in a list. */
3803 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3805 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3806 return;
3808 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3809 break;
3811 case 'X':
3812 /* Print bottom 16 bits of integer constant in hex. */
3813 if (GET_CODE (x) != CONST_INT)
3815 output_operand_lossage ("invalid operand for '%%%c'", code);
3816 return;
3818 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3819 break;
3821 case 'w':
3822 case 'x':
3823 /* Print a general register name or the zero register (32-bit or
3824 64-bit). */
3825 if (x == const0_rtx
3826 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3828 asm_fprintf (f, "%czr", code);
3829 break;
3832 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3834 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3835 break;
3838 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3840 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3841 break;
3844 /* Fall through */
3846 case 0:
3847 /* Print a normal operand, if it's a general register, then we
3848 assume DImode. */
3849 if (x == NULL)
3851 output_operand_lossage ("missing operand");
3852 return;
3855 switch (GET_CODE (x))
3857 case REG:
3858 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3859 break;
3861 case MEM:
3862 aarch64_memory_reference_mode = GET_MODE (x);
3863 output_address (XEXP (x, 0));
3864 break;
3866 case LABEL_REF:
3867 case SYMBOL_REF:
3868 output_addr_const (asm_out_file, x);
3869 break;
3871 case CONST_INT:
3872 asm_fprintf (f, "%wd", INTVAL (x));
3873 break;
3875 case CONST_VECTOR:
3876 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3878 gcc_assert (aarch64_const_vec_all_same_int_p (x,
3879 HOST_WIDE_INT_MIN,
3880 HOST_WIDE_INT_MAX));
3881 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3883 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3885 fputc ('0', f);
3887 else
3888 gcc_unreachable ();
3889 break;
3891 case CONST_DOUBLE:
3892 /* CONST_DOUBLE can represent a double-width integer.
3893 In this case, the mode of x is VOIDmode. */
3894 if (GET_MODE (x) == VOIDmode)
3895 ; /* Do Nothing. */
3896 else if (aarch64_float_const_zero_rtx_p (x))
3898 fputc ('0', f);
3899 break;
3901 else if (aarch64_float_const_representable_p (x))
3903 #define buf_size 20
3904 char float_buf[buf_size] = {'\0'};
3905 REAL_VALUE_TYPE r;
3906 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3907 real_to_decimal_for_mode (float_buf, &r,
3908 buf_size, buf_size,
3909 1, GET_MODE (x));
3910 asm_fprintf (asm_out_file, "%s", float_buf);
3911 break;
3912 #undef buf_size
3914 output_operand_lossage ("invalid constant");
3915 return;
3916 default:
3917 output_operand_lossage ("invalid operand");
3918 return;
3920 break;
3922 case 'A':
3923 if (GET_CODE (x) == HIGH)
3924 x = XEXP (x, 0);
3926 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3928 case SYMBOL_SMALL_GOT:
3929 asm_fprintf (asm_out_file, ":got:");
3930 break;
3932 case SYMBOL_SMALL_TLSGD:
3933 asm_fprintf (asm_out_file, ":tlsgd:");
3934 break;
3936 case SYMBOL_SMALL_TLSDESC:
3937 asm_fprintf (asm_out_file, ":tlsdesc:");
3938 break;
3940 case SYMBOL_SMALL_GOTTPREL:
3941 asm_fprintf (asm_out_file, ":gottprel:");
3942 break;
3944 case SYMBOL_SMALL_TPREL:
3945 asm_fprintf (asm_out_file, ":tprel:");
3946 break;
3948 case SYMBOL_TINY_GOT:
3949 gcc_unreachable ();
3950 break;
3952 default:
3953 break;
3955 output_addr_const (asm_out_file, x);
3956 break;
3958 case 'L':
3959 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3961 case SYMBOL_SMALL_GOT:
3962 asm_fprintf (asm_out_file, ":lo12:");
3963 break;
3965 case SYMBOL_SMALL_TLSGD:
3966 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3967 break;
3969 case SYMBOL_SMALL_TLSDESC:
3970 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3971 break;
3973 case SYMBOL_SMALL_GOTTPREL:
3974 asm_fprintf (asm_out_file, ":gottprel_lo12:");
3975 break;
3977 case SYMBOL_SMALL_TPREL:
3978 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3979 break;
3981 case SYMBOL_TINY_GOT:
3982 asm_fprintf (asm_out_file, ":got:");
3983 break;
3985 default:
3986 break;
3988 output_addr_const (asm_out_file, x);
3989 break;
3991 case 'G':
3993 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3995 case SYMBOL_SMALL_TPREL:
3996 asm_fprintf (asm_out_file, ":tprel_hi12:");
3997 break;
3998 default:
3999 break;
4001 output_addr_const (asm_out_file, x);
4002 break;
4004 default:
4005 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4006 return;
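/* Print the address X to F.  Typical outputs (illustrative) are "[x0]",
   "[x0, 16]", "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", "[x0, 16]!",
   "[x0], 16" and "[x0, #:lo12:sym]", matching the address kinds classified
   above.  */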
4010 void
4011 aarch64_print_operand_address (FILE *f, rtx x)
4013 struct aarch64_address_info addr;
4015 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4016 MEM, true))
4017 switch (addr.type)
4019 case ADDRESS_REG_IMM:
4020 if (addr.offset == const0_rtx)
4021 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4022 else
4023 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4024 INTVAL (addr.offset));
4025 return;
4027 case ADDRESS_REG_REG:
4028 if (addr.shift == 0)
4029 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4030 reg_names [REGNO (addr.offset)]);
4031 else
4032 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4033 reg_names [REGNO (addr.offset)], addr.shift);
4034 return;
4036 case ADDRESS_REG_UXTW:
4037 if (addr.shift == 0)
4038 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4039 REGNO (addr.offset) - R0_REGNUM);
4040 else
4041 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4042 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4043 return;
4045 case ADDRESS_REG_SXTW:
4046 if (addr.shift == 0)
4047 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4048 REGNO (addr.offset) - R0_REGNUM);
4049 else
4050 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4051 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4052 return;
4054 case ADDRESS_REG_WB:
4055 switch (GET_CODE (x))
4057 case PRE_INC:
4058 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4059 GET_MODE_SIZE (aarch64_memory_reference_mode));
4060 return;
4061 case POST_INC:
4062 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4063 GET_MODE_SIZE (aarch64_memory_reference_mode));
4064 return;
4065 case PRE_DEC:
4066 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4067 GET_MODE_SIZE (aarch64_memory_reference_mode));
4068 return;
4069 case POST_DEC:
4070 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4071 GET_MODE_SIZE (aarch64_memory_reference_mode));
4072 return;
4073 case PRE_MODIFY:
4074 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4075 INTVAL (addr.offset));
4076 return;
4077 case POST_MODIFY:
4078 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4079 INTVAL (addr.offset));
4080 return;
4081 default:
4082 break;
4084 break;
4086 case ADDRESS_LO_SUM:
4087 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4088 output_addr_const (f, addr.offset);
4089 asm_fprintf (f, "]");
4090 return;
4092 case ADDRESS_SYMBOLIC:
4093 break;
4096 output_addr_const (f, x);
4099 bool
4100 aarch64_label_mentioned_p (rtx x)
4102 const char *fmt;
4103 int i;
4105 if (GET_CODE (x) == LABEL_REF)
4106 return true;
4108 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4109 referencing instruction, but they are constant offsets, not
4110 symbols. */
4111 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4112 return false;
4114 fmt = GET_RTX_FORMAT (GET_CODE (x));
4115 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4117 if (fmt[i] == 'E')
4119 int j;
4121 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4122 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4123 return 1;
4125 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4126 return 1;
4129 return 0;
4132 /* Implement REGNO_REG_CLASS. */
4134 enum reg_class
4135 aarch64_regno_regclass (unsigned regno)
4137 if (GP_REGNUM_P (regno))
4138 return CORE_REGS;
4140 if (regno == SP_REGNUM)
4141 return STACK_REG;
4143 if (regno == FRAME_POINTER_REGNUM
4144 || regno == ARG_POINTER_REGNUM)
4145 return POINTER_REGS;
4147 if (FP_REGNUM_P (regno))
4148 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4150 return NO_REGS;
4153 /* Try a machine-dependent way of reloading an illegitimate address
4154 operand. If we find one, push the reload and return the new rtx. */
4156 rtx
4157 aarch64_legitimize_reload_address (rtx *x_p,
4158 enum machine_mode mode,
4159 int opnum, int type,
4160 int ind_levels ATTRIBUTE_UNUSED)
4162 rtx x = *x_p;
4164 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4165 if (aarch64_vect_struct_mode_p (mode)
4166 && GET_CODE (x) == PLUS
4167 && REG_P (XEXP (x, 0))
4168 && CONST_INT_P (XEXP (x, 1)))
4170 rtx orig_rtx = x;
4171 x = copy_rtx (x);
4172 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4173 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4174 opnum, (enum reload_type) type);
4175 return x;
4178 /* We must recognize output that we have already generated ourselves. */
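/* That is, the (plus (plus reg c1) c2) form produced by the offset-splitting
code further down; the inner PLUS is reloaded back into a base register. */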
4179 if (GET_CODE (x) == PLUS
4180 && GET_CODE (XEXP (x, 0)) == PLUS
4181 && REG_P (XEXP (XEXP (x, 0), 0))
4182 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4183 && CONST_INT_P (XEXP (x, 1)))
4185 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4186 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4187 opnum, (enum reload_type) type);
4188 return x;
4191 /* We wish to handle large displacements off a base register by splitting
4192 the addend across an add and the mem insn. This can cut the number of
4193 extra insns needed from 3 to 1. It is only useful for load/store of a
4194 single register with 12 bit offset field. */
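/* For example, a DImode access at offset 0x12008 from a base register is
split into high = 0x12000, moved into the base with a single ADD, and
low = 0x8, which fits in the LDR/STR 12-bit scaled offset field. */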
4195 if (GET_CODE (x) == PLUS
4196 && REG_P (XEXP (x, 0))
4197 && CONST_INT_P (XEXP (x, 1))
4198 && HARD_REGISTER_P (XEXP (x, 0))
4199 && mode != TImode
4200 && mode != TFmode
4201 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4203 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4204 HOST_WIDE_INT low = val & 0xfff;
4205 HOST_WIDE_INT high = val - low;
4206 HOST_WIDE_INT offs;
4207 rtx cst;
4208 enum machine_mode xmode = GET_MODE (x);
4210 /* In ILP32, xmode can be either DImode or SImode. */
4211 gcc_assert (xmode == DImode || xmode == SImode);
4213 /* Leave BLKmode offsets to the generic reload code, since we cannot
4214 ascertain BLKmode alignment here. */
4215 if (GET_MODE_SIZE (mode) == 0)
4216 return NULL_RTX;
4218 offs = low % GET_MODE_SIZE (mode);
4220 /* Align misaligned offset by adjusting high part to compensate. */
4221 if (offs != 0)
4223 if (aarch64_uimm12_shift (high + offs))
4225 /* Align down. */
4226 low = low - offs;
4227 high = high + offs;
4229 else
4231 /* Align up. */
4232 offs = GET_MODE_SIZE (mode) - offs;
4233 low = low + offs;
4234 high = high + (low & 0x1000) - offs;
4235 low &= 0xfff;
4239 /* Check for overflow. */
4240 if (high + low != val)
4241 return NULL_RTX;
4243 cst = GEN_INT (high);
4244 if (!aarch64_uimm12_shift (high))
4245 cst = force_const_mem (xmode, cst);
4247 /* Reload high part into base reg, leaving the low part
4248 in the mem instruction.
4249 Note that replacing this gen_rtx_PLUS with plus_constant is
4250 wrong in this case because we rely on the
4251 (plus (plus reg c1) c2) structure being preserved so that
4252 XEXP (*p, 0) in push_reload below uses the correct term. */
4253 x = gen_rtx_PLUS (xmode,
4254 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4255 GEN_INT (low));
4257 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4258 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4259 opnum, (enum reload_type) type);
4260 return x;
4263 return NULL_RTX;
4267 static reg_class_t
4268 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4269 reg_class_t rclass,
4270 enum machine_mode mode,
4271 secondary_reload_info *sri)
4273 /* Without the TARGET_SIMD instructions we cannot move a Q register
4274 to a Q register directly. We need a scratch. */
4275 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4276 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4277 && reg_class_subset_p (rclass, FP_REGS))
4279 if (mode == TFmode)
4280 sri->icode = CODE_FOR_aarch64_reload_movtf;
4281 else if (mode == TImode)
4282 sri->icode = CODE_FOR_aarch64_reload_movti;
4283 return NO_REGS;
4286 /* A TFmode or TImode memory access should be handled via FP_REGS
4287 because AArch64 has richer addressing modes for LDR/STR instructions
4288 than for LDP/STP instructions. */
4289 if (!TARGET_GENERAL_REGS_ONLY && rclass == CORE_REGS
4290 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4291 return FP_REGS;
4293 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4294 return CORE_REGS;
4296 return NO_REGS;
4299 static bool
4300 aarch64_can_eliminate (const int from, const int to)
4302 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4303 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4305 if (frame_pointer_needed)
4307 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4308 return true;
4309 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4310 return false;
4311 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4312 && !cfun->calls_alloca)
4313 return true;
4314 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4315 return true;
4317 return false;
4320 return true;
4323 HOST_WIDE_INT
4324 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4326 aarch64_layout_frame ();
4328 if (to == HARD_FRAME_POINTER_REGNUM)
4330 if (from == ARG_POINTER_REGNUM)
4331 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4333 if (from == FRAME_POINTER_REGNUM)
4334 return (cfun->machine->frame.hard_fp_offset
4335 - cfun->machine->frame.saved_varargs_size);
4338 if (to == STACK_POINTER_REGNUM)
4340 if (from == FRAME_POINTER_REGNUM)
4341 return (cfun->machine->frame.frame_size
4342 - cfun->machine->frame.saved_varargs_size);
4345 return cfun->machine->frame.frame_size;
4348 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4349 previous frame. */
4352 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4354 if (count != 0)
4355 return const0_rtx;
4356 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4360 static void
4361 aarch64_asm_trampoline_template (FILE *f)
4363 if (TARGET_ILP32)
4365 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4366 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4368 else
4370 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4371 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4373 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4374 assemble_aligned_integer (4, const0_rtx);
4375 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4376 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4379 static void
4380 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4382 rtx fnaddr, mem, a_tramp;
4383 const int tramp_code_sz = 16;
4385 /* Don't need to copy the trailing D-words; we fill those in below. */
4386 emit_block_move (m_tramp, assemble_trampoline_template (),
4387 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4388 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4389 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4390 if (GET_MODE (fnaddr) != ptr_mode)
4391 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4392 emit_move_insn (mem, fnaddr);
4394 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4395 emit_move_insn (mem, chain_value);
4397 /* XXX We should really define a "clear_cache" pattern and use
4398 gen_clear_cache(). */
4399 a_tramp = XEXP (m_tramp, 0);
4400 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4401 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4402 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4403 ptr_mode);
4406 static unsigned char
4407 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4409 switch (regclass)
4411 case CALLER_SAVE_REGS:
4412 case CORE_REGS:
4413 case POINTER_REGS:
4414 case GENERAL_REGS:
4415 case ALL_REGS:
4416 case FP_REGS:
4417 case FP_LO_REGS:
4418 return
4419 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4420 (GET_MODE_SIZE (mode) + 7) / 8;
4421 case STACK_REG:
4422 return 1;
4424 case NO_REGS:
4425 return 0;
4427 default:
4428 break;
4430 gcc_unreachable ();
4433 static reg_class_t
4434 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4436 if (regclass == POINTER_REGS)
4437 return GENERAL_REGS;
4439 if (regclass == STACK_REG)
4441 if (REG_P(x)
4442 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4443 return regclass;
4445 return NO_REGS;
4448 /* If it's an integer immediate that MOVI can't handle, then
4449 FP_REGS is not an option, so we return NO_REGS instead. */
4450 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4451 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4452 return NO_REGS;
4454 /* Register elimination can result in a request for
4455 SP+constant->FP_REGS. We cannot support such operations, which
4456 use SP as source and an FP_REG as destination, so reject them
4457 outright now. */
4458 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4460 rtx lhs = XEXP (x, 0);
4462 /* Look through a possible SUBREG introduced by ILP32. */
4463 if (GET_CODE (lhs) == SUBREG)
4464 lhs = SUBREG_REG (lhs);
4466 gcc_assert (REG_P (lhs));
4467 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4468 POINTER_REGS));
4469 return NO_REGS;
4472 return regclass;
4475 void
4476 aarch64_asm_output_labelref (FILE* f, const char *name)
4478 asm_fprintf (f, "%U%s", name);
4481 static void
4482 aarch64_elf_asm_constructor (rtx symbol, int priority)
4484 if (priority == DEFAULT_INIT_PRIORITY)
4485 default_ctor_section_asm_out_constructor (symbol, priority);
4486 else
4488 section *s;
4489 char buf[18];
4490 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4491 s = get_section (buf, SECTION_WRITE, NULL);
4492 switch_to_section (s);
4493 assemble_align (POINTER_SIZE);
4494 assemble_aligned_integer (POINTER_BYTES, symbol);
4498 static void
4499 aarch64_elf_asm_destructor (rtx symbol, int priority)
4501 if (priority == DEFAULT_INIT_PRIORITY)
4502 default_dtor_section_asm_out_destructor (symbol, priority);
4503 else
4505 section *s;
4506 char buf[18];
4507 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4508 s = get_section (buf, SECTION_WRITE, NULL);
4509 switch_to_section (s);
4510 assemble_align (POINTER_SIZE);
4511 assemble_aligned_integer (POINTER_BYTES, symbol);
4515 const char*
4516 aarch64_output_casesi (rtx *operands)
4518 char buf[100];
4519 char label[100];
4520 rtx diff_vec = PATTERN (NEXT_INSN (operands[2]));
4521 int index;
4522 static const char *const patterns[4][2] =
4525 "ldrb\t%w3, [%0,%w1,uxtw]",
4526 "add\t%3, %4, %w3, sxtb #2"
4529 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4530 "add\t%3, %4, %w3, sxth #2"
4533 "ldr\t%w3, [%0,%w1,uxtw #2]",
4534 "add\t%3, %4, %w3, sxtw #2"
4536 /* We assume that DImode is only generated when not optimizing and
4537 that we don't really need 64-bit address offsets. That would
4538 imply an object file with 8GB of code in a single function! */
4540 "ldr\t%w3, [%0,%w1,uxtw #2]",
4541 "add\t%3, %4, %w3, sxtw #2"
4545 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4547 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4549 gcc_assert (index >= 0 && index <= 3);
4551 /* Need to implement table size reduction by changing the code below. */
4552 output_asm_insn (patterns[index][0], operands);
4553 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4554 snprintf (buf, sizeof (buf),
4555 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4556 output_asm_insn (buf, operands);
4557 output_asm_insn (patterns[index][1], operands);
4558 output_asm_insn ("br\t%3", operands);
4559 assemble_label (asm_out_file, label);
4560 return "";
4564 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4565 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4566 operator. */
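/* For example, shift = 1 with mask = 0x1fe (0xff << 1) describes a byte
value scaled by 2 and yields 8 (UXTB); combinations that do not match a
UXTB/UXTH/UXTW pattern yield 0. */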
4569 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4571 if (shift >= 0 && shift <= 3)
4573 int size;
4574 for (size = 8; size <= 32; size *= 2)
4576 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4577 if (mask == bits << shift)
4578 return size;
4581 return 0;
4584 static bool
4585 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4586 const_rtx x ATTRIBUTE_UNUSED)
4588 /* We can't use blocks for constants when we're using a per-function
4589 constant pool. */
4590 return false;
4593 static section *
4594 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4595 rtx x ATTRIBUTE_UNUSED,
4596 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4598 /* Force all constant pool entries into the current function section. */
4599 return function_section (current_function_decl);
4603 /* Costs. */
4605 /* Helper function for rtx cost calculation. Strip a shift expression
4606 from X. Returns the inner operand if successful, or the original
4607 expression on failure. */
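/* For example, (ashift (reg) (const_int 3)) strips to (reg), as does
(mult (reg) (const_int 8)), since a multiply by a power of two is the
canonical form of a shift in some contexts. */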
4608 static rtx
4609 aarch64_strip_shift (rtx x)
4611 rtx op = x;
4613 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4614 we can convert both to ROR during final output. */
4615 if ((GET_CODE (op) == ASHIFT
4616 || GET_CODE (op) == ASHIFTRT
4617 || GET_CODE (op) == LSHIFTRT
4618 || GET_CODE (op) == ROTATERT
4619 || GET_CODE (op) == ROTATE)
4620 && CONST_INT_P (XEXP (op, 1)))
4621 return XEXP (op, 0);
4623 if (GET_CODE (op) == MULT
4624 && CONST_INT_P (XEXP (op, 1))
4625 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4626 return XEXP (op, 0);
4628 return x;
4631 /* Helper function for rtx cost calculation. Strip an extend
4632 expression from X. Returns the inner operand if successful, or the
4633 original expression on failure. We deal with a number of possible
4634 canonicalization variations here. */
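/* For example, (zero_extend (reg)) strips to (reg), and so does an
extended register with a left shift of 1..4, such as
(ashift (sign_extend (reg)) (const_int 2)). */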
4635 static rtx
4636 aarch64_strip_extend (rtx x)
4638 rtx op = x;
4640 /* Zero and sign extraction of a widened value. */
4641 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4642 && XEXP (op, 2) == const0_rtx
4643 && GET_CODE (XEXP (op, 0)) == MULT
4644 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4645 XEXP (op, 1)))
4646 return XEXP (XEXP (op, 0), 0);
4648 /* It can also be represented (for zero-extend) as an AND with an
4649 immediate. */
4650 if (GET_CODE (op) == AND
4651 && GET_CODE (XEXP (op, 0)) == MULT
4652 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4653 && CONST_INT_P (XEXP (op, 1))
4654 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4655 INTVAL (XEXP (op, 1))) != 0)
4656 return XEXP (XEXP (op, 0), 0);
4658 /* Now handle extended register, as this may also have an optional
4659 left shift by 1..4. */
4660 if (GET_CODE (op) == ASHIFT
4661 && CONST_INT_P (XEXP (op, 1))
4662 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4663 op = XEXP (op, 0);
4665 if (GET_CODE (op) == ZERO_EXTEND
4666 || GET_CODE (op) == SIGN_EXTEND)
4667 op = XEXP (op, 0);
4669 if (op != x)
4670 return op;
4672 return x;
4675 /* Helper function for rtx cost calculation. Calculate the cost of
4676 a MULT, which may be part of a multiply-accumulate rtx. Return
4677 the calculated cost of the expression, recursing manually into
4678 operands where needed. */
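/* For example, (plus (mult (reg) (reg)) (reg)) reaches here with OUTER set
to PLUS and is costed as a single MADD rather than a separate MUL and ADD. */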
4680 static int
4681 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4683 rtx op0, op1;
4684 const struct cpu_cost_table *extra_cost
4685 = aarch64_tune_params->insn_extra_cost;
4686 int cost = 0;
4687 bool maybe_fma = (outer == PLUS || outer == MINUS);
4688 enum machine_mode mode = GET_MODE (x);
4690 gcc_checking_assert (code == MULT);
4692 op0 = XEXP (x, 0);
4693 op1 = XEXP (x, 1);
4695 if (VECTOR_MODE_P (mode))
4696 mode = GET_MODE_INNER (mode);
4698 /* Integer multiply/fma. */
4699 if (GET_MODE_CLASS (mode) == MODE_INT)
5701 /* The multiply will be canonicalized as a shift; cost it as such. */
4702 if (CONST_INT_P (op1)
4703 && exact_log2 (INTVAL (op1)) > 0)
4705 if (speed)
4707 if (maybe_fma)
4708 /* ADD (shifted register). */
4709 cost += extra_cost->alu.arith_shift;
4710 else
4711 /* LSL (immediate). */
4712 cost += extra_cost->alu.shift;
4715 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4717 return cost;
4720 /* Integer multiplies or FMAs have zero/sign extending variants. */
4721 if ((GET_CODE (op0) == ZERO_EXTEND
4722 && GET_CODE (op1) == ZERO_EXTEND)
4723 || (GET_CODE (op0) == SIGN_EXTEND
4724 && GET_CODE (op1) == SIGN_EXTEND))
4726 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4727 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4729 if (speed)
4731 if (maybe_fma)
4732 /* MADD/SMADDL/UMADDL. */
4733 cost += extra_cost->mult[0].extend_add;
4734 else
4735 /* MUL/SMULL/UMULL. */
4736 cost += extra_cost->mult[0].extend;
4739 return cost;
4742 /* This is either an integer multiply or an FMA. In both cases
4743 we want to recurse and cost the operands. */
4744 cost += rtx_cost (op0, MULT, 0, speed)
4745 + rtx_cost (op1, MULT, 1, speed);
4747 if (speed)
4749 if (maybe_fma)
4750 /* MADD. */
4751 cost += extra_cost->mult[mode == DImode].add;
4752 else
4753 /* MUL. */
4754 cost += extra_cost->mult[mode == DImode].simple;
4757 return cost;
4759 else
4761 if (speed)
4763 /* Floating-point FMA/FMUL can also support negations of the
4764 operands. */
4765 if (GET_CODE (op0) == NEG)
4766 op0 = XEXP (op0, 0);
4767 if (GET_CODE (op1) == NEG)
4768 op1 = XEXP (op1, 0);
4770 if (maybe_fma)
4771 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4772 cost += extra_cost->fp[mode == DFmode].fma;
4773 else
4774 /* FMUL/FNMUL. */
4775 cost += extra_cost->fp[mode == DFmode].mult;
4778 cost += rtx_cost (op0, MULT, 0, speed)
4779 + rtx_cost (op1, MULT, 1, speed);
4780 return cost;
4784 static int
4785 aarch64_address_cost (rtx x,
4786 enum machine_mode mode,
4787 addr_space_t as ATTRIBUTE_UNUSED,
4788 bool speed)
4790 enum rtx_code c = GET_CODE (x);
4791 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4792 struct aarch64_address_info info;
4793 int cost = 0;
4794 info.shift = 0;
4796 if (!aarch64_classify_address (&info, x, mode, c, false))
4798 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4800 /* This is a CONST or SYMBOL ref which will be split
4801 in a different way depending on the code model in use.
4802 Cost it through the generic infrastructure. */
4803 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4804 /* Divide through by the cost of one instruction to
4805 bring it to the same units as the address costs. */
4806 cost_symbol_ref /= COSTS_N_INSNS (1);
4807 /* The cost is then the cost of preparing the address,
4808 followed by an immediate (possibly 0) offset. */
4809 return cost_symbol_ref + addr_cost->imm_offset;
4811 else
4813 /* This is most likely a jump table from a case
4814 statement. */
4815 return addr_cost->register_offset;
4819 switch (info.type)
4821 case ADDRESS_LO_SUM:
4822 case ADDRESS_SYMBOLIC:
4823 case ADDRESS_REG_IMM:
4824 cost += addr_cost->imm_offset;
4825 break;
4827 case ADDRESS_REG_WB:
4828 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4829 cost += addr_cost->pre_modify;
4830 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4831 cost += addr_cost->post_modify;
4832 else
4833 gcc_unreachable ();
4835 break;
4837 case ADDRESS_REG_REG:
4838 cost += addr_cost->register_offset;
4839 break;
4841 case ADDRESS_REG_UXTW:
4842 case ADDRESS_REG_SXTW:
4843 cost += addr_cost->register_extend;
4844 break;
4846 default:
4847 gcc_unreachable ();
4851 if (info.shift > 0)
4853 /* For the sake of calculating the cost of the shifted register
4854 component, we can treat same sized modes in the same way. */
4855 switch (GET_MODE_BITSIZE (mode))
4857 case 16:
4858 cost += addr_cost->addr_scale_costs.hi;
4859 break;
4861 case 32:
4862 cost += addr_cost->addr_scale_costs.si;
4863 break;
4865 case 64:
4866 cost += addr_cost->addr_scale_costs.di;
4867 break;
4869 /* We can't tell, or this is a 128-bit vector. */
4870 default:
4871 cost += addr_cost->addr_scale_costs.ti;
4872 break;
4876 return cost;
4879 /* Return true if the RTX X in mode MODE is a zero or sign extract
4880 usable in an ADD or SUB (extended register) instruction. */
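/* For example, (sign_extract:DI (mult (reg) (const_int 4)) (const_int 34)
(const_int 0)) is the form used for a sign-extending SXTW #2 operand of an
extended-register ADD/SUB. */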
4881 static bool
4882 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4884 /* Catch add with a sign extract.
4885 This is add_<optab><mode>_multp2. */
4886 if (GET_CODE (x) == SIGN_EXTRACT
4887 || GET_CODE (x) == ZERO_EXTRACT)
4889 rtx op0 = XEXP (x, 0);
4890 rtx op1 = XEXP (x, 1);
4891 rtx op2 = XEXP (x, 2);
4893 if (GET_CODE (op0) == MULT
4894 && CONST_INT_P (op1)
4895 && op2 == const0_rtx
4896 && CONST_INT_P (XEXP (op0, 1))
4897 && aarch64_is_extend_from_extract (mode,
4898 XEXP (op0, 1),
4899 op1))
4901 return true;
4905 return false;
4908 static bool
4909 aarch64_frint_unspec_p (unsigned int u)
4911 switch (u)
4913 case UNSPEC_FRINTZ:
4914 case UNSPEC_FRINTP:
4915 case UNSPEC_FRINTM:
4916 case UNSPEC_FRINTA:
4917 case UNSPEC_FRINTN:
4918 case UNSPEC_FRINTX:
4919 case UNSPEC_FRINTI:
4920 return true;
4922 default:
4923 return false;
4927 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4928 storing it in *COST. Result is true if the total cost of the operation
4929 has now been calculated. */
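/* This covers both conditional branches, where OP1 or OP2 is (pc) and the
comparison may fold into a CBZ/CBNZ or TBZ/TBNZ, and conditional selects
on the status flags (CSEL and friends). */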
4930 static bool
4931 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4933 rtx inner;
4934 rtx comparator;
4935 enum rtx_code cmpcode;
4937 if (COMPARISON_P (op0))
4939 inner = XEXP (op0, 0);
4940 comparator = XEXP (op0, 1);
4941 cmpcode = GET_CODE (op0);
4943 else
4945 inner = op0;
4946 comparator = const0_rtx;
4947 cmpcode = NE;
4950 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
4952 /* Conditional branch. */
4953 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4954 return true;
4955 else
4957 if (cmpcode == NE || cmpcode == EQ)
4959 if (comparator == const0_rtx)
4961 /* TBZ/TBNZ/CBZ/CBNZ. */
4962 if (GET_CODE (inner) == ZERO_EXTRACT)
4963 /* TBZ/TBNZ. */
4964 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
4965 0, speed);
4966 else
4967 /* CBZ/CBNZ. */
4968 *cost += rtx_cost (inner, cmpcode, 0, speed);
4970 return true;
4973 else if (cmpcode == LT || cmpcode == GE)
4975 /* TBZ/TBNZ. */
4976 if (comparator == const0_rtx)
4977 return true;
4981 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4983 /* It's a conditional operation based on the status flags,
4984 so it must be some flavor of CSEL. */
4986 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
4987 if (GET_CODE (op1) == NEG
4988 || GET_CODE (op1) == NOT
4989 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
4990 op1 = XEXP (op1, 0);
4992 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
4993 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
4994 return true;
4997 /* We don't know what this is, cost all operands. */
4998 return false;
5001 /* Calculate the cost of calculating X, storing it in *COST. Result
5002 is true if the total cost of the operation has now been calculated. */
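/* Returning false asks the generic rtx_cost machinery to recurse into the
operands of X and add their costs to *COST itself. */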
5003 static bool
5004 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5005 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5007 rtx op0, op1, op2;
5008 const struct cpu_cost_table *extra_cost
5009 = aarch64_tune_params->insn_extra_cost;
5010 enum machine_mode mode = GET_MODE (x);
5012 /* By default, assume that everything has equivalent cost to the
5013 cheapest instruction. Any additional costs are applied as a delta
5014 above this default. */
5015 *cost = COSTS_N_INSNS (1);
5017 /* TODO: The cost infrastructure currently does not handle
5018 vector operations. Assume that all vector operations
5019 are equally expensive. */
5020 if (VECTOR_MODE_P (mode))
5022 if (speed)
5023 *cost += extra_cost->vect.alu;
5024 return true;
5027 switch (code)
5029 case SET:
5030 /* The cost depends entirely on the operands to SET. */
5031 *cost = 0;
5032 op0 = SET_DEST (x);
5033 op1 = SET_SRC (x);
5035 switch (GET_CODE (op0))
5037 case MEM:
5038 if (speed)
5040 rtx address = XEXP (op0, 0);
5041 if (GET_MODE_CLASS (mode) == MODE_INT)
5042 *cost += extra_cost->ldst.store;
5043 else if (mode == SFmode)
5044 *cost += extra_cost->ldst.storef;
5045 else if (mode == DFmode)
5046 *cost += extra_cost->ldst.stored;
5048 *cost +=
5049 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5050 0, speed));
5053 *cost += rtx_cost (op1, SET, 1, speed);
5054 return true;
5056 case SUBREG:
5057 if (! REG_P (SUBREG_REG (op0)))
5058 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5060 /* Fall through. */
5061 case REG:
5062 /* const0_rtx is in general free, but we will use an
5063 instruction to set a register to 0. */
5064 if (REG_P (op1) || op1 == const0_rtx)
5066 /* The cost is 1 per register copied. */
5067 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5068 / UNITS_PER_WORD;
5069 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5071 else
5072 /* Cost is just the cost of the RHS of the set. */
5073 *cost += rtx_cost (op1, SET, 1, speed);
5074 return true;
5076 case ZERO_EXTRACT:
5077 case SIGN_EXTRACT:
5078 /* Bit-field insertion. Strip any redundant widening of
5079 the RHS to meet the width of the target. */
5080 if (GET_CODE (op1) == SUBREG)
5081 op1 = SUBREG_REG (op1);
5082 if ((GET_CODE (op1) == ZERO_EXTEND
5083 || GET_CODE (op1) == SIGN_EXTEND)
5084 && GET_CODE (XEXP (op0, 1)) == CONST_INT
5085 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5086 >= INTVAL (XEXP (op0, 1))))
5087 op1 = XEXP (op1, 0);
5089 if (CONST_INT_P (op1))
5091 /* MOV immediate is assumed to always be cheap. */
5092 *cost = COSTS_N_INSNS (1);
5094 else
5096 /* BFM. */
5097 if (speed)
5098 *cost += extra_cost->alu.bfi;
5099 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5102 return true;
5104 default:
5105 /* We can't make sense of this, assume default cost. */
5106 *cost = COSTS_N_INSNS (1);
5107 return false;
5109 return false;
5111 case CONST_INT:
5112 /* If an instruction can incorporate a constant within the
5113 instruction, the instruction's expression avoids calling
5114 rtx_cost() on the constant. If rtx_cost() is called on a
5115 constant, then it is usually because the constant must be
5116 moved into a register by one or more instructions.
5118 The exception is constant 0, which can be expressed
5119 as XZR/WZR and is therefore free. The exception to this is
5120 if we have (set (reg) (const0_rtx)) in which case we must cost
5121 the move. However, we can catch that when we cost the SET, so
5122 we don't need to consider that here. */
5123 if (x == const0_rtx)
5124 *cost = 0;
5125 else
5127 /* To an approximation, building any other constant is
5128 proportionally expensive to the number of instructions
5129 required to build that constant. This is true whether we
5130 are compiling for SPEED or otherwise. */
5131 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5132 INTVAL (x),
5133 false));
5135 return true;
5137 case CONST_DOUBLE:
5138 if (speed)
5140 /* mov[df,sf]_aarch64. */
5141 if (aarch64_float_const_representable_p (x))
5142 /* FMOV (scalar immediate). */
5143 *cost += extra_cost->fp[mode == DFmode].fpconst;
5144 else if (!aarch64_float_const_zero_rtx_p (x))
5146 /* This will be a load from memory. */
5147 if (mode == DFmode)
5148 *cost += extra_cost->ldst.loadd;
5149 else
5150 *cost += extra_cost->ldst.loadf;
5152 else
5153 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5154 or MOV v0.s[0], wzr - neither of which is modeled by the
5155 cost tables. Just use the default cost. */
5160 return true;
5162 case MEM:
5163 if (speed)
5165 /* For loads we want the base cost of a load, plus an
5166 approximation for the additional cost of the addressing
5167 mode. */
5168 rtx address = XEXP (x, 0);
5169 if (GET_MODE_CLASS (mode) == MODE_INT)
5170 *cost += extra_cost->ldst.load;
5171 else if (mode == SFmode)
5172 *cost += extra_cost->ldst.loadf;
5173 else if (mode == DFmode)
5174 *cost += extra_cost->ldst.loadd;
5176 *cost +=
5177 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5178 0, speed));
5181 return true;
5183 case NEG:
5184 op0 = XEXP (x, 0);
5186 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5188 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5189 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5191 /* CSETM. */
5192 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5193 return true;
5196 /* Cost this as SUB wzr, X. */
5197 op0 = CONST0_RTX (GET_MODE (x));
5198 op1 = XEXP (x, 0);
5199 goto cost_minus;
5202 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5204 /* Support (neg(fma...)) as a single instruction only if
5205 sign of zeros is unimportant. This matches the decision
5206 making in aarch64.md. */
5207 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5209 /* FNMADD. */
5210 *cost = rtx_cost (op0, NEG, 0, speed);
5211 return true;
5213 if (speed)
5214 /* FNEG. */
5215 *cost += extra_cost->fp[mode == DFmode].neg;
5216 return false;
5219 return false;
5221 case CLRSB:
5222 case CLZ:
5223 if (speed)
5224 *cost += extra_cost->alu.clz;
5226 return false;
5228 case COMPARE:
5229 op0 = XEXP (x, 0);
5230 op1 = XEXP (x, 1);
5232 if (op1 == const0_rtx
5233 && GET_CODE (op0) == AND)
5235 x = op0;
5236 goto cost_logic;
5239 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5241 /* TODO: A write to the CC flags possibly costs extra; this
5242 needs encoding in the cost tables. */
5244 /* CC_ZESWPmode supports zero extend for free. */
5245 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5246 op0 = XEXP (op0, 0);
5248 /* ANDS. */
5249 if (GET_CODE (op0) == AND)
5251 x = op0;
5252 goto cost_logic;
5255 if (GET_CODE (op0) == PLUS)
5257 /* ADDS (and CMN alias). */
5258 x = op0;
5259 goto cost_plus;
5262 if (GET_CODE (op0) == MINUS)
5264 /* SUBS. */
5265 x = op0;
5266 goto cost_minus;
5269 if (GET_CODE (op1) == NEG)
5271 /* CMN. */
5272 if (speed)
5273 *cost += extra_cost->alu.arith;
5275 *cost += rtx_cost (op0, COMPARE, 0, speed);
5276 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5277 return true;
5280 /* CMP.
5282 Compare can freely swap the order of operands, and
5283 canonicalization puts the more complex operation first.
5284 But the integer MINUS logic expects the shift/extend
5285 operation in op1. */
5286 if (! (REG_P (op0)
5287 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5289 op0 = XEXP (x, 1);
5290 op1 = XEXP (x, 0);
5292 goto cost_minus;
5295 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5297 /* FCMP. */
5298 if (speed)
5299 *cost += extra_cost->fp[mode == DFmode].compare;
5301 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5303 /* FCMP supports constant 0.0 for no extra cost. */
5304 return true;
5306 return false;
5309 return false;
5311 case MINUS:
5313 op0 = XEXP (x, 0);
5314 op1 = XEXP (x, 1);
5316 cost_minus:
5317 /* Detect valid immediates. */
5318 if ((GET_MODE_CLASS (mode) == MODE_INT
5319 || (GET_MODE_CLASS (mode) == MODE_CC
5320 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5321 && CONST_INT_P (op1)
5322 && aarch64_uimm12_shift (INTVAL (op1)))
5324 *cost += rtx_cost (op0, MINUS, 0, speed);
5326 if (speed)
5327 /* SUB(S) (immediate). */
5328 *cost += extra_cost->alu.arith;
5329 return true;
5333 /* Look for SUB (extended register). */
5334 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5336 if (speed)
5337 *cost += extra_cost->alu.arith_shift;
5339 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5340 (enum rtx_code) GET_CODE (op1),
5341 0, speed);
5342 return true;
5345 rtx new_op1 = aarch64_strip_extend (op1);
5347 /* Cost this as an FMA-alike operation. */
5348 if ((GET_CODE (new_op1) == MULT
5349 || GET_CODE (new_op1) == ASHIFT)
5350 && code != COMPARE)
5352 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5353 (enum rtx_code) code,
5354 speed);
5355 *cost += rtx_cost (op0, MINUS, 0, speed);
5356 return true;
5359 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5361 if (speed)
5363 if (GET_MODE_CLASS (mode) == MODE_INT)
5364 /* SUB(S). */
5365 *cost += extra_cost->alu.arith;
5366 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5367 /* FSUB. */
5368 *cost += extra_cost->fp[mode == DFmode].addsub;
5370 return true;
5373 case PLUS:
5375 rtx new_op0;
5377 op0 = XEXP (x, 0);
5378 op1 = XEXP (x, 1);
5380 cost_plus:
5381 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5382 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5384 /* CSINC. */
5385 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5386 *cost += rtx_cost (op1, PLUS, 1, speed);
5387 return true;
5390 if (GET_MODE_CLASS (mode) == MODE_INT
5391 && CONST_INT_P (op1)
5392 && aarch64_uimm12_shift (INTVAL (op1)))
5394 *cost += rtx_cost (op0, PLUS, 0, speed);
5396 if (speed)
5397 /* ADD (immediate). */
5398 *cost += extra_cost->alu.arith;
5399 return true;
5402 /* Look for ADD (extended register). */
5403 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5405 if (speed)
5406 *cost += extra_cost->alu.arith_shift;
5408 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5409 (enum rtx_code) GET_CODE (op0),
5410 0, speed);
5411 return true;
5414 /* Strip any extend, leave shifts behind as we will
5415 cost them through mult_cost. */
5416 new_op0 = aarch64_strip_extend (op0);
5418 if (GET_CODE (new_op0) == MULT
5419 || GET_CODE (new_op0) == ASHIFT)
5421 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5422 speed);
5423 *cost += rtx_cost (op1, PLUS, 1, speed);
5424 return true;
5427 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5428 + rtx_cost (op1, PLUS, 1, speed));
5430 if (speed)
5432 if (GET_MODE_CLASS (mode) == MODE_INT)
5433 /* ADD. */
5434 *cost += extra_cost->alu.arith;
5435 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5436 /* FADD. */
5437 *cost += extra_cost->fp[mode == DFmode].addsub;
5439 return true;
5442 case BSWAP:
5443 *cost = COSTS_N_INSNS (1);
5445 if (speed)
5446 *cost += extra_cost->alu.rev;
5448 return false;
5450 case IOR:
5451 if (aarch_rev16_p (x))
5453 *cost = COSTS_N_INSNS (1);
5455 if (speed)
5456 *cost += extra_cost->alu.rev;
5458 return true;
5460 /* Fall through. */
5461 case XOR:
5462 case AND:
5463 cost_logic:
5464 op0 = XEXP (x, 0);
5465 op1 = XEXP (x, 1);
5467 if (code == AND
5468 && GET_CODE (op0) == MULT
5469 && CONST_INT_P (XEXP (op0, 1))
5470 && CONST_INT_P (op1)
5471 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5472 INTVAL (op1)) != 0)
5474 /* This is a UBFM/SBFM. */
5475 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5476 if (speed)
5477 *cost += extra_cost->alu.bfx;
5478 return true;
5481 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5483 /* We possibly get the immediate for free; this is not
5484 modelled. */
5485 if (CONST_INT_P (op1)
5486 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5488 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5490 if (speed)
5491 *cost += extra_cost->alu.logical;
5493 return true;
5495 else
5497 rtx new_op0 = op0;
5499 /* Handle ORN, EON, or BIC. */
5500 if (GET_CODE (op0) == NOT)
5501 op0 = XEXP (op0, 0);
5503 new_op0 = aarch64_strip_shift (op0);
5505 /* If we had a shift on op0 then this is a logical-shift-
5506 by-register/immediate operation. Otherwise, this is just
5507 a logical operation. */
5508 if (speed)
5510 if (new_op0 != op0)
5512 /* Shift by immediate. */
5513 if (CONST_INT_P (XEXP (op0, 1)))
5514 *cost += extra_cost->alu.log_shift;
5515 else
5516 *cost += extra_cost->alu.log_shift_reg;
5518 else
5519 *cost += extra_cost->alu.logical;
5522 /* In both cases we want to cost both operands. */
5523 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5524 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5526 return true;
5529 return false;
5531 case NOT:
5532 /* MVN. */
5533 if (speed)
5534 *cost += extra_cost->alu.logical;
5536 /* The logical instruction could have the shifted register form,
5537 but the cost is the same if the shift is processed as a separate
5538 instruction, so we don't bother with it here. */
5539 return false;
5541 case ZERO_EXTEND:
5543 op0 = XEXP (x, 0);
5544 /* If a value is written in SI mode, then zero extended to DI
5545 mode, the operation will in general be free as a write to
5546 a 'w' register implicitly zeroes the upper bits of an 'x'
5547 register. However, if this is
5549 (set (reg) (zero_extend (reg)))
5551 we must cost the explicit register move. */
5552 if (mode == DImode
5553 && GET_MODE (op0) == SImode
5554 && outer == SET)
5556 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5558 if (!op_cost && speed)
5559 /* MOV. */
5560 *cost += extra_cost->alu.extend;
5561 else
5562 /* Free, the cost is that of the SI mode operation. */
5563 *cost = op_cost;
5565 return true;
5567 else if (MEM_P (XEXP (x, 0)))
5569 /* All loads can zero extend to any size for free. */
5570 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5571 return true;
5574 /* UXTB/UXTH. */
5575 if (speed)
5576 *cost += extra_cost->alu.extend;
5578 return false;
5580 case SIGN_EXTEND:
5581 if (MEM_P (XEXP (x, 0)))
5583 /* LDRSH. */
5584 if (speed)
5586 rtx address = XEXP (XEXP (x, 0), 0);
5587 *cost += extra_cost->ldst.load_sign_extend;
5589 *cost +=
5590 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5591 0, speed));
5593 return true;
5596 if (speed)
5597 *cost += extra_cost->alu.extend;
5598 return false;
5600 case ASHIFT:
5601 op0 = XEXP (x, 0);
5602 op1 = XEXP (x, 1);
5604 if (CONST_INT_P (op1))
5606 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5607 aliases. */
5608 if (speed)
5609 *cost += extra_cost->alu.shift;
5611 /* We can incorporate zero/sign extend for free. */
5612 if (GET_CODE (op0) == ZERO_EXTEND
5613 || GET_CODE (op0) == SIGN_EXTEND)
5614 op0 = XEXP (op0, 0);
5616 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5617 return true;
5619 else
5621 /* LSLV. */
5622 if (speed)
5623 *cost += extra_cost->alu.shift_reg;
5625 return false; /* All arguments need to be in registers. */
5628 case ROTATE:
5629 case ROTATERT:
5630 case LSHIFTRT:
5631 case ASHIFTRT:
5632 op0 = XEXP (x, 0);
5633 op1 = XEXP (x, 1);
5635 if (CONST_INT_P (op1))
5637 /* ASR (immediate) and friends. */
5638 if (speed)
5639 *cost += extra_cost->alu.shift;
5641 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5642 return true;
5644 else
5647 /* ASR (register) and friends. */
5648 if (speed)
5649 *cost += extra_cost->alu.shift_reg;
5651 return false; /* All arguments need to be in registers. */
5654 case SYMBOL_REF:
5656 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5658 /* LDR. */
5659 if (speed)
5660 *cost += extra_cost->ldst.load;
5662 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5663 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5665 /* ADRP, followed by ADD. */
5666 *cost += COSTS_N_INSNS (1);
5667 if (speed)
5668 *cost += 2 * extra_cost->alu.arith;
5670 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5671 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5673 /* ADR. */
5674 if (speed)
5675 *cost += extra_cost->alu.arith;
5678 if (flag_pic)
5680 /* One extra load instruction, after accessing the GOT. */
5681 *cost += COSTS_N_INSNS (1);
5682 if (speed)
5683 *cost += extra_cost->ldst.load;
5685 return true;
5687 case HIGH:
5688 case LO_SUM:
5689 /* ADRP/ADD (immediate). */
5690 if (speed)
5691 *cost += extra_cost->alu.arith;
5692 return true;
5694 case ZERO_EXTRACT:
5695 case SIGN_EXTRACT:
5696 /* UBFX/SBFX. */
5697 if (speed)
5698 *cost += extra_cost->alu.bfx;
5700 /* We can trust that the immediates used will be correct (there
5701 are no by-register forms), so we need only cost op0. */
5702 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5703 return true;
5705 case MULT:
5706 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5707 /* aarch64_rtx_mult_cost always handles recursion to its
5708 operands. */
5709 return true;
5711 case MOD:
5712 case UMOD:
5713 if (speed)
5715 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5716 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5717 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5718 else if (GET_MODE (x) == DFmode)
5719 *cost += (extra_cost->fp[1].mult
5720 + extra_cost->fp[1].div);
5721 else if (GET_MODE (x) == SFmode)
5722 *cost += (extra_cost->fp[0].mult
5723 + extra_cost->fp[0].div);
5725 return false; /* All arguments need to be in registers. */
5727 case DIV:
5728 case UDIV:
5729 case SQRT:
5730 if (speed)
5732 if (GET_MODE_CLASS (mode) == MODE_INT)
5733 /* There is no integer SQRT, so only DIV and UDIV can get
5734 here. */
5735 *cost += extra_cost->mult[mode == DImode].idiv;
5736 else
5737 *cost += extra_cost->fp[mode == DFmode].div;
5739 return false; /* All arguments need to be in registers. */
5741 case IF_THEN_ELSE:
5742 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5743 XEXP (x, 2), cost, speed);
5745 case EQ:
5746 case NE:
5747 case GT:
5748 case GTU:
5749 case LT:
5750 case LTU:
5751 case GE:
5752 case GEU:
5753 case LE:
5754 case LEU:
5756 return false; /* All arguments must be in registers. */
5758 case FMA:
5759 op0 = XEXP (x, 0);
5760 op1 = XEXP (x, 1);
5761 op2 = XEXP (x, 2);
5763 if (speed)
5764 *cost += extra_cost->fp[mode == DFmode].fma;
5766 /* FMSUB, FNMADD, and FNMSUB are free. */
5767 if (GET_CODE (op0) == NEG)
5768 op0 = XEXP (op0, 0);
5770 if (GET_CODE (op2) == NEG)
5771 op2 = XEXP (op2, 0);
5773 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5774 and the by-element operand as operand 0. */
5775 if (GET_CODE (op1) == NEG)
5776 op1 = XEXP (op1, 0);
5778 /* Catch vector-by-element operations. The by-element operand can
5779 either be (vec_duplicate (vec_select (x))) or just
5780 (vec_select (x)), depending on whether we are multiplying by
5781 a vector or a scalar.
5783 Canonicalization is not very good in these cases: FMA4 will put the
5784 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
5785 if (GET_CODE (op0) == VEC_DUPLICATE)
5786 op0 = XEXP (op0, 0);
5787 else if (GET_CODE (op1) == VEC_DUPLICATE)
5788 op1 = XEXP (op1, 0);
5790 if (GET_CODE (op0) == VEC_SELECT)
5791 op0 = XEXP (op0, 0);
5792 else if (GET_CODE (op1) == VEC_SELECT)
5793 op1 = XEXP (op1, 0);
5795 /* If the remaining parameters are not registers,
5796 get the cost to put them into registers. */
5797 *cost += rtx_cost (op0, FMA, 0, speed);
5798 *cost += rtx_cost (op1, FMA, 1, speed);
5799 *cost += rtx_cost (op2, FMA, 2, speed);
5800 return true;
5802 case FLOAT_EXTEND:
5803 if (speed)
5804 *cost += extra_cost->fp[mode == DFmode].widen;
5805 return false;
5807 case FLOAT_TRUNCATE:
5808 if (speed)
5809 *cost += extra_cost->fp[mode == DFmode].narrow;
5810 return false;
5812 case FIX:
5813 case UNSIGNED_FIX:
5814 x = XEXP (x, 0);
5815 /* Strip the rounding part. They will all be implemented
5816 by the fcvt* family of instructions anyway. */
5817 if (GET_CODE (x) == UNSPEC)
5819 unsigned int uns_code = XINT (x, 1);
5821 if (uns_code == UNSPEC_FRINTA
5822 || uns_code == UNSPEC_FRINTM
5823 || uns_code == UNSPEC_FRINTN
5824 || uns_code == UNSPEC_FRINTP
5825 || uns_code == UNSPEC_FRINTZ)
5826 x = XVECEXP (x, 0, 0);
5829 if (speed)
5830 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5832 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5833 return true;
5835 case ABS:
5836 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5838 /* FABS and FNEG are analogous. */
5839 if (speed)
5840 *cost += extra_cost->fp[mode == DFmode].neg;
5842 else
5844 /* Integer ABS will either be split into
5845 two arithmetic instructions, or will be an ABS
5846 (scalar), which we don't model. */
5847 *cost = COSTS_N_INSNS (2);
5848 if (speed)
5849 *cost += 2 * extra_cost->alu.arith;
5851 return false;
5853 case SMAX:
5854 case SMIN:
5855 if (speed)
5857 /* FMAXNM/FMINNM/FMAX/FMIN.
5858 TODO: This may not be accurate for all implementations, but
5859 we do not model this in the cost tables. */
5860 *cost += extra_cost->fp[mode == DFmode].addsub;
5862 return false;
5864 case UNSPEC:
5865 /* The floating point round to integer frint* instructions. */
5866 if (aarch64_frint_unspec_p (XINT (x, 1)))
5868 if (speed)
5869 *cost += extra_cost->fp[mode == DFmode].roundint;
5871 return false;
5874 if (XINT (x, 1) == UNSPEC_RBIT)
5876 if (speed)
5877 *cost += extra_cost->alu.rev;
5879 return false;
5881 break;
5883 case TRUNCATE:
5885 /* Decompose <su>muldi3_highpart. */
5886 if (/* (truncate:DI */
5887 mode == DImode
5888 /* (lshiftrt:TI */
5889 && GET_MODE (XEXP (x, 0)) == TImode
5890 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5891 /* (mult:TI */
5892 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5893 /* (ANY_EXTEND:TI (reg:DI))
5894 (ANY_EXTEND:TI (reg:DI))) */
5895 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5896 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5897 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5898 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5899 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5900 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5901 /* (const_int 64) */
5902 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5903 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5905 /* UMULH/SMULH. */
5906 if (speed)
5907 *cost += extra_cost->mult[mode == DImode].extend;
5908 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5909 MULT, 0, speed);
5910 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5911 MULT, 1, speed);
5912 return true;
5915 /* Fall through. */
5916 default:
5917 break;
5920 if (dump_file && (dump_flags & TDF_DETAILS))
5921 fprintf (dump_file,
5922 "\nFailed to cost RTX. Assuming default cost.\n");
5924 return true;
5927 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5928 calculated for X. This cost is stored in *COST. Returns true
5929 if the total cost of X was calculated. */
5930 static bool
5931 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5932 int param, int *cost, bool speed)
5934 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5936 if (dump_file && (dump_flags & TDF_DETAILS))
5938 print_rtl_single (dump_file, x);
5939 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5940 speed ? "Hot" : "Cold",
5941 *cost, result ? "final" : "partial");
5944 return result;
5947 static int
5948 aarch64_register_move_cost (enum machine_mode mode,
5949 reg_class_t from_i, reg_class_t to_i)
5951 enum reg_class from = (enum reg_class) from_i;
5952 enum reg_class to = (enum reg_class) to_i;
5953 const struct cpu_regmove_cost *regmove_cost
5954 = aarch64_tune_params->regmove_cost;
5956 /* Moving between a GPR and the stack register costs the same as GP2GP. */
5957 if ((from == GENERAL_REGS && to == STACK_REG)
5958 || (to == GENERAL_REGS && from == STACK_REG))
5959 return regmove_cost->GP2GP;
5961 /* To/From the stack register, we move via the gprs. */
5962 if (to == STACK_REG || from == STACK_REG)
5963 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5964 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5966 if (from == GENERAL_REGS && to == GENERAL_REGS)
5967 return regmove_cost->GP2GP;
5968 else if (from == GENERAL_REGS)
5969 return regmove_cost->GP2FP;
5970 else if (to == GENERAL_REGS)
5971 return regmove_cost->FP2GP;
5973 /* When AdvSIMD instructions are disabled it is not possible to move
5974 a 128-bit value directly between Q registers. This is handled in
5975 secondary reload. A general register is used as a scratch to move
5976 the upper DI value and the lower DI value is moved directly,
5977 hence the cost is the sum of three moves. */
5978 if (! TARGET_SIMD && GET_MODE_SIZE (mode) == 16)
5979 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
5981 return regmove_cost->FP2FP;
5984 static int
5985 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
5986 reg_class_t rclass ATTRIBUTE_UNUSED,
5987 bool in ATTRIBUTE_UNUSED)
5989 return aarch64_tune_params->memmov_cost;
5992 /* Return the number of instructions that can be issued per cycle. */
5993 static int
5994 aarch64_sched_issue_rate (void)
5996 return aarch64_tune_params->issue_rate;
5999 /* Vectorizer cost model target hooks. */
6001 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6002 static int
6003 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6004 tree vectype,
6005 int misalign ATTRIBUTE_UNUSED)
6007 unsigned elements;
6009 switch (type_of_cost)
6011 case scalar_stmt:
6012 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6014 case scalar_load:
6015 return aarch64_tune_params->vec_costs->scalar_load_cost;
6017 case scalar_store:
6018 return aarch64_tune_params->vec_costs->scalar_store_cost;
6020 case vector_stmt:
6021 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6023 case vector_load:
6024 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6026 case vector_store:
6027 return aarch64_tune_params->vec_costs->vec_store_cost;
6029 case vec_to_scalar:
6030 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6032 case scalar_to_vec:
6033 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6035 case unaligned_load:
6036 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6038 case unaligned_store:
6039 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6041 case cond_branch_taken:
6042 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6044 case cond_branch_not_taken:
6045 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6047 case vec_perm:
6048 case vec_promote_demote:
6049 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6051 case vec_construct:
6052 elements = TYPE_VECTOR_SUBPARTS (vectype);
6053 return elements / 2 + 1;
6055 default:
6056 gcc_unreachable ();
6060 /* Implement targetm.vectorize.add_stmt_cost. */
6061 static unsigned
6062 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6063 struct _stmt_vec_info *stmt_info, int misalign,
6064 enum vect_cost_model_location where)
6066 unsigned *cost = (unsigned *) data;
6067 unsigned retval = 0;
6069 if (flag_vect_cost_model)
6071 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6072 int stmt_cost =
6073 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6075 /* Statements in an inner loop relative to the loop being
6076 vectorized are weighted more heavily. The value here is
6077 a function (linear for now) of the loop nest level. */
6078 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6080 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6081 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6082 unsigned nest_level = loop_depth (loop);
6084 count *= nest_level;
6087 retval = (unsigned) (count * stmt_cost);
6088 cost[where] += retval;
6091 return retval;
6094 static void initialize_aarch64_code_model (void);
6096 /* Parse the architecture extension string. */
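/* For example, an extension string such as "+crypto+nofp" is walked left to
right: "crypto" ORs that entry's flags_on bits into aarch64_isa_flags, while
the "no" prefix on "fp" clears that entry's flags_off bits. */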
6098 static void
6099 aarch64_parse_extension (char *str)
6101 /* The extension string is parsed left to right. */
6102 const struct aarch64_option_extension *opt = NULL;
6104 /* Flag to say whether we are adding or removing an extension. */
6105 int adding_ext = -1;
6107 while (str != NULL && *str != 0)
6109 char *ext;
6110 size_t len;
6112 str++;
6113 ext = strchr (str, '+');
6115 if (ext != NULL)
6116 len = ext - str;
6117 else
6118 len = strlen (str);
6120 if (len >= 2 && strncmp (str, "no", 2) == 0)
6122 adding_ext = 0;
6123 len -= 2;
6124 str += 2;
6126 else if (len > 0)
6127 adding_ext = 1;
6129 if (len == 0)
6131 error ("missing feature modifier after %qs", "+no");
6132 return;
6135 /* Scan over the extensions table trying to find an exact match. */
6136 for (opt = all_extensions; opt->name != NULL; opt++)
6138 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6140 /* Add or remove the extension. */
6141 if (adding_ext)
6142 aarch64_isa_flags |= opt->flags_on;
6143 else
6144 aarch64_isa_flags &= ~(opt->flags_off);
6145 break;
6149 if (opt->name == NULL)
6151 /* Extension not found in list. */
6152 error ("unknown feature modifier %qs", str);
6153 return;
6156 str = ext;
6159 return;
6162 /* Parse the ARCH string. */
6164 static void
6165 aarch64_parse_arch (void)
6167 char *ext;
6168 const struct processor *arch;
6169 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6170 size_t len;
6172 strcpy (str, aarch64_arch_string);
6174 ext = strchr (str, '+');
6176 if (ext != NULL)
6177 len = ext - str;
6178 else
6179 len = strlen (str);
6181 if (len == 0)
6183 error ("missing arch name in -march=%qs", str);
6184 return;
6187 /* Loop through the list of supported ARCHs to find a match. */
6188 for (arch = all_architectures; arch->name != NULL; arch++)
6190 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6192 selected_arch = arch;
6193 aarch64_isa_flags = selected_arch->flags;
6195 if (!selected_cpu)
6196 selected_cpu = &all_cores[selected_arch->core];
6198 if (ext != NULL)
6200 /* ARCH string contains at least one extension. */
6201 aarch64_parse_extension (ext);
6204 if (strcmp (selected_arch->arch, selected_cpu->arch))
6206 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6207 selected_cpu->name, selected_arch->name);
6210 return;
6214 /* ARCH name not found in list. */
6215 error ("unknown value %qs for -march", str);
6216 return;
6219 /* Parse the CPU string. */
6221 static void
6222 aarch64_parse_cpu (void)
6224 char *ext;
6225 const struct processor *cpu;
6226 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6227 size_t len;
6229 strcpy (str, aarch64_cpu_string);
6231 ext = strchr (str, '+');
6233 if (ext != NULL)
6234 len = ext - str;
6235 else
6236 len = strlen (str);
6238 if (len == 0)
6240 error ("missing cpu name in -mcpu=%qs", str);
6241 return;
6244 /* Loop through the list of supported CPUs to find a match. */
6245 for (cpu = all_cores; cpu->name != NULL; cpu++)
6247 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6249 selected_cpu = cpu;
6250 selected_tune = cpu;
6251 aarch64_isa_flags = selected_cpu->flags;
6253 if (ext != NULL)
6255 /* CPU string contains at least one extension. */
6256 aarch64_parse_extension (ext);
6259 return;
6263 /* CPU name not found in list. */
6264 error ("unknown value %qs for -mcpu", str);
6265 return;
6268 /* Parse the TUNE string. */
6270 static void
6271 aarch64_parse_tune (void)
6273 const struct processor *cpu;
6274 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6275 strcpy (str, aarch64_tune_string);
6277 /* Loop through the list of supported CPUs to find a match. */
6278 for (cpu = all_cores; cpu->name != NULL; cpu++)
6280 if (strcmp (cpu->name, str) == 0)
6282 selected_tune = cpu;
6283 return;
6287 /* CPU name not found in list. */
6288 error ("unknown value %qs for -mtune", str);
6289 return;
6293 /* Implement TARGET_OPTION_OVERRIDE. */
6295 static void
6296 aarch64_override_options (void)
6298 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6299 If either of -march or -mtune is given, they override their
6300 respective component of -mcpu.
6302 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6303 with -march: if -mcpu is not present on the command line, -march
6304 must set a sensible default CPU. */
6305 if (aarch64_cpu_string)
6307 aarch64_parse_cpu ();
6310 if (aarch64_arch_string)
6312 aarch64_parse_arch ();
6315 if (aarch64_tune_string)
6317 aarch64_parse_tune ();
6320 #ifndef HAVE_AS_MABI_OPTION
6321 /* The compiler may have been configured with 2.23.* binutils, which does
6322 not have support for ILP32. */
6323 if (TARGET_ILP32)
6324 error ("Assembler does not support -mabi=ilp32");
6325 #endif
6327 initialize_aarch64_code_model ();
6329 aarch64_build_bitmask_table ();
6331 /* This target defaults to strict volatile bitfields. */
6332 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6333 flag_strict_volatile_bitfields = 1;
6335 /* If the user did not specify a processor, choose the default
6336 one for them. This will be the CPU set during configuration using
6337 --with-cpu; otherwise it is "generic". */
6338 if (!selected_cpu)
6340 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6341 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6344 gcc_assert (selected_cpu);
6346 /* The selected cpu may be an architecture, so look up tuning by core ID. */
6347 if (!selected_tune)
6348 selected_tune = &all_cores[selected_cpu->core];
6350 aarch64_tune_flags = selected_tune->flags;
6351 aarch64_tune = selected_tune->core;
6352 aarch64_tune_params = selected_tune->tune;
6354 aarch64_override_options_after_change ();
6357 /* Implement targetm.override_options_after_change. */
6359 static void
6360 aarch64_override_options_after_change (void)
6362 if (flag_omit_frame_pointer)
6363 flag_omit_leaf_frame_pointer = false;
6364 else if (flag_omit_leaf_frame_pointer)
6365 flag_omit_frame_pointer = true;
6368 static struct machine_function *
6369 aarch64_init_machine_status (void)
6371 struct machine_function *machine;
6372 machine = ggc_alloc_cleared_machine_function ();
6373 return machine;
6376 void
6377 aarch64_init_expanders (void)
6379 init_machine_status = aarch64_init_machine_status;
6382 /* A checking mechanism for the implementation of the various code models. */
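/* For example, -mcmodel=small together with -fpic or -fPIC selects
AARCH64_CMODEL_SMALL_PIC, while the large code model is reported as
unsupported when PIC is requested. */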
6383 static void
6384 initialize_aarch64_code_model (void)
6386 if (flag_pic)
6388 switch (aarch64_cmodel_var)
6390 case AARCH64_CMODEL_TINY:
6391 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6392 break;
6393 case AARCH64_CMODEL_SMALL:
6394 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6395 break;
6396 case AARCH64_CMODEL_LARGE:
6397 sorry ("code model %qs with -f%s", "large",
6398 flag_pic > 1 ? "PIC" : "pic");
6399 default:
6400 gcc_unreachable ();
6403 else
6404 aarch64_cmodel = aarch64_cmodel_var;
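/* A sketch of the mapping implemented above:

     -mcmodel=tiny  -fpic          ->  AARCH64_CMODEL_TINY_PIC
     -mcmodel=small -fPIC          ->  AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic/-fPIC    ->  rejected via sorry ()
     -mcmodel=small, no -fpic      ->  AARCH64_CMODEL_SMALL  */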
6407 /* Return true if SYMBOL_REF X binds locally. */
6409 static bool
6410 aarch64_symbol_binds_local_p (const_rtx x)
6412 return (SYMBOL_REF_DECL (x)
6413 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6414 : SYMBOL_REF_LOCAL_P (x));
6417 /* Return true if SYMBOL_REF X is thread-local. */
6418 static bool
6419 aarch64_tls_symbol_p (rtx x)
6421 if (! TARGET_HAVE_TLS)
6422 return false;
6424 if (GET_CODE (x) != SYMBOL_REF)
6425 return false;
6427 return SYMBOL_REF_TLS_MODEL (x) != 0;
6430 /* Classify a TLS symbol into one of the TLS kinds. */
6431 enum aarch64_symbol_type
6432 aarch64_classify_tls_symbol (rtx x)
6434 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6436 switch (tls_kind)
6438 case TLS_MODEL_GLOBAL_DYNAMIC:
6439 case TLS_MODEL_LOCAL_DYNAMIC:
6440 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6442 case TLS_MODEL_INITIAL_EXEC:
6443 return SYMBOL_SMALL_GOTTPREL;
6445 case TLS_MODEL_LOCAL_EXEC:
6446 return SYMBOL_SMALL_TPREL;
6448 case TLS_MODEL_EMULATED:
6449 case TLS_MODEL_NONE:
6450 return SYMBOL_FORCE_TO_MEM;
6452 default:
6453 gcc_unreachable ();
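/* For instance, a TLS access such as

     __thread int counter;
     int get (void) { return counter; }

   compiled with -fPIC uses the global-/local-dynamic model and is
   classified as SYMBOL_SMALL_TLSDESC when TLS descriptors are enabled
   (SYMBOL_SMALL_TLSGD otherwise), while -ftls-model=initial-exec yields
   SYMBOL_SMALL_GOTTPREL.  This is only a sketch; the model ultimately
   comes from tls_symbolic_operand_type above.  */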
6457 /* Return the method that should be used to access SYMBOL_REF or
6458 LABEL_REF X in context CONTEXT. */
6460 enum aarch64_symbol_type
6461 aarch64_classify_symbol (rtx x,
6462 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6464 if (GET_CODE (x) == LABEL_REF)
6466 switch (aarch64_cmodel)
6468 case AARCH64_CMODEL_LARGE:
6469 return SYMBOL_FORCE_TO_MEM;
6471 case AARCH64_CMODEL_TINY_PIC:
6472 case AARCH64_CMODEL_TINY:
6473 return SYMBOL_TINY_ABSOLUTE;
6475 case AARCH64_CMODEL_SMALL_PIC:
6476 case AARCH64_CMODEL_SMALL:
6477 return SYMBOL_SMALL_ABSOLUTE;
6479 default:
6480 gcc_unreachable ();
6484 if (GET_CODE (x) == SYMBOL_REF)
6486 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6487 return SYMBOL_FORCE_TO_MEM;
6489 if (aarch64_tls_symbol_p (x))
6490 return aarch64_classify_tls_symbol (x);
6492 switch (aarch64_cmodel)
6494 case AARCH64_CMODEL_TINY:
6495 if (SYMBOL_REF_WEAK (x))
6496 return SYMBOL_FORCE_TO_MEM;
6497 return SYMBOL_TINY_ABSOLUTE;
6499 case AARCH64_CMODEL_SMALL:
6500 if (SYMBOL_REF_WEAK (x))
6501 return SYMBOL_FORCE_TO_MEM;
6502 return SYMBOL_SMALL_ABSOLUTE;
6504 case AARCH64_CMODEL_TINY_PIC:
6505 if (!aarch64_symbol_binds_local_p (x))
6506 return SYMBOL_TINY_GOT;
6507 return SYMBOL_TINY_ABSOLUTE;
6509 case AARCH64_CMODEL_SMALL_PIC:
6510 if (!aarch64_symbol_binds_local_p (x))
6511 return SYMBOL_SMALL_GOT;
6512 return SYMBOL_SMALL_ABSOLUTE;
6514 default:
6515 gcc_unreachable ();
6519 /* By default push everything into the constant pool. */
6520 return SYMBOL_FORCE_TO_MEM;
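/* A rough example of the classification above: under the default
   -mcmodel=small a locally bound symbol is SYMBOL_SMALL_ABSOLUTE
   (adrp/add addressing), the same symbol becomes SYMBOL_SMALL_GOT under
   -fPIC when it does not bind locally, and under -mcmodel=large every
   symbol is SYMBOL_FORCE_TO_MEM (literal pool).  */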
6523 bool
6524 aarch64_constant_address_p (rtx x)
6526 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6529 bool
6530 aarch64_legitimate_pic_operand_p (rtx x)
6532 if (GET_CODE (x) == SYMBOL_REF
6533 || (GET_CODE (x) == CONST
6534 && GET_CODE (XEXP (x, 0)) == PLUS
6535 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6536 return false;
6538 return true;
6541 /* Return true if X holds a floating-point constant that is either
6542 +0.0 or representable as a quarter-precision (FMOV) immediate. */
6543 static bool
6544 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6546 if (!CONST_DOUBLE_P (x))
6547 return false;
6549 /* TODO: We could handle moving 0.0 to a TFmode register,
6550 but first we would like to refactor the movtf_aarch64
6551 pattern to be more amenable to splitting moves properly and
6552 to gate correctly on TARGET_SIMD.  For now, reject all
6553 constants that are not destined for SFmode or DFmode registers. */
6554 if (!(mode == SFmode || mode == DFmode))
6555 return false;
6557 if (aarch64_float_const_zero_rtx_p (x))
6558 return true;
6559 return aarch64_float_const_representable_p (x);
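/* A few concrete cases, as a sketch only (aarch64_float_const_representable_p
   is the authority): 0.0, 0.5, 1.0 and 31.0 are all accepted for SFmode or
   DFmode, either as +0.0 or as an FMOV immediate of the form n/16 * 2^r with
   16 <= n <= 31 and -3 <= r <= 4, whereas 0.1 is rejected, as is any TFmode
   constant for now.  */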
6562 static bool
6563 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6565 /* Do not allow vector struct mode constants. We could support
6566 0 and -1 easily, but they need support in aarch64-simd.md. */
6567 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6568 return false;
6570 /* This could probably go away because
6571 we now decompose CONST_INTs according to expand_mov_immediate. */
6572 if ((GET_CODE (x) == CONST_VECTOR
6573 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6574 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6575 return !targetm.cannot_force_const_mem (mode, x);
6577 if (GET_CODE (x) == HIGH
6578 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6579 return true;
6581 return aarch64_constant_address_p (x);
6584 static rtx
6585 aarch64_load_tp (rtx target)
6587 if (!target
6588 || GET_MODE (target) != Pmode
6589 || !register_operand (target, Pmode))
6590 target = gen_reg_rtx (Pmode);
6592 /* Can return in any reg. */
6593 emit_insn (gen_aarch64_load_tp_hard (target));
6594 return target;
6597 /* On AAPCS systems, this is the "struct __va_list". */
6598 static GTY(()) tree va_list_type;
6600 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6601 Return the type to use as __builtin_va_list.
6603 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6605 struct __va_list
6607 void *__stack;
6608 void *__gr_top;
6609 void *__vr_top;
6610 int __gr_offs;
6611 int __vr_offs;
6612 }; */
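/* A minimal illustration of how that record is consumed (hypothetical user
   code, not part of the compiler; the va_list below is lowered to the
   five-field __va_list record):

     #include <stdarg.h>

     int
     sum (int count, ...)
     {
       va_list ap;
       int i, total = 0;

       va_start (ap, count);
       for (i = 0; i < count; i++)
         total += va_arg (ap, int);
       va_end (ap);
       return total;
     }

   Integer arguments are fetched through __gr_top/__gr_offs, FP/SIMD
   arguments through __vr_top/__vr_offs, and any overflow arguments
   through __stack.  */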
6614 static tree
6615 aarch64_build_builtin_va_list (void)
6617 tree va_list_name;
6618 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6620 /* Create the type. */
6621 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6622 /* Give it the required name. */
6623 va_list_name = build_decl (BUILTINS_LOCATION,
6624 TYPE_DECL,
6625 get_identifier ("__va_list"),
6626 va_list_type);
6627 DECL_ARTIFICIAL (va_list_name) = 1;
6628 TYPE_NAME (va_list_type) = va_list_name;
6629 TYPE_STUB_DECL (va_list_type) = va_list_name;
6631 /* Create the fields. */
6632 f_stack = build_decl (BUILTINS_LOCATION,
6633 FIELD_DECL, get_identifier ("__stack"),
6634 ptr_type_node);
6635 f_grtop = build_decl (BUILTINS_LOCATION,
6636 FIELD_DECL, get_identifier ("__gr_top"),
6637 ptr_type_node);
6638 f_vrtop = build_decl (BUILTINS_LOCATION,
6639 FIELD_DECL, get_identifier ("__vr_top"),
6640 ptr_type_node);
6641 f_groff = build_decl (BUILTINS_LOCATION,
6642 FIELD_DECL, get_identifier ("__gr_offs"),
6643 integer_type_node);
6644 f_vroff = build_decl (BUILTINS_LOCATION,
6645 FIELD_DECL, get_identifier ("__vr_offs"),
6646 integer_type_node);
6648 DECL_ARTIFICIAL (f_stack) = 1;
6649 DECL_ARTIFICIAL (f_grtop) = 1;
6650 DECL_ARTIFICIAL (f_vrtop) = 1;
6651 DECL_ARTIFICIAL (f_groff) = 1;
6652 DECL_ARTIFICIAL (f_vroff) = 1;
6654 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6655 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6656 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6657 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6658 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6660 TYPE_FIELDS (va_list_type) = f_stack;
6661 DECL_CHAIN (f_stack) = f_grtop;
6662 DECL_CHAIN (f_grtop) = f_vrtop;
6663 DECL_CHAIN (f_vrtop) = f_groff;
6664 DECL_CHAIN (f_groff) = f_vroff;
6666 /* Compute its layout. */
6667 layout_type (va_list_type);
6669 return va_list_type;
6672 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6673 static void
6674 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6676 const CUMULATIVE_ARGS *cum;
6677 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6678 tree stack, grtop, vrtop, groff, vroff;
6679 tree t;
6680 int gr_save_area_size;
6681 int vr_save_area_size;
6682 int vr_offset;
6684 cum = &crtl->args.info;
6685 gr_save_area_size
6686 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6687 vr_save_area_size
6688 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6690 if (TARGET_GENERAL_REGS_ONLY)
6692 if (cum->aapcs_nvrn > 0)
6693 sorry ("%qs and floating point or vector arguments",
6694 "-mgeneral-regs-only");
6695 vr_save_area_size = 0;
6698 f_stack = TYPE_FIELDS (va_list_type_node);
6699 f_grtop = DECL_CHAIN (f_stack);
6700 f_vrtop = DECL_CHAIN (f_grtop);
6701 f_groff = DECL_CHAIN (f_vrtop);
6702 f_vroff = DECL_CHAIN (f_groff);
6704 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6705 NULL_TREE);
6706 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6707 NULL_TREE);
6708 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6709 NULL_TREE);
6710 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6711 NULL_TREE);
6712 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6713 NULL_TREE);
6715 /* Emit code to initialize STACK, which points to the next varargs stack
6716 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6717 by named arguments. STACK is 8-byte aligned. */
6718 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6719 if (cum->aapcs_stack_size > 0)
6720 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6721 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6722 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6724 /* Emit code to initialize GRTOP, the top of the GR save area.
6725 virtual_incoming_args_rtx should have been 16 byte aligned. */
6726 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6727 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6728 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6730 /* Emit code to initialize VRTOP, the top of the VR save area.
6731 This address is gr_save_area_bytes below GRTOP, rounded
6732 down to the next 16-byte boundary. */
6733 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6734 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6735 STACK_BOUNDARY / BITS_PER_UNIT);
6737 if (vr_offset)
6738 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6739 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6740 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6742 /* Emit code to initialize GROFF, the offset from GRTOP of the
6743 next GPR argument. */
6744 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6745 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6746 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6748 /* Likewise emit code to initialize VROFF, the offset from VRTOP
6749 of the next VR argument. */
6750 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6751 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6752 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6755 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6757 static tree
6758 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6759 gimple_seq *post_p ATTRIBUTE_UNUSED)
6761 tree addr;
6762 bool indirect_p;
6763 bool is_ha; /* is HFA or HVA. */
6764 bool dw_align; /* double-word align. */
6765 enum machine_mode ag_mode = VOIDmode;
6766 int nregs;
6767 enum machine_mode mode;
6769 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6770 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6771 HOST_WIDE_INT size, rsize, adjust, align;
6772 tree t, u, cond1, cond2;
6774 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6775 if (indirect_p)
6776 type = build_pointer_type (type);
6778 mode = TYPE_MODE (type);
6780 f_stack = TYPE_FIELDS (va_list_type_node);
6781 f_grtop = DECL_CHAIN (f_stack);
6782 f_vrtop = DECL_CHAIN (f_grtop);
6783 f_groff = DECL_CHAIN (f_vrtop);
6784 f_vroff = DECL_CHAIN (f_groff);
6786 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6787 f_stack, NULL_TREE);
6788 size = int_size_in_bytes (type);
6789 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6791 dw_align = false;
6792 adjust = 0;
6793 if (aarch64_vfp_is_call_or_return_candidate (mode,
6794 type,
6795 &ag_mode,
6796 &nregs,
6797 &is_ha))
6799 /* TYPE passed in fp/simd registers. */
6800 if (TARGET_GENERAL_REGS_ONLY)
6801 sorry ("%qs and floating point or vector arguments",
6802 "-mgeneral-regs-only");
6804 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6805 unshare_expr (valist), f_vrtop, NULL_TREE);
6806 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6807 unshare_expr (valist), f_vroff, NULL_TREE);
6809 rsize = nregs * UNITS_PER_VREG;
6811 if (is_ha)
6813 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6814 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6816 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6817 && size < UNITS_PER_VREG)
6819 adjust = UNITS_PER_VREG - size;
6822 else
6824 /* TYPE passed in general registers. */
6825 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6826 unshare_expr (valist), f_grtop, NULL_TREE);
6827 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6828 unshare_expr (valist), f_groff, NULL_TREE);
6829 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6830 nregs = rsize / UNITS_PER_WORD;
6832 if (align > 8)
6833 dw_align = true;
6835 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6836 && size < UNITS_PER_WORD)
6838 adjust = UNITS_PER_WORD - size;
6842 /* Get a local temporary for the field value. */
6843 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6845 /* Emit code to branch if off >= 0. */
6846 t = build2 (GE_EXPR, boolean_type_node, off,
6847 build_int_cst (TREE_TYPE (off), 0));
6848 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6850 if (dw_align)
6852 /* Emit: offs = (offs + 15) & -16. */
6853 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6854 build_int_cst (TREE_TYPE (off), 15));
6855 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6856 build_int_cst (TREE_TYPE (off), -16));
6857 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6859 else
6860 roundup = NULL;
6862 /* Update ap.__[g|v]r_offs */
6863 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6864 build_int_cst (TREE_TYPE (off), rsize));
6865 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6867 /* String up. */
6868 if (roundup)
6869 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6871 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6872 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6873 build_int_cst (TREE_TYPE (f_off), 0));
6874 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6876 /* String up: make sure the assignment happens before the use. */
6877 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6878 COND_EXPR_ELSE (cond1) = t;
6880 /* Prepare the trees handling the argument that is passed on the stack;
6881 the top-level node will be stored in ON_STACK. */
6882 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6883 if (align > 8)
6885 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
6886 t = fold_convert (intDI_type_node, arg);
6887 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6888 build_int_cst (TREE_TYPE (t), 15));
6889 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6890 build_int_cst (TREE_TYPE (t), -16));
6891 t = fold_convert (TREE_TYPE (arg), t);
6892 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6894 else
6895 roundup = NULL;
6896 /* Advance ap.__stack */
6897 t = fold_convert (intDI_type_node, arg);
6898 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6899 build_int_cst (TREE_TYPE (t), size + 7));
6900 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6901 build_int_cst (TREE_TYPE (t), -8));
6902 t = fold_convert (TREE_TYPE (arg), t);
6903 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6904 /* String up roundup and advance. */
6905 if (roundup)
6906 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6907 /* String up with arg */
6908 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6909 /* Big-endianness related address adjustment. */
6910 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6911 && size < UNITS_PER_WORD)
6913 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6914 size_int (UNITS_PER_WORD - size));
6915 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6918 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6919 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6921 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
6922 t = off;
6923 if (adjust)
6924 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6925 build_int_cst (TREE_TYPE (off), adjust));
6927 t = fold_convert (sizetype, t);
6928 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6930 if (is_ha)
6932 /* type ha; // treat as "struct {ftype field[n];}"
6933 ... [computing offs]
6934 for (i = 0; i < nregs; ++i, offs += 16)
6935 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6936 return ha; */
6937 int i;
6938 tree tmp_ha, field_t, field_ptr_t;
6940 /* Declare a local variable. */
6941 tmp_ha = create_tmp_var_raw (type, "ha");
6942 gimple_add_tmp_var (tmp_ha);
6944 /* Establish the base type. */
6945 switch (ag_mode)
6947 case SFmode:
6948 field_t = float_type_node;
6949 field_ptr_t = float_ptr_type_node;
6950 break;
6951 case DFmode:
6952 field_t = double_type_node;
6953 field_ptr_t = double_ptr_type_node;
6954 break;
6955 case TFmode:
6956 field_t = long_double_type_node;
6957 field_ptr_t = long_double_ptr_type_node;
6958 break;
6959 /* Half-precision and quad-precision types are not fully supported yet.
6960 Enable the following code once support is complete; the correct type
6961 node for __fp16 * still needs to be found. */
6962 #if 0
6963 case HFmode:
6964 field_t = float_type_node;
6965 field_ptr_t = float_ptr_type_node;
6966 break;
6967 #endif
6968 case V2SImode:
6969 case V4SImode:
6971 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
6972 field_t = build_vector_type_for_mode (innertype, ag_mode);
6973 field_ptr_t = build_pointer_type (field_t);
6975 break;
6976 default:
6977 gcc_assert (0);
6980 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
6981 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
6982 addr = t;
6983 t = fold_convert (field_ptr_t, addr);
6984 t = build2 (MODIFY_EXPR, field_t,
6985 build1 (INDIRECT_REF, field_t, tmp_ha),
6986 build1 (INDIRECT_REF, field_t, t));
6988 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
6989 for (i = 1; i < nregs; ++i)
6991 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
6992 u = fold_convert (field_ptr_t, addr);
6993 u = build2 (MODIFY_EXPR, field_t,
6994 build2 (MEM_REF, field_t, tmp_ha,
6995 build_int_cst (field_ptr_t,
6996 (i *
6997 int_size_in_bytes (field_t)))),
6998 build1 (INDIRECT_REF, field_t, u));
6999 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7002 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7003 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7006 COND_EXPR_ELSE (cond2) = t;
7007 addr = fold_convert (build_pointer_type (type), cond1);
7008 addr = build_va_arg_indirect_ref (addr);
7010 if (indirect_p)
7011 addr = build_va_arg_indirect_ref (addr);
7013 return addr;
7016 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7018 static void
7019 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7020 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7021 int no_rtl)
7023 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7024 CUMULATIVE_ARGS local_cum;
7025 int gr_saved, vr_saved;
7027 /* The caller has advanced CUM up to, but not beyond, the last named
7028 argument. Advance a local copy of CUM past the last "real" named
7029 argument, to find out how many registers are left over. */
7030 local_cum = *cum;
7031 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
7033 /* Found out how many registers we need to save. */
7034 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7035 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7037 if (TARGET_GENERAL_REGS_ONLY)
7039 if (local_cum.aapcs_nvrn > 0)
7040 sorry ("%qs and floating point or vector arguments",
7041 "-mgeneral-regs-only");
7042 vr_saved = 0;
7045 if (!no_rtl)
7047 if (gr_saved > 0)
7049 rtx ptr, mem;
7051 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7052 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7053 - gr_saved * UNITS_PER_WORD);
7054 mem = gen_frame_mem (BLKmode, ptr);
7055 set_mem_alias_set (mem, get_varargs_alias_set ());
7057 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7058 mem, gr_saved);
7060 if (vr_saved > 0)
7062 /* We can't use move_block_from_reg, because it will use
7063 the wrong mode, storing D regs only. */
7064 enum machine_mode mode = TImode;
7065 int off, i;
7067 /* Set OFF to the offset from virtual_incoming_args_rtx of
7068 the first vector register. The VR save area lies below
7069 the GR one, and is aligned to 16 bytes. */
7070 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7071 STACK_BOUNDARY / BITS_PER_UNIT);
7072 off -= vr_saved * UNITS_PER_VREG;
7074 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7076 rtx ptr, mem;
7078 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7079 mem = gen_frame_mem (mode, ptr);
7080 set_mem_alias_set (mem, get_varargs_alias_set ());
7081 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7082 off += UNITS_PER_VREG;
7087 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7088 any complication of having crtl->args.pretend_args_size changed. */
7089 cfun->machine->frame.saved_varargs_size
7090 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7091 STACK_BOUNDARY / BITS_PER_UNIT)
7092 + vr_saved * UNITS_PER_VREG);
7095 static void
7096 aarch64_conditional_register_usage (void)
7098 int i;
7099 if (!TARGET_FLOAT)
7101 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7103 fixed_regs[i] = 1;
7104 call_used_regs[i] = 1;
7109 /* Walk down the type tree of TYPE counting consecutive base elements.
7110 If *MODEP is VOIDmode, then set it to the first valid floating point
7111 type. If a non-floating point type is found, or if a floating point
7112 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7113 otherwise return the count in the sub-tree. */
7114 static int
7115 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
7117 enum machine_mode mode;
7118 HOST_WIDE_INT size;
7120 switch (TREE_CODE (type))
7122 case REAL_TYPE:
7123 mode = TYPE_MODE (type);
7124 if (mode != DFmode && mode != SFmode && mode != TFmode)
7125 return -1;
7127 if (*modep == VOIDmode)
7128 *modep = mode;
7130 if (*modep == mode)
7131 return 1;
7133 break;
7135 case COMPLEX_TYPE:
7136 mode = TYPE_MODE (TREE_TYPE (type));
7137 if (mode != DFmode && mode != SFmode && mode != TFmode)
7138 return -1;
7140 if (*modep == VOIDmode)
7141 *modep = mode;
7143 if (*modep == mode)
7144 return 2;
7146 break;
7148 case VECTOR_TYPE:
7149 /* Use V2SImode and V4SImode as representatives of all 64-bit
7150 and 128-bit vector types. */
7151 size = int_size_in_bytes (type);
7152 switch (size)
7154 case 8:
7155 mode = V2SImode;
7156 break;
7157 case 16:
7158 mode = V4SImode;
7159 break;
7160 default:
7161 return -1;
7164 if (*modep == VOIDmode)
7165 *modep = mode;
7167 /* Vector modes are considered to be opaque: two vectors are
7168 equivalent for the purposes of being homogeneous aggregates
7169 if they are the same size. */
7170 if (*modep == mode)
7171 return 1;
7173 break;
7175 case ARRAY_TYPE:
7177 int count;
7178 tree index = TYPE_DOMAIN (type);
7180 /* Can't handle incomplete types. */
7181 if (!COMPLETE_TYPE_P (type))
7182 return -1;
7184 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7185 if (count == -1
7186 || !index
7187 || !TYPE_MAX_VALUE (index)
7188 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7189 || !TYPE_MIN_VALUE (index)
7190 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7191 || count < 0)
7192 return -1;
7194 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7195 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7197 /* There must be no padding. */
7198 if (!tree_fits_uhwi_p (TYPE_SIZE (type))
7199 || ((HOST_WIDE_INT) tree_to_uhwi (TYPE_SIZE (type))
7200 != count * GET_MODE_BITSIZE (*modep)))
7201 return -1;
7203 return count;
7206 case RECORD_TYPE:
7208 int count = 0;
7209 int sub_count;
7210 tree field;
7212 /* Can't handle incomplete types. */
7213 if (!COMPLETE_TYPE_P (type))
7214 return -1;
7216 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7218 if (TREE_CODE (field) != FIELD_DECL)
7219 continue;
7221 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7222 if (sub_count < 0)
7223 return -1;
7224 count += sub_count;
7227 /* There must be no padding. */
7228 if (!tree_fits_uhwi_p (TYPE_SIZE (type))
7229 || ((HOST_WIDE_INT) tree_to_uhwi (TYPE_SIZE (type))
7230 != count * GET_MODE_BITSIZE (*modep)))
7231 return -1;
7233 return count;
7236 case UNION_TYPE:
7237 case QUAL_UNION_TYPE:
7239 /* These aren't very interesting except in a degenerate case. */
7240 int count = 0;
7241 int sub_count;
7242 tree field;
7244 /* Can't handle incomplete types. */
7245 if (!COMPLETE_TYPE_P (type))
7246 return -1;
7248 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7250 if (TREE_CODE (field) != FIELD_DECL)
7251 continue;
7253 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7254 if (sub_count < 0)
7255 return -1;
7256 count = count > sub_count ? count : sub_count;
7259 /* There must be no padding. */
7260 if (!tree_fits_uhwi_p (TYPE_SIZE (type))
7261 || ((HOST_WIDE_INT) tree_to_uhwi (TYPE_SIZE (type))
7262 != count * GET_MODE_BITSIZE (*modep)))
7263 return -1;
7265 return count;
7268 default:
7269 break;
7272 return -1;
7275 /* Return true if we use LRA instead of reload pass. */
7276 static bool
7277 aarch64_lra_p (void)
7279 return aarch64_lra_flag;
7282 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7283 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7284 array types. The C99 floating-point complex types are also considered
7285 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7286 types, which are GCC extensions and out of the scope of AAPCS64, are
7287 treated as composite types here as well.
7289 Note that MODE itself is not sufficient in determining whether a type
7290 is such a composite type or not. This is because
7291 stor-layout.c:compute_record_mode may have already changed the MODE
7292 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7293 structure with only one field may have its MODE set to the mode of the
7294 field. Also an integer mode whose size matches the size of the
7295 RECORD_TYPE type may be used to substitute the original mode
7296 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7297 solely relied on. */
7299 static bool
7300 aarch64_composite_type_p (const_tree type,
7301 enum machine_mode mode)
7303 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7304 return true;
7306 if (mode == BLKmode
7307 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7308 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7309 return true;
7311 return false;
7314 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7315 type as described in AAPCS64 \S 4.1.2.
7317 See the comment above aarch64_composite_type_p for the notes on MODE. */
7319 static bool
7320 aarch64_short_vector_p (const_tree type,
7321 enum machine_mode mode)
7323 HOST_WIDE_INT size = -1;
7325 if (type && TREE_CODE (type) == VECTOR_TYPE)
7326 size = int_size_in_bytes (type);
7327 else if (!aarch64_composite_type_p (type, mode)
7328 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7329 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7330 size = GET_MODE_SIZE (mode);
7332 return size == 8 || size == 16;
7335 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7336 shall be passed or returned in simd/fp register(s) (providing these
7337 parameter passing registers are available).
7339 Upon successful return, *COUNT returns the number of needed registers,
7340 *BASE_MODE returns the mode of the individual register and when IS_HA
7341 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7342 floating-point aggregate or a homogeneous short-vector aggregate. */
7344 static bool
7345 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7346 const_tree type,
7347 enum machine_mode *base_mode,
7348 int *count,
7349 bool *is_ha)
7351 enum machine_mode new_mode = VOIDmode;
7352 bool composite_p = aarch64_composite_type_p (type, mode);
7354 if (is_ha != NULL) *is_ha = false;
7356 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7357 || aarch64_short_vector_p (type, mode))
7359 *count = 1;
7360 new_mode = mode;
7362 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7364 if (is_ha != NULL) *is_ha = true;
7365 *count = 2;
7366 new_mode = GET_MODE_INNER (mode);
7368 else if (type && composite_p)
7370 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7372 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7374 if (is_ha != NULL) *is_ha = true;
7375 *count = ag_count;
7377 else
7378 return false;
7380 else
7381 return false;
7383 *base_mode = new_mode;
7384 return true;
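/* Some illustrative classifications, sketching the AAPCS64 rules the two
   functions above implement:

     struct { float x, y, z; }        HFA, 3 x SFmode registers
     _Complex double                  HFA, 2 x DFmode registers
     struct { float32x4_t a, b; }     HVA, 2 x 128-bit (V4SI) registers
     struct { float x; double y; }    mixed base types, not a candidate
     struct { float f[5]; }           5 > HA_MAX_NUM_FLDS, not a candidate  */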
7387 /* Implement TARGET_STRUCT_VALUE_RTX. */
7389 static rtx
7390 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7391 int incoming ATTRIBUTE_UNUSED)
7393 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7396 /* Implements target hook vector_mode_supported_p. */
7397 static bool
7398 aarch64_vector_mode_supported_p (enum machine_mode mode)
7400 if (TARGET_SIMD
7401 && (mode == V4SImode || mode == V8HImode
7402 || mode == V16QImode || mode == V2DImode
7403 || mode == V2SImode || mode == V4HImode
7404 || mode == V8QImode || mode == V2SFmode
7405 || mode == V4SFmode || mode == V2DFmode
7406 || mode == V1DFmode))
7407 return true;
7409 return false;
7412 /* Return appropriate SIMD container
7413 for MODE within a vector of WIDTH bits. */
7414 static enum machine_mode
7415 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7417 gcc_assert (width == 64 || width == 128);
7418 if (TARGET_SIMD)
7420 if (width == 128)
7421 switch (mode)
7423 case DFmode:
7424 return V2DFmode;
7425 case SFmode:
7426 return V4SFmode;
7427 case SImode:
7428 return V4SImode;
7429 case HImode:
7430 return V8HImode;
7431 case QImode:
7432 return V16QImode;
7433 case DImode:
7434 return V2DImode;
7435 default:
7436 break;
7438 else
7439 switch (mode)
7441 case SFmode:
7442 return V2SFmode;
7443 case SImode:
7444 return V2SImode;
7445 case HImode:
7446 return V4HImode;
7447 case QImode:
7448 return V8QImode;
7449 default:
7450 break;
7453 return word_mode;
7456 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7457 static enum machine_mode
7458 aarch64_preferred_simd_mode (enum machine_mode mode)
7460 return aarch64_simd_container_mode (mode, 128);
7463 /* Return the bitmask of possible vector sizes for the vectorizer
7464 to iterate over. */
7465 static unsigned int
7466 aarch64_autovectorize_vector_sizes (void)
7468 return (16 | 8);
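/* For example, with TARGET_SIMD the vectorizer is first offered the
   preferred 128-bit containers (SFmode -> V4SFmode, SImode -> V4SImode,
   QImode -> V16QImode, ...) and, because the size bitmask above is 16 | 8,
   can fall back to the 64-bit containers (V2SFmode, V2SImode, V8QImode,
   ...).  Without TARGET_SIMD only word_mode is returned and nothing is
   vectorized this way.  */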
7471 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7472 vector types in order to conform to the AAPCS64 (see "Procedure
7473 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7474 qualify for emission with the mangled names defined in that document,
7475 a vector type must not only be of the correct mode but also be
7476 composed of AdvSIMD vector element types (e.g.
7477 __builtin_aarch64_simd_qi); these types are registered by
7478 aarch64_init_simd_builtins (). In other words, vector types defined
7479 in other ways e.g. via vector_size attribute will get default
7480 mangled names. */
7481 typedef struct
7483 enum machine_mode mode;
7484 const char *element_type_name;
7485 const char *mangled_name;
7486 } aarch64_simd_mangle_map_entry;
7488 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7489 /* 64-bit containerized types. */
7490 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7491 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7492 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7493 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7494 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7495 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7496 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7497 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7498 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7499 /* 128-bit containerized types. */
7500 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7501 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7502 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7503 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7504 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7505 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7506 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7507 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7508 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7509 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7510 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7511 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7512 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7513 { VOIDmode, NULL, NULL }
7516 /* Implement TARGET_MANGLE_TYPE. */
7518 static const char *
7519 aarch64_mangle_type (const_tree type)
7521 /* The AArch64 ABI documents say that "__va_list" has to be
7522 mangled as if it is in the "std" namespace. */
7523 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7524 return "St9__va_list";
7526 /* Check the mode of the vector type, and the name of the vector
7527 element type, against the table. */
7528 if (TREE_CODE (type) == VECTOR_TYPE)
7530 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7532 while (pos->mode != VOIDmode)
7534 tree elt_type = TREE_TYPE (type);
7536 if (pos->mode == TYPE_MODE (type)
7537 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7538 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7539 pos->element_type_name))
7540 return pos->mangled_name;
7542 pos++;
7546 /* Use the default mangling. */
7547 return NULL;
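/* A mangling sketch, assuming the arm_neon.h typedefs built on top of the
   AdvSIMD element types registered by aarch64_init_simd_builtins:

     void f (int32x4_t);      mangles as  _Z1f11__Int32x4_t
     void g (float64x2_t);    mangles as  _Z1g13__Float64x2_t

   whereas  typedef int v4si __attribute__ ((vector_size (16)));  does not
   match any element_type_name in the table and gets the default vector
   mangling.  */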
7550 /* Return the equivalent letter for size. */
7551 static char
7552 sizetochar (int size)
7554 switch (size)
7556 case 64: return 'd';
7557 case 32: return 's';
7558 case 16: return 'h';
7559 case 8 : return 'b';
7560 default: gcc_unreachable ();
7564 /* Return true iff x is a uniform vector of floating-point
7565 constants, and the constant can be represented in
7566 quarter-precision form.  Note that, as aarch64_float_const_representable_p
7567 rejects both +0.0 and -0.0, this function also rejects +0.0 and -0.0. */
7568 static bool
7569 aarch64_vect_float_const_representable_p (rtx x)
7571 int i = 0;
7572 REAL_VALUE_TYPE r0, ri;
7573 rtx x0, xi;
7575 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7576 return false;
7578 x0 = CONST_VECTOR_ELT (x, 0);
7579 if (!CONST_DOUBLE_P (x0))
7580 return false;
7582 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7584 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7586 xi = CONST_VECTOR_ELT (x, i);
7587 if (!CONST_DOUBLE_P (xi))
7588 return false;
7590 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7591 if (!REAL_VALUES_EQUAL (r0, ri))
7592 return false;
7595 return aarch64_float_const_representable_p (x0);
7598 /* Return true for valid and false for invalid. */
7599 bool
7600 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7601 struct simd_immediate_info *info)
7603 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7604 matches = 1; \
7605 for (i = 0; i < idx; i += (STRIDE)) \
7606 if (!(TEST)) \
7607 matches = 0; \
7608 if (matches) \
7610 immtype = (CLASS); \
7611 elsize = (ELSIZE); \
7612 eshift = (SHIFT); \
7613 emvn = (NEG); \
7614 break; \
7617 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7618 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7619 unsigned char bytes[16];
7620 int immtype = -1, matches;
7621 unsigned int invmask = inverse ? 0xff : 0;
7622 int eshift, emvn;
7624 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7626 if (! (aarch64_simd_imm_zero_p (op, mode)
7627 || aarch64_vect_float_const_representable_p (op)))
7628 return false;
7630 if (info)
7632 info->value = CONST_VECTOR_ELT (op, 0);
7633 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7634 info->mvn = false;
7635 info->shift = 0;
7638 return true;
7641 /* Splat vector constant out into a byte vector. */
7642 for (i = 0; i < n_elts; i++)
7644 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7645 it must be laid out in the vector register in reverse order. */
7646 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7647 unsigned HOST_WIDE_INT elpart;
7648 unsigned int part, parts;
7650 if (GET_CODE (el) == CONST_INT)
7652 elpart = INTVAL (el);
7653 parts = 1;
7655 else if (GET_CODE (el) == CONST_DOUBLE)
7657 elpart = CONST_DOUBLE_LOW (el);
7658 parts = 2;
7660 else
7661 gcc_unreachable ();
7663 for (part = 0; part < parts; part++)
7665 unsigned int byte;
7666 for (byte = 0; byte < innersize; byte++)
7668 bytes[idx++] = (elpart & 0xff) ^ invmask;
7669 elpart >>= BITS_PER_UNIT;
7671 if (GET_CODE (el) == CONST_DOUBLE)
7672 elpart = CONST_DOUBLE_HIGH (el);
7676 /* Sanity check. */
7677 gcc_assert (idx == GET_MODE_SIZE (mode));
7681 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7682 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7684 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7685 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7687 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7688 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7690 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7691 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7693 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7695 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7697 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7698 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7700 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7701 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7703 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7704 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7706 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7707 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7709 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7711 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7713 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7714 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7716 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7717 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7719 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7720 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7722 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7723 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7725 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7727 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7728 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7730 while (0);
7732 if (immtype == -1)
7733 return false;
7735 if (info)
7737 info->element_width = elsize;
7738 info->mvn = emvn != 0;
7739 info->shift = eshift;
7741 unsigned HOST_WIDE_INT imm = 0;
7743 if (immtype >= 12 && immtype <= 15)
7744 info->msl = true;
7746 /* Un-invert bytes of recognized vector, if necessary. */
7747 if (invmask != 0)
7748 for (i = 0; i < idx; i++)
7749 bytes[i] ^= invmask;
7751 if (immtype == 17)
7753 /* FIXME: Broken on 32-bit H_W_I hosts. */
7754 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7756 for (i = 0; i < 8; i++)
7757 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7758 << (i * BITS_PER_UNIT);
7761 info->value = GEN_INT (imm);
7763 else
7765 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7766 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7768 /* Construct 'abcdefgh' because the assembler cannot handle
7769 generic constants. */
7770 if (info->mvn)
7771 imm = ~imm;
7772 imm = (imm >> info->shift) & 0xff;
7773 info->value = GEN_INT (imm);
7777 return true;
7778 #undef CHECK
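/* Two concrete examples of the matching above (a sketch; the CHECK table
   is the authority):

     V4SImode { 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 }
       matches immtype 2: element_width 32, shift 16, value 0xff,
       roughly "movi v0.4s, #0xff, lsl #16".

     V4SImode { 0xffffff00, 0xffffff00, 0xffffff00, 0xffffff00 }
       matches immtype 6, an inverted form, so mvn is set and it is
       emitted roughly as "mvni v0.4s, #0xff".

   A replicated 0x00123456 matches none of the patterns and is rejected.  */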
7781 static bool
7782 aarch64_const_vec_all_same_int_p (rtx x,
7783 HOST_WIDE_INT minval,
7784 HOST_WIDE_INT maxval)
7786 HOST_WIDE_INT firstval;
7787 int count, i;
7789 if (GET_CODE (x) != CONST_VECTOR
7790 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
7791 return false;
7793 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
7794 if (firstval < minval || firstval > maxval)
7795 return false;
7797 count = CONST_VECTOR_NUNITS (x);
7798 for (i = 1; i < count; i++)
7799 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
7800 return false;
7802 return true;
7805 /* Check that immediate shift constants are within range. */
7806 bool
7807 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7809 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7810 if (left)
7811 return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
7812 else
7813 return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
7816 /* Return true if X is a uniform vector where all elements
7817 are either the floating-point constant 0.0 or the
7818 integer constant 0. */
7819 bool
7820 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7822 return x == CONST0_RTX (mode);
7825 bool
7826 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7828 HOST_WIDE_INT imm = INTVAL (x);
7829 int i;
7831 for (i = 0; i < 8; i++)
7833 unsigned int byte = imm & 0xff;
7834 if (byte != 0xff && byte != 0)
7835 return false;
7836 imm >>= 8;
7839 return true;
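/* E.g. 0xff00ffff0000ff00 is accepted (every byte is 0x00 or 0xff, the
   64-bit MOVI "byte mask" immediate form), while 0x0123456789abcdef is
   rejected.  A sketch of the loop above only.  */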
7842 bool
7843 aarch64_mov_operand_p (rtx x,
7844 enum aarch64_symbol_context context,
7845 enum machine_mode mode)
7847 if (GET_CODE (x) == HIGH
7848 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7849 return true;
7851 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7852 return true;
7854 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7855 return true;
7857 return aarch64_classify_symbolic_expression (x, context)
7858 == SYMBOL_TINY_ABSOLUTE;
7861 /* Return a const_int vector of VAL. */
7862 rtx
7863 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7865 int nunits = GET_MODE_NUNITS (mode);
7866 rtvec v = rtvec_alloc (nunits);
7867 int i;
7869 for (i = 0; i < nunits; i++)
7870 RTVEC_ELT (v, i) = GEN_INT (val);
7872 return gen_rtx_CONST_VECTOR (mode, v);
7875 /* Check OP is a legal scalar immediate for the MOVI instruction. */
7877 bool
7878 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7880 enum machine_mode vmode;
7882 gcc_assert (!VECTOR_MODE_P (mode));
7883 vmode = aarch64_preferred_simd_mode (mode);
7884 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7885 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7888 /* Construct and return a PARALLEL RTX vector. */
7889 rtx
7890 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7892 int nunits = GET_MODE_NUNITS (mode);
7893 rtvec v = rtvec_alloc (nunits / 2);
7894 int base = high ? nunits / 2 : 0;
7895 rtx t1;
7896 int i;
7898 for (i = 0; i < nunits / 2; i++)
7899 RTVEC_ELT (v, i) = GEN_INT (base + i);
7901 t1 = gen_rtx_PARALLEL (mode, v);
7902 return t1;
7905 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
7906 HIGH (exclusive). */
7907 void
7908 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7910 HOST_WIDE_INT lane;
7911 gcc_assert (GET_CODE (operand) == CONST_INT);
7912 lane = INTVAL (operand);
7914 if (lane < low || lane >= high)
7915 error ("lane out of range");
7918 void
7919 aarch64_simd_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7921 gcc_assert (GET_CODE (operand) == CONST_INT);
7922 HOST_WIDE_INT lane = INTVAL (operand);
7924 if (lane < low || lane >= high)
7925 error ("constant out of range");
7928 /* Emit code to reinterpret one AdvSIMD type as another,
7929 without altering bits. */
7930 void
7931 aarch64_simd_reinterpret (rtx dest, rtx src)
7933 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), src));
7936 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
7937 registers). */
7938 void
7939 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
7940 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
7941 rtx op1)
7943 rtx mem = gen_rtx_MEM (mode, destaddr);
7944 rtx tmp1 = gen_reg_rtx (mode);
7945 rtx tmp2 = gen_reg_rtx (mode);
7947 emit_insn (intfn (tmp1, op1, tmp2));
7949 emit_move_insn (mem, tmp1);
7950 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
7951 emit_move_insn (mem, tmp2);
7954 /* Return TRUE if OP is a valid vector addressing mode. */
7955 bool
7956 aarch64_simd_mem_operand_p (rtx op)
7958 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
7959 || GET_CODE (XEXP (op, 0)) == REG);
7962 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
7963 not to early-clobber SRC registers in the process.
7965 We assume that the operands described by SRC and DEST represent a
7966 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
7967 number of components into which the copy has been decomposed. */
7968 void
7969 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
7970 rtx *src, unsigned int count)
7972 unsigned int i;
7974 if (!reg_overlap_mentioned_p (operands[0], operands[1])
7975 || REGNO (operands[0]) < REGNO (operands[1]))
7977 for (i = 0; i < count; i++)
7979 operands[2 * i] = dest[i];
7980 operands[2 * i + 1] = src[i];
7983 else
7985 for (i = 0; i < count; i++)
7987 operands[2 * i] = dest[count - i - 1];
7988 operands[2 * i + 1] = src[count - i - 1];
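/* For instance, when a CImode value living in q1-q3 is copied to q2-q4 the
   destination overlaps the source and has the higher register numbers, so
   the component moves are emitted in reverse (q4 from q3, q3 from q2,
   q2 from q1); copying q2-q4 to q1-q3 keeps the forward order.  A sketch
   only; the operand arrays are prepared by the callers in the machine
   description.  */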
7993 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
7994 one of VSTRUCT modes: OI, CI or XI. */
7995 int
7996 aarch64_simd_attr_length_move (rtx insn)
7998 enum machine_mode mode;
8000 extract_insn_cached (insn);
8002 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8004 mode = GET_MODE (recog_data.operand[0]);
8005 switch (mode)
8007 case OImode:
8008 return 8;
8009 case CImode:
8010 return 12;
8011 case XImode:
8012 return 16;
8013 default:
8014 gcc_unreachable ();
8017 return 4;
8020 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8021 alignment of a vector to 128 bits. */
8022 static HOST_WIDE_INT
8023 aarch64_simd_vector_alignment (const_tree type)
8025 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8026 return MIN (align, 128);
8029 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8030 static bool
8031 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8033 if (is_packed)
8034 return false;
8036 /* We guarantee alignment for vectors up to 128-bits. */
8037 if (tree_int_cst_compare (TYPE_SIZE (type),
8038 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8039 return false;
8041 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8042 return true;
8045 /* If VALS is a vector constant that can be loaded into a register
8046 using DUP, generate instructions to do so and return an RTX to
8047 assign to the register. Otherwise return NULL_RTX. */
8048 static rtx
8049 aarch64_simd_dup_constant (rtx vals)
8051 enum machine_mode mode = GET_MODE (vals);
8052 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8053 int n_elts = GET_MODE_NUNITS (mode);
8054 bool all_same = true;
8055 rtx x;
8056 int i;
8058 if (GET_CODE (vals) != CONST_VECTOR)
8059 return NULL_RTX;
8061 for (i = 1; i < n_elts; ++i)
8063 x = CONST_VECTOR_ELT (vals, i);
8064 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8065 all_same = false;
8068 if (!all_same)
8069 return NULL_RTX;
8071 /* We can load this constant by using DUP and a constant in a
8072 single ARM register. This will be cheaper than a vector
8073 load. */
8074 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8075 return gen_rtx_VEC_DUPLICATE (mode, x);
8079 /* Generate code to load VALS, which is a PARALLEL containing only
8080 constants (for vec_init) or CONST_VECTOR, efficiently into a
8081 register. Returns an RTX to copy into the register, or NULL_RTX
8082 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8083 static rtx
8084 aarch64_simd_make_constant (rtx vals)
8086 enum machine_mode mode = GET_MODE (vals);
8087 rtx const_dup;
8088 rtx const_vec = NULL_RTX;
8089 int n_elts = GET_MODE_NUNITS (mode);
8090 int n_const = 0;
8091 int i;
8093 if (GET_CODE (vals) == CONST_VECTOR)
8094 const_vec = vals;
8095 else if (GET_CODE (vals) == PARALLEL)
8097 /* A CONST_VECTOR must contain only CONST_INTs and
8098 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8099 Only store valid constants in a CONST_VECTOR. */
8100 for (i = 0; i < n_elts; ++i)
8102 rtx x = XVECEXP (vals, 0, i);
8103 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8104 n_const++;
8106 if (n_const == n_elts)
8107 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8109 else
8110 gcc_unreachable ();
8112 if (const_vec != NULL_RTX
8113 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8114 /* Load using MOVI/MVNI. */
8115 return const_vec;
8116 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8117 /* Loaded using DUP. */
8118 return const_dup;
8119 else if (const_vec != NULL_RTX)
8120 /* Load from constant pool. We can not take advantage of single-cycle
8121 LD1 because we need a PC-relative addressing mode. */
8122 return const_vec;
8123 else
8124 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8125 We can not construct an initializer. */
8126 return NULL_RTX;
8129 void
8130 aarch64_expand_vector_init (rtx target, rtx vals)
8132 enum machine_mode mode = GET_MODE (target);
8133 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8134 int n_elts = GET_MODE_NUNITS (mode);
8135 int n_var = 0, one_var = -1;
8136 bool all_same = true;
8137 rtx x, mem;
8138 int i;
8140 x = XVECEXP (vals, 0, 0);
8141 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8142 n_var = 1, one_var = 0;
8144 for (i = 1; i < n_elts; ++i)
8146 x = XVECEXP (vals, 0, i);
8147 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8148 ++n_var, one_var = i;
8150 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8151 all_same = false;
8154 if (n_var == 0)
8156 rtx constant = aarch64_simd_make_constant (vals);
8157 if (constant != NULL_RTX)
8159 emit_move_insn (target, constant);
8160 return;
8164 /* Splat a single non-constant element if we can. */
8165 if (all_same)
8167 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8168 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8169 return;
8172 /* One field is non-constant. Load constant then overwrite varying
8173 field. This is more efficient than using the stack. */
8174 if (n_var == 1)
8176 rtx copy = copy_rtx (vals);
8177 rtx index = GEN_INT (one_var);
8178 enum insn_code icode;
8180 /* Load constant part of vector, substitute neighboring value for
8181 varying element. */
8182 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8183 aarch64_expand_vector_init (target, copy);
8185 /* Insert variable. */
8186 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8187 icode = optab_handler (vec_set_optab, mode);
8188 gcc_assert (icode != CODE_FOR_nothing);
8189 emit_insn (GEN_FCN (icode) (target, x, index));
8190 return;
8193 /* Construct the vector in memory one field at a time
8194 and load the whole vector. */
8195 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8196 for (i = 0; i < n_elts; i++)
8197 emit_move_insn (adjust_address_nv (mem, inner_mode,
8198 i * GET_MODE_SIZE (inner_mode)),
8199 XVECEXP (vals, 0, i));
8200 emit_move_insn (target, mem);
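/* Roughly, for user code such as the following hypothetical initializers:

     int32x4_t a = { 7, 7, 7, 7 };     all constant: movi/dup/literal pool
     int32x4_t b = { x, x, x, x };     all the same variable: dup from a GPR
     int32x4_t c = { 1, 2, 3, x };     one variable: load the constant part,
                                       then insert the varying lane
     int32x4_t d = { w, x, y, z };     general case: built in a stack
                                       temporary and loaded back  */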
8204 static unsigned HOST_WIDE_INT
8205 aarch64_shift_truncation_mask (enum machine_mode mode)
8207 return
8208 (aarch64_vector_mode_supported_p (mode)
8209 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8212 #ifndef TLS_SECTION_ASM_FLAG
8213 #define TLS_SECTION_ASM_FLAG 'T'
8214 #endif
8216 void
8217 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8218 tree decl ATTRIBUTE_UNUSED)
8220 char flagchars[10], *f = flagchars;
8222 /* If we have already declared this section, we can use an
8223 abbreviated form to switch back to it -- unless this section is
8224 part of a COMDAT group, in which case GAS requires the full
8225 declaration every time. */
8226 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8227 && (flags & SECTION_DECLARED))
8229 fprintf (asm_out_file, "\t.section\t%s\n", name);
8230 return;
8233 if (!(flags & SECTION_DEBUG))
8234 *f++ = 'a';
8235 if (flags & SECTION_WRITE)
8236 *f++ = 'w';
8237 if (flags & SECTION_CODE)
8238 *f++ = 'x';
8239 if (flags & SECTION_SMALL)
8240 *f++ = 's';
8241 if (flags & SECTION_MERGE)
8242 *f++ = 'M';
8243 if (flags & SECTION_STRINGS)
8244 *f++ = 'S';
8245 if (flags & SECTION_TLS)
8246 *f++ = TLS_SECTION_ASM_FLAG;
8247 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8248 *f++ = 'G';
8249 *f = '\0';
8251 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8253 if (!(flags & SECTION_NOTYPE))
8255 const char *type;
8256 const char *format;
8258 if (flags & SECTION_BSS)
8259 type = "nobits";
8260 else
8261 type = "progbits";
8263 #ifdef TYPE_OPERAND_FMT
8264 format = "," TYPE_OPERAND_FMT;
8265 #else
8266 format = ",@%s";
8267 #endif
8269 fprintf (asm_out_file, format, type);
8271 if (flags & SECTION_ENTSIZE)
8272 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8273 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8275 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8276 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8277 else
8278 fprintf (asm_out_file, ",%s,comdat",
8279 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8283 putc ('\n', asm_out_file);
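/* Illustrative output only (the exact flags depend on the SECTION_* bits
   and on TYPE_OPERAND_FMT): a mergeable string section may be emitted as

     .section .rodata.str1.1,"aMS",@progbits,1

   and a COMDAT text section as

     .section .text._ZN1A3fooEv,"axG",@progbits,_ZN1A3fooEv,comdat  */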
8286 /* Select a format to encode pointers in exception handling data. */
8287 int
8288 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8290 int type;
8291 switch (aarch64_cmodel)
8293 case AARCH64_CMODEL_TINY:
8294 case AARCH64_CMODEL_TINY_PIC:
8295 case AARCH64_CMODEL_SMALL:
8296 case AARCH64_CMODEL_SMALL_PIC:
8297 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8298 for everything. */
8299 type = DW_EH_PE_sdata4;
8300 break;
8301 default:
8302 /* No assumptions here. 8-byte relocs required. */
8303 type = DW_EH_PE_sdata8;
8304 break;
8306 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8309 /* Emit load exclusive. */
8311 static void
8312 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8313 rtx mem, rtx model_rtx)
8315 rtx (*gen) (rtx, rtx, rtx);
8317 switch (mode)
8319 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8320 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8321 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8322 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8323 default:
8324 gcc_unreachable ();
8327 emit_insn (gen (rval, mem, model_rtx));
8330 /* Emit store exclusive. */
8332 static void
8333 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8334 rtx rval, rtx mem, rtx model_rtx)
8336 rtx (*gen) (rtx, rtx, rtx, rtx);
8338 switch (mode)
8340 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8341 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8342 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8343 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8344 default:
8345 gcc_unreachable ();
8348 emit_insn (gen (bval, rval, mem, model_rtx));
8351 /* Mark the previous jump instruction as unlikely. */
8353 static void
8354 aarch64_emit_unlikely_jump (rtx insn)
8356 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8358 insn = emit_jump_insn (insn);
8359 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8362 /* Expand a compare and swap pattern. */
8364 void
8365 aarch64_expand_compare_and_swap (rtx operands[])
8367 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8368 enum machine_mode mode, cmp_mode;
8369 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8371 bval = operands[0];
8372 rval = operands[1];
8373 mem = operands[2];
8374 oldval = operands[3];
8375 newval = operands[4];
8376 is_weak = operands[5];
8377 mod_s = operands[6];
8378 mod_f = operands[7];
8379 mode = GET_MODE (mem);
8380 cmp_mode = mode;
8382 /* Normally the succ memory model must be stronger than fail, but in the
8383 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8384 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8386 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8387 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8388 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8390 switch (mode)
8392 case QImode:
8393 case HImode:
8394 /* For short modes, we're going to perform the comparison in SImode,
8395 so do the zero-extension now. */
8396 cmp_mode = SImode;
8397 rval = gen_reg_rtx (SImode);
8398 oldval = convert_modes (SImode, mode, oldval, true);
8399 /* Fall through. */
8401 case SImode:
8402 case DImode:
8403 /* Force the value into a register if needed. */
8404 if (!aarch64_plus_operand (oldval, mode))
8405 oldval = force_reg (cmp_mode, oldval);
8406 break;
8408 default:
8409 gcc_unreachable ();
8412 switch (mode)
8414 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8415 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8416 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8417 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8418 default:
8419 gcc_unreachable ();
8422 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8424 if (mode == QImode || mode == HImode)
8425 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8427 x = gen_rtx_REG (CCmode, CC_REGNUM);
8428 x = gen_rtx_EQ (SImode, x, const0_rtx);
8429 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
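/* Illustrative source-level view (an assumption about typical usage, not
   something defined in this file): a call such as

       int expected = old;
       bool ok = __atomic_compare_exchange_n (&v, &expected, new_val,
                                              false,            // weak
                                              __ATOMIC_SEQ_CST,
                                              __ATOMIC_SEQ_CST);

   reaches this expander with IS_WEAK == const0_rtx, so the companion split
   below wraps the exclusive pair in a retry loop; a nonzero weak argument
   instead permits a single attempt that may fail spuriously.  */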
8432 /* Split a compare and swap pattern. */
8434 void
8435 aarch64_split_compare_and_swap (rtx operands[])
8437 rtx rval, mem, oldval, newval, scratch;
8438 enum machine_mode mode;
8439 bool is_weak;
8440 rtx label1, label2, x, cond;
8442 rval = operands[0];
8443 mem = operands[1];
8444 oldval = operands[2];
8445 newval = operands[3];
8446 is_weak = (operands[4] != const0_rtx);
8447 scratch = operands[7];
8448 mode = GET_MODE (mem);
8450 label1 = NULL_RTX;
8451 if (!is_weak)
8453 label1 = gen_label_rtx ();
8454 emit_label (label1);
8456 label2 = gen_label_rtx ();
8458 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8460 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8461 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8462 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8463 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8464 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8466 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8468 if (!is_weak)
8470 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8471 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8472 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8473 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8475 else
8477 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8478 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8479 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8482 emit_label (label2);
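/* A rough sketch (not verbatim compiler output, register names invented,
   and ignoring the acquire/release variants chosen by the memory model) of
   the strong SImode sequence this split produces:

       .Lretry:
           ldxr  w0, [x1]        // load-exclusive RVAL
           cmp   w0, w2          // compare against OLDVAL
           b.ne  .Lfail          // unlikely: values differ, give up
           stxr  w3, w4, [x1]    // try to store NEWVAL; status in SCRATCH
           cbnz  w3, .Lretry     // store-exclusive lost the reservation
       .Lfail:

   The weak form drops the retry loop and instead leaves the
   store-exclusive status in the condition flags.  */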
8485 /* Split an atomic operation. */
8487 void
8488 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8489 rtx value, rtx model_rtx, rtx cond)
8491 enum machine_mode mode = GET_MODE (mem);
8492 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8493 rtx label, x;
8495 label = gen_label_rtx ();
8496 emit_label (label);
8498 if (new_out)
8499 new_out = gen_lowpart (wmode, new_out);
8500 if (old_out)
8501 old_out = gen_lowpart (wmode, old_out);
8502 else
8503 old_out = new_out;
8504 value = simplify_gen_subreg (wmode, value, mode, 0);
8506 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8508 switch (code)
8510 case SET:
8511 new_out = value;
8512 break;
8514 case NOT:
8515 x = gen_rtx_AND (wmode, old_out, value);
8516 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8517 x = gen_rtx_NOT (wmode, new_out);
8518 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8519 break;
8521 case MINUS:
8522 if (CONST_INT_P (value))
8524 value = GEN_INT (-INTVAL (value));
8525 code = PLUS;
8527 /* Fall through. */
8529 default:
8530 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8531 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8532 break;
8535 aarch64_emit_store_exclusive (mode, cond, mem,
8536 gen_lowpart (mode, new_out), model_rtx);
8538 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8539 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8540 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8541 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
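/* Illustrative shape (hedged, register names invented) of the loop this
   split produces for a 32-bit __atomic_fetch_add with a relaxed model:

       .Lretry:
           ldxr  w0, [x2]        // OLD_OUT
           add   w1, w0, w3      // NEW_OUT = OLD_OUT + VALUE
           stxr  w4, w1, [x2]    // COND = store-exclusive status
           cbnz  w4, .Lretry

   Note that the NOT case above computes ~(OLD_OUT & VALUE), matching the
   semantics of the __atomic_fetch_nand family.  */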
8544 static void
8545 aarch64_print_extension (void)
8547 const struct aarch64_option_extension *opt = NULL;
8549 for (opt = all_extensions; opt->name != NULL; opt++)
8550 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8551 asm_fprintf (asm_out_file, "+%s", opt->name);
8553 asm_fprintf (asm_out_file, "\n");
8556 static void
8557 aarch64_start_file (void)
8559 if (selected_arch)
8561 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8562 aarch64_print_extension ();
8564 else if (selected_cpu)
8566 const char *truncated_name
8567 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8568 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8569 aarch64_print_extension ();
8571 default_file_start();
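/* Example (illustrative, exact spelling depends on the selected CPU and
   enabled extensions): compiling with -mcpu=cortex-a57+crypto typically
   starts the assembly file with something like

       .cpu cortex-a57+fp+simd+crypto

   where the "+ext" suffixes come from aarch64_print_extension and reflect
   whichever extensions are set in aarch64_isa_flags.  */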
8574 /* Target hook for c_mode_for_suffix. */
8575 static enum machine_mode
8576 aarch64_c_mode_for_suffix (char suffix)
8578 if (suffix == 'q')
8579 return TFmode;
8581 return VOIDmode;
8584 /* We can only represent floating point constants which will fit in
8585 "quarter-precision" values. These values are characterised by
8586 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
8589 (-1)^s * (n/16) * 2^r
8591 Where:
8592 's' is the sign bit.
8593 'n' is an integer in the range 16 <= n <= 31.
8594 'r' is an integer in the range -3 <= r <= 4. */
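/* Worked examples (derived from the formula above): 1.0 = (16/16) * 2^0,
   0.5 = (16/16) * 2^-1, 1.0625 = (17/16) * 2^0 and 31.0 = (31/16) * 2^4
   are all representable, so the representable magnitudes span 0.125 to
   31.0; 0.0 and values needing more mantissa bits (e.g. 1.03125 = 33/32)
   are not, and must be materialised some other way.  */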
8596 /* Return true iff X can be represented by a quarter-precision
8597 floating point immediate operand. Note, we cannot represent 0.0. */
8598 bool
8599 aarch64_float_const_representable_p (rtx x)
8601 /* This represents our current view of how many bits
8602 make up the mantissa. */
8603 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8604 int exponent;
8605 unsigned HOST_WIDE_INT mantissa, mask;
8606 HOST_WIDE_INT m1, m2;
8607 REAL_VALUE_TYPE r, m;
8609 if (!CONST_DOUBLE_P (x))
8610 return false;
8612 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8614 /* We cannot represent infinities, NaNs or +/-zero. We won't
8615 know if we have +zero until we analyse the mantissa, but we
8616 can reject the other invalid values. */
8617 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8618 || REAL_VALUE_MINUS_ZERO (r))
8619 return false;
8621 /* Extract exponent. */
8622 r = real_value_abs (&r);
8623 exponent = REAL_EXP (&r);
8625 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8626 highest (sign) bit, with a fixed binary point at bit point_pos.
8627 m1 holds the low part of the mantissa, m2 the high part.
8628 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8629 bits for the mantissa, this can fail (low bits will be lost). */
8630 real_ldexp (&m, &r, point_pos - exponent);
8631 REAL_VALUE_TO_INT (&m1, &m2, m);
8633 /* If the low part of the mantissa has bits set we cannot represent
8634 the value. */
8635 if (m1 != 0)
8636 return false;
8637 /* We have rejected the lower HOST_WIDE_INT, so update our
8638 understanding of how many bits lie in the mantissa and
8639 look only at the high HOST_WIDE_INT. */
8640 mantissa = m2;
8641 point_pos -= HOST_BITS_PER_WIDE_INT;
8643 /* We can only represent values with a mantissa of the form 1.xxxx. */
8644 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8645 if ((mantissa & mask) != 0)
8646 return false;
8648 /* Having filtered unrepresentable values, we may now remove all
8649 but the highest 5 bits. */
8650 mantissa >>= point_pos - 5;
8652 /* We cannot represent the value 0.0, so reject it. This is handled
8653 elsewhere. */
8654 if (mantissa == 0)
8655 return false;
8657 /* Then, as bit 4 is always set, we can mask it off, leaving
8658 the mantissa in the range [0, 15]. */
8659 mantissa &= ~(1 << 4);
8660 gcc_assert (mantissa <= 15);
8662 /* GCC internally does not use IEEE754-like encoding (where normalized
8663 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
8664 Our mantissa values are shifted 4 places to the left relative to
8665 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8666 by 5 places to correct for GCC's representation. */
8667 exponent = 5 - exponent;
8669 return (exponent >= 0 && exponent <= 7);
8672 char*
8673 aarch64_output_simd_mov_immediate (rtx const_vector,
8674 enum machine_mode mode,
8675 unsigned width)
8677 bool is_valid;
8678 static char templ[40];
8679 const char *mnemonic;
8680 const char *shift_op;
8681 unsigned int lane_count = 0;
8682 char element_char;
8684 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8686 /* This will return true to show const_vector is legal for use as either
8687 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
8688 also update INFO to show how the immediate should be generated. */
8689 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8690 gcc_assert (is_valid);
8692 element_char = sizetochar (info.element_width);
8693 lane_count = width / info.element_width;
8695 mode = GET_MODE_INNER (mode);
8696 if (mode == SFmode || mode == DFmode)
8698 gcc_assert (info.shift == 0 && ! info.mvn);
8699 if (aarch64_float_const_zero_rtx_p (info.value))
8700 info.value = GEN_INT (0);
8701 else
8703 #define buf_size 20
8704 REAL_VALUE_TYPE r;
8705 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8706 char float_buf[buf_size] = {'\0'};
8707 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8708 #undef buf_size
8710 if (lane_count == 1)
8711 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8712 else
8713 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8714 lane_count, element_char, float_buf);
8715 return templ;
8719 mnemonic = info.mvn ? "mvni" : "movi";
8720 shift_op = info.msl ? "msl" : "lsl";
8722 if (lane_count == 1)
8723 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8724 mnemonic, UINTVAL (info.value));
8725 else if (info.shift)
8726 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8727 ", %s %d", mnemonic, lane_count, element_char,
8728 UINTVAL (info.value), shift_op, info.shift);
8729 else
8730 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8731 mnemonic, lane_count, element_char, UINTVAL (info.value));
8732 return templ;
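/* Example outputs (illustrative; the exact operand text comes from the
   templates above): a V4SImode vector with every lane equal to 1 yields
   "movi v0.4s, 0x1", every lane equal to 0x00010000 yields
   "movi v0.4s, 0x1, lsl 16", and every lane equal to 0xffffff00 yields
   "mvni v0.4s, 0xff".  */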
8735 char*
8736 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8737 enum machine_mode mode)
8739 enum machine_mode vmode;
8741 gcc_assert (!VECTOR_MODE_P (mode));
8742 vmode = aarch64_simd_container_mode (mode, 64);
8743 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8744 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8747 /* Split operands into moves from op[1] + op[2] into op[0]. */
8749 void
8750 aarch64_split_combinev16qi (rtx operands[3])
8752 unsigned int dest = REGNO (operands[0]);
8753 unsigned int src1 = REGNO (operands[1]);
8754 unsigned int src2 = REGNO (operands[2]);
8755 enum machine_mode halfmode = GET_MODE (operands[1]);
8756 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8757 rtx destlo, desthi;
8759 gcc_assert (halfmode == V16QImode);
8761 if (src1 == dest && src2 == dest + halfregs)
8763 /* No-op move. Can't split to nothing; emit something. */
8764 emit_note (NOTE_INSN_DELETED);
8765 return;
8768 /* Preserve register attributes for variable tracking. */
8769 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8770 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8771 GET_MODE_SIZE (halfmode));
8773 /* Special case of reversed high/low parts. */
8774 if (reg_overlap_mentioned_p (operands[2], destlo)
8775 && reg_overlap_mentioned_p (operands[1], desthi))
8777 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8778 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8779 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8781 else if (!reg_overlap_mentioned_p (operands[2], destlo))
8783 /* Try to avoid unnecessary moves if part of the result
8784 is in the right place already. */
8785 if (src1 != dest)
8786 emit_move_insn (destlo, operands[1]);
8787 if (src2 != dest + halfregs)
8788 emit_move_insn (desthi, operands[2]);
8790 else
8792 if (src2 != dest + halfregs)
8793 emit_move_insn (desthi, operands[2]);
8794 if (src1 != dest)
8795 emit_move_insn (destlo, operands[1]);
8799 /* vec_perm support. */
8801 #define MAX_VECT_LEN 16
8803 struct expand_vec_perm_d
8805 rtx target, op0, op1;
8806 unsigned char perm[MAX_VECT_LEN];
8807 enum machine_mode vmode;
8808 unsigned char nelt;
8809 bool one_vector_p;
8810 bool testing_p;
8813 /* Generate a variable permutation. */
8815 static void
8816 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8818 enum machine_mode vmode = GET_MODE (target);
8819 bool one_vector_p = rtx_equal_p (op0, op1);
8821 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8822 gcc_checking_assert (GET_MODE (op0) == vmode);
8823 gcc_checking_assert (GET_MODE (op1) == vmode);
8824 gcc_checking_assert (GET_MODE (sel) == vmode);
8825 gcc_checking_assert (TARGET_SIMD);
8827 if (one_vector_p)
8829 if (vmode == V8QImode)
8831 /* Expand the argument to a V16QI mode by duplicating it. */
8832 rtx pair = gen_reg_rtx (V16QImode);
8833 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8834 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8836 else
8838 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8841 else
8843 rtx pair;
8845 if (vmode == V8QImode)
8847 pair = gen_reg_rtx (V16QImode);
8848 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8849 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8851 else
8853 pair = gen_reg_rtx (OImode);
8854 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8855 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8860 void
8861 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8863 enum machine_mode vmode = GET_MODE (target);
8864 unsigned int nelt = GET_MODE_NUNITS (vmode);
8865 bool one_vector_p = rtx_equal_p (op0, op1);
8866 rtx mask;
8868 /* The TBL instruction does not use a modulo index, so we must take care
8869 of that ourselves. */
8870 mask = aarch64_simd_gen_const_vector_dup (vmode,
8871 one_vector_p ? nelt - 1 : 2 * nelt - 1);
8872 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8874 /* For big-endian, we also need to reverse the index within the vector
8875 (but not which vector). */
8876 if (BYTES_BIG_ENDIAN)
8878 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
8879 if (!one_vector_p)
8880 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
8881 sel = expand_simple_binop (vmode, XOR, sel, mask,
8882 NULL, 0, OPTAB_LIB_WIDEN);
8884 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
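/* Example (derived from the masking above): for a two-vector V8QImode
   permute the mask is 15, so a selector element holding 17 is reduced to 1
   and picks element 1 of op0.  This gives the modulo semantics vec_perm
   requires; a raw TBL would instead write zero for any out-of-range
   index.  */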
8887 /* Recognize patterns suitable for the TRN instructions. */
8888 static bool
8889 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8891 unsigned int i, odd, mask, nelt = d->nelt;
8892 rtx out, in0, in1, x;
8893 rtx (*gen) (rtx, rtx, rtx);
8894 enum machine_mode vmode = d->vmode;
8896 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8897 return false;
8899 /* Note that these are little-endian tests.
8900 We correct for big-endian later. */
8901 if (d->perm[0] == 0)
8902 odd = 0;
8903 else if (d->perm[0] == 1)
8904 odd = 1;
8905 else
8906 return false;
8907 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8909 for (i = 0; i < nelt; i += 2)
8911 if (d->perm[i] != i + odd)
8912 return false;
8913 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
8914 return false;
8917 /* Success! */
8918 if (d->testing_p)
8919 return true;
8921 in0 = d->op0;
8922 in1 = d->op1;
8923 if (BYTES_BIG_ENDIAN)
8925 x = in0, in0 = in1, in1 = x;
8926 odd = !odd;
8928 out = d->target;
8930 if (odd)
8932 switch (vmode)
8934 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
8935 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
8936 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
8937 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
8938 case V4SImode: gen = gen_aarch64_trn2v4si; break;
8939 case V2SImode: gen = gen_aarch64_trn2v2si; break;
8940 case V2DImode: gen = gen_aarch64_trn2v2di; break;
8941 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
8942 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
8943 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
8944 default:
8945 return false;
8948 else
8950 switch (vmode)
8952 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
8953 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
8954 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
8955 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
8956 case V4SImode: gen = gen_aarch64_trn1v4si; break;
8957 case V2SImode: gen = gen_aarch64_trn1v2si; break;
8958 case V2DImode: gen = gen_aarch64_trn1v2di; break;
8959 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
8960 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
8961 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
8962 default:
8963 return false;
8967 emit_insn (gen (out, in0, in1));
8968 return true;
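/* Example of what the checks above accept: for V4SImode with two input
   vectors, perm = {0, 4, 2, 6} selects TRN1 and perm = {1, 5, 3, 7}
   selects TRN2 (operands and odd/even are swapped for big-endian).  */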
8971 /* Recognize patterns suitable for the UZP instructions. */
8972 static bool
8973 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
8975 unsigned int i, odd, mask, nelt = d->nelt;
8976 rtx out, in0, in1, x;
8977 rtx (*gen) (rtx, rtx, rtx);
8978 enum machine_mode vmode = d->vmode;
8980 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8981 return false;
8983 /* Note that these are little-endian tests.
8984 We correct for big-endian later. */
8985 if (d->perm[0] == 0)
8986 odd = 0;
8987 else if (d->perm[0] == 1)
8988 odd = 1;
8989 else
8990 return false;
8991 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8993 for (i = 0; i < nelt; i++)
8995 unsigned elt = (i * 2 + odd) & mask;
8996 if (d->perm[i] != elt)
8997 return false;
9000 /* Success! */
9001 if (d->testing_p)
9002 return true;
9004 in0 = d->op0;
9005 in1 = d->op1;
9006 if (BYTES_BIG_ENDIAN)
9008 x = in0, in0 = in1, in1 = x;
9009 odd = !odd;
9011 out = d->target;
9013 if (odd)
9015 switch (vmode)
9017 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9018 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9019 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9020 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9021 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9022 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9023 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9024 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9025 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9026 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9027 default:
9028 return false;
9031 else
9033 switch (vmode)
9035 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9036 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9037 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9038 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9039 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9040 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9041 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9042 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9043 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9044 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9045 default:
9046 return false;
9050 emit_insn (gen (out, in0, in1));
9051 return true;
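/* Example: for V4SImode with two input vectors, perm = {0, 2, 4, 6}
   selects UZP1 and perm = {1, 3, 5, 7} selects UZP2.  */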
9054 /* Recognize patterns suitable for the ZIP instructions. */
9055 static bool
9056 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9058 unsigned int i, high, mask, nelt = d->nelt;
9059 rtx out, in0, in1, x;
9060 rtx (*gen) (rtx, rtx, rtx);
9061 enum machine_mode vmode = d->vmode;
9063 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9064 return false;
9066 /* Note that these are little-endian tests.
9067 We correct for big-endian later. */
9068 high = nelt / 2;
9069 if (d->perm[0] == high)
9070 /* Do Nothing. */
9072 else if (d->perm[0] == 0)
9073 high = 0;
9074 else
9075 return false;
9076 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9078 for (i = 0; i < nelt / 2; i++)
9080 unsigned elt = (i + high) & mask;
9081 if (d->perm[i * 2] != elt)
9082 return false;
9083 elt = (elt + nelt) & mask;
9084 if (d->perm[i * 2 + 1] != elt)
9085 return false;
9088 /* Success! */
9089 if (d->testing_p)
9090 return true;
9092 in0 = d->op0;
9093 in1 = d->op1;
9094 if (BYTES_BIG_ENDIAN)
9096 x = in0, in0 = in1, in1 = x;
9097 high = !high;
9099 out = d->target;
9101 if (high)
9103 switch (vmode)
9105 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9106 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9107 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9108 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9109 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9110 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9111 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9112 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9113 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9114 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9115 default:
9116 return false;
9119 else
9121 switch (vmode)
9123 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9124 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9125 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9126 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9127 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9128 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9129 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9130 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9131 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9132 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9133 default:
9134 return false;
9138 emit_insn (gen (out, in0, in1));
9139 return true;
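/* Example: for V4SImode with two input vectors, perm = {0, 4, 1, 5}
   selects ZIP1 and perm = {2, 6, 3, 7} selects ZIP2.  */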
9142 /* Recognize patterns for the EXT insn. */
9144 static bool
9145 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9147 unsigned int i, nelt = d->nelt;
9148 rtx (*gen) (rtx, rtx, rtx, rtx);
9149 rtx offset;
9151 unsigned int location = d->perm[0]; /* Always < nelt. */
9153 /* Check if the extracted indices are increasing by one. */
9154 for (i = 1; i < nelt; i++)
9156 unsigned int required = location + i;
9157 if (d->one_vector_p)
9159 /* We'll pass the same vector in twice, so allow indices to wrap. */
9160 required &= (nelt - 1);
9162 if (d->perm[i] != required)
9163 return false;
9166 switch (d->vmode)
9168 case V16QImode: gen = gen_aarch64_extv16qi; break;
9169 case V8QImode: gen = gen_aarch64_extv8qi; break;
9170 case V4HImode: gen = gen_aarch64_extv4hi; break;
9171 case V8HImode: gen = gen_aarch64_extv8hi; break;
9172 case V2SImode: gen = gen_aarch64_extv2si; break;
9173 case V4SImode: gen = gen_aarch64_extv4si; break;
9174 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9175 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9176 case V2DImode: gen = gen_aarch64_extv2di; break;
9177 case V2DFmode: gen = gen_aarch64_extv2df; break;
9178 default:
9179 return false;
9182 /* Success! */
9183 if (d->testing_p)
9184 return true;
9186 /* The case where (location == 0) is a no-op for both big- and little-endian,
9187 and is removed by the mid-end at optimization levels -O1 and higher. */
9189 if (BYTES_BIG_ENDIAN && (location != 0))
9191 /* After setup, we want the high elements of the first vector (stored
9192 at the LSB end of the register), and the low elements of the second
9193 vector (stored at the MSB end of the register). So swap. */
9194 rtx temp = d->op0;
9195 d->op0 = d->op1;
9196 d->op1 = temp;
9197 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9198 location = nelt - location;
9201 offset = GEN_INT (location);
9202 emit_insn (gen (d->target, d->op0, d->op1, offset));
9203 return true;
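/* Example: a V4SImode permutation of {1, 2, 3, 4} across two vectors is
   matched here as an EXT with element offset 1 (on big-endian the operands
   are swapped and the offset becomes nelt - location).  */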
9206 /* Recognize patterns for the REV insns. */
9208 static bool
9209 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9211 unsigned int i, j, diff, nelt = d->nelt;
9212 rtx (*gen) (rtx, rtx);
9214 if (!d->one_vector_p)
9215 return false;
9217 diff = d->perm[0];
9218 switch (diff)
9220 case 7:
9221 switch (d->vmode)
9223 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9224 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9225 default:
9226 return false;
9228 break;
9229 case 3:
9230 switch (d->vmode)
9232 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9233 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9234 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9235 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9236 default:
9237 return false;
9239 break;
9240 case 1:
9241 switch (d->vmode)
9243 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9244 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9245 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9246 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9247 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9248 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9249 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9250 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9251 default:
9252 return false;
9254 break;
9255 default:
9256 return false;
9259 for (i = 0; i < nelt ; i += diff + 1)
9260 for (j = 0; j <= diff; j += 1)
9262 /* This is guaranteed to be true as the value of diff
9263 is 7, 3 or 1, and we should have enough elements in the
9264 queue to generate this. Getting a vector mask with a
9265 value of diff other than these values implies that
9266 something is wrong by the time we get here. */
9267 gcc_assert (i + j < nelt);
9268 if (d->perm[i + j] != i + diff - j)
9269 return false;
9272 /* Success! */
9273 if (d->testing_p)
9274 return true;
9276 emit_insn (gen (d->target, d->op0));
9277 return true;
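/* Examples of masks matched above for V8QImode: {3,2,1,0,7,6,5,4}
   (diff == 3) maps to REV32, and {1,0,3,2,5,4,7,6} (diff == 1) maps to
   REV16.  */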
9280 static bool
9281 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9283 rtx (*gen) (rtx, rtx, rtx);
9284 rtx out = d->target;
9285 rtx in0;
9286 enum machine_mode vmode = d->vmode;
9287 unsigned int i, elt, nelt = d->nelt;
9288 rtx lane;
9290 /* TODO: This may not be big-endian safe. */
9291 if (BYTES_BIG_ENDIAN)
9292 return false;
9294 elt = d->perm[0];
9295 for (i = 1; i < nelt; i++)
9297 if (elt != d->perm[i])
9298 return false;
9301 /* The generic preparation in aarch64_expand_vec_perm_const_1
9302 swaps the operand order and the permute indices if it finds
9303 d->perm[0] to be in the second operand. Thus, we can always
9304 use d->op0 and need not do any extra arithmetic to get the
9305 correct lane number. */
9306 in0 = d->op0;
9307 lane = GEN_INT (elt);
9309 switch (vmode)
9311 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9312 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9313 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9314 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9315 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9316 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9317 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9318 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9319 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9320 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9321 default:
9322 return false;
9325 emit_insn (gen (out, in0, lane));
9326 return true;
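/* Example: a permutation in which every index is 2 (e.g. {2, 2, 2, 2} for
   V4SImode) is matched here as a DUP of lane 2 of op0.  */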
9329 static bool
9330 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9332 rtx rperm[MAX_VECT_LEN], sel;
9333 enum machine_mode vmode = d->vmode;
9334 unsigned int i, nelt = d->nelt;
9336 if (d->testing_p)
9337 return true;
9339 /* Generic code will try constant permutation twice. Once with the
9340 original mode and again with the elements lowered to QImode.
9341 So wait and don't do the selector expansion ourselves. */
9342 if (vmode != V8QImode && vmode != V16QImode)
9343 return false;
9345 for (i = 0; i < nelt; ++i)
9347 int nunits = GET_MODE_NUNITS (vmode);
9349 /* If big-endian and two vectors we end up with a weird mixed-endian
9350 mode on NEON. Reverse the index within each word but not the word
9351 itself. */
9352 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9353 : d->perm[i]);
9355 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9356 sel = force_reg (vmode, sel);
9358 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9359 return true;
9362 static bool
9363 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9365 /* The pattern matching functions above are written to look for a small
9366 number to begin the sequence (0, 1, N/2). If we begin with an index
9367 from the second operand, we can swap the operands. */
9368 if (d->perm[0] >= d->nelt)
9370 unsigned i, nelt = d->nelt;
9371 rtx x;
9373 for (i = 0; i < nelt; ++i)
9374 d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
9376 x = d->op0;
9377 d->op0 = d->op1;
9378 d->op1 = x;
9381 if (TARGET_SIMD)
9383 if (aarch64_evpc_rev (d))
9384 return true;
9385 else if (aarch64_evpc_ext (d))
9386 return true;
9387 else if (aarch64_evpc_zip (d))
9388 return true;
9389 else if (aarch64_evpc_uzp (d))
9390 return true;
9391 else if (aarch64_evpc_trn (d))
9392 return true;
9393 else if (aarch64_evpc_dup (d))
9394 return true;
9395 return aarch64_evpc_tbl (d);
9397 return false;
9400 /* Expand a vec_perm_const pattern. */
9402 bool
9403 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9405 struct expand_vec_perm_d d;
9406 int i, nelt, which;
9408 d.target = target;
9409 d.op0 = op0;
9410 d.op1 = op1;
9412 d.vmode = GET_MODE (target);
9413 gcc_assert (VECTOR_MODE_P (d.vmode));
9414 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9415 d.testing_p = false;
9417 for (i = which = 0; i < nelt; ++i)
9419 rtx e = XVECEXP (sel, 0, i);
9420 int ei = INTVAL (e) & (2 * nelt - 1);
9421 which |= (ei < nelt ? 1 : 2);
9422 d.perm[i] = ei;
9425 switch (which)
9427 default:
9428 gcc_unreachable ();
9430 case 3:
9431 d.one_vector_p = false;
9432 if (!rtx_equal_p (op0, op1))
9433 break;
9435 /* The elements of PERM do not suggest that only the first operand
9436 is used, but both operands are identical. Allow easier matching
9437 of the permutation by folding the permutation into the single
9438 input vector. */
9439 /* Fall Through. */
9440 case 2:
9441 for (i = 0; i < nelt; ++i)
9442 d.perm[i] &= nelt - 1;
9443 d.op0 = op1;
9444 d.one_vector_p = true;
9445 break;
9447 case 1:
9448 d.op1 = op0;
9449 d.one_vector_p = true;
9450 break;
9453 return aarch64_expand_vec_perm_const_1 (&d);
9456 static bool
9457 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9458 const unsigned char *sel)
9460 struct expand_vec_perm_d d;
9461 unsigned int i, nelt, which;
9462 bool ret;
9464 d.vmode = vmode;
9465 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9466 d.testing_p = true;
9467 memcpy (d.perm, sel, nelt);
9469 /* Calculate whether all elements are in one vector. */
9470 for (i = which = 0; i < nelt; ++i)
9472 unsigned char e = d.perm[i];
9473 gcc_assert (e < 2 * nelt);
9474 which |= (e < nelt ? 1 : 2);
9477 /* If all elements are from the second vector, reindex as if from the
9478 first vector. */
9479 if (which == 2)
9480 for (i = 0; i < nelt; ++i)
9481 d.perm[i] -= nelt;
9483 /* Check whether the mask can be applied to a single vector. */
9484 d.one_vector_p = (which != 3);
9486 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9487 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9488 if (!d.one_vector_p)
9489 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9491 start_sequence ();
9492 ret = aarch64_expand_vec_perm_const_1 (&d);
9493 end_sequence ();
9495 return ret;
9498 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9499 bool
9500 aarch64_cannot_change_mode_class (enum machine_mode from,
9501 enum machine_mode to,
9502 enum reg_class rclass)
9504 /* Full-reg subregs are allowed on general regs or any class if they are
9505 the same size. */
9506 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9507 || !reg_classes_intersect_p (FP_REGS, rclass))
9508 return false;
9510 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9511 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9512 2. Scalar to Scalar for integer modes or same size float modes.
9513 3. Vector to Vector modes.
9514 4. On little-endian only, Vector-Structure to Vector modes. */
9515 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9517 if (aarch64_vector_mode_supported_p (from)
9518 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9519 return false;
9521 if (GET_MODE_NUNITS (from) == 1
9522 && GET_MODE_NUNITS (to) == 1
9523 && (GET_MODE_CLASS (from) == MODE_INT
9524 || from == to))
9525 return false;
9527 if (aarch64_vector_mode_supported_p (from)
9528 && aarch64_vector_mode_supported_p (to))
9529 return false;
9531 /* Within a vector structure straddling multiple vector registers
9532 we are in a mixed-endian representation. As such, we can't
9533 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9534 switch between vectors and vector structures cheaply. */
9535 if (!BYTES_BIG_ENDIAN)
9536 if ((aarch64_vector_mode_supported_p (from)
9537 && aarch64_vect_struct_mode_p (to))
9538 || (aarch64_vector_mode_supported_p (to)
9539 && aarch64_vect_struct_mode_p (from)))
9540 return false;
9543 return true;
9546 /* Implement MODES_TIEABLE_P. */
9548 bool
9549 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9551 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9552 return true;
9554 /* We specifically want to allow elements of "structure" modes to
9555 be tieable to the structure. This more general condition allows
9556 other rarer situations too. */
9557 if (TARGET_SIMD
9558 && aarch64_vector_mode_p (mode1)
9559 && aarch64_vector_mode_p (mode2))
9560 return true;
9562 return false;
9565 /* Return a new RTX holding the result of moving POINTER forward by
9566 AMOUNT bytes. */
9568 static rtx
9569 aarch64_move_pointer (rtx pointer, int amount)
9571 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9573 return adjust_automodify_address (pointer, GET_MODE (pointer),
9574 next, amount);
9577 /* Return a new RTX holding the result of moving POINTER forward by the
9578 size of the mode it points to. */
9580 static rtx
9581 aarch64_progress_pointer (rtx pointer)
9583 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9585 return aarch64_move_pointer (pointer, amount);
9588 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
9589 the size of MODE. */
9591 static void
9592 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9593 enum machine_mode mode)
9595 rtx reg = gen_reg_rtx (mode);
9597 /* "Cast" the pointers to the correct mode. */
9598 *src = adjust_address (*src, mode, 0);
9599 *dst = adjust_address (*dst, mode, 0);
9600 /* Emit the memcpy. */
9601 emit_move_insn (reg, *src);
9602 emit_move_insn (*dst, reg);
9603 /* Move the pointers forward. */
9604 *src = aarch64_progress_pointer (*src);
9605 *dst = aarch64_progress_pointer (*dst);
9608 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9609 we succeed, otherwise return false. */
9611 bool
9612 aarch64_expand_movmem (rtx *operands)
9614 unsigned int n;
9615 rtx dst = operands[0];
9616 rtx src = operands[1];
9617 rtx base;
9618 bool speed_p = !optimize_function_for_size_p (cfun);
9620 /* When optimizing for size, give a better estimate of the length of a
9621 memcpy call, but use the default otherwise. */
9622 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9624 /* We can't do anything smart if the amount to copy is not constant. */
9625 if (!CONST_INT_P (operands[2]))
9626 return false;
9628 n = UINTVAL (operands[2]);
9630 /* Try to keep the number of instructions low. For cases below 16 bytes we
9631 need to make at most two moves. For cases above 16 bytes it will be one
9632 move for each 16 byte chunk, then at most two additional moves. */
9633 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9634 return false;
9636 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9637 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9639 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9640 src = adjust_automodify_address (src, VOIDmode, base, 0);
9642 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
9643 1-byte chunk. */
9644 if (n < 4)
9646 if (n >= 2)
9648 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9649 n -= 2;
9652 if (n == 1)
9653 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9655 return true;
9658 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
9659 4-byte chunk, partially overlapping with the previously copied chunk. */
9660 if (n < 8)
9662 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9663 n -= 4;
9664 if (n > 0)
9666 int move = n - 4;
9668 src = aarch64_move_pointer (src, move);
9669 dst = aarch64_move_pointer (dst, move);
9670 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9672 return true;
9675 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
9676 them, then (if applicable) an 8-byte chunk. */
9677 while (n >= 8)
9679 if (n / 16)
9681 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9682 n -= 16;
9684 else
9686 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9687 n -= 8;
9691 /* Finish the final bytes of the copy. We can always do this in one
9692 instruction. We either copy the exact amount we need, or partially
9693 overlap with the previous chunk we copied and copy 8 bytes. */
9694 if (n == 0)
9695 return true;
9696 else if (n == 1)
9697 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9698 else if (n == 2)
9699 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9700 else if (n == 4)
9701 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9702 else
9704 if (n == 3)
9706 src = aarch64_move_pointer (src, -1);
9707 dst = aarch64_move_pointer (dst, -1);
9708 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9710 else
9712 int move = n - 8;
9714 src = aarch64_move_pointer (src, move);
9715 dst = aarch64_move_pointer (dst, move);
9716 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9720 return true;
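/* Worked examples of the strategy above: a 25-byte copy becomes one
   16-byte (TImode) move, one 8-byte (DImode) move and one 1-byte (QImode)
   move; a 7-byte copy becomes two overlapping 4-byte (SImode) moves, the
   second shifted back by one byte so that no separate 2-byte and 1-byte
   tail copies are needed.  */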
9723 #undef TARGET_ADDRESS_COST
9724 #define TARGET_ADDRESS_COST aarch64_address_cost
9726 /* This hook determines whether unnamed bitfields affect the alignment
9727 of the containing structure. The hook returns true if the structure
9728 should inherit the alignment requirements of an unnamed bitfield's
9729 type. */
9730 #undef TARGET_ALIGN_ANON_BITFIELD
9731 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9733 #undef TARGET_ASM_ALIGNED_DI_OP
9734 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9736 #undef TARGET_ASM_ALIGNED_HI_OP
9737 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9739 #undef TARGET_ASM_ALIGNED_SI_OP
9740 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9742 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9743 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9744 hook_bool_const_tree_hwi_hwi_const_tree_true
9746 #undef TARGET_ASM_FILE_START
9747 #define TARGET_ASM_FILE_START aarch64_start_file
9749 #undef TARGET_ASM_OUTPUT_MI_THUNK
9750 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9752 #undef TARGET_ASM_SELECT_RTX_SECTION
9753 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9755 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9756 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9758 #undef TARGET_BUILD_BUILTIN_VA_LIST
9759 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9761 #undef TARGET_CALLEE_COPIES
9762 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9764 #undef TARGET_CAN_ELIMINATE
9765 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9767 #undef TARGET_CANNOT_FORCE_CONST_MEM
9768 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9770 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9771 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9773 /* Only the least significant bit is used for initialization guard
9774 variables. */
9775 #undef TARGET_CXX_GUARD_MASK_BIT
9776 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9778 #undef TARGET_C_MODE_FOR_SUFFIX
9779 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9781 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9782 #undef TARGET_DEFAULT_TARGET_FLAGS
9783 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9784 #endif
9786 #undef TARGET_CLASS_MAX_NREGS
9787 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9789 #undef TARGET_BUILTIN_DECL
9790 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9792 #undef TARGET_EXPAND_BUILTIN
9793 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9795 #undef TARGET_EXPAND_BUILTIN_VA_START
9796 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9798 #undef TARGET_FOLD_BUILTIN
9799 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9801 #undef TARGET_FUNCTION_ARG
9802 #define TARGET_FUNCTION_ARG aarch64_function_arg
9804 #undef TARGET_FUNCTION_ARG_ADVANCE
9805 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9807 #undef TARGET_FUNCTION_ARG_BOUNDARY
9808 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9810 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9811 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9813 #undef TARGET_FUNCTION_VALUE
9814 #define TARGET_FUNCTION_VALUE aarch64_function_value
9816 #undef TARGET_FUNCTION_VALUE_REGNO_P
9817 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9819 #undef TARGET_FRAME_POINTER_REQUIRED
9820 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9822 #undef TARGET_GIMPLE_FOLD_BUILTIN
9823 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9825 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9826 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9828 #undef TARGET_INIT_BUILTINS
9829 #define TARGET_INIT_BUILTINS aarch64_init_builtins
9831 #undef TARGET_LEGITIMATE_ADDRESS_P
9832 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9834 #undef TARGET_LEGITIMATE_CONSTANT_P
9835 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9837 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9838 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9840 #undef TARGET_LRA_P
9841 #define TARGET_LRA_P aarch64_lra_p
9843 #undef TARGET_MANGLE_TYPE
9844 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9846 #undef TARGET_MEMORY_MOVE_COST
9847 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9849 #undef TARGET_MUST_PASS_IN_STACK
9850 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9852 /* This target hook should return true if accesses to volatile bitfields
9853 should use the narrowest mode possible. It should return false if these
9854 accesses should use the bitfield container type. */
9855 #undef TARGET_NARROW_VOLATILE_BITFIELD
9856 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9858 #undef TARGET_OPTION_OVERRIDE
9859 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9861 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9862 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9863 aarch64_override_options_after_change
9865 #undef TARGET_PASS_BY_REFERENCE
9866 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9868 #undef TARGET_PREFERRED_RELOAD_CLASS
9869 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9871 #undef TARGET_SECONDARY_RELOAD
9872 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9874 #undef TARGET_SHIFT_TRUNCATION_MASK
9875 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9877 #undef TARGET_SETUP_INCOMING_VARARGS
9878 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9880 #undef TARGET_STRUCT_VALUE_RTX
9881 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
9883 #undef TARGET_REGISTER_MOVE_COST
9884 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9886 #undef TARGET_RETURN_IN_MEMORY
9887 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9889 #undef TARGET_RETURN_IN_MSB
9890 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9892 #undef TARGET_RTX_COSTS
9893 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9895 #undef TARGET_SCHED_ISSUE_RATE
9896 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9898 #undef TARGET_TRAMPOLINE_INIT
9899 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9901 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9902 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9904 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9905 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9907 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9908 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9910 #undef TARGET_VECTORIZE_ADD_STMT_COST
9911 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
9913 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
9914 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
9915 aarch64_builtin_vectorization_cost
9917 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
9918 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
9920 #undef TARGET_VECTORIZE_BUILTINS
9921 #define TARGET_VECTORIZE_BUILTINS
9923 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
9924 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
9925 aarch64_builtin_vectorized_function
9927 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
9928 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
9929 aarch64_autovectorize_vector_sizes
9931 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
9932 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
9933 aarch64_atomic_assign_expand_fenv
9935 /* Section anchor support. */
9937 #undef TARGET_MIN_ANCHOR_OFFSET
9938 #define TARGET_MIN_ANCHOR_OFFSET -256
9940 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
9941 byte offset; we can do much more for larger data types, but have no way
9942 to determine the size of the access. We assume accesses are aligned. */
9943 #undef TARGET_MAX_ANCHOR_OFFSET
9944 #define TARGET_MAX_ANCHOR_OFFSET 4095
9946 #undef TARGET_VECTOR_ALIGNMENT
9947 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
9949 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
9950 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
9951 aarch64_simd_vector_alignment_reachable
9953 /* vec_perm support. */
9955 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
9956 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
9957 aarch64_vectorize_vec_perm_const_ok
9960 #undef TARGET_FIXED_CONDITION_CODE_REGS
9961 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
9963 struct gcc_target targetm = TARGET_INITIALIZER;
9965 #include "gt-aarch64.h"