[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob: 11654283055996a59a17f721948d0f3cdbb8229b
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "df.h"
35 #include "hard-reg-set.h"
36 #include "output.h"
37 #include "expr.h"
38 #include "reload.h"
39 #include "toplev.h"
40 #include "target.h"
41 #include "target-def.h"
42 #include "targhooks.h"
43 #include "ggc.h"
44 #include "function.h"
45 #include "tm_p.h"
46 #include "recog.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "pointer-set.h"
50 #include "hash-table.h"
51 #include "vec.h"
52 #include "basic-block.h"
53 #include "tree-ssa-alias.h"
54 #include "internal-fn.h"
55 #include "gimple-fold.h"
56 #include "tree-eh.h"
57 #include "gimple-expr.h"
58 #include "is-a.h"
59 #include "gimple.h"
60 #include "gimplify.h"
61 #include "optabs.h"
62 #include "dwarf2.h"
63 #include "cfgloop.h"
64 #include "tree-vectorizer.h"
65 #include "config/arm/aarch-cost-tables.h"
66 #include "dumpfile.h"
67 #include "builtins.h"
69 /* Defined for convenience. */
70 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
72 /* Classifies an address.
74 ADDRESS_REG_IMM
75 A simple base register plus immediate offset.
77 ADDRESS_REG_WB
78 A base register indexed by immediate offset with writeback.
80 ADDRESS_REG_REG
81 A base register indexed by (optionally scaled) register.
83 ADDRESS_REG_UXTW
84 A base register indexed by (optionally scaled) zero-extended register.
86 ADDRESS_REG_SXTW
87 A base register indexed by (optionally scaled) sign-extended register.
89 ADDRESS_LO_SUM
90 A LO_SUM rtx with a base register and "LO12" symbol relocation.
92 ADDRESS_SYMBOLIC
93 A constant symbolic address, in a PC-relative literal pool. */
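/* Illustrative assembly forms for each class, added as a reading aid;
   the exact operand choices are assumptions based on the AArch64 ISA
   rather than anything computed in this file:
     ADDRESS_REG_IMM     ldr x0, [x1, #16]
     ADDRESS_REG_WB      ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG     ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW    ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW    ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM      add x0, x0, #:lo12:sym  (paired with adrp x0, sym)
     ADDRESS_SYMBOLIC    ldr x0, <literal>  (PC-relative literal pool load)  */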
95 enum aarch64_address_type {
96 ADDRESS_REG_IMM,
97 ADDRESS_REG_WB,
98 ADDRESS_REG_REG,
99 ADDRESS_REG_UXTW,
100 ADDRESS_REG_SXTW,
101 ADDRESS_LO_SUM,
102 ADDRESS_SYMBOLIC
105 struct aarch64_address_info {
106 enum aarch64_address_type type;
107 rtx base;
108 rtx offset;
109 int shift;
110 enum aarch64_symbol_type symbol_type;
113 struct simd_immediate_info
115 rtx value;
116 int shift;
117 int element_width;
118 bool mvn;
119 bool msl;
122 /* The current code model. */
123 enum aarch64_code_model aarch64_cmodel;
125 #ifdef HAVE_AS_TLS
126 #undef TARGET_HAVE_TLS
127 #define TARGET_HAVE_TLS 1
128 #endif
130 static bool aarch64_lra_p (void);
131 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
132 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
133 const_tree,
134 enum machine_mode *, int *,
135 bool *);
136 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_override_options_after_change (void);
139 static bool aarch64_vector_mode_supported_p (enum machine_mode);
140 static unsigned bit_count (unsigned HOST_WIDE_INT);
141 static bool aarch64_const_vec_all_same_int_p (rtx,
142 HOST_WIDE_INT, HOST_WIDE_INT);
144 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* The current tuning set. */
152 const struct tune_params *aarch64_tune_params;
154 /* Mask to specify which instructions we are allowed to generate. */
155 unsigned long aarch64_isa_flags = 0;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Tuning parameters. */
162 #if HAVE_DESIGNATED_INITIALIZERS
163 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
164 #else
165 #define NAMED_PARAM(NAME, VAL) (VAL)
166 #endif
168 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
169 __extension__
170 #endif
172 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
173 __extension__
174 #endif
175 static const struct cpu_addrcost_table generic_addrcost_table =
177 #if HAVE_DESIGNATED_INITIALIZERS
178 .addr_scale_costs =
179 #endif
181 NAMED_PARAM (qi, 0),
182 NAMED_PARAM (hi, 0),
183 NAMED_PARAM (si, 0),
184 NAMED_PARAM (ti, 0),
186 NAMED_PARAM (pre_modify, 0),
187 NAMED_PARAM (post_modify, 0),
188 NAMED_PARAM (register_offset, 0),
189 NAMED_PARAM (register_extend, 0),
190 NAMED_PARAM (imm_offset, 0)
193 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
194 __extension__
195 #endif
196 static const struct cpu_addrcost_table cortexa57_addrcost_table =
198 #if HAVE_DESIGNATED_INITIALIZERS
199 .addr_scale_costs =
200 #endif
202 NAMED_PARAM (qi, 0),
203 NAMED_PARAM (hi, 1),
204 NAMED_PARAM (si, 0),
205 NAMED_PARAM (ti, 1),
207 NAMED_PARAM (pre_modify, 0),
208 NAMED_PARAM (post_modify, 0),
209 NAMED_PARAM (register_offset, 0),
210 NAMED_PARAM (register_extend, 0),
211 NAMED_PARAM (imm_offset, 0),
214 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
215 __extension__
216 #endif
217 static const struct cpu_regmove_cost generic_regmove_cost =
219 NAMED_PARAM (GP2GP, 1),
220 NAMED_PARAM (GP2FP, 2),
221 NAMED_PARAM (FP2GP, 2),
222 /* We currently do not provide direct support for TFmode Q->Q move.
223 Therefore we need to raise the cost above 2 in order to have
224 reload handle the situation. */
225 NAMED_PARAM (FP2FP, 4)
228 /* Generic costs for vector insn classes. */
229 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
230 __extension__
231 #endif
232 static const struct cpu_vector_cost generic_vector_cost =
234 NAMED_PARAM (scalar_stmt_cost, 1),
235 NAMED_PARAM (scalar_load_cost, 1),
236 NAMED_PARAM (scalar_store_cost, 1),
237 NAMED_PARAM (vec_stmt_cost, 1),
238 NAMED_PARAM (vec_to_scalar_cost, 1),
239 NAMED_PARAM (scalar_to_vec_cost, 1),
240 NAMED_PARAM (vec_align_load_cost, 1),
241 NAMED_PARAM (vec_unalign_load_cost, 1),
242 NAMED_PARAM (vec_unalign_store_cost, 1),
243 NAMED_PARAM (vec_store_cost, 1),
244 NAMED_PARAM (cond_taken_branch_cost, 3),
245 NAMED_PARAM (cond_not_taken_branch_cost, 1)
248 /* Cortex-A57 costs for vector insn classes. */
249 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
250 __extension__
251 #endif
252 static const struct cpu_vector_cost cortexa57_vector_cost =
254 NAMED_PARAM (scalar_stmt_cost, 1),
255 NAMED_PARAM (scalar_load_cost, 4),
256 NAMED_PARAM (scalar_store_cost, 1),
257 NAMED_PARAM (vec_stmt_cost, 3),
258 NAMED_PARAM (vec_to_scalar_cost, 8),
259 NAMED_PARAM (scalar_to_vec_cost, 8),
260 NAMED_PARAM (vec_align_load_cost, 5),
261 NAMED_PARAM (vec_unalign_load_cost, 5),
262 NAMED_PARAM (vec_unalign_store_cost, 1),
263 NAMED_PARAM (vec_store_cost, 1),
264 NAMED_PARAM (cond_taken_branch_cost, 1),
265 NAMED_PARAM (cond_not_taken_branch_cost, 1)
268 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
269 __extension__
270 #endif
271 static const struct tune_params generic_tunings =
273 &cortexa57_extra_costs,
274 &generic_addrcost_table,
275 &generic_regmove_cost,
276 &generic_vector_cost,
277 NAMED_PARAM (memmov_cost, 4),
278 NAMED_PARAM (issue_rate, 2)
281 static const struct tune_params cortexa53_tunings =
283 &cortexa53_extra_costs,
284 &generic_addrcost_table,
285 &generic_regmove_cost,
286 &generic_vector_cost,
287 NAMED_PARAM (memmov_cost, 4),
288 NAMED_PARAM (issue_rate, 2)
291 static const struct tune_params cortexa57_tunings =
293 &cortexa57_extra_costs,
294 &cortexa57_addrcost_table,
295 &generic_regmove_cost,
296 &cortexa57_vector_cost,
297 NAMED_PARAM (memmov_cost, 4),
298 NAMED_PARAM (issue_rate, 3)
301 /* A processor implementing AArch64. */
302 struct processor
304 const char *const name;
305 enum aarch64_processor core;
306 const char *arch;
307 const unsigned long flags;
308 const struct tune_params *const tune;
311 /* Processor cores implementing AArch64. */
312 static const struct processor all_cores[] =
314 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
315 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
316 #include "aarch64-cores.def"
317 #undef AARCH64_CORE
318 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
319 {NULL, aarch64_none, NULL, 0, NULL}
322 /* Architectures implementing AArch64. */
323 static const struct processor all_architectures[] =
325 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
326 {NAME, CORE, #ARCH, FLAGS, NULL},
327 #include "aarch64-arches.def"
328 #undef AARCH64_ARCH
329 {NULL, aarch64_none, NULL, 0, NULL}
332 /* Target specification. These are populated as command-line arguments
333 are processed, or NULL if not specified. */
334 static const struct processor *selected_arch;
335 static const struct processor *selected_cpu;
336 static const struct processor *selected_tune;
338 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
340 /* An ISA extension in the co-processor and main instruction set space. */
341 struct aarch64_option_extension
343 const char *const name;
344 const unsigned long flags_on;
345 const unsigned long flags_off;
348 /* ISA extensions in AArch64. */
349 static const struct aarch64_option_extension all_extensions[] =
351 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
352 {NAME, FLAGS_ON, FLAGS_OFF},
353 #include "aarch64-option-extensions.def"
354 #undef AARCH64_OPT_EXTENSION
355 {NULL, 0, 0}
358 /* Used to track the size of an address when generating a pre/post
359 increment address. */
360 static enum machine_mode aarch64_memory_reference_mode;
362 /* Used to force GTY into this file. */
363 static GTY(()) int gty_dummy;
365 /* A table of valid AArch64 "bitmask immediate" values for
366 logical instructions. */
368 #define AARCH64_NUM_BITMASKS 5334
369 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
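/* For orientation (a description of the AArch64 encoding, assumed
   rather than derived from this table): a "bitmask immediate" is a
   64-bit value made up of identical 2-, 4-, 8-, 16-, 32- or 64-bit
   elements, each element being a rotated run of contiguous ones.
   E.g. 0x00ff00ff00ff00ff and 0x0000fffffffe0000 are encodable in a
   single AND/ORR/EOR, while 0x1234 is not; 5334 is the number of
   distinct such 64-bit values.  */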
371 typedef enum aarch64_cond_code
373 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
374 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
375 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
377 aarch64_cc;
379 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
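/* This works because the enum above lists the conditions in
   complementary pairs, so flipping bit 0 yields the inverse, e.g.
   AARCH64_EQ (0) <-> AARCH64_NE (1) and AARCH64_GE (10) <-> AARCH64_LT (11).  */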
381 /* The condition codes of the processor, and the inverse function. */
382 static const char * const aarch64_condition_codes[] =
384 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
385 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
388 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
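/* For illustration, assuming the register numbering from the AArch64
   DWARF ABI (x0-x30 -> 0-30, sp -> 31, v0-v31 -> 64-95): x5 maps to
   DWARF register 5, sp to 31 and v3 to 67.  */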
389 unsigned
390 aarch64_dbx_register_number (unsigned regno)
392 if (GP_REGNUM_P (regno))
393 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
394 else if (regno == SP_REGNUM)
395 return AARCH64_DWARF_SP;
396 else if (FP_REGNUM_P (regno))
397 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
399 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
400 equivalent DWARF register. */
401 return DWARF_FRAME_REGISTERS;
404 /* Return TRUE if MODE is any of the large INT modes. */
405 static bool
406 aarch64_vect_struct_mode_p (enum machine_mode mode)
408 return mode == OImode || mode == CImode || mode == XImode;
411 /* Return TRUE if MODE is any of the vector modes. */
412 static bool
413 aarch64_vector_mode_p (enum machine_mode mode)
415 return aarch64_vector_mode_supported_p (mode)
416 || aarch64_vect_struct_mode_p (mode);
419 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
420 static bool
421 aarch64_array_mode_supported_p (enum machine_mode mode,
422 unsigned HOST_WIDE_INT nelems)
424 if (TARGET_SIMD
425 && AARCH64_VALID_SIMD_QREG_MODE (mode)
426 && (nelems >= 2 && nelems <= 4))
427 return true;
429 return false;
432 /* Implement HARD_REGNO_NREGS. */
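/* A worked example, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16 on this target: TImode (16 bytes) occupies two
   X registers but a single V register, while OImode (32 bytes)
   occupies two V registers.  */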
435 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
437 switch (aarch64_regno_regclass (regno))
439 case FP_REGS:
440 case FP_LO_REGS:
441 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
442 default:
443 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
445 gcc_unreachable ();
448 /* Implement HARD_REGNO_MODE_OK. */
451 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
453 if (GET_MODE_CLASS (mode) == MODE_CC)
454 return regno == CC_REGNUM;
456 if (regno == SP_REGNUM)
457 /* The purpose of comparing with ptr_mode is to support the
458 global register variable associated with the stack pointer
459 register via the syntax of asm ("wsp") in ILP32. */
460 return mode == Pmode || mode == ptr_mode;
462 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
463 return mode == Pmode;
465 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
466 return 1;
468 if (FP_REGNUM_P (regno))
470 if (aarch64_vect_struct_mode_p (mode))
471 return
472 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
473 else
474 return 1;
477 return 0;
480 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
481 enum machine_mode
482 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
483 enum machine_mode mode)
485 /* Handle modes that fit within single registers. */
486 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
488 if (GET_MODE_SIZE (mode) >= 4)
489 return mode;
490 else
491 return SImode;
493 /* Fall back to generic for multi-reg and very large modes. */
494 else
495 return choose_hard_reg_mode (regno, nregs, false);
498 /* Return true if calls to DECL should be treated as
499 long-calls (i.e. called via a register). */
500 static bool
501 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
503 return false;
506 /* Return true if calls to symbol-ref SYM should be treated as
507 long-calls (i.e. called via a register). */
508 bool
509 aarch64_is_long_call_p (rtx sym)
511 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
514 /* Return true if the offsets to a zero/sign-extract operation
515 represent an expression that matches an extend operation. The
516 operands represent the parameters from
518 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
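/* An assumed example to make the checks below concrete: with
   MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 we have
   34 & ~7 == 32 (a power of two), 34 & 7 == 2 and 4 == 1 << 2, so the
   expression describes a 32-bit value extended and shifted left by 2,
   i.e. an {u,s}xtw #2 style extend.  */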
519 bool
520 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
521 rtx extract_imm)
523 HOST_WIDE_INT mult_val, extract_val;
525 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
526 return false;
528 mult_val = INTVAL (mult_imm);
529 extract_val = INTVAL (extract_imm);
531 if (extract_val > 8
532 && extract_val < GET_MODE_BITSIZE (mode)
533 && exact_log2 (extract_val & ~7) > 0
534 && (extract_val & 7) <= 4
535 && mult_val == (1 << (extract_val & 7)))
536 return true;
538 return false;
541 /* Emit an insn that's a simple single-set. Both the operands must be
542 known to be valid. */
543 inline static rtx
544 emit_set_insn (rtx x, rtx y)
546 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
549 /* X and Y are two things to compare using CODE. Emit the compare insn and
550 return the rtx for the CC register in the proper mode. */
552 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
554 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
555 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
557 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
558 return cc_reg;
561 /* Build the SYMBOL_REF for __tls_get_addr. */
563 static GTY(()) rtx tls_get_addr_libfunc;
566 aarch64_tls_get_addr (void)
568 if (!tls_get_addr_libfunc)
569 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
570 return tls_get_addr_libfunc;
573 /* Return the TLS model to use for ADDR. */
575 static enum tls_model
576 tls_symbolic_operand_type (rtx addr)
578 enum tls_model tls_kind = TLS_MODEL_NONE;
579 rtx sym, addend;
581 if (GET_CODE (addr) == CONST)
583 split_const (addr, &sym, &addend);
584 if (GET_CODE (sym) == SYMBOL_REF)
585 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
587 else if (GET_CODE (addr) == SYMBOL_REF)
588 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
590 return tls_kind;
593 /* We allow LO_SUMs in our legitimate addresses so that combine
594 can take care of combining addresses where necessary; for
595 generation purposes, however, we generate the address
596 as:
597 RTL Absolute
598 tmp = hi (symbol_ref); adrp x1, foo
599 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
602 PIC TLS
603 adrp x1, :got:foo adrp tmp, :tlsgd:foo
604 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
605 bl __tls_get_addr
608 Load TLS symbol, depending on TLS mechanism and TLS access model.
610 Global Dynamic - Traditional TLS:
611 adrp tmp, :tlsgd:imm
612 add dest, tmp, #:tlsgd_lo12:imm
613 bl __tls_get_addr
615 Global Dynamic - TLS Descriptors:
616 adrp dest, :tlsdesc:imm
617 ldr tmp, [dest, #:tlsdesc_lo12:imm]
618 add dest, dest, #:tlsdesc_lo12:imm
619 blr tmp
620 mrs tp, tpidr_el0
621 add dest, dest, tp
623 Initial Exec:
624 mrs tp, tpidr_el0
625 adrp tmp, :gottprel:imm
626 ldr dest, [tmp, #:gottprel_lo12:imm]
627 add dest, dest, tp
629 Local Exec:
630 mrs tp, tpidr_el0
631 add t0, tp, #:tprel_hi12:imm
632 add t0, #:tprel_lo12_nc:imm
635 static void
636 aarch64_load_symref_appropriately (rtx dest, rtx imm,
637 enum aarch64_symbol_type type)
639 switch (type)
641 case SYMBOL_SMALL_ABSOLUTE:
643 /* In ILP32, the mode of dest can be either SImode or DImode. */
644 rtx tmp_reg = dest;
645 enum machine_mode mode = GET_MODE (dest);
647 gcc_assert (mode == Pmode || mode == ptr_mode);
649 if (can_create_pseudo_p ())
650 tmp_reg = gen_reg_rtx (mode);
652 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
653 emit_insn (gen_add_losym (dest, tmp_reg, imm));
654 return;
657 case SYMBOL_TINY_ABSOLUTE:
658 emit_insn (gen_rtx_SET (Pmode, dest, imm));
659 return;
661 case SYMBOL_SMALL_GOT:
663 /* In ILP32, the mode of dest can be either SImode or DImode,
664 while the got entry is always of SImode size. The mode of
665 dest depends on how dest is used: if dest is assigned to a
666 pointer (e.g. in memory), it has SImode; it may have
667 DImode if dest is dereferenced to access the memory.
668 This is why we have to handle three different ldr_got_small
669 patterns here (two patterns for ILP32). */
670 rtx tmp_reg = dest;
671 enum machine_mode mode = GET_MODE (dest);
673 if (can_create_pseudo_p ())
674 tmp_reg = gen_reg_rtx (mode);
676 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
677 if (mode == ptr_mode)
679 if (mode == DImode)
680 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
681 else
682 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
684 else
686 gcc_assert (mode == Pmode);
687 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
690 return;
693 case SYMBOL_SMALL_TLSGD:
695 rtx insns;
696 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
698 start_sequence ();
699 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
700 insns = get_insns ();
701 end_sequence ();
703 RTL_CONST_CALL_P (insns) = 1;
704 emit_libcall_block (insns, dest, result, imm);
705 return;
708 case SYMBOL_SMALL_TLSDESC:
710 enum machine_mode mode = GET_MODE (dest);
711 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
712 rtx tp;
714 gcc_assert (mode == Pmode || mode == ptr_mode);
716 /* In ILP32, the got entry is always of SImode size. Unlike
717 small GOT, the dest is fixed at reg 0. */
718 if (TARGET_ILP32)
719 emit_insn (gen_tlsdesc_small_si (imm));
720 else
721 emit_insn (gen_tlsdesc_small_di (imm));
722 tp = aarch64_load_tp (NULL);
724 if (mode != Pmode)
725 tp = gen_lowpart (mode, tp);
727 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
728 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
729 return;
732 case SYMBOL_SMALL_GOTTPREL:
734 /* In ILP32, the mode of dest can be either SImode or DImode,
735 while the got entry is always of SImode size. The mode of
736 dest depends on how dest is used: if dest is assigned to a
737 pointer (e.g. in memory), it has SImode; it may have
738 DImode if dest is dereferenced to access the memory.
739 This is why we have to handle three different tlsie_small
740 patterns here (two patterns for ILP32). */
741 enum machine_mode mode = GET_MODE (dest);
742 rtx tmp_reg = gen_reg_rtx (mode);
743 rtx tp = aarch64_load_tp (NULL);
745 if (mode == ptr_mode)
747 if (mode == DImode)
748 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
749 else
751 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
752 tp = gen_lowpart (mode, tp);
755 else
757 gcc_assert (mode == Pmode);
758 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
761 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
762 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
763 return;
766 case SYMBOL_SMALL_TPREL:
768 rtx tp = aarch64_load_tp (NULL);
769 emit_insn (gen_tlsle_small (dest, tp, imm));
770 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
771 return;
774 case SYMBOL_TINY_GOT:
775 emit_insn (gen_ldr_got_tiny (dest, imm));
776 return;
778 default:
779 gcc_unreachable ();
783 /* Emit a move from SRC to DEST. Assume that the move expanders can
784 handle all moves if !can_create_pseudo_p (). The distinction is
785 important because, unlike emit_move_insn, the move expanders know
786 how to force Pmode objects into the constant pool even when the
787 constant pool address is not itself legitimate. */
788 static rtx
789 aarch64_emit_move (rtx dest, rtx src)
791 return (can_create_pseudo_p ()
792 ? emit_move_insn (dest, src)
793 : emit_move_insn_1 (dest, src));
796 /* Split a 128-bit move operation into two 64-bit move operations,
797 taking care to handle partial overlap of register to register
798 copies. Special cases are needed when moving between GP regs and
799 FP regs. SRC can be a register, constant or memory; DST a register
800 or memory. If either operand is memory it must not have any side
801 effects. */
802 void
803 aarch64_split_128bit_move (rtx dst, rtx src)
805 rtx dst_lo, dst_hi;
806 rtx src_lo, src_hi;
808 enum machine_mode mode = GET_MODE (dst);
810 gcc_assert (mode == TImode || mode == TFmode);
811 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
812 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
814 if (REG_P (dst) && REG_P (src))
816 int src_regno = REGNO (src);
817 int dst_regno = REGNO (dst);
819 /* Handle FP <-> GP regs. */
820 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
822 src_lo = gen_lowpart (word_mode, src);
823 src_hi = gen_highpart (word_mode, src);
825 if (mode == TImode)
827 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
828 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
830 else
832 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
833 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
835 return;
837 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
839 dst_lo = gen_lowpart (word_mode, dst);
840 dst_hi = gen_highpart (word_mode, dst);
842 if (mode == TImode)
844 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
845 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
847 else
849 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
850 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
852 return;
856 dst_lo = gen_lowpart (word_mode, dst);
857 dst_hi = gen_highpart (word_mode, dst);
858 src_lo = gen_lowpart (word_mode, src);
859 src_hi = gen_highpart_mode (word_mode, mode, src);
861 /* At most one pairing may overlap. */
862 if (reg_overlap_mentioned_p (dst_lo, src_hi))
864 aarch64_emit_move (dst_hi, src_hi);
865 aarch64_emit_move (dst_lo, src_lo);
867 else
869 aarch64_emit_move (dst_lo, src_lo);
870 aarch64_emit_move (dst_hi, src_hi);
874 bool
875 aarch64_split_128bit_move_p (rtx dst, rtx src)
877 return (! REG_P (src)
878 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
881 /* Split a complex SIMD combine. */
883 void
884 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
886 enum machine_mode src_mode = GET_MODE (src1);
887 enum machine_mode dst_mode = GET_MODE (dst);
889 gcc_assert (VECTOR_MODE_P (dst_mode));
891 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
893 rtx (*gen) (rtx, rtx, rtx);
895 switch (src_mode)
897 case V8QImode:
898 gen = gen_aarch64_simd_combinev8qi;
899 break;
900 case V4HImode:
901 gen = gen_aarch64_simd_combinev4hi;
902 break;
903 case V2SImode:
904 gen = gen_aarch64_simd_combinev2si;
905 break;
906 case V2SFmode:
907 gen = gen_aarch64_simd_combinev2sf;
908 break;
909 case DImode:
910 gen = gen_aarch64_simd_combinedi;
911 break;
912 case DFmode:
913 gen = gen_aarch64_simd_combinedf;
914 break;
915 default:
916 gcc_unreachable ();
919 emit_insn (gen (dst, src1, src2));
920 return;
924 /* Split a complex SIMD move. */
926 void
927 aarch64_split_simd_move (rtx dst, rtx src)
929 enum machine_mode src_mode = GET_MODE (src);
930 enum machine_mode dst_mode = GET_MODE (dst);
932 gcc_assert (VECTOR_MODE_P (dst_mode));
934 if (REG_P (dst) && REG_P (src))
936 rtx (*gen) (rtx, rtx);
938 gcc_assert (VECTOR_MODE_P (src_mode));
940 switch (src_mode)
942 case V16QImode:
943 gen = gen_aarch64_split_simd_movv16qi;
944 break;
945 case V8HImode:
946 gen = gen_aarch64_split_simd_movv8hi;
947 break;
948 case V4SImode:
949 gen = gen_aarch64_split_simd_movv4si;
950 break;
951 case V2DImode:
952 gen = gen_aarch64_split_simd_movv2di;
953 break;
954 case V4SFmode:
955 gen = gen_aarch64_split_simd_movv4sf;
956 break;
957 case V2DFmode:
958 gen = gen_aarch64_split_simd_movv2df;
959 break;
960 default:
961 gcc_unreachable ();
964 emit_insn (gen (dst, src));
965 return;
969 static rtx
970 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
972 if (can_create_pseudo_p ())
973 return force_reg (mode, value);
974 else
976 x = aarch64_emit_move (x, value);
977 return x;
982 static rtx
983 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
985 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
987 rtx high;
988 /* Load the full offset into a register. This
989 might be improvable in the future. */
990 high = GEN_INT (offset);
991 offset = 0;
992 high = aarch64_force_temporary (mode, temp, high);
993 reg = aarch64_force_temporary (mode, temp,
994 gen_rtx_PLUS (mode, high, reg));
996 return plus_constant (mode, reg, offset);
999 void
1000 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1002 enum machine_mode mode = GET_MODE (dest);
1003 unsigned HOST_WIDE_INT mask;
1004 int i;
1005 bool first;
1006 unsigned HOST_WIDE_INT val;
1007 bool subtargets;
1008 rtx subtarget;
1009 int one_match, zero_match;
1011 gcc_assert (mode == SImode || mode == DImode);
1013 /* Check on what type of symbol it is. */
1014 if (GET_CODE (imm) == SYMBOL_REF
1015 || GET_CODE (imm) == LABEL_REF
1016 || GET_CODE (imm) == CONST)
1018 rtx mem, base, offset;
1019 enum aarch64_symbol_type sty;
1021 /* If we have (const (plus symbol offset)), separate out the offset
1022 before we start classifying the symbol. */
1023 split_const (imm, &base, &offset);
1025 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1026 switch (sty)
1028 case SYMBOL_FORCE_TO_MEM:
1029 if (offset != const0_rtx
1030 && targetm.cannot_force_const_mem (mode, imm))
1032 gcc_assert (can_create_pseudo_p ());
1033 base = aarch64_force_temporary (mode, dest, base);
1034 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1035 aarch64_emit_move (dest, base);
1036 return;
1038 mem = force_const_mem (ptr_mode, imm);
1039 gcc_assert (mem);
1040 if (mode != ptr_mode)
1041 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1042 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1043 return;
1045 case SYMBOL_SMALL_TLSGD:
1046 case SYMBOL_SMALL_TLSDESC:
1047 case SYMBOL_SMALL_GOTTPREL:
1048 case SYMBOL_SMALL_GOT:
1049 case SYMBOL_TINY_GOT:
1050 if (offset != const0_rtx)
1052 gcc_assert(can_create_pseudo_p ());
1053 base = aarch64_force_temporary (mode, dest, base);
1054 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1055 aarch64_emit_move (dest, base);
1056 return;
1058 /* FALLTHRU */
1060 case SYMBOL_SMALL_TPREL:
1061 case SYMBOL_SMALL_ABSOLUTE:
1062 case SYMBOL_TINY_ABSOLUTE:
1063 aarch64_load_symref_appropriately (dest, imm, sty);
1064 return;
1066 default:
1067 gcc_unreachable ();
1071 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1073 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1074 return;
1077 if (!CONST_INT_P (imm))
1079 if (GET_CODE (imm) == HIGH)
1080 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1081 else
1083 rtx mem = force_const_mem (mode, imm);
1084 gcc_assert (mem);
1085 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1088 return;
1091 if (mode == SImode)
1093 /* We know we can't do this in 1 insn, and we must be able to do it
1094 in two; so don't mess around looking for sequences that don't buy
1095 us anything. */
1096 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1097 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1098 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1099 return;
1102 /* Remaining cases are all for DImode. */
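/* A rough road map of the DImode strategies tried below, restating the
   code rather than adding to it: first count the 16-bit chunks of VAL
   that are all-ones or all-zeros; two all-ones chunks suggest a
   MOVN-style start plus one insertion, two all-zero chunks go straight
   to the MOVZ/MOVK fallback; otherwise try a nearby value plus a
   single 12-bit ADD/SUB, then arithmetic or logical combinations with
   a bitmask immediate, and finally MOVZ plus up to three MOVK
   insertions.  */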
1104 val = INTVAL (imm);
1105 subtargets = optimize && can_create_pseudo_p ();
1107 one_match = 0;
1108 zero_match = 0;
1109 mask = 0xffff;
1111 for (i = 0; i < 64; i += 16, mask <<= 16)
1113 if ((val & mask) == 0)
1114 zero_match++;
1115 else if ((val & mask) == mask)
1116 one_match++;
1119 if (one_match == 2)
1121 mask = 0xffff;
1122 for (i = 0; i < 64; i += 16, mask <<= 16)
1124 if ((val & mask) != mask)
1126 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1127 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1128 GEN_INT ((val >> i) & 0xffff)));
1129 return;
1132 gcc_unreachable ();
1135 if (zero_match == 2)
1136 goto simple_sequence;
1138 mask = 0x0ffff0000UL;
1139 for (i = 16; i < 64; i += 16, mask <<= 16)
1141 HOST_WIDE_INT comp = mask & ~(mask - 1);
1143 if (aarch64_uimm12_shift (val - (val & mask)))
1145 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1147 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1148 emit_insn (gen_adddi3 (dest, subtarget,
1149 GEN_INT (val - (val & mask))));
1150 return;
1152 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1154 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1156 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1157 GEN_INT ((val + comp) & mask)));
1158 emit_insn (gen_adddi3 (dest, subtarget,
1159 GEN_INT (val - ((val + comp) & mask))));
1160 return;
1162 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1164 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1166 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1167 GEN_INT ((val - comp) | ~mask)));
1168 emit_insn (gen_adddi3 (dest, subtarget,
1169 GEN_INT (val - ((val - comp) | ~mask))));
1170 return;
1172 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1174 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1176 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1177 GEN_INT (val | ~mask)));
1178 emit_insn (gen_adddi3 (dest, subtarget,
1179 GEN_INT (val - (val | ~mask))));
1180 return;
1184 /* See if we can do it by arithmetically combining two
1185 immediates. */
1186 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1188 int j;
1189 mask = 0xffff;
1191 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1192 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1194 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1195 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1196 GEN_INT (aarch64_bitmasks[i])));
1197 emit_insn (gen_adddi3 (dest, subtarget,
1198 GEN_INT (val - aarch64_bitmasks[i])));
1199 return;
1202 for (j = 0; j < 64; j += 16, mask <<= 16)
1204 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1206 emit_insn (gen_rtx_SET (VOIDmode, dest,
1207 GEN_INT (aarch64_bitmasks[i])));
1208 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1209 GEN_INT ((val >> j) & 0xffff)));
1210 return;
1215 /* See if we can do it by logically combining two immediates. */
1216 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1218 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1220 int j;
1222 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1223 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1225 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1226 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1227 GEN_INT (aarch64_bitmasks[i])));
1228 emit_insn (gen_iordi3 (dest, subtarget,
1229 GEN_INT (aarch64_bitmasks[j])));
1230 return;
1233 else if ((val & aarch64_bitmasks[i]) == val)
1235 int j;
1237 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1238 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1241 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1242 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1243 GEN_INT (aarch64_bitmasks[j])));
1244 emit_insn (gen_anddi3 (dest, subtarget,
1245 GEN_INT (aarch64_bitmasks[i])));
1246 return;
1251 simple_sequence:
1252 first = true;
1253 mask = 0xffff;
1254 for (i = 0; i < 64; i += 16, mask <<= 16)
1256 if ((val & mask) != 0)
1258 if (first)
1260 emit_insn (gen_rtx_SET (VOIDmode, dest,
1261 GEN_INT (val & mask)));
1262 first = false;
1264 else
1265 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1266 GEN_INT ((val >> i) & 0xffff)));
1271 static bool
1272 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1273 tree exp ATTRIBUTE_UNUSED)
1275 /* Currently, always true. */
1276 return true;
1279 /* Implement TARGET_PASS_BY_REFERENCE. */
1281 static bool
1282 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1283 enum machine_mode mode,
1284 const_tree type,
1285 bool named ATTRIBUTE_UNUSED)
1287 HOST_WIDE_INT size;
1288 enum machine_mode dummymode;
1289 int nregs;
1291 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1292 size = (mode == BLKmode && type)
1293 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1295 /* Aggregates are passed by reference based on their size. */
1296 if (type && AGGREGATE_TYPE_P (type))
1298 size = int_size_in_bytes (type);
1301 /* Variable-sized arguments are always passed by reference. */
1302 if (size < 0)
1303 return true;
1305 /* Can this be a candidate to be passed in fp/simd register(s)? */
1306 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1307 &dummymode, &nregs,
1308 NULL))
1309 return false;
1311 /* Arguments which are variable sized or larger than 2 registers are
1312 passed by reference unless they are a homogeneous floating-point
1313 aggregate. */
1314 return size > 2 * UNITS_PER_WORD;
1317 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1318 static bool
1319 aarch64_return_in_msb (const_tree valtype)
1321 enum machine_mode dummy_mode;
1322 int dummy_int;
1324 /* Never happens in little-endian mode. */
1325 if (!BYTES_BIG_ENDIAN)
1326 return false;
1328 /* Only composite types smaller than or equal to 16 bytes can
1329 be potentially returned in registers. */
1330 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1331 || int_size_in_bytes (valtype) <= 0
1332 || int_size_in_bytes (valtype) > 16)
1333 return false;
1335 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1336 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1337 is always passed/returned in the least significant bits of fp/simd
1338 register(s). */
1339 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1340 &dummy_mode, &dummy_int, NULL))
1341 return false;
1343 return true;
1346 /* Implement TARGET_FUNCTION_VALUE.
1347 Define how to find the value returned by a function. */
1349 static rtx
1350 aarch64_function_value (const_tree type, const_tree func,
1351 bool outgoing ATTRIBUTE_UNUSED)
1353 enum machine_mode mode;
1354 int unsignedp;
1355 int count;
1356 enum machine_mode ag_mode;
1358 mode = TYPE_MODE (type);
1359 if (INTEGRAL_TYPE_P (type))
1360 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1362 if (aarch64_return_in_msb (type))
1364 HOST_WIDE_INT size = int_size_in_bytes (type);
1366 if (size % UNITS_PER_WORD != 0)
1368 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1369 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1373 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1374 &ag_mode, &count, NULL))
1376 if (!aarch64_composite_type_p (type, mode))
1378 gcc_assert (count == 1 && mode == ag_mode);
1379 return gen_rtx_REG (mode, V0_REGNUM);
1381 else
1383 int i;
1384 rtx par;
1386 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1387 for (i = 0; i < count; i++)
1389 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1390 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1391 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1392 XVECEXP (par, 0, i) = tmp;
1394 return par;
1397 else
1398 return gen_rtx_REG (mode, R0_REGNUM);
1401 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1402 Return true if REGNO is the number of a hard register in which the values
1403 of called function may come back. */
1405 static bool
1406 aarch64_function_value_regno_p (const unsigned int regno)
1408 /* A maximum of 16 bytes can be returned in the general registers. Examples
1409 of 16-byte return values are: 128-bit integers and 16-byte small
1410 structures (excluding homogeneous floating-point aggregates). */
1411 if (regno == R0_REGNUM || regno == R1_REGNUM)
1412 return true;
1414 /* Up to four fp/simd registers can return a function value, e.g. a
1415 homogeneous floating-point aggregate having four members. */
1416 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1417 return !TARGET_GENERAL_REGS_ONLY;
1419 return false;
1422 /* Implement TARGET_RETURN_IN_MEMORY.
1424 If the type T of the result of a function is such that
1425 void func (T arg)
1426 would require that arg be passed as a value in a register (or set of
1427 registers) according to the parameter passing rules, then the result
1428 is returned in the same registers as would be used for such an
1429 argument. */
1431 static bool
1432 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1434 HOST_WIDE_INT size;
1435 enum machine_mode ag_mode;
1436 int count;
1438 if (!AGGREGATE_TYPE_P (type)
1439 && TREE_CODE (type) != COMPLEX_TYPE
1440 && TREE_CODE (type) != VECTOR_TYPE)
1441 /* Simple scalar types are always returned in registers. */
1442 return false;
1444 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1445 type,
1446 &ag_mode,
1447 &count,
1448 NULL))
1449 return false;
1451 /* Types larger than 2 registers are returned in memory. */
1452 size = int_size_in_bytes (type);
1453 return (size < 0 || size > 2 * UNITS_PER_WORD);
1456 static bool
1457 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1458 const_tree type, int *nregs)
1460 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1461 return aarch64_vfp_is_call_or_return_candidate (mode,
1462 type,
1463 &pcum->aapcs_vfp_rmode,
1464 nregs,
1465 NULL);
1468 /* Given MODE and TYPE of a function argument, return the alignment in
1469 bits. The idea is to suppress any stronger alignment requested by
1470 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1471 This is a helper function for local use only. */
1473 static unsigned int
1474 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1476 unsigned int alignment;
1478 if (type)
1480 if (!integer_zerop (TYPE_SIZE (type)))
1482 if (TYPE_MODE (type) == mode)
1483 alignment = TYPE_ALIGN (type);
1484 else
1485 alignment = GET_MODE_ALIGNMENT (mode);
1487 else
1488 alignment = 0;
1490 else
1491 alignment = GET_MODE_ALIGNMENT (mode);
1493 return alignment;
1496 /* Layout a function argument according to the AAPCS64 rules. The rule
1497 numbers refer to the rule numbers in the AAPCS64. */
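/* Some assumed examples of how these rules play out: a double is
   assigned the next SIMD/FP register; an HFA of four floats takes four
   consecutive SIMD/FP registers, or goes to the stack with the NSRN
   set to 8 if fewer than four remain; a 16-byte struct of integers
   takes a pair of general registers, with the NGRN rounded up to an
   even number first if the struct is 16-byte aligned; larger
   aggregates arrive here already replaced by a pointer via the
   pass-by-reference rules above.  */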
1499 static void
1500 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1501 const_tree type,
1502 bool named ATTRIBUTE_UNUSED)
1504 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1505 int ncrn, nvrn, nregs;
1506 bool allocate_ncrn, allocate_nvrn;
1507 HOST_WIDE_INT size;
1509 /* We need to do this once per argument. */
1510 if (pcum->aapcs_arg_processed)
1511 return;
1513 pcum->aapcs_arg_processed = true;
1515 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1516 size
1517 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1518 UNITS_PER_WORD);
1520 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1521 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1522 mode,
1523 type,
1524 &nregs);
1526 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1527 The following code thus handles passing by SIMD/FP registers first. */
1529 nvrn = pcum->aapcs_nvrn;
1531 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
1532 and homogeneous short-vector aggregates (HVA). */
1533 if (allocate_nvrn)
1535 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1537 pcum->aapcs_nextnvrn = nvrn + nregs;
1538 if (!aarch64_composite_type_p (type, mode))
1540 gcc_assert (nregs == 1);
1541 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1543 else
1545 rtx par;
1546 int i;
1547 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1548 for (i = 0; i < nregs; i++)
1550 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1551 V0_REGNUM + nvrn + i);
1552 tmp = gen_rtx_EXPR_LIST
1553 (VOIDmode, tmp,
1554 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1555 XVECEXP (par, 0, i) = tmp;
1557 pcum->aapcs_reg = par;
1559 return;
1561 else
1563 /* C.3 NSRN is set to 8. */
1564 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1565 goto on_stack;
1569 ncrn = pcum->aapcs_ncrn;
1570 nregs = size / UNITS_PER_WORD;
1572 /* C.6 - C.9, though the sign and zero extension semantics are
1573 handled elsewhere. This is the case where the argument fits
1574 entirely in general registers. */
1575 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1577 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1579 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1581 /* C.8 if the argument has an alignment of 16 then the NGRN is
1582 rounded up to the next even number. */
1583 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1585 ++ncrn;
1586 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1588 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1589 A reg is still generated for it, but the caller should be smart
1590 enough not to use it. */
1591 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1593 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1595 else
1597 rtx par;
1598 int i;
1600 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1601 for (i = 0; i < nregs; i++)
1603 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1604 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1605 GEN_INT (i * UNITS_PER_WORD));
1606 XVECEXP (par, 0, i) = tmp;
1608 pcum->aapcs_reg = par;
1611 pcum->aapcs_nextncrn = ncrn + nregs;
1612 return;
1615 /* C.11 */
1616 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1618 /* The argument is passed on stack; record the needed number of words for
1619 this argument and align the total size if necessary. */
1620 on_stack:
1621 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1622 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1623 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1624 16 / UNITS_PER_WORD);
1625 return;
1628 /* Implement TARGET_FUNCTION_ARG. */
1630 static rtx
1631 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1632 const_tree type, bool named)
1634 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1635 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1637 if (mode == VOIDmode)
1638 return NULL_RTX;
1640 aarch64_layout_arg (pcum_v, mode, type, named);
1641 return pcum->aapcs_reg;
1644 void
1645 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1646 const_tree fntype ATTRIBUTE_UNUSED,
1647 rtx libname ATTRIBUTE_UNUSED,
1648 const_tree fndecl ATTRIBUTE_UNUSED,
1649 unsigned n_named ATTRIBUTE_UNUSED)
1651 pcum->aapcs_ncrn = 0;
1652 pcum->aapcs_nvrn = 0;
1653 pcum->aapcs_nextncrn = 0;
1654 pcum->aapcs_nextnvrn = 0;
1655 pcum->pcs_variant = ARM_PCS_AAPCS64;
1656 pcum->aapcs_reg = NULL_RTX;
1657 pcum->aapcs_arg_processed = false;
1658 pcum->aapcs_stack_words = 0;
1659 pcum->aapcs_stack_size = 0;
1661 return;
1664 static void
1665 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1666 enum machine_mode mode,
1667 const_tree type,
1668 bool named)
1670 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1671 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1673 aarch64_layout_arg (pcum_v, mode, type, named);
1674 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1675 != (pcum->aapcs_stack_words != 0));
1676 pcum->aapcs_arg_processed = false;
1677 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1678 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1679 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1680 pcum->aapcs_stack_words = 0;
1681 pcum->aapcs_reg = NULL_RTX;
1685 bool
1686 aarch64_function_arg_regno_p (unsigned regno)
1688 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1689 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1692 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1693 PARM_BOUNDARY bits of alignment, but will be given anything up
1694 to STACK_BOUNDARY bits if the type requires it. This makes sure
1695 that both before and after the layout of each argument, the Next
1696 Stacked Argument Address (NSAA) will have a minimum alignment of
1697 8 bytes. */
1699 static unsigned int
1700 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1702 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1704 if (alignment < PARM_BOUNDARY)
1705 alignment = PARM_BOUNDARY;
1706 if (alignment > STACK_BOUNDARY)
1707 alignment = STACK_BOUNDARY;
1708 return alignment;
1711 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1713 Return true if an argument passed on the stack should be padded upwards,
1714 i.e. if the least-significant byte of the stack slot has useful data.
1716 Small aggregate types are placed at the lowest memory address.
1718 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
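/* An assumed illustration: on a big-endian target a char argument
   placed on the stack is padded downward, so its byte occupies the
   highest byte address of its 8-byte slot, whereas a 3-byte struct is
   padded upward and occupies the three lowest byte addresses of the
   slot.  */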
1720 bool
1721 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1723 /* On little-endian targets, the least significant byte of every stack
1724 argument is passed at the lowest byte address of the stack slot. */
1725 if (!BYTES_BIG_ENDIAN)
1726 return true;
1728 /* Otherwise, integral, floating-point and pointer types are padded downward:
1729 the least significant byte of a stack argument is passed at the highest
1730 byte address of the stack slot. */
1731 if (type
1732 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1733 || POINTER_TYPE_P (type))
1734 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1735 return false;
1737 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1738 return true;
1741 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1743 It specifies padding for the last (may also be the only)
1744 element of a block move between registers and memory. Assuming
1745 the block is in memory, padding upward means that the last
1746 element is padded after its most significant byte, while with
1747 downward padding the last element is padded on its least
1748 significant byte side.
1750 Small aggregates and small complex types are always padded
1751 upwards.
1753 We don't need to worry about homogeneous floating-point or
1754 short-vector aggregates; their move is not affected by the
1755 padding direction determined here. Regardless of endianness,
1756 each element of such an aggregate is put in the least
1757 significant bits of a fp/simd register.
1759 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1760 register has useful data, and return the opposite if the most
1761 significant byte does. */
1763 bool
1764 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1765 bool first ATTRIBUTE_UNUSED)
1768 /* Small composite types are always padded upward. */
1769 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1771 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1772 : GET_MODE_SIZE (mode));
1773 if (size < 2 * UNITS_PER_WORD)
1774 return true;
1777 /* Otherwise, use the default padding. */
1778 return !BYTES_BIG_ENDIAN;
1781 static enum machine_mode
1782 aarch64_libgcc_cmp_return_mode (void)
1784 return SImode;
1787 static bool
1788 aarch64_frame_pointer_required (void)
1790 /* If the function contains dynamic stack allocations, we need to
1791 use the frame pointer to access the static parts of the frame. */
1792 if (cfun->calls_alloca)
1793 return true;
1795 /* In aarch64_override_options_after_change
1796 flag_omit_leaf_frame_pointer turns off the frame pointer by
1797 default. Turn it back on now if we've not got a leaf
1798 function. */
1799 if (flag_omit_leaf_frame_pointer
1800 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1801 return true;
1803 return false;
1806 /* Mark the registers that need to be saved by the callee and calculate
1807 the size of the callee-saved registers area and frame record (both FP
1808 and LR may be omitted). */
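/* In outline, restating the computation below: saved_regs_size is the
   rounded size of the callee-save area (including the FP/LR frame
   record when the frame pointer is needed); hard_fp_offset adds the
   varargs save area and the local frame on top of that; frame_size
   additionally covers the outgoing argument area, with each total
   rounded to STACK_BOUNDARY.  */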
1809 static void
1810 aarch64_layout_frame (void)
1812 HOST_WIDE_INT offset = 0;
1813 int regno;
1815 if (reload_completed && cfun->machine->frame.laid_out)
1816 return;
1818 #define SLOT_NOT_REQUIRED (-2)
1819 #define SLOT_REQUIRED (-1)
1821 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
1822 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
1824 /* First mark all the registers that really need to be saved... */
1825 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1826 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1828 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1829 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1831 /* ... that includes the eh data registers (if needed)... */
1832 if (crtl->calls_eh_return)
1833 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1834 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1835 = SLOT_REQUIRED;
1837 /* ... and any callee saved register that dataflow says is live. */
1838 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1839 if (df_regs_ever_live_p (regno)
1840 && !call_used_regs[regno])
1841 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1843 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1844 if (df_regs_ever_live_p (regno)
1845 && !call_used_regs[regno])
1846 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1848 if (frame_pointer_needed)
1850 /* FP and LR are placed in the linkage record. */
1851 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1852 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
1853 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1854 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
1855 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1856 offset += 2 * UNITS_PER_WORD;
1859 /* Now assign stack slots for them. */
1860 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1861 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1863 cfun->machine->frame.reg_offset[regno] = offset;
1864 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1865 cfun->machine->frame.wb_candidate1 = regno;
1866 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
1867 cfun->machine->frame.wb_candidate2 = regno;
1868 offset += UNITS_PER_WORD;
1871 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1872 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1874 cfun->machine->frame.reg_offset[regno] = offset;
1875 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1876 cfun->machine->frame.wb_candidate1 = regno;
1877 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
1878 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
1879 cfun->machine->frame.wb_candidate2 = regno;
1880 offset += UNITS_PER_WORD;
1883 cfun->machine->frame.padding0 =
1884 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1885 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1887 cfun->machine->frame.saved_regs_size = offset;
1889 cfun->machine->frame.hard_fp_offset
1890 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1891 + get_frame_size ()
1892 + cfun->machine->frame.saved_regs_size,
1893 STACK_BOUNDARY / BITS_PER_UNIT);
1895 cfun->machine->frame.frame_size
1896 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1897 + crtl->outgoing_args_size,
1898 STACK_BOUNDARY / BITS_PER_UNIT);
1900 cfun->machine->frame.laid_out = true;
1903 /* Make the last instruction frame-related and note that it performs
1904 the operation described by FRAME_PATTERN. */
1906 static void
1907 aarch64_set_frame_expr (rtx frame_pattern)
1909 rtx insn;
1911 insn = get_last_insn ();
1912 RTX_FRAME_RELATED_P (insn) = 1;
1913 RTX_FRAME_RELATED_P (frame_pattern) = 1;
1914 REG_NOTES (insn) = alloc_EXPR_LIST (REG_FRAME_RELATED_EXPR,
1915 frame_pattern,
1916 REG_NOTES (insn));
1919 static bool
1920 aarch64_register_saved_on_entry (int regno)
1922 return cfun->machine->frame.reg_offset[regno] >= 0;
1925 static unsigned
1926 aarch64_next_callee_save (unsigned regno, unsigned limit)
1928 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1929 regno ++;
1930 return regno;
1933 static void
1934 aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
1935 HOST_WIDE_INT adjustment)
1937 rtx base_rtx = stack_pointer_rtx;
1938 rtx insn, reg, mem;
1940 reg = gen_rtx_REG (mode, regno);
1941 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
1942 plus_constant (Pmode, base_rtx, -adjustment));
1943 mem = gen_rtx_MEM (mode, mem);
1945 insn = emit_move_insn (mem, reg);
1946 RTX_FRAME_RELATED_P (insn) = 1;
1949 static void
1950 aarch64_popwb_single_reg (enum machine_mode mode, unsigned regno,
1951 HOST_WIDE_INT adjustment)
1953 rtx base_rtx = stack_pointer_rtx;
1954 rtx insn, reg, mem;
1956 reg = gen_rtx_REG (mode, regno);
1957 mem = gen_rtx_POST_MODIFY (Pmode, base_rtx,
1958 plus_constant (Pmode, base_rtx, adjustment));
1959 mem = gen_rtx_MEM (mode, mem);
1961 insn = emit_move_insn (reg, mem);
1962 add_reg_note (insn, REG_CFA_RESTORE, reg);
1963 RTX_FRAME_RELATED_P (insn) = 1;
1966 static rtx
1967 aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1968 HOST_WIDE_INT adjustment)
1970 switch (mode)
1972 case DImode:
1973 return gen_storewb_pairdi_di (base, base, reg, reg2,
1974 GEN_INT (-adjustment),
1975 GEN_INT (UNITS_PER_WORD - adjustment));
1976 case DFmode:
1977 return gen_storewb_pairdf_di (base, base, reg, reg2,
1978 GEN_INT (-adjustment),
1979 GEN_INT (UNITS_PER_WORD - adjustment));
1980 default:
1981 gcc_unreachable ();
1985 static void
1986 aarch64_pushwb_pair_reg (enum machine_mode mode, unsigned regno1,
1987 unsigned regno2, HOST_WIDE_INT adjustment)
1989 rtx insn;
1990 rtx reg1 = gen_rtx_REG (mode, regno1);
1991 rtx reg2 = gen_rtx_REG (mode, regno2);
1993 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
1994 reg2, adjustment));
1995 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
1997 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1998 RTX_FRAME_RELATED_P (insn) = 1;
2001 static rtx
2002 aarch64_gen_loadwb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
2003 HOST_WIDE_INT adjustment)
2005 switch (mode)
2007 case DImode:
2008 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2009 GEN_INT (UNITS_PER_WORD));
2010 case DFmode:
2011 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2012 GEN_INT (UNITS_PER_WORD));
2013 default:
2014 gcc_unreachable ();
2018 static void
2019 aarch64_popwb_pair_reg (enum machine_mode mode, unsigned regno1,
2020 unsigned regno2, HOST_WIDE_INT adjustment, rtx cfa)
2022 rtx insn;
2023 rtx reg1 = gen_rtx_REG (mode, regno1);
2024 rtx reg2 = gen_rtx_REG (mode, regno2);
2026 insn = emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
2027 reg2, adjustment));
2028 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2029 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2030 RTX_FRAME_RELATED_P (insn) = 1;
2032 if (cfa)
2033 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2034 (gen_rtx_SET (Pmode, stack_pointer_rtx,
2035 plus_constant (Pmode, cfa, adjustment))));
2037 add_reg_note (insn, REG_CFA_RESTORE, reg1);
2038 add_reg_note (insn, REG_CFA_RESTORE, reg2);
2041 static rtx
2042 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2043 rtx reg2)
2045 switch (mode)
2047 case DImode:
2048 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2050 case DFmode:
2051 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2053 default:
2054 gcc_unreachable ();
2058 static rtx
2059 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2060 rtx mem2)
2062 switch (mode)
2064 case DImode:
2065 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2067 case DFmode:
2068 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2070 default:
2071 gcc_unreachable ();
2076 static void
2077 aarch64_save_callee_saves (enum machine_mode mode, HOST_WIDE_INT start_offset,
2078 unsigned start, unsigned limit, bool skip_wb)
2080 rtx insn;
2081 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2082 ? gen_frame_mem : gen_rtx_MEM);
2083 unsigned regno;
2084 unsigned regno2;
2086 for (regno = aarch64_next_callee_save (start, limit);
2087 regno <= limit;
2088 regno = aarch64_next_callee_save (regno + 1, limit))
2090 rtx reg, mem;
2091 HOST_WIDE_INT offset;
2093 if (skip_wb
2094 && (regno == cfun->machine->frame.wb_candidate1
2095 || regno == cfun->machine->frame.wb_candidate2))
2096 continue;
2098 reg = gen_rtx_REG (mode, regno);
2099 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2100 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2101 offset));
2103 regno2 = aarch64_next_callee_save (regno + 1, limit);
2105 if (regno2 <= limit
2106 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2107 == cfun->machine->frame.reg_offset[regno2]))
2110 rtx reg2 = gen_rtx_REG (mode, regno2);
2111 rtx mem2;
2113 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2114 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2115 offset));
2116 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2117 reg2));
2119 /* The first part of a frame-related parallel insn is
2120 always assumed to be relevant to the frame
2121 calculations; subsequent parts are only
2122 frame-related if explicitly marked. */
2123 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2124 regno = regno2;
2126 else
2127 insn = emit_move_insn (mem, reg);
2129 RTX_FRAME_RELATED_P (insn) = 1;
2133 static void
2134 aarch64_restore_callee_saves (enum machine_mode mode,
2135 HOST_WIDE_INT start_offset, unsigned start,
2136 unsigned limit, bool skip_wb)
2138 rtx insn;
2139 rtx base_rtx = stack_pointer_rtx;
2140 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2141 ? gen_frame_mem : gen_rtx_MEM);
2142 unsigned regno;
2143 unsigned regno2;
2144 HOST_WIDE_INT offset;
2146 for (regno = aarch64_next_callee_save (start, limit);
2147 regno <= limit;
2148 regno = aarch64_next_callee_save (regno + 1, limit))
2150 rtx reg, mem;
2152 if (skip_wb
2153 && (regno == cfun->machine->frame.wb_candidate1
2154 || regno == cfun->machine->frame.wb_candidate2))
2155 continue;
2157 reg = gen_rtx_REG (mode, regno);
2158 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2159 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2161 regno2 = aarch64_next_callee_save (regno + 1, limit);
2163 if (regno2 <= limit
2164 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2165 == cfun->machine->frame.reg_offset[regno2]))
2167 rtx reg2 = gen_rtx_REG (mode, regno2);
2168 rtx mem2;
2170 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2171 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2172 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2,
2173 mem2));
2174 add_reg_note (insn, REG_CFA_RESTORE, reg);
2175 add_reg_note (insn, REG_CFA_RESTORE, reg2);
2177 /* The first part of a frame-related parallel insn is
2178 always assumed to be relevant to the frame
2179 calculations; subsequent parts are only
2180 frame-related if explicitly marked. */
2181 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2182 regno = regno2;
2184 else
2186 insn = emit_move_insn (reg, mem);
2187 add_reg_note (insn, REG_CFA_RESTORE, reg);
2190 RTX_FRAME_RELATED_P (insn) = 1;
2194 /* AArch64 stack frames generated by this compiler look like:
2196 +-------------------------------+
2198 |  incoming stack arguments     |
2200 +-------------------------------+
2201 |                               | <-- incoming stack pointer (aligned)
2202 |  callee-allocated save area   |
2203 |  for register varargs         |
2205 +-------------------------------+
2206 |  local variables              | <-- frame_pointer_rtx
2208 +-------------------------------+
2209 |  padding0                     | \
2210 +-------------------------------+  |
2211 |  callee-saved registers       |  | frame.saved_regs_size
2212 +-------------------------------+  |
2213 |  LR'                          |  |
2214 +-------------------------------+  |
2215 |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2216 +-------------------------------+
2217 |  dynamic allocation           |
2218 +-------------------------------+
2219 |  padding                      |
2220 +-------------------------------+
2221 |  outgoing stack arguments     | <-- arg_pointer
2223 +-------------------------------+
2224 |                               | <-- stack_pointer_rtx (aligned)
2226 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2227 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2228 unchanged. */
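/* In the prologue and epilogue code below, frame.frame_size is the
   distance from the incoming (aligned) stack pointer down to the
   outgoing stack pointer, frame.hard_fp_offset is the distance from
   the incoming stack pointer down to the frame record (FP'/LR'), and
   fp_offset (frame_size - hard_fp_offset) is therefore the distance
   from the final stack pointer up to the frame record.  */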
2230 /* Generate the prologue instructions for entry into a function.
2231 Establish the stack frame by decreasing the stack pointer with a
2232 properly calculated size and, if necessary, create a frame record
2233 filled with the values of LR and previous frame pointer. The
2234 current FP is also set up if it is in use. */
2236 void
2237 aarch64_expand_prologue (void)
2239 /* sub sp, sp, #<frame_size>
2240 stp {fp, lr}, [sp, #<frame_size> - 16]
2241 add fp, sp, #<frame_size> - hardfp_offset
2242 stp {cs_reg}, [fp, #-16] etc.
2244 sub sp, sp, <final_adjustment_if_any>  */
2246 HOST_WIDE_INT frame_size, offset;
2247 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2248 rtx insn;
2250 aarch64_layout_frame ();
2252 if (flag_stack_usage_info)
2253 current_function_static_stack_size = cfun->machine->frame.frame_size;
2255 frame_size = cfun->machine->frame.frame_size;
2256 offset = cfun->machine->frame.frame_size;
2258 fp_offset = cfun->machine->frame.frame_size
2259 - cfun->machine->frame.hard_fp_offset;
2261 /* Store pairs and load pairs have a range of only -512 to 504. */
2262 if (offset >= 512)
2264 /* When the frame has a large size, the stack pointer is first
2265 decreased to skip over the callee-allocated save area for
2266 register varargs, the local variable area and/or the callee-saved
2267 register area. This allows the pre-index write-back
2268 store pair instructions to be used for setting up the stack frame
2269 efficiently. */
2270 offset = cfun->machine->frame.hard_fp_offset;
2271 if (offset >= 512)
2272 offset = cfun->machine->frame.saved_regs_size;
2274 frame_size -= (offset + crtl->outgoing_args_size);
2275 fp_offset = 0;
2277 if (frame_size >= 0x1000000)
2279 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2280 emit_move_insn (op0, GEN_INT (-frame_size));
2281 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2282 aarch64_set_frame_expr (gen_rtx_SET
2283 (Pmode, stack_pointer_rtx,
2284 plus_constant (Pmode,
2285 stack_pointer_rtx,
2286 -frame_size)));
2288 else if (frame_size > 0)
2290 if ((frame_size & 0xfff) != frame_size)
2292 insn = emit_insn (gen_add2_insn
2293 (stack_pointer_rtx,
2294 GEN_INT (-(frame_size
2295 & ~(HOST_WIDE_INT)0xfff))));
2296 RTX_FRAME_RELATED_P (insn) = 1;
2298 if ((frame_size & 0xfff) != 0)
2300 insn = emit_insn (gen_add2_insn
2301 (stack_pointer_rtx,
2302 GEN_INT (-(frame_size
2303 & (HOST_WIDE_INT)0xfff))));
2304 RTX_FRAME_RELATED_P (insn) = 1;
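/* For example, a remaining frame_size of 0x1234 is handled by the two
   SUBs above as 0x1000 followed by 0x234, since the SUB immediate
   field is limited to 12 bits, optionally shifted left by 12.  */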
2308 else
2309 frame_size = -1;
2311 if (offset > 0)
2313 bool skip_wb = false;
2315 if (frame_pointer_needed)
2317 skip_wb = true;
2319 if (fp_offset)
2321 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2322 GEN_INT (-offset)));
2323 RTX_FRAME_RELATED_P (insn) = 1;
2324 aarch64_set_frame_expr (gen_rtx_SET
2325 (Pmode, stack_pointer_rtx,
2326 gen_rtx_MINUS (Pmode, stack_pointer_rtx,
2327 GEN_INT (offset))));
2329 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2330 R30_REGNUM, false);
2332 else
2333 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2335 /* Set up frame pointer to point to the location of the
2336 previous frame pointer on the stack. */
2337 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2338 stack_pointer_rtx,
2339 GEN_INT (fp_offset)));
2340 aarch64_set_frame_expr (gen_rtx_SET
2341 (Pmode, hard_frame_pointer_rtx,
2342 plus_constant (Pmode,
2343 stack_pointer_rtx,
2344 fp_offset)));
2345 RTX_FRAME_RELATED_P (insn) = 1;
2346 insn = emit_insn (gen_stack_tie (stack_pointer_rtx,
2347 hard_frame_pointer_rtx));
2349 else
2351 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2352 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2354 if (fp_offset
2355 || reg1 == FIRST_PSEUDO_REGISTER
2356 || (reg2 == FIRST_PSEUDO_REGISTER
2357 && offset >= 256))
2359 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2360 GEN_INT (-offset)));
2361 RTX_FRAME_RELATED_P (insn) = 1;
2363 else
2365 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2367 skip_wb = true;
2369 if (reg2 == FIRST_PSEUDO_REGISTER)
2370 aarch64_pushwb_single_reg (mode1, reg1, offset);
2371 else
2372 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2376 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2377 skip_wb);
2378 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2379 skip_wb);
2382 /* When offset >= 512,
2383 sub sp, sp, #<outgoing_args_size> */
2384 if (frame_size > -1)
2386 if (crtl->outgoing_args_size > 0)
2388 insn = emit_insn (gen_add2_insn
2389 (stack_pointer_rtx,
2390 GEN_INT (- crtl->outgoing_args_size)));
2391 RTX_FRAME_RELATED_P (insn) = 1;
2396 /* Generate the epilogue instructions for returning from a function. */
2397 void
2398 aarch64_expand_epilogue (bool for_sibcall)
2400 HOST_WIDE_INT frame_size, offset;
2401 HOST_WIDE_INT fp_offset;
2402 rtx insn;
2403 rtx cfa_reg;
2405 aarch64_layout_frame ();
2407 offset = frame_size = cfun->machine->frame.frame_size;
2408 fp_offset = cfun->machine->frame.frame_size
2409 - cfun->machine->frame.hard_fp_offset;
2411 cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx;
2413 /* Store pairs and load pairs have a range of only -512 to 504. */
2414 if (offset >= 512)
2416 offset = cfun->machine->frame.hard_fp_offset;
2417 if (offset >= 512)
2418 offset = cfun->machine->frame.saved_regs_size;
2420 frame_size -= (offset + crtl->outgoing_args_size);
2421 fp_offset = 0;
2422 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2424 insn = emit_insn (gen_add2_insn
2425 (stack_pointer_rtx,
2426 GEN_INT (crtl->outgoing_args_size)));
2427 RTX_FRAME_RELATED_P (insn) = 1;
2430 else
2431 frame_size = -1;
2433 /* If there were outgoing arguments or we've done dynamic stack
2434 allocation, then restore the stack pointer from the frame
2435 pointer. This is at most one insn and more efficient than using
2436 GCC's internal mechanism. */
2437 if (frame_pointer_needed
2438 && (crtl->outgoing_args_size || cfun->calls_alloca))
2440 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2441 hard_frame_pointer_rtx,
2442 GEN_INT (0)));
2443 offset = offset - fp_offset;
2444 RTX_FRAME_RELATED_P (insn) = 1;
2445 /* As SP is set to (FP - fp_offset), according to the rules in
2446 dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
2447 from the value of SP from now on. */
2448 cfa_reg = stack_pointer_rtx;
2451 if (offset > 0)
2453 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2454 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2455 bool skip_wb = true;
2457 if (frame_pointer_needed)
2458 fp_offset = 0;
2459 else if (fp_offset
2460 || reg1 == FIRST_PSEUDO_REGISTER
2461 || (reg2 == FIRST_PSEUDO_REGISTER
2462 && offset >= 256))
2463 skip_wb = false;
2465 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2466 skip_wb);
2467 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2468 skip_wb);
2470 if (skip_wb)
2472 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2474 if (reg2 == FIRST_PSEUDO_REGISTER)
2475 aarch64_popwb_single_reg (mode1, reg1, offset);
2476 else
2478 if (reg1 != HARD_FRAME_POINTER_REGNUM)
2479 cfa_reg = NULL;
2481 aarch64_popwb_pair_reg (mode1, reg1, reg2, offset, cfa_reg);
2484 else
2486 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2487 GEN_INT (offset)));
2488 RTX_FRAME_RELATED_P (insn) = 1;
2492 /* Stack adjustment for exception handler. */
2493 if (crtl->calls_eh_return)
2495 /* We need to unwind the stack by the offset computed by
2496 EH_RETURN_STACKADJ_RTX. However, at this point the CFA is
2497 based on SP. Ideally we would update the SP and define the
2498 CFA along the lines of:
2500 SP = SP + EH_RETURN_STACKADJ_RTX
2501 (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
2503 However, the DWARF emitter only understands a constant
2504 register offset.
2506 The solution chosen here is to use the otherwise unused IP0
2507 as a temporary register to hold the current SP value. The
2508 CFA is described using IP0 then SP is modified. */
2510 rtx ip0 = gen_rtx_REG (DImode, IP0_REGNUM);
2512 insn = emit_move_insn (ip0, stack_pointer_rtx);
2513 add_reg_note (insn, REG_CFA_DEF_CFA, ip0);
2514 RTX_FRAME_RELATED_P (insn) = 1;
2516 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2518 /* Ensure the assignment to IP0 does not get optimized away. */
2519 emit_use (ip0);
2522 if (frame_size > -1)
2524 if (frame_size >= 0x1000000)
2526 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2527 emit_move_insn (op0, GEN_INT (frame_size));
2528 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2529 aarch64_set_frame_expr (gen_rtx_SET
2530 (Pmode, stack_pointer_rtx,
2531 plus_constant (Pmode,
2532 stack_pointer_rtx,
2533 frame_size)));
2535 else if (frame_size > 0)
2537 if ((frame_size & 0xfff) != 0)
2539 insn = emit_insn (gen_add2_insn
2540 (stack_pointer_rtx,
2541 GEN_INT ((frame_size
2542 & (HOST_WIDE_INT) 0xfff))));
2543 RTX_FRAME_RELATED_P (insn) = 1;
2545 if ((frame_size & 0xfff) != frame_size)
2547 insn = emit_insn (gen_add2_insn
2548 (stack_pointer_rtx,
2549 GEN_INT ((frame_size
2550 & ~ (HOST_WIDE_INT) 0xfff))));
2551 RTX_FRAME_RELATED_P (insn) = 1;
2555 aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx,
2556 plus_constant (Pmode,
2557 stack_pointer_rtx,
2558 offset)));
2561 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2562 if (!for_sibcall)
2563 emit_jump_insn (ret_rtx);
2566 /* Return the place to copy the exception unwinding return address to.
2567 This will probably be a stack slot, but could (in theory) be the
2568 return register. */
2570 rtx aarch64_final_eh_return_addr (void)
2572 HOST_WIDE_INT fp_offset;
2574 aarch64_layout_frame ();
2576 fp_offset = cfun->machine->frame.frame_size
2577 - cfun->machine->frame.hard_fp_offset;
2579 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2580 return gen_rtx_REG (DImode, LR_REGNUM);
2582 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2583 result in a store to save LR introduced by builtin_eh_return () being
2584 incorrectly deleted because the alias is not detected.
2585 So in the calculation of the address to copy the exception unwinding
2586 return address to, we distinguish two cases.
2587 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2588 we return a SP-relative location since all the addresses are SP-relative
2589 in this case. This prevents the store from being optimized away.
2590 If the fp_offset is not 0, then the addresses will be FP-relative and
2591 therefore we return a FP-relative location. */
2593 if (frame_pointer_needed)
2595 if (fp_offset)
2596 return gen_frame_mem (DImode,
2597 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2598 else
2599 return gen_frame_mem (DImode,
2600 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2603 /* If FP is not needed, we calculate the location of LR, which would be
2604 at the top of the saved registers block. */
2606 return gen_frame_mem (DImode,
2607 plus_constant (Pmode,
2608 stack_pointer_rtx,
2609 fp_offset
2610 + cfun->machine->frame.saved_regs_size
2611 - 2 * UNITS_PER_WORD));
2614 /* Possibly output code to build up a constant in a register. For
2615 the benefit of the costs infrastructure, returns the number of
2616 instructions which would be emitted. GENERATE inhibits or
2617 enables code generation. */
2619 static int
2620 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2622 int insns = 0;
2624 if (aarch64_bitmask_imm (val, DImode))
2626 if (generate)
2627 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2628 insns = 1;
2630 else
2632 int i;
2633 int ncount = 0;
2634 int zcount = 0;
2635 HOST_WIDE_INT valp = val >> 16;
2636 HOST_WIDE_INT valm;
2637 HOST_WIDE_INT tval;
2639 for (i = 16; i < 64; i += 16)
2641 valm = (valp & 0xffff);
2643 if (valm != 0)
2644 ++ zcount;
2646 if (valm != 0xffff)
2647 ++ ncount;
2649 valp >>= 16;
2652 /* zcount contains the number of additional MOVK instructions
2653 required if the constant is built up with an initial MOVZ instruction,
2654 while ncount is the number of MOVK instructions required if starting
2655 with a MOVN instruction. Choose the sequence that yields the fewest
2656 number of instructions, preferring MOVZ instructions when they are both
2657 the same. */
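/* For example, for 0xffffffffffff1234 the three upper 16-bit chunks
   are all 0xffff, so ncount == 0 and zcount == 3: starting with MOVN
   needs no trailing MOVKs, whereas starting with MOVZ would need
   three.  */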
2658 if (ncount < zcount)
2660 if (generate)
2661 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2662 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2663 tval = 0xffff;
2664 insns++;
2666 else
2668 if (generate)
2669 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2670 GEN_INT (val & 0xffff));
2671 tval = 0;
2672 insns++;
2675 val >>= 16;
2677 for (i = 16; i < 64; i += 16)
2679 if ((val & 0xffff) != tval)
2681 if (generate)
2682 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2683 GEN_INT (i),
2684 GEN_INT (val & 0xffff)));
2685 insns++;
2687 val >>= 16;
2690 return insns;
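/* Add DELTA to register REGNUM, using register SCRATCHREG as a
   temporary where needed.  Absolute deltas below 4096 * 4096 are added
   as SCRATCHREG = delta / 4096 shifted left by 12 plus the low 12-bit
   remainder; for example a delta of 0x5010 is added as SCRATCHREG = 5,
   REGNUM += SCRATCHREG << 12, REGNUM += 0x10.  Larger deltas are first
   built into SCRATCHREG with aarch64_build_constant and then added.  */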
2693 static void
2694 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2696 HOST_WIDE_INT mdelta = delta;
2697 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2698 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2700 if (mdelta < 0)
2701 mdelta = -mdelta;
2703 if (mdelta >= 4096 * 4096)
2705 (void) aarch64_build_constant (scratchreg, delta, true);
2706 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2708 else if (mdelta > 0)
2710 if (mdelta >= 4096)
2712 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2713 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2714 if (delta < 0)
2715 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2716 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2717 else
2718 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2719 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2721 if (mdelta % 4096 != 0)
2723 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2724 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2725 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2730 /* Output code to add DELTA to the first argument, and then jump
2731 to FUNCTION. Used for C++ multiple inheritance. */
2732 static void
2733 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2734 HOST_WIDE_INT delta,
2735 HOST_WIDE_INT vcall_offset,
2736 tree function)
2738 /* The this pointer is always in x0. Note that this differs from
2739 Arm where the this pointer may be bumped to r1 if r0 is required
2740 to return a pointer to an aggregate. On AArch64 a result value
2741 pointer will be in x8. */
2742 int this_regno = R0_REGNUM;
2743 rtx this_rtx, temp0, temp1, addr, insn, funexp;
2745 reload_completed = 1;
2746 emit_note (NOTE_INSN_PROLOGUE_END);
2748 if (vcall_offset == 0)
2749 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2750 else
2752 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2754 this_rtx = gen_rtx_REG (Pmode, this_regno);
2755 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2756 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2758 addr = this_rtx;
2759 if (delta != 0)
2761 if (delta >= -256 && delta < 256)
2762 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2763 plus_constant (Pmode, this_rtx, delta));
2764 else
2765 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2768 if (Pmode == ptr_mode)
2769 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2770 else
2771 aarch64_emit_move (temp0,
2772 gen_rtx_ZERO_EXTEND (Pmode,
2773 gen_rtx_MEM (ptr_mode, addr)));
2775 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2776 addr = plus_constant (Pmode, temp0, vcall_offset);
2777 else
2779 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2780 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2783 if (Pmode == ptr_mode)
2784 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2785 else
2786 aarch64_emit_move (temp1,
2787 gen_rtx_SIGN_EXTEND (Pmode,
2788 gen_rtx_MEM (ptr_mode, addr)));
2790 emit_insn (gen_add2_insn (this_rtx, temp1));
2793 /* Generate a tail call to the target function. */
2794 if (!TREE_USED (function))
2796 assemble_external (function);
2797 TREE_USED (function) = 1;
2799 funexp = XEXP (DECL_RTL (function), 0);
2800 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2801 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2802 SIBLING_CALL_P (insn) = 1;
2804 insn = get_insns ();
2805 shorten_branches (insn);
2806 final_start_function (insn, file, 1);
2807 final (insn, file, 1);
2808 final_end_function ();
2810 /* Stop pretending to be a post-reload pass. */
2811 reload_completed = 0;
2814 static int
2815 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2817 if (GET_CODE (*x) == SYMBOL_REF)
2818 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2820 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2821 TLS offsets, not real symbol references. */
2822 if (GET_CODE (*x) == UNSPEC
2823 && XINT (*x, 1) == UNSPEC_TLS)
2824 return -1;
2826 return 0;
2829 static bool
2830 aarch64_tls_referenced_p (rtx x)
2832 if (!TARGET_HAVE_TLS)
2833 return false;
2835 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2839 static int
2840 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2842 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2843 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2845 if (*imm1 < *imm2)
2846 return -1;
2847 if (*imm1 > *imm2)
2848 return +1;
2849 return 0;
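/* Fill aarch64_bitmasks with the valid AArch64 bitmask (logical)
   immediates: for each element size E of 2, 4, ..., 64 bits, a run of
   S ones (0 < S < E) rotated right by R and replicated across 64 bits.
   For example E == 16, S == 8, R == 0 gives 0x00ff00ff00ff00ff.  The
   table is sorted so that aarch64_bitmask_imm can use bsearch.  */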
2853 static void
2854 aarch64_build_bitmask_table (void)
2856 unsigned HOST_WIDE_INT mask, imm;
2857 unsigned int log_e, e, s, r;
2858 unsigned int nimms = 0;
2860 for (log_e = 1; log_e <= 6; log_e++)
2862 e = 1 << log_e;
2863 if (e == 64)
2864 mask = ~(HOST_WIDE_INT) 0;
2865 else
2866 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2867 for (s = 1; s < e; s++)
2869 for (r = 0; r < e; r++)
2871 /* Set s consecutive bits to 1 (s < 64). */
2872 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2873 /* Rotate right by r. */
2874 if (r != 0)
2875 imm = ((imm >> r) | (imm << (e - r))) & mask;
2876 /* Replicate the constant depending on SIMD size. */
2877 switch (log_e) {
2878 case 1: imm |= (imm << 2);
2879 case 2: imm |= (imm << 4);
2880 case 3: imm |= (imm << 8);
2881 case 4: imm |= (imm << 16);
2882 case 5: imm |= (imm << 32);
2883 case 6:
2884 break;
2885 default:
2886 gcc_unreachable ();
2888 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2889 aarch64_bitmasks[nimms++] = imm;
2894 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2895 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2896 aarch64_bitmasks_cmp);
2900 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2901 a left shift of 0 or 12 bits. */
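/* For example, 0xabc and 0xabc000 can be encoded (shift 0 and shift 12
   respectively), while 0x1001 cannot, since its set bits span both
   halves.  */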
2902 bool
2903 aarch64_uimm12_shift (HOST_WIDE_INT val)
2905 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2906 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
2911 /* Return true if val is an immediate that can be loaded into a
2912 register by a MOVZ instruction. */
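/* For example, 0x12340000 (a single 16-bit chunk at bit 16) can be
   loaded with one MOVZ, while 0x12345 cannot, since it spans two
   16-bit chunks.  */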
2913 static bool
2914 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2916 if (GET_MODE_SIZE (mode) > 4)
2918 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2919 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2920 return 1;
2922 else
2924 /* Ignore sign extension. */
2925 val &= (HOST_WIDE_INT) 0xffffffff;
2927 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2928 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2932 /* Return true if val is a valid bitmask immediate. */
2933 bool
2934 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2936 if (GET_MODE_SIZE (mode) < 8)
2938 /* Replicate bit pattern. */
2939 val &= (HOST_WIDE_INT) 0xffffffff;
2940 val |= val << 32;
2942 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2943 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2947 /* Return true if val is an immediate that can be loaded into a
2948 register in a single instruction. */
2949 bool
2950 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2952 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2953 return 1;
2954 return aarch64_bitmask_imm (val, mode);
2957 static bool
2958 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2960 rtx base, offset;
2962 if (GET_CODE (x) == HIGH)
2963 return true;
2965 split_const (x, &base, &offset);
2966 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2968 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2969 != SYMBOL_FORCE_TO_MEM)
2970 return true;
2971 else
2972 /* Avoid generating a 64-bit relocation in ILP32; leave it for
2973 aarch64_expand_mov_immediate to handle properly. */
2974 return mode != ptr_mode;
2977 return aarch64_tls_referenced_p (x);
2980 /* Return true if register REGNO is a valid index register.
2981 STRICT_P is true if REG_OK_STRICT is in effect. */
2983 bool
2984 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2986 if (!HARD_REGISTER_NUM_P (regno))
2988 if (!strict_p)
2989 return true;
2991 if (!reg_renumber)
2992 return false;
2994 regno = reg_renumber[regno];
2996 return GP_REGNUM_P (regno);
2999 /* Return true if register REGNO is a valid base register for mode MODE.
3000 STRICT_P is true if REG_OK_STRICT is in effect. */
3002 bool
3003 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3005 if (!HARD_REGISTER_NUM_P (regno))
3007 if (!strict_p)
3008 return true;
3010 if (!reg_renumber)
3011 return false;
3013 regno = reg_renumber[regno];
3016 /* The fake registers will be eliminated to either the stack or
3017 hard frame pointer, both of which are usually valid base registers.
3018 Reload deals with the cases where the eliminated form isn't valid. */
3019 return (GP_REGNUM_P (regno)
3020 || regno == SP_REGNUM
3021 || regno == FRAME_POINTER_REGNUM
3022 || regno == ARG_POINTER_REGNUM);
3025 /* Return true if X is a valid base register for mode MODE.
3026 STRICT_P is true if REG_OK_STRICT is in effect. */
3028 static bool
3029 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3031 if (!strict_p && GET_CODE (x) == SUBREG)
3032 x = SUBREG_REG (x);
3034 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3037 /* Return true if address offset is a valid index. If it is, fill in INFO
3038 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
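/* For example, (mult:DI (sign_extend:DI (reg:SI)) (const_int 8)) is a
   valid index for an 8-byte access: it is classified as
   ADDRESS_REG_SXTW with shift 3 and printed as "[base, wN, sxtw 3]"
   by aarch64_print_operand_address.  */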
3040 static bool
3041 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3042 enum machine_mode mode, bool strict_p)
3044 enum aarch64_address_type type;
3045 rtx index;
3046 int shift;
3048 /* (reg:P) */
3049 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3050 && GET_MODE (x) == Pmode)
3052 type = ADDRESS_REG_REG;
3053 index = x;
3054 shift = 0;
3056 /* (sign_extend:DI (reg:SI)) */
3057 else if ((GET_CODE (x) == SIGN_EXTEND
3058 || GET_CODE (x) == ZERO_EXTEND)
3059 && GET_MODE (x) == DImode
3060 && GET_MODE (XEXP (x, 0)) == SImode)
3062 type = (GET_CODE (x) == SIGN_EXTEND)
3063 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3064 index = XEXP (x, 0);
3065 shift = 0;
3067 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3068 else if (GET_CODE (x) == MULT
3069 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3070 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3071 && GET_MODE (XEXP (x, 0)) == DImode
3072 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3073 && CONST_INT_P (XEXP (x, 1)))
3075 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3076 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3077 index = XEXP (XEXP (x, 0), 0);
3078 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3080 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3081 else if (GET_CODE (x) == ASHIFT
3082 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3083 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3084 && GET_MODE (XEXP (x, 0)) == DImode
3085 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3086 && CONST_INT_P (XEXP (x, 1)))
3088 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3089 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3090 index = XEXP (XEXP (x, 0), 0);
3091 shift = INTVAL (XEXP (x, 1));
3093 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3094 else if ((GET_CODE (x) == SIGN_EXTRACT
3095 || GET_CODE (x) == ZERO_EXTRACT)
3096 && GET_MODE (x) == DImode
3097 && GET_CODE (XEXP (x, 0)) == MULT
3098 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3099 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3101 type = (GET_CODE (x) == SIGN_EXTRACT)
3102 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3103 index = XEXP (XEXP (x, 0), 0);
3104 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3105 if (INTVAL (XEXP (x, 1)) != 32 + shift
3106 || INTVAL (XEXP (x, 2)) != 0)
3107 shift = -1;
3109 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3110 (const_int 0xffffffff<<shift)) */
3111 else if (GET_CODE (x) == AND
3112 && GET_MODE (x) == DImode
3113 && GET_CODE (XEXP (x, 0)) == MULT
3114 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3115 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3116 && CONST_INT_P (XEXP (x, 1)))
3118 type = ADDRESS_REG_UXTW;
3119 index = XEXP (XEXP (x, 0), 0);
3120 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3121 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3122 shift = -1;
3124 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3125 else if ((GET_CODE (x) == SIGN_EXTRACT
3126 || GET_CODE (x) == ZERO_EXTRACT)
3127 && GET_MODE (x) == DImode
3128 && GET_CODE (XEXP (x, 0)) == ASHIFT
3129 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3130 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3132 type = (GET_CODE (x) == SIGN_EXTRACT)
3133 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3134 index = XEXP (XEXP (x, 0), 0);
3135 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3136 if (INTVAL (XEXP (x, 1)) != 32 + shift
3137 || INTVAL (XEXP (x, 2)) != 0)
3138 shift = -1;
3140 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3141 (const_int 0xffffffff<<shift)) */
3142 else if (GET_CODE (x) == AND
3143 && GET_MODE (x) == DImode
3144 && GET_CODE (XEXP (x, 0)) == ASHIFT
3145 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3146 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3147 && CONST_INT_P (XEXP (x, 1)))
3149 type = ADDRESS_REG_UXTW;
3150 index = XEXP (XEXP (x, 0), 0);
3151 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3152 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3153 shift = -1;
3155 /* (mult:P (reg:P) (const_int scale)) */
3156 else if (GET_CODE (x) == MULT
3157 && GET_MODE (x) == Pmode
3158 && GET_MODE (XEXP (x, 0)) == Pmode
3159 && CONST_INT_P (XEXP (x, 1)))
3161 type = ADDRESS_REG_REG;
3162 index = XEXP (x, 0);
3163 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3165 /* (ashift:P (reg:P) (const_int shift)) */
3166 else if (GET_CODE (x) == ASHIFT
3167 && GET_MODE (x) == Pmode
3168 && GET_MODE (XEXP (x, 0)) == Pmode
3169 && CONST_INT_P (XEXP (x, 1)))
3171 type = ADDRESS_REG_REG;
3172 index = XEXP (x, 0);
3173 shift = INTVAL (XEXP (x, 1));
3175 else
3176 return false;
3178 if (GET_CODE (index) == SUBREG)
3179 index = SUBREG_REG (index);
3181 if ((shift == 0 ||
3182 (shift > 0 && shift <= 3
3183 && (1 << shift) == GET_MODE_SIZE (mode)))
3184 && REG_P (index)
3185 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3187 info->type = type;
3188 info->offset = index;
3189 info->shift = shift;
3190 return true;
3193 return false;
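/* Return true if OFFSET is a signed 7-bit value scaled by the size of
   MODE; for DImode this accepts multiples of 8 in the range -512 to
   504, matching the store-pair/load-pair offset range noted above.  */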
3196 bool
3197 aarch64_offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3199 return (offset >= -64 * GET_MODE_SIZE (mode)
3200 && offset < 64 * GET_MODE_SIZE (mode)
3201 && offset % GET_MODE_SIZE (mode) == 0);
3204 static inline bool
3205 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3206 HOST_WIDE_INT offset)
3208 return offset >= -256 && offset < 256;
3211 static inline bool
3212 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3214 return (offset >= 0
3215 && offset < 4096 * GET_MODE_SIZE (mode)
3216 && offset % GET_MODE_SIZE (mode) == 0);
3219 /* Return true if X is a valid address for machine mode MODE. If it is,
3220 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3221 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3223 static bool
3224 aarch64_classify_address (struct aarch64_address_info *info,
3225 rtx x, enum machine_mode mode,
3226 RTX_CODE outer_code, bool strict_p)
3228 enum rtx_code code = GET_CODE (x);
3229 rtx op0, op1;
3230 bool allow_reg_index_p =
3231 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3232 || aarch64_vector_mode_supported_p (mode));
3233 /* Don't support anything other than POST_INC or REG addressing for
3234 AdvSIMD. */
3235 if (aarch64_vect_struct_mode_p (mode)
3236 && (code != POST_INC && code != REG))
3237 return false;
3239 switch (code)
3241 case REG:
3242 case SUBREG:
3243 info->type = ADDRESS_REG_IMM;
3244 info->base = x;
3245 info->offset = const0_rtx;
3246 return aarch64_base_register_rtx_p (x, strict_p);
3248 case PLUS:
3249 op0 = XEXP (x, 0);
3250 op1 = XEXP (x, 1);
3252 if (! strict_p
3253 && REG_P (op0)
3254 && (op0 == virtual_stack_vars_rtx
3255 || op0 == frame_pointer_rtx
3256 || op0 == arg_pointer_rtx)
3257 && CONST_INT_P (op1))
3259 info->type = ADDRESS_REG_IMM;
3260 info->base = op0;
3261 info->offset = op1;
3263 return true;
3266 if (GET_MODE_SIZE (mode) != 0
3267 && CONST_INT_P (op1)
3268 && aarch64_base_register_rtx_p (op0, strict_p))
3270 HOST_WIDE_INT offset = INTVAL (op1);
3272 info->type = ADDRESS_REG_IMM;
3273 info->base = op0;
3274 info->offset = op1;
3276 /* TImode and TFmode values are allowed in both pairs of X
3277 registers and individual Q registers. The available
3278 address modes are:
3279 X,X: 7-bit signed scaled offset
3280 Q: 9-bit signed offset
3281 We conservatively require an offset representable in either mode.  */
3283 if (mode == TImode || mode == TFmode)
3284 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3285 && offset_9bit_signed_unscaled_p (mode, offset));
3287 if (outer_code == PARALLEL)
3288 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3289 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3290 else
3291 return (offset_9bit_signed_unscaled_p (mode, offset)
3292 || offset_12bit_unsigned_scaled_p (mode, offset));
3295 if (allow_reg_index_p)
3297 /* Look for base + (scaled/extended) index register. */
3298 if (aarch64_base_register_rtx_p (op0, strict_p)
3299 && aarch64_classify_index (info, op1, mode, strict_p))
3301 info->base = op0;
3302 return true;
3304 if (aarch64_base_register_rtx_p (op1, strict_p)
3305 && aarch64_classify_index (info, op0, mode, strict_p))
3307 info->base = op1;
3308 return true;
3312 return false;
3314 case POST_INC:
3315 case POST_DEC:
3316 case PRE_INC:
3317 case PRE_DEC:
3318 info->type = ADDRESS_REG_WB;
3319 info->base = XEXP (x, 0);
3320 info->offset = NULL_RTX;
3321 return aarch64_base_register_rtx_p (info->base, strict_p);
3323 case POST_MODIFY:
3324 case PRE_MODIFY:
3325 info->type = ADDRESS_REG_WB;
3326 info->base = XEXP (x, 0);
3327 if (GET_CODE (XEXP (x, 1)) == PLUS
3328 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3329 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3330 && aarch64_base_register_rtx_p (info->base, strict_p))
3332 HOST_WIDE_INT offset;
3333 info->offset = XEXP (XEXP (x, 1), 1);
3334 offset = INTVAL (info->offset);
3336 /* TImode and TFmode values are allowed in both pairs of X
3337 registers and individual Q registers. The available
3338 address modes are:
3339 X,X: 7-bit signed scaled offset
3340 Q: 9-bit signed offset
3341 We conservatively require an offset representable in either mode.  */
3343 if (mode == TImode || mode == TFmode)
3344 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3345 && offset_9bit_signed_unscaled_p (mode, offset));
3347 if (outer_code == PARALLEL)
3348 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3349 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3350 else
3351 return offset_9bit_signed_unscaled_p (mode, offset);
3353 return false;
3355 case CONST:
3356 case SYMBOL_REF:
3357 case LABEL_REF:
3358 /* load literal: pc-relative constant pool entry. Only supported
3359 for SI mode or larger. */
3360 info->type = ADDRESS_SYMBOLIC;
3361 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3363 rtx sym, addend;
3365 split_const (x, &sym, &addend);
3366 return (GET_CODE (sym) == LABEL_REF
3367 || (GET_CODE (sym) == SYMBOL_REF
3368 && CONSTANT_POOL_ADDRESS_P (sym)));
3370 return false;
3372 case LO_SUM:
3373 info->type = ADDRESS_LO_SUM;
3374 info->base = XEXP (x, 0);
3375 info->offset = XEXP (x, 1);
3376 if (allow_reg_index_p
3377 && aarch64_base_register_rtx_p (info->base, strict_p))
3379 rtx sym, offs;
3380 split_const (info->offset, &sym, &offs);
3381 if (GET_CODE (sym) == SYMBOL_REF
3382 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3383 == SYMBOL_SMALL_ABSOLUTE))
3385 /* The symbol and offset must be aligned to the access size. */
3386 unsigned int align;
3387 unsigned int ref_size;
3389 if (CONSTANT_POOL_ADDRESS_P (sym))
3390 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3391 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3393 tree exp = SYMBOL_REF_DECL (sym);
3394 align = TYPE_ALIGN (TREE_TYPE (exp));
3395 align = CONSTANT_ALIGNMENT (exp, align);
3397 else if (SYMBOL_REF_DECL (sym))
3398 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3399 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3400 && SYMBOL_REF_BLOCK (sym) != NULL)
3401 align = SYMBOL_REF_BLOCK (sym)->alignment;
3402 else
3403 align = BITS_PER_UNIT;
3405 ref_size = GET_MODE_SIZE (mode);
3406 if (ref_size == 0)
3407 ref_size = GET_MODE_SIZE (DImode);
3409 return ((INTVAL (offs) & (ref_size - 1)) == 0
3410 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3413 return false;
3415 default:
3416 return false;
3420 bool
3421 aarch64_symbolic_address_p (rtx x)
3423 rtx offset;
3425 split_const (x, &x, &offset);
3426 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3429 /* Classify the base of symbolic expression X, given that X appears in
3430 context CONTEXT. */
3432 enum aarch64_symbol_type
3433 aarch64_classify_symbolic_expression (rtx x,
3434 enum aarch64_symbol_context context)
3436 rtx offset;
3438 split_const (x, &x, &offset);
3439 return aarch64_classify_symbol (x, context);
3443 /* Return TRUE if X is a legitimate address for accessing memory in
3444 mode MODE. */
3445 static bool
3446 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3448 struct aarch64_address_info addr;
3450 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3453 /* Return TRUE if X is a legitimate address for accessing memory in
3454 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3455 pair operation. */
3456 bool
3457 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3458 RTX_CODE outer_code, bool strict_p)
3460 struct aarch64_address_info addr;
3462 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3465 /* Return TRUE if rtx X is the immediate constant 0.0. */
3466 bool
3467 aarch64_float_const_zero_rtx_p (rtx x)
3469 REAL_VALUE_TYPE r;
3471 if (GET_MODE (x) == VOIDmode)
3472 return false;
3474 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3475 if (REAL_VALUE_MINUS_ZERO (r))
3476 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3477 return REAL_VALUES_EQUAL (r, dconst0);
3480 /* Return the fixed registers used for condition codes. */
3482 static bool
3483 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3485 *p1 = CC_REGNUM;
3486 *p2 = INVALID_REGNUM;
3487 return true;
3490 /* Emit call insn with PAT and do aarch64-specific handling. */
3492 void
3493 aarch64_emit_call_insn (rtx pat)
3495 rtx insn = emit_call_insn (pat);
3497 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3498 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3499 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3502 enum machine_mode
3503 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3505 /* All floating point compares return CCFP if it is an equality
3506 comparison, and CCFPE otherwise. */
3507 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3509 switch (code)
3511 case EQ:
3512 case NE:
3513 case UNORDERED:
3514 case ORDERED:
3515 case UNLT:
3516 case UNLE:
3517 case UNGT:
3518 case UNGE:
3519 case UNEQ:
3520 case LTGT:
3521 return CCFPmode;
3523 case LT:
3524 case LE:
3525 case GT:
3526 case GE:
3527 return CCFPEmode;
3529 default:
3530 gcc_unreachable ();
3534 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3535 && y == const0_rtx
3536 && (code == EQ || code == NE || code == LT || code == GE)
3537 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3538 || GET_CODE (x) == NEG))
3539 return CC_NZmode;
3541 /* A compare with a shifted operand. Because of canonicalization,
3542 the comparison will have to be swapped when we emit the assembly
3543 code. */
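/* For example, (compare (ashift x 2) y) is output with the operands
   swapped, so the tested condition must be swapped as well; the
   CC_SWPmode case in aarch64_get_condition_code below maps LT to GT,
   GE to LE, and so on.  */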
3544 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3545 && (REG_P (y) || GET_CODE (y) == SUBREG)
3546 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3547 || GET_CODE (x) == LSHIFTRT
3548 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3549 return CC_SWPmode;
3551 /* Similarly for a negated operand, but we can only do this for
3552 equalities. */
3553 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3554 && (REG_P (y) || GET_CODE (y) == SUBREG)
3555 && (code == EQ || code == NE)
3556 && GET_CODE (x) == NEG)
3557 return CC_Zmode;
3559 /* A compare of a mode narrower than SI mode against zero can be done
3560 by extending the value in the comparison. */
3561 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3562 && y == const0_rtx)
3563 /* Only use sign-extension if we really need it. */
3564 return ((code == GT || code == GE || code == LE || code == LT)
3565 ? CC_SESWPmode : CC_ZESWPmode);
3567 /* For everything else, return CCmode. */
3568 return CCmode;
3571 static unsigned
3572 aarch64_get_condition_code (rtx x)
3574 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3575 enum rtx_code comp_code = GET_CODE (x);
3577 if (GET_MODE_CLASS (mode) != MODE_CC)
3578 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3580 switch (mode)
3582 case CCFPmode:
3583 case CCFPEmode:
3584 switch (comp_code)
3586 case GE: return AARCH64_GE;
3587 case GT: return AARCH64_GT;
3588 case LE: return AARCH64_LS;
3589 case LT: return AARCH64_MI;
3590 case NE: return AARCH64_NE;
3591 case EQ: return AARCH64_EQ;
3592 case ORDERED: return AARCH64_VC;
3593 case UNORDERED: return AARCH64_VS;
3594 case UNLT: return AARCH64_LT;
3595 case UNLE: return AARCH64_LE;
3596 case UNGT: return AARCH64_HI;
3597 case UNGE: return AARCH64_PL;
3598 default: gcc_unreachable ();
3600 break;
3602 case CCmode:
3603 switch (comp_code)
3605 case NE: return AARCH64_NE;
3606 case EQ: return AARCH64_EQ;
3607 case GE: return AARCH64_GE;
3608 case GT: return AARCH64_GT;
3609 case LE: return AARCH64_LE;
3610 case LT: return AARCH64_LT;
3611 case GEU: return AARCH64_CS;
3612 case GTU: return AARCH64_HI;
3613 case LEU: return AARCH64_LS;
3614 case LTU: return AARCH64_CC;
3615 default: gcc_unreachable ();
3617 break;
3619 case CC_SWPmode:
3620 case CC_ZESWPmode:
3621 case CC_SESWPmode:
3622 switch (comp_code)
3624 case NE: return AARCH64_NE;
3625 case EQ: return AARCH64_EQ;
3626 case GE: return AARCH64_LE;
3627 case GT: return AARCH64_LT;
3628 case LE: return AARCH64_GE;
3629 case LT: return AARCH64_GT;
3630 case GEU: return AARCH64_LS;
3631 case GTU: return AARCH64_CC;
3632 case LEU: return AARCH64_CS;
3633 case LTU: return AARCH64_HI;
3634 default: gcc_unreachable ();
3636 break;
3638 case CC_NZmode:
3639 switch (comp_code)
3641 case NE: return AARCH64_NE;
3642 case EQ: return AARCH64_EQ;
3643 case GE: return AARCH64_PL;
3644 case LT: return AARCH64_MI;
3645 default: gcc_unreachable ();
3647 break;
3649 case CC_Zmode:
3650 switch (comp_code)
3652 case NE: return AARCH64_NE;
3653 case EQ: return AARCH64_EQ;
3654 default: gcc_unreachable ();
3656 break;
3658 default:
3659 gcc_unreachable ();
3660 break;
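/* Return the number of bits set in VALUE; each iteration of the loop
   clears the lowest set bit.  */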
3664 static unsigned
3665 bit_count (unsigned HOST_WIDE_INT value)
3667 unsigned count = 0;
3669 while (value)
3671 count++;
3672 value &= value - 1;
3675 return count;
3678 void
3679 aarch64_print_operand (FILE *f, rtx x, char code)
3681 switch (code)
3683 /* An integer or symbol address without a preceding # sign. */
3684 case 'c':
3685 switch (GET_CODE (x))
3687 case CONST_INT:
3688 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3689 break;
3691 case SYMBOL_REF:
3692 output_addr_const (f, x);
3693 break;
3695 case CONST:
3696 if (GET_CODE (XEXP (x, 0)) == PLUS
3697 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3699 output_addr_const (f, x);
3700 break;
3702 /* Fall through. */
3704 default:
3705 output_operand_lossage ("Unsupported operand for code '%c'", code);
3707 break;
3709 case 'e':
3710 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3712 int n;
3714 if (!CONST_INT_P (x)
3715 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3717 output_operand_lossage ("invalid operand for '%%%c'", code);
3718 return;
3721 switch (n)
3723 case 3:
3724 fputc ('b', f);
3725 break;
3726 case 4:
3727 fputc ('h', f);
3728 break;
3729 case 5:
3730 fputc ('w', f);
3731 break;
3732 default:
3733 output_operand_lossage ("invalid operand for '%%%c'", code);
3734 return;
3737 break;
3739 case 'p':
3741 int n;
3743 /* Print N such that 2^N == X. */
3744 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
3746 output_operand_lossage ("invalid operand for '%%%c'", code);
3747 return;
3750 asm_fprintf (f, "%d", n);
3752 break;
3754 case 'P':
3755 /* Print the number of non-zero bits in X (a const_int). */
3756 if (!CONST_INT_P (x))
3758 output_operand_lossage ("invalid operand for '%%%c'", code);
3759 return;
3762 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3763 break;
3765 case 'H':
3766 /* Print the higher numbered register of a pair (TImode) of regs. */
3767 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
3769 output_operand_lossage ("invalid operand for '%%%c'", code);
3770 return;
3773 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3774 break;
3776 case 'm':
3777 /* Print a condition (eq, ne, etc). */
3779 /* CONST_TRUE_RTX means always -- that's the default. */
3780 if (x == const_true_rtx)
3781 return;
3783 if (!COMPARISON_P (x))
3785 output_operand_lossage ("invalid operand for '%%%c'", code);
3786 return;
3789 fputs (aarch64_condition_codes[aarch64_get_condition_code (x)], f);
3790 break;
3792 case 'M':
3793 /* Print the inverse of a condition (eq <-> ne, etc). */
3795 /* CONST_TRUE_RTX means never -- that's the default. */
3796 if (x == const_true_rtx)
3798 fputs ("nv", f);
3799 return;
3802 if (!COMPARISON_P (x))
3804 output_operand_lossage ("invalid operand for '%%%c'", code);
3805 return;
3808 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3809 (aarch64_get_condition_code (x))], f);
3810 break;
3812 case 'b':
3813 case 'h':
3814 case 's':
3815 case 'd':
3816 case 'q':
3817 /* Print a scalar FP/SIMD register name. */
3818 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3820 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3821 return;
3823 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3824 break;
3826 case 'S':
3827 case 'T':
3828 case 'U':
3829 case 'V':
3830 /* Print the first FP/SIMD register name in a list. */
3831 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3833 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3834 return;
3836 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3837 break;
3839 case 'X':
3840 /* Print bottom 16 bits of integer constant in hex. */
3841 if (!CONST_INT_P (x))
3843 output_operand_lossage ("invalid operand for '%%%c'", code);
3844 return;
3846 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3847 break;
3849 case 'w':
3850 case 'x':
3851 /* Print a general register name or the zero register (32-bit or
3852 64-bit). */
3853 if (x == const0_rtx
3854 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3856 asm_fprintf (f, "%czr", code);
3857 break;
3860 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3862 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3863 break;
3866 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3868 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3869 break;
3872 /* Fall through */
3874 case 0:
3875 /* Print a normal operand; if it's a general register, we
3876 assume DImode. */
3877 if (x == NULL)
3879 output_operand_lossage ("missing operand");
3880 return;
3883 switch (GET_CODE (x))
3885 case REG:
3886 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3887 break;
3889 case MEM:
3890 aarch64_memory_reference_mode = GET_MODE (x);
3891 output_address (XEXP (x, 0));
3892 break;
3894 case LABEL_REF:
3895 case SYMBOL_REF:
3896 output_addr_const (asm_out_file, x);
3897 break;
3899 case CONST_INT:
3900 asm_fprintf (f, "%wd", INTVAL (x));
3901 break;
3903 case CONST_VECTOR:
3904 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3906 gcc_assert (aarch64_const_vec_all_same_int_p (x,
3907 HOST_WIDE_INT_MIN,
3908 HOST_WIDE_INT_MAX));
3909 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3911 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3913 fputc ('0', f);
3915 else
3916 gcc_unreachable ();
3917 break;
3919 case CONST_DOUBLE:
3920 /* CONST_DOUBLE can represent a double-width integer.
3921 In this case, the mode of x is VOIDmode. */
3922 if (GET_MODE (x) == VOIDmode)
3923 ; /* Do Nothing. */
3924 else if (aarch64_float_const_zero_rtx_p (x))
3926 fputc ('0', f);
3927 break;
3929 else if (aarch64_float_const_representable_p (x))
3931 #define buf_size 20
3932 char float_buf[buf_size] = {'\0'};
3933 REAL_VALUE_TYPE r;
3934 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3935 real_to_decimal_for_mode (float_buf, &r,
3936 buf_size, buf_size,
3937 1, GET_MODE (x));
3938 asm_fprintf (asm_out_file, "%s", float_buf);
3939 break;
3940 #undef buf_size
3942 output_operand_lossage ("invalid constant");
3943 return;
3944 default:
3945 output_operand_lossage ("invalid operand");
3946 return;
3948 break;
3950 case 'A':
3951 if (GET_CODE (x) == HIGH)
3952 x = XEXP (x, 0);
3954 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3956 case SYMBOL_SMALL_GOT:
3957 asm_fprintf (asm_out_file, ":got:");
3958 break;
3960 case SYMBOL_SMALL_TLSGD:
3961 asm_fprintf (asm_out_file, ":tlsgd:");
3962 break;
3964 case SYMBOL_SMALL_TLSDESC:
3965 asm_fprintf (asm_out_file, ":tlsdesc:");
3966 break;
3968 case SYMBOL_SMALL_GOTTPREL:
3969 asm_fprintf (asm_out_file, ":gottprel:");
3970 break;
3972 case SYMBOL_SMALL_TPREL:
3973 asm_fprintf (asm_out_file, ":tprel:");
3974 break;
3976 case SYMBOL_TINY_GOT:
3977 gcc_unreachable ();
3978 break;
3980 default:
3981 break;
3983 output_addr_const (asm_out_file, x);
3984 break;
3986 case 'L':
3987 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3989 case SYMBOL_SMALL_GOT:
3990 asm_fprintf (asm_out_file, ":lo12:");
3991 break;
3993 case SYMBOL_SMALL_TLSGD:
3994 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3995 break;
3997 case SYMBOL_SMALL_TLSDESC:
3998 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3999 break;
4001 case SYMBOL_SMALL_GOTTPREL:
4002 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4003 break;
4005 case SYMBOL_SMALL_TPREL:
4006 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4007 break;
4009 case SYMBOL_TINY_GOT:
4010 asm_fprintf (asm_out_file, ":got:");
4011 break;
4013 default:
4014 break;
4016 output_addr_const (asm_out_file, x);
4017 break;
4019 case 'G':
4021 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4023 case SYMBOL_SMALL_TPREL:
4024 asm_fprintf (asm_out_file, ":tprel_hi12:");
4025 break;
4026 default:
4027 break;
4029 output_addr_const (asm_out_file, x);
4030 break;
4032 default:
4033 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4034 return;
4038 void
4039 aarch64_print_operand_address (FILE *f, rtx x)
4041 struct aarch64_address_info addr;
4043 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4044 MEM, true))
4045 switch (addr.type)
4047 case ADDRESS_REG_IMM:
4048 if (addr.offset == const0_rtx)
4049 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4050 else
4051 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4052 INTVAL (addr.offset));
4053 return;
4055 case ADDRESS_REG_REG:
4056 if (addr.shift == 0)
4057 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4058 reg_names [REGNO (addr.offset)]);
4059 else
4060 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4061 reg_names [REGNO (addr.offset)], addr.shift);
4062 return;
4064 case ADDRESS_REG_UXTW:
4065 if (addr.shift == 0)
4066 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4067 REGNO (addr.offset) - R0_REGNUM);
4068 else
4069 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4070 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4071 return;
4073 case ADDRESS_REG_SXTW:
4074 if (addr.shift == 0)
4075 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4076 REGNO (addr.offset) - R0_REGNUM);
4077 else
4078 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4079 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4080 return;
4082 case ADDRESS_REG_WB:
4083 switch (GET_CODE (x))
4085 case PRE_INC:
4086 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4087 GET_MODE_SIZE (aarch64_memory_reference_mode));
4088 return;
4089 case POST_INC:
4090 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4091 GET_MODE_SIZE (aarch64_memory_reference_mode));
4092 return;
4093 case PRE_DEC:
4094 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4095 GET_MODE_SIZE (aarch64_memory_reference_mode));
4096 return;
4097 case POST_DEC:
4098 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4099 GET_MODE_SIZE (aarch64_memory_reference_mode));
4100 return;
4101 case PRE_MODIFY:
4102 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4103 INTVAL (addr.offset));
4104 return;
4105 case POST_MODIFY:
4106 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4107 INTVAL (addr.offset));
4108 return;
4109 default:
4110 break;
4112 break;
4114 case ADDRESS_LO_SUM:
4115 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4116 output_addr_const (f, addr.offset);
4117 asm_fprintf (f, "]");
4118 return;
4120 case ADDRESS_SYMBOLIC:
4121 break;
4124 output_addr_const (f, x);
4127 bool
4128 aarch64_label_mentioned_p (rtx x)
4130 const char *fmt;
4131 int i;
4133 if (GET_CODE (x) == LABEL_REF)
4134 return true;
4136 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4137 referencing instruction, but they are constant offsets, not
4138 symbols. */
4139 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4140 return false;
4142 fmt = GET_RTX_FORMAT (GET_CODE (x));
4143 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4145 if (fmt[i] == 'E')
4147 int j;
4149 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4150 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4151 return 1;
4153 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4154 return 1;
4157 return 0;
4160 /* Implement REGNO_REG_CLASS. */
4162 enum reg_class
4163 aarch64_regno_regclass (unsigned regno)
4165 if (GP_REGNUM_P (regno))
4166 return GENERAL_REGS;
4168 if (regno == SP_REGNUM)
4169 return STACK_REG;
4171 if (regno == FRAME_POINTER_REGNUM
4172 || regno == ARG_POINTER_REGNUM)
4173 return POINTER_REGS;
4175 if (FP_REGNUM_P (regno))
4176 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4178 return NO_REGS;
4181 /* Try a machine-dependent way of reloading an illegitimate address
4182 operand. If we find one, push the reload and return the new rtx. */
4185 aarch64_legitimize_reload_address (rtx *x_p,
4186 enum machine_mode mode,
4187 int opnum, int type,
4188 int ind_levels ATTRIBUTE_UNUSED)
4190 rtx x = *x_p;
4192 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4193 if (aarch64_vect_struct_mode_p (mode)
4194 && GET_CODE (x) == PLUS
4195 && REG_P (XEXP (x, 0))
4196 && CONST_INT_P (XEXP (x, 1)))
4198 rtx orig_rtx = x;
4199 x = copy_rtx (x);
4200 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4201 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4202 opnum, (enum reload_type) type);
4203 return x;
4206 /* We must recognize output that we have already generated ourselves. */
4207 if (GET_CODE (x) == PLUS
4208 && GET_CODE (XEXP (x, 0)) == PLUS
4209 && REG_P (XEXP (XEXP (x, 0), 0))
4210 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4211 && CONST_INT_P (XEXP (x, 1)))
4213 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4214 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4215 opnum, (enum reload_type) type);
4216 return x;
4219 /* We wish to handle large displacements off a base register by splitting
4220 the addend across an add and the mem insn. This can cut the number of
4221 extra insns needed from 3 to 1. It is only useful for load/store of a
4222 single register with 12 bit offset field. */
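/* For example, a 4-byte load at base + 0x12344 can be handled by reloading
   high = 0x12000 (a 12-bit immediate shifted by 12) into a scratch register
   and leaving low = 0x344 in the memory reference, giving something like
   add xscratch, xbase, #0x12000
   ldr w0, [xscratch, #0x344]
   where xscratch and xbase stand for whichever registers reload picks. */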
4223 if (GET_CODE (x) == PLUS
4224 && REG_P (XEXP (x, 0))
4225 && CONST_INT_P (XEXP (x, 1))
4226 && HARD_REGISTER_P (XEXP (x, 0))
4227 && mode != TImode
4228 && mode != TFmode
4229 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4231 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4232 HOST_WIDE_INT low = val & 0xfff;
4233 HOST_WIDE_INT high = val - low;
4234 HOST_WIDE_INT offs;
4235 rtx cst;
4236 enum machine_mode xmode = GET_MODE (x);
4238 /* In ILP32, xmode can be either DImode or SImode. */
4239 gcc_assert (xmode == DImode || xmode == SImode);
4241 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4242 BLKmode alignment. */
4243 if (GET_MODE_SIZE (mode) == 0)
4244 return NULL_RTX;
4246 offs = low % GET_MODE_SIZE (mode);
4248 /* Align misaligned offset by adjusting high part to compensate. */
4249 if (offs != 0)
4251 if (aarch64_uimm12_shift (high + offs))
4253 /* Align down. */
4254 low = low - offs;
4255 high = high + offs;
4257 else
4259 /* Align up. */
4260 offs = GET_MODE_SIZE (mode) - offs;
4261 low = low + offs;
4262 high = high + (low & 0x1000) - offs;
4263 low &= 0xfff;
4267 /* Check for overflow. */
4268 if (high + low != val)
4269 return NULL_RTX;
4271 cst = GEN_INT (high);
4272 if (!aarch64_uimm12_shift (high))
4273 cst = force_const_mem (xmode, cst);
4275 /* Reload high part into base reg, leaving the low part
4276 in the mem instruction.
4277 Note that replacing this gen_rtx_PLUS with plus_constant is
4278 wrong in this case because we rely on the
4279 (plus (plus reg c1) c2) structure being preserved so that
4280 XEXP (*p, 0) in push_reload below uses the correct term. */
4281 x = gen_rtx_PLUS (xmode,
4282 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4283 GEN_INT (low));
4285 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4286 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4287 opnum, (enum reload_type) type);
4288 return x;
4291 return NULL_RTX;
4295 static reg_class_t
4296 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4297 reg_class_t rclass,
4298 enum machine_mode mode,
4299 secondary_reload_info *sri)
4301 /* Without the TARGET_SIMD instructions we cannot move a Q register
4302 to a Q register directly. We need a scratch. */
4303 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4304 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4305 && reg_class_subset_p (rclass, FP_REGS))
4307 if (mode == TFmode)
4308 sri->icode = CODE_FOR_aarch64_reload_movtf;
4309 else if (mode == TImode)
4310 sri->icode = CODE_FOR_aarch64_reload_movti;
4311 return NO_REGS;
4314 /* A TFmode or TImode memory access should be handled via FP_REGS
4315 because AArch64 has richer addressing modes for LDR/STR instructions
4316 than LDP/STP instructions. */
4317 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4318 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4319 return FP_REGS;
4321 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4322 return GENERAL_REGS;
4324 return NO_REGS;
4327 static bool
4328 aarch64_can_eliminate (const int from, const int to)
4330 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4331 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4333 if (frame_pointer_needed)
4335 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4336 return true;
4337 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4338 return false;
4339 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4340 && !cfun->calls_alloca)
4341 return true;
4342 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4343 return true;
4345 return false;
4348 return true;
4351 HOST_WIDE_INT
4352 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4354 aarch64_layout_frame ();
4356 if (to == HARD_FRAME_POINTER_REGNUM)
4358 if (from == ARG_POINTER_REGNUM)
4359 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4361 if (from == FRAME_POINTER_REGNUM)
4362 return (cfun->machine->frame.hard_fp_offset
4363 - cfun->machine->frame.saved_varargs_size);
4366 if (to == STACK_POINTER_REGNUM)
4368 if (from == FRAME_POINTER_REGNUM)
4369 return (cfun->machine->frame.frame_size
4370 - cfun->machine->frame.saved_varargs_size);
4373 return cfun->machine->frame.frame_size;
4376 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4377 previous frame. */
4380 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4382 if (count != 0)
4383 return const0_rtx;
4384 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4388 static void
4389 aarch64_asm_trampoline_template (FILE *f)
4391 if (TARGET_ILP32)
4393 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4394 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4396 else
4398 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4399 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4401 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4402 assemble_aligned_integer (4, const0_rtx);
4403 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4404 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
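/* The template above is therefore 16 bytes of code (two loads, a branch and
   a padding word) followed by two pointer-sized data slots: the target
   function address and the static chain value, which aarch64_trampoline_init
   below fills in. */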
4407 static void
4408 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4410 rtx fnaddr, mem, a_tramp;
4411 const int tramp_code_sz = 16;
4413 /* Don't need to copy the trailing D-words, we fill those in below. */
4414 emit_block_move (m_tramp, assemble_trampoline_template (),
4415 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4416 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4417 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4418 if (GET_MODE (fnaddr) != ptr_mode)
4419 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4420 emit_move_insn (mem, fnaddr);
4422 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4423 emit_move_insn (mem, chain_value);
4425 /* XXX We should really define a "clear_cache" pattern and use
4426 gen_clear_cache(). */
4427 a_tramp = XEXP (m_tramp, 0);
4428 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4429 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4430 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4431 ptr_mode);
4434 static unsigned char
4435 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4437 switch (regclass)
4439 case CALLER_SAVE_REGS:
4440 case POINTER_REGS:
4441 case GENERAL_REGS:
4442 case ALL_REGS:
4443 case FP_REGS:
4444 case FP_LO_REGS:
4445 return
4446 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4447 (GET_MODE_SIZE (mode) + 7) / 8;
4448 case STACK_REG:
4449 return 1;
4451 case NO_REGS:
4452 return 0;
4454 default:
4455 break;
4457 gcc_unreachable ();
4460 static reg_class_t
4461 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4463 if (regclass == POINTER_REGS)
4464 return GENERAL_REGS;
4466 if (regclass == STACK_REG)
4468 if (REG_P (x)
4469 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4470 return regclass;
4472 return NO_REGS;
4475 /* If it's an integer immediate that MOVI can't handle, then
4476 FP_REGS is not an option, so we return NO_REGS instead. */
4477 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4478 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4479 return NO_REGS;
4481 /* Register elimination can result in a request for
4482 SP+constant->FP_REGS. We cannot support such operations which
4483 use SP as source and an FP_REG as destination, so reject such
4484 requests outright. */
4485 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4487 rtx lhs = XEXP (x, 0);
4489 /* Look through a possible SUBREG introduced by ILP32. */
4490 if (GET_CODE (lhs) == SUBREG)
4491 lhs = SUBREG_REG (lhs);
4493 gcc_assert (REG_P (lhs));
4494 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4495 POINTER_REGS));
4496 return NO_REGS;
4499 return regclass;
4502 void
4503 aarch64_asm_output_labelref (FILE* f, const char *name)
4505 asm_fprintf (f, "%U%s", name);
4508 static void
4509 aarch64_elf_asm_constructor (rtx symbol, int priority)
4511 if (priority == DEFAULT_INIT_PRIORITY)
4512 default_ctor_section_asm_out_constructor (symbol, priority);
4513 else
4515 section *s;
4516 char buf[18];
4517 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4518 s = get_section (buf, SECTION_WRITE, NULL);
4519 switch_to_section (s);
4520 assemble_align (POINTER_SIZE);
4521 assemble_aligned_integer (POINTER_BYTES, symbol);
4525 static void
4526 aarch64_elf_asm_destructor (rtx symbol, int priority)
4528 if (priority == DEFAULT_INIT_PRIORITY)
4529 default_dtor_section_asm_out_destructor (symbol, priority);
4530 else
4532 section *s;
4533 char buf[18];
4534 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4535 s = get_section (buf, SECTION_WRITE, NULL);
4536 switch_to_section (s);
4537 assemble_align (POINTER_SIZE);
4538 assemble_aligned_integer (POINTER_BYTES, symbol);
4542 const char*
4543 aarch64_output_casesi (rtx *operands)
4545 char buf[100];
4546 char label[100];
4547 rtx diff_vec = PATTERN (NEXT_INSN (operands[2]));
4548 int index;
4549 static const char *const patterns[4][2] =
4552 "ldrb\t%w3, [%0,%w1,uxtw]",
4553 "add\t%3, %4, %w3, sxtb #2"
4556 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4557 "add\t%3, %4, %w3, sxth #2"
4560 "ldr\t%w3, [%0,%w1,uxtw #2]",
4561 "add\t%3, %4, %w3, sxtw #2"
4563 /* We assume that DImode is only generated when not optimizing and
4564 that we don't really need 64-bit address offsets. That would
4565 imply an object file with 8GB of code in a single function! */
4567 "ldr\t%w3, [%0,%w1,uxtw #2]",
4568 "add\t%3, %4, %w3, sxtw #2"
4572 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4574 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4576 gcc_assert (index >= 0 && index <= 3);
4578 /* Need to implement table size reduction, by changing the code below. */
4579 output_asm_insn (patterns[index][0], operands);
4580 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4581 snprintf (buf, sizeof (buf),
4582 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4583 output_asm_insn (buf, operands);
4584 output_asm_insn (patterns[index][1], operands);
4585 output_asm_insn ("br\t%3", operands);
4586 assemble_label (asm_out_file, label);
4587 return "";
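/* For a HImode dispatch table, and assuming the table base is in x0, the
   index in w1 and the scratch operands are x3 and x4, the sequence above
   comes out roughly as
   ldrh w3, [x0, w1, uxtw #1]
   adr x4, .Lrtx<N>
   add x3, x4, w3, sxth #2
   br x3
   where .Lrtx<N> is the label emitted just after the branch, at the start
   of the table. */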
4591 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4592 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4593 operator. */
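/* For example, shift == 1 with mask == 0x1fe corresponds to (x & 0xff) << 1,
   i.e. a UXTB operand shifted left by one, so the result is 8; a mask that
   does not line up with an 8, 16 or 32-bit pattern yields 0. */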
4596 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4598 if (shift >= 0 && shift <= 3)
4600 int size;
4601 for (size = 8; size <= 32; size *= 2)
4603 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4604 if (mask == bits << shift)
4605 return size;
4608 return 0;
4611 static bool
4612 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4613 const_rtx x ATTRIBUTE_UNUSED)
4615 /* We can't use blocks for constants when we're using a per-function
4616 constant pool. */
4617 return false;
4620 static section *
4621 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4622 rtx x ATTRIBUTE_UNUSED,
4623 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4625 /* Force all constant pool entries into the current function section. */
4626 return function_section (current_function_decl);
4630 /* Costs. */
4632 /* Helper function for rtx cost calculation. Strip a shift expression
4633 from X. Returns the inner operand if successful, or the original
4634 expression on failure. */
4635 static rtx
4636 aarch64_strip_shift (rtx x)
4638 rtx op = x;
4640 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4641 we can convert both to ROR during final output. */
4642 if ((GET_CODE (op) == ASHIFT
4643 || GET_CODE (op) == ASHIFTRT
4644 || GET_CODE (op) == LSHIFTRT
4645 || GET_CODE (op) == ROTATERT
4646 || GET_CODE (op) == ROTATE)
4647 && CONST_INT_P (XEXP (op, 1)))
4648 return XEXP (op, 0);
4650 if (GET_CODE (op) == MULT
4651 && CONST_INT_P (XEXP (op, 1))
4652 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4653 return XEXP (op, 0);
4655 return x;
4658 /* Helper function for rtx cost calculation. Strip an extend
4659 expression from X. Returns the inner operand if successful, or the
4660 original expression on failure. We deal with a number of possible
4661 canonicalization variations here. */
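/* For example, both (zero_extend (reg X)) and
   (ashift (zero_extend (reg X)) (const_int 2)) strip down to (reg X),
   matching the extended-register forms of the arithmetic instructions. */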
4662 static rtx
4663 aarch64_strip_extend (rtx x)
4665 rtx op = x;
4667 /* Zero and sign extraction of a widened value. */
4668 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4669 && XEXP (op, 2) == const0_rtx
4670 && GET_CODE (XEXP (op, 0)) == MULT
4671 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4672 XEXP (op, 1)))
4673 return XEXP (XEXP (op, 0), 0);
4675 /* It can also be represented (for zero-extend) as an AND with an
4676 immediate. */
4677 if (GET_CODE (op) == AND
4678 && GET_CODE (XEXP (op, 0)) == MULT
4679 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4680 && CONST_INT_P (XEXP (op, 1))
4681 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4682 INTVAL (XEXP (op, 1))) != 0)
4683 return XEXP (XEXP (op, 0), 0);
4685 /* Now handle extended register, as this may also have an optional
4686 left shift by 1..4. */
4687 if (GET_CODE (op) == ASHIFT
4688 && CONST_INT_P (XEXP (op, 1))
4689 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4690 op = XEXP (op, 0);
4692 if (GET_CODE (op) == ZERO_EXTEND
4693 || GET_CODE (op) == SIGN_EXTEND)
4694 op = XEXP (op, 0);
4696 if (op != x)
4697 return op;
4699 return x;
4702 /* Helper function for rtx cost calculation. Calculate the cost of
4703 a MULT, which may be part of a multiply-accumulate rtx. Return
4704 the calculated cost of the expression, recursing manually in to
4705 operands where needed. */
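/* For example, (plus (mult (reg A) (reg B)) (reg C)) in an integer mode is
   costed as a single MADD, while a multiplication by a power of two is
   costed as a shift, or as a shifted ADD when it feeds a PLUS or MINUS. */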
4707 static int
4708 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4710 rtx op0, op1;
4711 const struct cpu_cost_table *extra_cost
4712 = aarch64_tune_params->insn_extra_cost;
4713 int cost = 0;
4714 bool maybe_fma = (outer == PLUS || outer == MINUS);
4715 enum machine_mode mode = GET_MODE (x);
4717 gcc_checking_assert (code == MULT);
4719 op0 = XEXP (x, 0);
4720 op1 = XEXP (x, 1);
4722 if (VECTOR_MODE_P (mode))
4723 mode = GET_MODE_INNER (mode);
4725 /* Integer multiply/fma. */
4726 if (GET_MODE_CLASS (mode) == MODE_INT)
4728 /* The multiply will be canonicalized as a shift, cost it as such. */
4729 if (CONST_INT_P (op1)
4730 && exact_log2 (INTVAL (op1)) > 0)
4732 if (speed)
4734 if (maybe_fma)
4735 /* ADD (shifted register). */
4736 cost += extra_cost->alu.arith_shift;
4737 else
4738 /* LSL (immediate). */
4739 cost += extra_cost->alu.shift;
4742 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4744 return cost;
4747 /* Integer multiplies or FMAs have zero/sign extending variants. */
4748 if ((GET_CODE (op0) == ZERO_EXTEND
4749 && GET_CODE (op1) == ZERO_EXTEND)
4750 || (GET_CODE (op0) == SIGN_EXTEND
4751 && GET_CODE (op1) == SIGN_EXTEND))
4753 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4754 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4756 if (speed)
4758 if (maybe_fma)
4759 /* MADD/SMADDL/UMADDL. */
4760 cost += extra_cost->mult[0].extend_add;
4761 else
4762 /* MUL/SMULL/UMULL. */
4763 cost += extra_cost->mult[0].extend;
4766 return cost;
4769 /* This is either an integer multiply or an FMA. In both cases
4770 we want to recurse and cost the operands. */
4771 cost += rtx_cost (op0, MULT, 0, speed)
4772 + rtx_cost (op1, MULT, 1, speed);
4774 if (speed)
4776 if (maybe_fma)
4777 /* MADD. */
4778 cost += extra_cost->mult[mode == DImode].add;
4779 else
4780 /* MUL. */
4781 cost += extra_cost->mult[mode == DImode].simple;
4784 return cost;
4786 else
4788 if (speed)
4790 /* Floating-point FMA/FMUL can also support negations of the
4791 operands. */
4792 if (GET_CODE (op0) == NEG)
4793 op0 = XEXP (op0, 0);
4794 if (GET_CODE (op1) == NEG)
4795 op1 = XEXP (op1, 0);
4797 if (maybe_fma)
4798 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4799 cost += extra_cost->fp[mode == DFmode].fma;
4800 else
4801 /* FMUL/FNMUL. */
4802 cost += extra_cost->fp[mode == DFmode].mult;
4805 cost += rtx_cost (op0, MULT, 0, speed)
4806 + rtx_cost (op1, MULT, 1, speed);
4807 return cost;
4811 static int
4812 aarch64_address_cost (rtx x,
4813 enum machine_mode mode,
4814 addr_space_t as ATTRIBUTE_UNUSED,
4815 bool speed)
4817 enum rtx_code c = GET_CODE (x);
4818 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4819 struct aarch64_address_info info;
4820 int cost = 0;
4821 info.shift = 0;
4823 if (!aarch64_classify_address (&info, x, mode, c, false))
4825 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4827 /* This is a CONST or SYMBOL ref which will be split
4828 in a different way depending on the code model in use.
4829 Cost it through the generic infrastructure. */
4830 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4831 /* Divide through by the cost of one instruction to
4832 bring it to the same units as the address costs. */
4833 cost_symbol_ref /= COSTS_N_INSNS (1);
4834 /* The cost is then the cost of preparing the address,
4835 followed by an immediate (possibly 0) offset. */
4836 return cost_symbol_ref + addr_cost->imm_offset;
4838 else
4840 /* This is most likely a jump table from a case
4841 statement. */
4842 return addr_cost->register_offset;
4846 switch (info.type)
4848 case ADDRESS_LO_SUM:
4849 case ADDRESS_SYMBOLIC:
4850 case ADDRESS_REG_IMM:
4851 cost += addr_cost->imm_offset;
4852 break;
4854 case ADDRESS_REG_WB:
4855 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4856 cost += addr_cost->pre_modify;
4857 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4858 cost += addr_cost->post_modify;
4859 else
4860 gcc_unreachable ();
4862 break;
4864 case ADDRESS_REG_REG:
4865 cost += addr_cost->register_offset;
4866 break;
4868 case ADDRESS_REG_UXTW:
4869 case ADDRESS_REG_SXTW:
4870 cost += addr_cost->register_extend;
4871 break;
4873 default:
4874 gcc_unreachable ();
4878 if (info.shift > 0)
4880 /* For the sake of calculating the cost of the shifted register
4881 component, we can treat same sized modes in the same way. */
4882 switch (GET_MODE_BITSIZE (mode))
4884 case 16:
4885 cost += addr_cost->addr_scale_costs.hi;
4886 break;
4888 case 32:
4889 cost += addr_cost->addr_scale_costs.si;
4890 break;
4892 case 64:
4893 cost += addr_cost->addr_scale_costs.di;
4894 break;
4896 /* We can't tell, or this is a 128-bit vector. */
4897 default:
4898 cost += addr_cost->addr_scale_costs.ti;
4899 break;
4903 return cost;
4906 /* Return true if the RTX X in mode MODE is a zero or sign extract
4907 usable in an ADD or SUB (extended register) instruction. */
4908 static bool
4909 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4911 /* Catch add with a sign extract.
4912 This is add_<optab><mode>_multp2. */
4913 if (GET_CODE (x) == SIGN_EXTRACT
4914 || GET_CODE (x) == ZERO_EXTRACT)
4916 rtx op0 = XEXP (x, 0);
4917 rtx op1 = XEXP (x, 1);
4918 rtx op2 = XEXP (x, 2);
4920 if (GET_CODE (op0) == MULT
4921 && CONST_INT_P (op1)
4922 && op2 == const0_rtx
4923 && CONST_INT_P (XEXP (op0, 1))
4924 && aarch64_is_extend_from_extract (mode,
4925 XEXP (op0, 1),
4926 op1))
4928 return true;
4932 return false;
4935 static bool
4936 aarch64_frint_unspec_p (unsigned int u)
4938 switch (u)
4940 case UNSPEC_FRINTZ:
4941 case UNSPEC_FRINTP:
4942 case UNSPEC_FRINTM:
4943 case UNSPEC_FRINTA:
4944 case UNSPEC_FRINTN:
4945 case UNSPEC_FRINTX:
4946 case UNSPEC_FRINTI:
4947 return true;
4949 default:
4950 return false;
4954 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4955 storing it in *COST. Result is true if the total cost of the operation
4956 has now been calculated. */
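/* For example, (if_then_else (ne (reg X) (const_int 0)) (label_ref L) (pc))
   is treated as a CBZ/CBNZ-style branch and only the tested operand is
   costed, whereas a MODE_CC comparison selecting between two register
   operands is costed as some flavour of CSEL. */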
4957 static bool
4958 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4960 rtx inner;
4961 rtx comparator;
4962 enum rtx_code cmpcode;
4964 if (COMPARISON_P (op0))
4966 inner = XEXP (op0, 0);
4967 comparator = XEXP (op0, 1);
4968 cmpcode = GET_CODE (op0);
4970 else
4972 inner = op0;
4973 comparator = const0_rtx;
4974 cmpcode = NE;
4977 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
4979 /* Conditional branch. */
4980 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4981 return true;
4982 else
4984 if (cmpcode == NE || cmpcode == EQ)
4986 if (comparator == const0_rtx)
4988 /* TBZ/TBNZ/CBZ/CBNZ. */
4989 if (GET_CODE (inner) == ZERO_EXTRACT)
4990 /* TBZ/TBNZ. */
4991 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
4992 0, speed);
4993 else
4994 /* CBZ/CBNZ. */
4995 *cost += rtx_cost (inner, cmpcode, 0, speed);
4997 return true;
5000 else if (cmpcode == LT || cmpcode == GE)
5002 /* TBZ/TBNZ. */
5003 if (comparator == const0_rtx)
5004 return true;
5008 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5010 /* It's a conditional operation based on the status flags,
5011 so it must be some flavor of CSEL. */
5013 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5014 if (GET_CODE (op1) == NEG
5015 || GET_CODE (op1) == NOT
5016 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5017 op1 = XEXP (op1, 0);
5019 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5020 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5021 return true;
5024 /* We don't know what this is, cost all operands. */
5025 return false;
5028 /* Calculate the cost of calculating X, storing it in *COST. Result
5029 is true if the total cost of the operation has now been calculated. */
5030 static bool
5031 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5032 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5034 rtx op0, op1, op2;
5035 const struct cpu_cost_table *extra_cost
5036 = aarch64_tune_params->insn_extra_cost;
5037 enum machine_mode mode = GET_MODE (x);
5039 /* By default, assume that everything has equivalent cost to the
5040 cheapest instruction. Any additional costs are applied as a delta
5041 above this default. */
5042 *cost = COSTS_N_INSNS (1);
5044 /* TODO: The cost infrastructure currently does not handle
5045 vector operations. Assume that all vector operations
5046 are equally expensive. */
5047 if (VECTOR_MODE_P (mode))
5049 if (speed)
5050 *cost += extra_cost->vect.alu;
5051 return true;
5054 switch (code)
5056 case SET:
5057 /* The cost depends entirely on the operands to SET. */
5058 *cost = 0;
5059 op0 = SET_DEST (x);
5060 op1 = SET_SRC (x);
5062 switch (GET_CODE (op0))
5064 case MEM:
5065 if (speed)
5067 rtx address = XEXP (op0, 0);
5068 if (GET_MODE_CLASS (mode) == MODE_INT)
5069 *cost += extra_cost->ldst.store;
5070 else if (mode == SFmode)
5071 *cost += extra_cost->ldst.storef;
5072 else if (mode == DFmode)
5073 *cost += extra_cost->ldst.stored;
5075 *cost +=
5076 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5077 0, speed));
5080 *cost += rtx_cost (op1, SET, 1, speed);
5081 return true;
5083 case SUBREG:
5084 if (! REG_P (SUBREG_REG (op0)))
5085 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5087 /* Fall through. */
5088 case REG:
5089 /* const0_rtx is in general free, but we will use an
5090 instruction to set a register to 0. */
5091 if (REG_P (op1) || op1 == const0_rtx)
5093 /* The cost is 1 per register copied. */
5094 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5095 / UNITS_PER_WORD;
5096 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5098 else
5099 /* Cost is just the cost of the RHS of the set. */
5100 *cost += rtx_cost (op1, SET, 1, speed);
5101 return true;
5103 case ZERO_EXTRACT:
5104 case SIGN_EXTRACT:
5105 /* Bit-field insertion. Strip any redundant widening of
5106 the RHS to meet the width of the target. */
5107 if (GET_CODE (op1) == SUBREG)
5108 op1 = SUBREG_REG (op1);
5109 if ((GET_CODE (op1) == ZERO_EXTEND
5110 || GET_CODE (op1) == SIGN_EXTEND)
5111 && CONST_INT_P (XEXP (op0, 1))
5112 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5113 >= INTVAL (XEXP (op0, 1))))
5114 op1 = XEXP (op1, 0);
5116 if (CONST_INT_P (op1))
5118 /* MOV immediate is assumed to always be cheap. */
5119 *cost = COSTS_N_INSNS (1);
5121 else
5123 /* BFM. */
5124 if (speed)
5125 *cost += extra_cost->alu.bfi;
5126 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5129 return true;
5131 default:
5132 /* We can't make sense of this, assume default cost. */
5133 *cost = COSTS_N_INSNS (1);
5134 return false;
5136 return false;
5138 case CONST_INT:
5139 /* If an instruction can incorporate a constant within the
5140 instruction, the instruction's expression avoids calling
5141 rtx_cost() on the constant. If rtx_cost() is called on a
5142 constant, then it is usually because the constant must be
5143 moved into a register by one or more instructions.
5145 The exception is constant 0, which can be expressed
5146 as XZR/WZR and is therefore free. The exception to that is
5147 (set (reg) (const0_rtx)), in which case we must cost
5148 the move. However, we can catch that when we cost the SET, so
5149 we don't need to consider it here. */
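/* For example, a 64-bit constant that takes a MOVZ plus two MOVK
   instructions to build is costed here as three instructions. */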
5150 if (x == const0_rtx)
5151 *cost = 0;
5152 else
5154 /* To an approximation, building any other constant is
5155 proportionally expensive to the number of instructions
5156 required to build that constant. This is true whether we
5157 are compiling for SPEED or otherwise. */
5158 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5159 INTVAL (x),
5160 false));
5162 return true;
5164 case CONST_DOUBLE:
5165 if (speed)
5167 /* mov[df,sf]_aarch64. */
5168 if (aarch64_float_const_representable_p (x))
5169 /* FMOV (scalar immediate). */
5170 *cost += extra_cost->fp[mode == DFmode].fpconst;
5171 else if (!aarch64_float_const_zero_rtx_p (x))
5173 /* This will be a load from memory. */
5174 if (mode == DFmode)
5175 *cost += extra_cost->ldst.loadd;
5176 else
5177 *cost += extra_cost->ldst.loadf;
5179 else
5180 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5181 or MOV v0.s[0], wzr - neither of which is modeled by the
5182 cost tables. Just use the default cost. */
5187 return true;
5189 case MEM:
5190 if (speed)
5192 /* For loads we want the base cost of a load, plus an
5193 approximation for the additional cost of the addressing
5194 mode. */
5195 rtx address = XEXP (x, 0);
5196 if (GET_MODE_CLASS (mode) == MODE_INT)
5197 *cost += extra_cost->ldst.load;
5198 else if (mode == SFmode)
5199 *cost += extra_cost->ldst.loadf;
5200 else if (mode == DFmode)
5201 *cost += extra_cost->ldst.loadd;
5203 *cost +=
5204 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5205 0, speed));
5208 return true;
5210 case NEG:
5211 op0 = XEXP (x, 0);
5213 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5215 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5216 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5218 /* CSETM. */
5219 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5220 return true;
5223 /* Cost this as SUB wzr, X. */
5224 op0 = CONST0_RTX (GET_MODE (x));
5225 op1 = XEXP (x, 0);
5226 goto cost_minus;
5229 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5231 /* Support (neg(fma...)) as a single instruction only if
5232 sign of zeros is unimportant. This matches the decision
5233 making in aarch64.md. */
5234 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5236 /* FNMADD. */
5237 *cost = rtx_cost (op0, NEG, 0, speed);
5238 return true;
5240 if (speed)
5241 /* FNEG. */
5242 *cost += extra_cost->fp[mode == DFmode].neg;
5243 return false;
5246 return false;
5248 case CLRSB:
5249 case CLZ:
5250 if (speed)
5251 *cost += extra_cost->alu.clz;
5253 return false;
5255 case COMPARE:
5256 op0 = XEXP (x, 0);
5257 op1 = XEXP (x, 1);
5259 if (op1 == const0_rtx
5260 && GET_CODE (op0) == AND)
5262 x = op0;
5263 goto cost_logic;
5266 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5268 /* TODO: A write to the CC flags possibly costs extra, this
5269 needs encoding in the cost tables. */
5271 /* CC_ZESWPmode supports zero extend for free. */
5272 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5273 op0 = XEXP (op0, 0);
5275 /* ANDS. */
5276 if (GET_CODE (op0) == AND)
5278 x = op0;
5279 goto cost_logic;
5282 if (GET_CODE (op0) == PLUS)
5284 /* ADDS (and CMN alias). */
5285 x = op0;
5286 goto cost_plus;
5289 if (GET_CODE (op0) == MINUS)
5291 /* SUBS. */
5292 x = op0;
5293 goto cost_minus;
5296 if (GET_CODE (op1) == NEG)
5298 /* CMN. */
5299 if (speed)
5300 *cost += extra_cost->alu.arith;
5302 *cost += rtx_cost (op0, COMPARE, 0, speed);
5303 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5304 return true;
5307 /* CMP.
5309 Compare can freely swap the order of operands, and
5310 canonicalization puts the more complex operation first.
5311 But the integer MINUS logic expects the shift/extend
5312 operation in op1. */
5313 if (! (REG_P (op0)
5314 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5316 op0 = XEXP (x, 1);
5317 op1 = XEXP (x, 0);
5319 goto cost_minus;
5322 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5324 /* FCMP. */
5325 if (speed)
5326 *cost += extra_cost->fp[mode == DFmode].compare;
5328 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5330 /* FCMP supports constant 0.0 for no extra cost. */
5331 return true;
5333 return false;
5336 return false;
5338 case MINUS:
5340 op0 = XEXP (x, 0);
5341 op1 = XEXP (x, 1);
5343 cost_minus:
5344 /* Detect valid immediates. */
5345 if ((GET_MODE_CLASS (mode) == MODE_INT
5346 || (GET_MODE_CLASS (mode) == MODE_CC
5347 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5348 && CONST_INT_P (op1)
5349 && aarch64_uimm12_shift (INTVAL (op1)))
5351 *cost += rtx_cost (op0, MINUS, 0, speed);
5353 if (speed)
5354 /* SUB(S) (immediate). */
5355 *cost += extra_cost->alu.arith;
5356 return true;
5360 /* Look for SUB (extended register). */
5361 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5363 if (speed)
5364 *cost += extra_cost->alu.arith_shift;
5366 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5367 (enum rtx_code) GET_CODE (op1),
5368 0, speed);
5369 return true;
5372 rtx new_op1 = aarch64_strip_extend (op1);
5374 /* Cost this as an FMA-alike operation. */
5375 if ((GET_CODE (new_op1) == MULT
5376 || GET_CODE (new_op1) == ASHIFT)
5377 && code != COMPARE)
5379 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5380 (enum rtx_code) code,
5381 speed);
5382 *cost += rtx_cost (op0, MINUS, 0, speed);
5383 return true;
5386 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5388 if (speed)
5390 if (GET_MODE_CLASS (mode) == MODE_INT)
5391 /* SUB(S). */
5392 *cost += extra_cost->alu.arith;
5393 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5394 /* FSUB. */
5395 *cost += extra_cost->fp[mode == DFmode].addsub;
5397 return true;
5400 case PLUS:
5402 rtx new_op0;
5404 op0 = XEXP (x, 0);
5405 op1 = XEXP (x, 1);
5407 cost_plus:
5408 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5409 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5411 /* CSINC. */
5412 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5413 *cost += rtx_cost (op1, PLUS, 1, speed);
5414 return true;
5417 if (GET_MODE_CLASS (mode) == MODE_INT
5418 && CONST_INT_P (op1)
5419 && aarch64_uimm12_shift (INTVAL (op1)))
5421 *cost += rtx_cost (op0, PLUS, 0, speed);
5423 if (speed)
5424 /* ADD (immediate). */
5425 *cost += extra_cost->alu.arith;
5426 return true;
5429 /* Look for ADD (extended register). */
5430 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5432 if (speed)
5433 *cost += extra_cost->alu.arith_shift;
5435 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5436 (enum rtx_code) GET_CODE (op0),
5437 0, speed);
5438 return true;
5441 /* Strip any extend, leave shifts behind as we will
5442 cost them through mult_cost. */
5443 new_op0 = aarch64_strip_extend (op0);
5445 if (GET_CODE (new_op0) == MULT
5446 || GET_CODE (new_op0) == ASHIFT)
5448 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5449 speed);
5450 *cost += rtx_cost (op1, PLUS, 1, speed);
5451 return true;
5454 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5455 + rtx_cost (op1, PLUS, 1, speed));
5457 if (speed)
5459 if (GET_MODE_CLASS (mode) == MODE_INT)
5460 /* ADD. */
5461 *cost += extra_cost->alu.arith;
5462 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5463 /* FADD. */
5464 *cost += extra_cost->fp[mode == DFmode].addsub;
5466 return true;
5469 case BSWAP:
5470 *cost = COSTS_N_INSNS (1);
5472 if (speed)
5473 *cost += extra_cost->alu.rev;
5475 return false;
5477 case IOR:
5478 if (aarch_rev16_p (x))
5480 *cost = COSTS_N_INSNS (1);
5482 if (speed)
5483 *cost += extra_cost->alu.rev;
5485 return true;
5487 /* Fall through. */
5488 case XOR:
5489 case AND:
5490 cost_logic:
5491 op0 = XEXP (x, 0);
5492 op1 = XEXP (x, 1);
5494 if (code == AND
5495 && GET_CODE (op0) == MULT
5496 && CONST_INT_P (XEXP (op0, 1))
5497 && CONST_INT_P (op1)
5498 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5499 INTVAL (op1)) != 0)
5501 /* This is a UBFM/SBFM. */
5502 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5503 if (speed)
5504 *cost += extra_cost->alu.bfx;
5505 return true;
5508 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5510 /* We possibly get the immediate for free; this is not
5511 modelled. */
5512 if (CONST_INT_P (op1)
5513 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5515 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5517 if (speed)
5518 *cost += extra_cost->alu.logical;
5520 return true;
5522 else
5524 rtx new_op0 = op0;
5526 /* Handle ORN, EON, or BIC. */
5527 if (GET_CODE (op0) == NOT)
5528 op0 = XEXP (op0, 0);
5530 new_op0 = aarch64_strip_shift (op0);
5532 /* If we had a shift on op0 then this is a logical-shift-
5533 by-register/immediate operation. Otherwise, this is just
5534 a logical operation. */
5535 if (speed)
5537 if (new_op0 != op0)
5539 /* Shift by immediate. */
5540 if (CONST_INT_P (XEXP (op0, 1)))
5541 *cost += extra_cost->alu.log_shift;
5542 else
5543 *cost += extra_cost->alu.log_shift_reg;
5545 else
5546 *cost += extra_cost->alu.logical;
5549 /* In both cases we want to cost both operands. */
5550 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5551 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5553 return true;
5556 return false;
5558 case NOT:
5559 /* MVN. */
5560 if (speed)
5561 *cost += extra_cost->alu.logical;
5563 /* The logical instruction could have the shifted register form,
5564 but the cost is the same if the shift is processed as a separate
5565 instruction, so we don't bother with it here. */
5566 return false;
5568 case ZERO_EXTEND:
5570 op0 = XEXP (x, 0);
5571 /* If a value is written in SI mode, then zero extended to DI
5572 mode, the operation will in general be free as a write to
5573 a 'w' register implicitly zeroes the upper bits of an 'x'
5574 register. However, if this is
5576 (set (reg) (zero_extend (reg)))
5578 we must cost the explicit register move. */
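/* For example, (set (reg:DI X) (zero_extend:DI (reg:SI Y))) still needs an
   explicit move, whereas zero-extending the result of a load or of another
   SImode operation costs nothing extra. */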
5579 if (mode == DImode
5580 && GET_MODE (op0) == SImode
5581 && outer == SET)
5583 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5585 if (!op_cost && speed)
5586 /* MOV. */
5587 *cost += extra_cost->alu.extend;
5588 else
5589 /* Free, the cost is that of the SI mode operation. */
5590 *cost = op_cost;
5592 return true;
5594 else if (MEM_P (XEXP (x, 0)))
5596 /* All loads can zero extend to any size for free. */
5597 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5598 return true;
5601 /* UXTB/UXTH. */
5602 if (speed)
5603 *cost += extra_cost->alu.extend;
5605 return false;
5607 case SIGN_EXTEND:
5608 if (MEM_P (XEXP (x, 0)))
5610 /* LDRSH. */
5611 if (speed)
5613 rtx address = XEXP (XEXP (x, 0), 0);
5614 *cost += extra_cost->ldst.load_sign_extend;
5616 *cost +=
5617 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5618 0, speed));
5620 return true;
5623 if (speed)
5624 *cost += extra_cost->alu.extend;
5625 return false;
5627 case ASHIFT:
5628 op0 = XEXP (x, 0);
5629 op1 = XEXP (x, 1);
5631 if (CONST_INT_P (op1))
5633 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5634 aliases. */
5635 if (speed)
5636 *cost += extra_cost->alu.shift;
5638 /* We can incorporate zero/sign extend for free. */
5639 if (GET_CODE (op0) == ZERO_EXTEND
5640 || GET_CODE (op0) == SIGN_EXTEND)
5641 op0 = XEXP (op0, 0);
5643 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5644 return true;
5646 else
5648 /* LSLV. */
5649 if (speed)
5650 *cost += extra_cost->alu.shift_reg;
5652 return false; /* All arguments need to be in registers. */
5655 case ROTATE:
5656 case ROTATERT:
5657 case LSHIFTRT:
5658 case ASHIFTRT:
5659 op0 = XEXP (x, 0);
5660 op1 = XEXP (x, 1);
5662 if (CONST_INT_P (op1))
5664 /* ASR (immediate) and friends. */
5665 if (speed)
5666 *cost += extra_cost->alu.shift;
5668 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5669 return true;
5671 else
5674 /* ASR (register) and friends. */
5675 if (speed)
5676 *cost += extra_cost->alu.shift_reg;
5678 return false; /* All arguments need to be in registers. */
5681 case SYMBOL_REF:
5683 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5685 /* LDR. */
5686 if (speed)
5687 *cost += extra_cost->ldst.load;
5689 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5690 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5692 /* ADRP, followed by ADD. */
5693 *cost += COSTS_N_INSNS (1);
5694 if (speed)
5695 *cost += 2 * extra_cost->alu.arith;
5697 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5698 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5700 /* ADR. */
5701 if (speed)
5702 *cost += extra_cost->alu.arith;
5705 if (flag_pic)
5707 /* One extra load instruction, after accessing the GOT. */
5708 *cost += COSTS_N_INSNS (1);
5709 if (speed)
5710 *cost += extra_cost->ldst.load;
5712 return true;
5714 case HIGH:
5715 case LO_SUM:
5716 /* ADRP/ADD (immediate). */
5717 if (speed)
5718 *cost += extra_cost->alu.arith;
5719 return true;
5721 case ZERO_EXTRACT:
5722 case SIGN_EXTRACT:
5723 /* UBFX/SBFX. */
5724 if (speed)
5725 *cost += extra_cost->alu.bfx;
5727 /* We can trust that the immediates used will be correct (there
5728 are no by-register forms), so we need only cost op0. */
5729 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5730 return true;
5732 case MULT:
5733 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5734 /* aarch64_rtx_mult_cost always handles recursion to its
5735 operands. */
5736 return true;
5738 case MOD:
5739 case UMOD:
5740 if (speed)
5742 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5743 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5744 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5745 else if (GET_MODE (x) == DFmode)
5746 *cost += (extra_cost->fp[1].mult
5747 + extra_cost->fp[1].div);
5748 else if (GET_MODE (x) == SFmode)
5749 *cost += (extra_cost->fp[0].mult
5750 + extra_cost->fp[0].div);
5752 return false; /* All arguments need to be in registers. */
5754 case DIV:
5755 case UDIV:
5756 case SQRT:
5757 if (speed)
5759 if (GET_MODE_CLASS (mode) == MODE_INT)
5760 /* There is no integer SQRT, so only DIV and UDIV can get
5761 here. */
5762 *cost += extra_cost->mult[mode == DImode].idiv;
5763 else
5764 *cost += extra_cost->fp[mode == DFmode].div;
5766 return false; /* All arguments need to be in registers. */
5768 case IF_THEN_ELSE:
5769 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5770 XEXP (x, 2), cost, speed);
5772 case EQ:
5773 case NE:
5774 case GT:
5775 case GTU:
5776 case LT:
5777 case LTU:
5778 case GE:
5779 case GEU:
5780 case LE:
5781 case LEU:
5783 return false; /* All arguments must be in registers. */
5785 case FMA:
5786 op0 = XEXP (x, 0);
5787 op1 = XEXP (x, 1);
5788 op2 = XEXP (x, 2);
5790 if (speed)
5791 *cost += extra_cost->fp[mode == DFmode].fma;
5793 /* FMSUB, FNMADD, and FNMSUB are free. */
5794 if (GET_CODE (op0) == NEG)
5795 op0 = XEXP (op0, 0);
5797 if (GET_CODE (op2) == NEG)
5798 op2 = XEXP (op2, 0);
5800 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5801 and the by-element operand as operand 0. */
5802 if (GET_CODE (op1) == NEG)
5803 op1 = XEXP (op1, 0);
5805 /* Catch vector-by-element operations. The by-element operand can
5806 either be (vec_duplicate (vec_select (x))) or just
5807 (vec_select (x)), depending on whether we are multiplying by
5808 a vector or a scalar.
5810 Canonicalization is not very good in these cases, FMA4 will put the
5811 by-element operand as operand 0, FNMA4 will have it as operand 1. */
5812 if (GET_CODE (op0) == VEC_DUPLICATE)
5813 op0 = XEXP (op0, 0);
5814 else if (GET_CODE (op1) == VEC_DUPLICATE)
5815 op1 = XEXP (op1, 0);
5817 if (GET_CODE (op0) == VEC_SELECT)
5818 op0 = XEXP (op0, 0);
5819 else if (GET_CODE (op1) == VEC_SELECT)
5820 op1 = XEXP (op1, 0);
5822 /* If the remaining parameters are not registers,
5823 get the cost to put them into registers. */
5824 *cost += rtx_cost (op0, FMA, 0, speed);
5825 *cost += rtx_cost (op1, FMA, 1, speed);
5826 *cost += rtx_cost (op2, FMA, 2, speed);
5827 return true;
5829 case FLOAT_EXTEND:
5830 if (speed)
5831 *cost += extra_cost->fp[mode == DFmode].widen;
5832 return false;
5834 case FLOAT_TRUNCATE:
5835 if (speed)
5836 *cost += extra_cost->fp[mode == DFmode].narrow;
5837 return false;
5839 case FIX:
5840 case UNSIGNED_FIX:
5841 x = XEXP (x, 0);
5842 /* Strip the rounding part. They will all be implemented
5843 by the fcvt* family of instructions anyway. */
5844 if (GET_CODE (x) == UNSPEC)
5846 unsigned int uns_code = XINT (x, 1);
5848 if (uns_code == UNSPEC_FRINTA
5849 || uns_code == UNSPEC_FRINTM
5850 || uns_code == UNSPEC_FRINTN
5851 || uns_code == UNSPEC_FRINTP
5852 || uns_code == UNSPEC_FRINTZ)
5853 x = XVECEXP (x, 0, 0);
5856 if (speed)
5857 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5859 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5860 return true;
5862 case ABS:
5863 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5865 /* FABS and FNEG are analogous. */
5866 if (speed)
5867 *cost += extra_cost->fp[mode == DFmode].neg;
5869 else
5871 /* Integer ABS will either be split to
5872 two arithmetic instructions, or will be an ABS
5873 (scalar), which we don't model. */
5874 *cost = COSTS_N_INSNS (2);
5875 if (speed)
5876 *cost += 2 * extra_cost->alu.arith;
5878 return false;
5880 case SMAX:
5881 case SMIN:
5882 if (speed)
5884 /* FMAXNM/FMINNM/FMAX/FMIN.
5885 TODO: This may not be accurate for all implementations, but
5886 we do not model this in the cost tables. */
5887 *cost += extra_cost->fp[mode == DFmode].addsub;
5889 return false;
5891 case UNSPEC:
5892 /* The floating point round to integer frint* instructions. */
5893 if (aarch64_frint_unspec_p (XINT (x, 1)))
5895 if (speed)
5896 *cost += extra_cost->fp[mode == DFmode].roundint;
5898 return false;
5901 if (XINT (x, 1) == UNSPEC_RBIT)
5903 if (speed)
5904 *cost += extra_cost->alu.rev;
5906 return false;
5908 break;
5910 case TRUNCATE:
5912 /* Decompose <su>muldi3_highpart. */
5913 if (/* (truncate:DI */
5914 mode == DImode
5915 /* (lshiftrt:TI */
5916 && GET_MODE (XEXP (x, 0)) == TImode
5917 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5918 /* (mult:TI */
5919 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5920 /* (ANY_EXTEND:TI (reg:DI))
5921 (ANY_EXTEND:TI (reg:DI))) */
5922 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5923 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5924 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5925 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5926 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5927 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5928 /* (const_int 64) */
5929 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5930 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5932 /* UMULH/SMULH. */
5933 if (speed)
5934 *cost += extra_cost->mult[mode == DImode].extend;
5935 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5936 MULT, 0, speed);
5937 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5938 MULT, 1, speed);
5939 return true;
5942 /* Fall through. */
5943 default:
5944 break;
5947 if (dump_file && (dump_flags & TDF_DETAILS))
5948 fprintf (dump_file,
5949 "\nFailed to cost RTX. Assuming default cost.\n");
5951 return true;
5954 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5955 calculated for X. This cost is stored in *COST. Returns true
5956 if the total cost of X was calculated. */
5957 static bool
5958 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5959 int param, int *cost, bool speed)
5961 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5963 if (dump_file && (dump_flags & TDF_DETAILS))
5965 print_rtl_single (dump_file, x);
5966 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5967 speed ? "Hot" : "Cold",
5968 *cost, result ? "final" : "partial");
5971 return result;
5974 static int
5975 aarch64_register_move_cost (enum machine_mode mode,
5976 reg_class_t from_i, reg_class_t to_i)
5978 enum reg_class from = (enum reg_class) from_i;
5979 enum reg_class to = (enum reg_class) to_i;
5980 const struct cpu_regmove_cost *regmove_cost
5981 = aarch64_tune_params->regmove_cost;
5983 /* Moving between GPR and stack cost is the same as GP2GP. */
5984 if ((from == GENERAL_REGS && to == STACK_REG)
5985 || (to == GENERAL_REGS && from == STACK_REG))
5986 return regmove_cost->GP2GP;
5988 /* To/From the stack register, we move via the gprs. */
5989 if (to == STACK_REG || from == STACK_REG)
5990 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5991 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5993 if (from == GENERAL_REGS && to == GENERAL_REGS)
5994 return regmove_cost->GP2GP;
5995 else if (from == GENERAL_REGS)
5996 return regmove_cost->GP2FP;
5997 else if (to == GENERAL_REGS)
5998 return regmove_cost->FP2GP;
6000 /* When AdvSIMD instructions are disabled it is not possible to move
6001 a 128-bit value directly between Q registers. This is handled in
6002 secondary reload. A general register is used as a scratch to move
6003 the upper DI value and the lower DI value is moved directly,
6004 hence the cost is the sum of three moves. */
6005 if (! TARGET_SIMD && GET_MODE_SIZE (mode) == 16)
6006 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6008 return regmove_cost->FP2FP;
6011 static int
6012 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
6013 reg_class_t rclass ATTRIBUTE_UNUSED,
6014 bool in ATTRIBUTE_UNUSED)
6016 return aarch64_tune_params->memmov_cost;
6019 /* Return the number of instructions that can be issued per cycle. */
6020 static int
6021 aarch64_sched_issue_rate (void)
6023 return aarch64_tune_params->issue_rate;
6026 /* Vectorizer cost model target hooks. */
6028 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6029 static int
6030 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6031 tree vectype,
6032 int misalign ATTRIBUTE_UNUSED)
6034 unsigned elements;
6036 switch (type_of_cost)
6038 case scalar_stmt:
6039 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6041 case scalar_load:
6042 return aarch64_tune_params->vec_costs->scalar_load_cost;
6044 case scalar_store:
6045 return aarch64_tune_params->vec_costs->scalar_store_cost;
6047 case vector_stmt:
6048 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6050 case vector_load:
6051 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6053 case vector_store:
6054 return aarch64_tune_params->vec_costs->vec_store_cost;
6056 case vec_to_scalar:
6057 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6059 case scalar_to_vec:
6060 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6062 case unaligned_load:
6063 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6065 case unaligned_store:
6066 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6068 case cond_branch_taken:
6069 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6071 case cond_branch_not_taken:
6072 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6074 case vec_perm:
6075 case vec_promote_demote:
6076 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6078 case vec_construct:
6079 elements = TYPE_VECTOR_SUBPARTS (vectype);
6080 return elements / 2 + 1;
6082 default:
6083 gcc_unreachable ();
6087 /* Implement targetm.vectorize.add_stmt_cost. */
6088 static unsigned
6089 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6090 struct _stmt_vec_info *stmt_info, int misalign,
6091 enum vect_cost_model_location where)
6093 unsigned *cost = (unsigned *) data;
6094 unsigned retval = 0;
6096 if (flag_vect_cost_model)
6098 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6099 int stmt_cost =
6100 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6102 /* Statements in an inner loop relative to the loop being
6103 vectorized are weighted more heavily. The value here is
6104 a function (linear for now) of the loop nest level. */
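/* For example, a statement whose containing loop sits at depth 2 in the
   loop tree has its cost doubled before being added to the vect_body
   bucket. */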
6105 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6107 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6108 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6109 unsigned nest_level = loop_depth (loop);
6111 count *= nest_level;
6114 retval = (unsigned) (count * stmt_cost);
6115 cost[where] += retval;
6118 return retval;
6121 static void initialize_aarch64_code_model (void);
6123 /* Parse the architecture extension string. */
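/* For example, given the "+crypto+nofp" tail of an -march or -mcpu option,
   the first pass through the loop below turns on the crypto feature flags
   and the second pass, seeing the "no" prefix, clears the fp flags. */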
6125 static void
6126 aarch64_parse_extension (char *str)
6128 /* The extension string is parsed left to right. */
6129 const struct aarch64_option_extension *opt = NULL;
6131 /* Flag to say whether we are adding or removing an extension. */
6132 int adding_ext = -1;
6134 while (str != NULL && *str != 0)
6136 char *ext;
6137 size_t len;
6139 str++;
6140 ext = strchr (str, '+');
6142 if (ext != NULL)
6143 len = ext - str;
6144 else
6145 len = strlen (str);
6147 if (len >= 2 && strncmp (str, "no", 2) == 0)
6149 adding_ext = 0;
6150 len -= 2;
6151 str += 2;
6153 else if (len > 0)
6154 adding_ext = 1;
6156 if (len == 0)
6158 error ("missing feature modifier after %qs", "+no");
6159 return;
6162 /* Scan over the extensions table trying to find an exact match. */
6163 for (opt = all_extensions; opt->name != NULL; opt++)
6165 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6167 /* Add or remove the extension. */
6168 if (adding_ext)
6169 aarch64_isa_flags |= opt->flags_on;
6170 else
6171 aarch64_isa_flags &= ~(opt->flags_off);
6172 break;
6176 if (opt->name == NULL)
6178 /* Extension not found in list. */
6179 error ("unknown feature modifier %qs", str);
6180 return;
6183 str = ext;
6186 return;
6189 /* Parse the ARCH string. */
6191 static void
6192 aarch64_parse_arch (void)
6194 char *ext;
6195 const struct processor *arch;
6196 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6197 size_t len;
6199 strcpy (str, aarch64_arch_string);
6201 ext = strchr (str, '+');
6203 if (ext != NULL)
6204 len = ext - str;
6205 else
6206 len = strlen (str);
6208 if (len == 0)
6210 error ("missing arch name in -march=%qs", str);
6211 return;
6214 /* Loop through the list of supported ARCHs to find a match. */
6215 for (arch = all_architectures; arch->name != NULL; arch++)
6217 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6219 selected_arch = arch;
6220 aarch64_isa_flags = selected_arch->flags;
6222 if (!selected_cpu)
6223 selected_cpu = &all_cores[selected_arch->core];
6225 if (ext != NULL)
6227 /* ARCH string contains at least one extension. */
6228 aarch64_parse_extension (ext);
6231 if (strcmp (selected_arch->arch, selected_cpu->arch))
6233 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6234 selected_cpu->name, selected_arch->name);
6237 return;
6241 /* ARCH name not found in list. */
6242 error ("unknown value %qs for -march", str);
6243 return;
6246 /* Parse the CPU string. */
6248 static void
6249 aarch64_parse_cpu (void)
6251 char *ext;
6252 const struct processor *cpu;
6253 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6254 size_t len;
6256 strcpy (str, aarch64_cpu_string);
6258 ext = strchr (str, '+');
6260 if (ext != NULL)
6261 len = ext - str;
6262 else
6263 len = strlen (str);
6265 if (len == 0)
6267 error ("missing cpu name in -mcpu=%qs", str);
6268 return;
6271 /* Loop through the list of supported CPUs to find a match. */
6272 for (cpu = all_cores; cpu->name != NULL; cpu++)
6274 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6276 selected_cpu = cpu;
6277 selected_tune = cpu;
6278 aarch64_isa_flags = selected_cpu->flags;
6280 if (ext != NULL)
6282 /* CPU string contains at least one extension. */
6283 aarch64_parse_extension (ext);
6286 return;
6290 /* CPU name not found in list. */
6291 error ("unknown value %qs for -mcpu", str);
6292 return;
6295 /* Parse the TUNE string. */
6297 static void
6298 aarch64_parse_tune (void)
6300 const struct processor *cpu;
6301 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6302 strcpy (str, aarch64_tune_string);
6304 /* Loop through the list of supported CPUs to find a match. */
6305 for (cpu = all_cores; cpu->name != NULL; cpu++)
6307 if (strcmp (cpu->name, str) == 0)
6309 selected_tune = cpu;
6310 return;
6314 /* CPU name not found in list. */
6315 error ("unknown value %qs for -mtune", str);
6316 return;
6320 /* Implement TARGET_OPTION_OVERRIDE. */
6322 static void
6323 aarch64_override_options (void)
6325 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6326 If either of -march or -mtune is given, they override their
6327 respective component of -mcpu.
6329 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6330 with -march: if -mcpu is not present on the command line, -march
6331 must set a sensible default CPU. */
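/* For example, -mcpu=cortex-a57 -mtune=cortex-a53 keeps the architecture and
   ISA flags of cortex-a57 but uses the tuning tables of cortex-a53. */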
6332 if (aarch64_cpu_string)
6334 aarch64_parse_cpu ();
6337 if (aarch64_arch_string)
6339 aarch64_parse_arch ();
6342 if (aarch64_tune_string)
6344 aarch64_parse_tune ();
6347 #ifndef HAVE_AS_MABI_OPTION
6348 /* The compiler may have been configured with 2.23.* binutils, which does
6349 not have support for ILP32. */
6350 if (TARGET_ILP32)
6351 error ("Assembler does not support -mabi=ilp32");
6352 #endif
6354 initialize_aarch64_code_model ();
6356 aarch64_build_bitmask_table ();
6358 /* This target defaults to strict volatile bitfields. */
6359 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6360 flag_strict_volatile_bitfields = 1;
6362 /* If the user did not specify a processor, choose the default
6363 one for them. This will be the CPU set during configuration using
6364 --with-cpu, otherwise it is "generic". */
6365 if (!selected_cpu)
6367 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6368 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6371 gcc_assert (selected_cpu);
6373 /* The selected cpu may be an architecture, so lookup tuning by core ID. */
6374 if (!selected_tune)
6375 selected_tune = &all_cores[selected_cpu->core];
6377 aarch64_tune_flags = selected_tune->flags;
6378 aarch64_tune = selected_tune->core;
6379 aarch64_tune_params = selected_tune->tune;
6381 aarch64_override_options_after_change ();
6384 /* Implement targetm.override_options_after_change. */
6386 static void
6387 aarch64_override_options_after_change (void)
6389 if (flag_omit_frame_pointer)
6390 flag_omit_leaf_frame_pointer = false;
6391 else if (flag_omit_leaf_frame_pointer)
6392 flag_omit_frame_pointer = true;
6395 static struct machine_function *
6396 aarch64_init_machine_status (void)
6398 struct machine_function *machine;
6399 machine = ggc_cleared_alloc<machine_function> ();
6400 return machine;
6403 void
6404 aarch64_init_expanders (void)
6406 init_machine_status = aarch64_init_machine_status;
6409 /* Initialize aarch64_cmodel from the requested code model, adjusting it for PIC and rejecting unsupported combinations. */
6410 static void
6411 initialize_aarch64_code_model (void)
6413 if (flag_pic)
6415 switch (aarch64_cmodel_var)
6417 case AARCH64_CMODEL_TINY:
6418 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6419 break;
6420 case AARCH64_CMODEL_SMALL:
6421 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6422 break;
6423 case AARCH64_CMODEL_LARGE:
6424 sorry ("code model %qs with -f%s", "large",
6425 flag_pic > 1 ? "PIC" : "pic");
6426 default:
6427 gcc_unreachable ();
6430 else
6431 aarch64_cmodel = aarch64_cmodel_var;
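/* For illustration, the mapping implemented above (a sketch; option
   spellings are the usual driver ones):

     -mcmodel=tiny                 -> AARCH64_CMODEL_TINY
     -mcmodel=tiny  -fpic/-fPIC    -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=small                -> AARCH64_CMODEL_SMALL
     -mcmodel=small -fpic/-fPIC    -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic/-fPIC    -> rejected with "sorry"  */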
6434 /* Return true if SYMBOL_REF X binds locally. */
6436 static bool
6437 aarch64_symbol_binds_local_p (const_rtx x)
6439 return (SYMBOL_REF_DECL (x)
6440 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6441 : SYMBOL_REF_LOCAL_P (x));
6444 /* Return true if SYMBOL_REF X is thread-local. */
6445 static bool
6446 aarch64_tls_symbol_p (rtx x)
6448 if (! TARGET_HAVE_TLS)
6449 return false;
6451 if (GET_CODE (x) != SYMBOL_REF)
6452 return false;
6454 return SYMBOL_REF_TLS_MODEL (x) != 0;
6457 /* Classify a TLS symbol into one of the TLS kinds. */
6458 enum aarch64_symbol_type
6459 aarch64_classify_tls_symbol (rtx x)
6461 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6463 switch (tls_kind)
6465 case TLS_MODEL_GLOBAL_DYNAMIC:
6466 case TLS_MODEL_LOCAL_DYNAMIC:
6467 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6469 case TLS_MODEL_INITIAL_EXEC:
6470 return SYMBOL_SMALL_GOTTPREL;
6472 case TLS_MODEL_LOCAL_EXEC:
6473 return SYMBOL_SMALL_TPREL;
6475 case TLS_MODEL_EMULATED:
6476 case TLS_MODEL_NONE:
6477 return SYMBOL_FORCE_TO_MEM;
6479 default:
6480 gcc_unreachable ();
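/* For example, a global-dynamic access such as

     extern __thread int counter;
     int get (void) { return counter; }

   classifies as SYMBOL_SMALL_TLSDESC when TARGET_TLS_DESC is enabled (and as
   SYMBOL_SMALL_TLSGD otherwise), whereas the same variable accessed with the
   local-exec model classifies as SYMBOL_SMALL_TPREL.  Illustrative only; the
   model itself is decided earlier by tls_symbolic_operand_type.  */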
6484 /* Return the method that should be used to access SYMBOL_REF or
6485 LABEL_REF X in context CONTEXT. */
6487 enum aarch64_symbol_type
6488 aarch64_classify_symbol (rtx x,
6489 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6491 if (GET_CODE (x) == LABEL_REF)
6493 switch (aarch64_cmodel)
6495 case AARCH64_CMODEL_LARGE:
6496 return SYMBOL_FORCE_TO_MEM;
6498 case AARCH64_CMODEL_TINY_PIC:
6499 case AARCH64_CMODEL_TINY:
6500 return SYMBOL_TINY_ABSOLUTE;
6502 case AARCH64_CMODEL_SMALL_PIC:
6503 case AARCH64_CMODEL_SMALL:
6504 return SYMBOL_SMALL_ABSOLUTE;
6506 default:
6507 gcc_unreachable ();
6511 if (GET_CODE (x) == SYMBOL_REF)
6513 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6514 return SYMBOL_FORCE_TO_MEM;
6516 if (aarch64_tls_symbol_p (x))
6517 return aarch64_classify_tls_symbol (x);
6519 switch (aarch64_cmodel)
6521 case AARCH64_CMODEL_TINY:
6522 if (SYMBOL_REF_WEAK (x))
6523 return SYMBOL_FORCE_TO_MEM;
6524 return SYMBOL_TINY_ABSOLUTE;
6526 case AARCH64_CMODEL_SMALL:
6527 if (SYMBOL_REF_WEAK (x))
6528 return SYMBOL_FORCE_TO_MEM;
6529 return SYMBOL_SMALL_ABSOLUTE;
6531 case AARCH64_CMODEL_TINY_PIC:
6532 if (!aarch64_symbol_binds_local_p (x))
6533 return SYMBOL_TINY_GOT;
6534 return SYMBOL_TINY_ABSOLUTE;
6536 case AARCH64_CMODEL_SMALL_PIC:
6537 if (!aarch64_symbol_binds_local_p (x))
6538 return SYMBOL_SMALL_GOT;
6539 return SYMBOL_SMALL_ABSOLUTE;
6541 default:
6542 gcc_unreachable ();
6546 /* By default push everything into the constant pool. */
6547 return SYMBOL_FORCE_TO_MEM;
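/* Illustrative classifications under the (default) small code model, a
   sketch of the switch above:

     int global_var;                          -> SYMBOL_SMALL_ABSOLUTE
     __attribute__ ((weak)) int weak_var;     -> SYMBOL_FORCE_TO_MEM
     extern int ext_var;   with -fpic         -> SYMBOL_SMALL_GOT
     static int local_var; with -fpic         -> SYMBOL_SMALL_ABSOLUTE

   The -fpic rows assume the symbol does or does not bind locally as
   indicated.  */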
6550 bool
6551 aarch64_constant_address_p (rtx x)
6553 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6556 bool
6557 aarch64_legitimate_pic_operand_p (rtx x)
6559 if (GET_CODE (x) == SYMBOL_REF
6560 || (GET_CODE (x) == CONST
6561 && GET_CODE (XEXP (x, 0)) == PLUS
6562 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6563 return false;
6565 return true;
6568 /* Return true if X holds a floating-point constant that is either
6569 +0.0 or representable in quarter-precision form. */
6570 static bool
6571 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6573 if (!CONST_DOUBLE_P (x))
6574 return false;
6576 /* TODO: We could handle moving 0.0 to a TFmode register,
6577 but first we would like to refactor movtf_aarch64 to be
6578 more amenable to splitting moves properly and to gating
6579 correctly on TARGET_SIMD. For now, reject all constants
6580 that are not destined for SFmode or DFmode registers. */
6581 if (!(mode == SFmode || mode == DFmode))
6582 return false;
6584 if (aarch64_float_const_zero_rtx_p (x))
6585 return true;
6586 return aarch64_float_const_representable_p (x);
6589 static bool
6590 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6592 /* Do not allow vector struct mode constants. We could support
6593 0 and -1 easily, but they need support in aarch64-simd.md. */
6594 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6595 return false;
6597 /* This could probably go away because
6598 we now decompose CONST_INTs according to expand_mov_immediate. */
6599 if ((GET_CODE (x) == CONST_VECTOR
6600 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6601 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6602 return !targetm.cannot_force_const_mem (mode, x);
6604 if (GET_CODE (x) == HIGH
6605 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6606 return true;
6608 return aarch64_constant_address_p (x);
6612 aarch64_load_tp (rtx target)
6614 if (!target
6615 || GET_MODE (target) != Pmode
6616 || !register_operand (target, Pmode))
6617 target = gen_reg_rtx (Pmode);
6619 /* Can return in any reg. */
6620 emit_insn (gen_aarch64_load_tp_hard (target));
6621 return target;
6624 /* On AAPCS systems, this is the "struct __va_list". */
6625 static GTY(()) tree va_list_type;
6627 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6628 Return the type to use as __builtin_va_list.
6630 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6632 struct __va_list
6634 void *__stack;
6635 void *__gr_top;
6636 void *__vr_top;
6637 int __gr_offs;
6638 int __vr_offs;
6639 }; */
6641 static tree
6642 aarch64_build_builtin_va_list (void)
6644 tree va_list_name;
6645 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6647 /* Create the type. */
6648 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6649 /* Give it the required name. */
6650 va_list_name = build_decl (BUILTINS_LOCATION,
6651 TYPE_DECL,
6652 get_identifier ("__va_list"),
6653 va_list_type);
6654 DECL_ARTIFICIAL (va_list_name) = 1;
6655 TYPE_NAME (va_list_type) = va_list_name;
6656 TYPE_STUB_DECL (va_list_type) = va_list_name;
6658 /* Create the fields. */
6659 f_stack = build_decl (BUILTINS_LOCATION,
6660 FIELD_DECL, get_identifier ("__stack"),
6661 ptr_type_node);
6662 f_grtop = build_decl (BUILTINS_LOCATION,
6663 FIELD_DECL, get_identifier ("__gr_top"),
6664 ptr_type_node);
6665 f_vrtop = build_decl (BUILTINS_LOCATION,
6666 FIELD_DECL, get_identifier ("__vr_top"),
6667 ptr_type_node);
6668 f_groff = build_decl (BUILTINS_LOCATION,
6669 FIELD_DECL, get_identifier ("__gr_offs"),
6670 integer_type_node);
6671 f_vroff = build_decl (BUILTINS_LOCATION,
6672 FIELD_DECL, get_identifier ("__vr_offs"),
6673 integer_type_node);
6675 DECL_ARTIFICIAL (f_stack) = 1;
6676 DECL_ARTIFICIAL (f_grtop) = 1;
6677 DECL_ARTIFICIAL (f_vrtop) = 1;
6678 DECL_ARTIFICIAL (f_groff) = 1;
6679 DECL_ARTIFICIAL (f_vroff) = 1;
6681 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6682 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6683 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6684 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6685 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6687 TYPE_FIELDS (va_list_type) = f_stack;
6688 DECL_CHAIN (f_stack) = f_grtop;
6689 DECL_CHAIN (f_grtop) = f_vrtop;
6690 DECL_CHAIN (f_vrtop) = f_groff;
6691 DECL_CHAIN (f_groff) = f_vroff;
6693 /* Compute its layout. */
6694 layout_type (va_list_type);
6696 return va_list_type;
6699 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6700 static void
6701 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6703 const CUMULATIVE_ARGS *cum;
6704 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6705 tree stack, grtop, vrtop, groff, vroff;
6706 tree t;
6707 int gr_save_area_size;
6708 int vr_save_area_size;
6709 int vr_offset;
6711 cum = &crtl->args.info;
6712 gr_save_area_size
6713 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6714 vr_save_area_size
6715 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6717 if (TARGET_GENERAL_REGS_ONLY)
6719 if (cum->aapcs_nvrn > 0)
6720 sorry ("%qs and floating point or vector arguments",
6721 "-mgeneral-regs-only");
6722 vr_save_area_size = 0;
6725 f_stack = TYPE_FIELDS (va_list_type_node);
6726 f_grtop = DECL_CHAIN (f_stack);
6727 f_vrtop = DECL_CHAIN (f_grtop);
6728 f_groff = DECL_CHAIN (f_vrtop);
6729 f_vroff = DECL_CHAIN (f_groff);
6731 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6732 NULL_TREE);
6733 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6734 NULL_TREE);
6735 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6736 NULL_TREE);
6737 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6738 NULL_TREE);
6739 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6740 NULL_TREE);
6742 /* Emit code to initialize STACK, which points to the next varargs stack
6743 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6744 by named arguments. STACK is 8-byte aligned. */
6745 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6746 if (cum->aapcs_stack_size > 0)
6747 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6748 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6749 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6751 /* Emit code to initialize GRTOP, the top of the GR save area.
6752 virtual_incoming_args_rtx should have been 16 byte aligned. */
6753 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6754 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6755 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6757 /* Emit code to initialize VRTOP, the top of the VR save area.
6758 This address is gr_save_area_bytes below GRTOP, rounded
6759 down to the next 16-byte boundary. */
6760 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6761 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6762 STACK_BOUNDARY / BITS_PER_UNIT);
6764 if (vr_offset)
6765 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6766 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6767 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6769 /* Emit code to initialize GROFF, the offset from GRTOP of the
6770 next GPR argument. */
6771 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6772 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6773 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6775 /* Likewise emit code to initialize VROFF, the offset from VRTOP
6776 of the next VR argument. */
6777 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6778 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6779 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
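/* A rough C-level picture of what the expansion above initializes, with
   incoming_args standing for virtual_incoming_args_rtx (a sketch only):

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - AARCH64_ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   where 16 stands for STACK_BOUNDARY / BITS_PER_UNIT.  */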
6782 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6784 static tree
6785 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6786 gimple_seq *post_p ATTRIBUTE_UNUSED)
6788 tree addr;
6789 bool indirect_p;
6790 bool is_ha; /* is HFA or HVA. */
6791 bool dw_align; /* double-word align. */
6792 enum machine_mode ag_mode = VOIDmode;
6793 int nregs;
6794 enum machine_mode mode;
6796 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6797 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6798 HOST_WIDE_INT size, rsize, adjust, align;
6799 tree t, u, cond1, cond2;
6801 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6802 if (indirect_p)
6803 type = build_pointer_type (type);
6805 mode = TYPE_MODE (type);
6807 f_stack = TYPE_FIELDS (va_list_type_node);
6808 f_grtop = DECL_CHAIN (f_stack);
6809 f_vrtop = DECL_CHAIN (f_grtop);
6810 f_groff = DECL_CHAIN (f_vrtop);
6811 f_vroff = DECL_CHAIN (f_groff);
6813 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6814 f_stack, NULL_TREE);
6815 size = int_size_in_bytes (type);
6816 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6818 dw_align = false;
6819 adjust = 0;
6820 if (aarch64_vfp_is_call_or_return_candidate (mode,
6821 type,
6822 &ag_mode,
6823 &nregs,
6824 &is_ha))
6826 /* TYPE passed in fp/simd registers. */
6827 if (TARGET_GENERAL_REGS_ONLY)
6828 sorry ("%qs and floating point or vector arguments",
6829 "-mgeneral-regs-only");
6831 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6832 unshare_expr (valist), f_vrtop, NULL_TREE);
6833 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6834 unshare_expr (valist), f_vroff, NULL_TREE);
6836 rsize = nregs * UNITS_PER_VREG;
6838 if (is_ha)
6840 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6841 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6843 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6844 && size < UNITS_PER_VREG)
6846 adjust = UNITS_PER_VREG - size;
6849 else
6851 /* TYPE passed in general registers. */
6852 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6853 unshare_expr (valist), f_grtop, NULL_TREE);
6854 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6855 unshare_expr (valist), f_groff, NULL_TREE);
6856 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6857 nregs = rsize / UNITS_PER_WORD;
6859 if (align > 8)
6860 dw_align = true;
6862 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6863 && size < UNITS_PER_WORD)
6865 adjust = UNITS_PER_WORD - size;
6869 /* Get a local temporary for the field value. */
6870 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6872 /* Emit code to branch if off >= 0. */
6873 t = build2 (GE_EXPR, boolean_type_node, off,
6874 build_int_cst (TREE_TYPE (off), 0));
6875 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6877 if (dw_align)
6879 /* Emit: offs = (offs + 15) & -16. */
6880 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6881 build_int_cst (TREE_TYPE (off), 15));
6882 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6883 build_int_cst (TREE_TYPE (off), -16));
6884 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6886 else
6887 roundup = NULL;
6889 /* Update ap.__[g|v]r_offs */
6890 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6891 build_int_cst (TREE_TYPE (off), rsize));
6892 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6894 /* String up. */
6895 if (roundup)
6896 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6898 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6899 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6900 build_int_cst (TREE_TYPE (f_off), 0));
6901 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6903 /* String up: make sure the assignment happens before the use. */
6904 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6905 COND_EXPR_ELSE (cond1) = t;
6907 /* Prepare the trees handling the argument that is passed on the stack;
6908 the top level node will store in ON_STACK. */
6909 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6910 if (align > 8)
6912 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
6913 t = fold_convert (intDI_type_node, arg);
6914 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6915 build_int_cst (TREE_TYPE (t), 15));
6916 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6917 build_int_cst (TREE_TYPE (t), -16));
6918 t = fold_convert (TREE_TYPE (arg), t);
6919 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6921 else
6922 roundup = NULL;
6923 /* Advance ap.__stack */
6924 t = fold_convert (intDI_type_node, arg);
6925 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6926 build_int_cst (TREE_TYPE (t), size + 7));
6927 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6928 build_int_cst (TREE_TYPE (t), -8));
6929 t = fold_convert (TREE_TYPE (arg), t);
6930 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6931 /* String up roundup and advance. */
6932 if (roundup)
6933 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6934 /* String up with arg */
6935 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6936 /* Big-endianness related address adjustment. */
6937 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6938 && size < UNITS_PER_WORD)
6940 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6941 size_int (UNITS_PER_WORD - size));
6942 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6945 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6946 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6948 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
6949 t = off;
6950 if (adjust)
6951 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6952 build_int_cst (TREE_TYPE (off), adjust));
6954 t = fold_convert (sizetype, t);
6955 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6957 if (is_ha)
6959 /* type ha; // treat as "struct {ftype field[n];}"
6960 ... [computing offs]
6961 for (i = 0; i <nregs; ++i, offs += 16)
6962 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6963 return ha; */
6964 int i;
6965 tree tmp_ha, field_t, field_ptr_t;
6967 /* Declare a local variable. */
6968 tmp_ha = create_tmp_var_raw (type, "ha");
6969 gimple_add_tmp_var (tmp_ha);
6971 /* Establish the base type. */
6972 switch (ag_mode)
6974 case SFmode:
6975 field_t = float_type_node;
6976 field_ptr_t = float_ptr_type_node;
6977 break;
6978 case DFmode:
6979 field_t = double_type_node;
6980 field_ptr_t = double_ptr_type_node;
6981 break;
6982 case TFmode:
6983 field_t = long_double_type_node;
6984 field_ptr_t = long_double_ptr_type_node;
6985 break;
6986 /* Half-precision and quad-precision floats are not fully supported yet.
6987 Enable the following code once support is complete; the correct type
6988 node for __fp16 * still needs to be found. */
6989 #if 0
6990 case HFmode:
6991 field_t = float_type_node;
6992 field_ptr_t = float_ptr_type_node;
6993 break;
6994 #endif
6995 case V2SImode:
6996 case V4SImode:
6998 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
6999 field_t = build_vector_type_for_mode (innertype, ag_mode);
7000 field_ptr_t = build_pointer_type (field_t);
7002 break;
7003 default:
7004 gcc_assert (0);
7007 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7008 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7009 addr = t;
7010 t = fold_convert (field_ptr_t, addr);
7011 t = build2 (MODIFY_EXPR, field_t,
7012 build1 (INDIRECT_REF, field_t, tmp_ha),
7013 build1 (INDIRECT_REF, field_t, t));
7015 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7016 for (i = 1; i < nregs; ++i)
7018 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7019 u = fold_convert (field_ptr_t, addr);
7020 u = build2 (MODIFY_EXPR, field_t,
7021 build2 (MEM_REF, field_t, tmp_ha,
7022 build_int_cst (field_ptr_t,
7023 (i *
7024 int_size_in_bytes (field_t)))),
7025 build1 (INDIRECT_REF, field_t, u));
7026 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7029 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7030 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7033 COND_EXPR_ELSE (cond2) = t;
7034 addr = fold_convert (build_pointer_type (type), cond1);
7035 addr = build_va_arg_indirect_ref (addr);
7037 if (indirect_p)
7038 addr = build_va_arg_indirect_ref (addr);
7040 return addr;
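/* A C-level sketch of the code generated above for an argument taken from
   register save area X (gr or vr); HFA expansion, big-endian padding and the
   exact stack rounding are simplified, and next_stack_slot is only shorthand
   for the __stack advance emitted above:

     offs = ap.__Xr_offs;
     if (offs >= 0)
       addr = next_stack_slot (&ap);            already spilled to the stack
     else
       {
         if (double-word aligned GR argument)
           offs = (offs + 15) & -16;
         ap.__Xr_offs = offs + rsize;
         if (ap.__Xr_offs > 0)
           addr = next_stack_slot (&ap);        save area exhausted
         else
           addr = ap.__Xr_top + offs;           argument is in the save area
       }  */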
7043 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7045 static void
7046 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7047 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7048 int no_rtl)
7050 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7051 CUMULATIVE_ARGS local_cum;
7052 int gr_saved, vr_saved;
7054 /* The caller has advanced CUM up to, but not beyond, the last named
7055 argument. Advance a local copy of CUM past the last "real" named
7056 argument, to find out how many registers are left over. */
7057 local_cum = *cum;
7058 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7060 /* Find out how many registers we need to save. */
7061 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7062 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7064 if (TARGET_GENERAL_REGS_ONLY)
7066 if (local_cum.aapcs_nvrn > 0)
7067 sorry ("%qs and floating point or vector arguments",
7068 "-mgeneral-regs-only");
7069 vr_saved = 0;
7072 if (!no_rtl)
7074 if (gr_saved > 0)
7076 rtx ptr, mem;
7078 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7079 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7080 - gr_saved * UNITS_PER_WORD);
7081 mem = gen_frame_mem (BLKmode, ptr);
7082 set_mem_alias_set (mem, get_varargs_alias_set ());
7084 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7085 mem, gr_saved);
7087 if (vr_saved > 0)
7089 /* We can't use move_block_from_reg, because it will use
7090 the wrong mode, storing D regs only. */
7091 enum machine_mode mode = TImode;
7092 int off, i;
7094 /* Set OFF to the offset from virtual_incoming_args_rtx of
7095 the first vector register. The VR save area lies below
7096 the GR one, and is aligned to 16 bytes. */
7097 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7098 STACK_BOUNDARY / BITS_PER_UNIT);
7099 off -= vr_saved * UNITS_PER_VREG;
7101 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7103 rtx ptr, mem;
7105 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7106 mem = gen_frame_mem (mode, ptr);
7107 set_mem_alias_set (mem, get_varargs_alias_set ());
7108 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7109 off += UNITS_PER_VREG;
7114 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7115 any complication of having crtl->args.pretend_args_size changed. */
7116 cfun->machine->frame.saved_varargs_size
7117 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7118 STACK_BOUNDARY / BITS_PER_UNIT)
7119 + vr_saved * UNITS_PER_VREG);
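/* Worked example: for a variadic function such as

     int f (int n, ...);

   one general register is consumed by the named argument, so gr_saved == 7
   and vr_saved == 8 (when FP/SIMD is enabled), giving
   saved_varargs_size = AARCH64_ROUND_UP (7 * 8, 16) + 8 * 16 = 64 + 128 = 192
   bytes of register save area.  */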
7122 static void
7123 aarch64_conditional_register_usage (void)
7125 int i;
7126 if (!TARGET_FLOAT)
7128 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7130 fixed_regs[i] = 1;
7131 call_used_regs[i] = 1;
7136 /* Walk down the type tree of TYPE counting consecutive base elements.
7137 If *MODEP is VOIDmode, then set it to the first valid floating point
7138 type. If a non-floating point type is found, or if a floating point
7139 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7140 otherwise return the count in the sub-tree. */
7141 static int
7142 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
7144 enum machine_mode mode;
7145 HOST_WIDE_INT size;
7147 switch (TREE_CODE (type))
7149 case REAL_TYPE:
7150 mode = TYPE_MODE (type);
7151 if (mode != DFmode && mode != SFmode && mode != TFmode)
7152 return -1;
7154 if (*modep == VOIDmode)
7155 *modep = mode;
7157 if (*modep == mode)
7158 return 1;
7160 break;
7162 case COMPLEX_TYPE:
7163 mode = TYPE_MODE (TREE_TYPE (type));
7164 if (mode != DFmode && mode != SFmode && mode != TFmode)
7165 return -1;
7167 if (*modep == VOIDmode)
7168 *modep = mode;
7170 if (*modep == mode)
7171 return 2;
7173 break;
7175 case VECTOR_TYPE:
7176 /* Use V2SImode and V4SImode as representatives of all 64-bit
7177 and 128-bit vector types. */
7178 size = int_size_in_bytes (type);
7179 switch (size)
7181 case 8:
7182 mode = V2SImode;
7183 break;
7184 case 16:
7185 mode = V4SImode;
7186 break;
7187 default:
7188 return -1;
7191 if (*modep == VOIDmode)
7192 *modep = mode;
7194 /* Vector modes are considered to be opaque: two vectors are
7195 equivalent for the purposes of being homogeneous aggregates
7196 if they are the same size. */
7197 if (*modep == mode)
7198 return 1;
7200 break;
7202 case ARRAY_TYPE:
7204 int count;
7205 tree index = TYPE_DOMAIN (type);
7207 /* Can't handle incomplete types nor sizes that are not
7208 fixed. */
7209 if (!COMPLETE_TYPE_P (type)
7210 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7211 return -1;
7213 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7214 if (count == -1
7215 || !index
7216 || !TYPE_MAX_VALUE (index)
7217 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7218 || !TYPE_MIN_VALUE (index)
7219 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7220 || count < 0)
7221 return -1;
7223 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7224 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7226 /* There must be no padding. */
7227 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7228 return -1;
7230 return count;
7233 case RECORD_TYPE:
7235 int count = 0;
7236 int sub_count;
7237 tree field;
7239 /* Can't handle incomplete types nor sizes that are not
7240 fixed. */
7241 if (!COMPLETE_TYPE_P (type)
7242 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7243 return -1;
7245 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7247 if (TREE_CODE (field) != FIELD_DECL)
7248 continue;
7250 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7251 if (sub_count < 0)
7252 return -1;
7253 count += sub_count;
7256 /* There must be no padding. */
7257 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7258 return -1;
7260 return count;
7263 case UNION_TYPE:
7264 case QUAL_UNION_TYPE:
7266 /* These aren't very interesting except in a degenerate case. */
7267 int count = 0;
7268 int sub_count;
7269 tree field;
7271 /* Can't handle incomplete types nor sizes that are not
7272 fixed. */
7273 if (!COMPLETE_TYPE_P (type)
7274 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7275 return -1;
7277 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7279 if (TREE_CODE (field) != FIELD_DECL)
7280 continue;
7282 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7283 if (sub_count < 0)
7284 return -1;
7285 count = count > sub_count ? count : sub_count;
7288 /* There must be no padding. */
7289 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7290 return -1;
7292 return count;
7295 default:
7296 break;
7299 return -1;
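/* Illustrative examples of what the walk above produces:

     struct { float x, y, z; }        ->  3, *modep == SFmode   (an HFA)
     struct { double d[2]; }          ->  2, *modep == DFmode   (an HFA)
     struct { _Complex float c; }     ->  2, *modep == SFmode
     struct { float f; double d; }    -> -1  (mixed base types)
     struct { float f; int i; }       -> -1  (non-FP member)  */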
7302 /* Return true if we use LRA instead of the reload pass. */
7303 static bool
7304 aarch64_lra_p (void)
7306 return aarch64_lra_flag;
7309 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7310 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7311 array types. The C99 floating-point complex types are also considered
7312 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7313 types, which are GCC extensions and out of the scope of AAPCS64, are
7314 treated as composite types here as well.
7316 Note that MODE itself is not sufficient for determining whether a type
7317 is such a composite type or not. This is because
7318 stor-layout.c:compute_record_mode may have already changed the MODE
7319 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7320 structure with only one field may have its MODE set to the mode of the
7321 field. Also an integer mode whose size matches the size of the
7322 RECORD_TYPE type may be used to substitute the original mode
7323 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7324 solely relied on. */
7326 static bool
7327 aarch64_composite_type_p (const_tree type,
7328 enum machine_mode mode)
7330 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7331 return true;
7333 if (mode == BLKmode
7334 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7335 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7336 return true;
7338 return false;
7341 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7342 type as described in AAPCS64 \S 4.1.2.
7344 See the comment above aarch64_composite_type_p for the notes on MODE. */
7346 static bool
7347 aarch64_short_vector_p (const_tree type,
7348 enum machine_mode mode)
7350 HOST_WIDE_INT size = -1;
7352 if (type && TREE_CODE (type) == VECTOR_TYPE)
7353 size = int_size_in_bytes (type);
7354 else if (!aarch64_composite_type_p (type, mode)
7355 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7356 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7357 size = GET_MODE_SIZE (mode);
7359 return (size == 8 || size == 16) ? true : false;
7362 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7363 shall be passed or returned in simd/fp register(s) (providing these
7364 parameter passing registers are available).
7366 Upon successful return, *COUNT returns the number of needed registers,
7367 *BASE_MODE returns the mode of the individual register and, when IS_HA
7368 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7369 floating-point aggregate or a homogeneous short-vector aggregate. */
7371 static bool
7372 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7373 const_tree type,
7374 enum machine_mode *base_mode,
7375 int *count,
7376 bool *is_ha)
7378 enum machine_mode new_mode = VOIDmode;
7379 bool composite_p = aarch64_composite_type_p (type, mode);
7381 if (is_ha != NULL) *is_ha = false;
7383 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7384 || aarch64_short_vector_p (type, mode))
7386 *count = 1;
7387 new_mode = mode;
7389 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7391 if (is_ha != NULL) *is_ha = true;
7392 *count = 2;
7393 new_mode = GET_MODE_INNER (mode);
7395 else if (type && composite_p)
7397 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7399 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7401 if (is_ha != NULL) *is_ha = true;
7402 *count = ag_count;
7404 else
7405 return false;
7407 else
7408 return false;
7410 *base_mode = new_mode;
7411 return true;
7414 /* Implement TARGET_STRUCT_VALUE_RTX. */
7416 static rtx
7417 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7418 int incoming ATTRIBUTE_UNUSED)
7420 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7423 /* Implements target hook vector_mode_supported_p. */
7424 static bool
7425 aarch64_vector_mode_supported_p (enum machine_mode mode)
7427 if (TARGET_SIMD
7428 && (mode == V4SImode || mode == V8HImode
7429 || mode == V16QImode || mode == V2DImode
7430 || mode == V2SImode || mode == V4HImode
7431 || mode == V8QImode || mode == V2SFmode
7432 || mode == V4SFmode || mode == V2DFmode
7433 || mode == V1DFmode))
7434 return true;
7436 return false;
7439 /* Return appropriate SIMD container
7440 for MODE within a vector of WIDTH bits. */
7441 static enum machine_mode
7442 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7444 gcc_assert (width == 64 || width == 128);
7445 if (TARGET_SIMD)
7447 if (width == 128)
7448 switch (mode)
7450 case DFmode:
7451 return V2DFmode;
7452 case SFmode:
7453 return V4SFmode;
7454 case SImode:
7455 return V4SImode;
7456 case HImode:
7457 return V8HImode;
7458 case QImode:
7459 return V16QImode;
7460 case DImode:
7461 return V2DImode;
7462 default:
7463 break;
7465 else
7466 switch (mode)
7468 case SFmode:
7469 return V2SFmode;
7470 case SImode:
7471 return V2SImode;
7472 case HImode:
7473 return V4HImode;
7474 case QImode:
7475 return V8QImode;
7476 default:
7477 break;
7480 return word_mode;
7483 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7484 static enum machine_mode
7485 aarch64_preferred_simd_mode (enum machine_mode mode)
7487 return aarch64_simd_container_mode (mode, 128);
7490 /* Return the bitmask of possible vector sizes for the vectorizer
7491 to iterate over. */
7492 static unsigned int
7493 aarch64_autovectorize_vector_sizes (void)
7495 return (16 | 8);
7498 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7499 vector types in order to conform to the AAPCS64 (see "Procedure
7500 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7501 qualify for emission with the mangled names defined in that document,
7502 a vector type must not only be of the correct mode but also be
7503 composed of AdvSIMD vector element types (e.g.
7504 __builtin_aarch64_simd_qi); these types are registered by
7505 aarch64_init_simd_builtins (). In other words, vector types defined
7506 in other ways e.g. via vector_size attribute will get default
7507 mangled names. */
7508 typedef struct
7510 enum machine_mode mode;
7511 const char *element_type_name;
7512 const char *mangled_name;
7513 } aarch64_simd_mangle_map_entry;
7515 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7516 /* 64-bit containerized types. */
7517 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7518 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7519 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7520 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7521 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7522 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7523 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7524 { DImode, "__builtin_aarch64_simd_di", "11__Int64x1_t" },
7525 { DImode, "__builtin_aarch64_simd_udi", "12__Uint64x1_t" },
7526 { V1DFmode, "__builtin_aarch64_simd_df", "13__Float64x1_t" },
7527 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7528 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7529 /* 128-bit containerized types. */
7530 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7531 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7532 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7533 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7534 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7535 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7536 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7537 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7538 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7539 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7540 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7541 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7542 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7543 { VOIDmode, NULL, NULL }
7546 /* Implement TARGET_MANGLE_TYPE. */
7548 static const char *
7549 aarch64_mangle_type (const_tree type)
7551 /* The AArch64 ABI documents say that "__va_list" has to be
7552 mangled as if it were in the "std" namespace. */
7553 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7554 return "St9__va_list";
7556 /* Check the mode of the vector type, and the name of the vector
7557 element type, against the table. */
7558 if (TREE_CODE (type) == VECTOR_TYPE)
7560 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7562 while (pos->mode != VOIDmode)
7564 tree elt_type = TREE_TYPE (type);
7566 if (pos->mode == TYPE_MODE (type)
7567 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7568 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7569 pos->element_type_name))
7570 return pos->mangled_name;
7572 pos++;
7576 /* Use the default mangling. */
7577 return NULL;
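/* For example, given the table above, a C++ declaration such as

     void f (int8x8_t);

   (where int8x8_t is the arm_neon.h type whose element type is
   __builtin_aarch64_simd_qi and whose mode is V8QImode) is mangled as
   _Z1f10__Int8x8_t instead of using the generic vector mangling.
   Illustrative only; the registered types come from
   aarch64_init_simd_builtins.  */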
7580 /* Return the equivalent letter for size. */
7581 static char
7582 sizetochar (int size)
7584 switch (size)
7586 case 64: return 'd';
7587 case 32: return 's';
7588 case 16: return 'h';
7589 case 8 : return 'b';
7590 default: gcc_unreachable ();
7594 /* Return true iff x is a uniform vector of floating-point
7595 constants, and the constant can be represented in
7596 quarter-precision form. Note that, since aarch64_float_const_representable_p
7597 rejects both +0.0 and -0.0, this function rejects them as well. */
7598 static bool
7599 aarch64_vect_float_const_representable_p (rtx x)
7601 int i = 0;
7602 REAL_VALUE_TYPE r0, ri;
7603 rtx x0, xi;
7605 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7606 return false;
7608 x0 = CONST_VECTOR_ELT (x, 0);
7609 if (!CONST_DOUBLE_P (x0))
7610 return false;
7612 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7614 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7616 xi = CONST_VECTOR_ELT (x, i);
7617 if (!CONST_DOUBLE_P (xi))
7618 return false;
7620 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7621 if (!REAL_VALUES_EQUAL (r0, ri))
7622 return false;
7625 return aarch64_float_const_representable_p (x0);
7628 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise. */
7629 bool
7630 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7631 struct simd_immediate_info *info)
7633 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7634 matches = 1; \
7635 for (i = 0; i < idx; i += (STRIDE)) \
7636 if (!(TEST)) \
7637 matches = 0; \
7638 if (matches) \
7640 immtype = (CLASS); \
7641 elsize = (ELSIZE); \
7642 eshift = (SHIFT); \
7643 emvn = (NEG); \
7644 break; \
7647 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7648 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7649 unsigned char bytes[16];
7650 int immtype = -1, matches;
7651 unsigned int invmask = inverse ? 0xff : 0;
7652 int eshift, emvn;
7654 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7656 if (! (aarch64_simd_imm_zero_p (op, mode)
7657 || aarch64_vect_float_const_representable_p (op)))
7658 return false;
7660 if (info)
7662 info->value = CONST_VECTOR_ELT (op, 0);
7663 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7664 info->mvn = false;
7665 info->shift = 0;
7668 return true;
7671 /* Splat vector constant out into a byte vector. */
7672 for (i = 0; i < n_elts; i++)
7674 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7675 it must be laid out in the vector register in reverse order. */
7676 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7677 unsigned HOST_WIDE_INT elpart;
7678 unsigned int part, parts;
7680 if (CONST_INT_P (el))
7682 elpart = INTVAL (el);
7683 parts = 1;
7685 else if (GET_CODE (el) == CONST_DOUBLE)
7687 elpart = CONST_DOUBLE_LOW (el);
7688 parts = 2;
7690 else
7691 gcc_unreachable ();
7693 for (part = 0; part < parts; part++)
7695 unsigned int byte;
7696 for (byte = 0; byte < innersize; byte++)
7698 bytes[idx++] = (elpart & 0xff) ^ invmask;
7699 elpart >>= BITS_PER_UNIT;
7701 if (GET_CODE (el) == CONST_DOUBLE)
7702 elpart = CONST_DOUBLE_HIGH (el);
7706 /* Sanity check. */
7707 gcc_assert (idx == GET_MODE_SIZE (mode));
7711 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7712 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7714 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7715 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7717 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7718 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7720 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7721 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7723 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7725 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7727 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7728 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7730 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7731 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7733 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7734 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7736 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7737 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7739 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7741 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7743 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7744 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7746 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7747 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7749 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7750 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7752 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7753 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7755 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7757 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7758 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7760 while (0);
7762 if (immtype == -1)
7763 return false;
7765 if (info)
7767 info->element_width = elsize;
7768 info->mvn = emvn != 0;
7769 info->shift = eshift;
7771 unsigned HOST_WIDE_INT imm = 0;
7773 if (immtype >= 12 && immtype <= 15)
7774 info->msl = true;
7776 /* Un-invert bytes of recognized vector, if necessary. */
7777 if (invmask != 0)
7778 for (i = 0; i < idx; i++)
7779 bytes[i] ^= invmask;
7781 if (immtype == 17)
7783 /* FIXME: Broken on 32-bit H_W_I hosts. */
7784 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7786 for (i = 0; i < 8; i++)
7787 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7788 << (i * BITS_PER_UNIT);
7791 info->value = GEN_INT (imm);
7793 else
7795 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7796 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7798 /* Construct 'abcdefgh' because the assembler cannot handle
7799 generic constants. */
7800 if (info->mvn)
7801 imm = ~imm;
7802 imm = (imm >> info->shift) & 0xff;
7803 info->value = GEN_INT (imm);
7807 return true;
7808 #undef CHECK
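/* Two examples of constants the CHECKs above recognize (a sketch; the final
   instruction choice is made by the callers from *INFO):

     V16QI, every byte == 0x45             immtype 16, elsize 8, shift 0,
                                           roughly "movi v0.16b, 0x45"
     V4SI, every element == 0x00004500     immtype 1, elsize 32, shift 8,
                                           roughly "movi v0.4s, 0x45, lsl 8"  */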
7811 static bool
7812 aarch64_const_vec_all_same_int_p (rtx x,
7813 HOST_WIDE_INT minval,
7814 HOST_WIDE_INT maxval)
7816 HOST_WIDE_INT firstval;
7817 int count, i;
7819 if (GET_CODE (x) != CONST_VECTOR
7820 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
7821 return false;
7823 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
7824 if (firstval < minval || firstval > maxval)
7825 return false;
7827 count = CONST_VECTOR_NUNITS (x);
7828 for (i = 1; i < count; i++)
7829 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
7830 return false;
7832 return true;
7835 /* Check whether immediate shift constants are within range. */
7836 bool
7837 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7839 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7840 if (left)
7841 return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
7842 else
7843 return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
7846 /* Return true if X is a uniform vector where all elements
7847 are either the floating-point constant 0.0 or the
7848 integer constant 0. */
7849 bool
7850 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7852 return x == CONST0_RTX (mode);
7855 bool
7856 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7858 HOST_WIDE_INT imm = INTVAL (x);
7859 int i;
7861 for (i = 0; i < 8; i++)
7863 unsigned int byte = imm & 0xff;
7864 if (byte != 0xff && byte != 0)
7865 return false;
7866 imm >>= 8;
7869 return true;
7872 bool
7873 aarch64_mov_operand_p (rtx x,
7874 enum aarch64_symbol_context context,
7875 enum machine_mode mode)
7877 if (GET_CODE (x) == HIGH
7878 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7879 return true;
7881 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7882 return true;
7884 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7885 return true;
7887 return aarch64_classify_symbolic_expression (x, context)
7888 == SYMBOL_TINY_ABSOLUTE;
7891 /* Return a const_int vector of VAL. */
7893 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7895 int nunits = GET_MODE_NUNITS (mode);
7896 rtvec v = rtvec_alloc (nunits);
7897 int i;
7899 for (i=0; i < nunits; i++)
7900 RTVEC_ELT (v, i) = GEN_INT (val);
7902 return gen_rtx_CONST_VECTOR (mode, v);
7905 /* Check OP is a legal scalar immediate for the MOVI instruction. */
7907 bool
7908 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7910 enum machine_mode vmode;
7912 gcc_assert (!VECTOR_MODE_P (mode));
7913 vmode = aarch64_preferred_simd_mode (mode);
7914 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7915 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7918 /* Construct and return a PARALLEL RTX vector with elements numbering the
7919 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
7920 the vector - from the perspective of the architecture. This does not
7921 line up with GCC's perspective on lane numbers, so we end up with
7922 different masks depending on our target endianness. The diagram
7923 below may help. We must draw the distinction when building masks
7924 which select one half of the vector. An instruction selecting
7925 architectural low-lanes for a big-endian target must be described using
7926 a mask selecting GCC high-lanes.
7928 Big-Endian Little-Endian
7930 GCC 0 1 2 3 3 2 1 0
7931 | x | x | x | x | | x | x | x | x |
7932 Architecture 3 2 1 0 3 2 1 0
7934 Low Mask: { 2, 3 } { 0, 1 }
7935 High Mask: { 0, 1 } { 2, 3 }
7939 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7941 int nunits = GET_MODE_NUNITS (mode);
7942 rtvec v = rtvec_alloc (nunits / 2);
7943 int high_base = nunits / 2;
7944 int low_base = 0;
7945 int base;
7946 rtx t1;
7947 int i;
7949 if (BYTES_BIG_ENDIAN)
7950 base = high ? low_base : high_base;
7951 else
7952 base = high ? high_base : low_base;
7954 for (i = 0; i < nunits / 2; i++)
7955 RTVEC_ELT (v, i) = GEN_INT (base + i);
7957 t1 = gen_rtx_PARALLEL (mode, v);
7958 return t1;
7961 /* Check OP for validity as a PARALLEL RTX vector with elements
7962 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
7963 from the perspective of the architecture. See the diagram above
7964 aarch64_simd_vect_par_cnst_half for more details. */
7966 bool
7967 aarch64_simd_check_vect_par_cnst_half (rtx op, enum machine_mode mode,
7968 bool high)
7970 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
7971 HOST_WIDE_INT count_op = XVECLEN (op, 0);
7972 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
7973 int i = 0;
7975 if (!VECTOR_MODE_P (mode))
7976 return false;
7978 if (count_op != count_ideal)
7979 return false;
7981 for (i = 0; i < count_ideal; i++)
7983 rtx elt_op = XVECEXP (op, 0, i);
7984 rtx elt_ideal = XVECEXP (ideal, 0, i);
7986 if (!CONST_INT_P (elt_op)
7987 || INTVAL (elt_ideal) != INTVAL (elt_op))
7988 return false;
7990 return true;
7993 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
7994 HIGH (exclusive). */
7995 void
7996 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7998 HOST_WIDE_INT lane;
7999 gcc_assert (CONST_INT_P (operand));
8000 lane = INTVAL (operand);
8002 if (lane < low || lane >= high)
8003 error ("lane out of range");
8006 void
8007 aarch64_simd_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
8009 gcc_assert (CONST_INT_P (operand));
8010 HOST_WIDE_INT lane = INTVAL (operand);
8012 if (lane < low || lane >= high)
8013 error ("constant out of range");
8016 /* Emit code to reinterpret one AdvSIMD type as another,
8017 without altering bits. */
8018 void
8019 aarch64_simd_reinterpret (rtx dest, rtx src)
8021 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), src));
8024 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
8025 registers). */
8026 void
8027 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
8028 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
8029 rtx op1)
8031 rtx mem = gen_rtx_MEM (mode, destaddr);
8032 rtx tmp1 = gen_reg_rtx (mode);
8033 rtx tmp2 = gen_reg_rtx (mode);
8035 emit_insn (intfn (tmp1, op1, tmp2));
8037 emit_move_insn (mem, tmp1);
8038 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8039 emit_move_insn (mem, tmp2);
8042 /* Return TRUE if OP is a valid vector addressing mode. */
8043 bool
8044 aarch64_simd_mem_operand_p (rtx op)
8046 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8047 || REG_P (XEXP (op, 0)));
8050 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8051 not to early-clobber SRC registers in the process.
8053 We assume that the operands described by SRC and DEST represent a
8054 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
8055 number of components into which the copy has been decomposed. */
8056 void
8057 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8058 rtx *src, unsigned int count)
8060 unsigned int i;
8062 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8063 || REGNO (operands[0]) < REGNO (operands[1]))
8065 for (i = 0; i < count; i++)
8067 operands[2 * i] = dest[i];
8068 operands[2 * i + 1] = src[i];
8071 else
8073 for (i = 0; i < count; i++)
8075 operands[2 * i] = dest[count - i - 1];
8076 operands[2 * i + 1] = src[count - i - 1];
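/* Example (register numbers purely illustrative): splitting a copy of
   { V1, V2 } into { V2, V3 } must emit V3 <- V2 before V2 <- V1, otherwise
   the first move would clobber V2 while it is still needed as a source;
   the REGNO comparison above selects that reversed ordering.  */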
8081 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8082 one of VSTRUCT modes: OI, CI or XI. */
8084 aarch64_simd_attr_length_move (rtx insn)
8086 enum machine_mode mode;
8088 extract_insn_cached (insn);
8090 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8092 mode = GET_MODE (recog_data.operand[0]);
8093 switch (mode)
8095 case OImode:
8096 return 8;
8097 case CImode:
8098 return 12;
8099 case XImode:
8100 return 16;
8101 default:
8102 gcc_unreachable ();
8105 return 4;
8108 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8109 alignment of a vector to 128 bits. */
8110 static HOST_WIDE_INT
8111 aarch64_simd_vector_alignment (const_tree type)
8113 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8114 return MIN (align, 128);
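/* For example, a generic vector type declared as

     typedef int v8si __attribute__ ((vector_size (32)));

   is 256 bits wide, but its alignment is capped at 128 bits here, as
   AAPCS64 requires.  */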
8117 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8118 static bool
8119 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8121 if (is_packed)
8122 return false;
8124 /* We guarantee alignment for vectors up to 128-bits. */
8125 if (tree_int_cst_compare (TYPE_SIZE (type),
8126 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8127 return false;
8129 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8130 return true;
8133 /* If VALS is a vector constant that can be loaded into a register
8134 using DUP, generate instructions to do so and return an RTX to
8135 assign to the register. Otherwise return NULL_RTX. */
8136 static rtx
8137 aarch64_simd_dup_constant (rtx vals)
8139 enum machine_mode mode = GET_MODE (vals);
8140 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8141 int n_elts = GET_MODE_NUNITS (mode);
8142 bool all_same = true;
8143 rtx x;
8144 int i;
8146 if (GET_CODE (vals) != CONST_VECTOR)
8147 return NULL_RTX;
8149 for (i = 1; i < n_elts; ++i)
8151 x = CONST_VECTOR_ELT (vals, i);
8152 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8153 all_same = false;
8156 if (!all_same)
8157 return NULL_RTX;
8159 /* We can load this constant by using DUP and a constant in a
8160 single general-purpose register. This will be cheaper than a vector
8161 load. */
8162 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8163 return gen_rtx_VEC_DUPLICATE (mode, x);
8167 /* Generate code to load VALS, which is a PARALLEL containing only
8168 constants (for vec_init) or CONST_VECTOR, efficiently into a
8169 register. Returns an RTX to copy into the register, or NULL_RTX
8170 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8171 static rtx
8172 aarch64_simd_make_constant (rtx vals)
8174 enum machine_mode mode = GET_MODE (vals);
8175 rtx const_dup;
8176 rtx const_vec = NULL_RTX;
8177 int n_elts = GET_MODE_NUNITS (mode);
8178 int n_const = 0;
8179 int i;
8181 if (GET_CODE (vals) == CONST_VECTOR)
8182 const_vec = vals;
8183 else if (GET_CODE (vals) == PARALLEL)
8185 /* A CONST_VECTOR must contain only CONST_INTs and
8186 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8187 Only store valid constants in a CONST_VECTOR. */
8188 for (i = 0; i < n_elts; ++i)
8190 rtx x = XVECEXP (vals, 0, i);
8191 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8192 n_const++;
8194 if (n_const == n_elts)
8195 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8197 else
8198 gcc_unreachable ();
8200 if (const_vec != NULL_RTX
8201 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8202 /* Load using MOVI/MVNI. */
8203 return const_vec;
8204 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8205 /* Loaded using DUP. */
8206 return const_dup;
8207 else if (const_vec != NULL_RTX)
8208 /* Load from constant pool. We can not take advantage of single-cycle
8209 LD1 because we need a PC-relative addressing mode. */
8210 return const_vec;
8211 else
8212 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8213 We can not construct an initializer. */
8214 return NULL_RTX;
8217 void
8218 aarch64_expand_vector_init (rtx target, rtx vals)
8220 enum machine_mode mode = GET_MODE (target);
8221 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8222 int n_elts = GET_MODE_NUNITS (mode);
8223 int n_var = 0, one_var = -1;
8224 bool all_same = true;
8225 rtx x, mem;
8226 int i;
8228 x = XVECEXP (vals, 0, 0);
8229 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8230 n_var = 1, one_var = 0;
8232 for (i = 1; i < n_elts; ++i)
8234 x = XVECEXP (vals, 0, i);
8235 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8236 ++n_var, one_var = i;
8238 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8239 all_same = false;
8242 if (n_var == 0)
8244 rtx constant = aarch64_simd_make_constant (vals);
8245 if (constant != NULL_RTX)
8247 emit_move_insn (target, constant);
8248 return;
8252 /* Splat a single non-constant element if we can. */
8253 if (all_same)
8255 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8256 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8257 return;
8260 /* One field is non-constant. Load constant then overwrite varying
8261 field. This is more efficient than using the stack. */
8262 if (n_var == 1)
8264 rtx copy = copy_rtx (vals);
8265 rtx index = GEN_INT (one_var);
8266 enum insn_code icode;
8268 /* Load constant part of vector, substitute neighboring value for
8269 varying element. */
8270 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8271 aarch64_expand_vector_init (target, copy);
8273 /* Insert variable. */
8274 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8275 icode = optab_handler (vec_set_optab, mode);
8276 gcc_assert (icode != CODE_FOR_nothing);
8277 emit_insn (GEN_FCN (icode) (target, x, index));
8278 return;
8281 /* Construct the vector in memory one field at a time
8282 and load the whole vector. */
8283 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8284 for (i = 0; i < n_elts; i++)
8285 emit_move_insn (adjust_address_nv (mem, inner_mode,
8286 i * GET_MODE_SIZE (inner_mode)),
8287 XVECEXP (vals, 0, i));
8288 emit_move_insn (target, mem);
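/* Illustrative strategy selection for a V4SI initializer (a sketch; the
   instructions shown are only indicative):

     { 1, 1, 1, 1 }   all constant          -> MOVI or constant-pool load
     { x, x, x, x }   all same, variable    -> dup  v0.4s, w0
     { 1, 2, 3, x }   one variable element  -> build { 1, 2, 3, 3 }, then
                                               ins  v0.s[3], w0
     { x, y, z, w }   general case          -> store the elements to a stack
                                               temporary and load the whole
                                               vector  */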
8292 static unsigned HOST_WIDE_INT
8293 aarch64_shift_truncation_mask (enum machine_mode mode)
8295 return
8296 (aarch64_vector_mode_supported_p (mode)
8297 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8300 #ifndef TLS_SECTION_ASM_FLAG
8301 #define TLS_SECTION_ASM_FLAG 'T'
8302 #endif
8304 void
8305 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8306 tree decl ATTRIBUTE_UNUSED)
8308 char flagchars[10], *f = flagchars;
8310 /* If we have already declared this section, we can use an
8311 abbreviated form to switch back to it -- unless this section is
8312 part of a COMDAT group, in which case GAS requires the full
8313 declaration every time. */
8314 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8315 && (flags & SECTION_DECLARED))
8317 fprintf (asm_out_file, "\t.section\t%s\n", name);
8318 return;
8321 if (!(flags & SECTION_DEBUG))
8322 *f++ = 'a';
8323 if (flags & SECTION_WRITE)
8324 *f++ = 'w';
8325 if (flags & SECTION_CODE)
8326 *f++ = 'x';
8327 if (flags & SECTION_SMALL)
8328 *f++ = 's';
8329 if (flags & SECTION_MERGE)
8330 *f++ = 'M';
8331 if (flags & SECTION_STRINGS)
8332 *f++ = 'S';
8333 if (flags & SECTION_TLS)
8334 *f++ = TLS_SECTION_ASM_FLAG;
8335 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8336 *f++ = 'G';
8337 *f = '\0';
8339 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8341 if (!(flags & SECTION_NOTYPE))
8343 const char *type;
8344 const char *format;
8346 if (flags & SECTION_BSS)
8347 type = "nobits";
8348 else
8349 type = "progbits";
8351 #ifdef TYPE_OPERAND_FMT
8352 format = "," TYPE_OPERAND_FMT;
8353 #else
8354 format = ",@%s";
8355 #endif
8357 fprintf (asm_out_file, format, type);
8359 if (flags & SECTION_ENTSIZE)
8360 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8361 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8363 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8364 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8365 else
8366 fprintf (asm_out_file, ",%s,comdat",
8367 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8371 putc ('\n', asm_out_file);
8374 /* Select a format to encode pointers in exception handling data. */
8376 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8378 int type;
8379 switch (aarch64_cmodel)
8381 case AARCH64_CMODEL_TINY:
8382 case AARCH64_CMODEL_TINY_PIC:
8383 case AARCH64_CMODEL_SMALL:
8384 case AARCH64_CMODEL_SMALL_PIC:
8385 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8386 for everything. */
8387 type = DW_EH_PE_sdata4;
8388 break;
8389 default:
8390 /* No assumptions here. 8-byte relocs required. */
8391 type = DW_EH_PE_sdata8;
8392 break;
8394 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
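/* For example, under the small code model a global symbol gets the
   encoding DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
   (0x80 | 0x10 | 0x0b == 0x9b), while a local one gets 0x1b.  */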
8397 /* Emit load exclusive. */
8399 static void
8400 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8401 rtx mem, rtx model_rtx)
8403 rtx (*gen) (rtx, rtx, rtx);
8405 switch (mode)
8407 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8408 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8409 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8410 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8411 default:
8412 gcc_unreachable ();
8415 emit_insn (gen (rval, mem, model_rtx));
8418 /* Emit store exclusive. */
8420 static void
8421 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8422 rtx rval, rtx mem, rtx model_rtx)
8424 rtx (*gen) (rtx, rtx, rtx, rtx);
8426 switch (mode)
8428 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8429 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8430 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8431 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8432 default:
8433 gcc_unreachable ();
8436 emit_insn (gen (bval, rval, mem, model_rtx));
8439 /* Mark the previous jump instruction as unlikely. */
8441 static void
8442 aarch64_emit_unlikely_jump (rtx insn)
8444 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8446 insn = emit_jump_insn (insn);
8447 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8450 /* Expand a compare and swap pattern. */
8452 void
8453 aarch64_expand_compare_and_swap (rtx operands[])
8455 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8456 enum machine_mode mode, cmp_mode;
8457 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8459 bval = operands[0];
8460 rval = operands[1];
8461 mem = operands[2];
8462 oldval = operands[3];
8463 newval = operands[4];
8464 is_weak = operands[5];
8465 mod_s = operands[6];
8466 mod_f = operands[7];
8467 mode = GET_MODE (mem);
8468 cmp_mode = mode;
8470 /* Normally the succ memory model must be stronger than fail, but in the
8471 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8472 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8474 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8475 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8476 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8478 switch (mode)
8480 case QImode:
8481 case HImode:
8482 /* For short modes, we're going to perform the comparison in SImode,
8483 so do the zero-extension now. */
8484 cmp_mode = SImode;
8485 rval = gen_reg_rtx (SImode);
8486 oldval = convert_modes (SImode, mode, oldval, true);
8487 /* Fall through. */
8489 case SImode:
8490 case DImode:
8491 /* Force the value into a register if needed. */
8492 if (!aarch64_plus_operand (oldval, mode))
8493 oldval = force_reg (cmp_mode, oldval);
8494 break;
8496 default:
8497 gcc_unreachable ();
8500 switch (mode)
8502 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8503 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8504 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8505 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8506 default:
8507 gcc_unreachable ();
8510 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8512 if (mode == QImode || mode == HImode)
8513 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8515 x = gen_rtx_REG (CCmode, CC_REGNUM);
8516 x = gen_rtx_EQ (SImode, x, const0_rtx);
8517 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8520 /* Split a compare and swap pattern. */
8522 void
8523 aarch64_split_compare_and_swap (rtx operands[])
8525 rtx rval, mem, oldval, newval, scratch;
8526 enum machine_mode mode;
8527 bool is_weak;
8528 rtx label1, label2, x, cond;
8530 rval = operands[0];
8531 mem = operands[1];
8532 oldval = operands[2];
8533 newval = operands[3];
8534 is_weak = (operands[4] != const0_rtx);
8535 scratch = operands[7];
8536 mode = GET_MODE (mem);
8538 label1 = NULL_RTX;
8539 if (!is_weak)
8541 label1 = gen_label_rtx ();
8542 emit_label (label1);
8544 label2 = gen_label_rtx ();
8546 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8548 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8549 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8550 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8551 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8552 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8554 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8556 if (!is_weak)
8558 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8559 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8560 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8561 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8563 else
8565 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8566 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8567 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8570 emit_label (label2);
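/* Schematically (an illustrative sketch, not the exact output; the
   acquire/release variants of the exclusive pair depend on the memory
   model), the strong compare-and-swap split above yields roughly:

     .L1:  ldaxr  rval, [mem]
           cmp    rval, oldval
           b.ne   .L2
           stlxr  scratch, newval, [mem]
           cbnz   scratch, .L1
     .L2:                                                              */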
8573 /* Split an atomic operation. */
8575 void
8576 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8577 rtx value, rtx model_rtx, rtx cond)
8579 enum machine_mode mode = GET_MODE (mem);
8580 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8581 rtx label, x;
8583 label = gen_label_rtx ();
8584 emit_label (label);
8586 if (new_out)
8587 new_out = gen_lowpart (wmode, new_out);
8588 if (old_out)
8589 old_out = gen_lowpart (wmode, old_out);
8590 else
8591 old_out = new_out;
8592 value = simplify_gen_subreg (wmode, value, mode, 0);
8594 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8596 switch (code)
8598 case SET:
8599 new_out = value;
8600 break;
8602 case NOT:
8603 x = gen_rtx_AND (wmode, old_out, value);
8604 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8605 x = gen_rtx_NOT (wmode, new_out);
8606 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8607 break;
8609 case MINUS:
8610 if (CONST_INT_P (value))
8612 value = GEN_INT (-INTVAL (value));
8613 code = PLUS;
8615 /* Fall through. */
8617 default:
8618 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8619 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8620 break;
8623 aarch64_emit_store_exclusive (mode, cond, mem,
8624 gen_lowpart (mode, new_out), model_rtx);
8626 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8627 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8628 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8629 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
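/* For illustration (schematic only), splitting an atomic fetch-and-add
   produces a loop of roughly this shape:

     .L1:  ldxr   old, [mem]
           add    new, old, value
           stxr   cond, new, [mem]
           cbnz   cond, .L1

   with acquire/release forms of the exclusive pair selected according
   to MODEL_RTX.  */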
8632 static void
8633 aarch64_print_extension (void)
8635 const struct aarch64_option_extension *opt = NULL;
8637 for (opt = all_extensions; opt->name != NULL; opt++)
8638 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8639 asm_fprintf (asm_out_file, "+%s", opt->name);
8641 asm_fprintf (asm_out_file, "\n");
8644 static void
8645 aarch64_start_file (void)
8647 if (selected_arch)
8649 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8650 aarch64_print_extension ();
8652 else if (selected_cpu)
8654 const char *truncated_name
8655 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8656 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8657 aarch64_print_extension ();
8659 default_file_start();
8662 /* Target hook for c_mode_for_suffix. */
8663 static enum machine_mode
8664 aarch64_c_mode_for_suffix (char suffix)
8666 if (suffix == 'q')
8667 return TFmode;
8669 return VOIDmode;
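/* For example, this is what gives a literal written with a 'q' suffix
   (such as 1.5q) TFmode, i.e. the 128-bit IEEE quad format.  */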
8672 /* We can only represent floating point constants which will fit in
8673 "quarter-precision" values. These values are characterised by
8674 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by the formula:
8677 (-1)^s * (n/16) * 2^r
8679 Where:
8680 's' is the sign bit.
8681 'n' is an integer in the range 16 <= n <= 31.
8682 'r' is an integer in the range -3 <= r <= 4. */
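/* For illustration: 1.0 = (16/16) * 2^0 and 0.25 = (16/16) * 2^-2 are
   representable; the smallest positive value is (16/16) * 2^-3 = 0.125
   and the largest is (31/16) * 2^4 = 31.0.  A value such as 0.1 is not
   representable.  */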
8684 /* Return true iff X can be represented by a quarter-precision
8685 floating point immediate operand. Note, we cannot represent 0.0. */
8686 bool
8687 aarch64_float_const_representable_p (rtx x)
8689 /* This represents our current view of how many bits
8690 make up the mantissa. */
8691 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8692 int exponent;
8693 unsigned HOST_WIDE_INT mantissa, mask;
8694 REAL_VALUE_TYPE r, m;
8695 bool fail;
8697 if (!CONST_DOUBLE_P (x))
8698 return false;
8700 if (GET_MODE (x) == VOIDmode)
8701 return false;
8703 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8705 /* We cannot represent infinities, NaNs or +/-zero. We won't
8706 know if we have +zero until we analyse the mantissa, but we
8707 can reject the other invalid values. */
8708 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8709 || REAL_VALUE_MINUS_ZERO (r))
8710 return false;
8712 /* Extract exponent. */
8713 r = real_value_abs (&r);
8714 exponent = REAL_EXP (&r);
8716 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
8717 highest (sign) bit, with a fixed binary point at bit point_pos.
8718 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
8719 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8720 bits for the mantissa, this can fail (low bits will be lost). */
8721 real_ldexp (&m, &r, point_pos - exponent);
8722 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8724 /* If the low part of the mantissa has bits set we cannot represent
8725 the value. */
8726 if (w.elt (0) != 0)
8727 return false;
8728 /* We have rejected the lower HOST_WIDE_INT, so update our
8729 understanding of how many bits lie in the mantissa and
8730 look only at the high HOST_WIDE_INT. */
8731 mantissa = w.elt (1);
8732 point_pos -= HOST_BITS_PER_WIDE_INT;
8734 /* We can only represent values with a mantissa of the form 1.xxxx. */
8735 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8736 if ((mantissa & mask) != 0)
8737 return false;
8739 /* Having filtered unrepresentable values, we may now remove all
8740 but the highest 5 bits. */
8741 mantissa >>= point_pos - 5;
8743 /* We cannot represent the value 0.0, so reject it. This is handled
8744 elsewhere. */
8745 if (mantissa == 0)
8746 return false;
8748 /* Then, as bit 4 is always set, we can mask it off, leaving
8749 the mantissa in the range [0, 15]. */
8750 mantissa &= ~(1 << 4);
8751 gcc_assert (mantissa <= 15);
8753 /* GCC internally does not use IEEE754-like encoding (where normalized
8754 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
8755 Our mantissa values are shifted 4 places to the left relative to
8756 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8757 by 5 places to correct for GCC's representation. */
8758 exponent = 5 - exponent;
8760 return (exponent >= 0 && exponent <= 7);
8763 char*
8764 aarch64_output_simd_mov_immediate (rtx const_vector,
8765 enum machine_mode mode,
8766 unsigned width)
8768 bool is_valid;
8769 static char templ[40];
8770 const char *mnemonic;
8771 const char *shift_op;
8772 unsigned int lane_count = 0;
8773 char element_char;
8775 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8777 /* This will return true to show const_vector is legal for use as
8778 an AdvSIMD MOVI (or, implicitly, MVNI) immediate. It will
8779 also update INFO to show how the immediate should be generated. */
8780 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8781 gcc_assert (is_valid);
8783 element_char = sizetochar (info.element_width);
8784 lane_count = width / info.element_width;
8786 mode = GET_MODE_INNER (mode);
8787 if (mode == SFmode || mode == DFmode)
8789 gcc_assert (info.shift == 0 && ! info.mvn);
8790 if (aarch64_float_const_zero_rtx_p (info.value))
8791 info.value = GEN_INT (0);
8792 else
8794 #define buf_size 20
8795 REAL_VALUE_TYPE r;
8796 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8797 char float_buf[buf_size] = {'\0'};
8798 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8799 #undef buf_size
8801 if (lane_count == 1)
8802 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8803 else
8804 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8805 lane_count, element_char, float_buf);
8806 return templ;
8810 mnemonic = info.mvn ? "mvni" : "movi";
8811 shift_op = info.msl ? "msl" : "lsl";
8813 if (lane_count == 1)
8814 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8815 mnemonic, UINTVAL (info.value));
8816 else if (info.shift)
8817 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8818 ", %s %d", mnemonic, lane_count, element_char,
8819 UINTVAL (info.value), shift_op, info.shift);
8820 else
8821 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8822 mnemonic, lane_count, element_char, UINTVAL (info.value));
8823 return templ;
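/* For illustration (assumed shapes, not verbatim compiler output), the
   templates above produce instructions along the lines of
       movi  v0.4s, 0x12, lsl 8
       mvni  v0.8h, 0x5
       fmov  v0.2d, 1.0e+0
   depending on the immediate value and mode.  */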
8826 char*
8827 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8828 enum machine_mode mode)
8830 enum machine_mode vmode;
8832 gcc_assert (!VECTOR_MODE_P (mode));
8833 vmode = aarch64_simd_container_mode (mode, 64);
8834 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8835 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8838 /* Split operands into moves from op[1] + op[2] into op[0]. */
8840 void
8841 aarch64_split_combinev16qi (rtx operands[3])
8843 unsigned int dest = REGNO (operands[0]);
8844 unsigned int src1 = REGNO (operands[1]);
8845 unsigned int src2 = REGNO (operands[2]);
8846 enum machine_mode halfmode = GET_MODE (operands[1]);
8847 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8848 rtx destlo, desthi;
8850 gcc_assert (halfmode == V16QImode);
8852 if (src1 == dest && src2 == dest + halfregs)
8854 /* No-op move. Can't split to nothing; emit something. */
8855 emit_note (NOTE_INSN_DELETED);
8856 return;
8859 /* Preserve register attributes for variable tracking. */
8860 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8861 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8862 GET_MODE_SIZE (halfmode));
8864 /* Special case of reversed high/low parts. */
8865 if (reg_overlap_mentioned_p (operands[2], destlo)
8866 && reg_overlap_mentioned_p (operands[1], desthi))
8868 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8869 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8870 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8872 else if (!reg_overlap_mentioned_p (operands[2], destlo))
8874 /* Try to avoid unnecessary moves if part of the result
8875 is in the right place already. */
8876 if (src1 != dest)
8877 emit_move_insn (destlo, operands[1]);
8878 if (src2 != dest + halfregs)
8879 emit_move_insn (desthi, operands[2]);
8881 else
8883 if (src2 != dest + halfregs)
8884 emit_move_insn (desthi, operands[2]);
8885 if (src1 != dest)
8886 emit_move_insn (destlo, operands[1]);
8890 /* vec_perm support. */
8892 #define MAX_VECT_LEN 16
8894 struct expand_vec_perm_d
8896 rtx target, op0, op1;
8897 unsigned char perm[MAX_VECT_LEN];
8898 enum machine_mode vmode;
8899 unsigned char nelt;
8900 bool one_vector_p;
8901 bool testing_p;
8904 /* Generate a variable permutation. */
8906 static void
8907 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8909 enum machine_mode vmode = GET_MODE (target);
8910 bool one_vector_p = rtx_equal_p (op0, op1);
8912 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8913 gcc_checking_assert (GET_MODE (op0) == vmode);
8914 gcc_checking_assert (GET_MODE (op1) == vmode);
8915 gcc_checking_assert (GET_MODE (sel) == vmode);
8916 gcc_checking_assert (TARGET_SIMD);
8918 if (one_vector_p)
8920 if (vmode == V8QImode)
8922 /* Expand the argument to a V16QI mode by duplicating it. */
8923 rtx pair = gen_reg_rtx (V16QImode);
8924 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8925 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8927 else
8929 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8932 else
8934 rtx pair;
8936 if (vmode == V8QImode)
8938 pair = gen_reg_rtx (V16QImode);
8939 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8940 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8942 else
8944 pair = gen_reg_rtx (OImode);
8945 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8946 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8951 void
8952 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8954 enum machine_mode vmode = GET_MODE (target);
8955 unsigned int nelt = GET_MODE_NUNITS (vmode);
8956 bool one_vector_p = rtx_equal_p (op0, op1);
8957 rtx mask;
8959 /* The TBL instruction does not use a modulo index, so we must take care
8960 of that ourselves. */
8961 mask = aarch64_simd_gen_const_vector_dup (vmode,
8962 one_vector_p ? nelt - 1 : 2 * nelt - 1);
8963 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8965 /* For big-endian, we also need to reverse the index within the vector
8966 (but not which vector). */
8967 if (BYTES_BIG_ENDIAN)
8969 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
8970 if (!one_vector_p)
8971 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
8972 sel = expand_simple_binop (vmode, XOR, sel, mask,
8973 NULL, 0, OPTAB_LIB_WIDEN);
8975 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
8978 /* Recognize patterns suitable for the TRN instructions. */
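/* For example (little-endian element indices into the concatenation of
   the two inputs): on V4SI, TRN1 corresponds to the selector
   { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */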
8979 static bool
8980 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8982 unsigned int i, odd, mask, nelt = d->nelt;
8983 rtx out, in0, in1, x;
8984 rtx (*gen) (rtx, rtx, rtx);
8985 enum machine_mode vmode = d->vmode;
8987 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8988 return false;
8990 /* Note that these are little-endian tests.
8991 We correct for big-endian later. */
8992 if (d->perm[0] == 0)
8993 odd = 0;
8994 else if (d->perm[0] == 1)
8995 odd = 1;
8996 else
8997 return false;
8998 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9000 for (i = 0; i < nelt; i += 2)
9002 if (d->perm[i] != i + odd)
9003 return false;
9004 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9005 return false;
9008 /* Success! */
9009 if (d->testing_p)
9010 return true;
9012 in0 = d->op0;
9013 in1 = d->op1;
9014 if (BYTES_BIG_ENDIAN)
9016 x = in0, in0 = in1, in1 = x;
9017 odd = !odd;
9019 out = d->target;
9021 if (odd)
9023 switch (vmode)
9025 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9026 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9027 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9028 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9029 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9030 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9031 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9032 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9033 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9034 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9035 default:
9036 return false;
9039 else
9041 switch (vmode)
9043 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9044 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9045 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9046 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9047 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9048 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9049 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9050 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9051 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9052 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9053 default:
9054 return false;
9058 emit_insn (gen (out, in0, in1));
9059 return true;
9062 /* Recognize patterns suitable for the UZP instructions. */
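/* For example: on V4SI, UZP1 corresponds to the selector { 0, 2, 4, 6 }
   (the even elements of the concatenation) and UZP2 to { 1, 3, 5, 7 }.  */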
9063 static bool
9064 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9066 unsigned int i, odd, mask, nelt = d->nelt;
9067 rtx out, in0, in1, x;
9068 rtx (*gen) (rtx, rtx, rtx);
9069 enum machine_mode vmode = d->vmode;
9071 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9072 return false;
9074 /* Note that these are little-endian tests.
9075 We correct for big-endian later. */
9076 if (d->perm[0] == 0)
9077 odd = 0;
9078 else if (d->perm[0] == 1)
9079 odd = 1;
9080 else
9081 return false;
9082 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9084 for (i = 0; i < nelt; i++)
9086 unsigned elt = (i * 2 + odd) & mask;
9087 if (d->perm[i] != elt)
9088 return false;
9091 /* Success! */
9092 if (d->testing_p)
9093 return true;
9095 in0 = d->op0;
9096 in1 = d->op1;
9097 if (BYTES_BIG_ENDIAN)
9099 x = in0, in0 = in1, in1 = x;
9100 odd = !odd;
9102 out = d->target;
9104 if (odd)
9106 switch (vmode)
9108 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9109 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9110 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9111 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9112 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9113 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9114 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9115 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9116 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9117 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9118 default:
9119 return false;
9122 else
9124 switch (vmode)
9126 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9127 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9128 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9129 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9130 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9131 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9132 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9133 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9134 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9135 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9136 default:
9137 return false;
9141 emit_insn (gen (out, in0, in1));
9142 return true;
9145 /* Recognize patterns suitable for the ZIP instructions. */
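/* For example: on V4SI, ZIP1 corresponds to the selector { 0, 4, 1, 5 }
   (interleaving the low halves of the inputs) and ZIP2 to { 2, 6, 3, 7 }.  */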
9146 static bool
9147 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9149 unsigned int i, high, mask, nelt = d->nelt;
9150 rtx out, in0, in1, x;
9151 rtx (*gen) (rtx, rtx, rtx);
9152 enum machine_mode vmode = d->vmode;
9154 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9155 return false;
9157 /* Note that these are little-endian tests.
9158 We correct for big-endian later. */
9159 high = nelt / 2;
9160 if (d->perm[0] == high)
9161 /* Do Nothing. */
9163 else if (d->perm[0] == 0)
9164 high = 0;
9165 else
9166 return false;
9167 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9169 for (i = 0; i < nelt / 2; i++)
9171 unsigned elt = (i + high) & mask;
9172 if (d->perm[i * 2] != elt)
9173 return false;
9174 elt = (elt + nelt) & mask;
9175 if (d->perm[i * 2 + 1] != elt)
9176 return false;
9179 /* Success! */
9180 if (d->testing_p)
9181 return true;
9183 in0 = d->op0;
9184 in1 = d->op1;
9185 if (BYTES_BIG_ENDIAN)
9187 x = in0, in0 = in1, in1 = x;
9188 high = !high;
9190 out = d->target;
9192 if (high)
9194 switch (vmode)
9196 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9197 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9198 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9199 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9200 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9201 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9202 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9203 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9204 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9205 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9206 default:
9207 return false;
9210 else
9212 switch (vmode)
9214 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9215 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9216 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9217 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9218 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9219 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9220 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9221 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9222 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9223 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9224 default:
9225 return false;
9229 emit_insn (gen (out, in0, in1));
9230 return true;
9233 /* Recognize patterns for the EXT insn. */
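/* For example: on V4SI a selector of { 1, 2, 3, 4 } (consecutive elements
   starting at index 1 of the two-vector concatenation) maps to EXT with
   an element offset of 1.  */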
9235 static bool
9236 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9238 unsigned int i, nelt = d->nelt;
9239 rtx (*gen) (rtx, rtx, rtx, rtx);
9240 rtx offset;
9242 unsigned int location = d->perm[0]; /* Always < nelt. */
9244 /* Check if the extracted indices are increasing by one. */
9245 for (i = 1; i < nelt; i++)
9247 unsigned int required = location + i;
9248 if (d->one_vector_p)
9250 /* We'll pass the same vector in twice, so allow indices to wrap. */
9251 required &= (nelt - 1);
9253 if (d->perm[i] != required)
9254 return false;
9257 switch (d->vmode)
9259 case V16QImode: gen = gen_aarch64_extv16qi; break;
9260 case V8QImode: gen = gen_aarch64_extv8qi; break;
9261 case V4HImode: gen = gen_aarch64_extv4hi; break;
9262 case V8HImode: gen = gen_aarch64_extv8hi; break;
9263 case V2SImode: gen = gen_aarch64_extv2si; break;
9264 case V4SImode: gen = gen_aarch64_extv4si; break;
9265 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9266 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9267 case V2DImode: gen = gen_aarch64_extv2di; break;
9268 case V2DFmode: gen = gen_aarch64_extv2df; break;
9269 default:
9270 return false;
9273 /* Success! */
9274 if (d->testing_p)
9275 return true;
9277 /* The case where (location == 0) is a no-op for both big- and little-endian,
9278 and is removed by the mid-end at optimization levels -O1 and higher. */
9280 if (BYTES_BIG_ENDIAN && (location != 0))
9282 /* After setup, we want the high elements of the first vector (stored
9283 at the LSB end of the register), and the low elements of the second
9284 vector (stored at the MSB end of the register). So swap. */
9285 rtx temp = d->op0;
9286 d->op0 = d->op1;
9287 d->op1 = temp;
9288 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9289 location = nelt - location;
9292 offset = GEN_INT (location);
9293 emit_insn (gen (d->target, d->op0, d->op1, offset));
9294 return true;
9297 /* Recognize patterns for the REV insns. */
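/* For example: on V4SI the selector { 1, 0, 3, 2 } (diff == 1) maps to
   REV64, and on V8QI the selector { 3, 2, 1, 0, 7, 6, 5, 4 } (diff == 3)
   maps to REV32.  */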
9299 static bool
9300 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9302 unsigned int i, j, diff, nelt = d->nelt;
9303 rtx (*gen) (rtx, rtx);
9305 if (!d->one_vector_p)
9306 return false;
9308 diff = d->perm[0];
9309 switch (diff)
9311 case 7:
9312 switch (d->vmode)
9314 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9315 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9316 default:
9317 return false;
9319 break;
9320 case 3:
9321 switch (d->vmode)
9323 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9324 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9325 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9326 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9327 default:
9328 return false;
9330 break;
9331 case 1:
9332 switch (d->vmode)
9334 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9335 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9336 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9337 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9338 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9339 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9340 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9341 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9342 default:
9343 return false;
9345 break;
9346 default:
9347 return false;
9350 for (i = 0; i < nelt ; i += diff + 1)
9351 for (j = 0; j <= diff; j += 1)
9353 /* This is guaranteed to be true, as diff is one of 7, 3 or 1,
9354 so there are always enough elements in the vector to satisfy
9355 this index. A permutation mask with any other value of diff
9356 would mean something had gone wrong by the time we get
9357 here. */
9358 gcc_assert (i + j < nelt);
9359 if (d->perm[i + j] != i + diff - j)
9360 return false;
9363 /* Success! */
9364 if (d->testing_p)
9365 return true;
9367 emit_insn (gen (d->target, d->op0));
9368 return true;
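/* Recognize broadcast permutations suitable for the DUP instruction,
   e.g. the V4SI selector { 2, 2, 2, 2 } maps to a DUP from lane 2.  */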
9371 static bool
9372 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9374 rtx (*gen) (rtx, rtx, rtx);
9375 rtx out = d->target;
9376 rtx in0;
9377 enum machine_mode vmode = d->vmode;
9378 unsigned int i, elt, nelt = d->nelt;
9379 rtx lane;
9381 elt = d->perm[0];
9382 for (i = 1; i < nelt; i++)
9384 if (elt != d->perm[i])
9385 return false;
9388 /* The generic preparation in aarch64_expand_vec_perm_const_1
9389 swaps the operand order and the permute indices if it finds
9390 d->perm[0] to be in the second operand. Thus, we can always
9391 use d->op0 and need not do any extra arithmetic to get the
9392 correct lane number. */
9393 in0 = d->op0;
9394 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9396 switch (vmode)
9398 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9399 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9400 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9401 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9402 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9403 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9404 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9405 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9406 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9407 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9408 default:
9409 return false;
9412 emit_insn (gen (out, in0, lane));
9413 return true;
9416 static bool
9417 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9419 rtx rperm[MAX_VECT_LEN], sel;
9420 enum machine_mode vmode = d->vmode;
9421 unsigned int i, nelt = d->nelt;
9423 if (d->testing_p)
9424 return true;
9426 /* Generic code will try constant permutation twice. Once with the
9427 original mode and again with the elements lowered to QImode.
9428 So wait and don't do the selector expansion ourselves. */
9429 if (vmode != V8QImode && vmode != V16QImode)
9430 return false;
9432 for (i = 0; i < nelt; ++i)
9434 int nunits = GET_MODE_NUNITS (vmode);
9436 /* If big-endian and two vectors, we end up with a weird mixed-endian
9437 mode on NEON. Reverse the index within each word but not the word
9438 itself. */
9439 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9440 : d->perm[i]);
9442 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9443 sel = force_reg (vmode, sel);
9445 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9446 return true;
9449 static bool
9450 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9452 /* The pattern matching functions above are written to look for a small
9453 number to begin the sequence (0, 1, N/2). If we begin with an index
9454 from the second operand, we can swap the operands. */
9455 if (d->perm[0] >= d->nelt)
9457 unsigned i, nelt = d->nelt;
9458 rtx x;
9460 gcc_assert (nelt == (nelt & -nelt));
9461 for (i = 0; i < nelt; ++i)
9462 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9464 x = d->op0;
9465 d->op0 = d->op1;
9466 d->op1 = x;
9469 if (TARGET_SIMD)
9471 if (aarch64_evpc_rev (d))
9472 return true;
9473 else if (aarch64_evpc_ext (d))
9474 return true;
9475 else if (aarch64_evpc_dup (d))
9476 return true;
9477 else if (aarch64_evpc_zip (d))
9478 return true;
9479 else if (aarch64_evpc_uzp (d))
9480 return true;
9481 else if (aarch64_evpc_trn (d))
9482 return true;
9483 return aarch64_evpc_tbl (d);
9485 return false;
9488 /* Expand a vec_perm_const pattern. */
9490 bool
9491 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9493 struct expand_vec_perm_d d;
9494 int i, nelt, which;
9496 d.target = target;
9497 d.op0 = op0;
9498 d.op1 = op1;
9500 d.vmode = GET_MODE (target);
9501 gcc_assert (VECTOR_MODE_P (d.vmode));
9502 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9503 d.testing_p = false;
9505 for (i = which = 0; i < nelt; ++i)
9507 rtx e = XVECEXP (sel, 0, i);
9508 int ei = INTVAL (e) & (2 * nelt - 1);
9509 which |= (ei < nelt ? 1 : 2);
9510 d.perm[i] = ei;
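/* Bit 0 of WHICH records that some element came from the first input,
   bit 1 that some element came from the second.  */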
9513 switch (which)
9515 default:
9516 gcc_unreachable ();
9518 case 3:
9519 d.one_vector_p = false;
9520 if (!rtx_equal_p (op0, op1))
9521 break;
9523 /* The elements of PERM do not suggest that only the first operand
9524 is used, but both operands are identical. Allow easier matching
9525 of the permutation by folding the permutation into the single
9526 input vector. */
9527 /* Fall Through. */
9528 case 2:
9529 for (i = 0; i < nelt; ++i)
9530 d.perm[i] &= nelt - 1;
9531 d.op0 = op1;
9532 d.one_vector_p = true;
9533 break;
9535 case 1:
9536 d.op1 = op0;
9537 d.one_vector_p = true;
9538 break;
9541 return aarch64_expand_vec_perm_const_1 (&d);
9544 static bool
9545 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9546 const unsigned char *sel)
9548 struct expand_vec_perm_d d;
9549 unsigned int i, nelt, which;
9550 bool ret;
9552 d.vmode = vmode;
9553 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9554 d.testing_p = true;
9555 memcpy (d.perm, sel, nelt);
9557 /* Calculate whether all elements are in one vector. */
9558 for (i = which = 0; i < nelt; ++i)
9560 unsigned char e = d.perm[i];
9561 gcc_assert (e < 2 * nelt);
9562 which |= (e < nelt ? 1 : 2);
9565 /* If all elements are from the second vector, reindex as if from the
9566 first vector. */
9567 if (which == 2)
9568 for (i = 0; i < nelt; ++i)
9569 d.perm[i] -= nelt;
9571 /* Check whether the mask can be applied to a single vector. */
9572 d.one_vector_p = (which != 3);
9574 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9575 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9576 if (!d.one_vector_p)
9577 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9579 start_sequence ();
9580 ret = aarch64_expand_vec_perm_const_1 (&d);
9581 end_sequence ();
9583 return ret;
9586 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9587 bool
9588 aarch64_cannot_change_mode_class (enum machine_mode from,
9589 enum machine_mode to,
9590 enum reg_class rclass)
9592 /* Full-reg subregs are allowed on general regs or any class if they are
9593 the same size. */
9594 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9595 || !reg_classes_intersect_p (FP_REGS, rclass))
9596 return false;
9598 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9599 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9600 2. Scalar to Scalar for integer modes or same size float modes.
9601 3. Vector to Vector modes.
9602 4. On little-endian only, Vector-Structure to Vector modes. */
9603 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9605 if (aarch64_vector_mode_supported_p (from)
9606 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9607 return false;
9609 if (GET_MODE_NUNITS (from) == 1
9610 && GET_MODE_NUNITS (to) == 1
9611 && (GET_MODE_CLASS (from) == MODE_INT
9612 || from == to))
9613 return false;
9615 if (aarch64_vector_mode_supported_p (from)
9616 && aarch64_vector_mode_supported_p (to))
9617 return false;
9619 /* Within a vector structure straddling multiple vector registers
9620 we are in a mixed-endian representation. As such, we can't
9621 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9622 switch between vectors and vector structures cheaply. */
9623 if (!BYTES_BIG_ENDIAN)
9624 if ((aarch64_vector_mode_supported_p (from)
9625 && aarch64_vect_struct_mode_p (to))
9626 || (aarch64_vector_mode_supported_p (to)
9627 && aarch64_vect_struct_mode_p (from)))
9628 return false;
9631 return true;
9634 /* Implement MODES_TIEABLE_P. */
9636 bool
9637 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9639 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9640 return true;
9642 /* We specifically want to allow elements of "structure" modes to
9643 be tieable to the structure. This more general condition allows
9644 other rarer situations too. */
9645 if (TARGET_SIMD
9646 && aarch64_vector_mode_p (mode1)
9647 && aarch64_vector_mode_p (mode2))
9648 return true;
9650 return false;
9653 /* Return a new RTX holding the result of moving POINTER forward by
9654 AMOUNT bytes. */
9656 static rtx
9657 aarch64_move_pointer (rtx pointer, int amount)
9659 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9661 return adjust_automodify_address (pointer, GET_MODE (pointer),
9662 next, amount);
9665 /* Return a new RTX holding the result of moving POINTER forward by the
9666 size of the mode it points to. */
9668 static rtx
9669 aarch64_progress_pointer (rtx pointer)
9671 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9673 return aarch64_move_pointer (pointer, amount);
9676 /* Copy one MODE sized block from SRC to DST, then advance SRC and DST
9677 by the size of MODE. */
9679 static void
9680 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9681 enum machine_mode mode)
9683 rtx reg = gen_reg_rtx (mode);
9685 /* "Cast" the pointers to the correct mode. */
9686 *src = adjust_address (*src, mode, 0);
9687 *dst = adjust_address (*dst, mode, 0);
9688 /* Emit the memcpy. */
9689 emit_move_insn (reg, *src);
9690 emit_move_insn (*dst, reg);
9691 /* Move the pointers forward. */
9692 *src = aarch64_progress_pointer (*src);
9693 *dst = aarch64_progress_pointer (*dst);
9696 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9697 we succeed, otherwise return false. */
9699 bool
9700 aarch64_expand_movmem (rtx *operands)
9702 unsigned int n;
9703 rtx dst = operands[0];
9704 rtx src = operands[1];
9705 rtx base;
9706 bool speed_p = !optimize_function_for_size_p (cfun);
9708 /* When optimizing for size, give a better estimate of the length of a
9709 memcpy call, but use the default otherwise. */
9710 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9712 /* We can't do anything smart if the amount to copy is not constant. */
9713 if (!CONST_INT_P (operands[2]))
9714 return false;
9716 n = UINTVAL (operands[2]);
9718 /* Try to keep the number of instructions low. For cases below 16 bytes we
9719 need to make at most two moves. For cases above 16 bytes it will be one
9720 move for each 16 byte chunk, then at most two additional moves. */
9721 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9722 return false;
9724 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9725 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9727 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9728 src = adjust_automodify_address (src, VOIDmode, base, 0);
9730 /* Simple cases. Copy 0-3 bytes: (if applicable) a 2-byte chunk,
9731 then a 1-byte chunk. */
9732 if (n < 4)
9734 if (n >= 2)
9736 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9737 n -= 2;
9740 if (n == 1)
9741 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9743 return true;
9746 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
9747 4-byte chunk, partially overlapping with the previously copied chunk. */
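/* For example, a 7-byte copy becomes an SImode load/store of bytes 0-3
   followed by an SImode load/store of bytes 3-6, overlapping by one
   byte.  */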
9748 if (n < 8)
9750 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9751 n -= 4;
9752 if (n > 0)
9754 int move = n - 4;
9756 src = aarch64_move_pointer (src, move);
9757 dst = aarch64_move_pointer (dst, move);
9758 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9760 return true;
9763 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
9764 them, then (if applicable) an 8-byte chunk. */
9765 while (n >= 8)
9767 if (n / 16)
9769 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9770 n -= 16;
9772 else
9774 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9775 n -= 8;
9779 /* Finish the final bytes of the copy. We can always do this in one
9780 instruction. We either copy the exact amount we need, or partially
9781 overlap with the previous chunk we copied and copy 8 bytes. */
9782 if (n == 0)
9783 return true;
9784 else if (n == 1)
9785 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9786 else if (n == 2)
9787 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9788 else if (n == 4)
9789 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9790 else
9792 if (n == 3)
9794 src = aarch64_move_pointer (src, -1);
9795 dst = aarch64_move_pointer (dst, -1);
9796 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9798 else
9800 int move = n - 8;
9802 src = aarch64_move_pointer (src, move);
9803 dst = aarch64_move_pointer (dst, move);
9804 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9808 return true;
9811 #undef TARGET_ADDRESS_COST
9812 #define TARGET_ADDRESS_COST aarch64_address_cost
9814 /* This hook determines whether unnamed bitfields affect the alignment
9815 of the containing structure. The hook returns true if the structure
9816 should inherit the alignment requirements of an unnamed bitfield's
9817 type. */
9818 #undef TARGET_ALIGN_ANON_BITFIELD
9819 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9821 #undef TARGET_ASM_ALIGNED_DI_OP
9822 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9824 #undef TARGET_ASM_ALIGNED_HI_OP
9825 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9827 #undef TARGET_ASM_ALIGNED_SI_OP
9828 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9830 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9831 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9832 hook_bool_const_tree_hwi_hwi_const_tree_true
9834 #undef TARGET_ASM_FILE_START
9835 #define TARGET_ASM_FILE_START aarch64_start_file
9837 #undef TARGET_ASM_OUTPUT_MI_THUNK
9838 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9840 #undef TARGET_ASM_SELECT_RTX_SECTION
9841 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9843 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9844 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9846 #undef TARGET_BUILD_BUILTIN_VA_LIST
9847 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9849 #undef TARGET_CALLEE_COPIES
9850 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9852 #undef TARGET_CAN_ELIMINATE
9853 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9855 #undef TARGET_CANNOT_FORCE_CONST_MEM
9856 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9858 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9859 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9861 /* Only the least significant bit is used for initialization guard
9862 variables. */
9863 #undef TARGET_CXX_GUARD_MASK_BIT
9864 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9866 #undef TARGET_C_MODE_FOR_SUFFIX
9867 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9869 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9870 #undef TARGET_DEFAULT_TARGET_FLAGS
9871 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9872 #endif
9874 #undef TARGET_CLASS_MAX_NREGS
9875 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9877 #undef TARGET_BUILTIN_DECL
9878 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9880 #undef TARGET_EXPAND_BUILTIN
9881 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9883 #undef TARGET_EXPAND_BUILTIN_VA_START
9884 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9886 #undef TARGET_FOLD_BUILTIN
9887 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9889 #undef TARGET_FUNCTION_ARG
9890 #define TARGET_FUNCTION_ARG aarch64_function_arg
9892 #undef TARGET_FUNCTION_ARG_ADVANCE
9893 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9895 #undef TARGET_FUNCTION_ARG_BOUNDARY
9896 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9898 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9899 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9901 #undef TARGET_FUNCTION_VALUE
9902 #define TARGET_FUNCTION_VALUE aarch64_function_value
9904 #undef TARGET_FUNCTION_VALUE_REGNO_P
9905 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9907 #undef TARGET_FRAME_POINTER_REQUIRED
9908 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9910 #undef TARGET_GIMPLE_FOLD_BUILTIN
9911 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9913 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9914 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9916 #undef TARGET_INIT_BUILTINS
9917 #define TARGET_INIT_BUILTINS aarch64_init_builtins
9919 #undef TARGET_LEGITIMATE_ADDRESS_P
9920 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9922 #undef TARGET_LEGITIMATE_CONSTANT_P
9923 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9925 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9926 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9928 #undef TARGET_LRA_P
9929 #define TARGET_LRA_P aarch64_lra_p
9931 #undef TARGET_MANGLE_TYPE
9932 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9934 #undef TARGET_MEMORY_MOVE_COST
9935 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9937 #undef TARGET_MUST_PASS_IN_STACK
9938 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9940 /* This target hook should return true if accesses to volatile bitfields
9941 should use the narrowest mode possible. It should return false if these
9942 accesses should use the bitfield container type. */
9943 #undef TARGET_NARROW_VOLATILE_BITFIELD
9944 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9946 #undef TARGET_OPTION_OVERRIDE
9947 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9949 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9950 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9951 aarch64_override_options_after_change
9953 #undef TARGET_PASS_BY_REFERENCE
9954 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9956 #undef TARGET_PREFERRED_RELOAD_CLASS
9957 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9959 #undef TARGET_SECONDARY_RELOAD
9960 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9962 #undef TARGET_SHIFT_TRUNCATION_MASK
9963 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9965 #undef TARGET_SETUP_INCOMING_VARARGS
9966 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9968 #undef TARGET_STRUCT_VALUE_RTX
9969 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
9971 #undef TARGET_REGISTER_MOVE_COST
9972 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9974 #undef TARGET_RETURN_IN_MEMORY
9975 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9977 #undef TARGET_RETURN_IN_MSB
9978 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9980 #undef TARGET_RTX_COSTS
9981 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9983 #undef TARGET_SCHED_ISSUE_RATE
9984 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9986 #undef TARGET_TRAMPOLINE_INIT
9987 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9989 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9990 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9992 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9993 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9995 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9996 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9998 #undef TARGET_VECTORIZE_ADD_STMT_COST
9999 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
10001 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
10002 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
10003 aarch64_builtin_vectorization_cost
10005 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
10006 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
10008 #undef TARGET_VECTORIZE_BUILTINS
10009 #define TARGET_VECTORIZE_BUILTINS
10011 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
10012 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
10013 aarch64_builtin_vectorized_function
10015 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
10016 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
10017 aarch64_autovectorize_vector_sizes
10019 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
10020 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
10021 aarch64_atomic_assign_expand_fenv
10023 /* Section anchor support. */
10025 #undef TARGET_MIN_ANCHOR_OFFSET
10026 #define TARGET_MIN_ANCHOR_OFFSET -256
10028 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
10029 byte offset; we can do much more for larger data types, but have no way
10030 to determine the size of the access. We assume accesses are aligned. */
10031 #undef TARGET_MAX_ANCHOR_OFFSET
10032 #define TARGET_MAX_ANCHOR_OFFSET 4095
10034 #undef TARGET_VECTOR_ALIGNMENT
10035 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
10037 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
10038 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
10039 aarch64_simd_vector_alignment_reachable
10041 /* vec_perm support. */
10043 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
10044 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
10045 aarch64_vectorize_vec_perm_const_ok
10048 #undef TARGET_FIXED_CONDITION_CODE_REGS
10049 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
10051 #undef TARGET_FLAGS_REGNUM
10052 #define TARGET_FLAGS_REGNUM CC_REGNUM
10054 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
10055 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
10057 struct gcc_target targetm = TARGET_INITIALIZER;
10059 #include "gt-aarch64.h"