[AArch64] Unify vector and core register save/restore code.
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob e3a0c570b762ae216c13efc721574a0a20b387e4
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "df.h"
35 #include "hard-reg-set.h"
36 #include "output.h"
37 #include "expr.h"
38 #include "reload.h"
39 #include "toplev.h"
40 #include "target.h"
41 #include "target-def.h"
42 #include "targhooks.h"
43 #include "ggc.h"
44 #include "function.h"
45 #include "tm_p.h"
46 #include "recog.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "pointer-set.h"
50 #include "hash-table.h"
51 #include "vec.h"
52 #include "basic-block.h"
53 #include "tree-ssa-alias.h"
54 #include "internal-fn.h"
55 #include "gimple-fold.h"
56 #include "tree-eh.h"
57 #include "gimple-expr.h"
58 #include "is-a.h"
59 #include "gimple.h"
60 #include "gimplify.h"
61 #include "optabs.h"
62 #include "dwarf2.h"
63 #include "cfgloop.h"
64 #include "tree-vectorizer.h"
65 #include "config/arm/aarch-cost-tables.h"
66 #include "dumpfile.h"
67 #include "builtins.h"
69 /* Defined for convenience. */
70 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
72 /* Classifies an address.
74 ADDRESS_REG_IMM
75 A simple base register plus immediate offset.
77 ADDRESS_REG_WB
78 A base register indexed by immediate offset with writeback.
80 ADDRESS_REG_REG
81 A base register indexed by (optionally scaled) register.
83 ADDRESS_REG_UXTW
84 A base register indexed by (optionally scaled) zero-extended register.
86 ADDRESS_REG_SXTW
87 A base register indexed by (optionally scaled) sign-extended register.
89 ADDRESS_LO_SUM
90 A LO_SUM rtx with a base register and "LO12" symbol relocation.
92 ADDRESS_SYMBOLIC:
93 A constant symbolic address, in pc-relative literal pool. */
95 enum aarch64_address_type {
96 ADDRESS_REG_IMM,
97 ADDRESS_REG_WB,
98 ADDRESS_REG_REG,
99 ADDRESS_REG_UXTW,
100 ADDRESS_REG_SXTW,
101 ADDRESS_LO_SUM,
102 ADDRESS_SYMBOLIC
105 struct aarch64_address_info {
106 enum aarch64_address_type type;
107 rtx base;
108 rtx offset;
109 int shift;
110 enum aarch64_symbol_type symbol_type;
113 struct simd_immediate_info
115 rtx value;
116 int shift;
117 int element_width;
118 bool mvn;
119 bool msl;
122 /* The current code model. */
123 enum aarch64_code_model aarch64_cmodel;
125 #ifdef HAVE_AS_TLS
126 #undef TARGET_HAVE_TLS
127 #define TARGET_HAVE_TLS 1
128 #endif
130 static bool aarch64_lra_p (void);
131 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
132 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
133 const_tree,
134 enum machine_mode *, int *,
135 bool *);
136 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_override_options_after_change (void);
139 static bool aarch64_vector_mode_supported_p (enum machine_mode);
140 static unsigned bit_count (unsigned HOST_WIDE_INT);
141 static bool aarch64_const_vec_all_same_int_p (rtx,
142 HOST_WIDE_INT, HOST_WIDE_INT);
144 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* The current tuning set. */
152 const struct tune_params *aarch64_tune_params;
154 /* Mask to specify which instructions we are allowed to generate. */
155 unsigned long aarch64_isa_flags = 0;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Tuning parameters. */
162 #if HAVE_DESIGNATED_INITIALIZERS
163 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
164 #else
165 #define NAMED_PARAM(NAME, VAL) (VAL)
166 #endif
168 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
169 __extension__
170 #endif
172 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
173 __extension__
174 #endif
175 static const struct cpu_addrcost_table generic_addrcost_table =
177 #if HAVE_DESIGNATED_INITIALIZERS
178 .addr_scale_costs =
179 #endif
181 NAMED_PARAM (qi, 0),
182 NAMED_PARAM (hi, 0),
183 NAMED_PARAM (si, 0),
184 NAMED_PARAM (ti, 0),
186 NAMED_PARAM (pre_modify, 0),
187 NAMED_PARAM (post_modify, 0),
188 NAMED_PARAM (register_offset, 0),
189 NAMED_PARAM (register_extend, 0),
190 NAMED_PARAM (imm_offset, 0)
193 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
194 __extension__
195 #endif
196 static const struct cpu_addrcost_table cortexa57_addrcost_table =
198 #if HAVE_DESIGNATED_INITIALIZERS
199 .addr_scale_costs =
200 #endif
202 NAMED_PARAM (qi, 0),
203 NAMED_PARAM (hi, 1),
204 NAMED_PARAM (si, 0),
205 NAMED_PARAM (ti, 1),
207 NAMED_PARAM (pre_modify, 0),
208 NAMED_PARAM (post_modify, 0),
209 NAMED_PARAM (register_offset, 0),
210 NAMED_PARAM (register_extend, 0),
211 NAMED_PARAM (imm_offset, 0),
214 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
215 __extension__
216 #endif
217 static const struct cpu_regmove_cost generic_regmove_cost =
219 NAMED_PARAM (GP2GP, 1),
220 NAMED_PARAM (GP2FP, 2),
221 NAMED_PARAM (FP2GP, 2),
222 /* We currently do not provide direct support for TFmode Q->Q move.
223 Therefore we need to raise the cost above 2 in order to have
224 reload handle the situation. */
225 NAMED_PARAM (FP2FP, 4)
228 /* Generic costs for vector insn classes. */
229 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
230 __extension__
231 #endif
232 static const struct cpu_vector_cost generic_vector_cost =
234 NAMED_PARAM (scalar_stmt_cost, 1),
235 NAMED_PARAM (scalar_load_cost, 1),
236 NAMED_PARAM (scalar_store_cost, 1),
237 NAMED_PARAM (vec_stmt_cost, 1),
238 NAMED_PARAM (vec_to_scalar_cost, 1),
239 NAMED_PARAM (scalar_to_vec_cost, 1),
240 NAMED_PARAM (vec_align_load_cost, 1),
241 NAMED_PARAM (vec_unalign_load_cost, 1),
242 NAMED_PARAM (vec_unalign_store_cost, 1),
243 NAMED_PARAM (vec_store_cost, 1),
244 NAMED_PARAM (cond_taken_branch_cost, 3),
245 NAMED_PARAM (cond_not_taken_branch_cost, 1)
248 /* Cortex-A57 costs for vector insn classes. */
249 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
250 __extension__
251 #endif
252 static const struct cpu_vector_cost cortexa57_vector_cost =
254 NAMED_PARAM (scalar_stmt_cost, 1),
255 NAMED_PARAM (scalar_load_cost, 4),
256 NAMED_PARAM (scalar_store_cost, 1),
257 NAMED_PARAM (vec_stmt_cost, 3),
258 NAMED_PARAM (vec_to_scalar_cost, 8),
259 NAMED_PARAM (scalar_to_vec_cost, 8),
260 NAMED_PARAM (vec_align_load_cost, 5),
261 NAMED_PARAM (vec_unalign_load_cost, 5),
262 NAMED_PARAM (vec_unalign_store_cost, 1),
263 NAMED_PARAM (vec_store_cost, 1),
264 NAMED_PARAM (cond_taken_branch_cost, 1),
265 NAMED_PARAM (cond_not_taken_branch_cost, 1)
268 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
269 __extension__
270 #endif
271 static const struct tune_params generic_tunings =
273 &cortexa57_extra_costs,
274 &generic_addrcost_table,
275 &generic_regmove_cost,
276 &generic_vector_cost,
277 NAMED_PARAM (memmov_cost, 4),
278 NAMED_PARAM (issue_rate, 2)
281 static const struct tune_params cortexa53_tunings =
283 &cortexa53_extra_costs,
284 &generic_addrcost_table,
285 &generic_regmove_cost,
286 &generic_vector_cost,
287 NAMED_PARAM (memmov_cost, 4),
288 NAMED_PARAM (issue_rate, 2)
291 static const struct tune_params cortexa57_tunings =
293 &cortexa57_extra_costs,
294 &cortexa57_addrcost_table,
295 &generic_regmove_cost,
296 &cortexa57_vector_cost,
297 NAMED_PARAM (memmov_cost, 4),
298 NAMED_PARAM (issue_rate, 3)
301 /* A processor implementing AArch64. */
302 struct processor
304 const char *const name;
305 enum aarch64_processor core;
306 const char *arch;
307 const unsigned long flags;
308 const struct tune_params *const tune;
311 /* Processor cores implementing AArch64. */
312 static const struct processor all_cores[] =
314 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
315 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
316 #include "aarch64-cores.def"
317 #undef AARCH64_CORE
318 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
319 {NULL, aarch64_none, NULL, 0, NULL}
322 /* Architectures implementing AArch64. */
323 static const struct processor all_architectures[] =
325 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
326 {NAME, CORE, #ARCH, FLAGS, NULL},
327 #include "aarch64-arches.def"
328 #undef AARCH64_ARCH
329 {NULL, aarch64_none, NULL, 0, NULL}
332 /* Target specification. These are populated as command-line arguments
333 are processed, or NULL if not specified. */
334 static const struct processor *selected_arch;
335 static const struct processor *selected_cpu;
336 static const struct processor *selected_tune;
338 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
340 /* An ISA extension in the co-processor and main instruction set space. */
341 struct aarch64_option_extension
343 const char *const name;
344 const unsigned long flags_on;
345 const unsigned long flags_off;
348 /* ISA extensions in AArch64. */
349 static const struct aarch64_option_extension all_extensions[] =
351 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
352 {NAME, FLAGS_ON, FLAGS_OFF},
353 #include "aarch64-option-extensions.def"
354 #undef AARCH64_OPT_EXTENSION
355 {NULL, 0, 0}
358 /* Used to track the size of an address when generating a pre/post
359 increment address. */
360 static enum machine_mode aarch64_memory_reference_mode;
362 /* Used to force GTY into this file. */
363 static GTY(()) int gty_dummy;
365 /* A table of valid AArch64 "bitmask immediate" values for
366 logical instructions. */
368 #define AARCH64_NUM_BITMASKS 5334
369 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
371 typedef enum aarch64_cond_code
373 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
374 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
375 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
377 aarch64_cc;
379 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
381 /* The condition codes of the processor, and the inverse function. */
382 static const char * const aarch64_condition_codes[] =
384 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
385 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
388 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
389 unsigned
390 aarch64_dbx_register_number (unsigned regno)
392 if (GP_REGNUM_P (regno))
393 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
394 else if (regno == SP_REGNUM)
395 return AARCH64_DWARF_SP;
396 else if (FP_REGNUM_P (regno))
397 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
399 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
400 equivalent DWARF register. */
401 return DWARF_FRAME_REGISTERS;
404 /* Return TRUE if MODE is any of the large INT modes. */
405 static bool
406 aarch64_vect_struct_mode_p (enum machine_mode mode)
408 return mode == OImode || mode == CImode || mode == XImode;
411 /* Return TRUE if MODE is any of the vector modes. */
412 static bool
413 aarch64_vector_mode_p (enum machine_mode mode)
415 return aarch64_vector_mode_supported_p (mode)
416 || aarch64_vect_struct_mode_p (mode);
419 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
420 static bool
421 aarch64_array_mode_supported_p (enum machine_mode mode,
422 unsigned HOST_WIDE_INT nelems)
424 if (TARGET_SIMD
425 && AARCH64_VALID_SIMD_QREG_MODE (mode)
426 && (nelems >= 2 && nelems <= 4))
427 return true;
429 return false;
432 /* Implement HARD_REGNO_NREGS. */
435 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
437 switch (aarch64_regno_regclass (regno))
439 case FP_REGS:
440 case FP_LO_REGS:
441 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
442 default:
443 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
445 gcc_unreachable ();
448 /* Implement HARD_REGNO_MODE_OK. */
451 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
453 if (GET_MODE_CLASS (mode) == MODE_CC)
454 return regno == CC_REGNUM;
456 if (regno == SP_REGNUM)
457 /* The purpose of comparing with ptr_mode is to support the
458 global register variable associated with the stack pointer
459 register via the syntax of asm ("wsp") in ILP32. */
460 return mode == Pmode || mode == ptr_mode;
462 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
463 return mode == Pmode;
465 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
466 return 1;
468 if (FP_REGNUM_P (regno))
470 if (aarch64_vect_struct_mode_p (mode))
471 return
472 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
473 else
474 return 1;
477 return 0;
480 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
481 enum machine_mode
482 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
483 enum machine_mode mode)
485 /* Handle modes that fit within single registers. */
486 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
488 if (GET_MODE_SIZE (mode) >= 4)
489 return mode;
490 else
491 return SImode;
493 /* Fall back to generic for multi-reg and very large modes. */
494 else
495 return choose_hard_reg_mode (regno, nregs, false);
498 /* Return true if calls to DECL should be treated as
499 long-calls (i.e. called via a register). */
500 static bool
501 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
503 return false;
506 /* Return true if calls to symbol-ref SYM should be treated as
507 long-calls (i.e. called via a register). */
508 bool
509 aarch64_is_long_call_p (rtx sym)
511 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
514 /* Return true if the offsets to a zero/sign-extract operation
515 represent an expression that matches an extend operation. The
516 operands represent the parameters from
518 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
519 bool
520 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
521 rtx extract_imm)
523 HOST_WIDE_INT mult_val, extract_val;
525 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
526 return false;
528 mult_val = INTVAL (mult_imm);
529 extract_val = INTVAL (extract_imm);
531 if (extract_val > 8
532 && extract_val < GET_MODE_BITSIZE (mode)
533 && exact_log2 (extract_val & ~7) > 0
534 && (extract_val & 7) <= 4
535 && mult_val == (1 << (extract_val & 7)))
536 return true;
538 return false;
541 /* Emit an insn that's a simple single-set. Both the operands must be
542 known to be valid. */
543 inline static rtx
544 emit_set_insn (rtx x, rtx y)
546 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
549 /* X and Y are two things to compare using CODE. Emit the compare insn and
550 return the rtx for the CC register in the proper mode. */
552 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
554 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
555 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
557 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
558 return cc_reg;
561 /* Build the SYMBOL_REF for __tls_get_addr. */
563 static GTY(()) rtx tls_get_addr_libfunc;
566 aarch64_tls_get_addr (void)
568 if (!tls_get_addr_libfunc)
569 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
570 return tls_get_addr_libfunc;
573 /* Return the TLS model to use for ADDR. */
575 static enum tls_model
576 tls_symbolic_operand_type (rtx addr)
578 enum tls_model tls_kind = TLS_MODEL_NONE;
579 rtx sym, addend;
581 if (GET_CODE (addr) == CONST)
583 split_const (addr, &sym, &addend);
584 if (GET_CODE (sym) == SYMBOL_REF)
585 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
587 else if (GET_CODE (addr) == SYMBOL_REF)
588 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
590 return tls_kind;
593 /* We'll allow lo_sum's in our legitimate addresses
594 so that combine can take care of combining addresses where
595 necessary, but for generation purposes, we'll generate the address
596 as:
597 RTL Absolute
598 tmp = hi (symbol_ref); adrp x1, foo
599 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
602 PIC TLS
603 adrp x1, :got:foo adrp tmp, :tlsgd:foo
604 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
605 bl __tls_get_addr
608 Load TLS symbol, depending on TLS mechanism and TLS access model.
610 Global Dynamic - Traditional TLS:
611 adrp tmp, :tlsgd:imm
612 add dest, tmp, #:tlsgd_lo12:imm
613 bl __tls_get_addr
615 Global Dynamic - TLS Descriptors:
616 adrp dest, :tlsdesc:imm
617 ldr tmp, [dest, #:tlsdesc_lo12:imm]
618 add dest, dest, #:tlsdesc_lo12:imm
619 blr tmp
620 mrs tp, tpidr_el0
621 add dest, dest, tp
623 Initial Exec:
624 mrs tp, tpidr_el0
625 adrp tmp, :gottprel:imm
626 ldr dest, [tmp, #:gottprel_lo12:imm]
627 add dest, dest, tp
629 Local Exec:
630 mrs tp, tpidr_el0
631 add t0, tp, #:tprel_hi12:imm
632 add t0, #:tprel_lo12_nc:imm
635 static void
636 aarch64_load_symref_appropriately (rtx dest, rtx imm,
637 enum aarch64_symbol_type type)
639 switch (type)
641 case SYMBOL_SMALL_ABSOLUTE:
643 /* In ILP32, the mode of dest can be either SImode or DImode. */
644 rtx tmp_reg = dest;
645 enum machine_mode mode = GET_MODE (dest);
647 gcc_assert (mode == Pmode || mode == ptr_mode);
649 if (can_create_pseudo_p ())
650 tmp_reg = gen_reg_rtx (mode);
652 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
653 emit_insn (gen_add_losym (dest, tmp_reg, imm));
654 return;
657 case SYMBOL_TINY_ABSOLUTE:
658 emit_insn (gen_rtx_SET (Pmode, dest, imm));
659 return;
661 case SYMBOL_SMALL_GOT:
663 /* In ILP32, the mode of dest can be either SImode or DImode,
664 while the got entry is always of SImode size. The mode of
665 dest depends on how dest is used: if dest is assigned to a
666 pointer (e.g. in the memory), it has SImode; it may have
667 DImode if dest is dereferenced to access the memory.
668 This is why we have to handle three different ldr_got_small
669 patterns here (two patterns for ILP32). */
670 rtx tmp_reg = dest;
671 enum machine_mode mode = GET_MODE (dest);
673 if (can_create_pseudo_p ())
674 tmp_reg = gen_reg_rtx (mode);
676 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
677 if (mode == ptr_mode)
679 if (mode == DImode)
680 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
681 else
682 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
684 else
686 gcc_assert (mode == Pmode);
687 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
690 return;
693 case SYMBOL_SMALL_TLSGD:
695 rtx insns;
696 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
698 start_sequence ();
699 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
700 insns = get_insns ();
701 end_sequence ();
703 RTL_CONST_CALL_P (insns) = 1;
704 emit_libcall_block (insns, dest, result, imm);
705 return;
708 case SYMBOL_SMALL_TLSDESC:
710 enum machine_mode mode = GET_MODE (dest);
711 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
712 rtx tp;
714 gcc_assert (mode == Pmode || mode == ptr_mode);
716 /* In ILP32, the got entry is always of SImode size. Unlike
717 small GOT, the dest is fixed at reg 0. */
718 if (TARGET_ILP32)
719 emit_insn (gen_tlsdesc_small_si (imm));
720 else
721 emit_insn (gen_tlsdesc_small_di (imm));
722 tp = aarch64_load_tp (NULL);
724 if (mode != Pmode)
725 tp = gen_lowpart (mode, tp);
727 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
728 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
729 return;
732 case SYMBOL_SMALL_GOTTPREL:
734 /* In ILP32, the mode of dest can be either SImode or DImode,
735 while the got entry is always of SImode size. The mode of
736 dest depends on how dest is used: if dest is assigned to a
737 pointer (e.g. in the memory), it has SImode; it may have
738 DImode if dest is dereferenced to access the memory.
739 This is why we have to handle three different tlsie_small
740 patterns here (two patterns for ILP32). */
741 enum machine_mode mode = GET_MODE (dest);
742 rtx tmp_reg = gen_reg_rtx (mode);
743 rtx tp = aarch64_load_tp (NULL);
745 if (mode == ptr_mode)
747 if (mode == DImode)
748 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
749 else
751 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
752 tp = gen_lowpart (mode, tp);
755 else
757 gcc_assert (mode == Pmode);
758 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
761 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
762 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
763 return;
766 case SYMBOL_SMALL_TPREL:
768 rtx tp = aarch64_load_tp (NULL);
769 emit_insn (gen_tlsle_small (dest, tp, imm));
770 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
771 return;
774 case SYMBOL_TINY_GOT:
775 emit_insn (gen_ldr_got_tiny (dest, imm));
776 return;
778 default:
779 gcc_unreachable ();
783 /* Emit a move from SRC to DEST. Assume that the move expanders can
784 handle all moves if !can_create_pseudo_p (). The distinction is
785 important because, unlike emit_move_insn, the move expanders know
786 how to force Pmode objects into the constant pool even when the
787 constant pool address is not itself legitimate. */
788 static rtx
789 aarch64_emit_move (rtx dest, rtx src)
791 return (can_create_pseudo_p ()
792 ? emit_move_insn (dest, src)
793 : emit_move_insn_1 (dest, src));
796 /* Split a 128-bit move operation into two 64-bit move operations,
797 taking care to handle partial overlap of register to register
798 copies. Special cases are needed when moving between GP regs and
799 FP regs. SRC can be a register, constant or memory; DST a register
800 or memory. If either operand is memory it must not have any side
801 effects. */
802 void
803 aarch64_split_128bit_move (rtx dst, rtx src)
805 rtx dst_lo, dst_hi;
806 rtx src_lo, src_hi;
808 enum machine_mode mode = GET_MODE (dst);
810 gcc_assert (mode == TImode || mode == TFmode);
811 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
812 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
814 if (REG_P (dst) && REG_P (src))
816 int src_regno = REGNO (src);
817 int dst_regno = REGNO (dst);
819 /* Handle FP <-> GP regs. */
820 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
822 src_lo = gen_lowpart (word_mode, src);
823 src_hi = gen_highpart (word_mode, src);
825 if (mode == TImode)
827 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
828 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
830 else
832 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
833 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
835 return;
837 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
839 dst_lo = gen_lowpart (word_mode, dst);
840 dst_hi = gen_highpart (word_mode, dst);
842 if (mode == TImode)
844 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
845 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
847 else
849 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
850 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
852 return;
856 dst_lo = gen_lowpart (word_mode, dst);
857 dst_hi = gen_highpart (word_mode, dst);
858 src_lo = gen_lowpart (word_mode, src);
859 src_hi = gen_highpart_mode (word_mode, mode, src);
861 /* At most one pairing may overlap. */
862 if (reg_overlap_mentioned_p (dst_lo, src_hi))
864 aarch64_emit_move (dst_hi, src_hi);
865 aarch64_emit_move (dst_lo, src_lo);
867 else
869 aarch64_emit_move (dst_lo, src_lo);
870 aarch64_emit_move (dst_hi, src_hi);
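/* Return true if moving SRC into DST must be split into two 64-bit
   halves; only an FP-register to FP-register copy can be left as a
   single 128-bit move.  */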
874 bool
875 aarch64_split_128bit_move_p (rtx dst, rtx src)
877 return (! REG_P (src)
878 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
881 /* Split a complex SIMD combine. */
883 void
884 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
886 enum machine_mode src_mode = GET_MODE (src1);
887 enum machine_mode dst_mode = GET_MODE (dst);
889 gcc_assert (VECTOR_MODE_P (dst_mode));
891 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
893 rtx (*gen) (rtx, rtx, rtx);
895 switch (src_mode)
897 case V8QImode:
898 gen = gen_aarch64_simd_combinev8qi;
899 break;
900 case V4HImode:
901 gen = gen_aarch64_simd_combinev4hi;
902 break;
903 case V2SImode:
904 gen = gen_aarch64_simd_combinev2si;
905 break;
906 case V2SFmode:
907 gen = gen_aarch64_simd_combinev2sf;
908 break;
909 case DImode:
910 gen = gen_aarch64_simd_combinedi;
911 break;
912 case DFmode:
913 gen = gen_aarch64_simd_combinedf;
914 break;
915 default:
916 gcc_unreachable ();
919 emit_insn (gen (dst, src1, src2));
920 return;
924 /* Split a complex SIMD move. */
926 void
927 aarch64_split_simd_move (rtx dst, rtx src)
929 enum machine_mode src_mode = GET_MODE (src);
930 enum machine_mode dst_mode = GET_MODE (dst);
932 gcc_assert (VECTOR_MODE_P (dst_mode));
934 if (REG_P (dst) && REG_P (src))
936 rtx (*gen) (rtx, rtx);
938 gcc_assert (VECTOR_MODE_P (src_mode));
940 switch (src_mode)
942 case V16QImode:
943 gen = gen_aarch64_split_simd_movv16qi;
944 break;
945 case V8HImode:
946 gen = gen_aarch64_split_simd_movv8hi;
947 break;
948 case V4SImode:
949 gen = gen_aarch64_split_simd_movv4si;
950 break;
951 case V2DImode:
952 gen = gen_aarch64_split_simd_movv2di;
953 break;
954 case V4SFmode:
955 gen = gen_aarch64_split_simd_movv4sf;
956 break;
957 case V2DFmode:
958 gen = gen_aarch64_split_simd_movv2df;
959 break;
960 default:
961 gcc_unreachable ();
964 emit_insn (gen (dst, src));
965 return;
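/* Force VALUE into a register of MODE, reusing X as the destination
   when new pseudo registers cannot be created (e.g. during or after
   reload).  */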
969 static rtx
970 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
972 if (can_create_pseudo_p ())
973 return force_reg (mode, value);
974 else
976 x = aarch64_emit_move (x, value);
977 return x;
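/* Return a legitimate expression for REG plus OFFSET in MODE.  If OFFSET
   is not a valid add immediate, load it into TEMP first.  */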
982 static rtx
983 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
985 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
987 rtx high;
988 /* Load the full offset into a register. This
989 might be improvable in the future. */
990 high = GEN_INT (offset);
991 offset = 0;
992 high = aarch64_force_temporary (mode, temp, high);
993 reg = aarch64_force_temporary (mode, temp,
994 gen_rtx_PLUS (mode, high, reg));
996 return plus_constant (mode, reg, offset);
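/* Expand a move of immediate IMM (a symbolic or integer constant) into
   register DEST of SImode or DImode.  Integer constants are synthesised
   from moves, bit-field inserts and arithmetic/logical combinations of
   bitmask immediates.  */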
999 void
1000 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1002 enum machine_mode mode = GET_MODE (dest);
1003 unsigned HOST_WIDE_INT mask;
1004 int i;
1005 bool first;
1006 unsigned HOST_WIDE_INT val;
1007 bool subtargets;
1008 rtx subtarget;
1009 int one_match, zero_match;
1011 gcc_assert (mode == SImode || mode == DImode);
1013 /* Check what type of symbol it is. */
1014 if (GET_CODE (imm) == SYMBOL_REF
1015 || GET_CODE (imm) == LABEL_REF
1016 || GET_CODE (imm) == CONST)
1018 rtx mem, base, offset;
1019 enum aarch64_symbol_type sty;
1021 /* If we have (const (plus symbol offset)), separate out the offset
1022 before we start classifying the symbol. */
1023 split_const (imm, &base, &offset);
1025 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1026 switch (sty)
1028 case SYMBOL_FORCE_TO_MEM:
1029 if (offset != const0_rtx
1030 && targetm.cannot_force_const_mem (mode, imm))
1032 gcc_assert (can_create_pseudo_p ());
1033 base = aarch64_force_temporary (mode, dest, base);
1034 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1035 aarch64_emit_move (dest, base);
1036 return;
1038 mem = force_const_mem (ptr_mode, imm);
1039 gcc_assert (mem);
1040 if (mode != ptr_mode)
1041 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1042 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1043 return;
1045 case SYMBOL_SMALL_TLSGD:
1046 case SYMBOL_SMALL_TLSDESC:
1047 case SYMBOL_SMALL_GOTTPREL:
1048 case SYMBOL_SMALL_GOT:
1049 case SYMBOL_TINY_GOT:
1050 if (offset != const0_rtx)
1052 gcc_assert(can_create_pseudo_p ());
1053 base = aarch64_force_temporary (mode, dest, base);
1054 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1055 aarch64_emit_move (dest, base);
1056 return;
1058 /* FALLTHRU */
1060 case SYMBOL_SMALL_TPREL:
1061 case SYMBOL_SMALL_ABSOLUTE:
1062 case SYMBOL_TINY_ABSOLUTE:
1063 aarch64_load_symref_appropriately (dest, imm, sty);
1064 return;
1066 default:
1067 gcc_unreachable ();
1071 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1073 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1074 return;
1077 if (!CONST_INT_P (imm))
1079 if (GET_CODE (imm) == HIGH)
1080 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1081 else
1083 rtx mem = force_const_mem (mode, imm);
1084 gcc_assert (mem);
1085 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1088 return;
1091 if (mode == SImode)
1093 /* We know we can't do this in 1 insn, and we must be able to do it
1094 in two; so don't mess around looking for sequences that don't buy
1095 us anything. */
1096 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1097 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1098 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1099 return;
1102 /* Remaining cases are all for DImode. */
1104 val = INTVAL (imm);
1105 subtargets = optimize && can_create_pseudo_p ();
1107 one_match = 0;
1108 zero_match = 0;
1109 mask = 0xffff;
1111 for (i = 0; i < 64; i += 16, mask <<= 16)
1113 if ((val & mask) == 0)
1114 zero_match++;
1115 else if ((val & mask) == mask)
1116 one_match++;
1119 if (one_match == 2)
1121 mask = 0xffff;
1122 for (i = 0; i < 64; i += 16, mask <<= 16)
1124 if ((val & mask) != mask)
1126 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1127 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1128 GEN_INT ((val >> i) & 0xffff)));
1129 return;
1132 gcc_unreachable ();
1135 if (zero_match == 2)
1136 goto simple_sequence;
1138 mask = 0x0ffff0000UL;
1139 for (i = 16; i < 64; i += 16, mask <<= 16)
1141 HOST_WIDE_INT comp = mask & ~(mask - 1);
1143 if (aarch64_uimm12_shift (val - (val & mask)))
1145 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1147 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1148 emit_insn (gen_adddi3 (dest, subtarget,
1149 GEN_INT (val - (val & mask))));
1150 return;
1152 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1154 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1156 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1157 GEN_INT ((val + comp) & mask)));
1158 emit_insn (gen_adddi3 (dest, subtarget,
1159 GEN_INT (val - ((val + comp) & mask))));
1160 return;
1162 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1164 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1166 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1167 GEN_INT ((val - comp) | ~mask)));
1168 emit_insn (gen_adddi3 (dest, subtarget,
1169 GEN_INT (val - ((val - comp) | ~mask))));
1170 return;
1172 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1174 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1176 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1177 GEN_INT (val | ~mask)));
1178 emit_insn (gen_adddi3 (dest, subtarget,
1179 GEN_INT (val - (val | ~mask))));
1180 return;
1184 /* See if we can do it by arithmetically combining two
1185 immediates. */
1186 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1188 int j;
1189 mask = 0xffff;
1191 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1192 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1194 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1195 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1196 GEN_INT (aarch64_bitmasks[i])));
1197 emit_insn (gen_adddi3 (dest, subtarget,
1198 GEN_INT (val - aarch64_bitmasks[i])));
1199 return;
1202 for (j = 0; j < 64; j += 16, mask <<= 16)
1204 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1206 emit_insn (gen_rtx_SET (VOIDmode, dest,
1207 GEN_INT (aarch64_bitmasks[i])));
1208 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1209 GEN_INT ((val >> j) & 0xffff)));
1210 return;
1215 /* See if we can do it by logically combining two immediates. */
1216 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1218 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1220 int j;
1222 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1223 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1225 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1226 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1227 GEN_INT (aarch64_bitmasks[i])));
1228 emit_insn (gen_iordi3 (dest, subtarget,
1229 GEN_INT (aarch64_bitmasks[j])));
1230 return;
1233 else if ((val & aarch64_bitmasks[i]) == val)
1235 int j;
1237 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1238 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1241 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1242 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1243 GEN_INT (aarch64_bitmasks[j])));
1244 emit_insn (gen_anddi3 (dest, subtarget,
1245 GEN_INT (aarch64_bitmasks[i])));
1246 return;
1251 simple_sequence:
1252 first = true;
1253 mask = 0xffff;
1254 for (i = 0; i < 64; i += 16, mask <<= 16)
1256 if ((val & mask) != 0)
1258 if (first)
1260 emit_insn (gen_rtx_SET (VOIDmode, dest,
1261 GEN_INT (val & mask)));
1262 first = false;
1264 else
1265 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1266 GEN_INT ((val >> i) & 0xffff)));
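/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */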
1271 static bool
1272 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1273 tree exp ATTRIBUTE_UNUSED)
1275 /* Currently, always true. */
1276 return true;
1279 /* Implement TARGET_PASS_BY_REFERENCE. */
1281 static bool
1282 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1283 enum machine_mode mode,
1284 const_tree type,
1285 bool named ATTRIBUTE_UNUSED)
1287 HOST_WIDE_INT size;
1288 enum machine_mode dummymode;
1289 int nregs;
1291 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1292 size = (mode == BLKmode && type)
1293 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1295 /* Aggregates are passed by reference based on their size. */
1296 if (type && AGGREGATE_TYPE_P (type))
1298 size = int_size_in_bytes (type);
1301 /* Variable-sized arguments are always passed by reference. */
1302 if (size < 0)
1303 return true;
1305 /* Can this be a candidate to be passed in fp/simd register(s)? */
1306 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1307 &dummymode, &nregs,
1308 NULL))
1309 return false;
1311 /* Arguments which are variable sized or larger than 2 registers are
1312 passed by reference unless they are a homogeneous floating-point
1313 aggregate. */
1314 return size > 2 * UNITS_PER_WORD;
1317 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1318 static bool
1319 aarch64_return_in_msb (const_tree valtype)
1321 enum machine_mode dummy_mode;
1322 int dummy_int;
1324 /* Never happens in little-endian mode. */
1325 if (!BYTES_BIG_ENDIAN)
1326 return false;
1328 /* Only composite types smaller than or equal to 16 bytes can
1329 be potentially returned in registers. */
1330 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1331 || int_size_in_bytes (valtype) <= 0
1332 || int_size_in_bytes (valtype) > 16)
1333 return false;
1335 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1336 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1337 is always passed/returned in the least significant bits of fp/simd
1338 register(s). */
1339 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1340 &dummy_mode, &dummy_int, NULL))
1341 return false;
1343 return true;
1346 /* Implement TARGET_FUNCTION_VALUE.
1347 Define how to find the value returned by a function. */
1349 static rtx
1350 aarch64_function_value (const_tree type, const_tree func,
1351 bool outgoing ATTRIBUTE_UNUSED)
1353 enum machine_mode mode;
1354 int unsignedp;
1355 int count;
1356 enum machine_mode ag_mode;
1358 mode = TYPE_MODE (type);
1359 if (INTEGRAL_TYPE_P (type))
1360 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1362 if (aarch64_return_in_msb (type))
1364 HOST_WIDE_INT size = int_size_in_bytes (type);
1366 if (size % UNITS_PER_WORD != 0)
1368 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1369 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1373 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1374 &ag_mode, &count, NULL))
1376 if (!aarch64_composite_type_p (type, mode))
1378 gcc_assert (count == 1 && mode == ag_mode);
1379 return gen_rtx_REG (mode, V0_REGNUM);
1381 else
1383 int i;
1384 rtx par;
1386 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1387 for (i = 0; i < count; i++)
1389 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1390 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1391 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1392 XVECEXP (par, 0, i) = tmp;
1394 return par;
1397 else
1398 return gen_rtx_REG (mode, R0_REGNUM);
1401 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1402 Return true if REGNO is the number of a hard register in which the values
1403 of called function may come back. */
1405 static bool
1406 aarch64_function_value_regno_p (const unsigned int regno)
1408 /* Maximum of 16 bytes can be returned in the general registers. Examples
1409 of 16-byte return values are: 128-bit integers and 16-byte small
1410 structures (excluding homogeneous floating-point aggregates). */
1411 if (regno == R0_REGNUM || regno == R1_REGNUM)
1412 return true;
1414 /* Up to four fp/simd registers can return a function value, e.g. a
1415 homogeneous floating-point aggregate having four members. */
1416 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1417 return !TARGET_GENERAL_REGS_ONLY;
1419 return false;
1422 /* Implement TARGET_RETURN_IN_MEMORY.
1424 If the type T of the result of a function is such that
1425 void func (T arg)
1426 would require that arg be passed as a value in a register (or set of
1427 registers) according to the parameter passing rules, then the result
1428 is returned in the same registers as would be used for such an
1429 argument. */
1431 static bool
1432 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1434 HOST_WIDE_INT size;
1435 enum machine_mode ag_mode;
1436 int count;
1438 if (!AGGREGATE_TYPE_P (type)
1439 && TREE_CODE (type) != COMPLEX_TYPE
1440 && TREE_CODE (type) != VECTOR_TYPE)
1441 /* Simple scalar types always returned in registers. */
1442 return false;
1444 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1445 type,
1446 &ag_mode,
1447 &count,
1448 NULL))
1449 return false;
1451 /* Types larger than 2 registers returned in memory. */
1452 size = int_size_in_bytes (type);
1453 return (size < 0 || size > 2 * UNITS_PER_WORD);
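/* Return true if an argument of MODE and TYPE is a candidate for passing
   in SIMD/FP registers, setting *NREGS to the number of registers
   required.  */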
1456 static bool
1457 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1458 const_tree type, int *nregs)
1460 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1461 return aarch64_vfp_is_call_or_return_candidate (mode,
1462 type,
1463 &pcum->aapcs_vfp_rmode,
1464 nregs,
1465 NULL);
1468 /* Given MODE and TYPE of a function argument, return the alignment in
1469 bits. The idea is to suppress any stronger alignment requested by
1470 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1471 This is a helper function for local use only. */
1473 static unsigned int
1474 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1476 unsigned int alignment;
1478 if (type)
1480 if (!integer_zerop (TYPE_SIZE (type)))
1482 if (TYPE_MODE (type) == mode)
1483 alignment = TYPE_ALIGN (type);
1484 else
1485 alignment = GET_MODE_ALIGNMENT (mode);
1487 else
1488 alignment = 0;
1490 else
1491 alignment = GET_MODE_ALIGNMENT (mode);
1493 return alignment;
1496 /* Layout a function argument according to the AAPCS64 rules. The rule
1497 numbers refer to the rule numbers in the AAPCS64. */
1499 static void
1500 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1501 const_tree type,
1502 bool named ATTRIBUTE_UNUSED)
1504 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1505 int ncrn, nvrn, nregs;
1506 bool allocate_ncrn, allocate_nvrn;
1507 HOST_WIDE_INT size;
1509 /* We need to do this once per argument. */
1510 if (pcum->aapcs_arg_processed)
1511 return;
1513 pcum->aapcs_arg_processed = true;
1515 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1516 size
1517 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1518 UNITS_PER_WORD);
1520 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1521 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1522 mode,
1523 type,
1524 &nregs);
1526 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1527 The following code thus handles passing by SIMD/FP registers first. */
1529 nvrn = pcum->aapcs_nvrn;
1531 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1532 and homogeneous short-vector aggregates (HVA). */
1533 if (allocate_nvrn)
1535 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1537 pcum->aapcs_nextnvrn = nvrn + nregs;
1538 if (!aarch64_composite_type_p (type, mode))
1540 gcc_assert (nregs == 1);
1541 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1543 else
1545 rtx par;
1546 int i;
1547 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1548 for (i = 0; i < nregs; i++)
1550 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1551 V0_REGNUM + nvrn + i);
1552 tmp = gen_rtx_EXPR_LIST
1553 (VOIDmode, tmp,
1554 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1555 XVECEXP (par, 0, i) = tmp;
1557 pcum->aapcs_reg = par;
1559 return;
1561 else
1563 /* C.3 NSRN is set to 8. */
1564 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1565 goto on_stack;
1569 ncrn = pcum->aapcs_ncrn;
1570 nregs = size / UNITS_PER_WORD;
1572 /* C6 - C9, though the sign and zero extension semantics are
1573 handled elsewhere. This is the case where the argument fits
1574 entirely in general registers. */
1575 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1577 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1579 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1581 /* C.8 if the argument has an alignment of 16 then the NGRN is
1582 rounded up to the next even number. */
1583 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1585 ++ncrn;
1586 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1588 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1589 A reg is still generated for it, but the caller should be smart
1590 enough not to use it. */
1591 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1593 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1595 else
1597 rtx par;
1598 int i;
1600 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1601 for (i = 0; i < nregs; i++)
1603 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1604 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1605 GEN_INT (i * UNITS_PER_WORD));
1606 XVECEXP (par, 0, i) = tmp;
1608 pcum->aapcs_reg = par;
1611 pcum->aapcs_nextncrn = ncrn + nregs;
1612 return;
1615 /* C.11 */
1616 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1618 /* The argument is passed on the stack; record the needed number of words for
1619 this argument and align the total size if necessary. */
1620 on_stack:
1621 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1622 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1623 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1624 16 / UNITS_PER_WORD);
1625 return;
1628 /* Implement TARGET_FUNCTION_ARG. */
1630 static rtx
1631 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1632 const_tree type, bool named)
1634 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1635 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1637 if (mode == VOIDmode)
1638 return NULL_RTX;
1640 aarch64_layout_arg (pcum_v, mode, type, named);
1641 return pcum->aapcs_reg;
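/* Initialize the per-call cumulative argument state in PCUM for the
   AAPCS64 calling convention.  */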
1644 void
1645 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1646 const_tree fntype ATTRIBUTE_UNUSED,
1647 rtx libname ATTRIBUTE_UNUSED,
1648 const_tree fndecl ATTRIBUTE_UNUSED,
1649 unsigned n_named ATTRIBUTE_UNUSED)
1651 pcum->aapcs_ncrn = 0;
1652 pcum->aapcs_nvrn = 0;
1653 pcum->aapcs_nextncrn = 0;
1654 pcum->aapcs_nextnvrn = 0;
1655 pcum->pcs_variant = ARM_PCS_AAPCS64;
1656 pcum->aapcs_reg = NULL_RTX;
1657 pcum->aapcs_arg_processed = false;
1658 pcum->aapcs_stack_words = 0;
1659 pcum->aapcs_stack_size = 0;
1661 return;
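/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */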
1664 static void
1665 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1666 enum machine_mode mode,
1667 const_tree type,
1668 bool named)
1670 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1671 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1673 aarch64_layout_arg (pcum_v, mode, type, named);
1674 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1675 != (pcum->aapcs_stack_words != 0));
1676 pcum->aapcs_arg_processed = false;
1677 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1678 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1679 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1680 pcum->aapcs_stack_words = 0;
1681 pcum->aapcs_reg = NULL_RTX;
1685 bool
1686 aarch64_function_arg_regno_p (unsigned regno)
1688 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1689 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1692 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1693 PARM_BOUNDARY bits of alignment, but will be given anything up
1694 to STACK_BOUNDARY bits if the type requires it. This makes sure
1695 that both before and after the layout of each argument, the Next
1696 Stacked Argument Address (NSAA) will have a minimum alignment of
1697 8 bytes. */
1699 static unsigned int
1700 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1702 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1704 if (alignment < PARM_BOUNDARY)
1705 alignment = PARM_BOUNDARY;
1706 if (alignment > STACK_BOUNDARY)
1707 alignment = STACK_BOUNDARY;
1708 return alignment;
1711 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1713 Return true if an argument passed on the stack should be padded upwards,
1714 i.e. if the least-significant byte of the stack slot has useful data.
1716 Small aggregate types are placed in the lowest memory address.
1718 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1720 bool
1721 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1723 /* On little-endian targets, the least significant byte of every stack
1724 argument is passed at the lowest byte address of the stack slot. */
1725 if (!BYTES_BIG_ENDIAN)
1726 return true;
1728 /* Otherwise, integral, floating-point and pointer types are padded downward:
1729 the least significant byte of a stack argument is passed at the highest
1730 byte address of the stack slot. */
1731 if (type
1732 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1733 || POINTER_TYPE_P (type))
1734 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1735 return false;
1737 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1738 return true;
1741 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1743 It specifies padding for the last (may also be the only)
1744 element of a block move between registers and memory. Assuming
1745 the block is in memory, padding upward means that
1746 the last element is padded after its most significant byte,
1747 while with downward padding, the last element is padded at
1748 its least significant byte side.
1750 Small aggregates and small complex types are always padded
1751 upwards.
1753 We don't need to worry about homogeneous floating-point or
1754 short-vector aggregates; their move is not affected by the
1755 padding direction determined here. Regardless of endianness,
1756 each element of such an aggregate is put in the least
1757 significant bits of a fp/simd register.
1759 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1760 register has useful data, and return the opposite if the most
1761 significant byte does. */
1763 bool
1764 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1765 bool first ATTRIBUTE_UNUSED)
1768 /* Small composite types are always padded upward. */
1769 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1771 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1772 : GET_MODE_SIZE (mode));
1773 if (size < 2 * UNITS_PER_WORD)
1774 return true;
1777 /* Otherwise, use the default padding. */
1778 return !BYTES_BIG_ENDIAN;
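/* Implement TARGET_LIBGCC_CMP_RETURN_MODE.  */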
1781 static enum machine_mode
1782 aarch64_libgcc_cmp_return_mode (void)
1784 return SImode;
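/* Implement TARGET_FRAME_POINTER_REQUIRED.  */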
1787 static bool
1788 aarch64_frame_pointer_required (void)
1790 /* If the function contains dynamic stack allocations, we need to
1791 use the frame pointer to access the static parts of the frame. */
1792 if (cfun->calls_alloca)
1793 return true;
1795 /* In aarch64_override_options_after_change
1796 flag_omit_leaf_frame_pointer turns off the frame pointer by
1797 default. Turn it back on now if we've not got a leaf
1798 function. */
1799 if (flag_omit_leaf_frame_pointer
1800 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1801 return true;
1803 return false;
1806 /* Mark the registers that need to be saved by the callee and calculate
1807 the size of the callee-saved registers area and frame record (both FP
1808 and LR may be omitted). */
1809 static void
1810 aarch64_layout_frame (void)
1812 HOST_WIDE_INT offset = 0;
1813 int regno;
1815 if (reload_completed && cfun->machine->frame.laid_out)
1816 return;
1818 #define SLOT_NOT_REQUIRED (-2)
1819 #define SLOT_REQUIRED (-1)
1821 /* First mark all the registers that really need to be saved... */
1822 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1823 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1825 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1826 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1828 /* ... that includes the eh data registers (if needed)... */
1829 if (crtl->calls_eh_return)
1830 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1831 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1832 = SLOT_REQUIRED;
1834 /* ... and any callee saved register that dataflow says is live. */
1835 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1836 if (df_regs_ever_live_p (regno)
1837 && !call_used_regs[regno])
1838 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1840 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1841 if (df_regs_ever_live_p (regno)
1842 && !call_used_regs[regno])
1843 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1845 if (frame_pointer_needed)
1847 /* FP and LR are placed in the linkage record. */
1848 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1849 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1850 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1851 offset += 2 * UNITS_PER_WORD;
1854 /* Now assign stack slots for them. */
1855 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1856 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1858 cfun->machine->frame.reg_offset[regno] = offset;
1859 offset += UNITS_PER_WORD;
1862 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1863 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1865 cfun->machine->frame.reg_offset[regno] = offset;
1866 offset += UNITS_PER_WORD;
1869 cfun->machine->frame.padding0 =
1870 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1871 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1873 cfun->machine->frame.saved_regs_size = offset;
1875 cfun->machine->frame.hard_fp_offset
1876 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1877 + get_frame_size ()
1878 + cfun->machine->frame.saved_regs_size,
1879 STACK_BOUNDARY / BITS_PER_UNIT);
1881 cfun->machine->frame.frame_size
1882 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1883 + crtl->outgoing_args_size,
1884 STACK_BOUNDARY / BITS_PER_UNIT);
1886 cfun->machine->frame.laid_out = true;
1889 /* Make the last instruction frame-related and note that it performs
1890 the operation described by FRAME_PATTERN. */
1892 static void
1893 aarch64_set_frame_expr (rtx frame_pattern)
1895 rtx insn;
1897 insn = get_last_insn ();
1898 RTX_FRAME_RELATED_P (insn) = 1;
1899 RTX_FRAME_RELATED_P (frame_pattern) = 1;
1900 REG_NOTES (insn) = alloc_EXPR_LIST (REG_FRAME_RELATED_EXPR,
1901 frame_pattern,
1902 REG_NOTES (insn));
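/* Return true if REGNO has been allocated a save slot in the current
   frame layout.  */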
1905 static bool
1906 aarch64_register_saved_on_entry (int regno)
1908 return cfun->machine->frame.reg_offset[regno] >= 0;
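/* Return the lowest register number in the range [REGNO, LIMIT] that
   needs to be saved on entry, or a value greater than LIMIT if there is
   none.  */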
1911 static unsigned
1912 aarch64_next_callee_save (unsigned regno, unsigned limit)
1914 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1915 regno ++;
1916 return regno;
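/* Return a store-pair pattern saving REG1 and REG2 to MEM1 and MEM2 in
   MODE (DImode for core registers, DFmode for FP registers).  */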
1919 static rtx
1920 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
1921 rtx reg2)
1923 switch (mode)
1925 case DImode:
1926 return gen_store_pairdi (mem1, reg1, mem2, reg2);
1928 case DFmode:
1929 return gen_store_pairdf (mem1, reg1, mem2, reg2);
1931 default:
1932 gcc_unreachable ();
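/* Return a load-pair pattern restoring REG1 and REG2 from MEM1 and MEM2,
   the counterpart of aarch64_gen_store_pair above.  */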
1936 static rtx
1937 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
1938 rtx mem2)
1940 switch (mode)
1942 case DImode:
1943 return gen_load_pairdi (reg1, mem1, reg2, mem2);
1945 case DFmode:
1946 return gen_load_pairdf (reg1, mem1, reg2, mem2);
1948 default:
1949 gcc_unreachable ();
1954 /* Save or restore (according to RESTORE) the callee-saved registers of MODE
1955 in the range [START, LIMIT], at START_OFFSET from the stack pointer. */
1956 static void
1957 aarch64_save_or_restore_callee_saves (enum machine_mode mode,
1958 HOST_WIDE_INT start_offset,
1959 unsigned start, unsigned limit,
1960 bool restore)
1962 rtx insn;
1963 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
1964 ? gen_frame_mem : gen_rtx_MEM);
1965 unsigned regno;
1966 unsigned regno2;
1968 for (regno = aarch64_next_callee_save (start, limit);
1969 regno <= limit;
1970 regno = aarch64_next_callee_save (regno + 1, limit))
1972 rtx reg = gen_rtx_REG (mode, regno);
1973 rtx mem;
1975 HOST_WIDE_INT offset = start_offset
1976 + cfun->machine->frame.reg_offset[regno];
1977 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
1978 offset));
1980 regno2 = aarch64_next_callee_save (regno + 1, limit);
1982 if (regno2 <= limit
1983 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
1984 == cfun->machine->frame.reg_offset[regno2]))
1987 rtx reg2 = gen_rtx_REG (mode, regno2);
1988 rtx mem2;
1990 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
1991 mem2 = gen_mem_ref (mode,
1992 plus_constant (Pmode, stack_pointer_rtx, offset));
1993 if (restore == false)
1994 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
1995 reg2));
1996 else
1998 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2,
1999 mem2));
2000 add_reg_note (insn, REG_CFA_RESTORE, reg);
2001 add_reg_note (insn, REG_CFA_RESTORE, reg2);
2004 /* The first part of a frame-related parallel insn is
2005 always assumed to be relevant to the frame
2006 calculations; subsequent parts are only
2007 frame-related if explicitly marked. */
2008 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2009 regno = regno2;
2011 else
2013 if (restore == false)
2014 insn = emit_move_insn (mem, reg);
2015 else
2017 insn = emit_move_insn (reg, mem);
2018 add_reg_note (insn, REG_CFA_RESTORE, reg);
2021 RTX_FRAME_RELATED_P (insn) = 1;
2025 /* AArch64 stack frames generated by this compiler look like:
2027 +-------------------------------+
2029 | incoming stack arguments |
2031 +-------------------------------+
2032 | | <-- incoming stack pointer (aligned)
2033 | callee-allocated save area |
2034 | for register varargs |
2036 +-------------------------------+
2037 | local variables | <-- frame_pointer_rtx
2039 +-------------------------------+
2040 | padding0 | \
2041 +-------------------------------+ |
2042 | callee-saved registers | | frame.saved_regs_size
2043 +-------------------------------+ |
2044 | LR' | |
2045 +-------------------------------+ |
2046 | FP' | / <- hard_frame_pointer_rtx (aligned)
2047 +-------------------------------+
2048 | dynamic allocation |
2049 +-------------------------------+
2050 | padding |
2051 +-------------------------------+
2052 | outgoing stack arguments | <-- arg_pointer
2054 +-------------------------------+
2055 | | <-- stack_pointer_rtx (aligned)
2057 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2058 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2059 unchanged. */
2061 /* Generate the prologue instructions for entry into a function.
2062 Establish the stack frame by decreasing the stack pointer with a
2063 properly calculated size and, if necessary, create a frame record
2064 filled with the values of LR and previous frame pointer. The
2065 current FP is also set up if it is in use. */
2067 void
2068 aarch64_expand_prologue (void)
2070 /* sub sp, sp, #<frame_size>
2071 stp {fp, lr}, [sp, #<frame_size> - 16]
2072 add fp, sp, #<frame_size> - hardfp_offset
2073 stp {cs_reg}, [fp, #-16] etc.
2075 sub sp, sp, <final_adjustment_if_any>  */
2077 HOST_WIDE_INT frame_size, offset;
2078 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2079 rtx insn;
2081 aarch64_layout_frame ();
2083 if (flag_stack_usage_info)
2084 current_function_static_stack_size = cfun->machine->frame.frame_size;
2086 frame_size = cfun->machine->frame.frame_size;
2087 offset = cfun->machine->frame.frame_size;
2089 fp_offset = cfun->machine->frame.frame_size
2090 - cfun->machine->frame.hard_fp_offset;
2092 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2093 if (offset >= 512)
2095 /* When the frame is large, the stack pointer is first decremented to
2096 step over the callee-allocated save area for register varargs, the
2097 local variable area and/or the callee-saved register area. This
2098 allows the pre-index write-back store pair instructions to be used
2099 to set up the rest of the stack frame efficiently. */
2101 offset = cfun->machine->frame.hard_fp_offset;
2102 if (offset >= 512)
2103 offset = cfun->machine->frame.saved_regs_size;
2105 frame_size -= (offset + crtl->outgoing_args_size);
2106 fp_offset = 0;
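/* An adjustment of 0x1000000 or more cannot be expressed as at most two
   add/sub immediates (each 12 bits, optionally shifted left by 12), so
   build the constant in IP0 and add it to the stack pointer.  */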
2108 if (frame_size >= 0x1000000)
2110 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2111 emit_move_insn (op0, GEN_INT (-frame_size));
2112 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2113 aarch64_set_frame_expr (gen_rtx_SET
2114 (Pmode, stack_pointer_rtx,
2115 plus_constant (Pmode,
2116 stack_pointer_rtx,
2117 -frame_size)));
2119 else if (frame_size > 0)
2121 if ((frame_size & 0xfff) != frame_size)
2123 insn = emit_insn (gen_add2_insn
2124 (stack_pointer_rtx,
2125 GEN_INT (-(frame_size
2126 & ~(HOST_WIDE_INT)0xfff))));
2127 RTX_FRAME_RELATED_P (insn) = 1;
2129 if ((frame_size & 0xfff) != 0)
2131 insn = emit_insn (gen_add2_insn
2132 (stack_pointer_rtx,
2133 GEN_INT (-(frame_size
2134 & (HOST_WIDE_INT)0xfff))));
2135 RTX_FRAME_RELATED_P (insn) = 1;
2139 else
2140 frame_size = -1;
2142 if (offset > 0)
2144 /* Save the frame pointer and lr if the frame pointer is needed
2145 first. Make the frame pointer point to the location of the
2146 old frame pointer on the stack. */
2147 if (frame_pointer_needed)
2149 rtx mem_fp, mem_lr;
2151 if (fp_offset)
2153 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2154 GEN_INT (-offset)));
2155 RTX_FRAME_RELATED_P (insn) = 1;
2156 aarch64_set_frame_expr (gen_rtx_SET
2157 (Pmode, stack_pointer_rtx,
2158 gen_rtx_MINUS (Pmode,
2159 stack_pointer_rtx,
2160 GEN_INT (offset))));
2161 mem_fp = gen_frame_mem (DImode,
2162 plus_constant (Pmode,
2163 stack_pointer_rtx,
2164 fp_offset));
2165 mem_lr = gen_frame_mem (DImode,
2166 plus_constant (Pmode,
2167 stack_pointer_rtx,
2168 fp_offset
2169 + UNITS_PER_WORD));
2170 insn = emit_insn (gen_store_pairdi (mem_fp,
2171 hard_frame_pointer_rtx,
2172 mem_lr,
2173 gen_rtx_REG (DImode,
2174 LR_REGNUM)));
2176 else
2178 insn = emit_insn (gen_storewb_pairdi_di
2179 (stack_pointer_rtx, stack_pointer_rtx,
2180 hard_frame_pointer_rtx,
2181 gen_rtx_REG (DImode, LR_REGNUM),
2182 GEN_INT (-offset),
2183 GEN_INT (GET_MODE_SIZE (DImode) - offset)));
2184 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2187 /* The first part of a frame-related parallel insn is always
2188 assumed to be relevant to the frame calculations;
2189 subsequent parts are only frame-related if explicitly
2190 marked. */
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2194 /* Set up frame pointer to point to the location of the
2195 previous frame pointer on the stack. */
2196 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2197 stack_pointer_rtx,
2198 GEN_INT (fp_offset)));
2199 aarch64_set_frame_expr (gen_rtx_SET
2200 (Pmode, hard_frame_pointer_rtx,
2201 plus_constant (Pmode,
2202 stack_pointer_rtx,
2203 fp_offset)));
2204 RTX_FRAME_RELATED_P (insn) = 1;
2205 insn = emit_insn (gen_stack_tie (stack_pointer_rtx,
2206 hard_frame_pointer_rtx));
2208 else
2210 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2211 GEN_INT (-offset)));
2212 RTX_FRAME_RELATED_P (insn) = 1;
2215 aarch64_save_or_restore_callee_saves (DImode, fp_offset, R0_REGNUM,
2216 frame_pointer_needed
2217 ? R28_REGNUM : R30_REGNUM, false);
2218 aarch64_save_or_restore_callee_saves (DFmode, fp_offset, V0_REGNUM,
2219 V31_REGNUM, false);
2222 /* When offset >= 512, the outgoing argument area was excluded from the
2223 initial adjustment, so emit: sub sp, sp, #<outgoing_args_size> */
2224 if (frame_size > -1)
2226 if (crtl->outgoing_args_size > 0)
2228 insn = emit_insn (gen_add2_insn
2229 (stack_pointer_rtx,
2230 GEN_INT (- crtl->outgoing_args_size)));
2231 RTX_FRAME_RELATED_P (insn) = 1;
2236 /* Generate the epilogue instructions for returning from a function. */
2237 void
2238 aarch64_expand_epilogue (bool for_sibcall)
2240 HOST_WIDE_INT frame_size, offset;
2241 HOST_WIDE_INT fp_offset;
2242 rtx insn;
2243 rtx cfa_reg;
2245 aarch64_layout_frame ();
2247 offset = frame_size = cfun->machine->frame.frame_size;
2248 fp_offset = cfun->machine->frame.frame_size
2249 - cfun->machine->frame.hard_fp_offset;
2251 cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx;
2253 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2254 if (offset >= 512)
2256 offset = cfun->machine->frame.hard_fp_offset;
2257 if (offset >= 512)
2258 offset = cfun->machine->frame.saved_regs_size;
2260 frame_size -= (offset + crtl->outgoing_args_size);
2261 fp_offset = 0;
2262 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2264 insn = emit_insn (gen_add2_insn
2265 (stack_pointer_rtx,
2266 GEN_INT (crtl->outgoing_args_size)));
2267 RTX_FRAME_RELATED_P (insn) = 1;
2270 else
2271 frame_size = -1;
2273 /* If there were outgoing arguments or we've done dynamic stack
2274 allocation, then restore the stack pointer from the frame
2275 pointer. This is at most one insn and more efficient than using
2276 GCC's internal mechanism. */
2277 if (frame_pointer_needed
2278 && (crtl->outgoing_args_size || cfun->calls_alloca))
2280 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2281 hard_frame_pointer_rtx,
2282 GEN_INT (- fp_offset)));
2283 RTX_FRAME_RELATED_P (insn) = 1;
2284 /* As SP is set to (FP - fp_offset), according to the rules in
2285 dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
2286 from the value of SP from now on. */
2287 cfa_reg = stack_pointer_rtx;
2290 aarch64_save_or_restore_callee_saves (DImode, fp_offset, R0_REGNUM,
2291 frame_pointer_needed
2292 ? R28_REGNUM : R30_REGNUM, true);
2293 aarch64_save_or_restore_callee_saves (DFmode, fp_offset, V0_REGNUM,
2294 V31_REGNUM, true);
2296 /* Restore the frame pointer and lr if the frame pointer is needed. */
2297 if (offset > 0)
2299 if (frame_pointer_needed)
2301 rtx mem_fp, mem_lr;
2303 if (fp_offset)
2305 mem_fp = gen_frame_mem (DImode,
2306 plus_constant (Pmode,
2307 stack_pointer_rtx,
2308 fp_offset));
2309 mem_lr = gen_frame_mem (DImode,
2310 plus_constant (Pmode,
2311 stack_pointer_rtx,
2312 fp_offset
2313 + UNITS_PER_WORD));
2314 insn = emit_insn (gen_load_pairdi (hard_frame_pointer_rtx,
2315 mem_fp,
2316 gen_rtx_REG (DImode,
2317 LR_REGNUM),
2318 mem_lr));
2320 else
2322 insn = emit_insn (gen_loadwb_pairdi_di
2323 (stack_pointer_rtx,
2324 stack_pointer_rtx,
2325 hard_frame_pointer_rtx,
2326 gen_rtx_REG (DImode, LR_REGNUM),
2327 GEN_INT (offset),
2328 GEN_INT (GET_MODE_SIZE (DImode) + offset)));
2329 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2330 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2331 (gen_rtx_SET (Pmode, stack_pointer_rtx,
2332 plus_constant (Pmode, cfa_reg,
2333 offset))));
2336 /* The first part of a frame-related parallel insn
2337 is always assumed to be relevant to the frame
2338 calculations; subsequent parts are only
2339 frame-related if explicitly marked. */
2340 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2341 RTX_FRAME_RELATED_P (insn) = 1;
2342 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
2343 add_reg_note (insn, REG_CFA_RESTORE,
2344 gen_rtx_REG (DImode, LR_REGNUM));
2346 if (fp_offset)
2348 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2349 GEN_INT (offset)));
2350 RTX_FRAME_RELATED_P (insn) = 1;
2353 else
2355 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2356 GEN_INT (offset)));
2357 RTX_FRAME_RELATED_P (insn) = 1;
2361 /* Stack adjustment for exception handler. */
2362 if (crtl->calls_eh_return)
2364 /* We need to unwind the stack by the offset computed by
2365 EH_RETURN_STACKADJ_RTX. However, at this point the CFA is
2366 based on SP. Ideally we would update the SP and define the
2367 CFA along the lines of:
2369 SP = SP + EH_RETURN_STACKADJ_RTX
2370 (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
2372 However the dwarf emitter only understands a constant
2373 register offset.
2375 The solution chosen here is to use the otherwise unused IP0
2376 as a temporary register to hold the current SP value. The
2377 CFA is described using IP0 then SP is modified. */
2379 rtx ip0 = gen_rtx_REG (DImode, IP0_REGNUM);
2381 insn = emit_move_insn (ip0, stack_pointer_rtx);
2382 add_reg_note (insn, REG_CFA_DEF_CFA, ip0);
2383 RTX_FRAME_RELATED_P (insn) = 1;
2385 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2387 /* Ensure the assignment to IP0 does not get optimized away. */
2388 emit_use (ip0);
2391 if (frame_size > -1)
2393 if (frame_size >= 0x1000000)
2395 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2396 emit_move_insn (op0, GEN_INT (frame_size));
2397 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2398 aarch64_set_frame_expr (gen_rtx_SET
2399 (Pmode, stack_pointer_rtx,
2400 plus_constant (Pmode,
2401 stack_pointer_rtx,
2402 frame_size)));
2404 else if (frame_size > 0)
2406 if ((frame_size & 0xfff) != 0)
2408 insn = emit_insn (gen_add2_insn
2409 (stack_pointer_rtx,
2410 GEN_INT ((frame_size
2411 & (HOST_WIDE_INT) 0xfff))));
2412 RTX_FRAME_RELATED_P (insn) = 1;
2414 if ((frame_size & 0xfff) != frame_size)
2416 insn = emit_insn (gen_add2_insn
2417 (stack_pointer_rtx,
2418 GEN_INT ((frame_size
2419 & ~ (HOST_WIDE_INT) 0xfff))));
2420 RTX_FRAME_RELATED_P (insn) = 1;
2424 aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx,
2425 plus_constant (Pmode,
2426 stack_pointer_rtx,
2427 offset)));
2430 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2431 if (!for_sibcall)
2432 emit_jump_insn (ret_rtx);
2435 /* Return the place to copy the exception unwinding return address to.
2436 This will probably be a stack slot, but could (in theory) be the
2437 return register. */
2439 aarch64_final_eh_return_addr (void)
2441 HOST_WIDE_INT fp_offset;
2443 aarch64_layout_frame ();
2445 fp_offset = cfun->machine->frame.frame_size
2446 - cfun->machine->frame.hard_fp_offset;
2448 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2449 return gen_rtx_REG (DImode, LR_REGNUM);
2451 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2452 result in a store to save LR introduced by builtin_eh_return () being
2453 incorrectly deleted because the alias is not detected.
2454 So in the calculation of the address to copy the exception unwinding
2455 return address to, we distinguish two cases.
2456 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2457 we return a SP-relative location since all the addresses are SP-relative
2458 in this case. This prevents the store from being optimized away.
2459 If the fp_offset is not 0, then the addresses will be FP-relative and
2460 therefore we return a FP-relative location. */
2462 if (frame_pointer_needed)
2464 if (fp_offset)
2465 return gen_frame_mem (DImode,
2466 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2467 else
2468 return gen_frame_mem (DImode,
2469 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2472 /* If FP is not needed, we calculate the location of LR, which would be
2473 at the top of the saved registers block. */
2475 return gen_frame_mem (DImode,
2476 plus_constant (Pmode,
2477 stack_pointer_rtx,
2478 fp_offset
2479 + cfun->machine->frame.saved_regs_size
2480 - 2 * UNITS_PER_WORD));
2483 /* Possibly output code to build up a constant in a register. For
2484 the benefit of the costs infrastructure, returns the number of
2485 instructions which would be emitted. GENERATE inhibits or
2486 enables code generation. */
2488 static int
2489 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2491 int insns = 0;
2493 if (aarch64_bitmask_imm (val, DImode))
2495 if (generate)
2496 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2497 insns = 1;
2499 else
2501 int i;
2502 int ncount = 0;
2503 int zcount = 0;
2504 HOST_WIDE_INT valp = val >> 16;
2505 HOST_WIDE_INT valm;
2506 HOST_WIDE_INT tval;
2508 for (i = 16; i < 64; i += 16)
2510 valm = (valp & 0xffff);
2512 if (valm != 0)
2513 ++ zcount;
2515 if (valm != 0xffff)
2516 ++ ncount;
2518 valp >>= 16;
2521 /* zcount contains the number of additional MOVK instructions
2522 required if the constant is built up with an initial MOVZ instruction,
2523 while ncount is the number of MOVK instructions required if starting
2524 with a MOVN instruction. Choose the sequence that yields the fewer
2525 instructions, preferring MOVZ instructions when the two counts are
2526 the same. */
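/* For example, 0xffffffffffff1234 needs just one MOVN, since every 16-bit
   chunk above the bottom one is 0xffff, whereas starting from MOVZ would
   take one MOVZ plus three MOVKs.  */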
2527 if (ncount < zcount)
2529 if (generate)
2530 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2531 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2532 tval = 0xffff;
2533 insns++;
2535 else
2537 if (generate)
2538 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2539 GEN_INT (val & 0xffff));
2540 tval = 0;
2541 insns++;
2544 val >>= 16;
2546 for (i = 16; i < 64; i += 16)
2548 if ((val & 0xffff) != tval)
2550 if (generate)
2551 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2552 GEN_INT (i),
2553 GEN_INT (val & 0xffff)));
2554 insns++;
2556 val >>= 16;
2559 return insns;
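/* Add DELTA to the register numbered REGNUM, using the register numbered
   SCRATCHREG to hold an intermediate value when DELTA is too large for an
   immediate operand.  */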
2562 static void
2563 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2565 HOST_WIDE_INT mdelta = delta;
2566 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2567 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2569 if (mdelta < 0)
2570 mdelta = -mdelta;
2572 if (mdelta >= 4096 * 4096)
2574 (void) aarch64_build_constant (scratchreg, delta, true);
2575 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2577 else if (mdelta > 0)
2579 if (mdelta >= 4096)
2581 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2582 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2583 if (delta < 0)
2584 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2585 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2586 else
2587 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2588 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2590 if (mdelta % 4096 != 0)
2592 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2593 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2594 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2599 /* Output code to add DELTA to the first argument, and then jump
2600 to FUNCTION. Used for C++ multiple inheritance. */
2601 static void
2602 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2603 HOST_WIDE_INT delta,
2604 HOST_WIDE_INT vcall_offset,
2605 tree function)
2607 /* The this pointer is always in x0. Note that this differs from
2608 Arm, where the this pointer may be bumped to r1 if r0 is required
2609 to return a pointer to an aggregate. On AArch64 a result value
2610 pointer will be in x8. */
2611 int this_regno = R0_REGNUM;
2612 rtx this_rtx, temp0, temp1, addr, insn, funexp;
2614 reload_completed = 1;
2615 emit_note (NOTE_INSN_PROLOGUE_END);
2617 if (vcall_offset == 0)
2618 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2619 else
2621 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2623 this_rtx = gen_rtx_REG (Pmode, this_regno);
2624 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2625 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2627 addr = this_rtx;
2628 if (delta != 0)
2630 if (delta >= -256 && delta < 256)
2631 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2632 plus_constant (Pmode, this_rtx, delta));
2633 else
2634 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2637 if (Pmode == ptr_mode)
2638 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2639 else
2640 aarch64_emit_move (temp0,
2641 gen_rtx_ZERO_EXTEND (Pmode,
2642 gen_rtx_MEM (ptr_mode, addr)));
2644 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2645 addr = plus_constant (Pmode, temp0, vcall_offset);
2646 else
2648 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2649 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2652 if (Pmode == ptr_mode)
2653 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2654 else
2655 aarch64_emit_move (temp1,
2656 gen_rtx_SIGN_EXTEND (Pmode,
2657 gen_rtx_MEM (ptr_mode, addr)));
2659 emit_insn (gen_add2_insn (this_rtx, temp1));
2662 /* Generate a tail call to the target function. */
2663 if (!TREE_USED (function))
2665 assemble_external (function);
2666 TREE_USED (function) = 1;
2668 funexp = XEXP (DECL_RTL (function), 0);
2669 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2670 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2671 SIBLING_CALL_P (insn) = 1;
2673 insn = get_insns ();
2674 shorten_branches (insn);
2675 final_start_function (insn, file, 1);
2676 final (insn, file, 1);
2677 final_end_function ();
2679 /* Stop pretending to be a post-reload pass. */
2680 reload_completed = 0;
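/* Subroutine of aarch64_tls_referenced_p, called via for_each_rtx.
   Return nonzero if *X is a SYMBOL_REF with a non-zero TLS model.  */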
2683 static int
2684 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2686 if (GET_CODE (*x) == SYMBOL_REF)
2687 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2689 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2690 TLS offsets, not real symbol references. */
2691 if (GET_CODE (*x) == UNSPEC
2692 && XINT (*x, 1) == UNSPEC_TLS)
2693 return -1;
2695 return 0;
2698 static bool
2699 aarch64_tls_referenced_p (rtx x)
2701 if (!TARGET_HAVE_TLS)
2702 return false;
2704 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
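/* Comparison function for qsort and bsearch over the table of bitmask
   immediates.  */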
2708 static int
2709 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2711 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2712 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2714 if (*imm1 < *imm2)
2715 return -1;
2716 if (*imm1 > *imm2)
2717 return +1;
2718 return 0;
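/* Fill aarch64_bitmasks with every value encodable as a logical (bitmask)
   immediate and sort the table so aarch64_bitmask_imm can use bsearch.  */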
2722 static void
2723 aarch64_build_bitmask_table (void)
2725 unsigned HOST_WIDE_INT mask, imm;
2726 unsigned int log_e, e, s, r;
2727 unsigned int nimms = 0;
2729 for (log_e = 1; log_e <= 6; log_e++)
2731 e = 1 << log_e;
2732 if (e == 64)
2733 mask = ~(HOST_WIDE_INT) 0;
2734 else
2735 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2736 for (s = 1; s < e; s++)
2738 for (r = 0; r < e; r++)
2740 /* set s consecutive bits to 1 (s < 64) */
2741 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2742 /* rotate right by r */
2743 if (r != 0)
2744 imm = ((imm >> r) | (imm << (e - r))) & mask;
2745 /* replicate the constant depending on SIMD size */
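/* Each non-final case below deliberately falls through to the next,
   doubling the replication width until the pattern fills 64 bits.  */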
2746 switch (log_e) {
2747 case 1: imm |= (imm << 2);
2748 case 2: imm |= (imm << 4);
2749 case 3: imm |= (imm << 8);
2750 case 4: imm |= (imm << 16);
2751 case 5: imm |= (imm << 32);
2752 case 6:
2753 break;
2754 default:
2755 gcc_unreachable ();
2757 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2758 aarch64_bitmasks[nimms++] = imm;
2763 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2764 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2765 aarch64_bitmasks_cmp);
2769 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2770 a left shift of 0 or 12 bits. */
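/* For example, 0xabc and 0xabc000 can be encoded, but 0xabc0 cannot,
   since it straddles the two 12-bit fields.  */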
2771 bool
2772 aarch64_uimm12_shift (HOST_WIDE_INT val)
2774 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2775 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
2780 /* Return true if val is an immediate that can be loaded into a
2781 register by a MOVZ instruction. */
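/* For example, 0x12340000 can be loaded with a single
   movz Xd, #0x1234, lsl 16, while 0x12345678 has two non-zero 16-bit
   chunks and cannot.  */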
2782 static bool
2783 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2785 if (GET_MODE_SIZE (mode) > 4)
2787 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2788 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2789 return 1;
2791 else
2793 /* Ignore sign extension. */
2794 val &= (HOST_WIDE_INT) 0xffffffff;
2796 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2797 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2801 /* Return true if val is a valid bitmask immediate. */
2802 bool
2803 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2805 if (GET_MODE_SIZE (mode) < 8)
2807 /* Replicate bit pattern. */
2808 val &= (HOST_WIDE_INT) 0xffffffff;
2809 val |= val << 32;
2811 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2812 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2816 /* Return true if val is an immediate that can be loaded into a
2817 register in a single instruction. */
2818 bool
2819 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2821 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2822 return 1;
2823 return aarch64_bitmask_imm (val, mode);
2826 static bool
2827 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2829 rtx base, offset;
2831 if (GET_CODE (x) == HIGH)
2832 return true;
2834 split_const (x, &base, &offset);
2835 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2837 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2838 != SYMBOL_FORCE_TO_MEM)
2839 return true;
2840 else
2841 /* Avoid generating a 64-bit relocation in ILP32; leave
2842 to aarch64_expand_mov_immediate to handle it properly. */
2843 return mode != ptr_mode;
2846 return aarch64_tls_referenced_p (x);
2849 /* Return true if register REGNO is a valid index register.
2850 STRICT_P is true if REG_OK_STRICT is in effect. */
2852 bool
2853 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2855 if (!HARD_REGISTER_NUM_P (regno))
2857 if (!strict_p)
2858 return true;
2860 if (!reg_renumber)
2861 return false;
2863 regno = reg_renumber[regno];
2865 return GP_REGNUM_P (regno);
2868 /* Return true if register REGNO is a valid base register.
2869 STRICT_P is true if REG_OK_STRICT is in effect. */
2871 bool
2872 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2874 if (!HARD_REGISTER_NUM_P (regno))
2876 if (!strict_p)
2877 return true;
2879 if (!reg_renumber)
2880 return false;
2882 regno = reg_renumber[regno];
2885 /* The fake registers will be eliminated to either the stack or
2886 hard frame pointer, both of which are usually valid base registers.
2887 Reload deals with the cases where the eliminated form isn't valid. */
2888 return (GP_REGNUM_P (regno)
2889 || regno == SP_REGNUM
2890 || regno == FRAME_POINTER_REGNUM
2891 || regno == ARG_POINTER_REGNUM);
2894 /* Return true if X is a valid base register.
2895 STRICT_P is true if REG_OK_STRICT is in effect. */
2897 static bool
2898 aarch64_base_register_rtx_p (rtx x, bool strict_p)
2900 if (!strict_p && GET_CODE (x) == SUBREG)
2901 x = SUBREG_REG (x);
2903 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
2906 /* Return true if address offset is a valid index. If it is, fill in INFO
2907 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
2909 static bool
2910 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
2911 enum machine_mode mode, bool strict_p)
2913 enum aarch64_address_type type;
2914 rtx index;
2915 int shift;
2917 /* (reg:P) */
2918 if ((REG_P (x) || GET_CODE (x) == SUBREG)
2919 && GET_MODE (x) == Pmode)
2921 type = ADDRESS_REG_REG;
2922 index = x;
2923 shift = 0;
2925 /* (sign_extend:DI (reg:SI)) */
2926 else if ((GET_CODE (x) == SIGN_EXTEND
2927 || GET_CODE (x) == ZERO_EXTEND)
2928 && GET_MODE (x) == DImode
2929 && GET_MODE (XEXP (x, 0)) == SImode)
2931 type = (GET_CODE (x) == SIGN_EXTEND)
2932 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2933 index = XEXP (x, 0);
2934 shift = 0;
2936 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
2937 else if (GET_CODE (x) == MULT
2938 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2939 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2940 && GET_MODE (XEXP (x, 0)) == DImode
2941 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2942 && CONST_INT_P (XEXP (x, 1)))
2944 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
2945 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2946 index = XEXP (XEXP (x, 0), 0);
2947 shift = exact_log2 (INTVAL (XEXP (x, 1)));
2949 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
2950 else if (GET_CODE (x) == ASHIFT
2951 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2952 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2953 && GET_MODE (XEXP (x, 0)) == DImode
2954 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2955 && CONST_INT_P (XEXP (x, 1)))
2957 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
2958 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2959 index = XEXP (XEXP (x, 0), 0);
2960 shift = INTVAL (XEXP (x, 1));
2962 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
2963 else if ((GET_CODE (x) == SIGN_EXTRACT
2964 || GET_CODE (x) == ZERO_EXTRACT)
2965 && GET_MODE (x) == DImode
2966 && GET_CODE (XEXP (x, 0)) == MULT
2967 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
2968 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
2970 type = (GET_CODE (x) == SIGN_EXTRACT)
2971 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2972 index = XEXP (XEXP (x, 0), 0);
2973 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
2974 if (INTVAL (XEXP (x, 1)) != 32 + shift
2975 || INTVAL (XEXP (x, 2)) != 0)
2976 shift = -1;
2978 /* (and:DI (mult:DI (reg:DI) (const_int scale))
2979 (const_int 0xffffffff<<shift)) */
2980 else if (GET_CODE (x) == AND
2981 && GET_MODE (x) == DImode
2982 && GET_CODE (XEXP (x, 0)) == MULT
2983 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
2984 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
2985 && CONST_INT_P (XEXP (x, 1)))
2987 type = ADDRESS_REG_UXTW;
2988 index = XEXP (XEXP (x, 0), 0);
2989 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
2990 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
2991 shift = -1;
2993 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
2994 else if ((GET_CODE (x) == SIGN_EXTRACT
2995 || GET_CODE (x) == ZERO_EXTRACT)
2996 && GET_MODE (x) == DImode
2997 && GET_CODE (XEXP (x, 0)) == ASHIFT
2998 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
2999 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3001 type = (GET_CODE (x) == SIGN_EXTRACT)
3002 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3003 index = XEXP (XEXP (x, 0), 0);
3004 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3005 if (INTVAL (XEXP (x, 1)) != 32 + shift
3006 || INTVAL (XEXP (x, 2)) != 0)
3007 shift = -1;
3009 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3010 (const_int 0xffffffff<<shift)) */
3011 else if (GET_CODE (x) == AND
3012 && GET_MODE (x) == DImode
3013 && GET_CODE (XEXP (x, 0)) == ASHIFT
3014 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3015 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3016 && CONST_INT_P (XEXP (x, 1)))
3018 type = ADDRESS_REG_UXTW;
3019 index = XEXP (XEXP (x, 0), 0);
3020 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3021 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3022 shift = -1;
3024 /* (mult:P (reg:P) (const_int scale)) */
3025 else if (GET_CODE (x) == MULT
3026 && GET_MODE (x) == Pmode
3027 && GET_MODE (XEXP (x, 0)) == Pmode
3028 && CONST_INT_P (XEXP (x, 1)))
3030 type = ADDRESS_REG_REG;
3031 index = XEXP (x, 0);
3032 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3034 /* (ashift:P (reg:P) (const_int shift)) */
3035 else if (GET_CODE (x) == ASHIFT
3036 && GET_MODE (x) == Pmode
3037 && GET_MODE (XEXP (x, 0)) == Pmode
3038 && CONST_INT_P (XEXP (x, 1)))
3040 type = ADDRESS_REG_REG;
3041 index = XEXP (x, 0);
3042 shift = INTVAL (XEXP (x, 1));
3044 else
3045 return false;
3047 if (GET_CODE (index) == SUBREG)
3048 index = SUBREG_REG (index);
3050 if ((shift == 0 ||
3051 (shift > 0 && shift <= 3
3052 && (1 << shift) == GET_MODE_SIZE (mode)))
3053 && REG_P (index)
3054 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3056 info->type = type;
3057 info->offset = index;
3058 info->shift = shift;
3059 return true;
3062 return false;
3065 static inline bool
3066 offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3068 return (offset >= -64 * GET_MODE_SIZE (mode)
3069 && offset < 64 * GET_MODE_SIZE (mode)
3070 && offset % GET_MODE_SIZE (mode) == 0);
3073 static inline bool
3074 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3075 HOST_WIDE_INT offset)
3077 return offset >= -256 && offset < 256;
3080 static inline bool
3081 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3083 return (offset >= 0
3084 && offset < 4096 * GET_MODE_SIZE (mode)
3085 && offset % GET_MODE_SIZE (mode) == 0);
3088 /* Return true if X is a valid address for machine mode MODE. If it is,
3089 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3090 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3092 static bool
3093 aarch64_classify_address (struct aarch64_address_info *info,
3094 rtx x, enum machine_mode mode,
3095 RTX_CODE outer_code, bool strict_p)
3097 enum rtx_code code = GET_CODE (x);
3098 rtx op0, op1;
3099 bool allow_reg_index_p =
3100 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3101 || aarch64_vector_mode_supported_p (mode));
3102 /* Don't support anything other than POST_INC or REG addressing for
3103 AdvSIMD. */
3104 if (aarch64_vect_struct_mode_p (mode)
3105 && (code != POST_INC && code != REG))
3106 return false;
3108 switch (code)
3110 case REG:
3111 case SUBREG:
3112 info->type = ADDRESS_REG_IMM;
3113 info->base = x;
3114 info->offset = const0_rtx;
3115 return aarch64_base_register_rtx_p (x, strict_p);
3117 case PLUS:
3118 op0 = XEXP (x, 0);
3119 op1 = XEXP (x, 1);
3120 if (GET_MODE_SIZE (mode) != 0
3121 && CONST_INT_P (op1)
3122 && aarch64_base_register_rtx_p (op0, strict_p))
3124 HOST_WIDE_INT offset = INTVAL (op1);
3126 info->type = ADDRESS_REG_IMM;
3127 info->base = op0;
3128 info->offset = op1;
3130 /* TImode and TFmode values are allowed in both pairs of X
3131 registers and individual Q registers. The available
3132 address modes are:
3133 X,X: 7-bit signed scaled offset
3134 Q: 9-bit signed offset
3135 We conservatively require an offset representable in either mode. */
3137 if (mode == TImode || mode == TFmode)
3138 return (offset_7bit_signed_scaled_p (mode, offset)
3139 && offset_9bit_signed_unscaled_p (mode, offset));
3141 if (outer_code == PARALLEL)
3142 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3143 && offset_7bit_signed_scaled_p (mode, offset));
3144 else
3145 return (offset_9bit_signed_unscaled_p (mode, offset)
3146 || offset_12bit_unsigned_scaled_p (mode, offset));
3149 if (allow_reg_index_p)
3151 /* Look for base + (scaled/extended) index register. */
3152 if (aarch64_base_register_rtx_p (op0, strict_p)
3153 && aarch64_classify_index (info, op1, mode, strict_p))
3155 info->base = op0;
3156 return true;
3158 if (aarch64_base_register_rtx_p (op1, strict_p)
3159 && aarch64_classify_index (info, op0, mode, strict_p))
3161 info->base = op1;
3162 return true;
3166 return false;
3168 case POST_INC:
3169 case POST_DEC:
3170 case PRE_INC:
3171 case PRE_DEC:
3172 info->type = ADDRESS_REG_WB;
3173 info->base = XEXP (x, 0);
3174 info->offset = NULL_RTX;
3175 return aarch64_base_register_rtx_p (info->base, strict_p);
3177 case POST_MODIFY:
3178 case PRE_MODIFY:
3179 info->type = ADDRESS_REG_WB;
3180 info->base = XEXP (x, 0);
3181 if (GET_CODE (XEXP (x, 1)) == PLUS
3182 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3183 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3184 && aarch64_base_register_rtx_p (info->base, strict_p))
3186 HOST_WIDE_INT offset;
3187 info->offset = XEXP (XEXP (x, 1), 1);
3188 offset = INTVAL (info->offset);
3190 /* TImode and TFmode values are allowed in both pairs of X
3191 registers and individual Q registers. The available
3192 address modes are:
3193 X,X: 7-bit signed scaled offset
3194 Q: 9-bit signed offset
3195 We conservatively require an offset representable in either mode. */
3197 if (mode == TImode || mode == TFmode)
3198 return (offset_7bit_signed_scaled_p (mode, offset)
3199 && offset_9bit_signed_unscaled_p (mode, offset));
3201 if (outer_code == PARALLEL)
3202 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3203 && offset_7bit_signed_scaled_p (mode, offset));
3204 else
3205 return offset_9bit_signed_unscaled_p (mode, offset);
3207 return false;
3209 case CONST:
3210 case SYMBOL_REF:
3211 case LABEL_REF:
3212 /* Load literal: a pc-relative constant pool entry. Only supported
3213 for SI mode or larger. */
3214 info->type = ADDRESS_SYMBOLIC;
3215 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3217 rtx sym, addend;
3219 split_const (x, &sym, &addend);
3220 return (GET_CODE (sym) == LABEL_REF
3221 || (GET_CODE (sym) == SYMBOL_REF
3222 && CONSTANT_POOL_ADDRESS_P (sym)));
3224 return false;
3226 case LO_SUM:
3227 info->type = ADDRESS_LO_SUM;
3228 info->base = XEXP (x, 0);
3229 info->offset = XEXP (x, 1);
3230 if (allow_reg_index_p
3231 && aarch64_base_register_rtx_p (info->base, strict_p))
3233 rtx sym, offs;
3234 split_const (info->offset, &sym, &offs);
3235 if (GET_CODE (sym) == SYMBOL_REF
3236 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3237 == SYMBOL_SMALL_ABSOLUTE))
3239 /* The symbol and offset must be aligned to the access size. */
3240 unsigned int align;
3241 unsigned int ref_size;
3243 if (CONSTANT_POOL_ADDRESS_P (sym))
3244 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3245 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3247 tree exp = SYMBOL_REF_DECL (sym);
3248 align = TYPE_ALIGN (TREE_TYPE (exp));
3249 align = CONSTANT_ALIGNMENT (exp, align);
3251 else if (SYMBOL_REF_DECL (sym))
3252 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3253 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3254 && SYMBOL_REF_BLOCK (sym) != NULL)
3255 align = SYMBOL_REF_BLOCK (sym)->alignment;
3256 else
3257 align = BITS_PER_UNIT;
3259 ref_size = GET_MODE_SIZE (mode);
3260 if (ref_size == 0)
3261 ref_size = GET_MODE_SIZE (DImode);
3263 return ((INTVAL (offs) & (ref_size - 1)) == 0
3264 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3267 return false;
3269 default:
3270 return false;
3274 bool
3275 aarch64_symbolic_address_p (rtx x)
3277 rtx offset;
3279 split_const (x, &x, &offset);
3280 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3283 /* Classify the base of symbolic expression X, given that X appears in
3284 context CONTEXT. */
3286 enum aarch64_symbol_type
3287 aarch64_classify_symbolic_expression (rtx x,
3288 enum aarch64_symbol_context context)
3290 rtx offset;
3292 split_const (x, &x, &offset);
3293 return aarch64_classify_symbol (x, context);
3297 /* Return TRUE if X is a legitimate address for accessing memory in
3298 mode MODE. */
3299 static bool
3300 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3302 struct aarch64_address_info addr;
3304 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3307 /* Return TRUE if X is a legitimate address for accessing memory in
3308 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3309 pair operation. */
3310 bool
3311 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3312 RTX_CODE outer_code, bool strict_p)
3314 struct aarch64_address_info addr;
3316 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3319 /* Return TRUE if rtx X is the immediate constant 0.0. */
3320 bool
3321 aarch64_float_const_zero_rtx_p (rtx x)
3323 REAL_VALUE_TYPE r;
3325 if (GET_MODE (x) == VOIDmode)
3326 return false;
3328 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3329 if (REAL_VALUE_MINUS_ZERO (r))
3330 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3331 return REAL_VALUES_EQUAL (r, dconst0);
3334 /* Return the fixed registers used for condition codes. */
3336 static bool
3337 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3339 *p1 = CC_REGNUM;
3340 *p2 = INVALID_REGNUM;
3341 return true;
3344 /* Emit call insn with PAT and do aarch64-specific handling. */
3346 void
3347 aarch64_emit_call_insn (rtx pat)
3349 rtx insn = emit_call_insn (pat);
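/* Record IP0 and IP1 in CALL_INSN_FUNCTION_USAGE as clobbered by the call;
   linker-generated code such as long-branch veneers or PLT stubs may use
   them across the call.  */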
3351 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3352 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3353 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3356 enum machine_mode
3357 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3359 /* All floating point compares return CCFP if it is an equality
3360 comparison, and CCFPE otherwise. */
3361 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3363 switch (code)
3365 case EQ:
3366 case NE:
3367 case UNORDERED:
3368 case ORDERED:
3369 case UNLT:
3370 case UNLE:
3371 case UNGT:
3372 case UNGE:
3373 case UNEQ:
3374 case LTGT:
3375 return CCFPmode;
3377 case LT:
3378 case LE:
3379 case GT:
3380 case GE:
3381 return CCFPEmode;
3383 default:
3384 gcc_unreachable ();
3388 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3389 && y == const0_rtx
3390 && (code == EQ || code == NE || code == LT || code == GE)
3391 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3392 || GET_CODE (x) == NEG))
3393 return CC_NZmode;
3395 /* A compare with a shifted operand. Because of canonicalization,
3396 the comparison will have to be swapped when we emit the assembly
3397 code. */
3398 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3399 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3400 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3401 || GET_CODE (x) == LSHIFTRT
3402 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3403 return CC_SWPmode;
3405 /* Similarly for a negated operand, but we can only do this for
3406 equalities. */
3407 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3408 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3409 && (code == EQ || code == NE)
3410 && GET_CODE (x) == NEG)
3411 return CC_Zmode;
3413 /* A compare of a mode narrower than SI mode against zero can be done
3414 by extending the value in the comparison. */
3415 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3416 && y == const0_rtx)
3417 /* Only use sign-extension if we really need it. */
3418 return ((code == GT || code == GE || code == LE || code == LT)
3419 ? CC_SESWPmode : CC_ZESWPmode);
3421 /* For everything else, return CCmode. */
3422 return CCmode;
3425 static unsigned
3426 aarch64_get_condition_code (rtx x)
3428 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3429 enum rtx_code comp_code = GET_CODE (x);
3431 if (GET_MODE_CLASS (mode) != MODE_CC)
3432 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3434 switch (mode)
3436 case CCFPmode:
3437 case CCFPEmode:
3438 switch (comp_code)
3440 case GE: return AARCH64_GE;
3441 case GT: return AARCH64_GT;
3442 case LE: return AARCH64_LS;
3443 case LT: return AARCH64_MI;
3444 case NE: return AARCH64_NE;
3445 case EQ: return AARCH64_EQ;
3446 case ORDERED: return AARCH64_VC;
3447 case UNORDERED: return AARCH64_VS;
3448 case UNLT: return AARCH64_LT;
3449 case UNLE: return AARCH64_LE;
3450 case UNGT: return AARCH64_HI;
3451 case UNGE: return AARCH64_PL;
3452 default: gcc_unreachable ();
3454 break;
3456 case CCmode:
3457 switch (comp_code)
3459 case NE: return AARCH64_NE;
3460 case EQ: return AARCH64_EQ;
3461 case GE: return AARCH64_GE;
3462 case GT: return AARCH64_GT;
3463 case LE: return AARCH64_LE;
3464 case LT: return AARCH64_LT;
3465 case GEU: return AARCH64_CS;
3466 case GTU: return AARCH64_HI;
3467 case LEU: return AARCH64_LS;
3468 case LTU: return AARCH64_CC;
3469 default: gcc_unreachable ();
3471 break;
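/* The operands of the comparison were swapped when one of these modes was
   chosen (see aarch64_select_cc_mode), so the condition must be swapped
   too.  */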
3473 case CC_SWPmode:
3474 case CC_ZESWPmode:
3475 case CC_SESWPmode:
3476 switch (comp_code)
3478 case NE: return AARCH64_NE;
3479 case EQ: return AARCH64_EQ;
3480 case GE: return AARCH64_LE;
3481 case GT: return AARCH64_LT;
3482 case LE: return AARCH64_GE;
3483 case LT: return AARCH64_GT;
3484 case GEU: return AARCH64_LS;
3485 case GTU: return AARCH64_CC;
3486 case LEU: return AARCH64_CS;
3487 case LTU: return AARCH64_HI;
3488 default: gcc_unreachable ();
3490 break;
3492 case CC_NZmode:
3493 switch (comp_code)
3495 case NE: return AARCH64_NE;
3496 case EQ: return AARCH64_EQ;
3497 case GE: return AARCH64_PL;
3498 case LT: return AARCH64_MI;
3499 default: gcc_unreachable ();
3501 break;
3503 case CC_Zmode:
3504 switch (comp_code)
3506 case NE: return AARCH64_NE;
3507 case EQ: return AARCH64_EQ;
3508 default: gcc_unreachable ();
3510 break;
3512 default:
3513 gcc_unreachable ();
3514 break;
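/* Return the number of bits set in VALUE, clearing the lowest set bit on
   each iteration (Kernighan's method).  */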
3518 static unsigned
3519 bit_count (unsigned HOST_WIDE_INT value)
3521 unsigned count = 0;
3523 while (value)
3525 count++;
3526 value &= value - 1;
3529 return count;
3532 void
3533 aarch64_print_operand (FILE *f, rtx x, char code)
3535 switch (code)
3537 /* An integer or symbol address without a preceding # sign. */
3538 case 'c':
3539 switch (GET_CODE (x))
3541 case CONST_INT:
3542 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3543 break;
3545 case SYMBOL_REF:
3546 output_addr_const (f, x);
3547 break;
3549 case CONST:
3550 if (GET_CODE (XEXP (x, 0)) == PLUS
3551 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3553 output_addr_const (f, x);
3554 break;
3556 /* Fall through. */
3558 default:
3559 output_operand_lossage ("Unsupported operand for code '%c'", code);
3561 break;
3563 case 'e':
3564 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3566 int n;
3568 if (GET_CODE (x) != CONST_INT
3569 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3571 output_operand_lossage ("invalid operand for '%%%c'", code);
3572 return;
3575 switch (n)
3577 case 3:
3578 fputc ('b', f);
3579 break;
3580 case 4:
3581 fputc ('h', f);
3582 break;
3583 case 5:
3584 fputc ('w', f);
3585 break;
3586 default:
3587 output_operand_lossage ("invalid operand for '%%%c'", code);
3588 return;
3591 break;
3593 case 'p':
3595 int n;
3597 /* Print N such that 2^N == X. */
3598 if (GET_CODE (x) != CONST_INT || (n = exact_log2 (INTVAL (x))) < 0)
3600 output_operand_lossage ("invalid operand for '%%%c'", code);
3601 return;
3604 asm_fprintf (f, "%d", n);
3606 break;
3608 case 'P':
3609 /* Print the number of non-zero bits in X (a const_int). */
3610 if (GET_CODE (x) != CONST_INT)
3612 output_operand_lossage ("invalid operand for '%%%c'", code);
3613 return;
3616 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3617 break;
3619 case 'H':
3620 /* Print the higher numbered register of a pair (TImode) of regs. */
3621 if (GET_CODE (x) != REG || !GP_REGNUM_P (REGNO (x) + 1))
3623 output_operand_lossage ("invalid operand for '%%%c'", code);
3624 return;
3627 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3628 break;
3630 case 'm':
3631 /* Print a condition (eq, ne, etc). */
3633 /* CONST_TRUE_RTX means always -- that's the default. */
3634 if (x == const_true_rtx)
3635 return;
3637 if (!COMPARISON_P (x))
3639 output_operand_lossage ("invalid operand for '%%%c'", code);
3640 return;
3643 fputs (aarch64_condition_codes[aarch64_get_condition_code (x)], f);
3644 break;
3646 case 'M':
3647 /* Print the inverse of a condition (eq <-> ne, etc). */
3649 /* CONST_TRUE_RTX means never -- that's the default. */
3650 if (x == const_true_rtx)
3652 fputs ("nv", f);
3653 return;
3656 if (!COMPARISON_P (x))
3658 output_operand_lossage ("invalid operand for '%%%c'", code);
3659 return;
3662 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3663 (aarch64_get_condition_code (x))], f);
3664 break;
3666 case 'b':
3667 case 'h':
3668 case 's':
3669 case 'd':
3670 case 'q':
3671 /* Print a scalar FP/SIMD register name. */
3672 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3674 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3675 return;
3677 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3678 break;
3680 case 'S':
3681 case 'T':
3682 case 'U':
3683 case 'V':
3684 /* Print the first FP/SIMD register name in a list. */
3685 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3687 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3688 return;
3690 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3691 break;
3693 case 'X':
3694 /* Print bottom 16 bits of integer constant in hex. */
3695 if (GET_CODE (x) != CONST_INT)
3697 output_operand_lossage ("invalid operand for '%%%c'", code);
3698 return;
3700 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3701 break;
3703 case 'w':
3704 case 'x':
3705 /* Print a general register name or the zero register (32-bit or
3706 64-bit). */
3707 if (x == const0_rtx
3708 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3710 asm_fprintf (f, "%czr", code);
3711 break;
3714 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3716 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3717 break;
3720 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3722 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3723 break;
3726 /* Fall through */
3728 case 0:
3729 /* Print a normal operand. If it is a general register, then we
3730 assume DImode. */
3731 if (x == NULL)
3733 output_operand_lossage ("missing operand");
3734 return;
3737 switch (GET_CODE (x))
3739 case REG:
3740 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3741 break;
3743 case MEM:
3744 aarch64_memory_reference_mode = GET_MODE (x);
3745 output_address (XEXP (x, 0));
3746 break;
3748 case LABEL_REF:
3749 case SYMBOL_REF:
3750 output_addr_const (asm_out_file, x);
3751 break;
3753 case CONST_INT:
3754 asm_fprintf (f, "%wd", INTVAL (x));
3755 break;
3757 case CONST_VECTOR:
3758 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3760 gcc_assert (aarch64_const_vec_all_same_int_p (x,
3761 HOST_WIDE_INT_MIN,
3762 HOST_WIDE_INT_MAX));
3763 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3765 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3767 fputc ('0', f);
3769 else
3770 gcc_unreachable ();
3771 break;
3773 case CONST_DOUBLE:
3774 /* CONST_DOUBLE can represent a double-width integer.
3775 In this case, the mode of x is VOIDmode. */
3776 if (GET_MODE (x) == VOIDmode)
3777 ; /* Do Nothing. */
3778 else if (aarch64_float_const_zero_rtx_p (x))
3780 fputc ('0', f);
3781 break;
3783 else if (aarch64_float_const_representable_p (x))
3785 #define buf_size 20
3786 char float_buf[buf_size] = {'\0'};
3787 REAL_VALUE_TYPE r;
3788 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3789 real_to_decimal_for_mode (float_buf, &r,
3790 buf_size, buf_size,
3791 1, GET_MODE (x));
3792 asm_fprintf (asm_out_file, "%s", float_buf);
3793 break;
3794 #undef buf_size
3796 output_operand_lossage ("invalid constant");
3797 return;
3798 default:
3799 output_operand_lossage ("invalid operand");
3800 return;
3802 break;
3804 case 'A':
3805 if (GET_CODE (x) == HIGH)
3806 x = XEXP (x, 0);
3808 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3810 case SYMBOL_SMALL_GOT:
3811 asm_fprintf (asm_out_file, ":got:");
3812 break;
3814 case SYMBOL_SMALL_TLSGD:
3815 asm_fprintf (asm_out_file, ":tlsgd:");
3816 break;
3818 case SYMBOL_SMALL_TLSDESC:
3819 asm_fprintf (asm_out_file, ":tlsdesc:");
3820 break;
3822 case SYMBOL_SMALL_GOTTPREL:
3823 asm_fprintf (asm_out_file, ":gottprel:");
3824 break;
3826 case SYMBOL_SMALL_TPREL:
3827 asm_fprintf (asm_out_file, ":tprel:");
3828 break;
3830 case SYMBOL_TINY_GOT:
3831 gcc_unreachable ();
3832 break;
3834 default:
3835 break;
3837 output_addr_const (asm_out_file, x);
3838 break;
3840 case 'L':
3841 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3843 case SYMBOL_SMALL_GOT:
3844 asm_fprintf (asm_out_file, ":lo12:");
3845 break;
3847 case SYMBOL_SMALL_TLSGD:
3848 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3849 break;
3851 case SYMBOL_SMALL_TLSDESC:
3852 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3853 break;
3855 case SYMBOL_SMALL_GOTTPREL:
3856 asm_fprintf (asm_out_file, ":gottprel_lo12:");
3857 break;
3859 case SYMBOL_SMALL_TPREL:
3860 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3861 break;
3863 case SYMBOL_TINY_GOT:
3864 asm_fprintf (asm_out_file, ":got:");
3865 break;
3867 default:
3868 break;
3870 output_addr_const (asm_out_file, x);
3871 break;
3873 case 'G':
3875 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3877 case SYMBOL_SMALL_TPREL:
3878 asm_fprintf (asm_out_file, ":tprel_hi12:");
3879 break;
3880 default:
3881 break;
3883 output_addr_const (asm_out_file, x);
3884 break;
3886 default:
3887 output_operand_lossage ("invalid operand prefix '%%%c'", code);
3888 return;
3892 void
3893 aarch64_print_operand_address (FILE *f, rtx x)
3895 struct aarch64_address_info addr;
3897 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
3898 MEM, true))
3899 switch (addr.type)
3901 case ADDRESS_REG_IMM:
3902 if (addr.offset == const0_rtx)
3903 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
3904 else
3905 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
3906 INTVAL (addr.offset));
3907 return;
3909 case ADDRESS_REG_REG:
3910 if (addr.shift == 0)
3911 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
3912 reg_names [REGNO (addr.offset)]);
3913 else
3914 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
3915 reg_names [REGNO (addr.offset)], addr.shift);
3916 return;
3918 case ADDRESS_REG_UXTW:
3919 if (addr.shift == 0)
3920 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
3921 REGNO (addr.offset) - R0_REGNUM);
3922 else
3923 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
3924 REGNO (addr.offset) - R0_REGNUM, addr.shift);
3925 return;
3927 case ADDRESS_REG_SXTW:
3928 if (addr.shift == 0)
3929 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
3930 REGNO (addr.offset) - R0_REGNUM);
3931 else
3932 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
3933 REGNO (addr.offset) - R0_REGNUM, addr.shift);
3934 return;
3936 case ADDRESS_REG_WB:
3937 switch (GET_CODE (x))
3939 case PRE_INC:
3940 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
3941 GET_MODE_SIZE (aarch64_memory_reference_mode));
3942 return;
3943 case POST_INC:
3944 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
3945 GET_MODE_SIZE (aarch64_memory_reference_mode));
3946 return;
3947 case PRE_DEC:
3948 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
3949 GET_MODE_SIZE (aarch64_memory_reference_mode));
3950 return;
3951 case POST_DEC:
3952 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
3953 GET_MODE_SIZE (aarch64_memory_reference_mode));
3954 return;
3955 case PRE_MODIFY:
3956 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
3957 INTVAL (addr.offset));
3958 return;
3959 case POST_MODIFY:
3960 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
3961 INTVAL (addr.offset));
3962 return;
3963 default:
3964 break;
3966 break;
3968 case ADDRESS_LO_SUM:
3969 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
3970 output_addr_const (f, addr.offset);
3971 asm_fprintf (f, "]");
3972 return;
3974 case ADDRESS_SYMBOLIC:
3975 break;
3978 output_addr_const (f, x);
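/* Return true if the rtx X mentions a label, ignoring the LABEL_REFs found
   inside UNSPEC_TLS, which are really constant offsets.  */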
3981 bool
3982 aarch64_label_mentioned_p (rtx x)
3984 const char *fmt;
3985 int i;
3987 if (GET_CODE (x) == LABEL_REF)
3988 return true;
3990 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
3991 referencing instruction, but they are constant offsets, not
3992 symbols. */
3993 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3994 return false;
3996 fmt = GET_RTX_FORMAT (GET_CODE (x));
3997 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3999 if (fmt[i] == 'E')
4001 int j;
4003 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4004 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4005 return 1;
4007 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4008 return 1;
4011 return 0;
4014 /* Implement REGNO_REG_CLASS. */
4016 enum reg_class
4017 aarch64_regno_regclass (unsigned regno)
4019 if (GP_REGNUM_P (regno))
4020 return GENERAL_REGS;
4022 if (regno == SP_REGNUM)
4023 return STACK_REG;
4025 if (regno == FRAME_POINTER_REGNUM
4026 || regno == ARG_POINTER_REGNUM)
4027 return POINTER_REGS;
4029 if (FP_REGNUM_P (regno))
4030 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4032 return NO_REGS;
4035 /* Try a machine-dependent way of reloading an illegitimate address
4036 operand. If we find one, push the reload and return the new rtx. */
4039 aarch64_legitimize_reload_address (rtx *x_p,
4040 enum machine_mode mode,
4041 int opnum, int type,
4042 int ind_levels ATTRIBUTE_UNUSED)
4044 rtx x = *x_p;
4046 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4047 if (aarch64_vect_struct_mode_p (mode)
4048 && GET_CODE (x) == PLUS
4049 && REG_P (XEXP (x, 0))
4050 && CONST_INT_P (XEXP (x, 1)))
4052 rtx orig_rtx = x;
4053 x = copy_rtx (x);
4054 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4055 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4056 opnum, (enum reload_type) type);
4057 return x;
4060 /* We must recognize output that we have already generated ourselves. */
4061 if (GET_CODE (x) == PLUS
4062 && GET_CODE (XEXP (x, 0)) == PLUS
4063 && REG_P (XEXP (XEXP (x, 0), 0))
4064 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4065 && CONST_INT_P (XEXP (x, 1)))
4067 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4068 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4069 opnum, (enum reload_type) type);
4070 return x;
4073 /* We wish to handle large displacements off a base register by splitting
4074 the addend across an add and the mem insn. This can cut the number of
4075 extra insns needed from 3 to 1. It is only useful for a load/store of a
4076 single register with a 12-bit offset field. */
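/* For example (hypothetical register numbers), a DImode access at offset
   0x13458 from x0 can be split as
        add x16, x0, #0x13000
        ldr x1, [x16, #0x458]
   instead of materialising the full offset in a register first.  */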
4077 if (GET_CODE (x) == PLUS
4078 && REG_P (XEXP (x, 0))
4079 && CONST_INT_P (XEXP (x, 1))
4080 && HARD_REGISTER_P (XEXP (x, 0))
4081 && mode != TImode
4082 && mode != TFmode
4083 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4085 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4086 HOST_WIDE_INT low = val & 0xfff;
4087 HOST_WIDE_INT high = val - low;
4088 HOST_WIDE_INT offs;
4089 rtx cst;
4090 enum machine_mode xmode = GET_MODE (x);
4092 /* In ILP32, xmode can be either DImode or SImode. */
4093 gcc_assert (xmode == DImode || xmode == SImode);
4095 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4096 BLKmode alignment. */
4097 if (GET_MODE_SIZE (mode) == 0)
4098 return NULL_RTX;
4100 offs = low % GET_MODE_SIZE (mode);
4102 /* Align misaligned offset by adjusting high part to compensate. */
4103 if (offs != 0)
4105 if (aarch64_uimm12_shift (high + offs))
4107 /* Align down. */
4108 low = low - offs;
4109 high = high + offs;
4111 else
4113 /* Align up. */
4114 offs = GET_MODE_SIZE (mode) - offs;
4115 low = low + offs;
4116 high = high + (low & 0x1000) - offs;
4117 low &= 0xfff;
4121 /* Check for overflow. */
4122 if (high + low != val)
4123 return NULL_RTX;
4125 cst = GEN_INT (high);
4126 if (!aarch64_uimm12_shift (high))
4127 cst = force_const_mem (xmode, cst);
4129 /* Reload high part into base reg, leaving the low part
4130 in the mem instruction.
4131 Note that replacing this gen_rtx_PLUS with plus_constant is
4132 wrong in this case because we rely on the
4133 (plus (plus reg c1) c2) structure being preserved so that
4134 XEXP (*p, 0) in push_reload below uses the correct term. */
4135 x = gen_rtx_PLUS (xmode,
4136 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4137 GEN_INT (low));
4139 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4140 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4141 opnum, (enum reload_type) type);
4142 return x;
4145 return NULL_RTX;
4149 static reg_class_t
4150 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4151 reg_class_t rclass,
4152 enum machine_mode mode,
4153 secondary_reload_info *sri)
4155 /* Without the TARGET_SIMD instructions we cannot move a Q register
4156 to a Q register directly. We need a scratch. */
4157 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4158 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4159 && reg_class_subset_p (rclass, FP_REGS))
4161 if (mode == TFmode)
4162 sri->icode = CODE_FOR_aarch64_reload_movtf;
4163 else if (mode == TImode)
4164 sri->icode = CODE_FOR_aarch64_reload_movti;
4165 return NO_REGS;
4168 /* A TFmode or TImode memory access should be handled via an FP_REGS
4169 register, because AArch64 has richer addressing modes for LDR/STR
4170 instructions than for LDP/STP instructions. */
4171 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4172 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4173 return FP_REGS;
4175 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4176 return GENERAL_REGS;
4178 return NO_REGS;
4181 static bool
4182 aarch64_can_eliminate (const int from, const int to)
4184 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4185 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4187 if (frame_pointer_needed)
4189 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4190 return true;
4191 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4192 return false;
4193 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4194 && !cfun->calls_alloca)
4195 return true;
4196 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4197 return true;
4199 return false;
4202 return true;
4205 HOST_WIDE_INT
4206 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4208 aarch64_layout_frame ();
4210 if (to == HARD_FRAME_POINTER_REGNUM)
4212 if (from == ARG_POINTER_REGNUM)
4213 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4215 if (from == FRAME_POINTER_REGNUM)
4216 return (cfun->machine->frame.hard_fp_offset
4217 - cfun->machine->frame.saved_varargs_size);
4220 if (to == STACK_POINTER_REGNUM)
4222 if (from == FRAME_POINTER_REGNUM)
4223 return (cfun->machine->frame.frame_size
4224 - cfun->machine->frame.saved_varargs_size);
4227 return cfun->machine->frame.frame_size;
4230 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4231 previous frame. */
4234 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4236 if (count != 0)
4237 return const0_rtx;
4238 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4242 static void
4243 aarch64_asm_trampoline_template (FILE *f)
4245 if (TARGET_ILP32)
4247 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4248 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4250 else
4252 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4253 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4255 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4256 assemble_aligned_integer (4, const0_rtx);
4257 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4258 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
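/* A sketch of the LP64 trampoline this template produces (offsets and
   register names are illustrative; IP1 is normally x17 and the static
   chain register x18):

	0:  ldr	x17, .+16	// target function address
	4:  ldr	x18, .+20	// static chain value
	8:  br	x17
       12:  .word  0		// padding up to tramp_code_sz bytes
       16:  <function address>	// written by aarch64_trampoline_init
       24:  <static chain>	// likewise

   aarch64_trampoline_init copies only the code part of the template and
   then fills in the two data words for each instance.  */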
4261 static void
4262 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4264 rtx fnaddr, mem, a_tramp;
4265 const int tramp_code_sz = 16;
4267 /* Don't need to copy the trailing D-words, we fill those in below. */
4268 emit_block_move (m_tramp, assemble_trampoline_template (),
4269 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4270 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4271 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4272 if (GET_MODE (fnaddr) != ptr_mode)
4273 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4274 emit_move_insn (mem, fnaddr);
4276 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4277 emit_move_insn (mem, chain_value);
4279 /* XXX We should really define a "clear_cache" pattern and use
4280 gen_clear_cache(). */
4281 a_tramp = XEXP (m_tramp, 0);
4282 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4283 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4284 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4285 ptr_mode);
4288 static unsigned char
4289 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4291 switch (regclass)
4293 case CALLER_SAVE_REGS:
4294 case POINTER_REGS:
4295 case GENERAL_REGS:
4296 case ALL_REGS:
4297 case FP_REGS:
4298 case FP_LO_REGS:
4299 return
4300 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4301 (GET_MODE_SIZE (mode) + 7) / 8;
4302 case STACK_REG:
4303 return 1;
4305 case NO_REGS:
4306 return 0;
4308 default:
4309 break;
4311 gcc_unreachable ();
4314 static reg_class_t
4315 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4317 if (regclass == POINTER_REGS)
4318 return GENERAL_REGS;
4320 if (regclass == STACK_REG)
4322 if (REG_P(x)
4323 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4324 return regclass;
4326 return NO_REGS;
4329 /* If it's an integer immediate that MOVI can't handle, then
4330 FP_REGS is not an option, so we return NO_REGS instead. */
4331 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4332 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4333 return NO_REGS;
4335 /* Register elimination can result in a request for
4336 SP+constant->FP_REGS. We cannot support such operations which
4337 use SP as source and an FP_REG as destination, so reject such
4338 cases outright. */
4339 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4341 rtx lhs = XEXP (x, 0);
4343 /* Look through a possible SUBREG introduced by ILP32. */
4344 if (GET_CODE (lhs) == SUBREG)
4345 lhs = SUBREG_REG (lhs);
4347 gcc_assert (REG_P (lhs));
4348 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4349 POINTER_REGS));
4350 return NO_REGS;
4353 return regclass;
4356 void
4357 aarch64_asm_output_labelref (FILE* f, const char *name)
4359 asm_fprintf (f, "%U%s", name);
4362 static void
4363 aarch64_elf_asm_constructor (rtx symbol, int priority)
4365 if (priority == DEFAULT_INIT_PRIORITY)
4366 default_ctor_section_asm_out_constructor (symbol, priority);
4367 else
4369 section *s;
4370 char buf[18];
4371 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4372 s = get_section (buf, SECTION_WRITE, NULL);
4373 switch_to_section (s);
4374 assemble_align (POINTER_SIZE);
4375 assemble_aligned_integer (POINTER_BYTES, symbol);
4379 static void
4380 aarch64_elf_asm_destructor (rtx symbol, int priority)
4382 if (priority == DEFAULT_INIT_PRIORITY)
4383 default_dtor_section_asm_out_destructor (symbol, priority);
4384 else
4386 section *s;
4387 char buf[18];
4388 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4389 s = get_section (buf, SECTION_WRITE, NULL);
4390 switch_to_section (s);
4391 assemble_align (POINTER_SIZE);
4392 assemble_aligned_integer (POINTER_BYTES, symbol);
4396 const char*
4397 aarch64_output_casesi (rtx *operands)
4399 char buf[100];
4400 char label[100];
4401 rtx diff_vec = PATTERN (NEXT_INSN (operands[2]));
4402 int index;
4403 static const char *const patterns[4][2] =
4406 "ldrb\t%w3, [%0,%w1,uxtw]",
4407 "add\t%3, %4, %w3, sxtb #2"
4410 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4411 "add\t%3, %4, %w3, sxth #2"
4414 "ldr\t%w3, [%0,%w1,uxtw #2]",
4415 "add\t%3, %4, %w3, sxtw #2"
4417 /* We assume that DImode is only generated when not optimizing and
4418 that we don't really need 64-bit address offsets. That would
4419 imply an object file with 8GB of code in a single function! */
4421 "ldr\t%w3, [%0,%w1,uxtw #2]",
4422 "add\t%3, %4, %w3, sxtw #2"
4426 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4428 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4430 gcc_assert (index >= 0 && index <= 3);
4432 /* Need to implement table size reduction by changing the code below. */
4433 output_asm_insn (patterns[index][0], operands);
4434 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4435 snprintf (buf, sizeof (buf),
4436 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4437 output_asm_insn (buf, operands);
4438 output_asm_insn (patterns[index][1], operands);
4439 output_asm_insn ("br\t%3", operands);
4440 assemble_label (asm_out_file, label);
4441 return "";
4445 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4446 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4447 operator. */
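/* For example, shift = 1 with mask 0x1fe describes an 8-bit field and
   yields 8 (UXTB), while shift = 1 with mask 0x1fffe yields 16 (UXTH);
   any combination that is not an 8/16/32-bit field shifted by 0..3
   yields 0.  */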
4450 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4452 if (shift >= 0 && shift <= 3)
4454 int size;
4455 for (size = 8; size <= 32; size *= 2)
4457 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4458 if (mask == bits << shift)
4459 return size;
4462 return 0;
4465 static bool
4466 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4467 const_rtx x ATTRIBUTE_UNUSED)
4469 /* We can't use blocks for constants when we're using a per-function
4470 constant pool. */
4471 return false;
4474 static section *
4475 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4476 rtx x ATTRIBUTE_UNUSED,
4477 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4479 /* Force all constant pool entries into the current function section. */
4480 return function_section (current_function_decl);
4484 /* Costs. */
4486 /* Helper function for rtx cost calculation. Strip a shift expression
4487 from X. Returns the inner operand if successful, or the original
4488 expression on failure. */
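/* For instance (in informal RTL), (ashift (reg:DI x0) (const_int 3)) and
   its canonical multiply form (mult (reg:DI x0) (const_int 8)) both strip
   to (reg:DI x0), whereas a shift by a register amount is returned
   unchanged.  */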
4489 static rtx
4490 aarch64_strip_shift (rtx x)
4492 rtx op = x;
4494 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4495 we can convert both to ROR during final output. */
4496 if ((GET_CODE (op) == ASHIFT
4497 || GET_CODE (op) == ASHIFTRT
4498 || GET_CODE (op) == LSHIFTRT
4499 || GET_CODE (op) == ROTATERT
4500 || GET_CODE (op) == ROTATE)
4501 && CONST_INT_P (XEXP (op, 1)))
4502 return XEXP (op, 0);
4504 if (GET_CODE (op) == MULT
4505 && CONST_INT_P (XEXP (op, 1))
4506 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4507 return XEXP (op, 0);
4509 return x;
4512 /* Helper function for rtx cost calculation. Strip an extend
4513 expression from X. Returns the inner operand if successful, or the
4514 original expression on failure. We deal with a number of possible
4515 canonicalization variations here. */
4516 static rtx
4517 aarch64_strip_extend (rtx x)
4519 rtx op = x;
4521 /* Zero and sign extraction of a widened value. */
4522 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4523 && XEXP (op, 2) == const0_rtx
4524 && GET_CODE (XEXP (op, 0)) == MULT
4525 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4526 XEXP (op, 1)))
4527 return XEXP (XEXP (op, 0), 0);
4529 /* It can also be represented (for zero-extend) as an AND with an
4530 immediate. */
4531 if (GET_CODE (op) == AND
4532 && GET_CODE (XEXP (op, 0)) == MULT
4533 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4534 && CONST_INT_P (XEXP (op, 1))
4535 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4536 INTVAL (XEXP (op, 1))) != 0)
4537 return XEXP (XEXP (op, 0), 0);
4539 /* Now handle extended register, as this may also have an optional
4540 left shift by 1..4. */
4541 if (GET_CODE (op) == ASHIFT
4542 && CONST_INT_P (XEXP (op, 1))
4543 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4544 op = XEXP (op, 0);
4546 if (GET_CODE (op) == ZERO_EXTEND
4547 || GET_CODE (op) == SIGN_EXTEND)
4548 op = XEXP (op, 0);
4550 if (op != x)
4551 return op;
4553 return x;
4556 /* Helper function for rtx cost calculation. Calculate the cost of
4557 a MULT, which may be part of a multiply-accumulate rtx. Return
4558 the calculated cost of the expression, recursing manually in to
4559 operands where needed. */
4561 static int
4562 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4564 rtx op0, op1;
4565 const struct cpu_cost_table *extra_cost
4566 = aarch64_tune_params->insn_extra_cost;
4567 int cost = 0;
4568 bool maybe_fma = (outer == PLUS || outer == MINUS);
4569 enum machine_mode mode = GET_MODE (x);
4571 gcc_checking_assert (code == MULT);
4573 op0 = XEXP (x, 0);
4574 op1 = XEXP (x, 1);
4576 if (VECTOR_MODE_P (mode))
4577 mode = GET_MODE_INNER (mode);
4579 /* Integer multiply/fma. */
4580 if (GET_MODE_CLASS (mode) == MODE_INT)
4582 /* The multiply will be canonicalized as a shift, cost it as such. */
4583 if (CONST_INT_P (op1)
4584 && exact_log2 (INTVAL (op1)) > 0)
4586 if (speed)
4588 if (maybe_fma)
4589 /* ADD (shifted register). */
4590 cost += extra_cost->alu.arith_shift;
4591 else
4592 /* LSL (immediate). */
4593 cost += extra_cost->alu.shift;
4596 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4598 return cost;
4601 /* Integer multiplies or FMAs have zero/sign extending variants. */
4602 if ((GET_CODE (op0) == ZERO_EXTEND
4603 && GET_CODE (op1) == ZERO_EXTEND)
4604 || (GET_CODE (op0) == SIGN_EXTEND
4605 && GET_CODE (op1) == SIGN_EXTEND))
4607 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4608 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4610 if (speed)
4612 if (maybe_fma)
4613 /* MADD/SMADDL/UMADDL. */
4614 cost += extra_cost->mult[0].extend_add;
4615 else
4616 /* MUL/SMULL/UMULL. */
4617 cost += extra_cost->mult[0].extend;
4620 return cost;
4623 /* This is either an integer multiply or an FMA. In both cases
4624 we want to recurse and cost the operands. */
4625 cost += rtx_cost (op0, MULT, 0, speed)
4626 + rtx_cost (op1, MULT, 1, speed);
4628 if (speed)
4630 if (maybe_fma)
4631 /* MADD. */
4632 cost += extra_cost->mult[mode == DImode].add;
4633 else
4634 /* MUL. */
4635 cost += extra_cost->mult[mode == DImode].simple;
4638 return cost;
4640 else
4642 if (speed)
4644 /* Floating-point FMA/FMUL can also support negations of the
4645 operands. */
4646 if (GET_CODE (op0) == NEG)
4647 op0 = XEXP (op0, 0);
4648 if (GET_CODE (op1) == NEG)
4649 op1 = XEXP (op1, 0);
4651 if (maybe_fma)
4652 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4653 cost += extra_cost->fp[mode == DFmode].fma;
4654 else
4655 /* FMUL/FNMUL. */
4656 cost += extra_cost->fp[mode == DFmode].mult;
4659 cost += rtx_cost (op0, MULT, 0, speed)
4660 + rtx_cost (op1, MULT, 1, speed);
4661 return cost;
4665 static int
4666 aarch64_address_cost (rtx x,
4667 enum machine_mode mode,
4668 addr_space_t as ATTRIBUTE_UNUSED,
4669 bool speed)
4671 enum rtx_code c = GET_CODE (x);
4672 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4673 struct aarch64_address_info info;
4674 int cost = 0;
4675 info.shift = 0;
4677 if (!aarch64_classify_address (&info, x, mode, c, false))
4679 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4681 /* This is a CONST or SYMBOL ref which will be split
4682 in a different way depending on the code model in use.
4683 Cost it through the generic infrastructure. */
4684 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4685 /* Divide through by the cost of one instruction to
4686 bring it to the same units as the address costs. */
4687 cost_symbol_ref /= COSTS_N_INSNS (1);
4688 /* The cost is then the cost of preparing the address,
4689 followed by an immediate (possibly 0) offset. */
4690 return cost_symbol_ref + addr_cost->imm_offset;
4692 else
4694 /* This is most likely a jump table from a case
4695 statement. */
4696 return addr_cost->register_offset;
4700 switch (info.type)
4702 case ADDRESS_LO_SUM:
4703 case ADDRESS_SYMBOLIC:
4704 case ADDRESS_REG_IMM:
4705 cost += addr_cost->imm_offset;
4706 break;
4708 case ADDRESS_REG_WB:
4709 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4710 cost += addr_cost->pre_modify;
4711 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4712 cost += addr_cost->post_modify;
4713 else
4714 gcc_unreachable ();
4716 break;
4718 case ADDRESS_REG_REG:
4719 cost += addr_cost->register_offset;
4720 break;
4722 case ADDRESS_REG_UXTW:
4723 case ADDRESS_REG_SXTW:
4724 cost += addr_cost->register_extend;
4725 break;
4727 default:
4728 gcc_unreachable ();
4732 if (info.shift > 0)
4734 /* For the sake of calculating the cost of the shifted register
4735 component, we can treat same sized modes in the same way. */
4736 switch (GET_MODE_BITSIZE (mode))
4738 case 16:
4739 cost += addr_cost->addr_scale_costs.hi;
4740 break;
4742 case 32:
4743 cost += addr_cost->addr_scale_costs.si;
4744 break;
4746 case 64:
4747 cost += addr_cost->addr_scale_costs.di;
4748 break;
4750 /* We can't tell, or this is a 128-bit vector. */
4751 default:
4752 cost += addr_cost->addr_scale_costs.ti;
4753 break;
4757 return cost;
4760 /* Return true if the RTX X in mode MODE is a zero or sign extract
4761 usable in an ADD or SUB (extended register) instruction. */
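/* Informally, this is the shape produced by the add_<optab><mode>_multp2
   patterns, which map onto extended-register arithmetic such as

	add	x0, x1, w2, sxtw #2

   (register numbers illustrative); the (mult ... (const_int 4)) inside
   the extract encodes the left shift applied to the extended value.  */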
4762 static bool
4763 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4765 /* Catch add with a sign extract.
4766 This is add_<optab><mode>_multp2. */
4767 if (GET_CODE (x) == SIGN_EXTRACT
4768 || GET_CODE (x) == ZERO_EXTRACT)
4770 rtx op0 = XEXP (x, 0);
4771 rtx op1 = XEXP (x, 1);
4772 rtx op2 = XEXP (x, 2);
4774 if (GET_CODE (op0) == MULT
4775 && CONST_INT_P (op1)
4776 && op2 == const0_rtx
4777 && CONST_INT_P (XEXP (op0, 1))
4778 && aarch64_is_extend_from_extract (mode,
4779 XEXP (op0, 1),
4780 op1))
4782 return true;
4786 return false;
4789 static bool
4790 aarch64_frint_unspec_p (unsigned int u)
4792 switch (u)
4794 case UNSPEC_FRINTZ:
4795 case UNSPEC_FRINTP:
4796 case UNSPEC_FRINTM:
4797 case UNSPEC_FRINTA:
4798 case UNSPEC_FRINTN:
4799 case UNSPEC_FRINTX:
4800 case UNSPEC_FRINTI:
4801 return true;
4803 default:
4804 return false;
4808 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4809 storing it in *COST. Result is true if the total cost of the operation
4810 has now been calculated. */
4811 static bool
4812 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4814 rtx inner;
4815 rtx comparator;
4816 enum rtx_code cmpcode;
4818 if (COMPARISON_P (op0))
4820 inner = XEXP (op0, 0);
4821 comparator = XEXP (op0, 1);
4822 cmpcode = GET_CODE (op0);
4824 else
4826 inner = op0;
4827 comparator = const0_rtx;
4828 cmpcode = NE;
4831 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
4833 /* Conditional branch. */
4834 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4835 return true;
4836 else
4838 if (cmpcode == NE || cmpcode == EQ)
4840 if (comparator == const0_rtx)
4842 /* TBZ/TBNZ/CBZ/CBNZ. */
4843 if (GET_CODE (inner) == ZERO_EXTRACT)
4844 /* TBZ/TBNZ. */
4845 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
4846 0, speed);
4847 else
4848 /* CBZ/CBNZ. */
4849 *cost += rtx_cost (inner, cmpcode, 0, speed);
4851 return true;
4854 else if (cmpcode == LT || cmpcode == GE)
4856 /* TBZ/TBNZ. */
4857 if (comparator == const0_rtx)
4858 return true;
4862 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4864 /* It's a conditional operation based on the status flags,
4865 so it must be some flavor of CSEL. */
4867 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
4868 if (GET_CODE (op1) == NEG
4869 || GET_CODE (op1) == NOT
4870 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
4871 op1 = XEXP (op1, 0);
4873 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
4874 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
4875 return true;
4878 /* We don't know what this is, cost all operands. */
4879 return false;
4882 /* Calculate the cost of calculating X, storing it in *COST. Result
4883 is true if the total cost of the operation has now been calculated. */
4884 static bool
4885 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
4886 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
4888 rtx op0, op1, op2;
4889 const struct cpu_cost_table *extra_cost
4890 = aarch64_tune_params->insn_extra_cost;
4891 enum machine_mode mode = GET_MODE (x);
4893 /* By default, assume that everything has equivalent cost to the
4894 cheapest instruction. Any additional costs are applied as a delta
4895 above this default. */
4896 *cost = COSTS_N_INSNS (1);
4898 /* TODO: The cost infrastructure currently does not handle
4899 vector operations. Assume that all vector operations
4900 are equally expensive. */
4901 if (VECTOR_MODE_P (mode))
4903 if (speed)
4904 *cost += extra_cost->vect.alu;
4905 return true;
4908 switch (code)
4910 case SET:
4911 /* The cost depends entirely on the operands to SET. */
4912 *cost = 0;
4913 op0 = SET_DEST (x);
4914 op1 = SET_SRC (x);
4916 switch (GET_CODE (op0))
4918 case MEM:
4919 if (speed)
4921 rtx address = XEXP (op0, 0);
4922 if (GET_MODE_CLASS (mode) == MODE_INT)
4923 *cost += extra_cost->ldst.store;
4924 else if (mode == SFmode)
4925 *cost += extra_cost->ldst.storef;
4926 else if (mode == DFmode)
4927 *cost += extra_cost->ldst.stored;
4929 *cost +=
4930 COSTS_N_INSNS (aarch64_address_cost (address, mode,
4931 0, speed));
4934 *cost += rtx_cost (op1, SET, 1, speed);
4935 return true;
4937 case SUBREG:
4938 if (! REG_P (SUBREG_REG (op0)))
4939 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
4941 /* Fall through. */
4942 case REG:
4943 /* const0_rtx is in general free, but we will use an
4944 instruction to set a register to 0. */
4945 if (REG_P (op1) || op1 == const0_rtx)
4947 /* The cost is 1 per register copied. */
4948 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
4949 / UNITS_PER_WORD;
4950 *cost = COSTS_N_INSNS (n_minus_1 + 1);
4952 else
4953 /* Cost is just the cost of the RHS of the set. */
4954 *cost += rtx_cost (op1, SET, 1, speed);
4955 return true;
4957 case ZERO_EXTRACT:
4958 case SIGN_EXTRACT:
4959 /* Bit-field insertion. Strip any redundant widening of
4960 the RHS to meet the width of the target. */
4961 if (GET_CODE (op1) == SUBREG)
4962 op1 = SUBREG_REG (op1);
4963 if ((GET_CODE (op1) == ZERO_EXTEND
4964 || GET_CODE (op1) == SIGN_EXTEND)
4965 && GET_CODE (XEXP (op0, 1)) == CONST_INT
4966 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
4967 >= INTVAL (XEXP (op0, 1))))
4968 op1 = XEXP (op1, 0);
4970 if (CONST_INT_P (op1))
4972 /* MOV immediate is assumed to always be cheap. */
4973 *cost = COSTS_N_INSNS (1);
4975 else
4977 /* BFM. */
4978 if (speed)
4979 *cost += extra_cost->alu.bfi;
4980 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
4983 return true;
4985 default:
4986 /* We can't make sense of this, assume default cost. */
4987 *cost = COSTS_N_INSNS (1);
4988 return false;
4990 return false;
4992 case CONST_INT:
4993 /* If an instruction can incorporate a constant within the
4994 instruction, the instruction's expression avoids calling
4995 rtx_cost() on the constant. If rtx_cost() is called on a
4996 constant, then it is usually because the constant must be
4997 moved into a register by one or more instructions.
4999 The exception is constant 0, which can be expressed
5000 as XZR/WZR and is therefore free. The exception to this is
5001 if we have (set (reg) (const0_rtx)) in which case we must cost
5002 the move. However, we can catch that when we cost the SET, so
5003 we don't need to consider that here. */
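      /* As an illustration, the 16 in (plus (reg) (const_int 16)) is
	 normally absorbed by the ADD (immediate) pattern and never costed
	 here, whereas a value such as 0x123456789 has to be synthesised by
	 a MOVZ/MOVK sequence and is costed in proportion to the
	 instruction count reported by aarch64_build_constant below.  */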
5004 if (x == const0_rtx)
5005 *cost = 0;
5006 else
5008 /* To an approximation, building any other constant is
5009 proportionally expensive to the number of instructions
5010 required to build that constant. This is true whether we
5011 are compiling for SPEED or otherwise. */
5012 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5013 INTVAL (x),
5014 false));
5016 return true;
5018 case CONST_DOUBLE:
5019 if (speed)
5021 /* mov[df,sf]_aarch64. */
5022 if (aarch64_float_const_representable_p (x))
5023 /* FMOV (scalar immediate). */
5024 *cost += extra_cost->fp[mode == DFmode].fpconst;
5025 else if (!aarch64_float_const_zero_rtx_p (x))
5027 /* This will be a load from memory. */
5028 if (mode == DFmode)
5029 *cost += extra_cost->ldst.loadd;
5030 else
5031 *cost += extra_cost->ldst.loadf;
5033 else
5034 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5035 or MOV v0.s[0], wzr - neither of which is modeled by the
5036 cost tables. Just use the default cost. */
5041 return true;
5043 case MEM:
5044 if (speed)
5046 /* For loads we want the base cost of a load, plus an
5047 approximation for the additional cost of the addressing
5048 mode. */
5049 rtx address = XEXP (x, 0);
5050 if (GET_MODE_CLASS (mode) == MODE_INT)
5051 *cost += extra_cost->ldst.load;
5052 else if (mode == SFmode)
5053 *cost += extra_cost->ldst.loadf;
5054 else if (mode == DFmode)
5055 *cost += extra_cost->ldst.loadd;
5057 *cost +=
5058 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5059 0, speed));
5062 return true;
5064 case NEG:
5065 op0 = XEXP (x, 0);
5067 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5069 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5070 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5072 /* CSETM. */
5073 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5074 return true;
5077 /* Cost this as SUB wzr, X. */
5078 op0 = CONST0_RTX (GET_MODE (x));
5079 op1 = XEXP (x, 0);
5080 goto cost_minus;
5083 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5085 /* Support (neg(fma...)) as a single instruction only if
5086 sign of zeros is unimportant. This matches the decision
5087 making in aarch64.md. */
5088 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5090 /* FNMADD. */
5091 *cost = rtx_cost (op0, NEG, 0, speed);
5092 return true;
5094 if (speed)
5095 /* FNEG. */
5096 *cost += extra_cost->fp[mode == DFmode].neg;
5097 return false;
5100 return false;
5102 case CLRSB:
5103 case CLZ:
5104 if (speed)
5105 *cost += extra_cost->alu.clz;
5107 return false;
5109 case COMPARE:
5110 op0 = XEXP (x, 0);
5111 op1 = XEXP (x, 1);
5113 if (op1 == const0_rtx
5114 && GET_CODE (op0) == AND)
5116 x = op0;
5117 goto cost_logic;
5120 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5122 /* TODO: A write to the CC flags possibly costs extra, this
5123 needs encoding in the cost tables. */
5125 /* CC_ZESWPmode supports zero extend for free. */
5126 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5127 op0 = XEXP (op0, 0);
5129 /* ANDS. */
5130 if (GET_CODE (op0) == AND)
5132 x = op0;
5133 goto cost_logic;
5136 if (GET_CODE (op0) == PLUS)
5138 /* ADDS (and CMN alias). */
5139 x = op0;
5140 goto cost_plus;
5143 if (GET_CODE (op0) == MINUS)
5145 /* SUBS. */
5146 x = op0;
5147 goto cost_minus;
5150 if (GET_CODE (op1) == NEG)
5152 /* CMN. */
5153 if (speed)
5154 *cost += extra_cost->alu.arith;
5156 *cost += rtx_cost (op0, COMPARE, 0, speed);
5157 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5158 return true;
5161 /* CMP.
5163 Compare can freely swap the order of operands, and
5164 canonicalization puts the more complex operation first.
5165 But the integer MINUS logic expects the shift/extend
5166 operation in op1. */
5167 if (! (REG_P (op0)
5168 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5170 op0 = XEXP (x, 1);
5171 op1 = XEXP (x, 0);
5173 goto cost_minus;
5176 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5178 /* FCMP. */
5179 if (speed)
5180 *cost += extra_cost->fp[mode == DFmode].compare;
5182 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5184 /* FCMP supports constant 0.0 for no extra cost. */
5185 return true;
5187 return false;
5190 return false;
5192 case MINUS:
5194 op0 = XEXP (x, 0);
5195 op1 = XEXP (x, 1);
5197 cost_minus:
5198 /* Detect valid immediates. */
5199 if ((GET_MODE_CLASS (mode) == MODE_INT
5200 || (GET_MODE_CLASS (mode) == MODE_CC
5201 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5202 && CONST_INT_P (op1)
5203 && aarch64_uimm12_shift (INTVAL (op1)))
5205 *cost += rtx_cost (op0, MINUS, 0, speed);
5207 if (speed)
5208 /* SUB(S) (immediate). */
5209 *cost += extra_cost->alu.arith;
5210 return true;
5214 /* Look for SUB (extended register). */
5215 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5217 if (speed)
5218 *cost += extra_cost->alu.arith_shift;
5220 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5221 (enum rtx_code) GET_CODE (op1),
5222 0, speed);
5223 return true;
5226 rtx new_op1 = aarch64_strip_extend (op1);
5228 /* Cost this as an FMA-alike operation. */
5229 if ((GET_CODE (new_op1) == MULT
5230 || GET_CODE (new_op1) == ASHIFT)
5231 && code != COMPARE)
5233 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5234 (enum rtx_code) code,
5235 speed);
5236 *cost += rtx_cost (op0, MINUS, 0, speed);
5237 return true;
5240 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5242 if (speed)
5244 if (GET_MODE_CLASS (mode) == MODE_INT)
5245 /* SUB(S). */
5246 *cost += extra_cost->alu.arith;
5247 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5248 /* FSUB. */
5249 *cost += extra_cost->fp[mode == DFmode].addsub;
5251 return true;
5254 case PLUS:
5256 rtx new_op0;
5258 op0 = XEXP (x, 0);
5259 op1 = XEXP (x, 1);
5261 cost_plus:
5262 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5263 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5265 /* CSINC. */
5266 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5267 *cost += rtx_cost (op1, PLUS, 1, speed);
5268 return true;
5271 if (GET_MODE_CLASS (mode) == MODE_INT
5272 && CONST_INT_P (op1)
5273 && aarch64_uimm12_shift (INTVAL (op1)))
5275 *cost += rtx_cost (op0, PLUS, 0, speed);
5277 if (speed)
5278 /* ADD (immediate). */
5279 *cost += extra_cost->alu.arith;
5280 return true;
5283 /* Look for ADD (extended register). */
5284 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5286 if (speed)
5287 *cost += extra_cost->alu.arith_shift;
5289 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5290 (enum rtx_code) GET_CODE (op0),
5291 0, speed);
5292 return true;
5295 /* Strip any extend, leave shifts behind as we will
5296 cost them through mult_cost. */
5297 new_op0 = aarch64_strip_extend (op0);
5299 if (GET_CODE (new_op0) == MULT
5300 || GET_CODE (new_op0) == ASHIFT)
5302 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5303 speed);
5304 *cost += rtx_cost (op1, PLUS, 1, speed);
5305 return true;
5308 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5309 + rtx_cost (op1, PLUS, 1, speed));
5311 if (speed)
5313 if (GET_MODE_CLASS (mode) == MODE_INT)
5314 /* ADD. */
5315 *cost += extra_cost->alu.arith;
5316 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5317 /* FADD. */
5318 *cost += extra_cost->fp[mode == DFmode].addsub;
5320 return true;
5323 case BSWAP:
5324 *cost = COSTS_N_INSNS (1);
5326 if (speed)
5327 *cost += extra_cost->alu.rev;
5329 return false;
5331 case IOR:
5332 if (aarch_rev16_p (x))
5334 *cost = COSTS_N_INSNS (1);
5336 if (speed)
5337 *cost += extra_cost->alu.rev;
5339 return true;
5341 /* Fall through. */
5342 case XOR:
5343 case AND:
5344 cost_logic:
5345 op0 = XEXP (x, 0);
5346 op1 = XEXP (x, 1);
5348 if (code == AND
5349 && GET_CODE (op0) == MULT
5350 && CONST_INT_P (XEXP (op0, 1))
5351 && CONST_INT_P (op1)
5352 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5353 INTVAL (op1)) != 0)
5355 /* This is a UBFM/SBFM. */
5356 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5357 if (speed)
5358 *cost += extra_cost->alu.bfx;
5359 return true;
5362 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5364 /* We possibly get the immediate for free, this is not
5365 modelled. */
5366 if (CONST_INT_P (op1)
5367 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5369 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5371 if (speed)
5372 *cost += extra_cost->alu.logical;
5374 return true;
5376 else
5378 rtx new_op0 = op0;
5380 /* Handle ORN, EON, or BIC. */
5381 if (GET_CODE (op0) == NOT)
5382 op0 = XEXP (op0, 0);
5384 new_op0 = aarch64_strip_shift (op0);
5386 /* If we had a shift on op0 then this is a logical-shift-
5387 by-register/immediate operation. Otherwise, this is just
5388 a logical operation. */
5389 if (speed)
5391 if (new_op0 != op0)
5393 /* Shift by immediate. */
5394 if (CONST_INT_P (XEXP (op0, 1)))
5395 *cost += extra_cost->alu.log_shift;
5396 else
5397 *cost += extra_cost->alu.log_shift_reg;
5399 else
5400 *cost += extra_cost->alu.logical;
5403 /* In both cases we want to cost both operands. */
5404 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5405 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5407 return true;
5410 return false;
5412 case NOT:
5413 /* MVN. */
5414 if (speed)
5415 *cost += extra_cost->alu.logical;
5417 /* The logical instruction could have the shifted register form,
5418 but the cost is the same if the shift is processed as a separate
5419 instruction, so we don't bother with it here. */
5420 return false;
5422 case ZERO_EXTEND:
5424 op0 = XEXP (x, 0);
5425 /* If a value is written in SI mode, then zero extended to DI
5426 mode, the operation will in general be free as a write to
5427 a 'w' register implicitly zeroes the upper bits of an 'x'
5428 register. However, if this is
5430 (set (reg) (zero_extend (reg)))
5432 we must cost the explicit register move. */
5433 if (mode == DImode
5434 && GET_MODE (op0) == SImode
5435 && outer == SET)
5437 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5439 if (!op_cost && speed)
5440 /* MOV. */
5441 *cost += extra_cost->alu.extend;
5442 else
5443 /* Free, the cost is that of the SI mode operation. */
5444 *cost = op_cost;
5446 return true;
5448 else if (MEM_P (XEXP (x, 0)))
5450 /* All loads can zero extend to any size for free. */
5451 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5452 return true;
5455 /* UXTB/UXTH. */
5456 if (speed)
5457 *cost += extra_cost->alu.extend;
5459 return false;
5461 case SIGN_EXTEND:
5462 if (MEM_P (XEXP (x, 0)))
5464 /* LDRSH. */
5465 if (speed)
5467 rtx address = XEXP (XEXP (x, 0), 0);
5468 *cost += extra_cost->ldst.load_sign_extend;
5470 *cost +=
5471 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5472 0, speed));
5474 return true;
5477 if (speed)
5478 *cost += extra_cost->alu.extend;
5479 return false;
5481 case ASHIFT:
5482 op0 = XEXP (x, 0);
5483 op1 = XEXP (x, 1);
5485 if (CONST_INT_P (op1))
5487 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5488 aliases. */
5489 if (speed)
5490 *cost += extra_cost->alu.shift;
5492 /* We can incorporate zero/sign extend for free. */
5493 if (GET_CODE (op0) == ZERO_EXTEND
5494 || GET_CODE (op0) == SIGN_EXTEND)
5495 op0 = XEXP (op0, 0);
5497 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5498 return true;
5500 else
5502 /* LSLV. */
5503 if (speed)
5504 *cost += extra_cost->alu.shift_reg;
5506 return false; /* All arguments need to be in registers. */
5509 case ROTATE:
5510 case ROTATERT:
5511 case LSHIFTRT:
5512 case ASHIFTRT:
5513 op0 = XEXP (x, 0);
5514 op1 = XEXP (x, 1);
5516 if (CONST_INT_P (op1))
5518 /* ASR (immediate) and friends. */
5519 if (speed)
5520 *cost += extra_cost->alu.shift;
5522 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5523 return true;
5525 else
5528 /* ASR (register) and friends. */
5529 if (speed)
5530 *cost += extra_cost->alu.shift_reg;
5532 return false; /* All arguments need to be in registers. */
5535 case SYMBOL_REF:
5537 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5539 /* LDR. */
5540 if (speed)
5541 *cost += extra_cost->ldst.load;
5543 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5544 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5546 /* ADRP, followed by ADD. */
5547 *cost += COSTS_N_INSNS (1);
5548 if (speed)
5549 *cost += 2 * extra_cost->alu.arith;
5551 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5552 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5554 /* ADR. */
5555 if (speed)
5556 *cost += extra_cost->alu.arith;
5559 if (flag_pic)
5561 /* One extra load instruction, after accessing the GOT. */
5562 *cost += COSTS_N_INSNS (1);
5563 if (speed)
5564 *cost += extra_cost->ldst.load;
5566 return true;
5568 case HIGH:
5569 case LO_SUM:
5570 /* ADRP/ADD (immediate). */
5571 if (speed)
5572 *cost += extra_cost->alu.arith;
5573 return true;
5575 case ZERO_EXTRACT:
5576 case SIGN_EXTRACT:
5577 /* UBFX/SBFX. */
5578 if (speed)
5579 *cost += extra_cost->alu.bfx;
5581 /* We can trust that the immediates used will be correct (there
5582 are no by-register forms), so we need only cost op0. */
5583 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5584 return true;
5586 case MULT:
5587 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5588 /* aarch64_rtx_mult_cost always handles recursion to its
5589 operands. */
5590 return true;
5592 case MOD:
5593 case UMOD:
5594 if (speed)
5596 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5597 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5598 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5599 else if (GET_MODE (x) == DFmode)
5600 *cost += (extra_cost->fp[1].mult
5601 + extra_cost->fp[1].div);
5602 else if (GET_MODE (x) == SFmode)
5603 *cost += (extra_cost->fp[0].mult
5604 + extra_cost->fp[0].div);
5606 return false; /* All arguments need to be in registers. */
5608 case DIV:
5609 case UDIV:
5610 case SQRT:
5611 if (speed)
5613 if (GET_MODE_CLASS (mode) == MODE_INT)
5614 /* There is no integer SQRT, so only DIV and UDIV can get
5615 here. */
5616 *cost += extra_cost->mult[mode == DImode].idiv;
5617 else
5618 *cost += extra_cost->fp[mode == DFmode].div;
5620 return false; /* All arguments need to be in registers. */
5622 case IF_THEN_ELSE:
5623 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5624 XEXP (x, 2), cost, speed);
5626 case EQ:
5627 case NE:
5628 case GT:
5629 case GTU:
5630 case LT:
5631 case LTU:
5632 case GE:
5633 case GEU:
5634 case LE:
5635 case LEU:
5637 return false; /* All arguments must be in registers. */
5639 case FMA:
5640 op0 = XEXP (x, 0);
5641 op1 = XEXP (x, 1);
5642 op2 = XEXP (x, 2);
5644 if (speed)
5645 *cost += extra_cost->fp[mode == DFmode].fma;
5647 /* FMSUB, FNMADD, and FNMSUB are free. */
5648 if (GET_CODE (op0) == NEG)
5649 op0 = XEXP (op0, 0);
5651 if (GET_CODE (op2) == NEG)
5652 op2 = XEXP (op2, 0);
5654 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5655 and the by-element operand as operand 0. */
5656 if (GET_CODE (op1) == NEG)
5657 op1 = XEXP (op1, 0);
5659 /* Catch vector-by-element operations. The by-element operand can
5660 either be (vec_duplicate (vec_select (x))) or just
5661 (vec_select (x)), depending on whether we are multiplying by
5662 a vector or a scalar.
5664 Canonicalization is not very good in these cases, FMA4 will put the
5665 by-element operand as operand 0, FNMA4 will have it as operand 1. */
5666 if (GET_CODE (op0) == VEC_DUPLICATE)
5667 op0 = XEXP (op0, 0);
5668 else if (GET_CODE (op1) == VEC_DUPLICATE)
5669 op1 = XEXP (op1, 0);
5671 if (GET_CODE (op0) == VEC_SELECT)
5672 op0 = XEXP (op0, 0);
5673 else if (GET_CODE (op1) == VEC_SELECT)
5674 op1 = XEXP (op1, 0);
5676 /* If the remaining parameters are not registers,
5677 get the cost to put them into registers. */
5678 *cost += rtx_cost (op0, FMA, 0, speed);
5679 *cost += rtx_cost (op1, FMA, 1, speed);
5680 *cost += rtx_cost (op2, FMA, 2, speed);
5681 return true;
5683 case FLOAT_EXTEND:
5684 if (speed)
5685 *cost += extra_cost->fp[mode == DFmode].widen;
5686 return false;
5688 case FLOAT_TRUNCATE:
5689 if (speed)
5690 *cost += extra_cost->fp[mode == DFmode].narrow;
5691 return false;
5693 case FIX:
5694 case UNSIGNED_FIX:
5695 x = XEXP (x, 0);
5696 /* Strip the rounding part. They will all be implemented
5697 by the fcvt* family of instructions anyway. */
5698 if (GET_CODE (x) == UNSPEC)
5700 unsigned int uns_code = XINT (x, 1);
5702 if (uns_code == UNSPEC_FRINTA
5703 || uns_code == UNSPEC_FRINTM
5704 || uns_code == UNSPEC_FRINTN
5705 || uns_code == UNSPEC_FRINTP
5706 || uns_code == UNSPEC_FRINTZ)
5707 x = XVECEXP (x, 0, 0);
5710 if (speed)
5711 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5713 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5714 return true;
5716 case ABS:
5717 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5719 /* FABS and FNEG are analogous. */
5720 if (speed)
5721 *cost += extra_cost->fp[mode == DFmode].neg;
5723 else
5725 /* Integer ABS will either be split to
5726 two arithmetic instructions, or will be an ABS
5727 (scalar), which we don't model. */
5728 *cost = COSTS_N_INSNS (2);
5729 if (speed)
5730 *cost += 2 * extra_cost->alu.arith;
5732 return false;
5734 case SMAX:
5735 case SMIN:
5736 if (speed)
5738 /* FMAXNM/FMINNM/FMAX/FMIN.
5739 TODO: This may not be accurate for all implementations, but
5740 we do not model this in the cost tables. */
5741 *cost += extra_cost->fp[mode == DFmode].addsub;
5743 return false;
5745 case UNSPEC:
5746 /* The floating point round to integer frint* instructions. */
5747 if (aarch64_frint_unspec_p (XINT (x, 1)))
5749 if (speed)
5750 *cost += extra_cost->fp[mode == DFmode].roundint;
5752 return false;
5755 if (XINT (x, 1) == UNSPEC_RBIT)
5757 if (speed)
5758 *cost += extra_cost->alu.rev;
5760 return false;
5762 break;
5764 case TRUNCATE:
5766 /* Decompose <su>muldi3_highpart. */
5767 if (/* (truncate:DI */
5768 mode == DImode
5769 /* (lshiftrt:TI */
5770 && GET_MODE (XEXP (x, 0)) == TImode
5771 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5772 /* (mult:TI */
5773 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5774 /* (ANY_EXTEND:TI (reg:DI))
5775 (ANY_EXTEND:TI (reg:DI))) */
5776 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5777 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5778 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5779 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5780 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5781 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5782 /* (const_int 64) */
5783 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5784 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5786 /* UMULH/SMULH. */
5787 if (speed)
5788 *cost += extra_cost->mult[mode == DImode].extend;
5789 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5790 MULT, 0, speed);
5791 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5792 MULT, 1, speed);
5793 return true;
5796 /* Fall through. */
5797 default:
5798 break;
5801 if (dump_file && (dump_flags & TDF_DETAILS))
5802 fprintf (dump_file,
5803 "\nFailed to cost RTX. Assuming default cost.\n");
5805 return true;
5808 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5809 calculated for X. This cost is stored in *COST. Returns true
5810 if the total cost of X was calculated. */
5811 static bool
5812 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5813 int param, int *cost, bool speed)
5815 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5817 if (dump_file && (dump_flags & TDF_DETAILS))
5819 print_rtl_single (dump_file, x);
5820 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5821 speed ? "Hot" : "Cold",
5822 *cost, result ? "final" : "partial");
5825 return result;
5828 static int
5829 aarch64_register_move_cost (enum machine_mode mode,
5830 reg_class_t from_i, reg_class_t to_i)
5832 enum reg_class from = (enum reg_class) from_i;
5833 enum reg_class to = (enum reg_class) to_i;
5834 const struct cpu_regmove_cost *regmove_cost
5835 = aarch64_tune_params->regmove_cost;
5837 /* Moving between GPR and the stack register costs the same as GP2GP. */
5838 if ((from == GENERAL_REGS && to == STACK_REG)
5839 || (to == GENERAL_REGS && from == STACK_REG))
5840 return regmove_cost->GP2GP;
5842 /* To/From the stack register, we move via the gprs. */
5843 if (to == STACK_REG || from == STACK_REG)
5844 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5845 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5847 if (from == GENERAL_REGS && to == GENERAL_REGS)
5848 return regmove_cost->GP2GP;
5849 else if (from == GENERAL_REGS)
5850 return regmove_cost->GP2FP;
5851 else if (to == GENERAL_REGS)
5852 return regmove_cost->FP2GP;
5854 /* When AdvSIMD instructions are disabled it is not possible to move
5855 a 128-bit value directly between Q registers. This is handled in
5856 secondary reload. A general register is used as a scratch to move
5857 the upper DI value and the lower DI value is moved directly,
5858 hence the cost is the sum of three moves. */
5859 if (! TARGET_SIMD && GET_MODE_SIZE (mode) == 16)
5860 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
5862 return regmove_cost->FP2FP;
5865 static int
5866 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
5867 reg_class_t rclass ATTRIBUTE_UNUSED,
5868 bool in ATTRIBUTE_UNUSED)
5870 return aarch64_tune_params->memmov_cost;
5873 /* Return the number of instructions that can be issued per cycle. */
5874 static int
5875 aarch64_sched_issue_rate (void)
5877 return aarch64_tune_params->issue_rate;
5880 /* Vectorizer cost model target hooks. */
5882 /* Implement targetm.vectorize.builtin_vectorization_cost. */
5883 static int
5884 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
5885 tree vectype,
5886 int misalign ATTRIBUTE_UNUSED)
5888 unsigned elements;
5890 switch (type_of_cost)
5892 case scalar_stmt:
5893 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
5895 case scalar_load:
5896 return aarch64_tune_params->vec_costs->scalar_load_cost;
5898 case scalar_store:
5899 return aarch64_tune_params->vec_costs->scalar_store_cost;
5901 case vector_stmt:
5902 return aarch64_tune_params->vec_costs->vec_stmt_cost;
5904 case vector_load:
5905 return aarch64_tune_params->vec_costs->vec_align_load_cost;
5907 case vector_store:
5908 return aarch64_tune_params->vec_costs->vec_store_cost;
5910 case vec_to_scalar:
5911 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
5913 case scalar_to_vec:
5914 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
5916 case unaligned_load:
5917 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
5919 case unaligned_store:
5920 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
5922 case cond_branch_taken:
5923 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
5925 case cond_branch_not_taken:
5926 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
5928 case vec_perm:
5929 case vec_promote_demote:
5930 return aarch64_tune_params->vec_costs->vec_stmt_cost;
5932 case vec_construct:
5933 elements = TYPE_VECTOR_SUBPARTS (vectype);
5934 return elements / 2 + 1;
5936 default:
5937 gcc_unreachable ();
5941 /* Implement targetm.vectorize.add_stmt_cost. */
5942 static unsigned
5943 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
5944 struct _stmt_vec_info *stmt_info, int misalign,
5945 enum vect_cost_model_location where)
5947 unsigned *cost = (unsigned *) data;
5948 unsigned retval = 0;
5950 if (flag_vect_cost_model)
5952 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
5953 int stmt_cost =
5954 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
5956 /* Statements in an inner loop relative to the loop being
5957 vectorized are weighted more heavily. The value here is
5958 a function (linear for now) of the loop nest level. */
5959 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
5961 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
5962 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
5963 unsigned nest_level = loop_depth (loop);
5965 count *= nest_level;
5968 retval = (unsigned) (count * stmt_cost);
5969 cost[where] += retval;
5972 return retval;
5975 static void initialize_aarch64_code_model (void);
5977 /* Parse the architecture extension string. */
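/* For example (option spelling assumed from the usual -mcpu/-march syntax
   rather than taken from this file), for -mcpu=cortex-a57+crc+nocrypto the
   string is the "+crc+nocrypto" tail: "+crc" switches the corresponding
   ISA flags on and "+nocrypto" switches the crypto flags off, scanned
   left to right.  */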
5979 static void
5980 aarch64_parse_extension (char *str)
5982 /* The extension string is parsed left to right. */
5983 const struct aarch64_option_extension *opt = NULL;
5985 /* Flag to say whether we are adding or removing an extension. */
5986 int adding_ext = -1;
5988 while (str != NULL && *str != 0)
5990 char *ext;
5991 size_t len;
5993 str++;
5994 ext = strchr (str, '+');
5996 if (ext != NULL)
5997 len = ext - str;
5998 else
5999 len = strlen (str);
6001 if (len >= 2 && strncmp (str, "no", 2) == 0)
6003 adding_ext = 0;
6004 len -= 2;
6005 str += 2;
6007 else if (len > 0)
6008 adding_ext = 1;
6010 if (len == 0)
6012 error ("missing feature modifier after %qs", "+no");
6013 return;
6016 /* Scan over the extensions table trying to find an exact match. */
6017 for (opt = all_extensions; opt->name != NULL; opt++)
6019 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6021 /* Add or remove the extension. */
6022 if (adding_ext)
6023 aarch64_isa_flags |= opt->flags_on;
6024 else
6025 aarch64_isa_flags &= ~(opt->flags_off);
6026 break;
6030 if (opt->name == NULL)
6032 /* Extension not found in list. */
6033 error ("unknown feature modifier %qs", str);
6034 return;
6037 str = ext;
6040 return;
6043 /* Parse the ARCH string. */
6045 static void
6046 aarch64_parse_arch (void)
6048 char *ext;
6049 const struct processor *arch;
6050 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6051 size_t len;
6053 strcpy (str, aarch64_arch_string);
6055 ext = strchr (str, '+');
6057 if (ext != NULL)
6058 len = ext - str;
6059 else
6060 len = strlen (str);
6062 if (len == 0)
6064 error ("missing arch name in -march=%qs", str);
6065 return;
6068 /* Loop through the list of supported ARCHs to find a match. */
6069 for (arch = all_architectures; arch->name != NULL; arch++)
6071 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6073 selected_arch = arch;
6074 aarch64_isa_flags = selected_arch->flags;
6076 if (!selected_cpu)
6077 selected_cpu = &all_cores[selected_arch->core];
6079 if (ext != NULL)
6081 /* ARCH string contains at least one extension. */
6082 aarch64_parse_extension (ext);
6085 if (strcmp (selected_arch->arch, selected_cpu->arch))
6087 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6088 selected_cpu->name, selected_arch->name);
6091 return;
6095 /* ARCH name not found in list. */
6096 error ("unknown value %qs for -march", str);
6097 return;
6100 /* Parse the CPU string. */
6102 static void
6103 aarch64_parse_cpu (void)
6105 char *ext;
6106 const struct processor *cpu;
6107 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6108 size_t len;
6110 strcpy (str, aarch64_cpu_string);
6112 ext = strchr (str, '+');
6114 if (ext != NULL)
6115 len = ext - str;
6116 else
6117 len = strlen (str);
6119 if (len == 0)
6121 error ("missing cpu name in -mcpu=%qs", str);
6122 return;
6125 /* Loop through the list of supported CPUs to find a match. */
6126 for (cpu = all_cores; cpu->name != NULL; cpu++)
6128 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6130 selected_cpu = cpu;
6131 selected_tune = cpu;
6132 aarch64_isa_flags = selected_cpu->flags;
6134 if (ext != NULL)
6136 /* CPU string contains at least one extension. */
6137 aarch64_parse_extension (ext);
6140 return;
6144 /* CPU name not found in list. */
6145 error ("unknown value %qs for -mcpu", str);
6146 return;
6149 /* Parse the TUNE string. */
6151 static void
6152 aarch64_parse_tune (void)
6154 const struct processor *cpu;
6155 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6156 strcpy (str, aarch64_tune_string);
6158 /* Loop through the list of supported CPUs to find a match. */
6159 for (cpu = all_cores; cpu->name != NULL; cpu++)
6161 if (strcmp (cpu->name, str) == 0)
6163 selected_tune = cpu;
6164 return;
6168 /* CPU name not found in list. */
6169 error ("unknown value %qs for -mtune", str);
6170 return;
6174 /* Implement TARGET_OPTION_OVERRIDE. */
6176 static void
6177 aarch64_override_options (void)
6179 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6180 If either of -march or -mtune is given, they override their
6181 respective component of -mcpu.
6183 So, first parse AARCH64_CPU_STRING, then the others, be careful
6184 with -march as, if -mcpu is not present on the command line, march
6185 must set a sensible default CPU. */
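  /* As a concrete illustration (flag spellings assumed rather than taken
     from this file): -mcpu=cortex-a53 acts roughly like
     -march=armv8-a -mtune=cortex-a53, while supplying an explicit
     -march=armv8-a+crc alongside it replaces only the ISA flags and the
     tuning still follows cortex-a53.  */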
6186 if (aarch64_cpu_string)
6188 aarch64_parse_cpu ();
6191 if (aarch64_arch_string)
6193 aarch64_parse_arch ();
6196 if (aarch64_tune_string)
6198 aarch64_parse_tune ();
6201 #ifndef HAVE_AS_MABI_OPTION
6202 /* The compiler may have been configured with 2.23.* binutils, which does
6203 not have support for ILP32. */
6204 if (TARGET_ILP32)
6205 error ("Assembler does not support -mabi=ilp32");
6206 #endif
6208 initialize_aarch64_code_model ();
6210 aarch64_build_bitmask_table ();
6212 /* This target defaults to strict volatile bitfields. */
6213 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6214 flag_strict_volatile_bitfields = 1;
6216 /* If the user did not specify a processor, choose the default
6217 one for them. This will be the CPU set during configuration using
6218 --with-cpu, otherwise it is "generic". */
6219 if (!selected_cpu)
6221 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6222 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6225 gcc_assert (selected_cpu);
6227 /* The selected cpu may be an architecture, so lookup tuning by core ID. */
6228 if (!selected_tune)
6229 selected_tune = &all_cores[selected_cpu->core];
6231 aarch64_tune_flags = selected_tune->flags;
6232 aarch64_tune = selected_tune->core;
6233 aarch64_tune_params = selected_tune->tune;
6235 aarch64_override_options_after_change ();
6238 /* Implement targetm.override_options_after_change. */
6240 static void
6241 aarch64_override_options_after_change (void)
6243 if (flag_omit_frame_pointer)
6244 flag_omit_leaf_frame_pointer = false;
6245 else if (flag_omit_leaf_frame_pointer)
6246 flag_omit_frame_pointer = true;
6249 static struct machine_function *
6250 aarch64_init_machine_status (void)
6252 struct machine_function *machine;
6253 machine = ggc_cleared_alloc<machine_function> ();
6254 return machine;
6257 void
6258 aarch64_init_expanders (void)
6260 init_machine_status = aarch64_init_machine_status;
6263 /* A checking mechanism for the implementation of the various code models. */
6264 static void
6265 initialize_aarch64_code_model (void)
6267 if (flag_pic)
6269 switch (aarch64_cmodel_var)
6271 case AARCH64_CMODEL_TINY:
6272 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6273 break;
6274 case AARCH64_CMODEL_SMALL:
6275 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6276 break;
6277 case AARCH64_CMODEL_LARGE:
6278 sorry ("code model %qs with -f%s", "large",
6279 flag_pic > 1 ? "PIC" : "pic");
6280 default:
6281 gcc_unreachable ();
6284 else
6285 aarch64_cmodel = aarch64_cmodel_var;
6288 /* Return true if SYMBOL_REF X binds locally. */
6290 static bool
6291 aarch64_symbol_binds_local_p (const_rtx x)
6293 return (SYMBOL_REF_DECL (x)
6294 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6295 : SYMBOL_REF_LOCAL_P (x));
6298 /* Return true if SYMBOL_REF X is thread local */
6299 static bool
6300 aarch64_tls_symbol_p (rtx x)
6302 if (! TARGET_HAVE_TLS)
6303 return false;
6305 if (GET_CODE (x) != SYMBOL_REF)
6306 return false;
6308 return SYMBOL_REF_TLS_MODEL (x) != 0;
6311 /* Classify a TLS symbol into one of the TLS kinds. */
6312 enum aarch64_symbol_type
6313 aarch64_classify_tls_symbol (rtx x)
6315 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6317 switch (tls_kind)
6319 case TLS_MODEL_GLOBAL_DYNAMIC:
6320 case TLS_MODEL_LOCAL_DYNAMIC:
6321 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6323 case TLS_MODEL_INITIAL_EXEC:
6324 return SYMBOL_SMALL_GOTTPREL;
6326 case TLS_MODEL_LOCAL_EXEC:
6327 return SYMBOL_SMALL_TPREL;
6329 case TLS_MODEL_EMULATED:
6330 case TLS_MODEL_NONE:
6331 return SYMBOL_FORCE_TO_MEM;
6333 default:
6334 gcc_unreachable ();
6338 /* Return the method that should be used to access SYMBOL_REF or
6339 LABEL_REF X in context CONTEXT. */
6341 enum aarch64_symbol_type
6342 aarch64_classify_symbol (rtx x,
6343 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6345 if (GET_CODE (x) == LABEL_REF)
6347 switch (aarch64_cmodel)
6349 case AARCH64_CMODEL_LARGE:
6350 return SYMBOL_FORCE_TO_MEM;
6352 case AARCH64_CMODEL_TINY_PIC:
6353 case AARCH64_CMODEL_TINY:
6354 return SYMBOL_TINY_ABSOLUTE;
6356 case AARCH64_CMODEL_SMALL_PIC:
6357 case AARCH64_CMODEL_SMALL:
6358 return SYMBOL_SMALL_ABSOLUTE;
6360 default:
6361 gcc_unreachable ();
6365 if (GET_CODE (x) == SYMBOL_REF)
6367 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6368 return SYMBOL_FORCE_TO_MEM;
6370 if (aarch64_tls_symbol_p (x))
6371 return aarch64_classify_tls_symbol (x);
6373 switch (aarch64_cmodel)
6375 case AARCH64_CMODEL_TINY:
6376 if (SYMBOL_REF_WEAK (x))
6377 return SYMBOL_FORCE_TO_MEM;
6378 return SYMBOL_TINY_ABSOLUTE;
6380 case AARCH64_CMODEL_SMALL:
6381 if (SYMBOL_REF_WEAK (x))
6382 return SYMBOL_FORCE_TO_MEM;
6383 return SYMBOL_SMALL_ABSOLUTE;
6385 case AARCH64_CMODEL_TINY_PIC:
6386 if (!aarch64_symbol_binds_local_p (x))
6387 return SYMBOL_TINY_GOT;
6388 return SYMBOL_TINY_ABSOLUTE;
6390 case AARCH64_CMODEL_SMALL_PIC:
6391 if (!aarch64_symbol_binds_local_p (x))
6392 return SYMBOL_SMALL_GOT;
6393 return SYMBOL_SMALL_ABSOLUTE;
6395 default:
6396 gcc_unreachable ();
6400 /* By default push everything into the constant pool. */
6401 return SYMBOL_FORCE_TO_MEM;
6404 bool
6405 aarch64_constant_address_p (rtx x)
6407 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6410 bool
6411 aarch64_legitimate_pic_operand_p (rtx x)
6413 if (GET_CODE (x) == SYMBOL_REF
6414 || (GET_CODE (x) == CONST
6415 && GET_CODE (XEXP (x, 0)) == PLUS
6416 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6417 return false;
6419 return true;
6422 /* Return true if X holds either a quarter-precision or
6423 floating-point +0.0 constant. */
6424 static bool
6425 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6427 if (!CONST_DOUBLE_P (x))
6428 return false;
6430 /* TODO: We could handle moving 0.0 to a TFmode register,
6431 but first we would like to refactor the movtf_aarch64
6432 to be more amenable to splitting moves properly and
6433 correctly gating on TARGET_SIMD. For now, reject all
6434 constants that are not destined for SFmode or DFmode registers. */
6435 if (!(mode == SFmode || mode == DFmode))
6436 return false;
6438 if (aarch64_float_const_zero_rtx_p (x))
6439 return true;
6440 return aarch64_float_const_representable_p (x);
6443 static bool
6444 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6446 /* Do not allow vector struct mode constants. We could support
6447 0 and -1 easily, but they need support in aarch64-simd.md. */
6448 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6449 return false;
6451 /* This could probably go away because
6452 we now decompose CONST_INTs according to expand_mov_immediate. */
6453 if ((GET_CODE (x) == CONST_VECTOR
6454 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6455 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6456 return !targetm.cannot_force_const_mem (mode, x);
6458 if (GET_CODE (x) == HIGH
6459 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6460 return true;
6462 return aarch64_constant_address_p (x);
6466 aarch64_load_tp (rtx target)
6468 if (!target
6469 || GET_MODE (target) != Pmode
6470 || !register_operand (target, Pmode))
6471 target = gen_reg_rtx (Pmode);
6473 /* Can return in any reg. */
6474 emit_insn (gen_aarch64_load_tp_hard (target));
6475 return target;
6478 /* On AAPCS systems, this is the "struct __va_list". */
6479 static GTY(()) tree va_list_type;
6481 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6482 Return the type to use as __builtin_va_list.
6484 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6486 struct __va_list
6488 void *__stack;
6489 void *__gr_top;
6490 void *__vr_top;
6491 int __gr_offs;
6492 int __vr_offs;
6493 }; */
6495 static tree
6496 aarch64_build_builtin_va_list (void)
6498 tree va_list_name;
6499 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6501 /* Create the type. */
6502 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6503 /* Give it the required name. */
6504 va_list_name = build_decl (BUILTINS_LOCATION,
6505 TYPE_DECL,
6506 get_identifier ("__va_list"),
6507 va_list_type);
6508 DECL_ARTIFICIAL (va_list_name) = 1;
6509 TYPE_NAME (va_list_type) = va_list_name;
6510 TYPE_STUB_DECL (va_list_type) = va_list_name;
6512 /* Create the fields. */
6513 f_stack = build_decl (BUILTINS_LOCATION,
6514 FIELD_DECL, get_identifier ("__stack"),
6515 ptr_type_node);
6516 f_grtop = build_decl (BUILTINS_LOCATION,
6517 FIELD_DECL, get_identifier ("__gr_top"),
6518 ptr_type_node);
6519 f_vrtop = build_decl (BUILTINS_LOCATION,
6520 FIELD_DECL, get_identifier ("__vr_top"),
6521 ptr_type_node);
6522 f_groff = build_decl (BUILTINS_LOCATION,
6523 FIELD_DECL, get_identifier ("__gr_offs"),
6524 integer_type_node);
6525 f_vroff = build_decl (BUILTINS_LOCATION,
6526 FIELD_DECL, get_identifier ("__vr_offs"),
6527 integer_type_node);
6529 DECL_ARTIFICIAL (f_stack) = 1;
6530 DECL_ARTIFICIAL (f_grtop) = 1;
6531 DECL_ARTIFICIAL (f_vrtop) = 1;
6532 DECL_ARTIFICIAL (f_groff) = 1;
6533 DECL_ARTIFICIAL (f_vroff) = 1;
6535 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6536 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6537 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6538 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6539 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6541 TYPE_FIELDS (va_list_type) = f_stack;
6542 DECL_CHAIN (f_stack) = f_grtop;
6543 DECL_CHAIN (f_grtop) = f_vrtop;
6544 DECL_CHAIN (f_vrtop) = f_groff;
6545 DECL_CHAIN (f_groff) = f_vroff;
6547 /* Compute its layout. */
6548 layout_type (va_list_type);
6550 return va_list_type;
6553 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6554 static void
6555 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6557 const CUMULATIVE_ARGS *cum;
6558 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6559 tree stack, grtop, vrtop, groff, vroff;
6560 tree t;
6561 int gr_save_area_size;
6562 int vr_save_area_size;
6563 int vr_offset;
6565 cum = &crtl->args.info;
6566 gr_save_area_size
6567 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6568 vr_save_area_size
6569 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6571 if (TARGET_GENERAL_REGS_ONLY)
6573 if (cum->aapcs_nvrn > 0)
6574 sorry ("%qs and floating point or vector arguments",
6575 "-mgeneral-regs-only");
6576 vr_save_area_size = 0;
6579 f_stack = TYPE_FIELDS (va_list_type_node);
6580 f_grtop = DECL_CHAIN (f_stack);
6581 f_vrtop = DECL_CHAIN (f_grtop);
6582 f_groff = DECL_CHAIN (f_vrtop);
6583 f_vroff = DECL_CHAIN (f_groff);
6585 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6586 NULL_TREE);
6587 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6588 NULL_TREE);
6589 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6590 NULL_TREE);
6591 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6592 NULL_TREE);
6593 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6594 NULL_TREE);
6596 /* Emit code to initialize STACK, which points to the next varargs stack
6597 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6598 by named arguments. STACK is 8-byte aligned. */
6599 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6600 if (cum->aapcs_stack_size > 0)
6601 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6602 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6603 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6605 /* Emit code to initialize GRTOP, the top of the GR save area.
6606 virtual_incoming_args_rtx should have been 16-byte aligned. */
6607 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6608 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6609 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6611 /* Emit code to initialize VRTOP, the top of the VR save area.
6612 This address is gr_save_area_size bytes below GRTOP, rounded
6613 down to the next 16-byte boundary. */
6614 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6615 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6616 STACK_BOUNDARY / BITS_PER_UNIT);
6618 if (vr_offset)
6619 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6620 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6621 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6623 /* Emit code to initialize GROFF, the offset from GRTOP of the
6624 next GPR argument. */
6625 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6626 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6627 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6629 /* Likewise emit code to initialize VROFF, the offset from VRTOP
6630 of the next VR argument. */
6631 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6632 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6633 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
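/* For illustration: for a variadic callee such as "int f (int n, ...)",
   the single named argument uses one integer argument register and no
   FP/SIMD ones, so the code above initializes the va_list roughly as

     __stack   = <incoming arg pointer>   (no named stack arguments)
     __gr_top  = <incoming arg pointer>
     __vr_top  = __gr_top - 64            (56 rounded up to 16 bytes)
     __gr_offs = -56                      (7 unused X registers * 8)
     __vr_offs = -128                     (8 unused V registers * 16)

   assuming NUM_ARG_REGS == 8 and NUM_FP_ARG_REGS == 8.  */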
6636 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6638 static tree
6639 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6640 gimple_seq *post_p ATTRIBUTE_UNUSED)
6642 tree addr;
6643 bool indirect_p;
6644 bool is_ha; /* is HFA or HVA. */
6645 bool dw_align; /* double-word align. */
6646 enum machine_mode ag_mode = VOIDmode;
6647 int nregs;
6648 enum machine_mode mode;
6650 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6651 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6652 HOST_WIDE_INT size, rsize, adjust, align;
6653 tree t, u, cond1, cond2;
6655 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6656 if (indirect_p)
6657 type = build_pointer_type (type);
6659 mode = TYPE_MODE (type);
6661 f_stack = TYPE_FIELDS (va_list_type_node);
6662 f_grtop = DECL_CHAIN (f_stack);
6663 f_vrtop = DECL_CHAIN (f_grtop);
6664 f_groff = DECL_CHAIN (f_vrtop);
6665 f_vroff = DECL_CHAIN (f_groff);
6667 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6668 f_stack, NULL_TREE);
6669 size = int_size_in_bytes (type);
6670 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6672 dw_align = false;
6673 adjust = 0;
6674 if (aarch64_vfp_is_call_or_return_candidate (mode,
6675 type,
6676 &ag_mode,
6677 &nregs,
6678 &is_ha))
6680 /* TYPE passed in fp/simd registers. */
6681 if (TARGET_GENERAL_REGS_ONLY)
6682 sorry ("%qs and floating point or vector arguments",
6683 "-mgeneral-regs-only");
6685 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6686 unshare_expr (valist), f_vrtop, NULL_TREE);
6687 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6688 unshare_expr (valist), f_vroff, NULL_TREE);
6690 rsize = nregs * UNITS_PER_VREG;
6692 if (is_ha)
6694 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6695 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6697 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6698 && size < UNITS_PER_VREG)
6700 adjust = UNITS_PER_VREG - size;
6703 else
6705 /* TYPE passed in general registers. */
6706 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6707 unshare_expr (valist), f_grtop, NULL_TREE);
6708 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6709 unshare_expr (valist), f_groff, NULL_TREE);
6710 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6711 nregs = rsize / UNITS_PER_WORD;
6713 if (align > 8)
6714 dw_align = true;
6716 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6717 && size < UNITS_PER_WORD)
6719 adjust = UNITS_PER_WORD - size;
6723 /* Get a local temporary for the field value. */
6724 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6726 /* Emit code to branch if off >= 0. */
6727 t = build2 (GE_EXPR, boolean_type_node, off,
6728 build_int_cst (TREE_TYPE (off), 0));
6729 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6731 if (dw_align)
6733 /* Emit: offs = (offs + 15) & -16. */
6734 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6735 build_int_cst (TREE_TYPE (off), 15));
6736 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6737 build_int_cst (TREE_TYPE (off), -16));
6738 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6740 else
6741 roundup = NULL;
6743 /* Update ap.__[g|v]r_offs */
6744 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6745 build_int_cst (TREE_TYPE (off), rsize));
6746 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6748 /* String up. */
6749 if (roundup)
6750 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6752 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6753 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6754 build_int_cst (TREE_TYPE (f_off), 0));
6755 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6757 /* String up: make sure the assignment happens before the use. */
6758 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6759 COND_EXPR_ELSE (cond1) = t;
6761 /* Prepare the trees handling the argument that is passed on the stack;
6762 the top-level node will be stored in ON_STACK. */
6763 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6764 if (align > 8)
6766 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
6767 t = fold_convert (intDI_type_node, arg);
6768 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6769 build_int_cst (TREE_TYPE (t), 15));
6770 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6771 build_int_cst (TREE_TYPE (t), -16));
6772 t = fold_convert (TREE_TYPE (arg), t);
6773 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6775 else
6776 roundup = NULL;
6777 /* Advance ap.__stack */
6778 t = fold_convert (intDI_type_node, arg);
6779 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6780 build_int_cst (TREE_TYPE (t), size + 7));
6781 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6782 build_int_cst (TREE_TYPE (t), -8));
6783 t = fold_convert (TREE_TYPE (arg), t);
6784 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6785 /* String up roundup and advance. */
6786 if (roundup)
6787 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6788 /* String up with arg */
6789 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6790 /* Big-endianness related address adjustment. */
6791 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6792 && size < UNITS_PER_WORD)
6794 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6795 size_int (UNITS_PER_WORD - size));
6796 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6799 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6800 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6802 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
6803 t = off;
6804 if (adjust)
6805 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6806 build_int_cst (TREE_TYPE (off), adjust));
6808 t = fold_convert (sizetype, t);
6809 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6811 if (is_ha)
6813 /* type ha; // treat as "struct {ftype field[n];}"
6814 ... [computing offs]
6815 for (i = 0; i <nregs; ++i, offs += 16)
6816 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6817 return ha; */
6818 int i;
6819 tree tmp_ha, field_t, field_ptr_t;
6821 /* Declare a local variable. */
6822 tmp_ha = create_tmp_var_raw (type, "ha");
6823 gimple_add_tmp_var (tmp_ha);
6825 /* Establish the base type. */
6826 switch (ag_mode)
6828 case SFmode:
6829 field_t = float_type_node;
6830 field_ptr_t = float_ptr_type_node;
6831 break;
6832 case DFmode:
6833 field_t = double_type_node;
6834 field_ptr_t = double_ptr_type_node;
6835 break;
6836 case TFmode:
6837 field_t = long_double_type_node;
6838 field_ptr_t = long_double_ptr_type_node;
6839 break;
6840 /* Half-precision and quad-precision floating point are not fully supported
6841 yet. Enable the following code once that support is complete; the correct
6842 type node for __fp16 * still needs to be found. */
6843 #if 0
6844 case HFmode:
6845 field_t = float_type_node;
6846 field_ptr_t = float_ptr_type_node;
6847 break;
6848 #endif
6849 case V2SImode:
6850 case V4SImode:
6852 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
6853 field_t = build_vector_type_for_mode (innertype, ag_mode);
6854 field_ptr_t = build_pointer_type (field_t);
6856 break;
6857 default:
6858 gcc_assert (0);
6861 /* *((field_ptr_t)&ha) = *((field_ptr_t)vr_saved_area) */
6862 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
6863 addr = t;
6864 t = fold_convert (field_ptr_t, addr);
6865 t = build2 (MODIFY_EXPR, field_t,
6866 build1 (INDIRECT_REF, field_t, tmp_ha),
6867 build1 (INDIRECT_REF, field_t, t));
6869 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
6870 for (i = 1; i < nregs; ++i)
6872 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
6873 u = fold_convert (field_ptr_t, addr);
6874 u = build2 (MODIFY_EXPR, field_t,
6875 build2 (MEM_REF, field_t, tmp_ha,
6876 build_int_cst (field_ptr_t,
6877 (i *
6878 int_size_in_bytes (field_t)))),
6879 build1 (INDIRECT_REF, field_t, u));
6880 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
6883 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
6884 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
6887 COND_EXPR_ELSE (cond2) = t;
6888 addr = fold_convert (build_pointer_type (type), cond1);
6889 addr = build_va_arg_indirect_ref (addr);
6891 if (indirect_p)
6892 addr = build_va_arg_indirect_ref (addr);
6894 return addr;
6897 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
6899 static void
6900 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
6901 tree type, int *pretend_size ATTRIBUTE_UNUSED,
6902 int no_rtl)
6904 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6905 CUMULATIVE_ARGS local_cum;
6906 int gr_saved, vr_saved;
6908 /* The caller has advanced CUM up to, but not beyond, the last named
6909 argument. Advance a local copy of CUM past the last "real" named
6910 argument, to find out how many registers are left over. */
6911 local_cum = *cum;
6912 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
6914 /* Find out how many registers we need to save. */
6915 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
6916 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
6918 if (TARGET_GENERAL_REGS_ONLY)
6920 if (local_cum.aapcs_nvrn > 0)
6921 sorry ("%qs and floating point or vector arguments",
6922 "-mgeneral-regs-only");
6923 vr_saved = 0;
6926 if (!no_rtl)
6928 if (gr_saved > 0)
6930 rtx ptr, mem;
6932 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
6933 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
6934 - gr_saved * UNITS_PER_WORD);
6935 mem = gen_frame_mem (BLKmode, ptr);
6936 set_mem_alias_set (mem, get_varargs_alias_set ());
6938 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
6939 mem, gr_saved);
6941 if (vr_saved > 0)
6943 /* We can't use move_block_from_reg, because it will use
6944 the wrong mode, storing D regs only. */
6945 enum machine_mode mode = TImode;
6946 int off, i;
6948 /* Set OFF to the offset from virtual_incoming_args_rtx of
6949 the first vector register. The VR save area lies below
6950 the GR one, and is aligned to 16 bytes. */
6951 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
6952 STACK_BOUNDARY / BITS_PER_UNIT);
6953 off -= vr_saved * UNITS_PER_VREG;
6955 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
6957 rtx ptr, mem;
6959 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
6960 mem = gen_frame_mem (mode, ptr);
6961 set_mem_alias_set (mem, get_varargs_alias_set ());
6962 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
6963 off += UNITS_PER_VREG;
6968 /* We don't save the size into *PRETEND_SIZE because we want to avoid
6969 any complication of having crtl->args.pretend_args_size changed. */
6970 cfun->machine->frame.saved_varargs_size
6971 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
6972 STACK_BOUNDARY / BITS_PER_UNIT)
6973 + vr_saved * UNITS_PER_VREG);
6976 static void
6977 aarch64_conditional_register_usage (void)
6979 int i;
6980 if (!TARGET_FLOAT)
6982 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
6984 fixed_regs[i] = 1;
6985 call_used_regs[i] = 1;
6990 /* Walk down the type tree of TYPE counting consecutive base elements.
6991 If *MODEP is VOIDmode, then set it to the first valid floating point
6992 type. If a non-floating point type is found, or if a floating point
6993 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
6994 otherwise return the count in the sub-tree. */
6995 static int
6996 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
6998 enum machine_mode mode;
6999 HOST_WIDE_INT size;
7001 switch (TREE_CODE (type))
7003 case REAL_TYPE:
7004 mode = TYPE_MODE (type);
7005 if (mode != DFmode && mode != SFmode && mode != TFmode)
7006 return -1;
7008 if (*modep == VOIDmode)
7009 *modep = mode;
7011 if (*modep == mode)
7012 return 1;
7014 break;
7016 case COMPLEX_TYPE:
7017 mode = TYPE_MODE (TREE_TYPE (type));
7018 if (mode != DFmode && mode != SFmode && mode != TFmode)
7019 return -1;
7021 if (*modep == VOIDmode)
7022 *modep = mode;
7024 if (*modep == mode)
7025 return 2;
7027 break;
7029 case VECTOR_TYPE:
7030 /* Use V2SImode and V4SImode as representatives of all 64-bit
7031 and 128-bit vector types. */
7032 size = int_size_in_bytes (type);
7033 switch (size)
7035 case 8:
7036 mode = V2SImode;
7037 break;
7038 case 16:
7039 mode = V4SImode;
7040 break;
7041 default:
7042 return -1;
7045 if (*modep == VOIDmode)
7046 *modep = mode;
7048 /* Vector modes are considered to be opaque: two vectors are
7049 equivalent for the purposes of being homogeneous aggregates
7050 if they are the same size. */
7051 if (*modep == mode)
7052 return 1;
7054 break;
7056 case ARRAY_TYPE:
7058 int count;
7059 tree index = TYPE_DOMAIN (type);
7061 /* Can't handle incomplete types nor sizes that are not
7062 fixed. */
7063 if (!COMPLETE_TYPE_P (type)
7064 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7065 return -1;
7067 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7068 if (count == -1
7069 || !index
7070 || !TYPE_MAX_VALUE (index)
7071 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7072 || !TYPE_MIN_VALUE (index)
7073 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7074 || count < 0)
7075 return -1;
7077 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7078 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7080 /* There must be no padding. */
7081 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7082 return -1;
7084 return count;
7087 case RECORD_TYPE:
7089 int count = 0;
7090 int sub_count;
7091 tree field;
7093 /* Can't handle incomplete types nor sizes that are not
7094 fixed. */
7095 if (!COMPLETE_TYPE_P (type)
7096 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7097 return -1;
7099 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7101 if (TREE_CODE (field) != FIELD_DECL)
7102 continue;
7104 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7105 if (sub_count < 0)
7106 return -1;
7107 count += sub_count;
7110 /* There must be no padding. */
7111 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7112 return -1;
7114 return count;
7117 case UNION_TYPE:
7118 case QUAL_UNION_TYPE:
7120 /* These aren't very interesting except in a degenerate case. */
7121 int count = 0;
7122 int sub_count;
7123 tree field;
7125 /* Can't handle incomplete types nor sizes that are not
7126 fixed. */
7127 if (!COMPLETE_TYPE_P (type)
7128 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7129 return -1;
7131 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7133 if (TREE_CODE (field) != FIELD_DECL)
7134 continue;
7136 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7137 if (sub_count < 0)
7138 return -1;
7139 count = count > sub_count ? count : sub_count;
7142 /* There must be no padding. */
7143 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7144 return -1;
7146 return count;
7149 default:
7150 break;
7153 return -1;
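/* Illustrative classifications for the walk above, assuming the usual
   type layouts:

     struct { float x, y, z; }       -> 3, *modep == SFmode (an HFA)
     struct { double re, im; }       -> 2, *modep == DFmode
     float32x4_t a[2]                -> 2, *modep == V4SImode
     struct { float f; double d; }   -> -1 (mismatched element modes)
     struct { float f; int i; }      -> -1 (non-floating-point member)  */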
7156 /* Return true if we use LRA instead of the reload pass. */
7157 static bool
7158 aarch64_lra_p (void)
7160 return aarch64_lra_flag;
7163 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7164 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7165 array types. The C99 floating-point complex types are also considered
7166 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7167 types, which are GCC extensions and outside the scope of AAPCS64, are
7168 treated as composite types here as well.
7170 Note that MODE itself is not sufficient in determining whether a type
7171 is such a composite type or not. This is because
7172 stor-layout.c:compute_record_mode may have already changed the MODE
7173 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7174 structure with only one field may have its MODE set to the mode of the
7175 field. Also an integer mode whose size matches the size of the
7176 RECORD_TYPE type may be used in place of the original mode
7177 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7178 solely relied on. */
7180 static bool
7181 aarch64_composite_type_p (const_tree type,
7182 enum machine_mode mode)
7184 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7185 return true;
7187 if (mode == BLKmode
7188 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7189 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7190 return true;
7192 return false;
7195 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7196 type as described in AAPCS64 \S 4.1.2.
7198 See the comment above aarch64_composite_type_p for the notes on MODE. */
7200 static bool
7201 aarch64_short_vector_p (const_tree type,
7202 enum machine_mode mode)
7204 HOST_WIDE_INT size = -1;
7206 if (type && TREE_CODE (type) == VECTOR_TYPE)
7207 size = int_size_in_bytes (type);
7208 else if (!aarch64_composite_type_p (type, mode)
7209 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7210 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7211 size = GET_MODE_SIZE (mode);
7213 return (size == 8 || size == 16) ? true : false;
7216 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7217 shall be passed or returned in simd/fp register(s) (providing these
7218 parameter passing registers are available).
7220 Upon successful return, *COUNT returns the number of needed registers,
7221 *BASE_MODE returns the mode of the individual register and when IS_HA
7222 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7223 floating-point aggregate or a homogeneous short-vector aggregate. */
7225 static bool
7226 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7227 const_tree type,
7228 enum machine_mode *base_mode,
7229 int *count,
7230 bool *is_ha)
7232 enum machine_mode new_mode = VOIDmode;
7233 bool composite_p = aarch64_composite_type_p (type, mode);
7235 if (is_ha != NULL) *is_ha = false;
7237 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7238 || aarch64_short_vector_p (type, mode))
7240 *count = 1;
7241 new_mode = mode;
7243 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7245 if (is_ha != NULL) *is_ha = true;
7246 *count = 2;
7247 new_mode = GET_MODE_INNER (mode);
7249 else if (type && composite_p)
7251 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7253 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7255 if (is_ha != NULL) *is_ha = true;
7256 *count = ag_count;
7258 else
7259 return false;
7261 else
7262 return false;
7264 *base_mode = new_mode;
7265 return true;
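/* For example: a plain "double" gives *count == 1 and *base_mode == DFmode;
   a "_Complex double" gives *count == 2, *base_mode == DFmode and *is_ha
   set; a structure of four floats gives *count == 4, *base_mode == SFmode
   and *is_ha set, so it can be passed in s0-s3 when enough registers
   remain.  */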
7268 /* Implement TARGET_STRUCT_VALUE_RTX. */
7270 static rtx
7271 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7272 int incoming ATTRIBUTE_UNUSED)
7274 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7277 /* Implements target hook vector_mode_supported_p. */
7278 static bool
7279 aarch64_vector_mode_supported_p (enum machine_mode mode)
7281 if (TARGET_SIMD
7282 && (mode == V4SImode || mode == V8HImode
7283 || mode == V16QImode || mode == V2DImode
7284 || mode == V2SImode || mode == V4HImode
7285 || mode == V8QImode || mode == V2SFmode
7286 || mode == V4SFmode || mode == V2DFmode
7287 || mode == V1DFmode))
7288 return true;
7290 return false;
7293 /* Return appropriate SIMD container
7294 for MODE within a vector of WIDTH bits. */
7295 static enum machine_mode
7296 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7298 gcc_assert (width == 64 || width == 128);
7299 if (TARGET_SIMD)
7301 if (width == 128)
7302 switch (mode)
7304 case DFmode:
7305 return V2DFmode;
7306 case SFmode:
7307 return V4SFmode;
7308 case SImode:
7309 return V4SImode;
7310 case HImode:
7311 return V8HImode;
7312 case QImode:
7313 return V16QImode;
7314 case DImode:
7315 return V2DImode;
7316 default:
7317 break;
7319 else
7320 switch (mode)
7322 case SFmode:
7323 return V2SFmode;
7324 case SImode:
7325 return V2SImode;
7326 case HImode:
7327 return V4HImode;
7328 case QImode:
7329 return V8QImode;
7330 default:
7331 break;
7334 return word_mode;
7337 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7338 static enum machine_mode
7339 aarch64_preferred_simd_mode (enum machine_mode mode)
7341 return aarch64_simd_container_mode (mode, 128);
7344 /* Return the bitmask of possible vector sizes for the vectorizer
7345 to iterate over. */
7346 static unsigned int
7347 aarch64_autovectorize_vector_sizes (void)
7349 return (16 | 8);
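/* The value above is a bitmask of vector sizes in bytes; 16 | 8 lets the
   vectorizer consider both 128-bit and 64-bit Advanced SIMD vectors.  */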
7352 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7353 vector types in order to conform to the AAPCS64 (see "Procedure
7354 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7355 qualify for emission with the mangled names defined in that document,
7356 a vector type must not only be of the correct mode but also be
7357 composed of AdvSIMD vector element types (e.g.
7358 __builtin_aarch64_simd_qi); these types are registered by
7359 aarch64_init_simd_builtins (). In other words, vector types defined
7360 in other ways e.g. via vector_size attribute will get default
7361 mangled names. */
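/* For example, int8x8_t from <arm_neon.h> has element type
   __builtin_aarch64_simd_qi and mode V8QImode, so the table below maps it
   to "10__Int8x8_t"; a C++ function "void f (int8x8_t)" would therefore
   mangle to something like _Z1f10__Int8x8_t.  */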
7362 typedef struct
7364 enum machine_mode mode;
7365 const char *element_type_name;
7366 const char *mangled_name;
7367 } aarch64_simd_mangle_map_entry;
7369 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7370 /* 64-bit containerized types. */
7371 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7372 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7373 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7374 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7375 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7376 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7377 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7378 { DImode, "__builtin_aarch64_simd_di", "11__Int64x1_t" },
7379 { DImode, "__builtin_aarch64_simd_udi", "12__Uint64x1_t" },
7380 { V1DFmode, "__builtin_aarch64_simd_df", "13__Float64x1_t" },
7381 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7382 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7383 /* 128-bit containerized types. */
7384 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7385 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7386 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7387 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7388 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7389 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7390 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7391 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7392 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7393 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7394 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7395 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7396 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7397 { VOIDmode, NULL, NULL }
7400 /* Implement TARGET_MANGLE_TYPE. */
7402 static const char *
7403 aarch64_mangle_type (const_tree type)
7405 /* The AArch64 ABI documents say that "__va_list" has to be
7406 mangled as if it is in the "std" namespace. */
7407 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7408 return "St9__va_list";
7410 /* Check the mode of the vector type, and the name of the vector
7411 element type, against the table. */
7412 if (TREE_CODE (type) == VECTOR_TYPE)
7414 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7416 while (pos->mode != VOIDmode)
7418 tree elt_type = TREE_TYPE (type);
7420 if (pos->mode == TYPE_MODE (type)
7421 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7422 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7423 pos->element_type_name))
7424 return pos->mangled_name;
7426 pos++;
7430 /* Use the default mangling. */
7431 return NULL;
7434 /* Return the equivalent letter for size. */
7435 static char
7436 sizetochar (int size)
7438 switch (size)
7440 case 64: return 'd';
7441 case 32: return 's';
7442 case 16: return 'h';
7443 case 8 : return 'b';
7444 default: gcc_unreachable ();
7448 /* Return true iff X is a uniform vector of floating-point
7449 constants, and the constant can be represented in
7450 quarter-precision form. Note, as aarch64_float_const_representable_p
7451 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
7452 static bool
7453 aarch64_vect_float_const_representable_p (rtx x)
7455 int i = 0;
7456 REAL_VALUE_TYPE r0, ri;
7457 rtx x0, xi;
7459 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7460 return false;
7462 x0 = CONST_VECTOR_ELT (x, 0);
7463 if (!CONST_DOUBLE_P (x0))
7464 return false;
7466 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7468 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7470 xi = CONST_VECTOR_ELT (x, i);
7471 if (!CONST_DOUBLE_P (xi))
7472 return false;
7474 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7475 if (!REAL_VALUES_EQUAL (r0, ri))
7476 return false;
7479 return aarch64_float_const_representable_p (x0);
7482 /* Return true for valid and false for invalid. */
7483 bool
7484 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7485 struct simd_immediate_info *info)
7487 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7488 matches = 1; \
7489 for (i = 0; i < idx; i += (STRIDE)) \
7490 if (!(TEST)) \
7491 matches = 0; \
7492 if (matches) \
7494 immtype = (CLASS); \
7495 elsize = (ELSIZE); \
7496 eshift = (SHIFT); \
7497 emvn = (NEG); \
7498 break; \
7501 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7502 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7503 unsigned char bytes[16];
7504 int immtype = -1, matches;
7505 unsigned int invmask = inverse ? 0xff : 0;
7506 int eshift, emvn;
7508 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7510 if (! (aarch64_simd_imm_zero_p (op, mode)
7511 || aarch64_vect_float_const_representable_p (op)))
7512 return false;
7514 if (info)
7516 info->value = CONST_VECTOR_ELT (op, 0);
7517 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7518 info->mvn = false;
7519 info->shift = 0;
7522 return true;
7525 /* Splat vector constant out into a byte vector. */
7526 for (i = 0; i < n_elts; i++)
7528 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7529 it must be laid out in the vector register in reverse order. */
7530 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7531 unsigned HOST_WIDE_INT elpart;
7532 unsigned int part, parts;
7534 if (GET_CODE (el) == CONST_INT)
7536 elpart = INTVAL (el);
7537 parts = 1;
7539 else if (GET_CODE (el) == CONST_DOUBLE)
7541 elpart = CONST_DOUBLE_LOW (el);
7542 parts = 2;
7544 else
7545 gcc_unreachable ();
7547 for (part = 0; part < parts; part++)
7549 unsigned int byte;
7550 for (byte = 0; byte < innersize; byte++)
7552 bytes[idx++] = (elpart & 0xff) ^ invmask;
7553 elpart >>= BITS_PER_UNIT;
7555 if (GET_CODE (el) == CONST_DOUBLE)
7556 elpart = CONST_DOUBLE_HIGH (el);
7560 /* Sanity check. */
7561 gcc_assert (idx == GET_MODE_SIZE (mode));
7565 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7566 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7568 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7569 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7571 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7572 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7574 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7575 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7577 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7579 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7581 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7582 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7584 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7585 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7587 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7588 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7590 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7591 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7593 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7595 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7597 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7598 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7600 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7601 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7603 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7604 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7606 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7607 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7609 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7611 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7612 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7614 while (0);
7616 if (immtype == -1)
7617 return false;
7619 if (info)
7621 info->element_width = elsize;
7622 info->mvn = emvn != 0;
7623 info->shift = eshift;
7625 unsigned HOST_WIDE_INT imm = 0;
7627 if (immtype >= 12 && immtype <= 15)
7628 info->msl = true;
7630 /* Un-invert bytes of recognized vector, if necessary. */
7631 if (invmask != 0)
7632 for (i = 0; i < idx; i++)
7633 bytes[i] ^= invmask;
7635 if (immtype == 17)
7637 /* FIXME: Broken on 32-bit H_W_I hosts. */
7638 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7640 for (i = 0; i < 8; i++)
7641 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7642 << (i * BITS_PER_UNIT);
7645 info->value = GEN_INT (imm);
7647 else
7649 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7650 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7652 /* Construct 'abcdefgh' because the assembler cannot handle
7653 generic constants. */
7654 if (info->mvn)
7655 imm = ~imm;
7656 imm = (imm >> info->shift) & 0xff;
7657 info->value = GEN_INT (imm);
7661 return true;
7662 #undef CHECK
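/* A worked example of the recognition above: a V4SImode vector whose
   elements are all 0x00ff0000 splats to the byte pattern 00 00 ff 00
   (least-significant byte first) repeated four times, which matches the
   elsize-32, shift-16 case, so INFO is filled in with value 0xff,
   shift 16 and mvn false -- i.e. the constant can be built with
   "movi v0.4s, #0xff, lsl #16".  */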
7665 static bool
7666 aarch64_const_vec_all_same_int_p (rtx x,
7667 HOST_WIDE_INT minval,
7668 HOST_WIDE_INT maxval)
7670 HOST_WIDE_INT firstval;
7671 int count, i;
7673 if (GET_CODE (x) != CONST_VECTOR
7674 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
7675 return false;
7677 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
7678 if (firstval < minval || firstval > maxval)
7679 return false;
7681 count = CONST_VECTOR_NUNITS (x);
7682 for (i = 1; i < count; i++)
7683 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
7684 return false;
7686 return true;
7689 /* Check whether immediate shift constants are within range. */
7690 bool
7691 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7693 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7694 if (left)
7695 return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
7696 else
7697 return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
7700 /* Return true if X is a uniform vector where all elements
7701 are either the floating-point constant 0.0 or the
7702 integer constant 0. */
7703 bool
7704 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7706 return x == CONST0_RTX (mode);
7709 bool
7710 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7712 HOST_WIDE_INT imm = INTVAL (x);
7713 int i;
7715 for (i = 0; i < 8; i++)
7717 unsigned int byte = imm & 0xff;
7718 if (byte != 0xff && byte != 0)
7719 return false;
7720 imm >>= 8;
7723 return true;
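/* For example, 0x00ff00ff00ff00ff and 0xffffffff00000000 are accepted
   (every byte is either 0x00 or 0xff) while 0x0000000000000001 is not;
   these are exactly the scalars the 64-bit byte-mask form of MOVI can
   materialize.  */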
7726 bool
7727 aarch64_mov_operand_p (rtx x,
7728 enum aarch64_symbol_context context,
7729 enum machine_mode mode)
7731 if (GET_CODE (x) == HIGH
7732 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7733 return true;
7735 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7736 return true;
7738 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7739 return true;
7741 return aarch64_classify_symbolic_expression (x, context)
7742 == SYMBOL_TINY_ABSOLUTE;
7745 /* Return a const_int vector of VAL. */
7747 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7749 int nunits = GET_MODE_NUNITS (mode);
7750 rtvec v = rtvec_alloc (nunits);
7751 int i;
7753 for (i=0; i < nunits; i++)
7754 RTVEC_ELT (v, i) = GEN_INT (val);
7756 return gen_rtx_CONST_VECTOR (mode, v);
7759 /* Check OP is a legal scalar immediate for the MOVI instruction. */
7761 bool
7762 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7764 enum machine_mode vmode;
7766 gcc_assert (!VECTOR_MODE_P (mode));
7767 vmode = aarch64_preferred_simd_mode (mode);
7768 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7769 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7772 /* Construct and return a PARALLEL RTX vector. */
7774 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7776 int nunits = GET_MODE_NUNITS (mode);
7777 rtvec v = rtvec_alloc (nunits / 2);
7778 int base = high ? nunits / 2 : 0;
7779 rtx t1;
7780 int i;
7782 for (i=0; i < nunits / 2; i++)
7783 RTVEC_ELT (v, i) = GEN_INT (base + i);
7785 t1 = gen_rtx_PARALLEL (mode, v);
7786 return t1;
7789 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
7790 HIGH (exclusive). */
7791 void
7792 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7794 HOST_WIDE_INT lane;
7795 gcc_assert (GET_CODE (operand) == CONST_INT);
7796 lane = INTVAL (operand);
7798 if (lane < low || lane >= high)
7799 error ("lane out of range");
7802 void
7803 aarch64_simd_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7805 gcc_assert (GET_CODE (operand) == CONST_INT);
7806 HOST_WIDE_INT lane = INTVAL (operand);
7808 if (lane < low || lane >= high)
7809 error ("constant out of range");
7812 /* Emit code to reinterpret one AdvSIMD type as another,
7813 without altering bits. */
7814 void
7815 aarch64_simd_reinterpret (rtx dest, rtx src)
7817 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), src));
7820 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
7821 registers). */
7822 void
7823 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
7824 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
7825 rtx op1)
7827 rtx mem = gen_rtx_MEM (mode, destaddr);
7828 rtx tmp1 = gen_reg_rtx (mode);
7829 rtx tmp2 = gen_reg_rtx (mode);
7831 emit_insn (intfn (tmp1, op1, tmp2));
7833 emit_move_insn (mem, tmp1);
7834 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
7835 emit_move_insn (mem, tmp2);
7838 /* Return TRUE if OP is a valid vector addressing mode. */
7839 bool
7840 aarch64_simd_mem_operand_p (rtx op)
7842 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
7843 || GET_CODE (XEXP (op, 0)) == REG);
7846 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
7847 not to early-clobber SRC registers in the process.
7849 We assume that the operands described by SRC and DEST represent a
7850 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
7851 number of components into which the copy has been decomposed. */
7852 void
7853 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
7854 rtx *src, unsigned int count)
7856 unsigned int i;
7858 if (!reg_overlap_mentioned_p (operands[0], operands[1])
7859 || REGNO (operands[0]) < REGNO (operands[1]))
7861 for (i = 0; i < count; i++)
7863 operands[2 * i] = dest[i];
7864 operands[2 * i + 1] = src[i];
7867 else
7869 for (i = 0; i < count; i++)
7871 operands[2 * i] = dest[count - i - 1];
7872 operands[2 * i + 1] = src[count - i - 1];
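/* An example of the ordering above: a copy of {V1, V2} into {V0, V1} can
   be emitted forwards (V0 := V1, then V1 := V2), but a copy of {V0, V1}
   into {V1, V2} must be emitted backwards (V2 := V1, then V1 := V0) so
   that V1 is read before it is overwritten.  */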
7877 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
7878 one of the VSTRUCT modes: OI, CI or XI. */
7880 aarch64_simd_attr_length_move (rtx insn)
7882 enum machine_mode mode;
7884 extract_insn_cached (insn);
7886 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
7888 mode = GET_MODE (recog_data.operand[0]);
7889 switch (mode)
7891 case OImode:
7892 return 8;
7893 case CImode:
7894 return 12;
7895 case XImode:
7896 return 16;
7897 default:
7898 gcc_unreachable ();
7901 return 4;
7904 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
7905 alignment of a vector to 128 bits. */
7906 static HOST_WIDE_INT
7907 aarch64_simd_vector_alignment (const_tree type)
7909 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
7910 return MIN (align, 128);
7913 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
7914 static bool
7915 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
7917 if (is_packed)
7918 return false;
7920 /* We guarantee alignment for vectors up to 128-bits. */
7921 if (tree_int_cst_compare (TYPE_SIZE (type),
7922 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
7923 return false;
7925 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
7926 return true;
7929 /* If VALS is a vector constant that can be loaded into a register
7930 using DUP, generate instructions to do so and return an RTX to
7931 assign to the register. Otherwise return NULL_RTX. */
7932 static rtx
7933 aarch64_simd_dup_constant (rtx vals)
7935 enum machine_mode mode = GET_MODE (vals);
7936 enum machine_mode inner_mode = GET_MODE_INNER (mode);
7937 int n_elts = GET_MODE_NUNITS (mode);
7938 bool all_same = true;
7939 rtx x;
7940 int i;
7942 if (GET_CODE (vals) != CONST_VECTOR)
7943 return NULL_RTX;
7945 for (i = 1; i < n_elts; ++i)
7947 x = CONST_VECTOR_ELT (vals, i);
7948 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
7949 all_same = false;
7952 if (!all_same)
7953 return NULL_RTX;
7955 /* We can load this constant by using DUP and a constant in a
7956 single general-purpose register. This will be cheaper than a vector
7957 load. */
7958 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
7959 return gen_rtx_VEC_DUPLICATE (mode, x);
7963 /* Generate code to load VALS, which is a PARALLEL containing only
7964 constants (for vec_init) or CONST_VECTOR, efficiently into a
7965 register. Returns an RTX to copy into the register, or NULL_RTX
7966 for a PARALLEL that can not be converted into a CONST_VECTOR. */
7967 static rtx
7968 aarch64_simd_make_constant (rtx vals)
7970 enum machine_mode mode = GET_MODE (vals);
7971 rtx const_dup;
7972 rtx const_vec = NULL_RTX;
7973 int n_elts = GET_MODE_NUNITS (mode);
7974 int n_const = 0;
7975 int i;
7977 if (GET_CODE (vals) == CONST_VECTOR)
7978 const_vec = vals;
7979 else if (GET_CODE (vals) == PARALLEL)
7981 /* A CONST_VECTOR must contain only CONST_INTs and
7982 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
7983 Only store valid constants in a CONST_VECTOR. */
7984 for (i = 0; i < n_elts; ++i)
7986 rtx x = XVECEXP (vals, 0, i);
7987 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
7988 n_const++;
7990 if (n_const == n_elts)
7991 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
7993 else
7994 gcc_unreachable ();
7996 if (const_vec != NULL_RTX
7997 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
7998 /* Load using MOVI/MVNI. */
7999 return const_vec;
8000 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8001 /* Loaded using DUP. */
8002 return const_dup;
8003 else if (const_vec != NULL_RTX)
8004 /* Load from constant pool. We can not take advantage of single-cycle
8005 LD1 because we need a PC-relative addressing mode. */
8006 return const_vec;
8007 else
8008 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8009 We can not construct an initializer. */
8010 return NULL_RTX;
8013 void
8014 aarch64_expand_vector_init (rtx target, rtx vals)
8016 enum machine_mode mode = GET_MODE (target);
8017 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8018 int n_elts = GET_MODE_NUNITS (mode);
8019 int n_var = 0, one_var = -1;
8020 bool all_same = true;
8021 rtx x, mem;
8022 int i;
8024 x = XVECEXP (vals, 0, 0);
8025 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8026 n_var = 1, one_var = 0;
8028 for (i = 1; i < n_elts; ++i)
8030 x = XVECEXP (vals, 0, i);
8031 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8032 ++n_var, one_var = i;
8034 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8035 all_same = false;
8038 if (n_var == 0)
8040 rtx constant = aarch64_simd_make_constant (vals);
8041 if (constant != NULL_RTX)
8043 emit_move_insn (target, constant);
8044 return;
8048 /* Splat a single non-constant element if we can. */
8049 if (all_same)
8051 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8052 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8053 return;
8056 /* One field is non-constant. Load constant then overwrite varying
8057 field. This is more efficient than using the stack. */
8058 if (n_var == 1)
8060 rtx copy = copy_rtx (vals);
8061 rtx index = GEN_INT (one_var);
8062 enum insn_code icode;
8064 /* Load constant part of vector, substitute neighboring value for
8065 varying element. */
8066 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8067 aarch64_expand_vector_init (target, copy);
8069 /* Insert variable. */
8070 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8071 icode = optab_handler (vec_set_optab, mode);
8072 gcc_assert (icode != CODE_FOR_nothing);
8073 emit_insn (GEN_FCN (icode) (target, x, index));
8074 return;
8077 /* Construct the vector in memory one field at a time
8078 and load the whole vector. */
8079 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8080 for (i = 0; i < n_elts; i++)
8081 emit_move_insn (adjust_address_nv (mem, inner_mode,
8082 i * GET_MODE_SIZE (inner_mode)),
8083 XVECEXP (vals, 0, i));
8084 emit_move_insn (target, mem);
8088 static unsigned HOST_WIDE_INT
8089 aarch64_shift_truncation_mask (enum machine_mode mode)
8091 return
8092 (aarch64_vector_mode_supported_p (mode)
8093 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8096 #ifndef TLS_SECTION_ASM_FLAG
8097 #define TLS_SECTION_ASM_FLAG 'T'
8098 #endif
8100 void
8101 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8102 tree decl ATTRIBUTE_UNUSED)
8104 char flagchars[10], *f = flagchars;
8106 /* If we have already declared this section, we can use an
8107 abbreviated form to switch back to it -- unless this section is
8108 part of a COMDAT group, in which case GAS requires the full
8109 declaration every time. */
8110 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8111 && (flags & SECTION_DECLARED))
8113 fprintf (asm_out_file, "\t.section\t%s\n", name);
8114 return;
8117 if (!(flags & SECTION_DEBUG))
8118 *f++ = 'a';
8119 if (flags & SECTION_WRITE)
8120 *f++ = 'w';
8121 if (flags & SECTION_CODE)
8122 *f++ = 'x';
8123 if (flags & SECTION_SMALL)
8124 *f++ = 's';
8125 if (flags & SECTION_MERGE)
8126 *f++ = 'M';
8127 if (flags & SECTION_STRINGS)
8128 *f++ = 'S';
8129 if (flags & SECTION_TLS)
8130 *f++ = TLS_SECTION_ASM_FLAG;
8131 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8132 *f++ = 'G';
8133 *f = '\0';
8135 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8137 if (!(flags & SECTION_NOTYPE))
8139 const char *type;
8140 const char *format;
8142 if (flags & SECTION_BSS)
8143 type = "nobits";
8144 else
8145 type = "progbits";
8147 #ifdef TYPE_OPERAND_FMT
8148 format = "," TYPE_OPERAND_FMT;
8149 #else
8150 format = ",@%s";
8151 #endif
8153 fprintf (asm_out_file, format, type);
8155 if (flags & SECTION_ENTSIZE)
8156 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8157 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8159 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8160 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8161 else
8162 fprintf (asm_out_file, ",%s,comdat",
8163 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8167 putc ('\n', asm_out_file);
8170 /* Select a format to encode pointers in exception handling data. */
8172 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8174 int type;
8175 switch (aarch64_cmodel)
8177 case AARCH64_CMODEL_TINY:
8178 case AARCH64_CMODEL_TINY_PIC:
8179 case AARCH64_CMODEL_SMALL:
8180 case AARCH64_CMODEL_SMALL_PIC:
8181 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8182 for everything. */
8183 type = DW_EH_PE_sdata4;
8184 break;
8185 default:
8186 /* No assumptions here. 8-byte relocs required. */
8187 type = DW_EH_PE_sdata8;
8188 break;
8190 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8193 /* Emit load exclusive. */
8195 static void
8196 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8197 rtx mem, rtx model_rtx)
8199 rtx (*gen) (rtx, rtx, rtx);
8201 switch (mode)
8203 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8204 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8205 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8206 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8207 default:
8208 gcc_unreachable ();
8211 emit_insn (gen (rval, mem, model_rtx));
8214 /* Emit store exclusive. */
8216 static void
8217 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8218 rtx rval, rtx mem, rtx model_rtx)
8220 rtx (*gen) (rtx, rtx, rtx, rtx);
8222 switch (mode)
8224 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8225 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8226 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8227 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8228 default:
8229 gcc_unreachable ();
8232 emit_insn (gen (bval, rval, mem, model_rtx));
8235 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
8237 static void
8238 aarch64_emit_unlikely_jump (rtx insn)
8240 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8242 insn = emit_jump_insn (insn);
8243 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8246 /* Expand a compare and swap pattern. */
8248 void
8249 aarch64_expand_compare_and_swap (rtx operands[])
8251 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8252 enum machine_mode mode, cmp_mode;
8253 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8255 bval = operands[0];
8256 rval = operands[1];
8257 mem = operands[2];
8258 oldval = operands[3];
8259 newval = operands[4];
8260 is_weak = operands[5];
8261 mod_s = operands[6];
8262 mod_f = operands[7];
8263 mode = GET_MODE (mem);
8264 cmp_mode = mode;
8266 /* Normally the succ memory model must be stronger than fail, but in the
8267 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8268 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8270 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8271 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8272 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8274 switch (mode)
8276 case QImode:
8277 case HImode:
8278 /* For short modes, we're going to perform the comparison in SImode,
8279 so do the zero-extension now. */
8280 cmp_mode = SImode;
8281 rval = gen_reg_rtx (SImode);
8282 oldval = convert_modes (SImode, mode, oldval, true);
8283 /* Fall through. */
8285 case SImode:
8286 case DImode:
8287 /* Force the value into a register if needed. */
8288 if (!aarch64_plus_operand (oldval, mode))
8289 oldval = force_reg (cmp_mode, oldval);
8290 break;
8292 default:
8293 gcc_unreachable ();
8296 switch (mode)
8298 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8299 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8300 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8301 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8302 default:
8303 gcc_unreachable ();
8306 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8308 if (mode == QImode || mode == HImode)
8309 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8311 x = gen_rtx_REG (CCmode, CC_REGNUM);
8312 x = gen_rtx_EQ (SImode, x, const0_rtx);
8313 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8316 /* Split a compare and swap pattern. */
8318 void
8319 aarch64_split_compare_and_swap (rtx operands[])
8321 rtx rval, mem, oldval, newval, scratch;
8322 enum machine_mode mode;
8323 bool is_weak;
8324 rtx label1, label2, x, cond;
8326 rval = operands[0];
8327 mem = operands[1];
8328 oldval = operands[2];
8329 newval = operands[3];
8330 is_weak = (operands[4] != const0_rtx);
8331 scratch = operands[7];
8332 mode = GET_MODE (mem);
8334 label1 = NULL_RTX;
8335 if (!is_weak)
8337 label1 = gen_label_rtx ();
8338 emit_label (label1);
8340 label2 = gen_label_rtx ();
8342 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8344 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8345 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8346 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8347 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8348 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8350 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8352 if (!is_weak)
8354 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8355 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8356 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8357 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8359 else
8361 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8362 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8363 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8366 emit_label (label2);
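/* The split above produces, roughly, the canonical load/store-exclusive
   loop; the exact acquire/release forms depend on the memory model:

     label1:                          (only for a strong CAS)
       ldaxr   rval, [mem]
       cmp     rval, oldval
       b.ne    label2
       stlxr   w_scratch, newval, [mem]
       cbnz    w_scratch, label1      (retry if the exclusive store failed)
     label2:                                                              */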
8369 /* Split an atomic operation. */
8371 void
8372 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8373 rtx value, rtx model_rtx, rtx cond)
8375 enum machine_mode mode = GET_MODE (mem);
8376 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8377 rtx label, x;
8379 label = gen_label_rtx ();
8380 emit_label (label);
8382 if (new_out)
8383 new_out = gen_lowpart (wmode, new_out);
8384 if (old_out)
8385 old_out = gen_lowpart (wmode, old_out);
8386 else
8387 old_out = new_out;
8388 value = simplify_gen_subreg (wmode, value, mode, 0);
8390 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8392 switch (code)
8394 case SET:
8395 new_out = value;
8396 break;
8398 case NOT:
8399 x = gen_rtx_AND (wmode, old_out, value);
8400 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8401 x = gen_rtx_NOT (wmode, new_out);
8402 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8403 break;
8405 case MINUS:
8406 if (CONST_INT_P (value))
8408 value = GEN_INT (-INTVAL (value));
8409 code = PLUS;
8411 /* Fall through. */
8413 default:
8414 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8415 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8416 break;
8419 aarch64_emit_store_exclusive (mode, cond, mem,
8420 gen_lowpart (mode, new_out), model_rtx);
8422 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8423 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8424 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8425 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
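/* Illustrative sketch only (not generated verbatim; register choices are
   arbitrary): for an SImode atomic fetch-and-add with a relaxed memory
   model, the loop built above comes out roughly as

	.Lretry:
	  ldxr	w0, [x2]	// load-exclusive OLD_OUT
	  add	w1, w0, w3	// NEW_OUT = OLD_OUT + VALUE
	  stxr	w4, w1, [x2]	// store-exclusive, w4 <- COND
	  cbnz	w4, .Lretry	// retry until the store-exclusive succeeds

   The NOT case computes ~(OLD_OUT & VALUE), i.e. the atomic NAND
   operations, and MINUS of a constant is folded into PLUS of the negated
   constant, as handled in the switch above.  */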
8428 static void
8429 aarch64_print_extension (void)
8431 const struct aarch64_option_extension *opt = NULL;
8433 for (opt = all_extensions; opt->name != NULL; opt++)
8434 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8435 asm_fprintf (asm_out_file, "+%s", opt->name);
8437 asm_fprintf (asm_out_file, "\n");
8440 static void
8441 aarch64_start_file (void)
8443 if (selected_arch)
8445 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8446 aarch64_print_extension ();
8448 else if (selected_cpu)
8450 const char *truncated_name
8451 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8452 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8453 aarch64_print_extension ();
8455 default_file_start ();
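/* Illustrative note: the directives emitted above look something like

	.arch armv8-a+fp+simd+crc
   or
	.cpu cortex-a57+fp+simd+crc

   with the exact spelling depending on the selected architecture, CPU and
   extensions.  aarch64_rewrite_selected_cpu exists so that big.LITTLE
   tunings such as "cortex-a57.cortex-a53" are reduced to a single CPU
   name the assembler will accept.  */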
8458 /* Target hook for c_mode_for_suffix. */
8459 static enum machine_mode
8460 aarch64_c_mode_for_suffix (char suffix)
8462 if (suffix == 'q')
8463 return TFmode;
8465 return VOIDmode;
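/* Illustrative note: returning TFmode for 'q' lets floating-point
   constants written with a q/Q suffix (e.g. 1.0q) be given the 128-bit
   IEEE quad format, the same format AArch64 uses for long double.  */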
8468 /* We can only represent floating point constants which will fit in
8469 "quarter-precision" values. These values are characterised by
8470 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
8473 (-1)^s * (n/16) * 2^r
8475 Where:
8476 's' is the sign bit.
8477 'n' is an integer in the range 16 <= n <= 31.
8478 'r' is an integer in the range -3 <= r <= 4. */
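/* Some illustrative values: 1.0 is encodable as s = 0, n = 16, r = 0;
   0.25 as s = 0, n = 16, r = -2; and -31.0 as s = 1, n = 31, r = 4.
   The representable magnitudes therefore run from (16/16) * 2^-3 = 0.125
   up to (31/16) * 2^4 = 31.0, matching the immediate range of the AArch64
   FMOV (immediate) instruction.  */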
8480 /* Return true iff X can be represented by a quarter-precision
8481 floating point immediate operand.  Note, we cannot represent 0.0. */
8482 bool
8483 aarch64_float_const_representable_p (rtx x)
8485 /* This represents our current view of how many bits
8486 make up the mantissa. */
8487 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8488 int exponent;
8489 unsigned HOST_WIDE_INT mantissa, mask;
8490 REAL_VALUE_TYPE r, m;
8491 bool fail;
8493 if (!CONST_DOUBLE_P (x))
8494 return false;
8496 if (GET_MODE (x) == VOIDmode)
8497 return false;
8499 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8501 /* We cannot represent infinities, NaNs or +/-zero. We won't
8502 know if we have +zero until we analyse the mantissa, but we
8503 can reject the other invalid values. */
8504 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8505 || REAL_VALUE_MINUS_ZERO (r))
8506 return false;
8508 /* Extract exponent. */
8509 r = real_value_abs (&r);
8510 exponent = REAL_EXP (&r);
8512 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8513 highest (sign) bit, with a fixed binary point at bit point_pos.
8514 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
8515 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8516 bits for the mantissa, this can fail (low bits will be lost). */
8517 real_ldexp (&m, &r, point_pos - exponent);
8518 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8520 /* If the low part of the mantissa has bits set we cannot represent
8521 the value. */
8522 if (w.elt (0) != 0)
8523 return false;
8524 /* We have rejected the lower HOST_WIDE_INT, so update our
8525 understanding of how many bits lie in the mantissa and
8526 look only at the high HOST_WIDE_INT. */
8527 mantissa = w.elt (1);
8528 point_pos -= HOST_BITS_PER_WIDE_INT;
8530 /* We can only represent values with a mantissa of the form 1.xxxx. */
8531 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8532 if ((mantissa & mask) != 0)
8533 return false;
8535 /* Having filtered unrepresentable values, we may now remove all
8536 but the highest 5 bits. */
8537 mantissa >>= point_pos - 5;
8539 /* We cannot represent the value 0.0, so reject it. This is handled
8540 elsewhere. */
8541 if (mantissa == 0)
8542 return false;
8544 /* Then, as bit 4 is always set, we can mask it off, leaving
8545 the mantissa in the range [0, 15]. */
8546 mantissa &= ~(1 << 4);
8547 gcc_assert (mantissa <= 15);
8549 /* GCC internally does not use IEEE754-like encoding (where normalized
8550 significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
8551 Our mantissa values are shifted 4 places to the left relative to
8552 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8553 by 5 places to correct for GCC's representation. */
8554 exponent = 5 - exponent;
8556 return (exponent >= 0 && exponent <= 7);
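/* A minimal standalone sketch (not part of GCC; assumes <math.h> and
   exact double arithmetic, which holds for these values) that checks the
   same property by brute force over the (-1)^s * (n/16) * 2^r encoding:

     #include <math.h>

     static int
     quarter_precision_representable (double x)
     {
       double a = fabs (x);
       for (int r = -3; r <= 4; r++)
	 for (int n = 16; n <= 31; n++)
	   if (a == (n / 16.0) * ldexp (1.0, r))
	     return 1;
       return 0;   // also rejects 0.0, as the function above does
     }
*/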
8559 char*
8560 aarch64_output_simd_mov_immediate (rtx const_vector,
8561 enum machine_mode mode,
8562 unsigned width)
8564 bool is_valid;
8565 static char templ[40];
8566 const char *mnemonic;
8567 const char *shift_op;
8568 unsigned int lane_count = 0;
8569 char element_char;
8571 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8573 /* This will return true to show const_vector is legal for use as either
8574 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
8575 also update INFO to show how the immediate should be generated. */
8576 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8577 gcc_assert (is_valid);
8579 element_char = sizetochar (info.element_width);
8580 lane_count = width / info.element_width;
8582 mode = GET_MODE_INNER (mode);
8583 if (mode == SFmode || mode == DFmode)
8585 gcc_assert (info.shift == 0 && ! info.mvn);
8586 if (aarch64_float_const_zero_rtx_p (info.value))
8587 info.value = GEN_INT (0);
8588 else
8590 #define buf_size 20
8591 REAL_VALUE_TYPE r;
8592 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8593 char float_buf[buf_size] = {'\0'};
8594 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8595 #undef buf_size
8597 if (lane_count == 1)
8598 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8599 else
8600 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8601 lane_count, element_char, float_buf);
8602 return templ;
8606 mnemonic = info.mvn ? "mvni" : "movi";
8607 shift_op = info.msl ? "msl" : "lsl";
8609 if (lane_count == 1)
8610 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8611 mnemonic, UINTVAL (info.value));
8612 else if (info.shift)
8613 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8614 ", %s %d", mnemonic, lane_count, element_char,
8615 UINTVAL (info.value), shift_op, info.shift);
8616 else
8617 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8618 mnemonic, lane_count, element_char, UINTVAL (info.value));
8619 return templ;
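/* Illustrative examples of the templates built above (operand 0 depends
   on the final register allocation):
     a V4SI vector of 0xab000000 per lane  ->  "movi  v0.4s, 0xab, lsl 24"
     a V2DF vector of 1.0 per lane         ->  "fmov  v0.2d, 1.0e+0"
     a single 64-bit lane of byte masks    ->  "movi  d0, 0xff00ff00ff00ff"
   The exact hexadecimal and decimal spellings come from
   HOST_WIDE_INT_PRINT_HEX and real_to_decimal_for_mode respectively.  */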
8622 char*
8623 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8624 enum machine_mode mode)
8626 enum machine_mode vmode;
8628 gcc_assert (!VECTOR_MODE_P (mode));
8629 vmode = aarch64_simd_container_mode (mode, 64);
8630 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8631 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8634 /* Split operands into moves from op[1] + op[2] into op[0]. */
8636 void
8637 aarch64_split_combinev16qi (rtx operands[3])
8639 unsigned int dest = REGNO (operands[0]);
8640 unsigned int src1 = REGNO (operands[1]);
8641 unsigned int src2 = REGNO (operands[2]);
8642 enum machine_mode halfmode = GET_MODE (operands[1]);
8643 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8644 rtx destlo, desthi;
8646 gcc_assert (halfmode == V16QImode);
8648 if (src1 == dest && src2 == dest + halfregs)
8650 /* No-op move. Can't split to nothing; emit something. */
8651 emit_note (NOTE_INSN_DELETED);
8652 return;
8655 /* Preserve register attributes for variable tracking. */
8656 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8657 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8658 GET_MODE_SIZE (halfmode));
8660 /* Special case of reversed high/low parts. */
8661 if (reg_overlap_mentioned_p (operands[2], destlo)
8662 && reg_overlap_mentioned_p (operands[1], desthi))
8664 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8665 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8666 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8668 else if (!reg_overlap_mentioned_p (operands[2], destlo))
8670 /* Try to avoid unnecessary moves if part of the result
8671 is in the right place already. */
8672 if (src1 != dest)
8673 emit_move_insn (destlo, operands[1]);
8674 if (src2 != dest + halfregs)
8675 emit_move_insn (desthi, operands[2]);
8677 else
8679 if (src2 != dest + halfregs)
8680 emit_move_insn (desthi, operands[2]);
8681 if (src1 != dest)
8682 emit_move_insn (destlo, operands[1]);
8686 /* vec_perm support. */
8688 #define MAX_VECT_LEN 16
8690 struct expand_vec_perm_d
8692 rtx target, op0, op1;
8693 unsigned char perm[MAX_VECT_LEN];
8694 enum machine_mode vmode;
8695 unsigned char nelt;
8696 bool one_vector_p;
8697 bool testing_p;
8700 /* Generate a variable permutation. */
8702 static void
8703 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8705 enum machine_mode vmode = GET_MODE (target);
8706 bool one_vector_p = rtx_equal_p (op0, op1);
8708 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8709 gcc_checking_assert (GET_MODE (op0) == vmode);
8710 gcc_checking_assert (GET_MODE (op1) == vmode);
8711 gcc_checking_assert (GET_MODE (sel) == vmode);
8712 gcc_checking_assert (TARGET_SIMD);
8714 if (one_vector_p)
8716 if (vmode == V8QImode)
8718 /* Expand the argument to a V16QI mode by duplicating it. */
8719 rtx pair = gen_reg_rtx (V16QImode);
8720 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8721 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8723 else
8725 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8728 else
8730 rtx pair;
8732 if (vmode == V8QImode)
8734 pair = gen_reg_rtx (V16QImode);
8735 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8736 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8738 else
8740 pair = gen_reg_rtx (OImode);
8741 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8742 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8747 void
8748 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8750 enum machine_mode vmode = GET_MODE (target);
8751 unsigned int nelt = GET_MODE_NUNITS (vmode);
8752 bool one_vector_p = rtx_equal_p (op0, op1);
8753 rtx mask;
8755 /* The TBL instruction does not use a modulo index, so we must take care
8756 of that ourselves. */
8757 mask = aarch64_simd_gen_const_vector_dup (vmode,
8758 one_vector_p ? nelt - 1 : 2 * nelt - 1);
8759 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8761 /* For big-endian, we also need to reverse the index within the vector
8762 (but not which vector). */
8763 if (BYTES_BIG_ENDIAN)
8765 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
8766 if (!one_vector_p)
8767 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
8768 sel = expand_simple_binop (vmode, XOR, sel, mask,
8769 NULL, 0, OPTAB_LIB_WIDEN);
8771 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
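/* Illustrative example: for a two-vector V16QImode permute the selector
   is ANDed with 31 so that every index addresses one of the 32 input
   bytes, and with 15 in the single-vector case.  This provides the
   modulo behaviour TBL itself lacks; out-of-range TBL indices would
   produce zero bytes instead.  */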
8774 /* Recognize patterns suitable for the TRN instructions. */
8775 static bool
8776 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8778 unsigned int i, odd, mask, nelt = d->nelt;
8779 rtx out, in0, in1, x;
8780 rtx (*gen) (rtx, rtx, rtx);
8781 enum machine_mode vmode = d->vmode;
8783 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8784 return false;
8786 /* Note that these are little-endian tests.
8787 We correct for big-endian later. */
8788 if (d->perm[0] == 0)
8789 odd = 0;
8790 else if (d->perm[0] == 1)
8791 odd = 1;
8792 else
8793 return false;
8794 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8796 for (i = 0; i < nelt; i += 2)
8798 if (d->perm[i] != i + odd)
8799 return false;
8800 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
8801 return false;
8804 /* Success! */
8805 if (d->testing_p)
8806 return true;
8808 in0 = d->op0;
8809 in1 = d->op1;
8810 if (BYTES_BIG_ENDIAN)
8812 x = in0, in0 = in1, in1 = x;
8813 odd = !odd;
8815 out = d->target;
8817 if (odd)
8819 switch (vmode)
8821 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
8822 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
8823 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
8824 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
8825 case V4SImode: gen = gen_aarch64_trn2v4si; break;
8826 case V2SImode: gen = gen_aarch64_trn2v2si; break;
8827 case V2DImode: gen = gen_aarch64_trn2v2di; break;
8828 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
8829 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
8830 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
8831 default:
8832 return false;
8835 else
8837 switch (vmode)
8839 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
8840 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
8841 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
8842 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
8843 case V4SImode: gen = gen_aarch64_trn1v4si; break;
8844 case V2SImode: gen = gen_aarch64_trn1v2si; break;
8845 case V2DImode: gen = gen_aarch64_trn1v2di; break;
8846 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
8847 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
8848 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
8849 default:
8850 return false;
8854 emit_insn (gen (out, in0, in1));
8855 return true;
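/* Illustrative example: with V4SImode inputs A = {a0,a1,a2,a3} and
   B = {b0,b1,b2,b3}, the index vectors recognised here are
     {0, 4, 2, 6} -> TRN1, giving {a0, b0, a2, b2}
     {1, 5, 3, 7} -> TRN2, giving {a1, b1, a3, b3}
   (little-endian numbering; the operands are swapped for big-endian as
   above).  */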
8858 /* Recognize patterns suitable for the UZP instructions. */
8859 static bool
8860 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
8862 unsigned int i, odd, mask, nelt = d->nelt;
8863 rtx out, in0, in1, x;
8864 rtx (*gen) (rtx, rtx, rtx);
8865 enum machine_mode vmode = d->vmode;
8867 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8868 return false;
8870 /* Note that these are little-endian tests.
8871 We correct for big-endian later. */
8872 if (d->perm[0] == 0)
8873 odd = 0;
8874 else if (d->perm[0] == 1)
8875 odd = 1;
8876 else
8877 return false;
8878 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8880 for (i = 0; i < nelt; i++)
8882 unsigned elt = (i * 2 + odd) & mask;
8883 if (d->perm[i] != elt)
8884 return false;
8887 /* Success! */
8888 if (d->testing_p)
8889 return true;
8891 in0 = d->op0;
8892 in1 = d->op1;
8893 if (BYTES_BIG_ENDIAN)
8895 x = in0, in0 = in1, in1 = x;
8896 odd = !odd;
8898 out = d->target;
8900 if (odd)
8902 switch (vmode)
8904 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
8905 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
8906 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
8907 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
8908 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
8909 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
8910 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
8911 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
8912 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
8913 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
8914 default:
8915 return false;
8918 else
8920 switch (vmode)
8922 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
8923 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
8924 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
8925 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
8926 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
8927 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
8928 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
8929 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
8930 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
8931 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
8932 default:
8933 return false;
8937 emit_insn (gen (out, in0, in1));
8938 return true;
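/* Illustrative example: with V4SImode inputs A = {a0,a1,a2,a3} and
   B = {b0,b1,b2,b3}, the index vectors recognised here are
     {0, 2, 4, 6} -> UZP1, giving {a0, a2, b0, b2}
     {1, 3, 5, 7} -> UZP2, giving {a1, a3, b1, b3}
   i.e. the even- and odd-numbered elements of the concatenated inputs.  */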
8941 /* Recognize patterns suitable for the ZIP instructions. */
8942 static bool
8943 aarch64_evpc_zip (struct expand_vec_perm_d *d)
8945 unsigned int i, high, mask, nelt = d->nelt;
8946 rtx out, in0, in1, x;
8947 rtx (*gen) (rtx, rtx, rtx);
8948 enum machine_mode vmode = d->vmode;
8950 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8951 return false;
8953 /* Note that these are little-endian tests.
8954 We correct for big-endian later. */
8955 high = nelt / 2;
8956 if (d->perm[0] == high)
8957 /* Do Nothing. */
8959 else if (d->perm[0] == 0)
8960 high = 0;
8961 else
8962 return false;
8963 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8965 for (i = 0; i < nelt / 2; i++)
8967 unsigned elt = (i + high) & mask;
8968 if (d->perm[i * 2] != elt)
8969 return false;
8970 elt = (elt + nelt) & mask;
8971 if (d->perm[i * 2 + 1] != elt)
8972 return false;
8975 /* Success! */
8976 if (d->testing_p)
8977 return true;
8979 in0 = d->op0;
8980 in1 = d->op1;
8981 if (BYTES_BIG_ENDIAN)
8983 x = in0, in0 = in1, in1 = x;
8984 high = !high;
8986 out = d->target;
8988 if (high)
8990 switch (vmode)
8992 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
8993 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
8994 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
8995 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
8996 case V4SImode: gen = gen_aarch64_zip2v4si; break;
8997 case V2SImode: gen = gen_aarch64_zip2v2si; break;
8998 case V2DImode: gen = gen_aarch64_zip2v2di; break;
8999 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9000 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9001 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9002 default:
9003 return false;
9006 else
9008 switch (vmode)
9010 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9011 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9012 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9013 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9014 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9015 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9016 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9017 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9018 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9019 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9020 default:
9021 return false;
9025 emit_insn (gen (out, in0, in1));
9026 return true;
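/* Illustrative example: with V4SImode inputs A = {a0,a1,a2,a3} and
   B = {b0,b1,b2,b3}, the index vectors recognised here are
     {0, 4, 1, 5} -> ZIP1, giving {a0, b0, a1, b1}
     {2, 6, 3, 7} -> ZIP2, giving {a2, b2, a3, b3}
   i.e. an interleave of the low halves and of the high halves.  */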
9029 /* Recognize patterns for the EXT insn. */
9031 static bool
9032 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9034 unsigned int i, nelt = d->nelt;
9035 rtx (*gen) (rtx, rtx, rtx, rtx);
9036 rtx offset;
9038 unsigned int location = d->perm[0]; /* Always < nelt. */
9040 /* Check if the extracted indices are increasing by one. */
9041 for (i = 1; i < nelt; i++)
9043 unsigned int required = location + i;
9044 if (d->one_vector_p)
9046 /* We'll pass the same vector in twice, so allow indices to wrap. */
9047 required &= (nelt - 1);
9049 if (d->perm[i] != required)
9050 return false;
9053 switch (d->vmode)
9055 case V16QImode: gen = gen_aarch64_extv16qi; break;
9056 case V8QImode: gen = gen_aarch64_extv8qi; break;
9057 case V4HImode: gen = gen_aarch64_extv4hi; break;
9058 case V8HImode: gen = gen_aarch64_extv8hi; break;
9059 case V2SImode: gen = gen_aarch64_extv2si; break;
9060 case V4SImode: gen = gen_aarch64_extv4si; break;
9061 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9062 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9063 case V2DImode: gen = gen_aarch64_extv2di; break;
9064 case V2DFmode: gen = gen_aarch64_extv2df; break;
9065 default:
9066 return false;
9069 /* Success! */
9070 if (d->testing_p)
9071 return true;
9073 /* The case where (location == 0) is a no-op for both big- and little-endian,
9074 and is removed by the mid-end at optimization levels -O1 and higher. */
9076 if (BYTES_BIG_ENDIAN && (location != 0))
9078 /* After setup, we want the high elements of the first vector (stored
9079 at the LSB end of the register), and the low elements of the second
9080 vector (stored at the MSB end of the register). So swap. */
9081 rtx temp = d->op0;
9082 d->op0 = d->op1;
9083 d->op1 = temp;
9084 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9085 location = nelt - location;
9088 offset = GEN_INT (location);
9089 emit_insn (gen (d->target, d->op0, d->op1, offset));
9090 return true;
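/* Illustrative examples: for V4SImode, the index vector {1, 2, 3, 4}
   takes the last three elements of the first operand followed by the
   first element of the second, i.e. EXT with an offset of one element;
   with a single input, {3, 0, 1, 2} is a rotation implemented as EXT of
   the vector with itself at offset 3.  */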
9093 /* Recognize patterns for the REV insns. */
9095 static bool
9096 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9098 unsigned int i, j, diff, nelt = d->nelt;
9099 rtx (*gen) (rtx, rtx);
9101 if (!d->one_vector_p)
9102 return false;
9104 diff = d->perm[0];
9105 switch (diff)
9107 case 7:
9108 switch (d->vmode)
9110 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9111 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9112 default:
9113 return false;
9115 break;
9116 case 3:
9117 switch (d->vmode)
9119 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9120 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9121 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9122 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9123 default:
9124 return false;
9126 break;
9127 case 1:
9128 switch (d->vmode)
9130 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9131 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9132 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9133 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9134 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9135 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9136 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9137 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9138 default:
9139 return false;
9141 break;
9142 default:
9143 return false;
9146 for (i = 0; i < nelt ; i += diff + 1)
9147 for (j = 0; j <= diff; j += 1)
9149 /* This is guaranteed to be true, as the value of diff
9150 is 7, 3 or 1, and we should have enough elements in the
9151 queue to generate this.  Getting a vector mask with a
9152 value of diff other than these implies that
9153 something has gone wrong by the time we get here. */
9154 gcc_assert (i + j < nelt);
9155 if (d->perm[i + j] != i + diff - j)
9156 return false;
9159 /* Success! */
9160 if (d->testing_p)
9161 return true;
9163 emit_insn (gen (d->target, d->op0));
9164 return true;
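/* Illustrative examples for V8QImode: the index vector
   {3,2,1,0,7,6,5,4} (diff == 3) reverses the bytes within each 32-bit
   word and maps to REV32, {1,0,3,2,5,4,7,6} (diff == 1) maps to REV16,
   and a diff of 7 (byte reversal within each 64-bit word) is only
   handled for the QI vector modes and maps to REV64.  */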
9167 static bool
9168 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9170 rtx (*gen) (rtx, rtx, rtx);
9171 rtx out = d->target;
9172 rtx in0;
9173 enum machine_mode vmode = d->vmode;
9174 unsigned int i, elt, nelt = d->nelt;
9175 rtx lane;
9177 /* TODO: This may not be big-endian safe. */
9178 if (BYTES_BIG_ENDIAN)
9179 return false;
9181 elt = d->perm[0];
9182 for (i = 1; i < nelt; i++)
9184 if (elt != d->perm[i])
9185 return false;
9188 /* The generic preparation in aarch64_expand_vec_perm_const_1
9189 swaps the operand order and the permute indices if it finds
9190 d->perm[0] to be in the second operand. Thus, we can always
9191 use d->op0 and need not do any extra arithmetic to get the
9192 correct lane number. */
9193 in0 = d->op0;
9194 lane = GEN_INT (elt);
9196 switch (vmode)
9198 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9199 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9200 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9201 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9202 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9203 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9204 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9205 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9206 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9207 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9208 default:
9209 return false;
9212 emit_insn (gen (out, in0, lane));
9213 return true;
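/* Illustrative example: a V4SImode index vector of {2, 2, 2, 2} is
   matched here and expanded as a DUP of lane 2 of the first operand,
   i.e. something like "dup v0.4s, v1.s[2]".  */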
9216 static bool
9217 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9219 rtx rperm[MAX_VECT_LEN], sel;
9220 enum machine_mode vmode = d->vmode;
9221 unsigned int i, nelt = d->nelt;
9223 if (d->testing_p)
9224 return true;
9226 /* Generic code will try constant permutation twice: once with the
9227 original mode and again with the elements lowered to QImode.
9228 So wait and don't do the selector expansion ourselves. */
9229 if (vmode != V8QImode && vmode != V16QImode)
9230 return false;
9232 for (i = 0; i < nelt; ++i)
9234 int nunits = GET_MODE_NUNITS (vmode);
9236 /* If big-endian and two vectors we end up with a weird mixed-endian
9237 mode on NEON. Reverse the index within each word but not the word
9238 itself. */
9239 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9240 : d->perm[i]);
9242 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9243 sel = force_reg (vmode, sel);
9245 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9246 return true;
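/* Illustrative note: a byte shuffle not caught by the matchers above,
   e.g. {5, 1, 6, 0, 3, 7, 4, 2} on V8QImode, ends up here; the index
   vector is materialised as a constant and a single TBL (or a TBL over a
   two-register table for the two-vector V16QImode case) performs the
   permute.  */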
9249 static bool
9250 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9252 /* The pattern matching functions above are written to look for a small
9253 number to begin the sequence (0, 1, N/2). If we begin with an index
9254 from the second operand, we can swap the operands. */
9255 if (d->perm[0] >= d->nelt)
9257 unsigned i, nelt = d->nelt;
9258 rtx x;
9260 gcc_assert (nelt == (nelt & -nelt));
9261 for (i = 0; i < nelt; ++i)
9262 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9264 x = d->op0;
9265 d->op0 = d->op1;
9266 d->op1 = x;
9269 if (TARGET_SIMD)
9271 if (aarch64_evpc_rev (d))
9272 return true;
9273 else if (aarch64_evpc_ext (d))
9274 return true;
9275 else if (aarch64_evpc_zip (d))
9276 return true;
9277 else if (aarch64_evpc_uzp (d))
9278 return true;
9279 else if (aarch64_evpc_trn (d))
9280 return true;
9281 else if (aarch64_evpc_dup (d))
9282 return true;
9283 return aarch64_evpc_tbl (d);
9285 return false;
9288 /* Expand a vec_perm_const pattern. */
9290 bool
9291 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9293 struct expand_vec_perm_d d;
9294 int i, nelt, which;
9296 d.target = target;
9297 d.op0 = op0;
9298 d.op1 = op1;
9300 d.vmode = GET_MODE (target);
9301 gcc_assert (VECTOR_MODE_P (d.vmode));
9302 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9303 d.testing_p = false;
9305 for (i = which = 0; i < nelt; ++i)
9307 rtx e = XVECEXP (sel, 0, i);
9308 int ei = INTVAL (e) & (2 * nelt - 1);
9309 which |= (ei < nelt ? 1 : 2);
9310 d.perm[i] = ei;
9313 switch (which)
9315 default:
9316 gcc_unreachable ();
9318 case 3:
9319 d.one_vector_p = false;
9320 if (!rtx_equal_p (op0, op1))
9321 break;
9323 /* The elements of PERM do not suggest that only the first operand
9324 is used, but both operands are identical. Allow easier matching
9325 of the permutation by folding the permutation into the single
9326 input vector. */
9327 /* Fall Through. */
9328 case 2:
9329 for (i = 0; i < nelt; ++i)
9330 d.perm[i] &= nelt - 1;
9331 d.op0 = op1;
9332 d.one_vector_p = true;
9333 break;
9335 case 1:
9336 d.op1 = op0;
9337 d.one_vector_p = true;
9338 break;
9341 return aarch64_expand_vec_perm_const_1 (&d);
9344 static bool
9345 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9346 const unsigned char *sel)
9348 struct expand_vec_perm_d d;
9349 unsigned int i, nelt, which;
9350 bool ret;
9352 d.vmode = vmode;
9353 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9354 d.testing_p = true;
9355 memcpy (d.perm, sel, nelt);
9357 /* Calculate whether all elements are in one vector. */
9358 for (i = which = 0; i < nelt; ++i)
9360 unsigned char e = d.perm[i];
9361 gcc_assert (e < 2 * nelt);
9362 which |= (e < nelt ? 1 : 2);
9365 /* If all elements are from the second vector, reindex as if from the
9366 first vector. */
9367 if (which == 2)
9368 for (i = 0; i < nelt; ++i)
9369 d.perm[i] -= nelt;
9371 /* Check whether the mask can be applied to a single vector. */
9372 d.one_vector_p = (which != 3);
9374 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9375 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9376 if (!d.one_vector_p)
9377 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9379 start_sequence ();
9380 ret = aarch64_expand_vec_perm_const_1 (&d);
9381 end_sequence ();
9383 return ret;
9386 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9387 bool
9388 aarch64_cannot_change_mode_class (enum machine_mode from,
9389 enum machine_mode to,
9390 enum reg_class rclass)
9392 /* Full-reg subregs are allowed on general regs or any class if they are
9393 the same size. */
9394 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9395 || !reg_classes_intersect_p (FP_REGS, rclass))
9396 return false;
9398 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9399 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9400 2. Scalar to Scalar for integer modes or same size float modes.
9401 3. Vector to Vector modes.
9402 4. On little-endian only, Vector-Structure to Vector modes. */
9403 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9405 if (aarch64_vector_mode_supported_p (from)
9406 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9407 return false;
9409 if (GET_MODE_NUNITS (from) == 1
9410 && GET_MODE_NUNITS (to) == 1
9411 && (GET_MODE_CLASS (from) == MODE_INT
9412 || from == to))
9413 return false;
9415 if (aarch64_vector_mode_supported_p (from)
9416 && aarch64_vector_mode_supported_p (to))
9417 return false;
9419 /* Within a vector structure straddling multiple vector registers
9420 we are in a mixed-endian representation. As such, we can't
9421 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9422 switch between vectors and vector structures cheaply. */
9423 if (!BYTES_BIG_ENDIAN)
9424 if ((aarch64_vector_mode_supported_p (from)
9425 && aarch64_vect_struct_mode_p (to))
9426 || (aarch64_vector_mode_supported_p (to)
9427 && aarch64_vect_struct_mode_p (from)))
9428 return false;
9431 return true;
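/* Illustrative examples: a DImode subreg of a V2DImode value accesses one
   unit of the vector and is allowed (case 1 above), whereas on big-endian
   a change from a vector-structure mode such as OImode to V4SImode is
   rejected, while on little-endian it is permitted (case 4 above).  */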
9434 /* Implement MODES_TIEABLE_P. */
9436 bool
9437 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9439 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9440 return true;
9442 /* We specifically want to allow elements of "structure" modes to
9443 be tieable to the structure. This more general condition allows
9444 other rarer situations too. */
9445 if (TARGET_SIMD
9446 && aarch64_vector_mode_p (mode1)
9447 && aarch64_vector_mode_p (mode2))
9448 return true;
9450 return false;
9453 /* Return a new RTX holding the result of moving POINTER forward by
9454 AMOUNT bytes. */
9456 static rtx
9457 aarch64_move_pointer (rtx pointer, int amount)
9459 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9461 return adjust_automodify_address (pointer, GET_MODE (pointer),
9462 next, amount);
9465 /* Return a new RTX holding the result of moving POINTER forward by the
9466 size of the mode it points to. */
9468 static rtx
9469 aarch64_progress_pointer (rtx pointer)
9471 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9473 return aarch64_move_pointer (pointer, amount);
9476 /* Copy one MODE-sized block from SRC to DST, then progress SRC and DST by
9477 MODE bytes. */
9479 static void
9480 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9481 enum machine_mode mode)
9483 rtx reg = gen_reg_rtx (mode);
9485 /* "Cast" the pointers to the correct mode. */
9486 *src = adjust_address (*src, mode, 0);
9487 *dst = adjust_address (*dst, mode, 0);
9488 /* Emit the memcpy. */
9489 emit_move_insn (reg, *src);
9490 emit_move_insn (*dst, reg);
9491 /* Move the pointers forward. */
9492 *src = aarch64_progress_pointer (*src);
9493 *dst = aarch64_progress_pointer (*dst);
9496 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9497 we succeed, otherwise return false. */
9499 bool
9500 aarch64_expand_movmem (rtx *operands)
9502 unsigned int n;
9503 rtx dst = operands[0];
9504 rtx src = operands[1];
9505 rtx base;
9506 bool speed_p = !optimize_function_for_size_p (cfun);
9508 /* When optimizing for size, give a better estimate of the length of a
9509 memcpy call, but use the default otherwise. */
9510 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9512 /* We can't do anything smart if the amount to copy is not constant. */
9513 if (!CONST_INT_P (operands[2]))
9514 return false;
9516 n = UINTVAL (operands[2]);
9518 /* Try to keep the number of instructions low. For cases below 16 bytes we
9519 need to make at most two moves. For cases above 16 bytes it will be one
9520 move for each 16-byte chunk, then at most two additional moves.
9521 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9522 return false;
9524 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9525 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9527 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9528 src = adjust_automodify_address (src, VOIDmode, base, 0);
9530 /* Simple cases.  Copy 0-3 bytes as (if applicable) a 2-byte chunk, then a
9531 1-byte chunk. */
9532 if (n < 4)
9534 if (n >= 2)
9536 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9537 n -= 2;
9540 if (n == 1)
9541 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9543 return true;
9546 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
9547 4-byte chunk, partially overlapping with the previously copied chunk. */
9548 if (n < 8)
9550 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9551 n -= 4;
9552 if (n > 0)
9554 int move = n - 4;
9556 src = aarch64_move_pointer (src, move);
9557 dst = aarch64_move_pointer (dst, move);
9558 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9560 return true;
9563 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
9564 them, then (if applicable) an 8-byte chunk. */
9565 while (n >= 8)
9567 if (n / 16)
9569 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9570 n -= 16;
9572 else
9574 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9575 n -= 8;
9579 /* Finish the final bytes of the copy. We can always do this in one
9580 instruction. We either copy the exact amount we need, or partially
9581 overlap with the previous chunk we copied and copy 8 bytes. */
9582 if (n == 0)
9583 return true;
9584 else if (n == 1)
9585 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9586 else if (n == 2)
9587 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9588 else if (n == 4)
9589 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9590 else
9592 if (n == 3)
9594 src = aarch64_move_pointer (src, -1);
9595 dst = aarch64_move_pointer (dst, -1);
9596 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9598 else
9600 int move = n - 8;
9602 src = aarch64_move_pointer (src, move);
9603 dst = aarch64_move_pointer (dst, move);
9604 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9608 return true;
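/* Illustrative example: a constant 23-byte copy is emitted as one 16-byte
   (TImode) block move, leaving n == 7, followed by the tail handling
   above, which moves both pointers back one byte and issues an 8-byte
   (DImode) block move that overlaps the byte already copied.  */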
9611 #undef TARGET_ADDRESS_COST
9612 #define TARGET_ADDRESS_COST aarch64_address_cost
9614 /* This hook determines whether unnamed bitfields affect the alignment
9615 of the containing structure. The hook returns true if the structure
9616 should inherit the alignment requirements of an unnamed bitfield's
9617 type. */
9618 #undef TARGET_ALIGN_ANON_BITFIELD
9619 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9621 #undef TARGET_ASM_ALIGNED_DI_OP
9622 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9624 #undef TARGET_ASM_ALIGNED_HI_OP
9625 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9627 #undef TARGET_ASM_ALIGNED_SI_OP
9628 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9630 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9631 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9632 hook_bool_const_tree_hwi_hwi_const_tree_true
9634 #undef TARGET_ASM_FILE_START
9635 #define TARGET_ASM_FILE_START aarch64_start_file
9637 #undef TARGET_ASM_OUTPUT_MI_THUNK
9638 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9640 #undef TARGET_ASM_SELECT_RTX_SECTION
9641 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9643 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9644 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9646 #undef TARGET_BUILD_BUILTIN_VA_LIST
9647 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9649 #undef TARGET_CALLEE_COPIES
9650 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9652 #undef TARGET_CAN_ELIMINATE
9653 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9655 #undef TARGET_CANNOT_FORCE_CONST_MEM
9656 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9658 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9659 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9661 /* Only the least significant bit is used for initialization guard
9662 variables. */
9663 #undef TARGET_CXX_GUARD_MASK_BIT
9664 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9666 #undef TARGET_C_MODE_FOR_SUFFIX
9667 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9669 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9670 #undef TARGET_DEFAULT_TARGET_FLAGS
9671 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9672 #endif
9674 #undef TARGET_CLASS_MAX_NREGS
9675 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9677 #undef TARGET_BUILTIN_DECL
9678 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9680 #undef TARGET_EXPAND_BUILTIN
9681 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9683 #undef TARGET_EXPAND_BUILTIN_VA_START
9684 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9686 #undef TARGET_FOLD_BUILTIN
9687 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9689 #undef TARGET_FUNCTION_ARG
9690 #define TARGET_FUNCTION_ARG aarch64_function_arg
9692 #undef TARGET_FUNCTION_ARG_ADVANCE
9693 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9695 #undef TARGET_FUNCTION_ARG_BOUNDARY
9696 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9698 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9699 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9701 #undef TARGET_FUNCTION_VALUE
9702 #define TARGET_FUNCTION_VALUE aarch64_function_value
9704 #undef TARGET_FUNCTION_VALUE_REGNO_P
9705 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9707 #undef TARGET_FRAME_POINTER_REQUIRED
9708 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9710 #undef TARGET_GIMPLE_FOLD_BUILTIN
9711 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9713 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9714 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9716 #undef TARGET_INIT_BUILTINS
9717 #define TARGET_INIT_BUILTINS aarch64_init_builtins
9719 #undef TARGET_LEGITIMATE_ADDRESS_P
9720 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9722 #undef TARGET_LEGITIMATE_CONSTANT_P
9723 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9725 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9726 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9728 #undef TARGET_LRA_P
9729 #define TARGET_LRA_P aarch64_lra_p
9731 #undef TARGET_MANGLE_TYPE
9732 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9734 #undef TARGET_MEMORY_MOVE_COST
9735 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9737 #undef TARGET_MUST_PASS_IN_STACK
9738 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9740 /* This target hook should return true if accesses to volatile bitfields
9741 should use the narrowest mode possible. It should return false if these
9742 accesses should use the bitfield container type. */
9743 #undef TARGET_NARROW_VOLATILE_BITFIELD
9744 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9746 #undef TARGET_OPTION_OVERRIDE
9747 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9749 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9750 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9751 aarch64_override_options_after_change
9753 #undef TARGET_PASS_BY_REFERENCE
9754 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9756 #undef TARGET_PREFERRED_RELOAD_CLASS
9757 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9759 #undef TARGET_SECONDARY_RELOAD
9760 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9762 #undef TARGET_SHIFT_TRUNCATION_MASK
9763 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9765 #undef TARGET_SETUP_INCOMING_VARARGS
9766 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9768 #undef TARGET_STRUCT_VALUE_RTX
9769 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
9771 #undef TARGET_REGISTER_MOVE_COST
9772 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9774 #undef TARGET_RETURN_IN_MEMORY
9775 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9777 #undef TARGET_RETURN_IN_MSB
9778 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9780 #undef TARGET_RTX_COSTS
9781 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9783 #undef TARGET_SCHED_ISSUE_RATE
9784 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9786 #undef TARGET_TRAMPOLINE_INIT
9787 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9789 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9790 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9792 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9793 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9795 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9796 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9798 #undef TARGET_VECTORIZE_ADD_STMT_COST
9799 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
9801 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
9802 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
9803 aarch64_builtin_vectorization_cost
9805 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
9806 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
9808 #undef TARGET_VECTORIZE_BUILTINS
9809 #define TARGET_VECTORIZE_BUILTINS
9811 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
9812 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
9813 aarch64_builtin_vectorized_function
9815 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
9816 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
9817 aarch64_autovectorize_vector_sizes
9819 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
9820 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
9821 aarch64_atomic_assign_expand_fenv
9823 /* Section anchor support. */
9825 #undef TARGET_MIN_ANCHOR_OFFSET
9826 #define TARGET_MIN_ANCHOR_OFFSET -256
9828 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
9829 byte offset; we can do much more for larger data types, but have no way
9830 to determine the size of the access. We assume accesses are aligned. */
9831 #undef TARGET_MAX_ANCHOR_OFFSET
9832 #define TARGET_MAX_ANCHOR_OFFSET 4095
9834 #undef TARGET_VECTOR_ALIGNMENT
9835 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
9837 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
9838 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
9839 aarch64_simd_vector_alignment_reachable
9841 /* vec_perm support. */
9843 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
9844 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
9845 aarch64_vectorize_vec_perm_const_ok
9848 #undef TARGET_FIXED_CONDITION_CODE_REGS
9849 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
9851 #undef TARGET_FLAGS_REGNUM
9852 #define TARGET_FLAGS_REGNUM CC_REGNUM
9854 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
9855 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
9857 struct gcc_target targetm = TARGET_INITIALIZER;
9859 #include "gt-aarch64.h"