[AArch64] Use helper functions to handle multiple modes.
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "df.h"
35 #include "hard-reg-set.h"
36 #include "output.h"
37 #include "expr.h"
38 #include "reload.h"
39 #include "toplev.h"
40 #include "target.h"
41 #include "target-def.h"
42 #include "targhooks.h"
43 #include "ggc.h"
44 #include "function.h"
45 #include "tm_p.h"
46 #include "recog.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "pointer-set.h"
50 #include "hash-table.h"
51 #include "vec.h"
52 #include "basic-block.h"
53 #include "tree-ssa-alias.h"
54 #include "internal-fn.h"
55 #include "gimple-fold.h"
56 #include "tree-eh.h"
57 #include "gimple-expr.h"
58 #include "is-a.h"
59 #include "gimple.h"
60 #include "gimplify.h"
61 #include "optabs.h"
62 #include "dwarf2.h"
63 #include "cfgloop.h"
64 #include "tree-vectorizer.h"
65 #include "config/arm/aarch-cost-tables.h"
66 #include "dumpfile.h"
67 #include "builtins.h"
69 /* Defined for convenience. */
70 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
72 /* Classifies an address.
74 ADDRESS_REG_IMM
75 A simple base register plus immediate offset.
77 ADDRESS_REG_WB
78 A base register indexed by immediate offset with writeback.
80 ADDRESS_REG_REG
81 A base register indexed by (optionally scaled) register.
83 ADDRESS_REG_UXTW
84 A base register indexed by (optionally scaled) zero-extended register.
86 ADDRESS_REG_SXTW
87 A base register indexed by (optionally scaled) sign-extended register.
89 ADDRESS_LO_SUM
90 A LO_SUM rtx with a base register and "LO12" symbol relocation.
92 ADDRESS_SYMBOLIC:
93 A constant symbolic address, in pc-relative literal pool. */
95 enum aarch64_address_type {
96 ADDRESS_REG_IMM,
97 ADDRESS_REG_WB,
98 ADDRESS_REG_REG,
99 ADDRESS_REG_UXTW,
100 ADDRESS_REG_SXTW,
101 ADDRESS_LO_SUM,
102 ADDRESS_SYMBOLIC
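/* As an illustration, these classifications correspond roughly to the
   following assembler addressing forms:

     ADDRESS_REG_IMM   [x0, #16]
     ADDRESS_REG_WB    [x0, #16]! or [x0], #16
     ADDRESS_REG_REG   [x0, x1, lsl #3]
     ADDRESS_REG_UXTW  [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW  [x0, w1, sxtw #2]
     ADDRESS_LO_SUM    [x0, #:lo12:foo]
     ADDRESS_SYMBOLIC  ldr x0, .Lpool_entry  */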
105 struct aarch64_address_info {
106 enum aarch64_address_type type;
107 rtx base;
108 rtx offset;
109 int shift;
110 enum aarch64_symbol_type symbol_type;
113 struct simd_immediate_info
115 rtx value;
116 int shift;
117 int element_width;
118 bool mvn;
119 bool msl;
122 /* The current code model. */
123 enum aarch64_code_model aarch64_cmodel;
125 #ifdef HAVE_AS_TLS
126 #undef TARGET_HAVE_TLS
127 #define TARGET_HAVE_TLS 1
128 #endif
130 static bool aarch64_lra_p (void);
131 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
132 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
133 const_tree,
134 enum machine_mode *, int *,
135 bool *);
136 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_override_options_after_change (void);
139 static bool aarch64_vector_mode_supported_p (enum machine_mode);
140 static unsigned bit_count (unsigned HOST_WIDE_INT);
141 static bool aarch64_const_vec_all_same_int_p (rtx,
142 HOST_WIDE_INT, HOST_WIDE_INT);
144 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* The current tuning set. */
152 const struct tune_params *aarch64_tune_params;
154 /* Mask to specify which instructions we are allowed to generate. */
155 unsigned long aarch64_isa_flags = 0;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Tuning parameters. */
162 #if HAVE_DESIGNATED_INITIALIZERS
163 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
164 #else
165 #define NAMED_PARAM(NAME, VAL) (VAL)
166 #endif
168 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
169 __extension__
170 #endif
172 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
173 __extension__
174 #endif
175 static const struct cpu_addrcost_table generic_addrcost_table =
177 #if HAVE_DESIGNATED_INITIALIZERS
178 .addr_scale_costs =
179 #endif
181 NAMED_PARAM (qi, 0),
182 NAMED_PARAM (hi, 0),
183 NAMED_PARAM (si, 0),
184 NAMED_PARAM (ti, 0),
186 NAMED_PARAM (pre_modify, 0),
187 NAMED_PARAM (post_modify, 0),
188 NAMED_PARAM (register_offset, 0),
189 NAMED_PARAM (register_extend, 0),
190 NAMED_PARAM (imm_offset, 0)
193 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
194 __extension__
195 #endif
196 static const struct cpu_addrcost_table cortexa57_addrcost_table =
198 #if HAVE_DESIGNATED_INITIALIZERS
199 .addr_scale_costs =
200 #endif
202 NAMED_PARAM (qi, 0),
203 NAMED_PARAM (hi, 1),
204 NAMED_PARAM (si, 0),
205 NAMED_PARAM (ti, 1),
207 NAMED_PARAM (pre_modify, 0),
208 NAMED_PARAM (post_modify, 0),
209 NAMED_PARAM (register_offset, 0),
210 NAMED_PARAM (register_extend, 0),
211 NAMED_PARAM (imm_offset, 0),
214 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
215 __extension__
216 #endif
217 static const struct cpu_regmove_cost generic_regmove_cost =
219 NAMED_PARAM (GP2GP, 1),
220 NAMED_PARAM (GP2FP, 2),
221 NAMED_PARAM (FP2GP, 2),
222 /* We currently do not provide direct support for TFmode Q->Q move.
223 Therefore we need to raise the cost above 2 in order to have
224 reload handle the situation. */
225 NAMED_PARAM (FP2FP, 4)
228 /* Generic costs for vector insn classes. */
229 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
230 __extension__
231 #endif
232 static const struct cpu_vector_cost generic_vector_cost =
234 NAMED_PARAM (scalar_stmt_cost, 1),
235 NAMED_PARAM (scalar_load_cost, 1),
236 NAMED_PARAM (scalar_store_cost, 1),
237 NAMED_PARAM (vec_stmt_cost, 1),
238 NAMED_PARAM (vec_to_scalar_cost, 1),
239 NAMED_PARAM (scalar_to_vec_cost, 1),
240 NAMED_PARAM (vec_align_load_cost, 1),
241 NAMED_PARAM (vec_unalign_load_cost, 1),
242 NAMED_PARAM (vec_unalign_store_cost, 1),
243 NAMED_PARAM (vec_store_cost, 1),
244 NAMED_PARAM (cond_taken_branch_cost, 3),
245 NAMED_PARAM (cond_not_taken_branch_cost, 1)
248 /* Generic costs for vector insn classes. */
249 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
250 __extension__
251 #endif
252 static const struct cpu_vector_cost cortexa57_vector_cost =
254 NAMED_PARAM (scalar_stmt_cost, 1),
255 NAMED_PARAM (scalar_load_cost, 4),
256 NAMED_PARAM (scalar_store_cost, 1),
257 NAMED_PARAM (vec_stmt_cost, 3),
258 NAMED_PARAM (vec_to_scalar_cost, 8),
259 NAMED_PARAM (scalar_to_vec_cost, 8),
260 NAMED_PARAM (vec_align_load_cost, 5),
261 NAMED_PARAM (vec_unalign_load_cost, 5),
262 NAMED_PARAM (vec_unalign_store_cost, 1),
263 NAMED_PARAM (vec_store_cost, 1),
264 NAMED_PARAM (cond_taken_branch_cost, 1),
265 NAMED_PARAM (cond_not_taken_branch_cost, 1)
268 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
269 __extension__
270 #endif
271 static const struct tune_params generic_tunings =
273 &cortexa57_extra_costs,
274 &generic_addrcost_table,
275 &generic_regmove_cost,
276 &generic_vector_cost,
277 NAMED_PARAM (memmov_cost, 4),
278 NAMED_PARAM (issue_rate, 2)
281 static const struct tune_params cortexa53_tunings =
283 &cortexa53_extra_costs,
284 &generic_addrcost_table,
285 &generic_regmove_cost,
286 &generic_vector_cost,
287 NAMED_PARAM (memmov_cost, 4),
288 NAMED_PARAM (issue_rate, 2)
291 static const struct tune_params cortexa57_tunings =
293 &cortexa57_extra_costs,
294 &cortexa57_addrcost_table,
295 &generic_regmove_cost,
296 &cortexa57_vector_cost,
297 NAMED_PARAM (memmov_cost, 4),
298 NAMED_PARAM (issue_rate, 3)
301 /* A processor implementing AArch64. */
302 struct processor
304 const char *const name;
305 enum aarch64_processor core;
306 const char *arch;
307 const unsigned long flags;
308 const struct tune_params *const tune;
311 /* Processor cores implementing AArch64. */
312 static const struct processor all_cores[] =
314 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
315 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
316 #include "aarch64-cores.def"
317 #undef AARCH64_CORE
318 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
319 {NULL, aarch64_none, NULL, 0, NULL}
322 /* Architectures implementing AArch64. */
323 static const struct processor all_architectures[] =
325 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
326 {NAME, CORE, #ARCH, FLAGS, NULL},
327 #include "aarch64-arches.def"
328 #undef AARCH64_ARCH
329 {NULL, aarch64_none, NULL, 0, NULL}
332 /* Target specification.  These are populated as command-line arguments
333 are processed, or NULL if not specified. */
334 static const struct processor *selected_arch;
335 static const struct processor *selected_cpu;
336 static const struct processor *selected_tune;
338 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
340 /* An ISA extension in the co-processor and main instruction set space. */
341 struct aarch64_option_extension
343 const char *const name;
344 const unsigned long flags_on;
345 const unsigned long flags_off;
348 /* ISA extensions in AArch64. */
349 static const struct aarch64_option_extension all_extensions[] =
351 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
352 {NAME, FLAGS_ON, FLAGS_OFF},
353 #include "aarch64-option-extensions.def"
354 #undef AARCH64_OPT_EXTENSION
355 {NULL, 0, 0}
358 /* Used to track the size of an address when generating a pre/post
359 increment address. */
360 static enum machine_mode aarch64_memory_reference_mode;
362 /* Used to force GTY into this file. */
363 static GTY(()) int gty_dummy;
365 /* A table of valid AArch64 "bitmask immediate" values for
366 logical instructions. */
368 #define AARCH64_NUM_BITMASKS 5334
369 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
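/* Each entry is a value encodable in the immediate field of the logical
   instructions AND, ORR and EOR: a contiguous run of set bits, rotated and
   replicated across the register, e.g. 0x00ff00ff00ff00ff or
   0x3ffc000000000000.  There are 5334 distinct such 64-bit values, hence
   AARCH64_NUM_BITMASKS above.  */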
371 typedef enum aarch64_cond_code
373 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
374 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
375 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
377 aarch64_cc;
379 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
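/* The enumeration above is ordered so that each even/odd pair are logical
   inverses (EQ/NE, CS/CC, MI/PL, ...); flipping the low bit therefore yields
   the inverse condition, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */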
381 /* The condition codes of the processor, and the inverse function. */
382 static const char * const aarch64_condition_codes[] =
384 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
385 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
388 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
389 unsigned
390 aarch64_dbx_register_number (unsigned regno)
392 if (GP_REGNUM_P (regno))
393 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
394 else if (regno == SP_REGNUM)
395 return AARCH64_DWARF_SP;
396 else if (FP_REGNUM_P (regno))
397 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
399 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
400 equivalent DWARF register. */
401 return DWARF_FRAME_REGISTERS;
404 /* Return TRUE if MODE is any of the large integer vector structure modes (OImode, CImode or XImode). */
405 static bool
406 aarch64_vect_struct_mode_p (enum machine_mode mode)
408 return mode == OImode || mode == CImode || mode == XImode;
411 /* Return TRUE if MODE is any of the vector modes. */
412 static bool
413 aarch64_vector_mode_p (enum machine_mode mode)
415 return aarch64_vector_mode_supported_p (mode)
416 || aarch64_vect_struct_mode_p (mode);
419 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
420 static bool
421 aarch64_array_mode_supported_p (enum machine_mode mode,
422 unsigned HOST_WIDE_INT nelems)
424 if (TARGET_SIMD
425 && AARCH64_VALID_SIMD_QREG_MODE (mode)
426 && (nelems >= 2 && nelems <= 4))
427 return true;
429 return false;
432 /* Implement HARD_REGNO_NREGS. */
435 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
437 switch (aarch64_regno_regclass (regno))
439 case FP_REGS:
440 case FP_LO_REGS:
441 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
442 default:
443 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
445 gcc_unreachable ();
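/* For example, a 32-byte OImode value occupies 2 FP/SIMD registers
   (UNITS_PER_VREG is 16 bytes) but would need 4 general registers
   (UNITS_PER_WORD is 8 bytes).  */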
448 /* Implement HARD_REGNO_MODE_OK. */
451 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
453 if (GET_MODE_CLASS (mode) == MODE_CC)
454 return regno == CC_REGNUM;
456 if (regno == SP_REGNUM)
457 /* The purpose of comparing with ptr_mode is to support the
458 global register variable associated with the stack pointer
459 register via the syntax of asm ("wsp") in ILP32. */
460 return mode == Pmode || mode == ptr_mode;
462 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
463 return mode == Pmode;
465 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
466 return 1;
468 if (FP_REGNUM_P (regno))
470 if (aarch64_vect_struct_mode_p (mode))
471 return
472 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
473 else
474 return 1;
477 return 0;
480 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
481 enum machine_mode
482 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
483 enum machine_mode mode)
485 /* Handle modes that fit within single registers. */
486 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
488 if (GET_MODE_SIZE (mode) >= 4)
489 return mode;
490 else
491 return SImode;
493 /* Fall back to generic for multi-reg and very large modes. */
494 else
495 return choose_hard_reg_mode (regno, nregs, false);
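/* For instance, a one-register SFmode or SImode value keeps its own mode,
   while QImode and HImode values are widened to SImode so that a full
   32-bit register is saved and restored around the call.  */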
498 /* Return true if calls to DECL should be treated as
499 long-calls (i.e. called via a register). */
500 static bool
501 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
503 return false;
506 /* Return true if calls to symbol-ref SYM should be treated as
507 long-calls (i.e. called via a register). */
508 bool
509 aarch64_is_long_call_p (rtx sym)
511 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
514 /* Return true if the offsets to a zero/sign-extract operation
515 represent an expression that matches an extend operation. The
516 operands represent the parameters from
518 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
519 bool
520 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
521 rtx extract_imm)
523 HOST_WIDE_INT mult_val, extract_val;
525 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
526 return false;
528 mult_val = INTVAL (mult_imm);
529 extract_val = INTVAL (extract_imm);
531 if (extract_val > 8
532 && extract_val < GET_MODE_BITSIZE (mode)
533 && exact_log2 (extract_val & ~7) > 0
534 && (extract_val & 7) <= 4
535 && mult_val == (1 << (extract_val & 7)))
536 return true;
538 return false;
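/* For example, in DImode the pair MULT_IMM == 4, EXTRACT_IMM == 34 is
   accepted: 34 & ~7 == 32 selects a 32-bit extend, 34 & 7 == 2 is the shift
   amount, and 4 == 1 << 2, i.e. the extract describes a 32-bit value
   extended and then shifted left by two.  */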
541 /* Emit an insn that's a simple single-set. Both the operands must be
542 known to be valid. */
543 inline static rtx
544 emit_set_insn (rtx x, rtx y)
546 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
549 /* X and Y are two things to compare using CODE. Emit the compare insn and
550 return the rtx for register 0 in the proper mode. */
552 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
554 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
555 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
557 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
558 return cc_reg;
561 /* Build the SYMBOL_REF for __tls_get_addr. */
563 static GTY(()) rtx tls_get_addr_libfunc;
566 aarch64_tls_get_addr (void)
568 if (!tls_get_addr_libfunc)
569 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
570 return tls_get_addr_libfunc;
573 /* Return the TLS model to use for ADDR. */
575 static enum tls_model
576 tls_symbolic_operand_type (rtx addr)
578 enum tls_model tls_kind = TLS_MODEL_NONE;
579 rtx sym, addend;
581 if (GET_CODE (addr) == CONST)
583 split_const (addr, &sym, &addend);
584 if (GET_CODE (sym) == SYMBOL_REF)
585 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
587 else if (GET_CODE (addr) == SYMBOL_REF)
588 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
590 return tls_kind;
593 /* We allow LO_SUMs in our legitimate addresses so that combine can
594 take care of combining addresses where necessary, but for generation
595 purposes we generate the address as:
597 RTL Absolute
598 tmp = hi (symbol_ref); adrp x1, foo
599 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
602 PIC TLS
603 adrp x1, :got:foo adrp tmp, :tlsgd:foo
604 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
605 bl __tls_get_addr
608 Load TLS symbol, depending on TLS mechanism and TLS access model.
610 Global Dynamic - Traditional TLS:
611 adrp tmp, :tlsgd:imm
612 add dest, tmp, #:tlsgd_lo12:imm
613 bl __tls_get_addr
615 Global Dynamic - TLS Descriptors:
616 adrp dest, :tlsdesc:imm
617 ldr tmp, [dest, #:tlsdesc_lo12:imm]
618 add dest, dest, #:tlsdesc_lo12:imm
619 blr tmp
620 mrs tp, tpidr_el0
621 add dest, dest, tp
623 Initial Exec:
624 mrs tp, tpidr_el0
625 adrp tmp, :gottprel:imm
626 ldr dest, [tmp, #:gottprel_lo12:imm]
627 add dest, dest, tp
629 Local Exec:
630 mrs tp, tpidr_el0
631 add t0, tp, #:tprel_hi12:imm
632 add t0, #:tprel_lo12_nc:imm
635 static void
636 aarch64_load_symref_appropriately (rtx dest, rtx imm,
637 enum aarch64_symbol_type type)
639 switch (type)
641 case SYMBOL_SMALL_ABSOLUTE:
643 /* In ILP32, the mode of dest can be either SImode or DImode. */
644 rtx tmp_reg = dest;
645 enum machine_mode mode = GET_MODE (dest);
647 gcc_assert (mode == Pmode || mode == ptr_mode);
649 if (can_create_pseudo_p ())
650 tmp_reg = gen_reg_rtx (mode);
652 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
653 emit_insn (gen_add_losym (dest, tmp_reg, imm));
654 return;
657 case SYMBOL_TINY_ABSOLUTE:
658 emit_insn (gen_rtx_SET (Pmode, dest, imm));
659 return;
661 case SYMBOL_SMALL_GOT:
663 /* In ILP32, the mode of dest can be either SImode or DImode,
664 while the got entry is always of SImode size. The mode of
665 dest depends on how dest is used: if dest is assigned to a
666 pointer (e.g. in the memory), it has SImode; it may have
667 DImode if dest is dereferenced to access the memory.
668 This is why we have to handle three different ldr_got_small
669 patterns here (two patterns for ILP32). */
670 rtx tmp_reg = dest;
671 enum machine_mode mode = GET_MODE (dest);
673 if (can_create_pseudo_p ())
674 tmp_reg = gen_reg_rtx (mode);
676 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
677 if (mode == ptr_mode)
679 if (mode == DImode)
680 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
681 else
682 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
684 else
686 gcc_assert (mode == Pmode);
687 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
690 return;
693 case SYMBOL_SMALL_TLSGD:
695 rtx insns;
696 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
698 start_sequence ();
699 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
700 insns = get_insns ();
701 end_sequence ();
703 RTL_CONST_CALL_P (insns) = 1;
704 emit_libcall_block (insns, dest, result, imm);
705 return;
708 case SYMBOL_SMALL_TLSDESC:
710 enum machine_mode mode = GET_MODE (dest);
711 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
712 rtx tp;
714 gcc_assert (mode == Pmode || mode == ptr_mode);
716 /* In ILP32, the got entry is always of SImode size. Unlike
717 small GOT, the dest is fixed at reg 0. */
718 if (TARGET_ILP32)
719 emit_insn (gen_tlsdesc_small_si (imm));
720 else
721 emit_insn (gen_tlsdesc_small_di (imm));
722 tp = aarch64_load_tp (NULL);
724 if (mode != Pmode)
725 tp = gen_lowpart (mode, tp);
727 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
728 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
729 return;
732 case SYMBOL_SMALL_GOTTPREL:
734 /* In ILP32, the mode of dest can be either SImode or DImode,
735 while the got entry is always of SImode size. The mode of
736 dest depends on how dest is used: if dest is assigned to a
737 pointer (e.g. in the memory), it has SImode; it may have
738 DImode if dest is dereferenced to access the memory.
739 This is why we have to handle three different tlsie_small
740 patterns here (two patterns for ILP32). */
741 enum machine_mode mode = GET_MODE (dest);
742 rtx tmp_reg = gen_reg_rtx (mode);
743 rtx tp = aarch64_load_tp (NULL);
745 if (mode == ptr_mode)
747 if (mode == DImode)
748 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
749 else
751 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
752 tp = gen_lowpart (mode, tp);
755 else
757 gcc_assert (mode == Pmode);
758 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
761 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
762 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
763 return;
766 case SYMBOL_SMALL_TPREL:
768 rtx tp = aarch64_load_tp (NULL);
769 emit_insn (gen_tlsle_small (dest, tp, imm));
770 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
771 return;
774 case SYMBOL_TINY_GOT:
775 emit_insn (gen_ldr_got_tiny (dest, imm));
776 return;
778 default:
779 gcc_unreachable ();
783 /* Emit a move from SRC to DEST. Assume that the move expanders can
784 handle all moves if !can_create_pseudo_p (). The distinction is
785 important because, unlike emit_move_insn, the move expanders know
786 how to force Pmode objects into the constant pool even when the
787 constant pool address is not itself legitimate. */
788 static rtx
789 aarch64_emit_move (rtx dest, rtx src)
791 return (can_create_pseudo_p ()
792 ? emit_move_insn (dest, src)
793 : emit_move_insn_1 (dest, src));
796 /* Split a 128-bit move operation into two 64-bit move operations,
797 taking care to handle partial overlap of register to register
798 copies. Special cases are needed when moving between GP regs and
799 FP regs. SRC can be a register, constant or memory; DST a register
800 or memory. If either operand is memory it must not have any side
801 effects. */
802 void
803 aarch64_split_128bit_move (rtx dst, rtx src)
805 rtx dst_lo, dst_hi;
806 rtx src_lo, src_hi;
808 enum machine_mode mode = GET_MODE (dst);
810 gcc_assert (mode == TImode || mode == TFmode);
811 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
812 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
814 if (REG_P (dst) && REG_P (src))
816 int src_regno = REGNO (src);
817 int dst_regno = REGNO (dst);
819 /* Handle FP <-> GP regs. */
820 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
822 src_lo = gen_lowpart (word_mode, src);
823 src_hi = gen_highpart (word_mode, src);
825 if (mode == TImode)
827 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
828 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
830 else
832 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
833 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
835 return;
837 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
839 dst_lo = gen_lowpart (word_mode, dst);
840 dst_hi = gen_highpart (word_mode, dst);
842 if (mode == TImode)
844 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
845 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
847 else
849 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
850 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
852 return;
856 dst_lo = gen_lowpart (word_mode, dst);
857 dst_hi = gen_highpart (word_mode, dst);
858 src_lo = gen_lowpart (word_mode, src);
859 src_hi = gen_highpart_mode (word_mode, mode, src);
861 /* At most one pairing may overlap. */
862 if (reg_overlap_mentioned_p (dst_lo, src_hi))
864 aarch64_emit_move (dst_hi, src_hi);
865 aarch64_emit_move (dst_lo, src_lo);
867 else
869 aarch64_emit_move (dst_lo, src_lo);
870 aarch64_emit_move (dst_hi, src_hi);
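/* For instance, when moving a TImode value from {x0, x1} to {x1, x2},
   DST_LO (x1) overlaps SRC_HI (x1), so the high halves are copied first
   (x2 <- x1) and the low halves second (x1 <- x0); copying low-first would
   clobber the source high half.  */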
874 bool
875 aarch64_split_128bit_move_p (rtx dst, rtx src)
877 return (! REG_P (src)
878 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
881 /* Split a complex SIMD combine. */
883 void
884 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
886 enum machine_mode src_mode = GET_MODE (src1);
887 enum machine_mode dst_mode = GET_MODE (dst);
889 gcc_assert (VECTOR_MODE_P (dst_mode));
891 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
893 rtx (*gen) (rtx, rtx, rtx);
895 switch (src_mode)
897 case V8QImode:
898 gen = gen_aarch64_simd_combinev8qi;
899 break;
900 case V4HImode:
901 gen = gen_aarch64_simd_combinev4hi;
902 break;
903 case V2SImode:
904 gen = gen_aarch64_simd_combinev2si;
905 break;
906 case V2SFmode:
907 gen = gen_aarch64_simd_combinev2sf;
908 break;
909 case DImode:
910 gen = gen_aarch64_simd_combinedi;
911 break;
912 case DFmode:
913 gen = gen_aarch64_simd_combinedf;
914 break;
915 default:
916 gcc_unreachable ();
919 emit_insn (gen (dst, src1, src2));
920 return;
924 /* Split a complex SIMD move. */
926 void
927 aarch64_split_simd_move (rtx dst, rtx src)
929 enum machine_mode src_mode = GET_MODE (src);
930 enum machine_mode dst_mode = GET_MODE (dst);
932 gcc_assert (VECTOR_MODE_P (dst_mode));
934 if (REG_P (dst) && REG_P (src))
936 rtx (*gen) (rtx, rtx);
938 gcc_assert (VECTOR_MODE_P (src_mode));
940 switch (src_mode)
942 case V16QImode:
943 gen = gen_aarch64_split_simd_movv16qi;
944 break;
945 case V8HImode:
946 gen = gen_aarch64_split_simd_movv8hi;
947 break;
948 case V4SImode:
949 gen = gen_aarch64_split_simd_movv4si;
950 break;
951 case V2DImode:
952 gen = gen_aarch64_split_simd_movv2di;
953 break;
954 case V4SFmode:
955 gen = gen_aarch64_split_simd_movv4sf;
956 break;
957 case V2DFmode:
958 gen = gen_aarch64_split_simd_movv2df;
959 break;
960 default:
961 gcc_unreachable ();
964 emit_insn (gen (dst, src));
965 return;
969 static rtx
970 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
972 if (can_create_pseudo_p ())
973 return force_reg (mode, value);
974 else
976 x = aarch64_emit_move (x, value);
977 return x;
982 static rtx
983 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
985 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
987 rtx high;
988 /* Load the full offset into a register. This
989 might be improvable in the future. */
990 high = GEN_INT (offset);
991 offset = 0;
992 high = aarch64_force_temporary (mode, temp, high);
993 reg = aarch64_force_temporary (mode, temp,
994 gen_rtx_PLUS (mode, high, reg));
996 return plus_constant (mode, reg, offset);
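/* For example, an offset of 0x123456 does not fit an ADD immediate
   (a 12-bit value, optionally shifted left by 12), so it is first loaded
   into TEMP and added with a register-register ADD, whereas 0x1000 is
   encodable and is simply folded into the returned PLUS.  */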
999 void
1000 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1002 enum machine_mode mode = GET_MODE (dest);
1003 unsigned HOST_WIDE_INT mask;
1004 int i;
1005 bool first;
1006 unsigned HOST_WIDE_INT val;
1007 bool subtargets;
1008 rtx subtarget;
1009 int one_match, zero_match;
1011 gcc_assert (mode == SImode || mode == DImode);
1013 /* Check on what type of symbol it is. */
1014 if (GET_CODE (imm) == SYMBOL_REF
1015 || GET_CODE (imm) == LABEL_REF
1016 || GET_CODE (imm) == CONST)
1018 rtx mem, base, offset;
1019 enum aarch64_symbol_type sty;
1021 /* If we have (const (plus symbol offset)), separate out the offset
1022 before we start classifying the symbol. */
1023 split_const (imm, &base, &offset);
1025 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1026 switch (sty)
1028 case SYMBOL_FORCE_TO_MEM:
1029 if (offset != const0_rtx
1030 && targetm.cannot_force_const_mem (mode, imm))
1032 gcc_assert (can_create_pseudo_p ());
1033 base = aarch64_force_temporary (mode, dest, base);
1034 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1035 aarch64_emit_move (dest, base);
1036 return;
1038 mem = force_const_mem (ptr_mode, imm);
1039 gcc_assert (mem);
1040 if (mode != ptr_mode)
1041 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1042 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1043 return;
1045 case SYMBOL_SMALL_TLSGD:
1046 case SYMBOL_SMALL_TLSDESC:
1047 case SYMBOL_SMALL_GOTTPREL:
1048 case SYMBOL_SMALL_GOT:
1049 case SYMBOL_TINY_GOT:
1050 if (offset != const0_rtx)
1052 gcc_assert(can_create_pseudo_p ());
1053 base = aarch64_force_temporary (mode, dest, base);
1054 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1055 aarch64_emit_move (dest, base);
1056 return;
1058 /* FALLTHRU */
1060 case SYMBOL_SMALL_TPREL:
1061 case SYMBOL_SMALL_ABSOLUTE:
1062 case SYMBOL_TINY_ABSOLUTE:
1063 aarch64_load_symref_appropriately (dest, imm, sty);
1064 return;
1066 default:
1067 gcc_unreachable ();
1071 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1073 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1074 return;
1077 if (!CONST_INT_P (imm))
1079 if (GET_CODE (imm) == HIGH)
1080 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1081 else
1083 rtx mem = force_const_mem (mode, imm);
1084 gcc_assert (mem);
1085 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1088 return;
1091 if (mode == SImode)
1093 /* We know we can't do this in 1 insn, and we must be able to do it
1094 in two; so don't mess around looking for sequences that don't buy
1095 us anything. */
1096 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1097 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1098 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1099 return;
1102 /* Remaining cases are all for DImode. */
1104 val = INTVAL (imm);
1105 subtargets = optimize && can_create_pseudo_p ();
1107 one_match = 0;
1108 zero_match = 0;
1109 mask = 0xffff;
1111 for (i = 0; i < 64; i += 16, mask <<= 16)
1113 if ((val & mask) == 0)
1114 zero_match++;
1115 else if ((val & mask) == mask)
1116 one_match++;
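/* For example, 0xffffffff12345678 has halfwords 0x5678, 0x1234, 0xffff and
   0xffff, giving one_match == 2 and zero_match == 0; the code below then
   materializes it as a MOVN-class move of 0xffffffff1234ffff followed by a
   single MOVK of 0x5678.  */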
1119 if (one_match == 2)
1121 mask = 0xffff;
1122 for (i = 0; i < 64; i += 16, mask <<= 16)
1124 if ((val & mask) != mask)
1126 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1127 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1128 GEN_INT ((val >> i) & 0xffff)));
1129 return;
1132 gcc_unreachable ();
1135 if (zero_match == 2)
1136 goto simple_sequence;
1138 mask = 0x0ffff0000UL;
1139 for (i = 16; i < 64; i += 16, mask <<= 16)
1141 HOST_WIDE_INT comp = mask & ~(mask - 1);
1143 if (aarch64_uimm12_shift (val - (val & mask)))
1145 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1147 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1148 emit_insn (gen_adddi3 (dest, subtarget,
1149 GEN_INT (val - (val & mask))));
1150 return;
1152 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1154 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1156 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1157 GEN_INT ((val + comp) & mask)));
1158 emit_insn (gen_adddi3 (dest, subtarget,
1159 GEN_INT (val - ((val + comp) & mask))));
1160 return;
1162 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1164 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1166 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1167 GEN_INT ((val - comp) | ~mask)));
1168 emit_insn (gen_adddi3 (dest, subtarget,
1169 GEN_INT (val - ((val - comp) | ~mask))));
1170 return;
1172 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1174 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1176 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1177 GEN_INT (val | ~mask)));
1178 emit_insn (gen_adddi3 (dest, subtarget,
1179 GEN_INT (val - (val | ~mask))));
1180 return;
1184 /* See if we can do it by arithmetically combining two
1185 immediates. */
1186 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1188 int j;
1189 mask = 0xffff;
1191 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1192 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1194 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1195 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1196 GEN_INT (aarch64_bitmasks[i])));
1197 emit_insn (gen_adddi3 (dest, subtarget,
1198 GEN_INT (val - aarch64_bitmasks[i])));
1199 return;
1202 for (j = 0; j < 64; j += 16, mask <<= 16)
1204 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1206 emit_insn (gen_rtx_SET (VOIDmode, dest,
1207 GEN_INT (aarch64_bitmasks[i])));
1208 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1209 GEN_INT ((val >> j) & 0xffff)));
1210 return;
1215 /* See if we can do it by logically combining two immediates. */
1216 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1218 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1220 int j;
1222 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1223 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1225 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1226 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1227 GEN_INT (aarch64_bitmasks[i])));
1228 emit_insn (gen_iordi3 (dest, subtarget,
1229 GEN_INT (aarch64_bitmasks[j])));
1230 return;
1233 else if ((val & aarch64_bitmasks[i]) == val)
1235 int j;
1237 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1238 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1241 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1242 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1243 GEN_INT (aarch64_bitmasks[j])));
1244 emit_insn (gen_anddi3 (dest, subtarget,
1245 GEN_INT (aarch64_bitmasks[i])));
1246 return;
1251 simple_sequence:
1252 first = true;
1253 mask = 0xffff;
1254 for (i = 0; i < 64; i += 16, mask <<= 16)
1256 if ((val & mask) != 0)
1258 if (first)
1260 emit_insn (gen_rtx_SET (VOIDmode, dest,
1261 GEN_INT (val & mask)));
1262 first = false;
1264 else
1265 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1266 GEN_INT ((val >> i) & 0xffff)));
1271 static bool
1272 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1273 tree exp ATTRIBUTE_UNUSED)
1275 /* Currently, always true. */
1276 return true;
1279 /* Implement TARGET_PASS_BY_REFERENCE. */
1281 static bool
1282 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1283 enum machine_mode mode,
1284 const_tree type,
1285 bool named ATTRIBUTE_UNUSED)
1287 HOST_WIDE_INT size;
1288 enum machine_mode dummymode;
1289 int nregs;
1291 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1292 size = (mode == BLKmode && type)
1293 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1295 /* Aggregates are passed by reference based on their size. */
1296 if (type && AGGREGATE_TYPE_P (type))
1298 size = int_size_in_bytes (type);
1301 /* Variable-sized arguments are always passed by reference. */
1302 if (size < 0)
1303 return true;
1305 /* Can this be a candidate to be passed in fp/simd register(s)? */
1306 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1307 &dummymode, &nregs,
1308 NULL))
1309 return false;
1311 /* Arguments which are variable sized or larger than 2 registers are
1312 passed by reference unless they are a homogeneous floating-point
1313 aggregate. */
1314 return size > 2 * UNITS_PER_WORD;
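/* For illustration, under these rules

     struct hfa { double a, b, c; };   -- 24 bytes, an HFA: passed by value
                                          in three FP registers
     struct big { long a, b, c; };     -- 24 bytes, not an HFA: copied by the
                                          caller and passed by reference

   the HFA case is what the aarch64_vfp_is_call_or_return_candidate check
   above accepts.  */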
1317 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1318 static bool
1319 aarch64_return_in_msb (const_tree valtype)
1321 enum machine_mode dummy_mode;
1322 int dummy_int;
1324 /* Never happens in little-endian mode. */
1325 if (!BYTES_BIG_ENDIAN)
1326 return false;
1328 /* Only composite types smaller than or equal to 16 bytes can
1329 be potentially returned in registers. */
1330 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1331 || int_size_in_bytes (valtype) <= 0
1332 || int_size_in_bytes (valtype) > 16)
1333 return false;
1335 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1336 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1337 is always passed/returned in the least significant bits of fp/simd
1338 register(s). */
1339 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1340 &dummy_mode, &dummy_int, NULL))
1341 return false;
1343 return true;
1346 /* Implement TARGET_FUNCTION_VALUE.
1347 Define how to find the value returned by a function. */
1349 static rtx
1350 aarch64_function_value (const_tree type, const_tree func,
1351 bool outgoing ATTRIBUTE_UNUSED)
1353 enum machine_mode mode;
1354 int unsignedp;
1355 int count;
1356 enum machine_mode ag_mode;
1358 mode = TYPE_MODE (type);
1359 if (INTEGRAL_TYPE_P (type))
1360 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1362 if (aarch64_return_in_msb (type))
1364 HOST_WIDE_INT size = int_size_in_bytes (type);
1366 if (size % UNITS_PER_WORD != 0)
1368 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1369 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1373 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1374 &ag_mode, &count, NULL))
1376 if (!aarch64_composite_type_p (type, mode))
1378 gcc_assert (count == 1 && mode == ag_mode);
1379 return gen_rtx_REG (mode, V0_REGNUM);
1381 else
1383 int i;
1384 rtx par;
1386 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1387 for (i = 0; i < count; i++)
1389 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1390 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1391 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1392 XVECEXP (par, 0, i) = tmp;
1394 return par;
1397 else
1398 return gen_rtx_REG (mode, R0_REGNUM);
1401 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1402 Return true if REGNO is the number of a hard register in which the values
1403 of called function may come back. */
1405 static bool
1406 aarch64_function_value_regno_p (const unsigned int regno)
1408 /* Maximum of 16 bytes can be returned in the general registers. Examples
1409 of 16-byte return values are: 128-bit integers and 16-byte small
1410 structures (excluding homogeneous floating-point aggregates). */
1411 if (regno == R0_REGNUM || regno == R1_REGNUM)
1412 return true;
1414 /* Up to four fp/simd registers can return a function value, e.g. a
1415 homogeneous floating-point aggregate having four members. */
1416 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1417 return !TARGET_GENERAL_REGS_ONLY;
1419 return false;
1422 /* Implement TARGET_RETURN_IN_MEMORY.
1424 If the type T of the result of a function is such that
1425 void func (T arg)
1426 would require that arg be passed as a value in a register (or set of
1427 registers) according to the parameter passing rules, then the result
1428 is returned in the same registers as would be used for such an
1429 argument. */
1431 static bool
1432 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1434 HOST_WIDE_INT size;
1435 enum machine_mode ag_mode;
1436 int count;
1438 if (!AGGREGATE_TYPE_P (type)
1439 && TREE_CODE (type) != COMPLEX_TYPE
1440 && TREE_CODE (type) != VECTOR_TYPE)
1441 /* Simple scalar types are always returned in registers. */
1442 return false;
1444 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1445 type,
1446 &ag_mode,
1447 &count,
1448 NULL))
1449 return false;
1451 /* Types larger than 2 registers are returned in memory. */
1452 size = int_size_in_bytes (type);
1453 return (size < 0 || size > 2 * UNITS_PER_WORD);
1456 static bool
1457 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1458 const_tree type, int *nregs)
1460 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1461 return aarch64_vfp_is_call_or_return_candidate (mode,
1462 type,
1463 &pcum->aapcs_vfp_rmode,
1464 nregs,
1465 NULL);
1468 /* Given MODE and TYPE of a function argument, return the alignment in
1469 bits. The idea is to suppress any stronger alignment requested by
1470 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1471 This is a helper function for local use only. */
1473 static unsigned int
1474 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1476 unsigned int alignment;
1478 if (type)
1480 if (!integer_zerop (TYPE_SIZE (type)))
1482 if (TYPE_MODE (type) == mode)
1483 alignment = TYPE_ALIGN (type);
1484 else
1485 alignment = GET_MODE_ALIGNMENT (mode);
1487 else
1488 alignment = 0;
1490 else
1491 alignment = GET_MODE_ALIGNMENT (mode);
1493 return alignment;
1496 /* Layout a function argument according to the AAPCS64 rules. The rule
1497 numbers refer to the rule numbers in the AAPCS64. */
1499 static void
1500 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1501 const_tree type,
1502 bool named ATTRIBUTE_UNUSED)
1504 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1505 int ncrn, nvrn, nregs;
1506 bool allocate_ncrn, allocate_nvrn;
1507 HOST_WIDE_INT size;
1509 /* We need to do this once per argument. */
1510 if (pcum->aapcs_arg_processed)
1511 return;
1513 pcum->aapcs_arg_processed = true;
1515 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1516 size
1517 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1518 UNITS_PER_WORD);
1520 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1521 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1522 mode,
1523 type,
1524 &nregs);
1526 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1527 The following code thus handles passing by SIMD/FP registers first. */
1529 nvrn = pcum->aapcs_nvrn;
1531 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
1532 and homogeneous short-vector aggregates (HVA). */
1533 if (allocate_nvrn)
1535 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1537 pcum->aapcs_nextnvrn = nvrn + nregs;
1538 if (!aarch64_composite_type_p (type, mode))
1540 gcc_assert (nregs == 1);
1541 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1543 else
1545 rtx par;
1546 int i;
1547 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1548 for (i = 0; i < nregs; i++)
1550 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1551 V0_REGNUM + nvrn + i);
1552 tmp = gen_rtx_EXPR_LIST
1553 (VOIDmode, tmp,
1554 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1555 XVECEXP (par, 0, i) = tmp;
1557 pcum->aapcs_reg = par;
1559 return;
1561 else
1563 /* C.3 NSRN is set to 8. */
1564 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1565 goto on_stack;
1569 ncrn = pcum->aapcs_ncrn;
1570 nregs = size / UNITS_PER_WORD;
1572 /* C.6 - C.9, though the sign and zero extension semantics are
1573 handled elsewhere. This is the case where the argument fits
1574 entirely in general registers. */
1575 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1577 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1579 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1581 /* C.8 if the argument has an alignment of 16 then the NGRN is
1582 rounded up to the next even number. */
1583 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1585 ++ncrn;
1586 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1588 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1589 A reg is still generated for it, but the caller should be smart
1590 enough not to use it. */
1591 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1593 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1595 else
1597 rtx par;
1598 int i;
1600 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1601 for (i = 0; i < nregs; i++)
1603 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1604 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1605 GEN_INT (i * UNITS_PER_WORD));
1606 XVECEXP (par, 0, i) = tmp;
1608 pcum->aapcs_reg = par;
1611 pcum->aapcs_nextncrn = ncrn + nregs;
1612 return;
1615 /* C.11 */
1616 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1618 /* The argument is passed on the stack; record the needed number of words for
1619 this argument and align the total size if necessary. */
1620 on_stack:
1621 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1622 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1623 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1624 16 / UNITS_PER_WORD);
1625 return;
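/* For example, an __int128 argument (16-byte alignment, two registers)
   arriving when the NGRN is odd skips one core register so that it starts
   at an even register number (rule C.8), while an HFA of four floats is
   allocated to four consecutive FP/SIMD registers.  */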
1628 /* Implement TARGET_FUNCTION_ARG. */
1630 static rtx
1631 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1632 const_tree type, bool named)
1634 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1635 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1637 if (mode == VOIDmode)
1638 return NULL_RTX;
1640 aarch64_layout_arg (pcum_v, mode, type, named);
1641 return pcum->aapcs_reg;
1644 void
1645 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1646 const_tree fntype ATTRIBUTE_UNUSED,
1647 rtx libname ATTRIBUTE_UNUSED,
1648 const_tree fndecl ATTRIBUTE_UNUSED,
1649 unsigned n_named ATTRIBUTE_UNUSED)
1651 pcum->aapcs_ncrn = 0;
1652 pcum->aapcs_nvrn = 0;
1653 pcum->aapcs_nextncrn = 0;
1654 pcum->aapcs_nextnvrn = 0;
1655 pcum->pcs_variant = ARM_PCS_AAPCS64;
1656 pcum->aapcs_reg = NULL_RTX;
1657 pcum->aapcs_arg_processed = false;
1658 pcum->aapcs_stack_words = 0;
1659 pcum->aapcs_stack_size = 0;
1661 return;
1664 static void
1665 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1666 enum machine_mode mode,
1667 const_tree type,
1668 bool named)
1670 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1671 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1673 aarch64_layout_arg (pcum_v, mode, type, named);
1674 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1675 != (pcum->aapcs_stack_words != 0));
1676 pcum->aapcs_arg_processed = false;
1677 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1678 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1679 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1680 pcum->aapcs_stack_words = 0;
1681 pcum->aapcs_reg = NULL_RTX;
1685 bool
1686 aarch64_function_arg_regno_p (unsigned regno)
1688 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1689 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1692 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1693 PARM_BOUNDARY bits of alignment, but will be given anything up
1694 to STACK_BOUNDARY bits if the type requires it. This makes sure
1695 that both before and after the layout of each argument, the Next
1696 Stacked Argument Address (NSAA) will have a minimum alignment of
1697 8 bytes. */
1699 static unsigned int
1700 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1702 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1704 if (alignment < PARM_BOUNDARY)
1705 alignment = PARM_BOUNDARY;
1706 if (alignment > STACK_BOUNDARY)
1707 alignment = STACK_BOUNDARY;
1708 return alignment;
1711 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1713 Return true if an argument passed on the stack should be padded upwards,
1714 i.e. if the least-significant byte of the stack slot has useful data.
1716 Small aggregate types are placed in the lowest memory address.
1718 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1720 bool
1721 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1723 /* On little-endian targets, the least significant byte of every stack
1724 argument is passed at the lowest byte address of the stack slot. */
1725 if (!BYTES_BIG_ENDIAN)
1726 return true;
1728 /* Otherwise, integral, floating-point and pointer types are padded downward:
1729 the least significant byte of a stack argument is passed at the highest
1730 byte address of the stack slot. */
1731 if (type
1732 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1733 || POINTER_TYPE_P (type))
1734 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1735 return false;
1737 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1738 return true;
1741 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1743 It specifies padding for the last (may also be the only)
1744 element of a block move between registers and memory. If
1745 assuming the block is in the memory, padding upward means that
1746 the last element is padded after its most significant byte,
1747 while in downward padding, the last element is padded at its
1748 least significant byte side.
1750 Small aggregates and small complex types are always padded
1751 upwards.
1753 We don't need to worry about homogeneous floating-point or
1754 short-vector aggregates; their move is not affected by the
1755 padding direction determined here. Regardless of endianness,
1756 each element of such an aggregate is put in the least
1757 significant bits of a fp/simd register.
1759 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1760 register has useful data, and return the opposite if the most
1761 significant byte does. */
1763 bool
1764 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1765 bool first ATTRIBUTE_UNUSED)
1768 /* Small composite types are always padded upward. */
1769 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1771 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1772 : GET_MODE_SIZE (mode));
1773 if (size < 2 * UNITS_PER_WORD)
1774 return true;
1777 /* Otherwise, use the default padding. */
1778 return !BYTES_BIG_ENDIAN;
1781 static enum machine_mode
1782 aarch64_libgcc_cmp_return_mode (void)
1784 return SImode;
1787 static bool
1788 aarch64_frame_pointer_required (void)
1790 /* If the function contains dynamic stack allocations, we need to
1791 use the frame pointer to access the static parts of the frame. */
1792 if (cfun->calls_alloca)
1793 return true;
1795 /* In aarch64_override_options_after_change
1796 flag_omit_leaf_frame_pointer turns off the frame pointer by
1797 default. Turn it back on now if we've not got a leaf
1798 function. */
1799 if (flag_omit_leaf_frame_pointer
1800 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1801 return true;
1803 return false;
1806 /* Mark the registers that need to be saved by the callee and calculate
1807 the size of the callee-saved registers area and frame record (both FP
1808 and LR may be omitted). */
1809 static void
1810 aarch64_layout_frame (void)
1812 HOST_WIDE_INT offset = 0;
1813 int regno;
1815 if (reload_completed && cfun->machine->frame.laid_out)
1816 return;
1818 #define SLOT_NOT_REQUIRED (-2)
1819 #define SLOT_REQUIRED (-1)
1821 /* First mark all the registers that really need to be saved... */
1822 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1823 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1825 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1826 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1828 /* ... that includes the eh data registers (if needed)... */
1829 if (crtl->calls_eh_return)
1830 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1831 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1832 = SLOT_REQUIRED;
1834 /* ... and any callee saved register that dataflow says is live. */
1835 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1836 if (df_regs_ever_live_p (regno)
1837 && !call_used_regs[regno])
1838 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1840 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1841 if (df_regs_ever_live_p (regno)
1842 && !call_used_regs[regno])
1843 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1845 if (frame_pointer_needed)
1847 /* FP and LR are placed in the linkage record. */
1848 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1849 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1850 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1851 offset += 2 * UNITS_PER_WORD;
1854 /* Now assign stack slots for them. */
1855 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1856 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1858 cfun->machine->frame.reg_offset[regno] = offset;
1859 offset += UNITS_PER_WORD;
1862 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1863 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1865 cfun->machine->frame.reg_offset[regno] = offset;
1866 offset += UNITS_PER_WORD;
1869 cfun->machine->frame.padding0 =
1870 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1871 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1873 cfun->machine->frame.saved_regs_size = offset;
1875 cfun->machine->frame.hard_fp_offset
1876 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1877 + get_frame_size ()
1878 + cfun->machine->frame.saved_regs_size,
1879 STACK_BOUNDARY / BITS_PER_UNIT);
1881 cfun->machine->frame.frame_size
1882 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1883 + crtl->outgoing_args_size,
1884 STACK_BOUNDARY / BITS_PER_UNIT);
1886 cfun->machine->frame.laid_out = true;
1889 /* Make the last instruction frame-related and note that it performs
1890 the operation described by FRAME_PATTERN. */
1892 static void
1893 aarch64_set_frame_expr (rtx frame_pattern)
1895 rtx insn;
1897 insn = get_last_insn ();
1898 RTX_FRAME_RELATED_P (insn) = 1;
1899 RTX_FRAME_RELATED_P (frame_pattern) = 1;
1900 REG_NOTES (insn) = alloc_EXPR_LIST (REG_FRAME_RELATED_EXPR,
1901 frame_pattern,
1902 REG_NOTES (insn));
1905 static bool
1906 aarch64_register_saved_on_entry (int regno)
1908 return cfun->machine->frame.reg_offset[regno] >= 0;
1911 static unsigned
1912 aarch64_next_callee_save (unsigned regno, unsigned limit)
1914 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1915 regno ++;
1916 return regno;
1919 static rtx
1920 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
1921 rtx reg2)
1923 switch (mode)
1925 case DImode:
1926 return gen_store_pairdi (mem1, reg1, mem2, reg2);
1928 case DFmode:
1929 return gen_store_pairdf (mem1, reg1, mem2, reg2);
1931 default:
1932 gcc_unreachable ();
1936 static rtx
1937 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
1938 rtx mem2)
1940 switch (mode)
1942 case DImode:
1943 return gen_load_pairdi (reg1, mem1, reg2, mem2);
1945 case DFmode:
1946 return gen_load_pairdf (reg1, mem1, reg2, mem2);
1948 default:
1949 gcc_unreachable ();
1953 static void
1954 aarch64_save_or_restore_fprs (HOST_WIDE_INT start_offset, bool restore)
1956 unsigned regno;
1957 unsigned regno2;
1958 rtx insn;
1959 rtx (*gen_mem_ref) (enum machine_mode, rtx)
1960 = frame_pointer_needed ? gen_frame_mem : gen_rtx_MEM;
1963 for (regno = aarch64_next_callee_save (V0_REGNUM, V31_REGNUM);
1964 regno <= V31_REGNUM;
1965 regno = aarch64_next_callee_save (regno + 1, V31_REGNUM))
1967 rtx reg = gen_rtx_REG (DFmode, regno);
1968 rtx mem;
1970 HOST_WIDE_INT offset = start_offset
1971 + cfun->machine->frame.reg_offset[regno];
1972 mem = gen_mem_ref (DFmode, plus_constant (Pmode, stack_pointer_rtx,
1973 offset));
1975 regno2 = aarch64_next_callee_save (regno + 1, V31_REGNUM);
1977 if (regno2 <= V31_REGNUM)
1979 rtx reg2 = gen_rtx_REG (DFmode, regno2);
1980 rtx mem2;
1982 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
1983 mem2 = gen_mem_ref (DFmode,
1984 plus_constant (Pmode, stack_pointer_rtx, offset));
1985 if (restore == false)
1986 insn = emit_insn (aarch64_gen_store_pair (DFmode, mem, reg, mem2, reg2));
1987 else
1989 insn = emit_insn (aarch64_gen_load_pair (DFmode, reg, mem, reg2, mem2));
1990 add_reg_note (insn, REG_CFA_RESTORE, reg);
1991 add_reg_note (insn, REG_CFA_RESTORE, reg2);
1994 /* The first part of a frame-related parallel insn is
1995 always assumed to be relevant to the frame
1996 calculations; subsequent parts are only
1997 frame-related if explicitly marked. */
1998 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1999 regno = regno2;
2001 else
2003 if (restore == false)
2004 insn = emit_move_insn (mem, reg);
2005 else
2007 insn = emit_move_insn (reg, mem);
2008 add_reg_note (insn, REG_CFA_RESTORE, reg);
2011 RTX_FRAME_RELATED_P (insn) = 1;
2016 /* START_OFFSET is the offset from the stack pointer at which the
2017 saves and restores take place. */
2018 static void
2019 aarch64_save_or_restore_callee_save_registers (HOST_WIDE_INT start_offset,
2020 bool restore)
2022 rtx insn;
2023 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2024 ? gen_frame_mem : gen_rtx_MEM);
2025 unsigned limit = frame_pointer_needed ? R28_REGNUM : R30_REGNUM;
2026 unsigned regno;
2027 unsigned regno2;
2029 for (regno = aarch64_next_callee_save (R0_REGNUM, limit);
2030 regno <= limit;
2031 regno = aarch64_next_callee_save (regno + 1, limit))
2033 rtx reg = gen_rtx_REG (DImode, regno);
2034 rtx mem;
2036 HOST_WIDE_INT offset = start_offset
2037 + cfun->machine->frame.reg_offset[regno];
2038 mem = gen_mem_ref (Pmode, plus_constant (Pmode, stack_pointer_rtx,
2039 offset));
2041 regno2 = aarch64_next_callee_save (regno + 1, limit);
2043 if (regno2 <= limit
2044 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2045 == cfun->machine->frame.reg_offset[regno2]))
2048 rtx reg2 = gen_rtx_REG (DImode, regno2);
2049 rtx mem2;
2051 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2052 mem2 = gen_mem_ref (Pmode,
2053 plus_constant (Pmode, stack_pointer_rtx, offset));
2054 if (restore == false)
2055 insn = emit_insn (aarch64_gen_store_pair (DImode, mem, reg, mem2, reg2));
2056 else
2058 insn = emit_insn (aarch64_gen_load_pair (DImode, reg, mem, reg2, mem2));
2059 add_reg_note (insn, REG_CFA_RESTORE, reg);
2060 add_reg_note (insn, REG_CFA_RESTORE, reg2);
2063 /* The first part of a frame-related parallel insn is
2064 always assumed to be relevant to the frame
2065 calculations; subsequent parts are only
2066 frame-related if explicitly marked. */
2067 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2068 regno = regno2;
2070 else
2072 if (restore == false)
2073 insn = emit_move_insn (mem, reg);
2074 else
2076 insn = emit_move_insn (reg, mem);
2077 add_reg_note (insn, REG_CFA_RESTORE, reg);
2080 RTX_FRAME_RELATED_P (insn) = 1;
2082 aarch64_save_or_restore_fprs (start_offset, restore);
2085 /* AArch64 stack frames generated by this compiler look like:
2087 +-------------------------------+
2089 | incoming stack arguments |
2091 +-------------------------------+
2092 | | <-- incoming stack pointer (aligned)
2093 | callee-allocated save area |
2094 | for register varargs |
2096 +-------------------------------+
2097 | local variables | <-- frame_pointer_rtx
2099 +-------------------------------+
2100 | padding0 | \
2101 +-------------------------------+ |
2102 | callee-saved registers | | frame.saved_regs_size
2103 +-------------------------------+ |
2104 | LR' | |
2105 +-------------------------------+ |
2106 | FP' | / <- hard_frame_pointer_rtx (aligned)
2107 +-------------------------------+
2108 | dynamic allocation |
2109 +-------------------------------+
2110 | padding |
2111 +-------------------------------+
2112 | outgoing stack arguments | <-- arg_pointer
2114 +-------------------------------+
2115 | | <-- stack_pointer_rtx (aligned)
2117 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2118 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2119 unchanged. */
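/* A hedged illustration (register choices and offsets are hypothetical;
   aarch64_layout_frame decides the real layout): a function with 16 bytes
   of locals, a frame record and callee saves x19/x20, and no outgoing
   arguments might be entered with

	stp	x29, x30, [sp, -48]!	// frame record, pre-index writeback
	add	x29, sp, 0		// hard_frame_pointer_rtx
	stp	x19, x20, [sp, 16]	// callee-saved registers

   and left again through the matching load-pair/post-index sequence emitted
   by aarch64_expand_epilogue.  */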
2121 /* Generate the prologue instructions for entry into a function.
2122 Establish the stack frame by decreasing the stack pointer with a
2123 properly calculated size and, if necessary, create a frame record
2124 filled with the values of LR and previous frame pointer. The
2125 current FP is also set up if it is in use. */
2127 void
2128 aarch64_expand_prologue (void)
2130 /* sub sp, sp, #<frame_size>
2131 stp {fp, lr}, [sp, #<frame_size> - 16]
2132 add fp, sp, #<frame_size> - hardfp_offset
2133 stp {cs_reg}, [fp, #-16] etc.
2135 sub sp, sp, <final_adjustment_if_any>
2137 HOST_WIDE_INT frame_size, offset;
2138 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2139 rtx insn;
2141 aarch64_layout_frame ();
2143 if (flag_stack_usage_info)
2144 current_function_static_stack_size = cfun->machine->frame.frame_size;
2146 frame_size = cfun->machine->frame.frame_size;
2147 offset = cfun->machine->frame.frame_size;
2149 fp_offset = cfun->machine->frame.frame_size
2150 - cfun->machine->frame.hard_fp_offset;
2152 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2153 if (offset >= 512)
2155 /* When the frame has a large size, an initial decrease is done on
2156 the stack pointer to jump over the callee-allocated save area for
2157 register varargs, the local variable area and/or the callee-saved
2158 register area. This will allow the pre-index write-back
2159 store pair instructions to be used for setting up the stack frame
2160 efficiently. */
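/* For illustration, with hypothetical numbers and assuming a frame pointer
   is in use: if frame_size is 2064, hard_fp_offset is 80 and
   outgoing_args_size is 32, OFFSET becomes 80, the initial "sub sp" below
   covers 2064 - 80 - 32 = 1952 bytes, the frame record and callee saves are
   written with an 80-byte pre-index writeback, and the remaining 32 bytes
   are subtracted at the end.  */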
2161 offset = cfun->machine->frame.hard_fp_offset;
2162 if (offset >= 512)
2163 offset = cfun->machine->frame.saved_regs_size;
2165 frame_size -= (offset + crtl->outgoing_args_size);
2166 fp_offset = 0;
2168 if (frame_size >= 0x1000000)
2170 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2171 emit_move_insn (op0, GEN_INT (-frame_size));
2172 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2173 aarch64_set_frame_expr (gen_rtx_SET
2174 (Pmode, stack_pointer_rtx,
2175 plus_constant (Pmode,
2176 stack_pointer_rtx,
2177 -frame_size)));
2179 else if (frame_size > 0)
2181 if ((frame_size & 0xfff) != frame_size)
2183 insn = emit_insn (gen_add2_insn
2184 (stack_pointer_rtx,
2185 GEN_INT (-(frame_size
2186 & ~(HOST_WIDE_INT)0xfff))));
2187 RTX_FRAME_RELATED_P (insn) = 1;
2189 if ((frame_size & 0xfff) != 0)
2191 insn = emit_insn (gen_add2_insn
2192 (stack_pointer_rtx,
2193 GEN_INT (-(frame_size
2194 & (HOST_WIDE_INT)0xfff))));
2195 RTX_FRAME_RELATED_P (insn) = 1;
2199 else
2200 frame_size = -1;
2202 if (offset > 0)
2204 /* Save the frame pointer and lr if the frame pointer is needed
2205 first. Make the frame pointer point to the location of the
2206 old frame pointer on the stack. */
2207 if (frame_pointer_needed)
2209 rtx mem_fp, mem_lr;
2211 if (fp_offset)
2213 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2214 GEN_INT (-offset)));
2215 RTX_FRAME_RELATED_P (insn) = 1;
2216 aarch64_set_frame_expr (gen_rtx_SET
2217 (Pmode, stack_pointer_rtx,
2218 gen_rtx_MINUS (Pmode,
2219 stack_pointer_rtx,
2220 GEN_INT (offset))));
2221 mem_fp = gen_frame_mem (DImode,
2222 plus_constant (Pmode,
2223 stack_pointer_rtx,
2224 fp_offset));
2225 mem_lr = gen_frame_mem (DImode,
2226 plus_constant (Pmode,
2227 stack_pointer_rtx,
2228 fp_offset
2229 + UNITS_PER_WORD));
2230 insn = emit_insn (gen_store_pairdi (mem_fp,
2231 hard_frame_pointer_rtx,
2232 mem_lr,
2233 gen_rtx_REG (DImode,
2234 LR_REGNUM)));
2236 else
2238 insn = emit_insn (gen_storewb_pairdi_di
2239 (stack_pointer_rtx, stack_pointer_rtx,
2240 hard_frame_pointer_rtx,
2241 gen_rtx_REG (DImode, LR_REGNUM),
2242 GEN_INT (-offset),
2243 GEN_INT (GET_MODE_SIZE (DImode) - offset)));
2244 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2247 /* The first part of a frame-related parallel insn is always
2248 assumed to be relevant to the frame calculations;
2249 subsequent parts are only frame-related if explicitly
2250 marked. */
2251 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2252 RTX_FRAME_RELATED_P (insn) = 1;
2254 /* Set up frame pointer to point to the location of the
2255 previous frame pointer on the stack. */
2256 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2257 stack_pointer_rtx,
2258 GEN_INT (fp_offset)));
2259 aarch64_set_frame_expr (gen_rtx_SET
2260 (Pmode, hard_frame_pointer_rtx,
2261 plus_constant (Pmode,
2262 stack_pointer_rtx,
2263 fp_offset)));
2264 RTX_FRAME_RELATED_P (insn) = 1;
2265 insn = emit_insn (gen_stack_tie (stack_pointer_rtx,
2266 hard_frame_pointer_rtx));
2268 else
2270 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2271 GEN_INT (-offset)));
2272 RTX_FRAME_RELATED_P (insn) = 1;
2275 aarch64_save_or_restore_callee_save_registers (fp_offset, 0);
2278 /* When offset >= 512,
2279 sub sp, sp, #<outgoing_args_size> */
2280 if (frame_size > -1)
2282 if (crtl->outgoing_args_size > 0)
2284 insn = emit_insn (gen_add2_insn
2285 (stack_pointer_rtx,
2286 GEN_INT (- crtl->outgoing_args_size)));
2287 RTX_FRAME_RELATED_P (insn) = 1;
2292 /* Generate the epilogue instructions for returning from a function. */
2293 void
2294 aarch64_expand_epilogue (bool for_sibcall)
2296 HOST_WIDE_INT frame_size, offset;
2297 HOST_WIDE_INT fp_offset;
2298 rtx insn;
2299 rtx cfa_reg;
2301 aarch64_layout_frame ();
2303 offset = frame_size = cfun->machine->frame.frame_size;
2304 fp_offset = cfun->machine->frame.frame_size
2305 - cfun->machine->frame.hard_fp_offset;
2307 cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx;
2309 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2310 if (offset >= 512)
2312 offset = cfun->machine->frame.hard_fp_offset;
2313 if (offset >= 512)
2314 offset = cfun->machine->frame.saved_regs_size;
2316 frame_size -= (offset + crtl->outgoing_args_size);
2317 fp_offset = 0;
2318 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2320 insn = emit_insn (gen_add2_insn
2321 (stack_pointer_rtx,
2322 GEN_INT (crtl->outgoing_args_size)));
2323 RTX_FRAME_RELATED_P (insn) = 1;
2326 else
2327 frame_size = -1;
2329 /* If there were outgoing arguments or we've done dynamic stack
2330 allocation, then restore the stack pointer from the frame
2331 pointer. This is at most one insn and more efficient than using
2332 GCC's internal mechanism. */
2333 if (frame_pointer_needed
2334 && (crtl->outgoing_args_size || cfun->calls_alloca))
2336 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2337 hard_frame_pointer_rtx,
2338 GEN_INT (- fp_offset)));
2339 RTX_FRAME_RELATED_P (insn) = 1;
2340 /* As SP is set to (FP - fp_offset), according to the rules in
2341 dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
2342 from the value of SP from now on. */
2343 cfa_reg = stack_pointer_rtx;
2346 aarch64_save_or_restore_callee_save_registers (fp_offset, 1);
2348 /* Restore the frame pointer and lr if the frame pointer is needed. */
2349 if (offset > 0)
2351 if (frame_pointer_needed)
2353 rtx mem_fp, mem_lr;
2355 if (fp_offset)
2357 mem_fp = gen_frame_mem (DImode,
2358 plus_constant (Pmode,
2359 stack_pointer_rtx,
2360 fp_offset));
2361 mem_lr = gen_frame_mem (DImode,
2362 plus_constant (Pmode,
2363 stack_pointer_rtx,
2364 fp_offset
2365 + UNITS_PER_WORD));
2366 insn = emit_insn (gen_load_pairdi (hard_frame_pointer_rtx,
2367 mem_fp,
2368 gen_rtx_REG (DImode,
2369 LR_REGNUM),
2370 mem_lr));
2372 else
2374 insn = emit_insn (gen_loadwb_pairdi_di
2375 (stack_pointer_rtx,
2376 stack_pointer_rtx,
2377 hard_frame_pointer_rtx,
2378 gen_rtx_REG (DImode, LR_REGNUM),
2379 GEN_INT (offset),
2380 GEN_INT (GET_MODE_SIZE (DImode) + offset)));
2381 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2382 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2383 (gen_rtx_SET (Pmode, stack_pointer_rtx,
2384 plus_constant (Pmode, cfa_reg,
2385 offset))));
2388 /* The first part of a frame-related parallel insn
2389 is always assumed to be relevant to the frame
2390 calculations; subsequent parts are only
2391 frame-related if explicitly marked. */
2392 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2393 RTX_FRAME_RELATED_P (insn) = 1;
2394 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
2395 add_reg_note (insn, REG_CFA_RESTORE,
2396 gen_rtx_REG (DImode, LR_REGNUM));
2398 if (fp_offset)
2400 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2401 GEN_INT (offset)));
2402 RTX_FRAME_RELATED_P (insn) = 1;
2405 else
2407 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2408 GEN_INT (offset)));
2409 RTX_FRAME_RELATED_P (insn) = 1;
2413 /* Stack adjustment for exception handler. */
2414 if (crtl->calls_eh_return)
2416 /* We need to unwind the stack by the offset computed by
2417 EH_RETURN_STACKADJ_RTX. However, at this point the CFA is
2418 based on SP. Ideally we would update the SP and define the
2419 CFA along the lines of:
2421 SP = SP + EH_RETURN_STACKADJ_RTX
2422 (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
2424 However the dwarf emitter only understands a constant
2425 register offset.
2427 The solution chosen here is to use the otherwise unused IP0
2428 as a temporary register to hold the current SP value. The
2429 CFA is described using IP0 then SP is modified. */
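/* Illustrative sketch of the sequence emitted below (IP0 is x16; <adj>
   stands for EH_RETURN_STACKADJ_RTX):

	mov	x16, sp		// CFA is now defined in terms of x16
	add	sp, sp, <adj>
	(use x16)		// keeps the copy to x16 from being deleted  */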
2431 rtx ip0 = gen_rtx_REG (DImode, IP0_REGNUM);
2433 insn = emit_move_insn (ip0, stack_pointer_rtx);
2434 add_reg_note (insn, REG_CFA_DEF_CFA, ip0);
2435 RTX_FRAME_RELATED_P (insn) = 1;
2437 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2439 /* Ensure the assignment to IP0 does not get optimized away. */
2440 emit_use (ip0);
2443 if (frame_size > -1)
2445 if (frame_size >= 0x1000000)
2447 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2448 emit_move_insn (op0, GEN_INT (frame_size));
2449 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2450 aarch64_set_frame_expr (gen_rtx_SET
2451 (Pmode, stack_pointer_rtx,
2452 plus_constant (Pmode,
2453 stack_pointer_rtx,
2454 frame_size)));
2456 else if (frame_size > 0)
2458 if ((frame_size & 0xfff) != 0)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx,
2462 GEN_INT ((frame_size
2463 & (HOST_WIDE_INT) 0xfff))));
2464 RTX_FRAME_RELATED_P (insn) = 1;
2466 if ((frame_size & 0xfff) != frame_size)
2468 insn = emit_insn (gen_add2_insn
2469 (stack_pointer_rtx,
2470 GEN_INT ((frame_size
2471 & ~ (HOST_WIDE_INT) 0xfff))));
2472 RTX_FRAME_RELATED_P (insn) = 1;
2476 aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx,
2477 plus_constant (Pmode,
2478 stack_pointer_rtx,
2479 offset)));
2482 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2483 if (!for_sibcall)
2484 emit_jump_insn (ret_rtx);
2487 /* Return the place to copy the exception unwinding return address to.
2488 This will probably be a stack slot, but could (in theory) be the
2489 return register. */
2491 aarch64_final_eh_return_addr (void)
2493 HOST_WIDE_INT fp_offset;
2495 aarch64_layout_frame ();
2497 fp_offset = cfun->machine->frame.frame_size
2498 - cfun->machine->frame.hard_fp_offset;
2500 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2501 return gen_rtx_REG (DImode, LR_REGNUM);
2503 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2504 result in a store to save LR introduced by builtin_eh_return () being
2505 incorrectly deleted because the alias is not detected.
2506 So in the calculation of the address to copy the exception unwinding
2507 return address to, we note 2 cases.
2508 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2509 we return a SP-relative location since all the addresses are SP-relative
2510 in this case. This prevents the store from being optimized away.
2511 If the fp_offset is not 0, then the addresses will be FP-relative and
2512 therefore we return a FP-relative location. */
2514 if (frame_pointer_needed)
2516 if (fp_offset)
2517 return gen_frame_mem (DImode,
2518 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2519 else
2520 return gen_frame_mem (DImode,
2521 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2524 /* If FP is not needed, we calculate the location of LR, which would be
2525 at the top of the saved registers block. */
2527 return gen_frame_mem (DImode,
2528 plus_constant (Pmode,
2529 stack_pointer_rtx,
2530 fp_offset
2531 + cfun->machine->frame.saved_regs_size
2532 - 2 * UNITS_PER_WORD));
2535 /* Possibly output code to build up a constant in a register. For
2536 the benefit of the costs infrastructure, returns the number of
2537 instructions which would be emitted. GENERATE inhibits or
2538 enables code generation. */
2540 static int
2541 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2543 int insns = 0;
2545 if (aarch64_bitmask_imm (val, DImode))
2547 if (generate)
2548 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2549 insns = 1;
2551 else
2553 int i;
2554 int ncount = 0;
2555 int zcount = 0;
2556 HOST_WIDE_INT valp = val >> 16;
2557 HOST_WIDE_INT valm;
2558 HOST_WIDE_INT tval;
2560 for (i = 16; i < 64; i += 16)
2562 valm = (valp & 0xffff);
2564 if (valm != 0)
2565 ++ zcount;
2567 if (valm != 0xffff)
2568 ++ ncount;
2570 valp >>= 16;
2573 /* zcount contains the number of additional MOVK instructions
2574 required if the constant is built up with an initial MOVZ instruction,
2575 while ncount is the number of MOVK instructions required if starting
2576 with a MOVN instruction. Choose the sequence that requires the fewer
2577 instructions, preferring a MOVZ-based sequence when the two counts
2578 are equal. */
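/* A worked example (illustrative): for VAL == 0xffffffff12345678 the three
   upper halfwords are 0x1234, 0xffff and 0xffff, so zcount == 3 and
   ncount == 1.  The MOVN-based sequence wins: one MOVN producing
   0xffffffffffff5678 plus a single MOVK inserting 0x1234 at bit 16, i.e.
   two instructions instead of four.  */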
2579 if (ncount < zcount)
2581 if (generate)
2582 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2583 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2584 tval = 0xffff;
2585 insns++;
2587 else
2589 if (generate)
2590 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2591 GEN_INT (val & 0xffff));
2592 tval = 0;
2593 insns++;
2596 val >>= 16;
2598 for (i = 16; i < 64; i += 16)
2600 if ((val & 0xffff) != tval)
2602 if (generate)
2603 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2604 GEN_INT (i),
2605 GEN_INT (val & 0xffff)));
2606 insns++;
2608 val >>= 16;
2611 return insns;
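/* Add DELTA to the register REGNUM, clobbering SCRATCHREG where the
   adjustment cannot be expressed directly: very large deltas are built up
   with aarch64_build_constant, otherwise the delta is split into a
   4096-scaled part and a remainder, each applied with a single add/sub.  */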
2614 static void
2615 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2617 HOST_WIDE_INT mdelta = delta;
2618 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2619 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2621 if (mdelta < 0)
2622 mdelta = -mdelta;
2624 if (mdelta >= 4096 * 4096)
2626 (void) aarch64_build_constant (scratchreg, delta, true);
2627 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2629 else if (mdelta > 0)
2631 if (mdelta >= 4096)
2633 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2634 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2635 if (delta < 0)
2636 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2637 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2638 else
2639 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2640 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2642 if (mdelta % 4096 != 0)
2644 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2645 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2646 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2651 /* Output code to add DELTA to the first argument, and then jump
2652 to FUNCTION. Used for C++ multiple inheritance. */
2653 static void
2654 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2655 HOST_WIDE_INT delta,
2656 HOST_WIDE_INT vcall_offset,
2657 tree function)
2659 /* The this pointer is always in x0. Note that this differs from
2660 Arm where the this pointer may be bumped to r1 if r0 is required
2661 to return a pointer to an aggregate. On AArch64 a result value
2662 pointer will be in x8. */
2663 int this_regno = R0_REGNUM;
2664 rtx this_rtx, temp0, temp1, addr, insn, funexp;
2666 reload_completed = 1;
2667 emit_note (NOTE_INSN_PROLOGUE_END);
2669 if (vcall_offset == 0)
2670 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2671 else
2673 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2675 this_rtx = gen_rtx_REG (Pmode, this_regno);
2676 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2677 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2679 addr = this_rtx;
2680 if (delta != 0)
2682 if (delta >= -256 && delta < 256)
2683 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2684 plus_constant (Pmode, this_rtx, delta));
2685 else
2686 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2689 if (Pmode == ptr_mode)
2690 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2691 else
2692 aarch64_emit_move (temp0,
2693 gen_rtx_ZERO_EXTEND (Pmode,
2694 gen_rtx_MEM (ptr_mode, addr)));
2696 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2697 addr = plus_constant (Pmode, temp0, vcall_offset);
2698 else
2700 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2701 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2704 if (Pmode == ptr_mode)
2705 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2706 else
2707 aarch64_emit_move (temp1,
2708 gen_rtx_SIGN_EXTEND (Pmode,
2709 gen_rtx_MEM (ptr_mode, addr)));
2711 emit_insn (gen_add2_insn (this_rtx, temp1));
2714 /* Generate a tail call to the target function. */
2715 if (!TREE_USED (function))
2717 assemble_external (function);
2718 TREE_USED (function) = 1;
2720 funexp = XEXP (DECL_RTL (function), 0);
2721 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2722 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2723 SIBLING_CALL_P (insn) = 1;
2725 insn = get_insns ();
2726 shorten_branches (insn);
2727 final_start_function (insn, file, 1);
2728 final (insn, file, 1);
2729 final_end_function ();
2731 /* Stop pretending to be a post-reload pass. */
2732 reload_completed = 0;
2735 static int
2736 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2738 if (GET_CODE (*x) == SYMBOL_REF)
2739 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2741 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2742 TLS offsets, not real symbol references. */
2743 if (GET_CODE (*x) == UNSPEC
2744 && XINT (*x, 1) == UNSPEC_TLS)
2745 return -1;
2747 return 0;
2750 static bool
2751 aarch64_tls_referenced_p (rtx x)
2753 if (!TARGET_HAVE_TLS)
2754 return false;
2756 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2760 static int
2761 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2763 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2764 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2766 if (*imm1 < *imm2)
2767 return -1;
2768 if (*imm1 > *imm2)
2769 return +1;
2770 return 0;
2774 static void
2775 aarch64_build_bitmask_table (void)
2777 unsigned HOST_WIDE_INT mask, imm;
2778 unsigned int log_e, e, s, r;
2779 unsigned int nimms = 0;
2781 for (log_e = 1; log_e <= 6; log_e++)
2783 e = 1 << log_e;
2784 if (e == 64)
2785 mask = ~(HOST_WIDE_INT) 0;
2786 else
2787 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2788 for (s = 1; s < e; s++)
2790 for (r = 0; r < e; r++)
2792 /* Set S consecutive bits to 1 (S < 64). */
2793 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2794 /* Rotate right by R. */
2795 if (r != 0)
2796 imm = ((imm >> r) | (imm << (e - r))) & mask;
2797 /* Replicate the constant depending on SIMD size. */
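/* The switch below falls through deliberately: each case doubles the
   element until the pattern fills all 64 bits, e.g. an 8-bit element is
   widened 8 -> 16 -> 32 -> 64.  */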
2798 switch (log_e) {
2799 case 1: imm |= (imm << 2);
2800 case 2: imm |= (imm << 4);
2801 case 3: imm |= (imm << 8);
2802 case 4: imm |= (imm << 16);
2803 case 5: imm |= (imm << 32);
2804 case 6:
2805 break;
2806 default:
2807 gcc_unreachable ();
2809 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2810 aarch64_bitmasks[nimms++] = imm;
2815 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2816 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2817 aarch64_bitmasks_cmp);
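/* As an illustration of the encoding enumerated above: log_e == 4 (a 16-bit
   element) with s == 4 and r == 0 yields the element 0x000f, replicated to
   the 64-bit bitmask immediate 0x000f000f000f000f.  */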
2821 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2822 a left shift of 0 or 12 bits. */
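/* For example, 0xabc and 0xabc000 are accepted, while 0xabc00 and
   0x1abc000 are not.  */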
2823 bool
2824 aarch64_uimm12_shift (HOST_WIDE_INT val)
2826 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2827 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
2832 /* Return true if val is an immediate that can be loaded into a
2833 register by a MOVZ instruction. */
2834 static bool
2835 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2837 if (GET_MODE_SIZE (mode) > 4)
2839 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2840 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2841 return 1;
2843 else
2845 /* Ignore sign extension. */
2846 val &= (HOST_WIDE_INT) 0xffffffff;
2848 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2849 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2853 /* Return true if val is a valid bitmask immediate. */
2854 bool
2855 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2857 if (GET_MODE_SIZE (mode) < 8)
2859 /* Replicate bit pattern. */
2860 val &= (HOST_WIDE_INT) 0xffffffff;
2861 val |= val << 32;
2863 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2864 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2868 /* Return true if val is an immediate that can be loaded into a
2869 register in a single instruction. */
2870 bool
2871 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2873 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2874 return 1;
2875 return aarch64_bitmask_imm (val, mode);
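/* For example (DImode), all of the following are accepted by
   aarch64_move_imm above: 0x12340000 (a single MOVZ), 0xffffffffffff1234
   (a single MOVN) and 0x00ff00ff00ff00ff (a single bitmask/ORR immediate);
   none of them needs a MOVK sequence or a literal load.  */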
2878 static bool
2879 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2881 rtx base, offset;
2883 if (GET_CODE (x) == HIGH)
2884 return true;
2886 split_const (x, &base, &offset);
2887 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2889 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2890 != SYMBOL_FORCE_TO_MEM)
2891 return true;
2892 else
2893 /* Avoid generating a 64-bit relocation in ILP32; leave
2894 to aarch64_expand_mov_immediate to handle it properly. */
2895 return mode != ptr_mode;
2898 return aarch64_tls_referenced_p (x);
2901 /* Return true if register REGNO is a valid index register.
2902 STRICT_P is true if REG_OK_STRICT is in effect. */
2904 bool
2905 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2907 if (!HARD_REGISTER_NUM_P (regno))
2909 if (!strict_p)
2910 return true;
2912 if (!reg_renumber)
2913 return false;
2915 regno = reg_renumber[regno];
2917 return GP_REGNUM_P (regno);
2920 /* Return true if register REGNO is a valid base register.
2921 STRICT_P is true if REG_OK_STRICT is in effect. */
2923 bool
2924 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2926 if (!HARD_REGISTER_NUM_P (regno))
2928 if (!strict_p)
2929 return true;
2931 if (!reg_renumber)
2932 return false;
2934 regno = reg_renumber[regno];
2937 /* The fake registers will be eliminated to either the stack or
2938 hard frame pointer, both of which are usually valid base registers.
2939 Reload deals with the cases where the eliminated form isn't valid. */
2940 return (GP_REGNUM_P (regno)
2941 || regno == SP_REGNUM
2942 || regno == FRAME_POINTER_REGNUM
2943 || regno == ARG_POINTER_REGNUM);
2946 /* Return true if X is a valid base register.
2947 STRICT_P is true if REG_OK_STRICT is in effect. */
2949 static bool
2950 aarch64_base_register_rtx_p (rtx x, bool strict_p)
2952 if (!strict_p && GET_CODE (x) == SUBREG)
2953 x = SUBREG_REG (x);
2955 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
2958 /* Return true if address offset is a valid index. If it is, fill in INFO
2959 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
2961 static bool
2962 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
2963 enum machine_mode mode, bool strict_p)
2965 enum aarch64_address_type type;
2966 rtx index;
2967 int shift;
2969 /* (reg:P) */
2970 if ((REG_P (x) || GET_CODE (x) == SUBREG)
2971 && GET_MODE (x) == Pmode)
2973 type = ADDRESS_REG_REG;
2974 index = x;
2975 shift = 0;
2977 /* (sign_extend:DI (reg:SI)) */
2978 else if ((GET_CODE (x) == SIGN_EXTEND
2979 || GET_CODE (x) == ZERO_EXTEND)
2980 && GET_MODE (x) == DImode
2981 && GET_MODE (XEXP (x, 0)) == SImode)
2983 type = (GET_CODE (x) == SIGN_EXTEND)
2984 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2985 index = XEXP (x, 0);
2986 shift = 0;
2988 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
2989 else if (GET_CODE (x) == MULT
2990 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2991 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2992 && GET_MODE (XEXP (x, 0)) == DImode
2993 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2994 && CONST_INT_P (XEXP (x, 1)))
2996 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
2997 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2998 index = XEXP (XEXP (x, 0), 0);
2999 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3001 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3002 else if (GET_CODE (x) == ASHIFT
3003 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3004 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3005 && GET_MODE (XEXP (x, 0)) == DImode
3006 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3007 && CONST_INT_P (XEXP (x, 1)))
3009 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3010 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3011 index = XEXP (XEXP (x, 0), 0);
3012 shift = INTVAL (XEXP (x, 1));
3014 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3015 else if ((GET_CODE (x) == SIGN_EXTRACT
3016 || GET_CODE (x) == ZERO_EXTRACT)
3017 && GET_MODE (x) == DImode
3018 && GET_CODE (XEXP (x, 0)) == MULT
3019 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3020 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3022 type = (GET_CODE (x) == SIGN_EXTRACT)
3023 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3024 index = XEXP (XEXP (x, 0), 0);
3025 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3026 if (INTVAL (XEXP (x, 1)) != 32 + shift
3027 || INTVAL (XEXP (x, 2)) != 0)
3028 shift = -1;
3030 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3031 (const_int 0xffffffff<<shift)) */
3032 else if (GET_CODE (x) == AND
3033 && GET_MODE (x) == DImode
3034 && GET_CODE (XEXP (x, 0)) == MULT
3035 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3036 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3037 && CONST_INT_P (XEXP (x, 1)))
3039 type = ADDRESS_REG_UXTW;
3040 index = XEXP (XEXP (x, 0), 0);
3041 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3042 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3043 shift = -1;
3045 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3046 else if ((GET_CODE (x) == SIGN_EXTRACT
3047 || GET_CODE (x) == ZERO_EXTRACT)
3048 && GET_MODE (x) == DImode
3049 && GET_CODE (XEXP (x, 0)) == ASHIFT
3050 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3051 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3053 type = (GET_CODE (x) == SIGN_EXTRACT)
3054 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3055 index = XEXP (XEXP (x, 0), 0);
3056 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3057 if (INTVAL (XEXP (x, 1)) != 32 + shift
3058 || INTVAL (XEXP (x, 2)) != 0)
3059 shift = -1;
3061 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3062 (const_int 0xffffffff<<shift)) */
3063 else if (GET_CODE (x) == AND
3064 && GET_MODE (x) == DImode
3065 && GET_CODE (XEXP (x, 0)) == ASHIFT
3066 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3067 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3068 && CONST_INT_P (XEXP (x, 1)))
3070 type = ADDRESS_REG_UXTW;
3071 index = XEXP (XEXP (x, 0), 0);
3072 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3073 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3074 shift = -1;
3076 /* (mult:P (reg:P) (const_int scale)) */
3077 else if (GET_CODE (x) == MULT
3078 && GET_MODE (x) == Pmode
3079 && GET_MODE (XEXP (x, 0)) == Pmode
3080 && CONST_INT_P (XEXP (x, 1)))
3082 type = ADDRESS_REG_REG;
3083 index = XEXP (x, 0);
3084 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3086 /* (ashift:P (reg:P) (const_int shift)) */
3087 else if (GET_CODE (x) == ASHIFT
3088 && GET_MODE (x) == Pmode
3089 && GET_MODE (XEXP (x, 0)) == Pmode
3090 && CONST_INT_P (XEXP (x, 1)))
3092 type = ADDRESS_REG_REG;
3093 index = XEXP (x, 0);
3094 shift = INTVAL (XEXP (x, 1));
3096 else
3097 return false;
3099 if (GET_CODE (index) == SUBREG)
3100 index = SUBREG_REG (index);
3102 if ((shift == 0 ||
3103 (shift > 0 && shift <= 3
3104 && (1 << shift) == GET_MODE_SIZE (mode)))
3105 && REG_P (index)
3106 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3108 info->type = type;
3109 info->offset = index;
3110 info->shift = shift;
3111 return true;
3114 return false;
3117 static inline bool
3118 offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3120 return (offset >= -64 * GET_MODE_SIZE (mode)
3121 && offset < 64 * GET_MODE_SIZE (mode)
3122 && offset % GET_MODE_SIZE (mode) == 0);
3125 static inline bool
3126 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3127 HOST_WIDE_INT offset)
3129 return offset >= -256 && offset < 256;
3132 static inline bool
3133 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3135 return (offset >= 0
3136 && offset < 4096 * GET_MODE_SIZE (mode)
3137 && offset % GET_MODE_SIZE (mode) == 0);
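/* For a DImode (8-byte) access the three helpers above therefore accept,
   respectively, multiples of 8 in [-512, 504], any offset in [-256, 255],
   and multiples of 8 in [0, 32760].  */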
3140 /* Return true if X is a valid address for machine mode MODE. If it is,
3141 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3142 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3144 static bool
3145 aarch64_classify_address (struct aarch64_address_info *info,
3146 rtx x, enum machine_mode mode,
3147 RTX_CODE outer_code, bool strict_p)
3149 enum rtx_code code = GET_CODE (x);
3150 rtx op0, op1;
3151 bool allow_reg_index_p =
3152 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3153 || aarch64_vector_mode_supported_p (mode));
3154 /* Don't support anything other than POST_INC or REG addressing for
3155 AdvSIMD. */
3156 if (aarch64_vect_struct_mode_p (mode)
3157 && (code != POST_INC && code != REG))
3158 return false;
3160 switch (code)
3162 case REG:
3163 case SUBREG:
3164 info->type = ADDRESS_REG_IMM;
3165 info->base = x;
3166 info->offset = const0_rtx;
3167 return aarch64_base_register_rtx_p (x, strict_p);
3169 case PLUS:
3170 op0 = XEXP (x, 0);
3171 op1 = XEXP (x, 1);
3172 if (GET_MODE_SIZE (mode) != 0
3173 && CONST_INT_P (op1)
3174 && aarch64_base_register_rtx_p (op0, strict_p))
3176 HOST_WIDE_INT offset = INTVAL (op1);
3178 info->type = ADDRESS_REG_IMM;
3179 info->base = op0;
3180 info->offset = op1;
3182 /* TImode and TFmode values are allowed in both pairs of X
3183 registers and individual Q registers. The available
3184 address modes are:
3185 X,X: 7-bit signed scaled offset
3186 Q: 9-bit signed offset
3187 We conservatively require an offset representable in both modes.
3189 if (mode == TImode || mode == TFmode)
3190 return (offset_7bit_signed_scaled_p (mode, offset)
3191 && offset_9bit_signed_unscaled_p (mode, offset));
3193 if (outer_code == PARALLEL)
3194 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3195 && offset_7bit_signed_scaled_p (mode, offset));
3196 else
3197 return (offset_9bit_signed_unscaled_p (mode, offset)
3198 || offset_12bit_unsigned_scaled_p (mode, offset));
3201 if (allow_reg_index_p)
3203 /* Look for base + (scaled/extended) index register. */
3204 if (aarch64_base_register_rtx_p (op0, strict_p)
3205 && aarch64_classify_index (info, op1, mode, strict_p))
3207 info->base = op0;
3208 return true;
3210 if (aarch64_base_register_rtx_p (op1, strict_p)
3211 && aarch64_classify_index (info, op0, mode, strict_p))
3213 info->base = op1;
3214 return true;
3218 return false;
3220 case POST_INC:
3221 case POST_DEC:
3222 case PRE_INC:
3223 case PRE_DEC:
3224 info->type = ADDRESS_REG_WB;
3225 info->base = XEXP (x, 0);
3226 info->offset = NULL_RTX;
3227 return aarch64_base_register_rtx_p (info->base, strict_p);
3229 case POST_MODIFY:
3230 case PRE_MODIFY:
3231 info->type = ADDRESS_REG_WB;
3232 info->base = XEXP (x, 0);
3233 if (GET_CODE (XEXP (x, 1)) == PLUS
3234 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3235 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3236 && aarch64_base_register_rtx_p (info->base, strict_p))
3238 HOST_WIDE_INT offset;
3239 info->offset = XEXP (XEXP (x, 1), 1);
3240 offset = INTVAL (info->offset);
3242 /* TImode and TFmode values are allowed in both pairs of X
3243 registers and individual Q registers. The available
3244 address modes are:
3245 X,X: 7-bit signed scaled offset
3246 Q: 9-bit signed offset
3247 We conservatively require an offset representable in both modes.
3249 if (mode == TImode || mode == TFmode)
3250 return (offset_7bit_signed_scaled_p (mode, offset)
3251 && offset_9bit_signed_unscaled_p (mode, offset));
3253 if (outer_code == PARALLEL)
3254 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3255 && offset_7bit_signed_scaled_p (mode, offset));
3256 else
3257 return offset_9bit_signed_unscaled_p (mode, offset);
3259 return false;
3261 case CONST:
3262 case SYMBOL_REF:
3263 case LABEL_REF:
3264 /* Load literal: pc-relative constant pool entry. Only supported
3265 for SI mode or larger. */
3266 info->type = ADDRESS_SYMBOLIC;
3267 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3269 rtx sym, addend;
3271 split_const (x, &sym, &addend);
3272 return (GET_CODE (sym) == LABEL_REF
3273 || (GET_CODE (sym) == SYMBOL_REF
3274 && CONSTANT_POOL_ADDRESS_P (sym)));
3276 return false;
3278 case LO_SUM:
3279 info->type = ADDRESS_LO_SUM;
3280 info->base = XEXP (x, 0);
3281 info->offset = XEXP (x, 1);
3282 if (allow_reg_index_p
3283 && aarch64_base_register_rtx_p (info->base, strict_p))
3285 rtx sym, offs;
3286 split_const (info->offset, &sym, &offs);
3287 if (GET_CODE (sym) == SYMBOL_REF
3288 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3289 == SYMBOL_SMALL_ABSOLUTE))
3291 /* The symbol and offset must be aligned to the access size. */
3292 unsigned int align;
3293 unsigned int ref_size;
3295 if (CONSTANT_POOL_ADDRESS_P (sym))
3296 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3297 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3299 tree exp = SYMBOL_REF_DECL (sym);
3300 align = TYPE_ALIGN (TREE_TYPE (exp));
3301 align = CONSTANT_ALIGNMENT (exp, align);
3303 else if (SYMBOL_REF_DECL (sym))
3304 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3305 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3306 && SYMBOL_REF_BLOCK (sym) != NULL)
3307 align = SYMBOL_REF_BLOCK (sym)->alignment;
3308 else
3309 align = BITS_PER_UNIT;
3311 ref_size = GET_MODE_SIZE (mode);
3312 if (ref_size == 0)
3313 ref_size = GET_MODE_SIZE (DImode);
3315 return ((INTVAL (offs) & (ref_size - 1)) == 0
3316 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3319 return false;
3321 default:
3322 return false;
3326 bool
3327 aarch64_symbolic_address_p (rtx x)
3329 rtx offset;
3331 split_const (x, &x, &offset);
3332 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3335 /* Classify the base of symbolic expression X, given that X appears in
3336 context CONTEXT. */
3338 enum aarch64_symbol_type
3339 aarch64_classify_symbolic_expression (rtx x,
3340 enum aarch64_symbol_context context)
3342 rtx offset;
3344 split_const (x, &x, &offset);
3345 return aarch64_classify_symbol (x, context);
3349 /* Return TRUE if X is a legitimate address for accessing memory in
3350 mode MODE. */
3351 static bool
3352 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3354 struct aarch64_address_info addr;
3356 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3359 /* Return TRUE if X is a legitimate address for accessing memory in
3360 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3361 pair operation. */
3362 bool
3363 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3364 RTX_CODE outer_code, bool strict_p)
3366 struct aarch64_address_info addr;
3368 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3371 /* Return TRUE if rtx X is the immediate constant 0.0. */
3372 bool
3373 aarch64_float_const_zero_rtx_p (rtx x)
3375 REAL_VALUE_TYPE r;
3377 if (GET_MODE (x) == VOIDmode)
3378 return false;
3380 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3381 if (REAL_VALUE_MINUS_ZERO (r))
3382 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3383 return REAL_VALUES_EQUAL (r, dconst0);
3386 /* Return the fixed registers used for condition codes. */
3388 static bool
3389 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3391 *p1 = CC_REGNUM;
3392 *p2 = INVALID_REGNUM;
3393 return true;
3396 /* Emit call insn with PAT and do aarch64-specific handling. */
3398 void
3399 aarch64_emit_call_insn (rtx pat)
3401 rtx insn = emit_call_insn (pat);
3403 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3404 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3405 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3408 enum machine_mode
3409 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3411 /* All floating point compares return CCFP if it is an equality
3412 comparison, and CCFPE otherwise. */
3413 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3415 switch (code)
3417 case EQ:
3418 case NE:
3419 case UNORDERED:
3420 case ORDERED:
3421 case UNLT:
3422 case UNLE:
3423 case UNGT:
3424 case UNGE:
3425 case UNEQ:
3426 case LTGT:
3427 return CCFPmode;
3429 case LT:
3430 case LE:
3431 case GT:
3432 case GE:
3433 return CCFPEmode;
3435 default:
3436 gcc_unreachable ();
3440 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3441 && y == const0_rtx
3442 && (code == EQ || code == NE || code == LT || code == GE)
3443 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3444 || GET_CODE (x) == NEG))
3445 return CC_NZmode;
3447 /* A compare with a shifted operand. Because of canonicalization,
3448 the comparison will have to be swapped when we emit the assembly
3449 code. */
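/* For example, for (compare (ashift x 2) y) the assembly output has to be
   "cmp y, x, lsl 2", with the shifted operand second, so the condition
   being tested is swapped as well; the CC_SWPmode entries in
   aarch64_get_condition_code provide that swapped mapping.  */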
3450 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3451 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3452 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3453 || GET_CODE (x) == LSHIFTRT
3454 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3455 return CC_SWPmode;
3457 /* Similarly for a negated operand, but we can only do this for
3458 equalities. */
3459 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3460 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3461 && (code == EQ || code == NE)
3462 && GET_CODE (x) == NEG)
3463 return CC_Zmode;
3465 /* A compare of a mode narrower than SI mode against zero can be done
3466 by extending the value in the comparison. */
3467 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3468 && y == const0_rtx)
3469 /* Only use sign-extension if we really need it. */
3470 return ((code == GT || code == GE || code == LE || code == LT)
3471 ? CC_SESWPmode : CC_ZESWPmode);
3473 /* For everything else, return CCmode. */
3474 return CCmode;
3477 static unsigned
3478 aarch64_get_condition_code (rtx x)
3480 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3481 enum rtx_code comp_code = GET_CODE (x);
3483 if (GET_MODE_CLASS (mode) != MODE_CC)
3484 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3486 switch (mode)
3488 case CCFPmode:
3489 case CCFPEmode:
3490 switch (comp_code)
3492 case GE: return AARCH64_GE;
3493 case GT: return AARCH64_GT;
3494 case LE: return AARCH64_LS;
3495 case LT: return AARCH64_MI;
3496 case NE: return AARCH64_NE;
3497 case EQ: return AARCH64_EQ;
3498 case ORDERED: return AARCH64_VC;
3499 case UNORDERED: return AARCH64_VS;
3500 case UNLT: return AARCH64_LT;
3501 case UNLE: return AARCH64_LE;
3502 case UNGT: return AARCH64_HI;
3503 case UNGE: return AARCH64_PL;
3504 default: gcc_unreachable ();
3506 break;
3508 case CCmode:
3509 switch (comp_code)
3511 case NE: return AARCH64_NE;
3512 case EQ: return AARCH64_EQ;
3513 case GE: return AARCH64_GE;
3514 case GT: return AARCH64_GT;
3515 case LE: return AARCH64_LE;
3516 case LT: return AARCH64_LT;
3517 case GEU: return AARCH64_CS;
3518 case GTU: return AARCH64_HI;
3519 case LEU: return AARCH64_LS;
3520 case LTU: return AARCH64_CC;
3521 default: gcc_unreachable ();
3523 break;
3525 case CC_SWPmode:
3526 case CC_ZESWPmode:
3527 case CC_SESWPmode:
3528 switch (comp_code)
3530 case NE: return AARCH64_NE;
3531 case EQ: return AARCH64_EQ;
3532 case GE: return AARCH64_LE;
3533 case GT: return AARCH64_LT;
3534 case LE: return AARCH64_GE;
3535 case LT: return AARCH64_GT;
3536 case GEU: return AARCH64_LS;
3537 case GTU: return AARCH64_CC;
3538 case LEU: return AARCH64_CS;
3539 case LTU: return AARCH64_HI;
3540 default: gcc_unreachable ();
3542 break;
3544 case CC_NZmode:
3545 switch (comp_code)
3547 case NE: return AARCH64_NE;
3548 case EQ: return AARCH64_EQ;
3549 case GE: return AARCH64_PL;
3550 case LT: return AARCH64_MI;
3551 default: gcc_unreachable ();
3553 break;
3555 case CC_Zmode:
3556 switch (comp_code)
3558 case NE: return AARCH64_NE;
3559 case EQ: return AARCH64_EQ;
3560 default: gcc_unreachable ();
3562 break;
3564 default:
3565 gcc_unreachable ();
3566 break;
3570 static unsigned
3571 bit_count (unsigned HOST_WIDE_INT value)
3573 unsigned count = 0;
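/* Clear the lowest set bit on each iteration (value &= value - 1), so the
   loop runs once per set bit.  */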
3575 while (value)
3577 count++;
3578 value &= value - 1;
3581 return count;
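/* Print operand X to stream F, interpreted according to the operand
   modifier CODE; the supported modifiers are documented case by case
   below.  */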
3584 void
3585 aarch64_print_operand (FILE *f, rtx x, char code)
3587 switch (code)
3589 /* An integer or symbol address without a preceding # sign. */
3590 case 'c':
3591 switch (GET_CODE (x))
3593 case CONST_INT:
3594 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3595 break;
3597 case SYMBOL_REF:
3598 output_addr_const (f, x);
3599 break;
3601 case CONST:
3602 if (GET_CODE (XEXP (x, 0)) == PLUS
3603 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3605 output_addr_const (f, x);
3606 break;
3608 /* Fall through. */
3610 default:
3611 output_operand_lossage ("Unsupported operand for code '%c'", code);
3613 break;
3615 case 'e':
3616 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3618 int n;
3620 if (GET_CODE (x) != CONST_INT
3621 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3623 output_operand_lossage ("invalid operand for '%%%c'", code);
3624 return;
3627 switch (n)
3629 case 3:
3630 fputc ('b', f);
3631 break;
3632 case 4:
3633 fputc ('h', f);
3634 break;
3635 case 5:
3636 fputc ('w', f);
3637 break;
3638 default:
3639 output_operand_lossage ("invalid operand for '%%%c'", code);
3640 return;
3643 break;
3645 case 'p':
3647 int n;
3649 /* Print N such that 2^N == X. */
3650 if (GET_CODE (x) != CONST_INT || (n = exact_log2 (INTVAL (x))) < 0)
3652 output_operand_lossage ("invalid operand for '%%%c'", code);
3653 return;
3656 asm_fprintf (f, "%d", n);
3658 break;
3660 case 'P':
3661 /* Print the number of non-zero bits in X (a const_int). */
3662 if (GET_CODE (x) != CONST_INT)
3664 output_operand_lossage ("invalid operand for '%%%c'", code);
3665 return;
3668 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3669 break;
3671 case 'H':
3672 /* Print the higher numbered register of a pair (TImode) of regs. */
3673 if (GET_CODE (x) != REG || !GP_REGNUM_P (REGNO (x) + 1))
3675 output_operand_lossage ("invalid operand for '%%%c'", code);
3676 return;
3679 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3680 break;
3682 case 'm':
3683 /* Print a condition (eq, ne, etc). */
3685 /* CONST_TRUE_RTX means always -- that's the default. */
3686 if (x == const_true_rtx)
3687 return;
3689 if (!COMPARISON_P (x))
3691 output_operand_lossage ("invalid operand for '%%%c'", code);
3692 return;
3695 fputs (aarch64_condition_codes[aarch64_get_condition_code (x)], f);
3696 break;
3698 case 'M':
3699 /* Print the inverse of a condition (eq <-> ne, etc). */
3701 /* CONST_TRUE_RTX means never -- that's the default. */
3702 if (x == const_true_rtx)
3704 fputs ("nv", f);
3705 return;
3708 if (!COMPARISON_P (x))
3710 output_operand_lossage ("invalid operand for '%%%c'", code);
3711 return;
3714 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3715 (aarch64_get_condition_code (x))], f);
3716 break;
3718 case 'b':
3719 case 'h':
3720 case 's':
3721 case 'd':
3722 case 'q':
3723 /* Print a scalar FP/SIMD register name. */
3724 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3726 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3727 return;
3729 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3730 break;
3732 case 'S':
3733 case 'T':
3734 case 'U':
3735 case 'V':
3736 /* Print the first FP/SIMD register name in a list. */
3737 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3739 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3740 return;
3742 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3743 break;
3745 case 'X':
3746 /* Print bottom 16 bits of integer constant in hex. */
3747 if (GET_CODE (x) != CONST_INT)
3749 output_operand_lossage ("invalid operand for '%%%c'", code);
3750 return;
3752 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3753 break;
3755 case 'w':
3756 case 'x':
3757 /* Print a general register name or the zero register (32-bit or
3758 64-bit). */
3759 if (x == const0_rtx
3760 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3762 asm_fprintf (f, "%czr", code);
3763 break;
3766 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3768 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3769 break;
3772 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3774 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3775 break;
3778 /* Fall through */
3780 case 0:
3781 /* Print a normal operand. If it's a general register, then we
3782 assume DImode. */
3783 if (x == NULL)
3785 output_operand_lossage ("missing operand");
3786 return;
3789 switch (GET_CODE (x))
3791 case REG:
3792 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3793 break;
3795 case MEM:
3796 aarch64_memory_reference_mode = GET_MODE (x);
3797 output_address (XEXP (x, 0));
3798 break;
3800 case LABEL_REF:
3801 case SYMBOL_REF:
3802 output_addr_const (asm_out_file, x);
3803 break;
3805 case CONST_INT:
3806 asm_fprintf (f, "%wd", INTVAL (x));
3807 break;
3809 case CONST_VECTOR:
3810 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3812 gcc_assert (aarch64_const_vec_all_same_int_p (x,
3813 HOST_WIDE_INT_MIN,
3814 HOST_WIDE_INT_MAX));
3815 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3817 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3819 fputc ('0', f);
3821 else
3822 gcc_unreachable ();
3823 break;
3825 case CONST_DOUBLE:
3826 /* CONST_DOUBLE can represent a double-width integer.
3827 In this case, the mode of x is VOIDmode. */
3828 if (GET_MODE (x) == VOIDmode)
3829 ; /* Do Nothing. */
3830 else if (aarch64_float_const_zero_rtx_p (x))
3832 fputc ('0', f);
3833 break;
3835 else if (aarch64_float_const_representable_p (x))
3837 #define buf_size 20
3838 char float_buf[buf_size] = {'\0'};
3839 REAL_VALUE_TYPE r;
3840 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3841 real_to_decimal_for_mode (float_buf, &r,
3842 buf_size, buf_size,
3843 1, GET_MODE (x));
3844 asm_fprintf (asm_out_file, "%s", float_buf);
3845 break;
3846 #undef buf_size
3848 output_operand_lossage ("invalid constant");
3849 return;
3850 default:
3851 output_operand_lossage ("invalid operand");
3852 return;
3854 break;
3856 case 'A':
3857 if (GET_CODE (x) == HIGH)
3858 x = XEXP (x, 0);
3860 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3862 case SYMBOL_SMALL_GOT:
3863 asm_fprintf (asm_out_file, ":got:");
3864 break;
3866 case SYMBOL_SMALL_TLSGD:
3867 asm_fprintf (asm_out_file, ":tlsgd:");
3868 break;
3870 case SYMBOL_SMALL_TLSDESC:
3871 asm_fprintf (asm_out_file, ":tlsdesc:");
3872 break;
3874 case SYMBOL_SMALL_GOTTPREL:
3875 asm_fprintf (asm_out_file, ":gottprel:");
3876 break;
3878 case SYMBOL_SMALL_TPREL:
3879 asm_fprintf (asm_out_file, ":tprel:");
3880 break;
3882 case SYMBOL_TINY_GOT:
3883 gcc_unreachable ();
3884 break;
3886 default:
3887 break;
3889 output_addr_const (asm_out_file, x);
3890 break;
3892 case 'L':
3893 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3895 case SYMBOL_SMALL_GOT:
3896 asm_fprintf (asm_out_file, ":lo12:");
3897 break;
3899 case SYMBOL_SMALL_TLSGD:
3900 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3901 break;
3903 case SYMBOL_SMALL_TLSDESC:
3904 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3905 break;
3907 case SYMBOL_SMALL_GOTTPREL:
3908 asm_fprintf (asm_out_file, ":gottprel_lo12:");
3909 break;
3911 case SYMBOL_SMALL_TPREL:
3912 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3913 break;
3915 case SYMBOL_TINY_GOT:
3916 asm_fprintf (asm_out_file, ":got:");
3917 break;
3919 default:
3920 break;
3922 output_addr_const (asm_out_file, x);
3923 break;
3925 case 'G':
3927 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3929 case SYMBOL_SMALL_TPREL:
3930 asm_fprintf (asm_out_file, ":tprel_hi12:");
3931 break;
3932 default:
3933 break;
3935 output_addr_const (asm_out_file, x);
3936 break;
3938 default:
3939 output_operand_lossage ("invalid operand prefix '%%%c'", code);
3940 return;
3944 void
3945 aarch64_print_operand_address (FILE *f, rtx x)
3947 struct aarch64_address_info addr;
3949 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
3950 MEM, true))
3951 switch (addr.type)
3953 case ADDRESS_REG_IMM:
3954 if (addr.offset == const0_rtx)
3955 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
3956 else
3957 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
3958 INTVAL (addr.offset));
3959 return;
3961 case ADDRESS_REG_REG:
3962 if (addr.shift == 0)
3963 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
3964 reg_names [REGNO (addr.offset)]);
3965 else
3966 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
3967 reg_names [REGNO (addr.offset)], addr.shift);
3968 return;
3970 case ADDRESS_REG_UXTW:
3971 if (addr.shift == 0)
3972 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
3973 REGNO (addr.offset) - R0_REGNUM);
3974 else
3975 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
3976 REGNO (addr.offset) - R0_REGNUM, addr.shift);
3977 return;
3979 case ADDRESS_REG_SXTW:
3980 if (addr.shift == 0)
3981 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
3982 REGNO (addr.offset) - R0_REGNUM);
3983 else
3984 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
3985 REGNO (addr.offset) - R0_REGNUM, addr.shift);
3986 return;
3988 case ADDRESS_REG_WB:
3989 switch (GET_CODE (x))
3991 case PRE_INC:
3992 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
3993 GET_MODE_SIZE (aarch64_memory_reference_mode));
3994 return;
3995 case POST_INC:
3996 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
3997 GET_MODE_SIZE (aarch64_memory_reference_mode));
3998 return;
3999 case PRE_DEC:
4000 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4001 GET_MODE_SIZE (aarch64_memory_reference_mode));
4002 return;
4003 case POST_DEC:
4004 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4005 GET_MODE_SIZE (aarch64_memory_reference_mode));
4006 return;
4007 case PRE_MODIFY:
4008 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4009 INTVAL (addr.offset));
4010 return;
4011 case POST_MODIFY:
4012 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4013 INTVAL (addr.offset));
4014 return;
4015 default:
4016 break;
4018 break;
4020 case ADDRESS_LO_SUM:
4021 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4022 output_addr_const (f, addr.offset);
4023 asm_fprintf (f, "]");
4024 return;
4026 case ADDRESS_SYMBOLIC:
4027 break;
4030 output_addr_const (f, x);
4033 bool
4034 aarch64_label_mentioned_p (rtx x)
4036 const char *fmt;
4037 int i;
4039 if (GET_CODE (x) == LABEL_REF)
4040 return true;
4042 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4043 referencing instruction, but they are constant offsets, not
4044 symbols. */
4045 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4046 return false;
4048 fmt = GET_RTX_FORMAT (GET_CODE (x));
4049 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4051 if (fmt[i] == 'E')
4053 int j;
4055 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4056 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4057 return 1;
4059 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4060 return 1;
4063 return 0;
4066 /* Implement REGNO_REG_CLASS. */
4068 enum reg_class
4069 aarch64_regno_regclass (unsigned regno)
4071 if (GP_REGNUM_P (regno))
4072 return GENERAL_REGS;
4074 if (regno == SP_REGNUM)
4075 return STACK_REG;
4077 if (regno == FRAME_POINTER_REGNUM
4078 || regno == ARG_POINTER_REGNUM)
4079 return POINTER_REGS;
4081 if (FP_REGNUM_P (regno))
4082 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4084 return NO_REGS;
4087 /* Try a machine-dependent way of reloading an illegitimate address
4088 operand. If we find one, push the reload and return the new rtx. */
4091 aarch64_legitimize_reload_address (rtx *x_p,
4092 enum machine_mode mode,
4093 int opnum, int type,
4094 int ind_levels ATTRIBUTE_UNUSED)
4096 rtx x = *x_p;
4098 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4099 if (aarch64_vect_struct_mode_p (mode)
4100 && GET_CODE (x) == PLUS
4101 && REG_P (XEXP (x, 0))
4102 && CONST_INT_P (XEXP (x, 1)))
4104 rtx orig_rtx = x;
4105 x = copy_rtx (x);
4106 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4107 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4108 opnum, (enum reload_type) type);
4109 return x;
4112 /* We must recognize output that we have already generated ourselves. */
4113 if (GET_CODE (x) == PLUS
4114 && GET_CODE (XEXP (x, 0)) == PLUS
4115 && REG_P (XEXP (XEXP (x, 0), 0))
4116 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4117 && CONST_INT_P (XEXP (x, 1)))
4119 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4120 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4121 opnum, (enum reload_type) type);
4122 return x;
4125 /* We wish to handle large displacements off a base register by splitting
4126 the addend across an add and the mem insn. This can cut the number of
4127 extra insns needed from 3 to 1. It is only useful for load/store of a
4128 single register with a 12-bit offset field. */
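/* Sketch (illustrative only; register numbers invented): for a DImode
   access at x0 + 0x13008 the offset is split as high == 0x13000 (an
   aarch64_uimm12_shift value) and low == 8; the high part is reloaded into
   the base register and the mem keeps the small offset, e.g.

	add	x1, x0, 0x13000
	ldr	x2, [x1, 8]
  */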
4129 if (GET_CODE (x) == PLUS
4130 && REG_P (XEXP (x, 0))
4131 && CONST_INT_P (XEXP (x, 1))
4132 && HARD_REGISTER_P (XEXP (x, 0))
4133 && mode != TImode
4134 && mode != TFmode
4135 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4137 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4138 HOST_WIDE_INT low = val & 0xfff;
4139 HOST_WIDE_INT high = val - low;
4140 HOST_WIDE_INT offs;
4141 rtx cst;
4142 enum machine_mode xmode = GET_MODE (x);
4144 /* In ILP32, xmode can be either DImode or SImode. */
4145 gcc_assert (xmode == DImode || xmode == SImode);
4147 /* Punt on BLKmode: we cannot ascertain BLKmode alignment, so let any
4148 non-zero offset be reloaded in full rather than split here. */
4149 if (GET_MODE_SIZE (mode) == 0)
4150 return NULL_RTX;
4152 offs = low % GET_MODE_SIZE (mode);
4154 /* Align misaligned offset by adjusting high part to compensate. */
4155 if (offs != 0)
4157 if (aarch64_uimm12_shift (high + offs))
4159 /* Align down. */
4160 low = low - offs;
4161 high = high + offs;
4163 else
4165 /* Align up. */
4166 offs = GET_MODE_SIZE (mode) - offs;
4167 low = low + offs;
4168 high = high + (low & 0x1000) - offs;
4169 low &= 0xfff;
4173 /* Check for overflow. */
4174 if (high + low != val)
4175 return NULL_RTX;
4177 cst = GEN_INT (high);
4178 if (!aarch64_uimm12_shift (high))
4179 cst = force_const_mem (xmode, cst);
4181 /* Reload high part into base reg, leaving the low part
4182 in the mem instruction.
4183 Note that replacing this gen_rtx_PLUS with plus_constant is
4184 wrong in this case because we rely on the
4185 (plus (plus reg c1) c2) structure being preserved so that
4186 XEXP (*p, 0) in push_reload below uses the correct term. */
4187 x = gen_rtx_PLUS (xmode,
4188 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4189 GEN_INT (low));
4191 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4192 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4193 opnum, (enum reload_type) type);
4194 return x;
4197 return NULL_RTX;
4201 static reg_class_t
4202 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4203 reg_class_t rclass,
4204 enum machine_mode mode,
4205 secondary_reload_info *sri)
4207 /* Without the TARGET_SIMD instructions we cannot move a Q register
4208 to a Q register directly. We need a scratch. */
4209 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4210 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4211 && reg_class_subset_p (rclass, FP_REGS))
4213 if (mode == TFmode)
4214 sri->icode = CODE_FOR_aarch64_reload_movtf;
4215 else if (mode == TImode)
4216 sri->icode = CODE_FOR_aarch64_reload_movti;
4217 return NO_REGS;
4220 /* A TFmode or TImode memory access should be handled via an FP_REG
4221 because AArch64 has richer addressing modes for LDR/STR instructions
4222 than LDP/STP instructions. */
4223 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4224 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4225 return FP_REGS;
4227 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4228 return GENERAL_REGS;
4230 return NO_REGS;
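/* Illustrative note (an assumption for exposition, not from the original
   source): when AdvSIMD is disabled but FP registers are still enabled
   (e.g. +nosimd), a copy such as (set (reg:TI v1) (reg:TI v0)) cannot be
   a single Q-register move, so the hook above selects
   aarch64_reload_movti, which performs the copy with the help of a
   general-register scratch.  */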
4233 static bool
4234 aarch64_can_eliminate (const int from, const int to)
4236 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4237 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4239 if (frame_pointer_needed)
4241 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4242 return true;
4243 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4244 return false;
4245 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4246 && !cfun->calls_alloca)
4247 return true;
4248 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4249 return true;
4251 return false;
4254 return true;
4257 HOST_WIDE_INT
4258 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4260 aarch64_layout_frame ();
4262 if (to == HARD_FRAME_POINTER_REGNUM)
4264 if (from == ARG_POINTER_REGNUM)
4265 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4267 if (from == FRAME_POINTER_REGNUM)
4268 return (cfun->machine->frame.hard_fp_offset
4269 - cfun->machine->frame.saved_varargs_size);
4272 if (to == STACK_POINTER_REGNUM)
4274 if (from == FRAME_POINTER_REGNUM)
4275 return (cfun->machine->frame.frame_size
4276 - cfun->machine->frame.saved_varargs_size);
4279 return cfun->machine->frame.frame_size;
4282 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4283 previous frame. */
4286 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4288 if (count != 0)
4289 return const0_rtx;
4290 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4294 static void
4295 aarch64_asm_trampoline_template (FILE *f)
4297 if (TARGET_ILP32)
4299 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4300 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4302 else
4304 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4305 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4307 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4308 assemble_aligned_integer (4, const0_rtx);
4309 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4310 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
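/* Rough sketch of the LP64 trampoline laid out above (an illustration; it
   assumes IP1 is x17 and the static chain register is x18, which may
   differ between configurations):

       ldr   x17, .+16            // target function address
       ldr   x18, .+20            // static chain value
       br    x17
       .word 0                    // pad the code part to 16 bytes
       .xword <function address>  // filled in by aarch64_trampoline_init
       .xword <static chain>      // likewise
   */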
4313 static void
4314 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4316 rtx fnaddr, mem, a_tramp;
4317 const int tramp_code_sz = 16;
4319 /* Don't need to copy the trailing D-words, we fill those in below. */
4320 emit_block_move (m_tramp, assemble_trampoline_template (),
4321 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4322 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4323 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4324 if (GET_MODE (fnaddr) != ptr_mode)
4325 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4326 emit_move_insn (mem, fnaddr);
4328 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4329 emit_move_insn (mem, chain_value);
4331 /* XXX We should really define a "clear_cache" pattern and use
4332 gen_clear_cache(). */
4333 a_tramp = XEXP (m_tramp, 0);
4334 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4335 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4336 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4337 ptr_mode);
4340 static unsigned char
4341 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4343 switch (regclass)
4345 case CALLER_SAVE_REGS:
4346 case POINTER_REGS:
4347 case GENERAL_REGS:
4348 case ALL_REGS:
4349 case FP_REGS:
4350 case FP_LO_REGS:
4351 return
4352 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4353 (GET_MODE_SIZE (mode) + 7) / 8;
4354 case STACK_REG:
4355 return 1;
4357 case NO_REGS:
4358 return 0;
4360 default:
4361 break;
4363 gcc_unreachable ();
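/* Worked example (illustrative, not from the original source): a 16-byte
   scalar such as TImode needs (16 + 7) / 8 = 2 registers in GENERAL_REGS,
   while a 16-byte vector mode such as V4SImode needs (16 + 15) / 16 = 1,
   i.e. a single Q register, in FP_REGS.  */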
4366 static reg_class_t
4367 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4369 if (regclass == POINTER_REGS)
4370 return GENERAL_REGS;
4372 if (regclass == STACK_REG)
4374 if (REG_P (x)
4375 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4376 return regclass;
4378 return NO_REGS;
4381 /* If it's an integer immediate that MOVI can't handle, then
4382 FP_REGS is not an option, so we return NO_REGS instead. */
4383 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4384 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4385 return NO_REGS;
4387 /* Register elimination can result in a request for
4388 SP+constant->FP_REGS. We cannot support such operations, which
4389 use SP as source and an FP_REG as destination, so reject them
4390 outright. */
4391 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4393 rtx lhs = XEXP (x, 0);
4395 /* Look through a possible SUBREG introduced by ILP32. */
4396 if (GET_CODE (lhs) == SUBREG)
4397 lhs = SUBREG_REG (lhs);
4399 gcc_assert (REG_P (lhs));
4400 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4401 POINTER_REGS));
4402 return NO_REGS;
4405 return regclass;
4408 void
4409 aarch64_asm_output_labelref (FILE* f, const char *name)
4411 asm_fprintf (f, "%U%s", name);
4414 static void
4415 aarch64_elf_asm_constructor (rtx symbol, int priority)
4417 if (priority == DEFAULT_INIT_PRIORITY)
4418 default_ctor_section_asm_out_constructor (symbol, priority);
4419 else
4421 section *s;
4422 char buf[18];
4423 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4424 s = get_section (buf, SECTION_WRITE, NULL);
4425 switch_to_section (s);
4426 assemble_align (POINTER_SIZE);
4427 assemble_aligned_integer (POINTER_BYTES, symbol);
4431 static void
4432 aarch64_elf_asm_destructor (rtx symbol, int priority)
4434 if (priority == DEFAULT_INIT_PRIORITY)
4435 default_dtor_section_asm_out_destructor (symbol, priority);
4436 else
4438 section *s;
4439 char buf[18];
4440 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4441 s = get_section (buf, SECTION_WRITE, NULL);
4442 switch_to_section (s);
4443 assemble_align (POINTER_SIZE);
4444 assemble_aligned_integer (POINTER_BYTES, symbol);
4448 const char*
4449 aarch64_output_casesi (rtx *operands)
4451 char buf[100];
4452 char label[100];
4453 rtx diff_vec = PATTERN (NEXT_INSN (operands[2]));
4454 int index;
4455 static const char *const patterns[4][2] =
4458 "ldrb\t%w3, [%0,%w1,uxtw]",
4459 "add\t%3, %4, %w3, sxtb #2"
4462 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4463 "add\t%3, %4, %w3, sxth #2"
4466 "ldr\t%w3, [%0,%w1,uxtw #2]",
4467 "add\t%3, %4, %w3, sxtw #2"
4469 /* We assume that DImode is only generated when not optimizing and
4470 that we don't really need 64-bit address offsets. That would
4471 imply an object file with 8GB of code in a single function! */
4473 "ldr\t%w3, [%0,%w1,uxtw #2]",
4474 "add\t%3, %4, %w3, sxtw #2"
4478 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4480 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4482 gcc_assert (index >= 0 && index <= 3);
4484 /* Need to implement table size reduction, by changing the code below. */
4485 output_asm_insn (patterns[index][0], operands);
4486 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4487 snprintf (buf, sizeof (buf),
4488 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4489 output_asm_insn (buf, operands);
4490 output_asm_insn (patterns[index][1], operands);
4491 output_asm_insn ("br\t%3", operands);
4492 assemble_label (asm_out_file, label);
4493 return "";
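/* For a table of SImode entries the sequence emitted above looks roughly
   like this (an illustrative sketch, not from the original source):

       ldr   w3, [x0, w1, uxtw #2]   // load the table entry
       adr   x4, .Lrtx<N>            // anchor label emitted below
       add   x3, x4, w3, sxtw #2     // entries are word-scaled offsets
       br    x3
   .Lrtx<N>:

   where operand 0 is the table base, operand 1 the index and operands 3
   and 4 are scratch registers.  */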
4497 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4498 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4499 operator. */
4502 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4504 if (shift >= 0 && shift <= 3)
4506 int size;
4507 for (size = 8; size <= 32; size *= 2)
4509 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4510 if (mask == bits << shift)
4511 return size;
4514 return 0;
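/* Example (illustrative, not from the original source):
   aarch64_uxt_size (1, 0x1fe) returns 8 because 0x1fe == 0xff << 1, i.e.
   an 8-bit field shifted left by one, which matches a UXTB operand; a
   mask that is not an 8/16/32-bit field at the given shift yields 0.  */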
4517 static bool
4518 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4519 const_rtx x ATTRIBUTE_UNUSED)
4521 /* We can't use blocks for constants when we're using a per-function
4522 constant pool. */
4523 return false;
4526 static section *
4527 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4528 rtx x ATTRIBUTE_UNUSED,
4529 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4531 /* Force all constant pool entries into the current function section. */
4532 return function_section (current_function_decl);
4536 /* Costs. */
4538 /* Helper function for rtx cost calculation. Strip a shift expression
4539 from X. Returns the inner operand if successful, or the original
4540 expression on failure. */
4541 static rtx
4542 aarch64_strip_shift (rtx x)
4544 rtx op = x;
4546 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4547 we can convert both to ROR during final output. */
4548 if ((GET_CODE (op) == ASHIFT
4549 || GET_CODE (op) == ASHIFTRT
4550 || GET_CODE (op) == LSHIFTRT
4551 || GET_CODE (op) == ROTATERT
4552 || GET_CODE (op) == ROTATE)
4553 && CONST_INT_P (XEXP (op, 1)))
4554 return XEXP (op, 0);
4556 if (GET_CODE (op) == MULT
4557 && CONST_INT_P (XEXP (op, 1))
4558 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4559 return XEXP (op, 0);
4561 return x;
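/* For instance (illustrative, not from the original source), both
   (ashift (reg) (const_int 2)) and its canonical multiply form
   (mult (reg) (const_int 4)) are stripped down to (reg), since either
   can be folded into the shifted-register form of the outer operation.  */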
4564 /* Helper function for rtx cost calculation. Strip an extend
4565 expression from X. Returns the inner operand if successful, or the
4566 original expression on failure. We deal with a number of possible
4567 canonicalization variations here. */
4568 static rtx
4569 aarch64_strip_extend (rtx x)
4571 rtx op = x;
4573 /* Zero and sign extraction of a widened value. */
4574 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4575 && XEXP (op, 2) == const0_rtx
4576 && GET_CODE (XEXP (op, 0)) == MULT
4577 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4578 XEXP (op, 1)))
4579 return XEXP (XEXP (op, 0), 0);
4581 /* It can also be represented (for zero-extend) as an AND with an
4582 immediate. */
4583 if (GET_CODE (op) == AND
4584 && GET_CODE (XEXP (op, 0)) == MULT
4585 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4586 && CONST_INT_P (XEXP (op, 1))
4587 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4588 INTVAL (XEXP (op, 1))) != 0)
4589 return XEXP (XEXP (op, 0), 0);
4591 /* Now handle extended register, as this may also have an optional
4592 left shift by 1..4. */
4593 if (GET_CODE (op) == ASHIFT
4594 && CONST_INT_P (XEXP (op, 1))
4595 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4596 op = XEXP (op, 0);
4598 if (GET_CODE (op) == ZERO_EXTEND
4599 || GET_CODE (op) == SIGN_EXTEND)
4600 op = XEXP (op, 0);
4602 if (op != x)
4603 return op;
4605 return x;
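/* For instance (illustrative, not from the original source),
   (ashift (zero_extend:DI (reg:SI)) (const_int 2)) strips down to
   (reg:SI): both the extend and the small left shift can be absorbed by
   the extended-register forms of ADD/SUB, so only the inner register
   needs to be costed.  */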
4608 /* Helper function for rtx cost calculation. Calculate the cost of
4609 a MULT, which may be part of a multiply-accumulate rtx. Return
4610 the calculated cost of the expression, recursing manually in to
4611 operands where needed. */
4613 static int
4614 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4616 rtx op0, op1;
4617 const struct cpu_cost_table *extra_cost
4618 = aarch64_tune_params->insn_extra_cost;
4619 int cost = 0;
4620 bool maybe_fma = (outer == PLUS || outer == MINUS);
4621 enum machine_mode mode = GET_MODE (x);
4623 gcc_checking_assert (code == MULT);
4625 op0 = XEXP (x, 0);
4626 op1 = XEXP (x, 1);
4628 if (VECTOR_MODE_P (mode))
4629 mode = GET_MODE_INNER (mode);
4631 /* Integer multiply/fma. */
4632 if (GET_MODE_CLASS (mode) == MODE_INT)
4634 /* The multiply will be canonicalized as a shift, so cost it as such. */
4635 if (CONST_INT_P (op1)
4636 && exact_log2 (INTVAL (op1)) > 0)
4638 if (speed)
4640 if (maybe_fma)
4641 /* ADD (shifted register). */
4642 cost += extra_cost->alu.arith_shift;
4643 else
4644 /* LSL (immediate). */
4645 cost += extra_cost->alu.shift;
4648 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4650 return cost;
4653 /* Integer multiplies or FMAs have zero/sign extending variants. */
4654 if ((GET_CODE (op0) == ZERO_EXTEND
4655 && GET_CODE (op1) == ZERO_EXTEND)
4656 || (GET_CODE (op0) == SIGN_EXTEND
4657 && GET_CODE (op1) == SIGN_EXTEND))
4659 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4660 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4662 if (speed)
4664 if (maybe_fma)
4665 /* MADD/SMADDL/UMADDL. */
4666 cost += extra_cost->mult[0].extend_add;
4667 else
4668 /* MUL/SMULL/UMULL. */
4669 cost += extra_cost->mult[0].extend;
4672 return cost;
4675 /* This is either an integer multiply or an FMA. In both cases
4676 we want to recurse and cost the operands. */
4677 cost += rtx_cost (op0, MULT, 0, speed)
4678 + rtx_cost (op1, MULT, 1, speed);
4680 if (speed)
4682 if (maybe_fma)
4683 /* MADD. */
4684 cost += extra_cost->mult[mode == DImode].add;
4685 else
4686 /* MUL. */
4687 cost += extra_cost->mult[mode == DImode].simple;
4690 return cost;
4692 else
4694 if (speed)
4696 /* Floating-point FMA/FMUL can also support negations of the
4697 operands. */
4698 if (GET_CODE (op0) == NEG)
4699 op0 = XEXP (op0, 0);
4700 if (GET_CODE (op1) == NEG)
4701 op1 = XEXP (op1, 0);
4703 if (maybe_fma)
4704 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4705 cost += extra_cost->fp[mode == DFmode].fma;
4706 else
4707 /* FMUL/FNMUL. */
4708 cost += extra_cost->fp[mode == DFmode].mult;
4711 cost += rtx_cost (op0, MULT, 0, speed)
4712 + rtx_cost (op1, MULT, 1, speed);
4713 return cost;
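/* For example (illustrative, not from the original source), costing
   (plus (mult (reg) (reg)) (reg)) reaches the code above with
   outer == PLUS, so the multiply is priced as the MADD part of a
   multiply-accumulate rather than as a separate MUL followed by ADD.  */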
4717 static int
4718 aarch64_address_cost (rtx x,
4719 enum machine_mode mode,
4720 addr_space_t as ATTRIBUTE_UNUSED,
4721 bool speed)
4723 enum rtx_code c = GET_CODE (x);
4724 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4725 struct aarch64_address_info info;
4726 int cost = 0;
4727 info.shift = 0;
4729 if (!aarch64_classify_address (&info, x, mode, c, false))
4731 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4733 /* This is a CONST or SYMBOL ref which will be split
4734 in a different way depending on the code model in use.
4735 Cost it through the generic infrastructure. */
4736 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4737 /* Divide through by the cost of one instruction to
4738 bring it to the same units as the address costs. */
4739 cost_symbol_ref /= COSTS_N_INSNS (1);
4740 /* The cost is then the cost of preparing the address,
4741 followed by an immediate (possibly 0) offset. */
4742 return cost_symbol_ref + addr_cost->imm_offset;
4744 else
4746 /* This is most likely a jump table from a case
4747 statement. */
4748 return addr_cost->register_offset;
4752 switch (info.type)
4754 case ADDRESS_LO_SUM:
4755 case ADDRESS_SYMBOLIC:
4756 case ADDRESS_REG_IMM:
4757 cost += addr_cost->imm_offset;
4758 break;
4760 case ADDRESS_REG_WB:
4761 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4762 cost += addr_cost->pre_modify;
4763 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4764 cost += addr_cost->post_modify;
4765 else
4766 gcc_unreachable ();
4768 break;
4770 case ADDRESS_REG_REG:
4771 cost += addr_cost->register_offset;
4772 break;
4774 case ADDRESS_REG_UXTW:
4775 case ADDRESS_REG_SXTW:
4776 cost += addr_cost->register_extend;
4777 break;
4779 default:
4780 gcc_unreachable ();
4784 if (info.shift > 0)
4786 /* For the sake of calculating the cost of the shifted register
4787 component, we can treat same sized modes in the same way. */
4788 switch (GET_MODE_BITSIZE (mode))
4790 case 16:
4791 cost += addr_cost->addr_scale_costs.hi;
4792 break;
4794 case 32:
4795 cost += addr_cost->addr_scale_costs.si;
4796 break;
4798 case 64:
4799 cost += addr_cost->addr_scale_costs.di;
4800 break;
4802 /* We can't tell, or this is a 128-bit vector. */
4803 default:
4804 cost += addr_cost->addr_scale_costs.ti;
4805 break;
4809 return cost;
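/* Example (illustrative, not from the original source): an SImode access
   through (plus (reg) (mult (reg) (const_int 4))), i.e. [x0, x1, lsl #2],
   costs register_offset plus the SImode scale cost addr_scale_costs.si,
   whereas a plain [x0, #imm] access costs only imm_offset.  */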
4812 /* Return true if the RTX X in mode MODE is a zero or sign extract
4813 usable in an ADD or SUB (extended register) instruction. */
4814 static bool
4815 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4817 /* Catch add with a sign extract.
4818 This is add_<optab><mode>_multp2. */
4819 if (GET_CODE (x) == SIGN_EXTRACT
4820 || GET_CODE (x) == ZERO_EXTRACT)
4822 rtx op0 = XEXP (x, 0);
4823 rtx op1 = XEXP (x, 1);
4824 rtx op2 = XEXP (x, 2);
4826 if (GET_CODE (op0) == MULT
4827 && CONST_INT_P (op1)
4828 && op2 == const0_rtx
4829 && CONST_INT_P (XEXP (op0, 1))
4830 && aarch64_is_extend_from_extract (mode,
4831 XEXP (op0, 1),
4832 op1))
4834 return true;
4838 return false;
4841 static bool
4842 aarch64_frint_unspec_p (unsigned int u)
4844 switch (u)
4846 case UNSPEC_FRINTZ:
4847 case UNSPEC_FRINTP:
4848 case UNSPEC_FRINTM:
4849 case UNSPEC_FRINTA:
4850 case UNSPEC_FRINTN:
4851 case UNSPEC_FRINTX:
4852 case UNSPEC_FRINTI:
4853 return true;
4855 default:
4856 return false;
4860 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4861 storing it in *COST. Result is true if the total cost of the operation
4862 has now been calculated. */
4863 static bool
4864 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4866 rtx inner;
4867 rtx comparator;
4868 enum rtx_code cmpcode;
4870 if (COMPARISON_P (op0))
4872 inner = XEXP (op0, 0);
4873 comparator = XEXP (op0, 1);
4874 cmpcode = GET_CODE (op0);
4876 else
4878 inner = op0;
4879 comparator = const0_rtx;
4880 cmpcode = NE;
4883 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
4885 /* Conditional branch. */
4886 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4887 return true;
4888 else
4890 if (cmpcode == NE || cmpcode == EQ)
4892 if (comparator == const0_rtx)
4894 /* TBZ/TBNZ/CBZ/CBNZ. */
4895 if (GET_CODE (inner) == ZERO_EXTRACT)
4896 /* TBZ/TBNZ. */
4897 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
4898 0, speed);
4899 else
4900 /* CBZ/CBNZ. */
4901 *cost += rtx_cost (inner, cmpcode, 0, speed);
4903 return true;
4906 else if (cmpcode == LT || cmpcode == GE)
4908 /* TBZ/TBNZ. */
4909 if (comparator == const0_rtx)
4910 return true;
4914 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4916 /* It's a conditional operation based on the status flags,
4917 so it must be some flavor of CSEL. */
4919 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
4920 if (GET_CODE (op1) == NEG
4921 || GET_CODE (op1) == NOT
4922 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
4923 op1 = XEXP (op1, 0);
4925 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
4926 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
4927 return true;
4930 /* We don't know what this is; cost all operands. */
4931 return false;
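/* Examples of the shapes handled above (illustrative, not from the
   original source):

       (if_then_else (ne (reg) (const_int 0)) (label_ref ...) (pc))
	  -- a CBNZ-style conditional branch on a register,
       (if_then_else (eq (reg:CC CC_REGNUM) (const_int 0)) (reg) (reg))
	  -- a CSEL-style conditional select on the status flags.  */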
4934 /* Calculate the cost of calculating X, storing it in *COST. Result
4935 is true if the total cost of the operation has now been calculated. */
4936 static bool
4937 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
4938 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
4940 rtx op0, op1, op2;
4941 const struct cpu_cost_table *extra_cost
4942 = aarch64_tune_params->insn_extra_cost;
4943 enum machine_mode mode = GET_MODE (x);
4945 /* By default, assume that everything has equivalent cost to the
4946 cheapest instruction. Any additional costs are applied as a delta
4947 above this default. */
4948 *cost = COSTS_N_INSNS (1);
4950 /* TODO: The cost infrastructure currently does not handle
4951 vector operations. Assume that all vector operations
4952 are equally expensive. */
4953 if (VECTOR_MODE_P (mode))
4955 if (speed)
4956 *cost += extra_cost->vect.alu;
4957 return true;
4960 switch (code)
4962 case SET:
4963 /* The cost depends entirely on the operands to SET. */
4964 *cost = 0;
4965 op0 = SET_DEST (x);
4966 op1 = SET_SRC (x);
4968 switch (GET_CODE (op0))
4970 case MEM:
4971 if (speed)
4973 rtx address = XEXP (op0, 0);
4974 if (GET_MODE_CLASS (mode) == MODE_INT)
4975 *cost += extra_cost->ldst.store;
4976 else if (mode == SFmode)
4977 *cost += extra_cost->ldst.storef;
4978 else if (mode == DFmode)
4979 *cost += extra_cost->ldst.stored;
4981 *cost +=
4982 COSTS_N_INSNS (aarch64_address_cost (address, mode,
4983 0, speed));
4986 *cost += rtx_cost (op1, SET, 1, speed);
4987 return true;
4989 case SUBREG:
4990 if (! REG_P (SUBREG_REG (op0)))
4991 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
4993 /* Fall through. */
4994 case REG:
4995 /* const0_rtx is in general free, but we will use an
4996 instruction to set a register to 0. */
4997 if (REG_P (op1) || op1 == const0_rtx)
4999 /* The cost is 1 per register copied. */
5000 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5001 / UNITS_PER_WORD;
5002 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5004 else
5005 /* Cost is just the cost of the RHS of the set. */
5006 *cost += rtx_cost (op1, SET, 1, speed);
5007 return true;
5009 case ZERO_EXTRACT:
5010 case SIGN_EXTRACT:
5011 /* Bit-field insertion. Strip any redundant widening of
5012 the RHS to meet the width of the target. */
5013 if (GET_CODE (op1) == SUBREG)
5014 op1 = SUBREG_REG (op1);
5015 if ((GET_CODE (op1) == ZERO_EXTEND
5016 || GET_CODE (op1) == SIGN_EXTEND)
5017 && GET_CODE (XEXP (op0, 1)) == CONST_INT
5018 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5019 >= INTVAL (XEXP (op0, 1))))
5020 op1 = XEXP (op1, 0);
5022 if (CONST_INT_P (op1))
5024 /* MOV immediate is assumed to always be cheap. */
5025 *cost = COSTS_N_INSNS (1);
5027 else
5029 /* BFM. */
5030 if (speed)
5031 *cost += extra_cost->alu.bfi;
5032 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5035 return true;
5037 default:
5038 /* We can't make sense of this, assume default cost. */
5039 *cost = COSTS_N_INSNS (1);
5040 return false;
5042 return false;
5044 case CONST_INT:
5045 /* If an instruction can incorporate a constant within the
5046 instruction, the instruction's expression avoids calling
5047 rtx_cost() on the constant. If rtx_cost() is called on a
5048 constant, then it is usually because the constant must be
5049 moved into a register by one or more instructions.
5051 The exception is constant 0, which can be expressed
5052 as XZR/WZR and is therefore free. The exception to this is
5053 if we have (set (reg) (const0_rtx)) in which case we must cost
5054 the move. However, we can catch that when we cost the SET, so
5055 we don't need to consider that here. */
5056 if (x == const0_rtx)
5057 *cost = 0;
5058 else
5060 /* To an approximation, building any other constant is
5061 proportionally expensive to the number of instructions
5062 required to build that constant. This is true whether we
5063 are compiling for SPEED or otherwise. */
5064 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5065 INTVAL (x),
5066 false));
5068 return true;
5070 case CONST_DOUBLE:
5071 if (speed)
5073 /* mov[df,sf]_aarch64. */
5074 if (aarch64_float_const_representable_p (x))
5075 /* FMOV (scalar immediate). */
5076 *cost += extra_cost->fp[mode == DFmode].fpconst;
5077 else if (!aarch64_float_const_zero_rtx_p (x))
5079 /* This will be a load from memory. */
5080 if (mode == DFmode)
5081 *cost += extra_cost->ldst.loadd;
5082 else
5083 *cost += extra_cost->ldst.loadf;
5085 else
5086 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5087 or MOV v0.s[0], wzr - neither of which is modeled by the
5088 cost tables. Just use the default cost. */
5093 return true;
5095 case MEM:
5096 if (speed)
5098 /* For loads we want the base cost of a load, plus an
5099 approximation for the additional cost of the addressing
5100 mode. */
5101 rtx address = XEXP (x, 0);
5102 if (GET_MODE_CLASS (mode) == MODE_INT)
5103 *cost += extra_cost->ldst.load;
5104 else if (mode == SFmode)
5105 *cost += extra_cost->ldst.loadf;
5106 else if (mode == DFmode)
5107 *cost += extra_cost->ldst.loadd;
5109 *cost +=
5110 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5111 0, speed));
5114 return true;
5116 case NEG:
5117 op0 = XEXP (x, 0);
5119 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5121 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5122 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5124 /* CSETM. */
5125 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5126 return true;
5129 /* Cost this as SUB wzr, X. */
5130 op0 = CONST0_RTX (GET_MODE (x));
5131 op1 = XEXP (x, 0);
5132 goto cost_minus;
5135 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5137 /* Support (neg(fma...)) as a single instruction only if
5138 sign of zeros is unimportant. This matches the decision
5139 making in aarch64.md. */
5140 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5142 /* FNMADD. */
5143 *cost = rtx_cost (op0, NEG, 0, speed);
5144 return true;
5146 if (speed)
5147 /* FNEG. */
5148 *cost += extra_cost->fp[mode == DFmode].neg;
5149 return false;
5152 return false;
5154 case CLRSB:
5155 case CLZ:
5156 if (speed)
5157 *cost += extra_cost->alu.clz;
5159 return false;
5161 case COMPARE:
5162 op0 = XEXP (x, 0);
5163 op1 = XEXP (x, 1);
5165 if (op1 == const0_rtx
5166 && GET_CODE (op0) == AND)
5168 x = op0;
5169 goto cost_logic;
5172 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5174 /* TODO: A write to the CC flags possibly costs extra, this
5175 needs encoding in the cost tables. */
5177 /* CC_ZESWPmode supports zero extend for free. */
5178 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5179 op0 = XEXP (op0, 0);
5181 /* ANDS. */
5182 if (GET_CODE (op0) == AND)
5184 x = op0;
5185 goto cost_logic;
5188 if (GET_CODE (op0) == PLUS)
5190 /* ADDS (and CMN alias). */
5191 x = op0;
5192 goto cost_plus;
5195 if (GET_CODE (op0) == MINUS)
5197 /* SUBS. */
5198 x = op0;
5199 goto cost_minus;
5202 if (GET_CODE (op1) == NEG)
5204 /* CMN. */
5205 if (speed)
5206 *cost += extra_cost->alu.arith;
5208 *cost += rtx_cost (op0, COMPARE, 0, speed);
5209 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5210 return true;
5213 /* CMP.
5215 Compare can freely swap the order of operands, and
5216 canonicalization puts the more complex operation first.
5217 But the integer MINUS logic expects the shift/extend
5218 operation in op1. */
5219 if (! (REG_P (op0)
5220 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5222 op0 = XEXP (x, 1);
5223 op1 = XEXP (x, 0);
5225 goto cost_minus;
5228 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5230 /* FCMP. */
5231 if (speed)
5232 *cost += extra_cost->fp[mode == DFmode].compare;
5234 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5236 /* FCMP supports constant 0.0 for no extra cost. */
5237 return true;
5239 return false;
5242 return false;
5244 case MINUS:
5246 op0 = XEXP (x, 0);
5247 op1 = XEXP (x, 1);
5249 cost_minus:
5250 /* Detect valid immediates. */
5251 if ((GET_MODE_CLASS (mode) == MODE_INT
5252 || (GET_MODE_CLASS (mode) == MODE_CC
5253 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5254 && CONST_INT_P (op1)
5255 && aarch64_uimm12_shift (INTVAL (op1)))
5257 *cost += rtx_cost (op0, MINUS, 0, speed);
5259 if (speed)
5260 /* SUB(S) (immediate). */
5261 *cost += extra_cost->alu.arith;
5262 return true;
5266 /* Look for SUB (extended register). */
5267 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5269 if (speed)
5270 *cost += extra_cost->alu.arith_shift;
5272 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5273 (enum rtx_code) GET_CODE (op1),
5274 0, speed);
5275 return true;
5278 rtx new_op1 = aarch64_strip_extend (op1);
5280 /* Cost this as an FMA-alike operation. */
5281 if ((GET_CODE (new_op1) == MULT
5282 || GET_CODE (new_op1) == ASHIFT)
5283 && code != COMPARE)
5285 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5286 (enum rtx_code) code,
5287 speed);
5288 *cost += rtx_cost (op0, MINUS, 0, speed);
5289 return true;
5292 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5294 if (speed)
5296 if (GET_MODE_CLASS (mode) == MODE_INT)
5297 /* SUB(S). */
5298 *cost += extra_cost->alu.arith;
5299 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5300 /* FSUB. */
5301 *cost += extra_cost->fp[mode == DFmode].addsub;
5303 return true;
5306 case PLUS:
5308 rtx new_op0;
5310 op0 = XEXP (x, 0);
5311 op1 = XEXP (x, 1);
5313 cost_plus:
5314 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5315 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5317 /* CSINC. */
5318 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5319 *cost += rtx_cost (op1, PLUS, 1, speed);
5320 return true;
5323 if (GET_MODE_CLASS (mode) == MODE_INT
5324 && CONST_INT_P (op1)
5325 && aarch64_uimm12_shift (INTVAL (op1)))
5327 *cost += rtx_cost (op0, PLUS, 0, speed);
5329 if (speed)
5330 /* ADD (immediate). */
5331 *cost += extra_cost->alu.arith;
5332 return true;
5335 /* Look for ADD (extended register). */
5336 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5338 if (speed)
5339 *cost += extra_cost->alu.arith_shift;
5341 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5342 (enum rtx_code) GET_CODE (op0),
5343 0, speed);
5344 return true;
5347 /* Strip any extend, leave shifts behind as we will
5348 cost them through mult_cost. */
5349 new_op0 = aarch64_strip_extend (op0);
5351 if (GET_CODE (new_op0) == MULT
5352 || GET_CODE (new_op0) == ASHIFT)
5354 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5355 speed);
5356 *cost += rtx_cost (op1, PLUS, 1, speed);
5357 return true;
5360 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5361 + rtx_cost (op1, PLUS, 1, speed));
5363 if (speed)
5365 if (GET_MODE_CLASS (mode) == MODE_INT)
5366 /* ADD. */
5367 *cost += extra_cost->alu.arith;
5368 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5369 /* FADD. */
5370 *cost += extra_cost->fp[mode == DFmode].addsub;
5372 return true;
5375 case BSWAP:
5376 *cost = COSTS_N_INSNS (1);
5378 if (speed)
5379 *cost += extra_cost->alu.rev;
5381 return false;
5383 case IOR:
5384 if (aarch_rev16_p (x))
5386 *cost = COSTS_N_INSNS (1);
5388 if (speed)
5389 *cost += extra_cost->alu.rev;
5391 return true;
5393 /* Fall through. */
5394 case XOR:
5395 case AND:
5396 cost_logic:
5397 op0 = XEXP (x, 0);
5398 op1 = XEXP (x, 1);
5400 if (code == AND
5401 && GET_CODE (op0) == MULT
5402 && CONST_INT_P (XEXP (op0, 1))
5403 && CONST_INT_P (op1)
5404 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5405 INTVAL (op1)) != 0)
5407 /* This is a UBFM/SBFM. */
5408 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5409 if (speed)
5410 *cost += extra_cost->alu.bfx;
5411 return true;
5414 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5416 /* We possibly get the immediate for free; this is not
5417 modelled. */
5418 if (CONST_INT_P (op1)
5419 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5421 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5423 if (speed)
5424 *cost += extra_cost->alu.logical;
5426 return true;
5428 else
5430 rtx new_op0 = op0;
5432 /* Handle ORN, EON, or BIC. */
5433 if (GET_CODE (op0) == NOT)
5434 op0 = XEXP (op0, 0);
5436 new_op0 = aarch64_strip_shift (op0);
5438 /* If we had a shift on op0 then this is a logical-shift-
5439 by-register/immediate operation. Otherwise, this is just
5440 a logical operation. */
5441 if (speed)
5443 if (new_op0 != op0)
5445 /* Shift by immediate. */
5446 if (CONST_INT_P (XEXP (op0, 1)))
5447 *cost += extra_cost->alu.log_shift;
5448 else
5449 *cost += extra_cost->alu.log_shift_reg;
5451 else
5452 *cost += extra_cost->alu.logical;
5455 /* In both cases we want to cost both operands. */
5456 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5457 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5459 return true;
5462 return false;
5464 case NOT:
5465 /* MVN. */
5466 if (speed)
5467 *cost += extra_cost->alu.logical;
5469 /* The logical instruction could have the shifted register form,
5470 but the cost is the same if the shift is processed as a separate
5471 instruction, so we don't bother with it here. */
5472 return false;
5474 case ZERO_EXTEND:
5476 op0 = XEXP (x, 0);
5477 /* If a value is written in SI mode, then zero extended to DI
5478 mode, the operation will in general be free as a write to
5479 a 'w' register implicitly zeroes the upper bits of an 'x'
5480 register. However, if this is
5482 (set (reg) (zero_extend (reg)))
5484 we must cost the explicit register move. */
5485 if (mode == DImode
5486 && GET_MODE (op0) == SImode
5487 && outer == SET)
5489 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5491 if (!op_cost && speed)
5492 /* MOV. */
5493 *cost += extra_cost->alu.extend;
5494 else
5495 /* Free, the cost is that of the SI mode operation. */
5496 *cost = op_cost;
5498 return true;
5500 else if (MEM_P (XEXP (x, 0)))
5502 /* All loads can zero extend to any size for free. */
5503 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5504 return true;
5507 /* UXTB/UXTH. */
5508 if (speed)
5509 *cost += extra_cost->alu.extend;
5511 return false;
5513 case SIGN_EXTEND:
5514 if (MEM_P (XEXP (x, 0)))
5516 /* LDRSH. */
5517 if (speed)
5519 rtx address = XEXP (XEXP (x, 0), 0);
5520 *cost += extra_cost->ldst.load_sign_extend;
5522 *cost +=
5523 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5524 0, speed));
5526 return true;
5529 if (speed)
5530 *cost += extra_cost->alu.extend;
5531 return false;
5533 case ASHIFT:
5534 op0 = XEXP (x, 0);
5535 op1 = XEXP (x, 1);
5537 if (CONST_INT_P (op1))
5539 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5540 aliases. */
5541 if (speed)
5542 *cost += extra_cost->alu.shift;
5544 /* We can incorporate zero/sign extend for free. */
5545 if (GET_CODE (op0) == ZERO_EXTEND
5546 || GET_CODE (op0) == SIGN_EXTEND)
5547 op0 = XEXP (op0, 0);
5549 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5550 return true;
5552 else
5554 /* LSLV. */
5555 if (speed)
5556 *cost += extra_cost->alu.shift_reg;
5558 return false; /* All arguments need to be in registers. */
5561 case ROTATE:
5562 case ROTATERT:
5563 case LSHIFTRT:
5564 case ASHIFTRT:
5565 op0 = XEXP (x, 0);
5566 op1 = XEXP (x, 1);
5568 if (CONST_INT_P (op1))
5570 /* ASR (immediate) and friends. */
5571 if (speed)
5572 *cost += extra_cost->alu.shift;
5574 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5575 return true;
5577 else
5580 /* ASR (register) and friends. */
5581 if (speed)
5582 *cost += extra_cost->alu.shift_reg;
5584 return false; /* All arguments need to be in registers. */
5587 case SYMBOL_REF:
5589 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5591 /* LDR. */
5592 if (speed)
5593 *cost += extra_cost->ldst.load;
5595 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5596 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5598 /* ADRP, followed by ADD. */
5599 *cost += COSTS_N_INSNS (1);
5600 if (speed)
5601 *cost += 2 * extra_cost->alu.arith;
5603 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5604 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5606 /* ADR. */
5607 if (speed)
5608 *cost += extra_cost->alu.arith;
5611 if (flag_pic)
5613 /* One extra load instruction, after accessing the GOT. */
5614 *cost += COSTS_N_INSNS (1);
5615 if (speed)
5616 *cost += extra_cost->ldst.load;
5618 return true;
5620 case HIGH:
5621 case LO_SUM:
5622 /* ADRP/ADD (immediate). */
5623 if (speed)
5624 *cost += extra_cost->alu.arith;
5625 return true;
5627 case ZERO_EXTRACT:
5628 case SIGN_EXTRACT:
5629 /* UBFX/SBFX. */
5630 if (speed)
5631 *cost += extra_cost->alu.bfx;
5633 /* We can trust that the immediates used will be correct (there
5634 are no by-register forms), so we need only cost op0. */
5635 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5636 return true;
5638 case MULT:
5639 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5640 /* aarch64_rtx_mult_cost always handles recursion to its
5641 operands. */
5642 return true;
5644 case MOD:
5645 case UMOD:
5646 if (speed)
5648 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5649 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5650 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5651 else if (GET_MODE (x) == DFmode)
5652 *cost += (extra_cost->fp[1].mult
5653 + extra_cost->fp[1].div);
5654 else if (GET_MODE (x) == SFmode)
5655 *cost += (extra_cost->fp[0].mult
5656 + extra_cost->fp[0].div);
5658 return false; /* All arguments need to be in registers. */
5660 case DIV:
5661 case UDIV:
5662 case SQRT:
5663 if (speed)
5665 if (GET_MODE_CLASS (mode) == MODE_INT)
5666 /* There is no integer SQRT, so only DIV and UDIV can get
5667 here. */
5668 *cost += extra_cost->mult[mode == DImode].idiv;
5669 else
5670 *cost += extra_cost->fp[mode == DFmode].div;
5672 return false; /* All arguments need to be in registers. */
5674 case IF_THEN_ELSE:
5675 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5676 XEXP (x, 2), cost, speed);
5678 case EQ:
5679 case NE:
5680 case GT:
5681 case GTU:
5682 case LT:
5683 case LTU:
5684 case GE:
5685 case GEU:
5686 case LE:
5687 case LEU:
5689 return false; /* All arguments must be in registers. */
5691 case FMA:
5692 op0 = XEXP (x, 0);
5693 op1 = XEXP (x, 1);
5694 op2 = XEXP (x, 2);
5696 if (speed)
5697 *cost += extra_cost->fp[mode == DFmode].fma;
5699 /* FMSUB, FNMADD, and FNMSUB are free. */
5700 if (GET_CODE (op0) == NEG)
5701 op0 = XEXP (op0, 0);
5703 if (GET_CODE (op2) == NEG)
5704 op2 = XEXP (op2, 0);
5706 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5707 and the by-element operand as operand 0. */
5708 if (GET_CODE (op1) == NEG)
5709 op1 = XEXP (op1, 0);
5711 /* Catch vector-by-element operations. The by-element operand can
5712 either be (vec_duplicate (vec_select (x))) or just
5713 (vec_select (x)), depending on whether we are multiplying by
5714 a vector or a scalar.
5716 Canonicalization is not very good in these cases: FMA4 will put the
5717 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
5718 if (GET_CODE (op0) == VEC_DUPLICATE)
5719 op0 = XEXP (op0, 0);
5720 else if (GET_CODE (op1) == VEC_DUPLICATE)
5721 op1 = XEXP (op1, 0);
5723 if (GET_CODE (op0) == VEC_SELECT)
5724 op0 = XEXP (op0, 0);
5725 else if (GET_CODE (op1) == VEC_SELECT)
5726 op1 = XEXP (op1, 0);
5728 /* If the remaining parameters are not registers,
5729 get the cost to put them into registers. */
5730 *cost += rtx_cost (op0, FMA, 0, speed);
5731 *cost += rtx_cost (op1, FMA, 1, speed);
5732 *cost += rtx_cost (op2, FMA, 2, speed);
5733 return true;
5735 case FLOAT_EXTEND:
5736 if (speed)
5737 *cost += extra_cost->fp[mode == DFmode].widen;
5738 return false;
5740 case FLOAT_TRUNCATE:
5741 if (speed)
5742 *cost += extra_cost->fp[mode == DFmode].narrow;
5743 return false;
5745 case FIX:
5746 case UNSIGNED_FIX:
5747 x = XEXP (x, 0);
5748 /* Strip the rounding part. They will all be implemented
5749 by the fcvt* family of instructions anyway. */
5750 if (GET_CODE (x) == UNSPEC)
5752 unsigned int uns_code = XINT (x, 1);
5754 if (uns_code == UNSPEC_FRINTA
5755 || uns_code == UNSPEC_FRINTM
5756 || uns_code == UNSPEC_FRINTN
5757 || uns_code == UNSPEC_FRINTP
5758 || uns_code == UNSPEC_FRINTZ)
5759 x = XVECEXP (x, 0, 0);
5762 if (speed)
5763 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5765 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5766 return true;
5768 case ABS:
5769 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5771 /* FABS and FNEG are analogous. */
5772 if (speed)
5773 *cost += extra_cost->fp[mode == DFmode].neg;
5775 else
5777 /* Integer ABS will either be split to
5778 two arithmetic instructions, or will be an ABS
5779 (scalar), which we don't model. */
5780 *cost = COSTS_N_INSNS (2);
5781 if (speed)
5782 *cost += 2 * extra_cost->alu.arith;
5784 return false;
5786 case SMAX:
5787 case SMIN:
5788 if (speed)
5790 /* FMAXNM/FMINNM/FMAX/FMIN.
5791 TODO: This may not be accurate for all implementations, but
5792 we do not model this in the cost tables. */
5793 *cost += extra_cost->fp[mode == DFmode].addsub;
5795 return false;
5797 case UNSPEC:
5798 /* The floating point round to integer frint* instructions. */
5799 if (aarch64_frint_unspec_p (XINT (x, 1)))
5801 if (speed)
5802 *cost += extra_cost->fp[mode == DFmode].roundint;
5804 return false;
5807 if (XINT (x, 1) == UNSPEC_RBIT)
5809 if (speed)
5810 *cost += extra_cost->alu.rev;
5812 return false;
5814 break;
5816 case TRUNCATE:
5818 /* Decompose <su>muldi3_highpart. */
5819 if (/* (truncate:DI */
5820 mode == DImode
5821 /* (lshiftrt:TI */
5822 && GET_MODE (XEXP (x, 0)) == TImode
5823 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5824 /* (mult:TI */
5825 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5826 /* (ANY_EXTEND:TI (reg:DI))
5827 (ANY_EXTEND:TI (reg:DI))) */
5828 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5829 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5830 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5831 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5832 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5833 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5834 /* (const_int 64) */
5835 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5836 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5838 /* UMULH/SMULH. */
5839 if (speed)
5840 *cost += extra_cost->mult[mode == DImode].extend;
5841 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5842 MULT, 0, speed);
5843 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5844 MULT, 1, speed);
5845 return true;
5848 /* Fall through. */
5849 default:
5850 break;
5853 if (dump_file && (dump_flags & TDF_DETAILS))
5854 fprintf (dump_file,
5855 "\nFailed to cost RTX. Assuming default cost.\n");
5857 return true;
5860 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5861 calculated for X. This cost is stored in *COST. Returns true
5862 if the total cost of X was calculated. */
5863 static bool
5864 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5865 int param, int *cost, bool speed)
5867 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5869 if (dump_file && (dump_flags & TDF_DETAILS))
5871 print_rtl_single (dump_file, x);
5872 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5873 speed ? "Hot" : "Cold",
5874 *cost, result ? "final" : "partial");
5877 return result;
5880 static int
5881 aarch64_register_move_cost (enum machine_mode mode,
5882 reg_class_t from_i, reg_class_t to_i)
5884 enum reg_class from = (enum reg_class) from_i;
5885 enum reg_class to = (enum reg_class) to_i;
5886 const struct cpu_regmove_cost *regmove_cost
5887 = aarch64_tune_params->regmove_cost;
5889 /* The cost of moving between a GPR and the stack register is the same as GP2GP. */
5890 if ((from == GENERAL_REGS && to == STACK_REG)
5891 || (to == GENERAL_REGS && from == STACK_REG))
5892 return regmove_cost->GP2GP;
5894 /* To/From the stack register, we move via the gprs. */
5895 if (to == STACK_REG || from == STACK_REG)
5896 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5897 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5899 if (from == GENERAL_REGS && to == GENERAL_REGS)
5900 return regmove_cost->GP2GP;
5901 else if (from == GENERAL_REGS)
5902 return regmove_cost->GP2FP;
5903 else if (to == GENERAL_REGS)
5904 return regmove_cost->FP2GP;
5906 /* When AdvSIMD instructions are disabled it is not possible to move
5907 a 128-bit value directly between Q registers. This is handled in
5908 secondary reload. A general register is used as a scratch to move
5909 the upper DI value and the lower DI value is moved directly,
5910 hence the cost is the sum of three moves. */
5911 if (! TARGET_SIMD && GET_MODE_SIZE (mode) == 16)
5912 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
5914 return regmove_cost->FP2FP;
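/* Example (illustrative, not from the original source): a DImode move
   from GENERAL_REGS to FP_REGS costs GP2FP and the reverse costs FP2GP,
   while a 16-byte (128-bit) FP-to-FP move with AdvSIMD disabled is
   priced as the sum of the three moves described above.  */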
5917 static int
5918 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
5919 reg_class_t rclass ATTRIBUTE_UNUSED,
5920 bool in ATTRIBUTE_UNUSED)
5922 return aarch64_tune_params->memmov_cost;
5925 /* Return the number of instructions that can be issued per cycle. */
5926 static int
5927 aarch64_sched_issue_rate (void)
5929 return aarch64_tune_params->issue_rate;
5932 /* Vectorizer cost model target hooks. */
5934 /* Implement targetm.vectorize.builtin_vectorization_cost. */
5935 static int
5936 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
5937 tree vectype,
5938 int misalign ATTRIBUTE_UNUSED)
5940 unsigned elements;
5942 switch (type_of_cost)
5944 case scalar_stmt:
5945 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
5947 case scalar_load:
5948 return aarch64_tune_params->vec_costs->scalar_load_cost;
5950 case scalar_store:
5951 return aarch64_tune_params->vec_costs->scalar_store_cost;
5953 case vector_stmt:
5954 return aarch64_tune_params->vec_costs->vec_stmt_cost;
5956 case vector_load:
5957 return aarch64_tune_params->vec_costs->vec_align_load_cost;
5959 case vector_store:
5960 return aarch64_tune_params->vec_costs->vec_store_cost;
5962 case vec_to_scalar:
5963 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
5965 case scalar_to_vec:
5966 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
5968 case unaligned_load:
5969 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
5971 case unaligned_store:
5972 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
5974 case cond_branch_taken:
5975 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
5977 case cond_branch_not_taken:
5978 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
5980 case vec_perm:
5981 case vec_promote_demote:
5982 return aarch64_tune_params->vec_costs->vec_stmt_cost;
5984 case vec_construct:
5985 elements = TYPE_VECTOR_SUBPARTS (vectype);
5986 return elements / 2 + 1;
5988 default:
5989 gcc_unreachable ();
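/* Example (illustrative, not from the original source): constructing a
   V4SI vector element by element is costed as
   elements / 2 + 1 = 4 / 2 + 1 = 3, a rough proxy for the number of
   insert-style operations needed.  */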
5993 /* Implement targetm.vectorize.add_stmt_cost. */
5994 static unsigned
5995 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
5996 struct _stmt_vec_info *stmt_info, int misalign,
5997 enum vect_cost_model_location where)
5999 unsigned *cost = (unsigned *) data;
6000 unsigned retval = 0;
6002 if (flag_vect_cost_model)
6004 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6005 int stmt_cost =
6006 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6008 /* Statements in an inner loop relative to the loop being
6009 vectorized are weighted more heavily. The value here is
6010 a function (linear for now) of the loop nest level. */
6011 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6013 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6014 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6015 unsigned nest_level = loop_depth (loop);
6017 count *= nest_level;
6020 retval = (unsigned) (count * stmt_cost);
6021 cost[where] += retval;
6024 return retval;
6027 static void initialize_aarch64_code_model (void);
6029 /* Parse the architecture extension string. */
6031 static void
6032 aarch64_parse_extension (char *str)
6034 /* The extension string is parsed left to right. */
6035 const struct aarch64_option_extension *opt = NULL;
6037 /* Flag to say whether we are adding or removing an extension. */
6038 int adding_ext = -1;
6040 while (str != NULL && *str != 0)
6042 char *ext;
6043 size_t len;
6045 str++;
6046 ext = strchr (str, '+');
6048 if (ext != NULL)
6049 len = ext - str;
6050 else
6051 len = strlen (str);
6053 if (len >= 2 && strncmp (str, "no", 2) == 0)
6055 adding_ext = 0;
6056 len -= 2;
6057 str += 2;
6059 else if (len > 0)
6060 adding_ext = 1;
6062 if (len == 0)
6064 error ("missing feature modifier after %qs", "+no");
6065 return;
6068 /* Scan over the extensions table trying to find an exact match. */
6069 for (opt = all_extensions; opt->name != NULL; opt++)
6071 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6073 /* Add or remove the extension. */
6074 if (adding_ext)
6075 aarch64_isa_flags |= opt->flags_on;
6076 else
6077 aarch64_isa_flags &= ~(opt->flags_off);
6078 break;
6082 if (opt->name == NULL)
6084 /* Extension not found in list. */
6085 error ("unknown feature modifier %qs", str);
6086 return;
6089 str = ext;
6092 return;
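/* Example (illustrative, not from the original source): for the suffix
   "+crypto+nofp" the loop above first sets the flags for "crypto" and
   then clears the flags for "fp"; an unrecognized name such as "+bogus"
   is reported via the "unknown feature modifier" error.  */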
6095 /* Parse the ARCH string. */
6097 static void
6098 aarch64_parse_arch (void)
6100 char *ext;
6101 const struct processor *arch;
6102 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6103 size_t len;
6105 strcpy (str, aarch64_arch_string);
6107 ext = strchr (str, '+');
6109 if (ext != NULL)
6110 len = ext - str;
6111 else
6112 len = strlen (str);
6114 if (len == 0)
6116 error ("missing arch name in -march=%qs", str);
6117 return;
6120 /* Loop through the list of supported ARCHs to find a match. */
6121 for (arch = all_architectures; arch->name != NULL; arch++)
6123 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6125 selected_arch = arch;
6126 aarch64_isa_flags = selected_arch->flags;
6128 if (!selected_cpu)
6129 selected_cpu = &all_cores[selected_arch->core];
6131 if (ext != NULL)
6133 /* ARCH string contains at least one extension. */
6134 aarch64_parse_extension (ext);
6137 if (strcmp (selected_arch->arch, selected_cpu->arch))
6139 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6140 selected_cpu->name, selected_arch->name);
6143 return;
6147 /* ARCH name not found in list. */
6148 error ("unknown value %qs for -march", str);
6149 return;
6152 /* Parse the CPU string. */
6154 static void
6155 aarch64_parse_cpu (void)
6157 char *ext;
6158 const struct processor *cpu;
6159 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6160 size_t len;
6162 strcpy (str, aarch64_cpu_string);
6164 ext = strchr (str, '+');
6166 if (ext != NULL)
6167 len = ext - str;
6168 else
6169 len = strlen (str);
6171 if (len == 0)
6173 error ("missing cpu name in -mcpu=%qs", str);
6174 return;
6177 /* Loop through the list of supported CPUs to find a match. */
6178 for (cpu = all_cores; cpu->name != NULL; cpu++)
6180 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6182 selected_cpu = cpu;
6183 selected_tune = cpu;
6184 aarch64_isa_flags = selected_cpu->flags;
6186 if (ext != NULL)
6188 /* CPU string contains at least one extension. */
6189 aarch64_parse_extension (ext);
6192 return;
6196 /* CPU name not found in list. */
6197 error ("unknown value %qs for -mcpu", str);
6198 return;
6201 /* Parse the TUNE string. */
6203 static void
6204 aarch64_parse_tune (void)
6206 const struct processor *cpu;
6207 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6208 strcpy (str, aarch64_tune_string);
6210 /* Loop through the list of supported CPUs to find a match. */
6211 for (cpu = all_cores; cpu->name != NULL; cpu++)
6213 if (strcmp (cpu->name, str) == 0)
6215 selected_tune = cpu;
6216 return;
6220 /* CPU name not found in list. */
6221 error ("unknown value %qs for -mtune", str);
6222 return;
6226 /* Implement TARGET_OPTION_OVERRIDE. */
6228 static void
6229 aarch64_override_options (void)
6231 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6232 If either of -march or -mtune is given, they override their
6233 respective component of -mcpu.
6235 So, first parse AARCH64_CPU_STRING, then the others; be careful
6236 with -march because, if -mcpu is not present on the command line,
6237 -march must set a sensible default CPU. */
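/* Example of this precedence (illustrative, not from the original
   source): with "-mcpu=cortex-a57 -mtune=cortex-a53" the -mcpu string
   selects the architecture and an initial tuning of cortex-a57, after
   which the explicit -mtune switches the tuning to cortex-a53.  */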
6238 if (aarch64_cpu_string)
6240 aarch64_parse_cpu ();
6243 if (aarch64_arch_string)
6245 aarch64_parse_arch ();
6248 if (aarch64_tune_string)
6250 aarch64_parse_tune ();
6253 #ifndef HAVE_AS_MABI_OPTION
6254 /* The compiler may have been configured with 2.23.* binutils, which does
6255 not have support for ILP32. */
6256 if (TARGET_ILP32)
6257 error ("Assembler does not support -mabi=ilp32");
6258 #endif
6260 initialize_aarch64_code_model ();
6262 aarch64_build_bitmask_table ();
6264 /* This target defaults to strict volatile bitfields. */
6265 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6266 flag_strict_volatile_bitfields = 1;
6268 /* If the user did not specify a processor, choose the default
6269 one for them. This will be the CPU set during configuration using
6270 --with-cpu, otherwise it is "generic". */
6271 if (!selected_cpu)
6273 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6274 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6277 gcc_assert (selected_cpu);
6279 /* The selected cpu may be an architecture, so lookup tuning by core ID. */
6280 if (!selected_tune)
6281 selected_tune = &all_cores[selected_cpu->core];
6283 aarch64_tune_flags = selected_tune->flags;
6284 aarch64_tune = selected_tune->core;
6285 aarch64_tune_params = selected_tune->tune;
6287 aarch64_override_options_after_change ();
6290 /* Implement targetm.override_options_after_change. */
6292 static void
6293 aarch64_override_options_after_change (void)
6295 if (flag_omit_frame_pointer)
6296 flag_omit_leaf_frame_pointer = false;
6297 else if (flag_omit_leaf_frame_pointer)
6298 flag_omit_frame_pointer = true;
6301 static struct machine_function *
6302 aarch64_init_machine_status (void)
6304 struct machine_function *machine;
6305 machine = ggc_cleared_alloc<machine_function> ();
6306 return machine;
6309 void
6310 aarch64_init_expanders (void)
6312 init_machine_status = aarch64_init_machine_status;
6315 /* A checking mechanism for the implementation of the various code models. */
6316 static void
6317 initialize_aarch64_code_model (void)
6319 if (flag_pic)
6321 switch (aarch64_cmodel_var)
6323 case AARCH64_CMODEL_TINY:
6324 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6325 break;
6326 case AARCH64_CMODEL_SMALL:
6327 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6328 break;
6329 case AARCH64_CMODEL_LARGE:
6330 sorry ("code model %qs with -f%s", "large",
6331 flag_pic > 1 ? "PIC" : "pic");
6332 default:
6333 gcc_unreachable ();
6336 else
6337 aarch64_cmodel = aarch64_cmodel_var;
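/* Example (illustrative, not from the original source): compiling with
   "-mcmodel=small -fpic" maps AARCH64_CMODEL_SMALL to
   AARCH64_CMODEL_SMALL_PIC above, while "-mcmodel=large -fpic" is
   rejected with the "sorry" diagnostic.  */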
6340 /* Return true if SYMBOL_REF X binds locally. */
6342 static bool
6343 aarch64_symbol_binds_local_p (const_rtx x)
6345 return (SYMBOL_REF_DECL (x)
6346 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6347 : SYMBOL_REF_LOCAL_P (x));
6350 /* Return true if SYMBOL_REF X is thread local. */
6351 static bool
6352 aarch64_tls_symbol_p (rtx x)
6354 if (! TARGET_HAVE_TLS)
6355 return false;
6357 if (GET_CODE (x) != SYMBOL_REF)
6358 return false;
6360 return SYMBOL_REF_TLS_MODEL (x) != 0;
6363 /* Classify a TLS symbol into one of the TLS kinds. */
6364 enum aarch64_symbol_type
6365 aarch64_classify_tls_symbol (rtx x)
6367 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6369 switch (tls_kind)
6371 case TLS_MODEL_GLOBAL_DYNAMIC:
6372 case TLS_MODEL_LOCAL_DYNAMIC:
6373 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6375 case TLS_MODEL_INITIAL_EXEC:
6376 return SYMBOL_SMALL_GOTTPREL;
6378 case TLS_MODEL_LOCAL_EXEC:
6379 return SYMBOL_SMALL_TPREL;
6381 case TLS_MODEL_EMULATED:
6382 case TLS_MODEL_NONE:
6383 return SYMBOL_FORCE_TO_MEM;
6385 default:
6386 gcc_unreachable ();
6390 /* Return the method that should be used to access SYMBOL_REF or
6391 LABEL_REF X in context CONTEXT. */
6393 enum aarch64_symbol_type
6394 aarch64_classify_symbol (rtx x,
6395 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6397 if (GET_CODE (x) == LABEL_REF)
6399 switch (aarch64_cmodel)
6401 case AARCH64_CMODEL_LARGE:
6402 return SYMBOL_FORCE_TO_MEM;
6404 case AARCH64_CMODEL_TINY_PIC:
6405 case AARCH64_CMODEL_TINY:
6406 return SYMBOL_TINY_ABSOLUTE;
6408 case AARCH64_CMODEL_SMALL_PIC:
6409 case AARCH64_CMODEL_SMALL:
6410 return SYMBOL_SMALL_ABSOLUTE;
6412 default:
6413 gcc_unreachable ();
6417 if (GET_CODE (x) == SYMBOL_REF)
6419 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6420 return SYMBOL_FORCE_TO_MEM;
6422 if (aarch64_tls_symbol_p (x))
6423 return aarch64_classify_tls_symbol (x);
6425 switch (aarch64_cmodel)
6427 case AARCH64_CMODEL_TINY:
6428 if (SYMBOL_REF_WEAK (x))
6429 return SYMBOL_FORCE_TO_MEM;
6430 return SYMBOL_TINY_ABSOLUTE;
6432 case AARCH64_CMODEL_SMALL:
6433 if (SYMBOL_REF_WEAK (x))
6434 return SYMBOL_FORCE_TO_MEM;
6435 return SYMBOL_SMALL_ABSOLUTE;
6437 case AARCH64_CMODEL_TINY_PIC:
6438 if (!aarch64_symbol_binds_local_p (x))
6439 return SYMBOL_TINY_GOT;
6440 return SYMBOL_TINY_ABSOLUTE;
6442 case AARCH64_CMODEL_SMALL_PIC:
6443 if (!aarch64_symbol_binds_local_p (x))
6444 return SYMBOL_SMALL_GOT;
6445 return SYMBOL_SMALL_ABSOLUTE;
6447 default:
6448 gcc_unreachable ();
6452 /* By default push everything into the constant pool. */
6453 return SYMBOL_FORCE_TO_MEM;
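/* Editorial examples of the classification above (a reading of the code,
   not an additional contract): under the default small code model a
   locally binding SYMBOL_REF is SYMBOL_SMALL_ABSOLUTE, the same symbol
   under -fpic when it does not bind locally is SYMBOL_SMALL_GOT, a weak
   symbol under the tiny or small model is SYMBOL_FORCE_TO_MEM, and any
   symbol under -mcmodel=large is SYMBOL_FORCE_TO_MEM.  */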
6456 bool
6457 aarch64_constant_address_p (rtx x)
6459 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6462 bool
6463 aarch64_legitimate_pic_operand_p (rtx x)
6465 if (GET_CODE (x) == SYMBOL_REF
6466 || (GET_CODE (x) == CONST
6467 && GET_CODE (XEXP (x, 0)) == PLUS
6468 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6469 return false;
6471 return true;
6474 /* Return true if X holds either a valid quarter-precision floating-point
6475 constant or the floating-point constant +0.0 (SFmode and DFmode only). */
6476 static bool
6477 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6479 if (!CONST_DOUBLE_P (x))
6480 return false;
6482 /* TODO: We could handle moving 0.0 to a TFmode register,
6483 but first we would like to refactor the movtf_aarch64
6484 to be more amenable to splitting moves properly and
6485 correctly gating on TARGET_SIMD. For now, reject all
6486 constants that are not destined for SFmode or DFmode registers. */
6487 if (!(mode == SFmode || mode == DFmode))
6488 return false;
6490 if (aarch64_float_const_zero_rtx_p (x))
6491 return true;
6492 return aarch64_float_const_representable_p (x);
6495 static bool
6496 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6498 /* Do not allow vector struct mode constants. We could support
6499 0 and -1 easily, but they need support in aarch64-simd.md. */
6500 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6501 return false;
6503 /* This could probably go away because
6504 we now decompose CONST_INTs according to expand_mov_immediate. */
6505 if ((GET_CODE (x) == CONST_VECTOR
6506 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6507 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6508 return !targetm.cannot_force_const_mem (mode, x);
6510 if (GET_CODE (x) == HIGH
6511 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6512 return true;
6514 return aarch64_constant_address_p (x);
6518 aarch64_load_tp (rtx target)
6520 if (!target
6521 || GET_MODE (target) != Pmode
6522 || !register_operand (target, Pmode))
6523 target = gen_reg_rtx (Pmode);
6525 /* Can return in any reg. */
6526 emit_insn (gen_aarch64_load_tp_hard (target));
6527 return target;
6530 /* On AAPCS systems, this is the "struct __va_list". */
6531 static GTY(()) tree va_list_type;
6533 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6534 Return the type to use as __builtin_va_list.
6536 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6538 struct __va_list
6540 void *__stack;
6541 void *__gr_top;
6542 void *__vr_top;
6543 int __gr_offs;
6544 int __vr_offs;
6545 }; */
6547 static tree
6548 aarch64_build_builtin_va_list (void)
6550 tree va_list_name;
6551 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6553 /* Create the type. */
6554 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6555 /* Give it the required name. */
6556 va_list_name = build_decl (BUILTINS_LOCATION,
6557 TYPE_DECL,
6558 get_identifier ("__va_list"),
6559 va_list_type);
6560 DECL_ARTIFICIAL (va_list_name) = 1;
6561 TYPE_NAME (va_list_type) = va_list_name;
6562 TYPE_STUB_DECL (va_list_type) = va_list_name;
6564 /* Create the fields. */
6565 f_stack = build_decl (BUILTINS_LOCATION,
6566 FIELD_DECL, get_identifier ("__stack"),
6567 ptr_type_node);
6568 f_grtop = build_decl (BUILTINS_LOCATION,
6569 FIELD_DECL, get_identifier ("__gr_top"),
6570 ptr_type_node);
6571 f_vrtop = build_decl (BUILTINS_LOCATION,
6572 FIELD_DECL, get_identifier ("__vr_top"),
6573 ptr_type_node);
6574 f_groff = build_decl (BUILTINS_LOCATION,
6575 FIELD_DECL, get_identifier ("__gr_offs"),
6576 integer_type_node);
6577 f_vroff = build_decl (BUILTINS_LOCATION,
6578 FIELD_DECL, get_identifier ("__vr_offs"),
6579 integer_type_node);
6581 DECL_ARTIFICIAL (f_stack) = 1;
6582 DECL_ARTIFICIAL (f_grtop) = 1;
6583 DECL_ARTIFICIAL (f_vrtop) = 1;
6584 DECL_ARTIFICIAL (f_groff) = 1;
6585 DECL_ARTIFICIAL (f_vroff) = 1;
6587 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6588 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6589 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6590 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6591 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6593 TYPE_FIELDS (va_list_type) = f_stack;
6594 DECL_CHAIN (f_stack) = f_grtop;
6595 DECL_CHAIN (f_grtop) = f_vrtop;
6596 DECL_CHAIN (f_vrtop) = f_groff;
6597 DECL_CHAIN (f_groff) = f_vroff;
6599 /* Compute its layout. */
6600 layout_type (va_list_type);
6602 return va_list_type;
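/* Editorial sketch of the resulting layout, assuming LP64 (8-byte
   pointers, 4-byte int): __stack at offset 0, __gr_top at 8, __vr_top at
   16, __gr_offs at 24 and __vr_offs at 28, giving a 32-byte va_list.
   This is derived from the field order above and AAPCS64 \S 7.1.4, not
   an extra invariant enforced here.  */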
6605 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6606 static void
6607 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6609 const CUMULATIVE_ARGS *cum;
6610 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6611 tree stack, grtop, vrtop, groff, vroff;
6612 tree t;
6613 int gr_save_area_size;
6614 int vr_save_area_size;
6615 int vr_offset;
6617 cum = &crtl->args.info;
6618 gr_save_area_size
6619 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6620 vr_save_area_size
6621 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6623 if (TARGET_GENERAL_REGS_ONLY)
6625 if (cum->aapcs_nvrn > 0)
6626 sorry ("%qs and floating point or vector arguments",
6627 "-mgeneral-regs-only");
6628 vr_save_area_size = 0;
6631 f_stack = TYPE_FIELDS (va_list_type_node);
6632 f_grtop = DECL_CHAIN (f_stack);
6633 f_vrtop = DECL_CHAIN (f_grtop);
6634 f_groff = DECL_CHAIN (f_vrtop);
6635 f_vroff = DECL_CHAIN (f_groff);
6637 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6638 NULL_TREE);
6639 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6640 NULL_TREE);
6641 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6642 NULL_TREE);
6643 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6644 NULL_TREE);
6645 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6646 NULL_TREE);
6648 /* Emit code to initialize STACK, which points to the next varargs stack
6649 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6650 by named arguments. STACK is 8-byte aligned. */
6651 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6652 if (cum->aapcs_stack_size > 0)
6653 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6654 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6655 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6657 /* Emit code to initialize GRTOP, the top of the GR save area.
6658 virtual_incoming_args_rtx should have been 16-byte aligned. */
6659 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6660 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6661 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6663 /* Emit code to initialize VRTOP, the top of the VR save area.
6664 This address is gr_save_area_bytes below GRTOP, rounded
6665 down to the next 16-byte boundary. */
6666 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6667 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6668 STACK_BOUNDARY / BITS_PER_UNIT);
6670 if (vr_offset)
6671 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6672 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6673 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6675 /* Emit code to initialize GROFF, the offset from GRTOP of the
6676 next GPR argument. */
6677 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6678 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6679 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6681 /* Likewise emit code to initialize VROFF, the offset from VRTOP
6682 of the next VR argument. */
6683 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6684 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6685 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
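/* Editorial summary of the state established above by va_start (ap, ...),
   written as pseudo-C for readers (a sketch of what the expanders emit,
   not a separate ABI statement):

     ap.__stack   = virtual_incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = virtual_incoming_args;
     ap.__vr_top  = virtual_incoming_args - AARCH64_ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */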
6688 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6690 static tree
6691 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6692 gimple_seq *post_p ATTRIBUTE_UNUSED)
6694 tree addr;
6695 bool indirect_p;
6696 bool is_ha; /* is HFA or HVA. */
6697 bool dw_align; /* double-word align. */
6698 enum machine_mode ag_mode = VOIDmode;
6699 int nregs;
6700 enum machine_mode mode;
6702 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6703 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6704 HOST_WIDE_INT size, rsize, adjust, align;
6705 tree t, u, cond1, cond2;
6707 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6708 if (indirect_p)
6709 type = build_pointer_type (type);
6711 mode = TYPE_MODE (type);
6713 f_stack = TYPE_FIELDS (va_list_type_node);
6714 f_grtop = DECL_CHAIN (f_stack);
6715 f_vrtop = DECL_CHAIN (f_grtop);
6716 f_groff = DECL_CHAIN (f_vrtop);
6717 f_vroff = DECL_CHAIN (f_groff);
6719 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6720 f_stack, NULL_TREE);
6721 size = int_size_in_bytes (type);
6722 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6724 dw_align = false;
6725 adjust = 0;
6726 if (aarch64_vfp_is_call_or_return_candidate (mode,
6727 type,
6728 &ag_mode,
6729 &nregs,
6730 &is_ha))
6732 /* TYPE passed in fp/simd registers. */
6733 if (TARGET_GENERAL_REGS_ONLY)
6734 sorry ("%qs and floating point or vector arguments",
6735 "-mgeneral-regs-only");
6737 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6738 unshare_expr (valist), f_vrtop, NULL_TREE);
6739 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6740 unshare_expr (valist), f_vroff, NULL_TREE);
6742 rsize = nregs * UNITS_PER_VREG;
6744 if (is_ha)
6746 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6747 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6749 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6750 && size < UNITS_PER_VREG)
6752 adjust = UNITS_PER_VREG - size;
6755 else
6757 /* TYPE passed in general registers. */
6758 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6759 unshare_expr (valist), f_grtop, NULL_TREE);
6760 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6761 unshare_expr (valist), f_groff, NULL_TREE);
6762 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6763 nregs = rsize / UNITS_PER_WORD;
6765 if (align > 8)
6766 dw_align = true;
6768 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6769 && size < UNITS_PER_WORD)
6771 adjust = UNITS_PER_WORD - size;
6775 /* Get a local temporary for the field value. */
6776 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6778 /* Emit code to branch if off >= 0. */
6779 t = build2 (GE_EXPR, boolean_type_node, off,
6780 build_int_cst (TREE_TYPE (off), 0));
6781 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6783 if (dw_align)
6785 /* Emit: offs = (offs + 15) & -16. */
6786 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6787 build_int_cst (TREE_TYPE (off), 15));
6788 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6789 build_int_cst (TREE_TYPE (off), -16));
6790 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6792 else
6793 roundup = NULL;
6795 /* Update ap.__[g|v]r_offs */
6796 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6797 build_int_cst (TREE_TYPE (off), rsize));
6798 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6800 /* String up. */
6801 if (roundup)
6802 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6804 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6805 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6806 build_int_cst (TREE_TYPE (f_off), 0));
6807 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6809 /* String up: make sure the assignment happens before the use. */
6810 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6811 COND_EXPR_ELSE (cond1) = t;
6813 /* Prepare the trees handling the argument that is passed on the stack;
6814 the top level node will store in ON_STACK. */
6815 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6816 if (align > 8)
6818 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
6819 t = fold_convert (intDI_type_node, arg);
6820 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6821 build_int_cst (TREE_TYPE (t), 15));
6822 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6823 build_int_cst (TREE_TYPE (t), -16));
6824 t = fold_convert (TREE_TYPE (arg), t);
6825 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6827 else
6828 roundup = NULL;
6829 /* Advance ap.__stack */
6830 t = fold_convert (intDI_type_node, arg);
6831 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6832 build_int_cst (TREE_TYPE (t), size + 7));
6833 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6834 build_int_cst (TREE_TYPE (t), -8));
6835 t = fold_convert (TREE_TYPE (arg), t);
6836 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6837 /* String up roundup and advance. */
6838 if (roundup)
6839 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6840 /* String up with arg */
6841 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6842 /* Big-endianness related address adjustment. */
6843 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6844 && size < UNITS_PER_WORD)
6846 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6847 size_int (UNITS_PER_WORD - size));
6848 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6851 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6852 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6854 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
6855 t = off;
6856 if (adjust)
6857 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6858 build_int_cst (TREE_TYPE (off), adjust));
6860 t = fold_convert (sizetype, t);
6861 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6863 if (is_ha)
6865 /* type ha; // treat as "struct {ftype field[n];}"
6866 ... [computing offs]
6867 for (i = 0; i < nregs; ++i, offs += 16)
6868 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6869 return ha; */
6870 int i;
6871 tree tmp_ha, field_t, field_ptr_t;
6873 /* Declare a local variable. */
6874 tmp_ha = create_tmp_var_raw (type, "ha");
6875 gimple_add_tmp_var (tmp_ha);
6877 /* Establish the base type. */
6878 switch (ag_mode)
6880 case SFmode:
6881 field_t = float_type_node;
6882 field_ptr_t = float_ptr_type_node;
6883 break;
6884 case DFmode:
6885 field_t = double_type_node;
6886 field_ptr_t = double_ptr_type_node;
6887 break;
6888 case TFmode:
6889 field_t = long_double_type_node;
6890 field_ptr_t = long_double_ptr_type_node;
6891 break;
6892 /* Half-precision and quad-precision floats are not fully supported yet.
6893 Enable the following code once support is complete; the correct type
6894 node for __fp16 * still needs to be found. */
6895 #if 0
6896 case HFmode:
6897 field_t = float_type_node;
6898 field_ptr_t = float_ptr_type_node;
6899 break;
6900 #endif
6901 case V2SImode:
6902 case V4SImode:
6904 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
6905 field_t = build_vector_type_for_mode (innertype, ag_mode);
6906 field_ptr_t = build_pointer_type (field_t);
6908 break;
6909 default:
6910 gcc_assert (0);
6913 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
6914 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
6915 addr = t;
6916 t = fold_convert (field_ptr_t, addr);
6917 t = build2 (MODIFY_EXPR, field_t,
6918 build1 (INDIRECT_REF, field_t, tmp_ha),
6919 build1 (INDIRECT_REF, field_t, t));
6921 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
6922 for (i = 1; i < nregs; ++i)
6924 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
6925 u = fold_convert (field_ptr_t, addr);
6926 u = build2 (MODIFY_EXPR, field_t,
6927 build2 (MEM_REF, field_t, tmp_ha,
6928 build_int_cst (field_ptr_t,
6929 (i *
6930 int_size_in_bytes (field_t)))),
6931 build1 (INDIRECT_REF, field_t, u));
6932 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
6935 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
6936 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
6939 COND_EXPR_ELSE (cond2) = t;
6940 addr = fold_convert (build_pointer_type (type), cond1);
6941 addr = build_va_arg_indirect_ref (addr);
6943 if (indirect_p)
6944 addr = build_va_arg_indirect_ref (addr);
6946 return addr;
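/* Editorial pseudo-C sketch of the tree built above for an argument taken
   from the general-register save area (alignment, big-endian and HFA
   handling omitted; an aid to reading the code, not a specification):

     off = ap.__gr_offs;
     if (off >= 0)
       addr = <on_stack sequence>;
     else
       {
         ap.__gr_offs = off + rsize;
         if (ap.__gr_offs > 0)
           addr = <on_stack sequence>;
         else
           addr = ap.__gr_top + off;
       }
     arg = *(TYPE *) addr;

   The FP/SIMD path is analogous, using __vr_top and __vr_offs.  */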
6949 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
6951 static void
6952 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
6953 tree type, int *pretend_size ATTRIBUTE_UNUSED,
6954 int no_rtl)
6956 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6957 CUMULATIVE_ARGS local_cum;
6958 int gr_saved, vr_saved;
6960 /* The caller has advanced CUM up to, but not beyond, the last named
6961 argument. Advance a local copy of CUM past the last "real" named
6962 argument, to find out how many registers are left over. */
6963 local_cum = *cum;
6964 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
6966 /* Find out how many registers we need to save. */
6967 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
6968 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
6970 if (TARGET_GENERAL_REGS_ONLY)
6972 if (local_cum.aapcs_nvrn > 0)
6973 sorry ("%qs and floating point or vector arguments",
6974 "-mgeneral-regs-only");
6975 vr_saved = 0;
6978 if (!no_rtl)
6980 if (gr_saved > 0)
6982 rtx ptr, mem;
6984 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
6985 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
6986 - gr_saved * UNITS_PER_WORD);
6987 mem = gen_frame_mem (BLKmode, ptr);
6988 set_mem_alias_set (mem, get_varargs_alias_set ());
6990 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
6991 mem, gr_saved);
6993 if (vr_saved > 0)
6995 /* We can't use move_block_from_reg, because it will use
6996 the wrong mode, storing D regs only. */
6997 enum machine_mode mode = TImode;
6998 int off, i;
7000 /* Set OFF to the offset from virtual_incoming_args_rtx of
7001 the first vector register. The VR save area lies below
7002 the GR one, and is aligned to 16 bytes. */
7003 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7004 STACK_BOUNDARY / BITS_PER_UNIT);
7005 off -= vr_saved * UNITS_PER_VREG;
7007 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7009 rtx ptr, mem;
7011 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7012 mem = gen_frame_mem (mode, ptr);
7013 set_mem_alias_set (mem, get_varargs_alias_set ());
7014 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7015 off += UNITS_PER_VREG;
7020 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7021 any complication of having crtl->args.pretend_args_size changed. */
7022 cfun->machine->frame.saved_varargs_size
7023 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7024 STACK_BOUNDARY / BITS_PER_UNIT)
7025 + vr_saved * UNITS_PER_VREG);
7028 static void
7029 aarch64_conditional_register_usage (void)
7031 int i;
7032 if (!TARGET_FLOAT)
7034 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7036 fixed_regs[i] = 1;
7037 call_used_regs[i] = 1;
7042 /* Walk down the type tree of TYPE counting consecutive base elements.
7043 If *MODEP is VOIDmode, then set it to the first valid floating point
7044 type. If a non-floating point type is found, or if a floating point
7045 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7046 otherwise return the count in the sub-tree. */
7047 static int
7048 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
7050 enum machine_mode mode;
7051 HOST_WIDE_INT size;
7053 switch (TREE_CODE (type))
7055 case REAL_TYPE:
7056 mode = TYPE_MODE (type);
7057 if (mode != DFmode && mode != SFmode && mode != TFmode)
7058 return -1;
7060 if (*modep == VOIDmode)
7061 *modep = mode;
7063 if (*modep == mode)
7064 return 1;
7066 break;
7068 case COMPLEX_TYPE:
7069 mode = TYPE_MODE (TREE_TYPE (type));
7070 if (mode != DFmode && mode != SFmode && mode != TFmode)
7071 return -1;
7073 if (*modep == VOIDmode)
7074 *modep = mode;
7076 if (*modep == mode)
7077 return 2;
7079 break;
7081 case VECTOR_TYPE:
7082 /* Use V2SImode and V4SImode as representatives of all 64-bit
7083 and 128-bit vector types. */
7084 size = int_size_in_bytes (type);
7085 switch (size)
7087 case 8:
7088 mode = V2SImode;
7089 break;
7090 case 16:
7091 mode = V4SImode;
7092 break;
7093 default:
7094 return -1;
7097 if (*modep == VOIDmode)
7098 *modep = mode;
7100 /* Vector modes are considered to be opaque: two vectors are
7101 equivalent for the purposes of being homogeneous aggregates
7102 if they are the same size. */
7103 if (*modep == mode)
7104 return 1;
7106 break;
7108 case ARRAY_TYPE:
7110 int count;
7111 tree index = TYPE_DOMAIN (type);
7113 /* Can't handle incomplete types or sizes that are not
7114 fixed. */
7115 if (!COMPLETE_TYPE_P (type)
7116 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7117 return -1;
7119 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7120 if (count == -1
7121 || !index
7122 || !TYPE_MAX_VALUE (index)
7123 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7124 || !TYPE_MIN_VALUE (index)
7125 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7126 || count < 0)
7127 return -1;
7129 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7130 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7132 /* There must be no padding. */
7133 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7134 return -1;
7136 return count;
7139 case RECORD_TYPE:
7141 int count = 0;
7142 int sub_count;
7143 tree field;
7145 /* Can't handle incomplete types or sizes that are not
7146 fixed. */
7147 if (!COMPLETE_TYPE_P (type)
7148 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7149 return -1;
7151 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7153 if (TREE_CODE (field) != FIELD_DECL)
7154 continue;
7156 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7157 if (sub_count < 0)
7158 return -1;
7159 count += sub_count;
7162 /* There must be no padding. */
7163 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7164 return -1;
7166 return count;
7169 case UNION_TYPE:
7170 case QUAL_UNION_TYPE:
7172 /* These aren't very interesting except in a degenerate case. */
7173 int count = 0;
7174 int sub_count;
7175 tree field;
7177 /* Can't handle incomplete types or sizes that are not
7178 fixed. */
7179 if (!COMPLETE_TYPE_P (type)
7180 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7181 return -1;
7183 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7185 if (TREE_CODE (field) != FIELD_DECL)
7186 continue;
7188 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7189 if (sub_count < 0)
7190 return -1;
7191 count = count > sub_count ? count : sub_count;
7194 /* There must be no padding. */
7195 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7196 return -1;
7198 return count;
7201 default:
7202 break;
7205 return -1;
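/* Editorial examples of the classification above (not from the original
   sources): "struct { double x, y, z; }" returns 3 with *MODEP set to
   DFmode, "_Complex float" returns 2 with SFmode, a two-element array of
   a 16-byte vector type returns 2 with V4SImode, while
   "struct { float f; double d; }" mixes base modes and returns -1.  */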
7208 /* Return true if we use LRA instead of reload pass. */
7209 static bool
7210 aarch64_lra_p (void)
7212 return aarch64_lra_flag;
7215 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7216 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7217 array types. The C99 floating-point complex types are also considered
7218 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7219 types, which are GCC extensions and out of the scope of AAPCS64, are
7220 treated as composite types here as well.
7222 Note that MODE itself is not sufficient in determining whether a type
7223 is such a composite type or not. This is because
7224 stor-layout.c:compute_record_mode may have already changed the MODE
7225 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7226 structure with only one field may have its MODE set to the mode of the
7227 field. Also an integer mode whose size matches the size of the
7228 RECORD_TYPE type may be used to substitute the original mode
7229 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7230 solely relied on. */
7232 static bool
7233 aarch64_composite_type_p (const_tree type,
7234 enum machine_mode mode)
7236 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7237 return true;
7239 if (mode == BLKmode
7240 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7241 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7242 return true;
7244 return false;
7247 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7248 type as described in AAPCS64 \S 4.1.2.
7250 See the comment above aarch64_composite_type_p for the notes on MODE. */
7252 static bool
7253 aarch64_short_vector_p (const_tree type,
7254 enum machine_mode mode)
7256 HOST_WIDE_INT size = -1;
7258 if (type && TREE_CODE (type) == VECTOR_TYPE)
7259 size = int_size_in_bytes (type);
7260 else if (!aarch64_composite_type_p (type, mode)
7261 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7262 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7263 size = GET_MODE_SIZE (mode);
7265 return size == 8 || size == 16;
7268 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7269 shall be passed or returned in simd/fp register(s) (providing these
7270 parameter passing registers are available).
7272 Upon successful return, *COUNT returns the number of needed registers,
7273 *BASE_MODE returns the mode of the individual register and when IS_HA
7274 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7275 floating-point aggregate or a homogeneous short-vector aggregate. */
7277 static bool
7278 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7279 const_tree type,
7280 enum machine_mode *base_mode,
7281 int *count,
7282 bool *is_ha)
7284 enum machine_mode new_mode = VOIDmode;
7285 bool composite_p = aarch64_composite_type_p (type, mode);
7287 if (is_ha != NULL) *is_ha = false;
7289 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7290 || aarch64_short_vector_p (type, mode))
7292 *count = 1;
7293 new_mode = mode;
7295 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7297 if (is_ha != NULL) *is_ha = true;
7298 *count = 2;
7299 new_mode = GET_MODE_INNER (mode);
7301 else if (type && composite_p)
7303 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7305 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7307 if (is_ha != NULL) *is_ha = true;
7308 *count = ag_count;
7310 else
7311 return false;
7313 else
7314 return false;
7316 *base_mode = new_mode;
7317 return true;
7320 /* Implement TARGET_STRUCT_VALUE_RTX. */
7322 static rtx
7323 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7324 int incoming ATTRIBUTE_UNUSED)
7326 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7329 /* Implements target hook vector_mode_supported_p. */
7330 static bool
7331 aarch64_vector_mode_supported_p (enum machine_mode mode)
7333 if (TARGET_SIMD
7334 && (mode == V4SImode || mode == V8HImode
7335 || mode == V16QImode || mode == V2DImode
7336 || mode == V2SImode || mode == V4HImode
7337 || mode == V8QImode || mode == V2SFmode
7338 || mode == V4SFmode || mode == V2DFmode
7339 || mode == V1DFmode))
7340 return true;
7342 return false;
7345 /* Return appropriate SIMD container
7346 for MODE within a vector of WIDTH bits. */
7347 static enum machine_mode
7348 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7350 gcc_assert (width == 64 || width == 128);
7351 if (TARGET_SIMD)
7353 if (width == 128)
7354 switch (mode)
7356 case DFmode:
7357 return V2DFmode;
7358 case SFmode:
7359 return V4SFmode;
7360 case SImode:
7361 return V4SImode;
7362 case HImode:
7363 return V8HImode;
7364 case QImode:
7365 return V16QImode;
7366 case DImode:
7367 return V2DImode;
7368 default:
7369 break;
7371 else
7372 switch (mode)
7374 case SFmode:
7375 return V2SFmode;
7376 case SImode:
7377 return V2SImode;
7378 case HImode:
7379 return V4HImode;
7380 case QImode:
7381 return V8QImode;
7382 default:
7383 break;
7386 return word_mode;
7389 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7390 static enum machine_mode
7391 aarch64_preferred_simd_mode (enum machine_mode mode)
7393 return aarch64_simd_container_mode (mode, 128);
7396 /* Return the bitmask of possible vector sizes for the vectorizer
7397 to iterate over. */
7398 static unsigned int
7399 aarch64_autovectorize_vector_sizes (void)
7401 return (16 | 8);
7404 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7405 vector types in order to conform to the AAPCS64 (see "Procedure
7406 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7407 qualify for emission with the mangled names defined in that document,
7408 a vector type must not only be of the correct mode but also be
7409 composed of AdvSIMD vector element types (e.g.
7410 __builtin_aarch64_simd_qi); these types are registered by
7411 aarch64_init_simd_builtins (). In other words, vector types defined
7412 in other ways, e.g. via the vector_size attribute, will get default
7413 mangled names. */
7414 typedef struct
7416 enum machine_mode mode;
7417 const char *element_type_name;
7418 const char *mangled_name;
7419 } aarch64_simd_mangle_map_entry;
7421 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7422 /* 64-bit containerized types. */
7423 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7424 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7425 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7426 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7427 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7428 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7429 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7430 { DImode, "__builtin_aarch64_simd_di", "11__Int64x1_t" },
7431 { DImode, "__builtin_aarch64_simd_udi", "12__Uint64x1_t" },
7432 { V1DFmode, "__builtin_aarch64_simd_df", "13__Float64x1_t" },
7433 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7434 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7435 /* 128-bit containerized types. */
7436 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7437 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7438 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7439 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7440 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7441 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7442 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7443 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7444 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7445 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7446 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7447 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7448 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7449 { VOIDmode, NULL, NULL }
7452 /* Implement TARGET_MANGLE_TYPE. */
7454 static const char *
7455 aarch64_mangle_type (const_tree type)
7457 /* The AArch64 ABI documents say that "__va_list" has to be
7458 mangled as if it were in the "std" namespace. */
7459 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7460 return "St9__va_list";
7462 /* Check the mode of the vector type, and the name of the vector
7463 element type, against the table. */
7464 if (TREE_CODE (type) == VECTOR_TYPE)
7466 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7468 while (pos->mode != VOIDmode)
7470 tree elt_type = TREE_TYPE (type);
7472 if (pos->mode == TYPE_MODE (type)
7473 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7474 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7475 pos->element_type_name))
7476 return pos->mangled_name;
7478 pos++;
7482 /* Use the default mangling. */
7483 return NULL;
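/* Editorial example (not part of the original sources): given the table
   above, a C++ function "void f (int32x4_t)" declared in terms of the
   AdvSIMD element types mangles as "_Z1f11__Int32x4_t", whereas a
   V4SImode type built with __attribute__ ((vector_size (16))) on int
   does not match any entry and receives the default mangling.  */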
7486 /* Return the equivalent letter for size. */
7487 static char
7488 sizetochar (int size)
7490 switch (size)
7492 case 64: return 'd';
7493 case 32: return 's';
7494 case 16: return 'h';
7495 case 8 : return 'b';
7496 default: gcc_unreachable ();
7500 /* Return true iff x is a uniform vector of floating-point
7501 constants, and the constant can be represented in
7502 quarter-precision form. Note, as aarch64_float_const_representable_p
7503 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
7504 static bool
7505 aarch64_vect_float_const_representable_p (rtx x)
7507 int i = 0;
7508 REAL_VALUE_TYPE r0, ri;
7509 rtx x0, xi;
7511 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7512 return false;
7514 x0 = CONST_VECTOR_ELT (x, 0);
7515 if (!CONST_DOUBLE_P (x0))
7516 return false;
7518 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7520 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7522 xi = CONST_VECTOR_ELT (x, i);
7523 if (!CONST_DOUBLE_P (xi))
7524 return false;
7526 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7527 if (!REAL_VALUES_EQUAL (r0, ri))
7528 return false;
7531 return aarch64_float_const_representable_p (x0);
7534 /* Return true if OP is a valid AdvSIMD immediate for MODE, false otherwise; if INFO is nonnull, describe the recognised encoding in *INFO. */
7535 bool
7536 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7537 struct simd_immediate_info *info)
7539 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7540 matches = 1; \
7541 for (i = 0; i < idx; i += (STRIDE)) \
7542 if (!(TEST)) \
7543 matches = 0; \
7544 if (matches) \
7546 immtype = (CLASS); \
7547 elsize = (ELSIZE); \
7548 eshift = (SHIFT); \
7549 emvn = (NEG); \
7550 break; \
7553 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7554 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7555 unsigned char bytes[16];
7556 int immtype = -1, matches;
7557 unsigned int invmask = inverse ? 0xff : 0;
7558 int eshift, emvn;
7560 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7562 if (! (aarch64_simd_imm_zero_p (op, mode)
7563 || aarch64_vect_float_const_representable_p (op)))
7564 return false;
7566 if (info)
7568 info->value = CONST_VECTOR_ELT (op, 0);
7569 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7570 info->mvn = false;
7571 info->shift = 0;
7574 return true;
7577 /* Splat vector constant out into a byte vector. */
7578 for (i = 0; i < n_elts; i++)
7580 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7581 it must be laid out in the vector register in reverse order. */
7582 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7583 unsigned HOST_WIDE_INT elpart;
7584 unsigned int part, parts;
7586 if (GET_CODE (el) == CONST_INT)
7588 elpart = INTVAL (el);
7589 parts = 1;
7591 else if (GET_CODE (el) == CONST_DOUBLE)
7593 elpart = CONST_DOUBLE_LOW (el);
7594 parts = 2;
7596 else
7597 gcc_unreachable ();
7599 for (part = 0; part < parts; part++)
7601 unsigned int byte;
7602 for (byte = 0; byte < innersize; byte++)
7604 bytes[idx++] = (elpart & 0xff) ^ invmask;
7605 elpart >>= BITS_PER_UNIT;
7607 if (GET_CODE (el) == CONST_DOUBLE)
7608 elpart = CONST_DOUBLE_HIGH (el);
7612 /* Sanity check. */
7613 gcc_assert (idx == GET_MODE_SIZE (mode));
7617 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7618 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7620 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7621 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7623 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7624 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7626 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7627 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7629 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7631 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7633 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7634 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7636 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7637 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7639 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7640 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7642 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7643 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7645 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7647 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7649 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7650 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7652 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7653 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7655 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7656 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7658 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7659 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7661 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7663 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7664 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7666 while (0);
7668 if (immtype == -1)
7669 return false;
7671 if (info)
7673 info->element_width = elsize;
7674 info->mvn = emvn != 0;
7675 info->shift = eshift;
7677 unsigned HOST_WIDE_INT imm = 0;
7679 if (immtype >= 12 && immtype <= 15)
7680 info->msl = true;
7682 /* Un-invert bytes of recognized vector, if necessary. */
7683 if (invmask != 0)
7684 for (i = 0; i < idx; i++)
7685 bytes[i] ^= invmask;
7687 if (immtype == 17)
7689 /* FIXME: Broken on 32-bit H_W_I hosts. */
7690 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7692 for (i = 0; i < 8; i++)
7693 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7694 << (i * BITS_PER_UNIT);
7697 info->value = GEN_INT (imm);
7699 else
7701 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7702 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7704 /* Construct 'abcdefgh' because the assembler cannot handle
7705 generic constants. */
7706 if (info->mvn)
7707 imm = ~imm;
7708 imm = (imm >> info->shift) & 0xff;
7709 info->value = GEN_INT (imm);
7713 return true;
7714 #undef CHECK
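/* Editorial worked example of the CHECK table above: a V4SImode constant
   with every element equal to 0x00ab0000 splats to the byte pattern
   { 0x00, 0x00, 0xab, 0x00 } repeated, which matches immtype 2 (ELSIZE
   32, SHIFT 16, no inversion); the returned simd_immediate_info then
   holds value 0xab and shift 16, i.e. the operands of a
   "movi Vd.4s, #0xab, lsl #16" style instruction.  This is a reading of
   the code above, not a full description of the AdvSIMD immediate
   formats.  */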
7717 static bool
7718 aarch64_const_vec_all_same_int_p (rtx x,
7719 HOST_WIDE_INT minval,
7720 HOST_WIDE_INT maxval)
7722 HOST_WIDE_INT firstval;
7723 int count, i;
7725 if (GET_CODE (x) != CONST_VECTOR
7726 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
7727 return false;
7729 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
7730 if (firstval < minval || firstval > maxval)
7731 return false;
7733 count = CONST_VECTOR_NUNITS (x);
7734 for (i = 1; i < count; i++)
7735 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
7736 return false;
7738 return true;
7741 /* Check whether immediate shift constants are within range. */
7742 bool
7743 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7745 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7746 if (left)
7747 return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
7748 else
7749 return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
7752 /* Return true if X is a uniform vector where all elements
7753 are either the floating-point constant 0.0 or the
7754 integer constant 0. */
7755 bool
7756 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7758 return x == CONST0_RTX (mode);
7761 bool
7762 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7764 HOST_WIDE_INT imm = INTVAL (x);
7765 int i;
7767 for (i = 0; i < 8; i++)
7769 unsigned int byte = imm & 0xff;
7770 if (byte != 0xff && byte != 0)
7771 return false;
7772 imm >>= 8;
7775 return true;
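/* Editorial example: the loop above accepts any 64-bit value whose eight
   bytes are each 0x00 or 0xff, e.g. 0xff00ff00ff00ff00, which is the
   byte-mask immediate class of the 64-bit MOVI form; a value containing
   any other byte, such as 0x12, is rejected.  */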
7778 bool
7779 aarch64_mov_operand_p (rtx x,
7780 enum aarch64_symbol_context context,
7781 enum machine_mode mode)
7783 if (GET_CODE (x) == HIGH
7784 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7785 return true;
7787 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7788 return true;
7790 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7791 return true;
7793 return aarch64_classify_symbolic_expression (x, context)
7794 == SYMBOL_TINY_ABSOLUTE;
7797 /* Return a const_int vector of VAL. */
7799 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7801 int nunits = GET_MODE_NUNITS (mode);
7802 rtvec v = rtvec_alloc (nunits);
7803 int i;
7805 for (i = 0; i < nunits; i++)
7806 RTVEC_ELT (v, i) = GEN_INT (val);
7808 return gen_rtx_CONST_VECTOR (mode, v);
7811 /* Check OP is a legal scalar immediate for the MOVI instruction. */
7813 bool
7814 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7816 enum machine_mode vmode;
7818 gcc_assert (!VECTOR_MODE_P (mode));
7819 vmode = aarch64_preferred_simd_mode (mode);
7820 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7821 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7824 /* Construct and return a PARALLEL RTX vector. */
7826 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7828 int nunits = GET_MODE_NUNITS (mode);
7829 rtvec v = rtvec_alloc (nunits / 2);
7830 int base = high ? nunits / 2 : 0;
7831 rtx t1;
7832 int i;
7834 for (i = 0; i < nunits / 2; i++)
7835 RTVEC_ELT (v, i) = GEN_INT (base + i);
7837 t1 = gen_rtx_PARALLEL (mode, v);
7838 return t1;
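/* Editorial example: for V4SImode, HIGH == false yields
   (parallel [(const_int 0) (const_int 1)]) and HIGH == true yields
   (parallel [(const_int 2) (const_int 3)]), i.e. lane selectors for the
   low and high halves of the vector.  */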
7841 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
7842 HIGH (exclusive). */
7843 void
7844 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7846 HOST_WIDE_INT lane;
7847 gcc_assert (GET_CODE (operand) == CONST_INT);
7848 lane = INTVAL (operand);
7850 if (lane < low || lane >= high)
7851 error ("lane out of range");
7854 void
7855 aarch64_simd_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7857 gcc_assert (GET_CODE (operand) == CONST_INT);
7858 HOST_WIDE_INT lane = INTVAL (operand);
7860 if (lane < low || lane >= high)
7861 error ("constant out of range");
7864 /* Emit code to reinterpret one AdvSIMD type as another,
7865 without altering bits. */
7866 void
7867 aarch64_simd_reinterpret (rtx dest, rtx src)
7869 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), src));
7872 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
7873 registers). */
7874 void
7875 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
7876 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
7877 rtx op1)
7879 rtx mem = gen_rtx_MEM (mode, destaddr);
7880 rtx tmp1 = gen_reg_rtx (mode);
7881 rtx tmp2 = gen_reg_rtx (mode);
7883 emit_insn (intfn (tmp1, op1, tmp2));
7885 emit_move_insn (mem, tmp1);
7886 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
7887 emit_move_insn (mem, tmp2);
7890 /* Return TRUE if OP is a valid vector addressing mode. */
7891 bool
7892 aarch64_simd_mem_operand_p (rtx op)
7894 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
7895 || GET_CODE (XEXP (op, 0)) == REG);
7898 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
7899 not to early-clobber SRC registers in the process.
7901 We assume that the operands described by SRC and DEST represent a
7902 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
7903 number of components into which the copy has been decomposed. */
7904 void
7905 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
7906 rtx *src, unsigned int count)
7908 unsigned int i;
7910 if (!reg_overlap_mentioned_p (operands[0], operands[1])
7911 || REGNO (operands[0]) < REGNO (operands[1]))
7913 for (i = 0; i < count; i++)
7915 operands[2 * i] = dest[i];
7916 operands[2 * i + 1] = src[i];
7919 else
7921 for (i = 0; i < count; i++)
7923 operands[2 * i] = dest[count - i - 1];
7924 operands[2 * i + 1] = src[count - i - 1];
7929 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
7930 one of the VSTRUCT modes: OI, CI or XI. */
7932 aarch64_simd_attr_length_move (rtx insn)
7934 enum machine_mode mode;
7936 extract_insn_cached (insn);
7938 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
7940 mode = GET_MODE (recog_data.operand[0]);
7941 switch (mode)
7943 case OImode:
7944 return 8;
7945 case CImode:
7946 return 12;
7947 case XImode:
7948 return 16;
7949 default:
7950 gcc_unreachable ();
7953 return 4;
7956 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
7957 alignment of a vector to 128 bits. */
7958 static HOST_WIDE_INT
7959 aarch64_simd_vector_alignment (const_tree type)
7961 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
7962 return MIN (align, 128);
7965 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
7966 static bool
7967 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
7969 if (is_packed)
7970 return false;
7972 /* We guarantee alignment for vectors up to 128 bits. */
7973 if (tree_int_cst_compare (TYPE_SIZE (type),
7974 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
7975 return false;
7977 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
7978 return true;
7981 /* If VALS is a vector constant that can be loaded into a register
7982 using DUP, generate instructions to do so and return an RTX to
7983 assign to the register. Otherwise return NULL_RTX. */
7984 static rtx
7985 aarch64_simd_dup_constant (rtx vals)
7987 enum machine_mode mode = GET_MODE (vals);
7988 enum machine_mode inner_mode = GET_MODE_INNER (mode);
7989 int n_elts = GET_MODE_NUNITS (mode);
7990 bool all_same = true;
7991 rtx x;
7992 int i;
7994 if (GET_CODE (vals) != CONST_VECTOR)
7995 return NULL_RTX;
7997 for (i = 1; i < n_elts; ++i)
7999 x = CONST_VECTOR_ELT (vals, i);
8000 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8001 all_same = false;
8004 if (!all_same)
8005 return NULL_RTX;
8007 /* We can load this constant by using DUP and a constant in a
8008 single general-purpose register. This will be cheaper than a vector
8009 load. */
8010 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8011 return gen_rtx_VEC_DUPLICATE (mode, x);
8015 /* Generate code to load VALS, which is a PARALLEL containing only
8016 constants (for vec_init) or CONST_VECTOR, efficiently into a
8017 register. Returns an RTX to copy into the register, or NULL_RTX
8018 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8019 static rtx
8020 aarch64_simd_make_constant (rtx vals)
8022 enum machine_mode mode = GET_MODE (vals);
8023 rtx const_dup;
8024 rtx const_vec = NULL_RTX;
8025 int n_elts = GET_MODE_NUNITS (mode);
8026 int n_const = 0;
8027 int i;
8029 if (GET_CODE (vals) == CONST_VECTOR)
8030 const_vec = vals;
8031 else if (GET_CODE (vals) == PARALLEL)
8033 /* A CONST_VECTOR must contain only CONST_INTs and
8034 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8035 Only store valid constants in a CONST_VECTOR. */
8036 for (i = 0; i < n_elts; ++i)
8038 rtx x = XVECEXP (vals, 0, i);
8039 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8040 n_const++;
8042 if (n_const == n_elts)
8043 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8045 else
8046 gcc_unreachable ();
8048 if (const_vec != NULL_RTX
8049 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8050 /* Load using MOVI/MVNI. */
8051 return const_vec;
8052 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8053 /* Loaded using DUP. */
8054 return const_dup;
8055 else if (const_vec != NULL_RTX)
8056 /* Load from constant pool. We cannot take advantage of single-cycle
8057 LD1 because we need a PC-relative addressing mode. */
8058 return const_vec;
8059 else
8060 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8061 We cannot construct an initializer. */
8062 return NULL_RTX;
8065 void
8066 aarch64_expand_vector_init (rtx target, rtx vals)
8068 enum machine_mode mode = GET_MODE (target);
8069 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8070 int n_elts = GET_MODE_NUNITS (mode);
8071 int n_var = 0, one_var = -1;
8072 bool all_same = true;
8073 rtx x, mem;
8074 int i;
8076 x = XVECEXP (vals, 0, 0);
8077 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8078 n_var = 1, one_var = 0;
8080 for (i = 1; i < n_elts; ++i)
8082 x = XVECEXP (vals, 0, i);
8083 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8084 ++n_var, one_var = i;
8086 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8087 all_same = false;
8090 if (n_var == 0)
8092 rtx constant = aarch64_simd_make_constant (vals);
8093 if (constant != NULL_RTX)
8095 emit_move_insn (target, constant);
8096 return;
8100 /* Splat a single non-constant element if we can. */
8101 if (all_same)
8103 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8104 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8105 return;
8108 /* One field is non-constant. Load constant then overwrite varying
8109 field. This is more efficient than using the stack. */
8110 if (n_var == 1)
8112 rtx copy = copy_rtx (vals);
8113 rtx index = GEN_INT (one_var);
8114 enum insn_code icode;
8116 /* Load constant part of vector, substitute neighboring value for
8117 varying element. */
8118 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8119 aarch64_expand_vector_init (target, copy);
8121 /* Insert variable. */
8122 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8123 icode = optab_handler (vec_set_optab, mode);
8124 gcc_assert (icode != CODE_FOR_nothing);
8125 emit_insn (GEN_FCN (icode) (target, x, index));
8126 return;
8129 /* Construct the vector in memory one field at a time
8130 and load the whole vector. */
8131 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8132 for (i = 0; i < n_elts; i++)
8133 emit_move_insn (adjust_address_nv (mem, inner_mode,
8134 i * GET_MODE_SIZE (inner_mode)),
8135 XVECEXP (vals, 0, i));
8136 emit_move_insn (target, mem);
8140 static unsigned HOST_WIDE_INT
8141 aarch64_shift_truncation_mask (enum machine_mode mode)
8143 return
8144 (aarch64_vector_mode_supported_p (mode)
8145 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8148 #ifndef TLS_SECTION_ASM_FLAG
8149 #define TLS_SECTION_ASM_FLAG 'T'
8150 #endif
8152 void
8153 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8154 tree decl ATTRIBUTE_UNUSED)
8156 char flagchars[10], *f = flagchars;
8158 /* If we have already declared this section, we can use an
8159 abbreviated form to switch back to it -- unless this section is
8160 part of a COMDAT group, in which case GAS requires the full
8161 declaration every time. */
8162 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8163 && (flags & SECTION_DECLARED))
8165 fprintf (asm_out_file, "\t.section\t%s\n", name);
8166 return;
8169 if (!(flags & SECTION_DEBUG))
8170 *f++ = 'a';
8171 if (flags & SECTION_WRITE)
8172 *f++ = 'w';
8173 if (flags & SECTION_CODE)
8174 *f++ = 'x';
8175 if (flags & SECTION_SMALL)
8176 *f++ = 's';
8177 if (flags & SECTION_MERGE)
8178 *f++ = 'M';
8179 if (flags & SECTION_STRINGS)
8180 *f++ = 'S';
8181 if (flags & SECTION_TLS)
8182 *f++ = TLS_SECTION_ASM_FLAG;
8183 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8184 *f++ = 'G';
8185 *f = '\0';
8187 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8189 if (!(flags & SECTION_NOTYPE))
8191 const char *type;
8192 const char *format;
8194 if (flags & SECTION_BSS)
8195 type = "nobits";
8196 else
8197 type = "progbits";
8199 #ifdef TYPE_OPERAND_FMT
8200 format = "," TYPE_OPERAND_FMT;
8201 #else
8202 format = ",@%s";
8203 #endif
8205 fprintf (asm_out_file, format, type);
8207 if (flags & SECTION_ENTSIZE)
8208 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8209 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8211 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8212 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8213 else
8214 fprintf (asm_out_file, ",%s,comdat",
8215 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8219 putc ('\n', asm_out_file);
8222 /* Select a format to encode pointers in exception handling data. */
8224 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8226 int type;
8227 switch (aarch64_cmodel)
8229 case AARCH64_CMODEL_TINY:
8230 case AARCH64_CMODEL_TINY_PIC:
8231 case AARCH64_CMODEL_SMALL:
8232 case AARCH64_CMODEL_SMALL_PIC:
8233 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8234 for everything. */
8235 type = DW_EH_PE_sdata4;
8236 break;
8237 default:
8238 /* No assumptions here. 8-byte relocs required. */
8239 type = DW_EH_PE_sdata8;
8240 break;
8242 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8245 /* Emit load exclusive. */
8247 static void
8248 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8249 rtx mem, rtx model_rtx)
8251 rtx (*gen) (rtx, rtx, rtx);
8253 switch (mode)
8255 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8256 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8257 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8258 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8259 default:
8260 gcc_unreachable ();
8263 emit_insn (gen (rval, mem, model_rtx));
8266 /* Emit store exclusive. */
8268 static void
8269 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8270 rtx rval, rtx mem, rtx model_rtx)
8272 rtx (*gen) (rtx, rtx, rtx, rtx);
8274 switch (mode)
8276 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8277 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8278 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8279 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8280 default:
8281 gcc_unreachable ();
8284 emit_insn (gen (bval, rval, mem, model_rtx));
8287 /* Emit the jump insn INSN and mark it as very unlikely to be taken. */
8289 static void
8290 aarch64_emit_unlikely_jump (rtx insn)
8292 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8294 insn = emit_jump_insn (insn);
8295 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8298 /* Expand a compare and swap pattern. */
8300 void
8301 aarch64_expand_compare_and_swap (rtx operands[])
8303 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8304 enum machine_mode mode, cmp_mode;
8305 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8307 bval = operands[0];
8308 rval = operands[1];
8309 mem = operands[2];
8310 oldval = operands[3];
8311 newval = operands[4];
8312 is_weak = operands[5];
8313 mod_s = operands[6];
8314 mod_f = operands[7];
8315 mode = GET_MODE (mem);
8316 cmp_mode = mode;
8318 /* Normally the succ memory model must be stronger than fail, but in the
8319 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8320 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8322 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8323 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8324 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8326 switch (mode)
8328 case QImode:
8329 case HImode:
8330 /* For short modes, we're going to perform the comparison in SImode,
8331 so do the zero-extension now. */
8332 cmp_mode = SImode;
8333 rval = gen_reg_rtx (SImode);
8334 oldval = convert_modes (SImode, mode, oldval, true);
8335 /* Fall through. */
8337 case SImode:
8338 case DImode:
8339 /* Force the value into a register if needed. */
8340 if (!aarch64_plus_operand (oldval, mode))
8341 oldval = force_reg (cmp_mode, oldval);
8342 break;
8344 default:
8345 gcc_unreachable ();
8348 switch (mode)
8350 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8351 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8352 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8353 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8354 default:
8355 gcc_unreachable ();
8358 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8360 if (mode == QImode || mode == HImode)
8361 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8363 x = gen_rtx_REG (CCmode, CC_REGNUM);
8364 x = gen_rtx_EQ (SImode, x, const0_rtx);
8365 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8368 /* Split a compare and swap pattern. */
8370 void
8371 aarch64_split_compare_and_swap (rtx operands[])
8373 rtx rval, mem, oldval, newval, scratch;
8374 enum machine_mode mode;
8375 bool is_weak;
8376 rtx label1, label2, x, cond;
8378 rval = operands[0];
8379 mem = operands[1];
8380 oldval = operands[2];
8381 newval = operands[3];
8382 is_weak = (operands[4] != const0_rtx);
8383 scratch = operands[7];
8384 mode = GET_MODE (mem);
8386 label1 = NULL_RTX;
8387 if (!is_weak)
8389 label1 = gen_label_rtx ();
8390 emit_label (label1);
8392 label2 = gen_label_rtx ();
8394 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8396 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8397 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8398 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8399 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8400 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8402 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8404 if (!is_weak)
8406 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8407 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8408 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8409 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8411 else
8413 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8414 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8415 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8418 emit_label (label2);
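/* As an illustration (register numbers arbitrary, exact mnemonics depending
   on the memory model), the strong form of the split above typically yields
   a load-exclusive/store-exclusive retry loop roughly like:

       .Lretry:
         ldaxr   w0, [x1]          // load-exclusive the current value
         cmp     w0, w2            // compare against the expected value
         b.ne    .Ldone            // mismatch: fail without storing
         stlxr   w3, w4, [x1]      // try to store the new value
         cbnz    w3, .Lretry       // store-exclusive failed: retry
       .Ldone:

   The weak form omits the backward branch and instead sets the condition
   flags directly from the store-exclusive result.  */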
8421 /* Split an atomic operation. */
8423 void
8424 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8425 rtx value, rtx model_rtx, rtx cond)
8427 enum machine_mode mode = GET_MODE (mem);
8428 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8429 rtx label, x;
8431 label = gen_label_rtx ();
8432 emit_label (label);
8434 if (new_out)
8435 new_out = gen_lowpart (wmode, new_out);
8436 if (old_out)
8437 old_out = gen_lowpart (wmode, old_out);
8438 else
8439 old_out = new_out;
8440 value = simplify_gen_subreg (wmode, value, mode, 0);
8442 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8444 switch (code)
8446 case SET:
8447 new_out = value;
8448 break;
8450 case NOT:
8451 x = gen_rtx_AND (wmode, old_out, value);
8452 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8453 x = gen_rtx_NOT (wmode, new_out);
8454 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8455 break;
8457 case MINUS:
8458 if (CONST_INT_P (value))
8460 value = GEN_INT (-INTVAL (value));
8461 code = PLUS;
8463 /* Fall through. */
8465 default:
8466 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8467 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8468 break;
8471 aarch64_emit_store_exclusive (mode, cond, mem,
8472 gen_lowpart (mode, new_out), model_rtx);
8474 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8475 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8476 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8477 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
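/* As an illustration, an atomic fetch-and-add split this way expands to
   roughly the following loop (sequentially consistent variant shown,
   register numbers arbitrary):

       .Lretry:
         ldaxr   w0, [x2]          // old_out = *mem  (load-exclusive)
         add     w1, w0, w3        // new_out = old_out + value
         stlxr   w4, w1, [x2]      // try to store new_out
         cbnz    w4, .Lretry       // retry if the exclusive store failed  */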
8480 static void
8481 aarch64_print_extension (void)
8483 const struct aarch64_option_extension *opt = NULL;
8485 for (opt = all_extensions; opt->name != NULL; opt++)
8486 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8487 asm_fprintf (asm_out_file, "+%s", opt->name);
8489 asm_fprintf (asm_out_file, "\n");
8492 static void
8493 aarch64_start_file (void)
8495 if (selected_arch)
8497 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8498 aarch64_print_extension ();
8500 else if (selected_cpu)
8502 const char *truncated_name
8503 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8504 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8505 aarch64_print_extension ();
8507 default_file_start();
8510 /* Target hook for c_mode_for_suffix. */
8511 static enum machine_mode
8512 aarch64_c_mode_for_suffix (char suffix)
8514 if (suffix == 'q')
8515 return TFmode;
8517 return VOIDmode;
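/* For example, with this hook a constant written as 1.0q in C source is
   given TFmode, the 128-bit IEEE quad-precision mode.  */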
8520 /* We can only represent floating point constants which will fit in
8521 "quarter-precision" values. These values are characterised by
8522 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
8525 (-1)^s * (n/16) * 2^r
8527 Where:
8528 's' is the sign bit.
8529 'n' is an integer in the range 16 <= n <= 31.
8530 'r' is an integer in the range -3 <= r <= 4. */
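/* For example, 0.5 = (16/16) * 2^-1 (s = 0, n = 16, r = -1) and
   31.0 = (31/16) * 2^4 are representable, so the representable positive
   values run from 0.125 up to 31.0; a value such as 0.1 has no (n, r)
   pair of this form and so cannot be used as an FMOV immediate.  */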
8532 /* Return true iff X can be represented by a quarter-precision
8533 floating point immediate operand. Note, we cannot represent 0.0. */
8534 bool
8535 aarch64_float_const_representable_p (rtx x)
8537 /* This represents our current view of how many bits
8538 make up the mantissa. */
8539 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8540 int exponent;
8541 unsigned HOST_WIDE_INT mantissa, mask;
8542 REAL_VALUE_TYPE r, m;
8543 bool fail;
8545 if (!CONST_DOUBLE_P (x))
8546 return false;
8548 if (GET_MODE (x) == VOIDmode)
8549 return false;
8551 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8553 /* We cannot represent infinities, NaNs or +/-zero. We won't
8554 know if we have +zero until we analyse the mantissa, but we
8555 can reject the other invalid values. */
8556 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8557 || REAL_VALUE_MINUS_ZERO (r))
8558 return false;
8560 /* Extract exponent. */
8561 r = real_value_abs (&r);
8562 exponent = REAL_EXP (&r);
8564 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8565 highest (sign) bit, with a fixed binary point at bit point_pos.
8566 The low element of the wide_int W below holds the low part of the mantissa, the high element the high part.
8567 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8568 bits for the mantissa, this can fail (low bits will be lost). */
8569 real_ldexp (&m, &r, point_pos - exponent);
8570 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8572 /* If the low part of the mantissa has bits set we cannot represent
8573 the value. */
8574 if (w.elt (0) != 0)
8575 return false;
8576 /* We have rejected the lower HOST_WIDE_INT, so update our
8577 understanding of how many bits lie in the mantissa and
8578 look only at the high HOST_WIDE_INT. */
8579 mantissa = w.elt (1);
8580 point_pos -= HOST_BITS_PER_WIDE_INT;
8582 /* We can only represent values with a mantissa of the form 1.xxxx. */
8583 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8584 if ((mantissa & mask) != 0)
8585 return false;
8587 /* Having filtered unrepresentable values, we may now remove all
8588 but the highest 5 bits. */
8589 mantissa >>= point_pos - 5;
8591 /* We cannot represent the value 0.0, so reject it. This is handled
8592 elsewhere. */
8593 if (mantissa == 0)
8594 return false;
8596 /* Then, as bit 4 is always set, we can mask it off, leaving
8597 the mantissa in the range [0, 15]. */
8598 mantissa &= ~(1 << 4);
8599 gcc_assert (mantissa <= 15);
8601 /* GCC internally does not use IEEE754-like encoding (where normalized
8602 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
8603 Our mantissa values are shifted 4 places to the left relative to
8604 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8605 by 5 places to correct for GCC's representation. */
8606 exponent = 5 - exponent;
8608 return (exponent >= 0 && exponent <= 7);
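/* For instance, for the value 1.0 REAL_EXP returns 1 (1.0 is 0.5 * 2^1 in
   GCC's [0.5, 1) normalisation), giving exponent = 5 - 1 = 4, which sits
   inside the accepted [0, 7] range and corresponds to r = 0 in the
   quarter-precision formula above.  */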
8611 char*
8612 aarch64_output_simd_mov_immediate (rtx const_vector,
8613 enum machine_mode mode,
8614 unsigned width)
8616 bool is_valid;
8617 static char templ[40];
8618 const char *mnemonic;
8619 const char *shift_op;
8620 unsigned int lane_count = 0;
8621 char element_char;
8623 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8625 /* This will return true to show CONST_VECTOR is legal for use as
8626 an AdvSIMD MOVI (or, implicitly, MVNI) immediate. It will
8627 also update INFO to show how the immediate should be generated. */
8628 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8629 gcc_assert (is_valid);
8631 element_char = sizetochar (info.element_width);
8632 lane_count = width / info.element_width;
8634 mode = GET_MODE_INNER (mode);
8635 if (mode == SFmode || mode == DFmode)
8637 gcc_assert (info.shift == 0 && ! info.mvn);
8638 if (aarch64_float_const_zero_rtx_p (info.value))
8639 info.value = GEN_INT (0);
8640 else
8642 #define buf_size 20
8643 REAL_VALUE_TYPE r;
8644 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8645 char float_buf[buf_size] = {'\0'};
8646 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8647 #undef buf_size
8649 if (lane_count == 1)
8650 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8651 else
8652 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8653 lane_count, element_char, float_buf);
8654 return templ;
8658 mnemonic = info.mvn ? "mvni" : "movi";
8659 shift_op = info.msl ? "msl" : "lsl";
8661 if (lane_count == 1)
8662 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8663 mnemonic, UINTVAL (info.value));
8664 else if (info.shift)
8665 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8666 ", %s %d", mnemonic, lane_count, element_char,
8667 UINTVAL (info.value), shift_op, info.shift);
8668 else
8669 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8670 mnemonic, lane_count, element_char, UINTVAL (info.value));
8671 return templ;
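/* As a rough example of the strings built above, a V4SI vector with every
   lane equal to 256 would typically be emitted as
   "movi  v0.4s, 0x1, lsl 8", while a V2DF vector of 1.0 takes the
   floating-point path and emits an "fmov" of the decimal value.  */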
8674 char*
8675 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8676 enum machine_mode mode)
8678 enum machine_mode vmode;
8680 gcc_assert (!VECTOR_MODE_P (mode));
8681 vmode = aarch64_simd_container_mode (mode, 64);
8682 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8683 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8686 /* Split operands into moves from op[1] + op[2] into op[0]. */
8688 void
8689 aarch64_split_combinev16qi (rtx operands[3])
8691 unsigned int dest = REGNO (operands[0]);
8692 unsigned int src1 = REGNO (operands[1]);
8693 unsigned int src2 = REGNO (operands[2]);
8694 enum machine_mode halfmode = GET_MODE (operands[1]);
8695 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8696 rtx destlo, desthi;
8698 gcc_assert (halfmode == V16QImode);
8700 if (src1 == dest && src2 == dest + halfregs)
8702 /* No-op move. Can't split to nothing; emit something. */
8703 emit_note (NOTE_INSN_DELETED);
8704 return;
8707 /* Preserve register attributes for variable tracking. */
8708 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8709 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8710 GET_MODE_SIZE (halfmode));
8712 /* Special case of reversed high/low parts. */
8713 if (reg_overlap_mentioned_p (operands[2], destlo)
8714 && reg_overlap_mentioned_p (operands[1], desthi))
8716 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8717 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8718 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
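/* The three XORs above are the classic XOR-swap: they exchange the two
   source registers in place without needing a scratch vector register.  */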
8720 else if (!reg_overlap_mentioned_p (operands[2], destlo))
8722 /* Try to avoid unnecessary moves if part of the result
8723 is in the right place already. */
8724 if (src1 != dest)
8725 emit_move_insn (destlo, operands[1]);
8726 if (src2 != dest + halfregs)
8727 emit_move_insn (desthi, operands[2]);
8729 else
8731 if (src2 != dest + halfregs)
8732 emit_move_insn (desthi, operands[2]);
8733 if (src1 != dest)
8734 emit_move_insn (destlo, operands[1]);
8738 /* vec_perm support. */
8740 #define MAX_VECT_LEN 16
8742 struct expand_vec_perm_d
8744 rtx target, op0, op1;
8745 unsigned char perm[MAX_VECT_LEN];
8746 enum machine_mode vmode;
8747 unsigned char nelt;
8748 bool one_vector_p;
8749 bool testing_p;
8752 /* Generate a variable permutation. */
8754 static void
8755 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8757 enum machine_mode vmode = GET_MODE (target);
8758 bool one_vector_p = rtx_equal_p (op0, op1);
8760 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8761 gcc_checking_assert (GET_MODE (op0) == vmode);
8762 gcc_checking_assert (GET_MODE (op1) == vmode);
8763 gcc_checking_assert (GET_MODE (sel) == vmode);
8764 gcc_checking_assert (TARGET_SIMD);
8766 if (one_vector_p)
8768 if (vmode == V8QImode)
8770 /* Expand the argument to a V16QI mode by duplicating it. */
8771 rtx pair = gen_reg_rtx (V16QImode);
8772 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8773 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8775 else
8777 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8780 else
8782 rtx pair;
8784 if (vmode == V8QImode)
8786 pair = gen_reg_rtx (V16QImode);
8787 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8788 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8790 else
8792 pair = gen_reg_rtx (OImode);
8793 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8794 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8799 void
8800 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8802 enum machine_mode vmode = GET_MODE (target);
8803 unsigned int nelt = GET_MODE_NUNITS (vmode);
8804 bool one_vector_p = rtx_equal_p (op0, op1);
8805 rtx mask;
8807 /* The TBL instruction does not use a modulo index, so we must take care
8808 of that ourselves. */
8809 mask = aarch64_simd_gen_const_vector_dup (vmode,
8810 one_vector_p ? nelt - 1 : 2 * nelt - 1);
8811 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8813 /* For big-endian, we also need to reverse the index within the vector
8814 (but not which vector). */
8815 if (BYTES_BIG_ENDIAN)
8817 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
8818 if (!one_vector_p)
8819 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
8820 sel = expand_simple_binop (vmode, XOR, sel, mask,
8821 NULL, 0, OPTAB_LIB_WIDEN);
8823 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
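/* For example, for a two-operand V8QI permute nelt is 8, so the mask built
   above is a vector of 15s and each selector byte is reduced modulo 16
   before the TBL; on big-endian the additional XOR with 7 then reverses
   the index within each vector.  */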
8826 /* Recognize patterns suitable for the TRN instructions. */
8827 static bool
8828 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8830 unsigned int i, odd, mask, nelt = d->nelt;
8831 rtx out, in0, in1, x;
8832 rtx (*gen) (rtx, rtx, rtx);
8833 enum machine_mode vmode = d->vmode;
8835 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8836 return false;
8838 /* Note that these are little-endian tests.
8839 We correct for big-endian later. */
8840 if (d->perm[0] == 0)
8841 odd = 0;
8842 else if (d->perm[0] == 1)
8843 odd = 1;
8844 else
8845 return false;
8846 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8848 for (i = 0; i < nelt; i += 2)
8850 if (d->perm[i] != i + odd)
8851 return false;
8852 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
8853 return false;
8856 /* Success! */
8857 if (d->testing_p)
8858 return true;
8860 in0 = d->op0;
8861 in1 = d->op1;
8862 if (BYTES_BIG_ENDIAN)
8864 x = in0, in0 = in1, in1 = x;
8865 odd = !odd;
8867 out = d->target;
8869 if (odd)
8871 switch (vmode)
8873 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
8874 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
8875 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
8876 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
8877 case V4SImode: gen = gen_aarch64_trn2v4si; break;
8878 case V2SImode: gen = gen_aarch64_trn2v2si; break;
8879 case V2DImode: gen = gen_aarch64_trn2v2di; break;
8880 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
8881 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
8882 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
8883 default:
8884 return false;
8887 else
8889 switch (vmode)
8891 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
8892 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
8893 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
8894 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
8895 case V4SImode: gen = gen_aarch64_trn1v4si; break;
8896 case V2SImode: gen = gen_aarch64_trn1v2si; break;
8897 case V2DImode: gen = gen_aarch64_trn1v2di; break;
8898 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
8899 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
8900 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
8901 default:
8902 return false;
8906 emit_insn (gen (out, in0, in1));
8907 return true;
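/* For example, for V4SI inputs the (little-endian) index vector
   { 0, 4, 2, 6 } is matched as TRN1 and { 1, 5, 3, 7 } as TRN2.  */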
8910 /* Recognize patterns suitable for the UZP instructions. */
8911 static bool
8912 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
8914 unsigned int i, odd, mask, nelt = d->nelt;
8915 rtx out, in0, in1, x;
8916 rtx (*gen) (rtx, rtx, rtx);
8917 enum machine_mode vmode = d->vmode;
8919 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8920 return false;
8922 /* Note that these are little-endian tests.
8923 We correct for big-endian later. */
8924 if (d->perm[0] == 0)
8925 odd = 0;
8926 else if (d->perm[0] == 1)
8927 odd = 1;
8928 else
8929 return false;
8930 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8932 for (i = 0; i < nelt; i++)
8934 unsigned elt = (i * 2 + odd) & mask;
8935 if (d->perm[i] != elt)
8936 return false;
8939 /* Success! */
8940 if (d->testing_p)
8941 return true;
8943 in0 = d->op0;
8944 in1 = d->op1;
8945 if (BYTES_BIG_ENDIAN)
8947 x = in0, in0 = in1, in1 = x;
8948 odd = !odd;
8950 out = d->target;
8952 if (odd)
8954 switch (vmode)
8956 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
8957 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
8958 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
8959 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
8960 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
8961 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
8962 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
8963 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
8964 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
8965 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
8966 default:
8967 return false;
8970 else
8972 switch (vmode)
8974 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
8975 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
8976 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
8977 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
8978 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
8979 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
8980 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
8981 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
8982 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
8983 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
8984 default:
8985 return false;
8989 emit_insn (gen (out, in0, in1));
8990 return true;
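/* For example, for V4SI inputs the (little-endian) index vector
   { 0, 2, 4, 6 } is matched as UZP1 and { 1, 3, 5, 7 } as UZP2.  */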
8993 /* Recognize patterns suitable for the ZIP instructions. */
8994 static bool
8995 aarch64_evpc_zip (struct expand_vec_perm_d *d)
8997 unsigned int i, high, mask, nelt = d->nelt;
8998 rtx out, in0, in1, x;
8999 rtx (*gen) (rtx, rtx, rtx);
9000 enum machine_mode vmode = d->vmode;
9002 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9003 return false;
9005 /* Note that these are little-endian tests.
9006 We correct for big-endian later. */
9007 high = nelt / 2;
9008 if (d->perm[0] == high)
9009 /* Do Nothing. */
9011 else if (d->perm[0] == 0)
9012 high = 0;
9013 else
9014 return false;
9015 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9017 for (i = 0; i < nelt / 2; i++)
9019 unsigned elt = (i + high) & mask;
9020 if (d->perm[i * 2] != elt)
9021 return false;
9022 elt = (elt + nelt) & mask;
9023 if (d->perm[i * 2 + 1] != elt)
9024 return false;
9027 /* Success! */
9028 if (d->testing_p)
9029 return true;
9031 in0 = d->op0;
9032 in1 = d->op1;
9033 if (BYTES_BIG_ENDIAN)
9035 x = in0, in0 = in1, in1 = x;
9036 high = !high;
9038 out = d->target;
9040 if (high)
9042 switch (vmode)
9044 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9045 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9046 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9047 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9048 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9049 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9050 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9051 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9052 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9053 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9054 default:
9055 return false;
9058 else
9060 switch (vmode)
9062 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9063 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9064 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9065 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9066 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9067 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9068 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9069 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9070 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9071 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9072 default:
9073 return false;
9077 emit_insn (gen (out, in0, in1));
9078 return true;
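/* For example, for V4SI inputs the (little-endian) index vector
   { 0, 4, 1, 5 } is matched as ZIP1 and { 2, 6, 3, 7 } as ZIP2.  */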
9081 /* Recognize patterns for the EXT insn. */
9083 static bool
9084 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9086 unsigned int i, nelt = d->nelt;
9087 rtx (*gen) (rtx, rtx, rtx, rtx);
9088 rtx offset;
9090 unsigned int location = d->perm[0]; /* Always < nelt. */
9092 /* Check if the extracted indices are increasing by one. */
9093 for (i = 1; i < nelt; i++)
9095 unsigned int required = location + i;
9096 if (d->one_vector_p)
9098 /* We'll pass the same vector in twice, so allow indices to wrap. */
9099 required &= (nelt - 1);
9101 if (d->perm[i] != required)
9102 return false;
9105 switch (d->vmode)
9107 case V16QImode: gen = gen_aarch64_extv16qi; break;
9108 case V8QImode: gen = gen_aarch64_extv8qi; break;
9109 case V4HImode: gen = gen_aarch64_extv4hi; break;
9110 case V8HImode: gen = gen_aarch64_extv8hi; break;
9111 case V2SImode: gen = gen_aarch64_extv2si; break;
9112 case V4SImode: gen = gen_aarch64_extv4si; break;
9113 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9114 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9115 case V2DImode: gen = gen_aarch64_extv2di; break;
9116 case V2DFmode: gen = gen_aarch64_extv2df; break;
9117 default:
9118 return false;
9121 /* Success! */
9122 if (d->testing_p)
9123 return true;
9125 /* The case where (location == 0) is a no-op for both big- and little-endian,
9126 and is removed by the mid-end at optimization levels -O1 and higher. */
9128 if (BYTES_BIG_ENDIAN && (location != 0))
9130 /* After setup, we want the high elements of the first vector (stored
9131 at the LSB end of the register), and the low elements of the second
9132 vector (stored at the MSB end of the register). So swap. */
9133 rtx temp = d->op0;
9134 d->op0 = d->op1;
9135 d->op1 = temp;
9136 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9137 location = nelt - location;
9140 offset = GEN_INT (location);
9141 emit_insn (gen (d->target, d->op0, d->op1, offset));
9142 return true;
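/* For example, the two-operand V4SI index vector { 1, 2, 3, 4 } is matched
   here and, on little-endian, emitted as an EXT with offset 1; in the
   single-operand case indices may wrap, so { 3, 0, 1, 2 } on V4SI becomes
   an EXT of the vector with itself and offset 3.  */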
9145 /* Recognize patterns for the REV insns. */
9147 static bool
9148 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9150 unsigned int i, j, diff, nelt = d->nelt;
9151 rtx (*gen) (rtx, rtx);
9153 if (!d->one_vector_p)
9154 return false;
9156 diff = d->perm[0];
9157 switch (diff)
9159 case 7:
9160 switch (d->vmode)
9162 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9163 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9164 default:
9165 return false;
9167 break;
9168 case 3:
9169 switch (d->vmode)
9171 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9172 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9173 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9174 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9175 default:
9176 return false;
9178 break;
9179 case 1:
9180 switch (d->vmode)
9182 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9183 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9184 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9185 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9186 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9187 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9188 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9189 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9190 default:
9191 return false;
9193 break;
9194 default:
9195 return false;
9198 for (i = 0; i < nelt ; i += diff + 1)
9199 for (j = 0; j <= diff; j += 1)
9201 /* This assertion is guaranteed to hold because the value of diff
9202 is 7, 3 or 1, so each group of diff + 1 indices fits within the
9203 vector and we always have enough elements to check. A permutation
9204 mask whose diff is any other value has already been rejected by
9205 the switch above, so reaching here with one would indicate a bug. */
9206 gcc_assert (i + j < nelt);
9207 if (d->perm[i + j] != i + diff - j)
9208 return false;
9211 /* Success! */
9212 if (d->testing_p)
9213 return true;
9215 emit_insn (gen (d->target, d->op0));
9216 return true;
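/* For example, the single-operand V8QI index vector
   { 7, 6, 5, 4, 3, 2, 1, 0 } (diff == 7) is matched as REV64, while
   { 3, 2, 1, 0, 7, 6, 5, 4 } (diff == 3) is matched as REV32.  */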
9219 static bool
9220 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9222 rtx (*gen) (rtx, rtx, rtx);
9223 rtx out = d->target;
9224 rtx in0;
9225 enum machine_mode vmode = d->vmode;
9226 unsigned int i, elt, nelt = d->nelt;
9227 rtx lane;
9229 /* TODO: This may not be big-endian safe. */
9230 if (BYTES_BIG_ENDIAN)
9231 return false;
9233 elt = d->perm[0];
9234 for (i = 1; i < nelt; i++)
9236 if (elt != d->perm[i])
9237 return false;
9240 /* The generic preparation in aarch64_expand_vec_perm_const_1
9241 swaps the operand order and the permute indices if it finds
9242 d->perm[0] to be in the second operand. Thus, we can always
9243 use d->op0 and need not do any extra arithmetic to get the
9244 correct lane number. */
9245 in0 = d->op0;
9246 lane = GEN_INT (elt);
9248 switch (vmode)
9250 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9251 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9252 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9253 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9254 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9255 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9256 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9257 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9258 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9259 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9260 default:
9261 return false;
9264 emit_insn (gen (out, in0, lane));
9265 return true;
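/* For example, a V4SI permutation whose indices are all 2 is emitted as a
   DUP of lane 2 of the first operand.  */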
9268 static bool
9269 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9271 rtx rperm[MAX_VECT_LEN], sel;
9272 enum machine_mode vmode = d->vmode;
9273 unsigned int i, nelt = d->nelt;
9275 if (d->testing_p)
9276 return true;
9278 /* Generic code will try constant permutation twice: once with the
9279 original mode and again with the elements lowered to QImode.
9280 So wait for that retry rather than expanding the selector ourselves. */
9281 if (vmode != V8QImode && vmode != V16QImode)
9282 return false;
9284 for (i = 0; i < nelt; ++i)
9286 int nunits = GET_MODE_NUNITS (vmode);
9288 /* If big-endian and two vectors are used, we end up with a weird
9289 mixed-endian mode in the AdvSIMD register file. Reverse the index
9290 within each word but not the word itself. */
9291 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9292 : d->perm[i]);
9294 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9295 sel = force_reg (vmode, sel);
9297 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9298 return true;
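/* For example, a V8QI permutation that none of the earlier matchers accept,
   say { 0, 3, 1, 4, 2, 5, 6, 7 }, falls through to here and is emitted as a
   TBL with that constant selector forced into a register (each index XORed
   with 7 on big-endian).  */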
9301 static bool
9302 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9304 /* The pattern matching functions above are written to look for a small
9305 number to begin the sequence (0, 1, N/2). If we begin with an index
9306 from the second operand, we can swap the operands. */
9307 if (d->perm[0] >= d->nelt)
9309 unsigned i, nelt = d->nelt;
9310 rtx x;
9312 gcc_assert (nelt == (nelt & -nelt));
9313 for (i = 0; i < nelt; ++i)
9314 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9316 x = d->op0;
9317 d->op0 = d->op1;
9318 d->op1 = x;
9321 if (TARGET_SIMD)
9323 if (aarch64_evpc_rev (d))
9324 return true;
9325 else if (aarch64_evpc_ext (d))
9326 return true;
9327 else if (aarch64_evpc_zip (d))
9328 return true;
9329 else if (aarch64_evpc_uzp (d))
9330 return true;
9331 else if (aarch64_evpc_trn (d))
9332 return true;
9333 else if (aarch64_evpc_dup (d))
9334 return true;
9335 return aarch64_evpc_tbl (d);
9337 return false;
9340 /* Expand a vec_perm_const pattern. */
9342 bool
9343 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9345 struct expand_vec_perm_d d;
9346 int i, nelt, which;
9348 d.target = target;
9349 d.op0 = op0;
9350 d.op1 = op1;
9352 d.vmode = GET_MODE (target);
9353 gcc_assert (VECTOR_MODE_P (d.vmode));
9354 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9355 d.testing_p = false;
9357 for (i = which = 0; i < nelt; ++i)
9359 rtx e = XVECEXP (sel, 0, i);
9360 int ei = INTVAL (e) & (2 * nelt - 1);
9361 which |= (ei < nelt ? 1 : 2);
9362 d.perm[i] = ei;
9365 switch (which)
9367 default:
9368 gcc_unreachable ();
9370 case 3:
9371 d.one_vector_p = false;
9372 if (!rtx_equal_p (op0, op1))
9373 break;
9375 /* The elements of PERM do not suggest that only the first operand
9376 is used, but both operands are identical. Allow easier matching
9377 of the permutation by folding the permutation into the single
9378 input vector. */
9379 /* Fall Through. */
9380 case 2:
9381 for (i = 0; i < nelt; ++i)
9382 d.perm[i] &= nelt - 1;
9383 d.op0 = op1;
9384 d.one_vector_p = true;
9385 break;
9387 case 1:
9388 d.op1 = op0;
9389 d.one_vector_p = true;
9390 break;
9393 return aarch64_expand_vec_perm_const_1 (&d);
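/* For example, a V4SI selector of { 4, 5, 6, 7 } only ever references the
   second operand, so WHICH is 2: the indices are rebased to { 0, 1, 2, 3 }
   and op1 is treated as the single input vector.  */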
9396 static bool
9397 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9398 const unsigned char *sel)
9400 struct expand_vec_perm_d d;
9401 unsigned int i, nelt, which;
9402 bool ret;
9404 d.vmode = vmode;
9405 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9406 d.testing_p = true;
9407 memcpy (d.perm, sel, nelt);
9409 /* Calculate whether all elements are in one vector. */
9410 for (i = which = 0; i < nelt; ++i)
9412 unsigned char e = d.perm[i];
9413 gcc_assert (e < 2 * nelt);
9414 which |= (e < nelt ? 1 : 2);
9417 /* If all elements are from the second vector, reindex as if from the
9418 first vector. */
9419 if (which == 2)
9420 for (i = 0; i < nelt; ++i)
9421 d.perm[i] -= nelt;
9423 /* Check whether the mask can be applied to a single vector. */
9424 d.one_vector_p = (which != 3);
9426 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9427 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9428 if (!d.one_vector_p)
9429 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9431 start_sequence ();
9432 ret = aarch64_expand_vec_perm_const_1 (&d);
9433 end_sequence ();
9435 return ret;
9438 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9439 bool
9440 aarch64_cannot_change_mode_class (enum machine_mode from,
9441 enum machine_mode to,
9442 enum reg_class rclass)
9444 /* Full-reg subregs are allowed on general regs or any class if they are
9445 the same size. */
9446 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9447 || !reg_classes_intersect_p (FP_REGS, rclass))
9448 return false;
9450 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9451 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9452 2. Scalar to Scalar for integer modes or same size float modes.
9453 3. Vector to Vector modes.
9454 4. On little-endian only, Vector-Structure to Vector modes. */
9455 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9457 if (aarch64_vector_mode_supported_p (from)
9458 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9459 return false;
9461 if (GET_MODE_NUNITS (from) == 1
9462 && GET_MODE_NUNITS (to) == 1
9463 && (GET_MODE_CLASS (from) == MODE_INT
9464 || from == to))
9465 return false;
9467 if (aarch64_vector_mode_supported_p (from)
9468 && aarch64_vector_mode_supported_p (to))
9469 return false;
9471 /* Within a vector structure straddling multiple vector registers
9472 we are in a mixed-endian representation. As such, we can't
9473 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9474 switch between vectors and vector structures cheaply. */
9475 if (!BYTES_BIG_ENDIAN)
9476 if ((aarch64_vector_mode_supported_p (from)
9477 && aarch64_vect_struct_mode_p (to))
9478 || (aarch64_vector_mode_supported_p (to)
9479 && aarch64_vect_struct_mode_p (from)))
9480 return false;
9483 return true;
9486 /* Implement MODES_TIEABLE_P. */
9488 bool
9489 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9491 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9492 return true;
9494 /* We specifically want to allow elements of "structure" modes to
9495 be tieable to the structure. This more general condition allows
9496 other rarer situations too. */
9497 if (TARGET_SIMD
9498 && aarch64_vector_mode_p (mode1)
9499 && aarch64_vector_mode_p (mode2))
9500 return true;
9502 return false;
9505 /* Return a new RTX holding the result of moving POINTER forward by
9506 AMOUNT bytes. */
9508 static rtx
9509 aarch64_move_pointer (rtx pointer, int amount)
9511 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9513 return adjust_automodify_address (pointer, GET_MODE (pointer),
9514 next, amount);
9517 /* Return a new RTX holding the result of moving POINTER forward by the
9518 size of the mode it points to. */
9520 static rtx
9521 aarch64_progress_pointer (rtx pointer)
9523 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9525 return aarch64_move_pointer (pointer, amount);
9528 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
9529 MODE bytes. */
9531 static void
9532 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9533 enum machine_mode mode)
9535 rtx reg = gen_reg_rtx (mode);
9537 /* "Cast" the pointers to the correct mode. */
9538 *src = adjust_address (*src, mode, 0);
9539 *dst = adjust_address (*dst, mode, 0);
9540 /* Emit the memcpy. */
9541 emit_move_insn (reg, *src);
9542 emit_move_insn (*dst, reg);
9543 /* Move the pointers forward. */
9544 *src = aarch64_progress_pointer (*src);
9545 *dst = aarch64_progress_pointer (*dst);
9548 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9549 we succeed, otherwise return false. */
9551 bool
9552 aarch64_expand_movmem (rtx *operands)
9554 unsigned int n;
9555 rtx dst = operands[0];
9556 rtx src = operands[1];
9557 rtx base;
9558 bool speed_p = !optimize_function_for_size_p (cfun);
9560 /* When optimizing for size, give a better estimate of the length of a
9561 memcpy call, but use the default otherwise. */
9562 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9564 /* We can't do anything smart if the amount to copy is not constant. */
9565 if (!CONST_INT_P (operands[2]))
9566 return false;
9568 n = UINTVAL (operands[2]);
9570 /* Try to keep the number of instructions low. For cases below 16 bytes we
9571 need to make at most two moves. For cases above 16 bytes it will be one
9572 move for each 16 byte chunk, then at most two additional moves. */
9573 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9574 return false;
9576 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9577 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9579 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9580 src = adjust_automodify_address (src, VOIDmode, base, 0);
9582 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
9583 1-byte chunk. */
9584 if (n < 4)
9586 if (n >= 2)
9588 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9589 n -= 2;
9592 if (n == 1)
9593 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9595 return true;
9598 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
9599 4-byte chunk, partially overlapping with the previously copied chunk. */
9600 if (n < 8)
9602 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9603 n -= 4;
9604 if (n > 0)
9606 int move = n - 4;
9608 src = aarch64_move_pointer (src, move);
9609 dst = aarch64_move_pointer (dst, move);
9610 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9612 return true;
9615 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
9616 them, then (if applicable) an 8-byte chunk. */
9617 while (n >= 8)
9619 if (n / 16)
9621 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9622 n -= 16;
9624 else
9626 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9627 n -= 8;
9631 /* Finish the final bytes of the copy. We can always do this in one
9632 instruction. We either copy the exact amount we need, or partially
9633 overlap with the previous chunk we copied and copy a wider (4- or 8-byte) chunk. */
9634 if (n == 0)
9635 return true;
9636 else if (n == 1)
9637 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9638 else if (n == 2)
9639 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9640 else if (n == 4)
9641 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9642 else
9644 if (n == 3)
9646 src = aarch64_move_pointer (src, -1);
9647 dst = aarch64_move_pointer (dst, -1);
9648 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9650 else
9652 int move = n - 8;
9654 src = aarch64_move_pointer (src, move);
9655 dst = aarch64_move_pointer (dst, move);
9656 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9660 return true;
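/* As a worked example, a constant 27-byte copy is expanded as one 16-byte
   TImode move, one 8-byte DImode move, and then (since 3 bytes remain) the
   pointers are backed up by one byte and a final overlapping 4-byte SImode
   move covers the tail: three load/store pairs in total, well under the
   instruction budget computed above.  */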
9663 #undef TARGET_ADDRESS_COST
9664 #define TARGET_ADDRESS_COST aarch64_address_cost
9666 /* This hook determines whether unnamed bitfields affect the alignment
9667 of the containing structure. The hook returns true if the structure
9668 should inherit the alignment requirements of an unnamed bitfield's
9669 type. */
9670 #undef TARGET_ALIGN_ANON_BITFIELD
9671 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9673 #undef TARGET_ASM_ALIGNED_DI_OP
9674 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9676 #undef TARGET_ASM_ALIGNED_HI_OP
9677 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9679 #undef TARGET_ASM_ALIGNED_SI_OP
9680 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9682 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9683 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9684 hook_bool_const_tree_hwi_hwi_const_tree_true
9686 #undef TARGET_ASM_FILE_START
9687 #define TARGET_ASM_FILE_START aarch64_start_file
9689 #undef TARGET_ASM_OUTPUT_MI_THUNK
9690 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9692 #undef TARGET_ASM_SELECT_RTX_SECTION
9693 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9695 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9696 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9698 #undef TARGET_BUILD_BUILTIN_VA_LIST
9699 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9701 #undef TARGET_CALLEE_COPIES
9702 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9704 #undef TARGET_CAN_ELIMINATE
9705 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9707 #undef TARGET_CANNOT_FORCE_CONST_MEM
9708 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9710 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9711 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9713 /* Only the least significant bit is used for initialization guard
9714 variables. */
9715 #undef TARGET_CXX_GUARD_MASK_BIT
9716 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9718 #undef TARGET_C_MODE_FOR_SUFFIX
9719 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9721 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9722 #undef TARGET_DEFAULT_TARGET_FLAGS
9723 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9724 #endif
9726 #undef TARGET_CLASS_MAX_NREGS
9727 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9729 #undef TARGET_BUILTIN_DECL
9730 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9732 #undef TARGET_EXPAND_BUILTIN
9733 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9735 #undef TARGET_EXPAND_BUILTIN_VA_START
9736 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9738 #undef TARGET_FOLD_BUILTIN
9739 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9741 #undef TARGET_FUNCTION_ARG
9742 #define TARGET_FUNCTION_ARG aarch64_function_arg
9744 #undef TARGET_FUNCTION_ARG_ADVANCE
9745 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9747 #undef TARGET_FUNCTION_ARG_BOUNDARY
9748 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9750 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9751 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9753 #undef TARGET_FUNCTION_VALUE
9754 #define TARGET_FUNCTION_VALUE aarch64_function_value
9756 #undef TARGET_FUNCTION_VALUE_REGNO_P
9757 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9759 #undef TARGET_FRAME_POINTER_REQUIRED
9760 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9762 #undef TARGET_GIMPLE_FOLD_BUILTIN
9763 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9765 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9766 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9768 #undef TARGET_INIT_BUILTINS
9769 #define TARGET_INIT_BUILTINS aarch64_init_builtins
9771 #undef TARGET_LEGITIMATE_ADDRESS_P
9772 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9774 #undef TARGET_LEGITIMATE_CONSTANT_P
9775 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9777 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9778 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9780 #undef TARGET_LRA_P
9781 #define TARGET_LRA_P aarch64_lra_p
9783 #undef TARGET_MANGLE_TYPE
9784 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9786 #undef TARGET_MEMORY_MOVE_COST
9787 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9789 #undef TARGET_MUST_PASS_IN_STACK
9790 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9792 /* This target hook should return true if accesses to volatile bitfields
9793 should use the narrowest mode possible. It should return false if these
9794 accesses should use the bitfield container type. */
9795 #undef TARGET_NARROW_VOLATILE_BITFIELD
9796 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9798 #undef TARGET_OPTION_OVERRIDE
9799 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9801 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9802 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9803 aarch64_override_options_after_change
9805 #undef TARGET_PASS_BY_REFERENCE
9806 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9808 #undef TARGET_PREFERRED_RELOAD_CLASS
9809 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9811 #undef TARGET_SECONDARY_RELOAD
9812 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9814 #undef TARGET_SHIFT_TRUNCATION_MASK
9815 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9817 #undef TARGET_SETUP_INCOMING_VARARGS
9818 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9820 #undef TARGET_STRUCT_VALUE_RTX
9821 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
9823 #undef TARGET_REGISTER_MOVE_COST
9824 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9826 #undef TARGET_RETURN_IN_MEMORY
9827 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9829 #undef TARGET_RETURN_IN_MSB
9830 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9832 #undef TARGET_RTX_COSTS
9833 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9835 #undef TARGET_SCHED_ISSUE_RATE
9836 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9838 #undef TARGET_TRAMPOLINE_INIT
9839 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9841 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9842 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9844 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9845 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9847 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9848 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9850 #undef TARGET_VECTORIZE_ADD_STMT_COST
9851 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
9853 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
9854 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
9855 aarch64_builtin_vectorization_cost
9857 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
9858 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
9860 #undef TARGET_VECTORIZE_BUILTINS
9861 #define TARGET_VECTORIZE_BUILTINS
9863 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
9864 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
9865 aarch64_builtin_vectorized_function
9867 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
9868 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
9869 aarch64_autovectorize_vector_sizes
9871 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
9872 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
9873 aarch64_atomic_assign_expand_fenv
9875 /* Section anchor support. */
9877 #undef TARGET_MIN_ANCHOR_OFFSET
9878 #define TARGET_MIN_ANCHOR_OFFSET -256
9880 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
9881 byte offset; we can do much more for larger data types, but have no way
9882 to determine the size of the access. We assume accesses are aligned. */
9883 #undef TARGET_MAX_ANCHOR_OFFSET
9884 #define TARGET_MAX_ANCHOR_OFFSET 4095
9886 #undef TARGET_VECTOR_ALIGNMENT
9887 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
9889 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
9890 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
9891 aarch64_simd_vector_alignment_reachable
9893 /* vec_perm support. */
9895 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
9896 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
9897 aarch64_vectorize_vec_perm_const_ok
9900 #undef TARGET_FIXED_CONDITION_CODE_REGS
9901 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
9903 #undef TARGET_FLAGS_REGNUM
9904 #define TARGET_FLAGS_REGNUM CC_REGNUM
9906 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
9907 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
9909 struct gcc_target targetm = TARGET_INITIALIZER;
9911 #include "gt-aarch64.h"