[AArch64] Add cost handling of CALLER_SAVE_REGS and POINTER_REGS
[official-gcc.git] / gcc/config/aarch64/aarch64.c
blob 6f21fd9239bbf6325729ce59bb2bf708ee8a9faa
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "tree.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
31 #include "calls.h"
32 #include "varasm.h"
33 #include "regs.h"
34 #include "df.h"
35 #include "hard-reg-set.h"
36 #include "output.h"
37 #include "expr.h"
38 #include "reload.h"
39 #include "toplev.h"
40 #include "target.h"
41 #include "target-def.h"
42 #include "targhooks.h"
43 #include "ggc.h"
44 #include "function.h"
45 #include "tm_p.h"
46 #include "recog.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "hash-table.h"
50 #include "vec.h"
51 #include "basic-block.h"
52 #include "tree-ssa-alias.h"
53 #include "internal-fn.h"
54 #include "gimple-fold.h"
55 #include "tree-eh.h"
56 #include "gimple-expr.h"
57 #include "is-a.h"
58 #include "gimple.h"
59 #include "gimplify.h"
60 #include "optabs.h"
61 #include "dwarf2.h"
62 #include "cfgloop.h"
63 #include "tree-vectorizer.h"
64 #include "config/arm/aarch-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
68 /* Defined for convenience. */
69 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
71 /* Classifies an address.
73 ADDRESS_REG_IMM
74 A simple base register plus immediate offset.
76 ADDRESS_REG_WB
77 A base register indexed by immediate offset with writeback.
79 ADDRESS_REG_REG
80 A base register indexed by (optionally scaled) register.
82 ADDRESS_REG_UXTW
83 A base register indexed by (optionally scaled) zero-extended register.
85 ADDRESS_REG_SXTW
86 A base register indexed by (optionally scaled) sign-extended register.
88 ADDRESS_LO_SUM
89 A LO_SUM rtx with a base register and "LO12" symbol relocation.
91 ADDRESS_SYMBOLIC:
92 A constant symbolic address, in pc-relative literal pool. */
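/* Illustrative assembly forms for each class (informal examples only, not
   an exhaustive list of accepted operands):
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!  or post-index  ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     add x0, x1, #:lo12:sym  /  ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   a pc-relative literal load, e.g.  ldr x0, <label>.  */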
94 enum aarch64_address_type {
95 ADDRESS_REG_IMM,
96 ADDRESS_REG_WB,
97 ADDRESS_REG_REG,
98 ADDRESS_REG_UXTW,
99 ADDRESS_REG_SXTW,
100 ADDRESS_LO_SUM,
101 ADDRESS_SYMBOLIC
104 struct aarch64_address_info {
105 enum aarch64_address_type type;
106 rtx base;
107 rtx offset;
108 int shift;
109 enum aarch64_symbol_type symbol_type;
112 struct simd_immediate_info
114 rtx value;
115 int shift;
116 int element_width;
117 bool mvn;
118 bool msl;
121 /* The current code model. */
122 enum aarch64_code_model aarch64_cmodel;
124 #ifdef HAVE_AS_TLS
125 #undef TARGET_HAVE_TLS
126 #define TARGET_HAVE_TLS 1
127 #endif
129 static bool aarch64_lra_p (void);
130 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
131 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
132 const_tree,
133 enum machine_mode *, int *,
134 bool *);
135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_override_options_after_change (void);
138 static bool aarch64_vector_mode_supported_p (enum machine_mode);
139 static unsigned bit_count (unsigned HOST_WIDE_INT);
140 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
141 const unsigned char *sel);
142 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
144 /* The processor for which instructions should be scheduled. */
145 enum aarch64_processor aarch64_tune = cortexa53;
147 /* The current tuning set. */
148 const struct tune_params *aarch64_tune_params;
150 /* Mask to specify which instructions we are allowed to generate. */
151 unsigned long aarch64_isa_flags = 0;
153 /* Mask to specify which instruction scheduling options should be used. */
154 unsigned long aarch64_tune_flags = 0;
156 /* Tuning parameters. */
158 #if HAVE_DESIGNATED_INITIALIZERS
159 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
160 #else
161 #define NAMED_PARAM(NAME, VAL) (VAL)
162 #endif
164 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
165 __extension__
166 #endif
168 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
169 __extension__
170 #endif
171 static const struct cpu_addrcost_table generic_addrcost_table =
173 #if HAVE_DESIGNATED_INITIALIZERS
174 .addr_scale_costs =
175 #endif
177 NAMED_PARAM (hi, 0),
178 NAMED_PARAM (si, 0),
179 NAMED_PARAM (di, 0),
180 NAMED_PARAM (ti, 0),
182 NAMED_PARAM (pre_modify, 0),
183 NAMED_PARAM (post_modify, 0),
184 NAMED_PARAM (register_offset, 0),
185 NAMED_PARAM (register_extend, 0),
186 NAMED_PARAM (imm_offset, 0)
189 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
190 __extension__
191 #endif
192 static const struct cpu_addrcost_table cortexa57_addrcost_table =
194 #if HAVE_DESIGNATED_INITIALIZERS
195 .addr_scale_costs =
196 #endif
198 NAMED_PARAM (hi, 1),
199 NAMED_PARAM (si, 0),
200 NAMED_PARAM (di, 0),
201 NAMED_PARAM (ti, 1),
203 NAMED_PARAM (pre_modify, 0),
204 NAMED_PARAM (post_modify, 0),
205 NAMED_PARAM (register_offset, 0),
206 NAMED_PARAM (register_extend, 0),
207 NAMED_PARAM (imm_offset, 0),
210 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
211 __extension__
212 #endif
213 static const struct cpu_regmove_cost generic_regmove_cost =
215 NAMED_PARAM (GP2GP, 1),
216 NAMED_PARAM (GP2FP, 2),
217 NAMED_PARAM (FP2GP, 2),
218 /* We currently do not provide direct support for TFmode Q->Q move.
219 Therefore we need to raise the cost above 2 in order to have
220 reload handle the situation. */
221 NAMED_PARAM (FP2FP, 4)
224 /* Generic costs for vector insn classes. */
225 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
226 __extension__
227 #endif
228 static const struct cpu_vector_cost generic_vector_cost =
230 NAMED_PARAM (scalar_stmt_cost, 1),
231 NAMED_PARAM (scalar_load_cost, 1),
232 NAMED_PARAM (scalar_store_cost, 1),
233 NAMED_PARAM (vec_stmt_cost, 1),
234 NAMED_PARAM (vec_to_scalar_cost, 1),
235 NAMED_PARAM (scalar_to_vec_cost, 1),
236 NAMED_PARAM (vec_align_load_cost, 1),
237 NAMED_PARAM (vec_unalign_load_cost, 1),
238 NAMED_PARAM (vec_unalign_store_cost, 1),
239 NAMED_PARAM (vec_store_cost, 1),
240 NAMED_PARAM (cond_taken_branch_cost, 3),
241 NAMED_PARAM (cond_not_taken_branch_cost, 1)
244 /* Costs for vector insn classes for Cortex-A57. */
245 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
246 __extension__
247 #endif
248 static const struct cpu_vector_cost cortexa57_vector_cost =
250 NAMED_PARAM (scalar_stmt_cost, 1),
251 NAMED_PARAM (scalar_load_cost, 4),
252 NAMED_PARAM (scalar_store_cost, 1),
253 NAMED_PARAM (vec_stmt_cost, 3),
254 NAMED_PARAM (vec_to_scalar_cost, 8),
255 NAMED_PARAM (scalar_to_vec_cost, 8),
256 NAMED_PARAM (vec_align_load_cost, 5),
257 NAMED_PARAM (vec_unalign_load_cost, 5),
258 NAMED_PARAM (vec_unalign_store_cost, 1),
259 NAMED_PARAM (vec_store_cost, 1),
260 NAMED_PARAM (cond_taken_branch_cost, 1),
261 NAMED_PARAM (cond_not_taken_branch_cost, 1)
264 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
265 __extension__
266 #endif
267 static const struct tune_params generic_tunings =
269 &cortexa57_extra_costs,
270 &generic_addrcost_table,
271 &generic_regmove_cost,
272 &generic_vector_cost,
273 NAMED_PARAM (memmov_cost, 4),
274 NAMED_PARAM (issue_rate, 2)
277 static const struct tune_params cortexa53_tunings =
279 &cortexa53_extra_costs,
280 &generic_addrcost_table,
281 &generic_regmove_cost,
282 &generic_vector_cost,
283 NAMED_PARAM (memmov_cost, 4),
284 NAMED_PARAM (issue_rate, 2)
287 static const struct tune_params cortexa57_tunings =
289 &cortexa57_extra_costs,
290 &cortexa57_addrcost_table,
291 &generic_regmove_cost,
292 &cortexa57_vector_cost,
293 NAMED_PARAM (memmov_cost, 4),
294 NAMED_PARAM (issue_rate, 3)
297 /* A processor implementing AArch64. */
298 struct processor
300 const char *const name;
301 enum aarch64_processor core;
302 const char *arch;
303 const unsigned long flags;
304 const struct tune_params *const tune;
307 /* Processor cores implementing AArch64. */
308 static const struct processor all_cores[] =
310 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
311 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
312 #include "aarch64-cores.def"
313 #undef AARCH64_CORE
314 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
315 {NULL, aarch64_none, NULL, 0, NULL}
318 /* Architectures implementing AArch64. */
319 static const struct processor all_architectures[] =
321 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
322 {NAME, CORE, #ARCH, FLAGS, NULL},
323 #include "aarch64-arches.def"
324 #undef AARCH64_ARCH
325 {NULL, aarch64_none, NULL, 0, NULL}
328 /* Target specification. These are populated as command-line arguments
329 are processed, or NULL if not specified. */
330 static const struct processor *selected_arch;
331 static const struct processor *selected_cpu;
332 static const struct processor *selected_tune;
334 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
336 /* An ISA extension in the co-processor and main instruction set space. */
337 struct aarch64_option_extension
339 const char *const name;
340 const unsigned long flags_on;
341 const unsigned long flags_off;
344 /* ISA extensions in AArch64. */
345 static const struct aarch64_option_extension all_extensions[] =
347 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
348 {NAME, FLAGS_ON, FLAGS_OFF},
349 #include "aarch64-option-extensions.def"
350 #undef AARCH64_OPT_EXTENSION
351 {NULL, 0, 0}
354 /* Used to track the size of an address when generating a pre/post
355 increment address. */
356 static enum machine_mode aarch64_memory_reference_mode;
358 /* Used to force GTY into this file. */
359 static GTY(()) int gty_dummy;
361 /* A table of valid AArch64 "bitmask immediate" values for
362 logical instructions. */
364 #define AARCH64_NUM_BITMASKS 5334
365 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
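/* An AArch64 "bitmask immediate" is, informally, a pattern made of a
   (possibly rotated) contiguous run of set bits replicated across the
   register in elements of 2, 4, 8, 16, 32 or 64 bits.  As illustrative
   examples, 0x00ff00ff00ff00ff and 0x0003fffc00000000 are encodable,
   while 0x1234 and 0xffffffffffffffff (all ones) are not; the table
   below holds the full set of encodable 64-bit values.  */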
367 typedef enum aarch64_cond_code
369 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
370 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
371 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
373 aarch64_cc;
375 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
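/* The inversion by XOR-ing with 1 relies on the enumeration above laying
   the codes out in complementary pairs; e.g. AARCH64_EQ (0) maps to
   AARCH64_NE (1) and AARCH64_GE (10) maps to AARCH64_LT (11).  */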
377 /* The condition codes of the processor, and the inverse function. */
378 static const char * const aarch64_condition_codes[] =
380 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
381 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
384 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
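/* Under the AArch64 DWARF numbering, x0-x30 map to 0-30, sp to 31 and
   v0-v31 to 64-95; e.g. x5 -> 5 and v3 -> 67.  */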
385 unsigned
386 aarch64_dbx_register_number (unsigned regno)
388 if (GP_REGNUM_P (regno))
389 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
390 else if (regno == SP_REGNUM)
391 return AARCH64_DWARF_SP;
392 else if (FP_REGNUM_P (regno))
393 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
395 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
396 equivalent DWARF register. */
397 return DWARF_FRAME_REGISTERS;
400 /* Return TRUE if MODE is any of the large INT modes. */
401 static bool
402 aarch64_vect_struct_mode_p (enum machine_mode mode)
404 return mode == OImode || mode == CImode || mode == XImode;
407 /* Return TRUE if MODE is any of the vector modes. */
408 static bool
409 aarch64_vector_mode_p (enum machine_mode mode)
411 return aarch64_vector_mode_supported_p (mode)
412 || aarch64_vect_struct_mode_p (mode);
415 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
416 static bool
417 aarch64_array_mode_supported_p (enum machine_mode mode,
418 unsigned HOST_WIDE_INT nelems)
420 if (TARGET_SIMD
421 && AARCH64_VALID_SIMD_QREG_MODE (mode)
422 && (nelems >= 2 && nelems <= 4))
423 return true;
425 return false;
428 /* Implement HARD_REGNO_NREGS. */
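/* For example, a 16-byte V4SImode value occupies a single FP/SIMD register
   (UNITS_PER_VREG bytes each) but two 8-byte general registers.  */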
431 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
433 switch (aarch64_regno_regclass (regno))
435 case FP_REGS:
436 case FP_LO_REGS:
437 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
438 default:
439 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
441 gcc_unreachable ();
444 /* Implement HARD_REGNO_MODE_OK. */
447 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
449 if (GET_MODE_CLASS (mode) == MODE_CC)
450 return regno == CC_REGNUM;
452 if (regno == SP_REGNUM)
453 /* The purpose of comparing with ptr_mode is to support the
454 global register variable associated with the stack pointer
455 register via the syntax of asm ("wsp") in ILP32. */
456 return mode == Pmode || mode == ptr_mode;
458 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
459 return mode == Pmode;
461 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
462 return 1;
464 if (FP_REGNUM_P (regno))
466 if (aarch64_vect_struct_mode_p (mode))
467 return
468 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
469 else
470 return 1;
473 return 0;
476 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
477 enum machine_mode
478 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
479 enum machine_mode mode)
481 /* Handle modes that fit within single registers. */
482 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
484 if (GET_MODE_SIZE (mode) >= 4)
485 return mode;
486 else
487 return SImode;
489 /* Fall back to generic for multi-reg and very large modes. */
490 else
491 return choose_hard_reg_mode (regno, nregs, false);
494 /* Return true if calls to DECL should be treated as
495 long-calls (i.e. called via a register). */
496 static bool
497 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
499 return false;
502 /* Return true if calls to symbol-ref SYM should be treated as
503 long-calls (i.e. called via a register). */
504 bool
505 aarch64_is_long_call_p (rtx sym)
507 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
510 /* Return true if the offsets to a zero/sign-extract operation
511 represent an expression that matches an extend operation. The
512 operands represent the parameters from
514 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
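/* As an informal example, for MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 the function below returns true: the low three bits of
   EXTRACT_IMM give the shift amount (2, matching the multiplier of 4) and
   the remainder (32) is the width of the underlying extend, i.e. the rtx
   describes a 32-bit value extended and then shifted left by two.  */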
515 bool
516 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
517 rtx extract_imm)
519 HOST_WIDE_INT mult_val, extract_val;
521 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
522 return false;
524 mult_val = INTVAL (mult_imm);
525 extract_val = INTVAL (extract_imm);
527 if (extract_val > 8
528 && extract_val < GET_MODE_BITSIZE (mode)
529 && exact_log2 (extract_val & ~7) > 0
530 && (extract_val & 7) <= 4
531 && mult_val == (1 << (extract_val & 7)))
532 return true;
534 return false;
537 /* Emit an insn that's a simple single-set. Both the operands must be
538 known to be valid. */
539 inline static rtx
540 emit_set_insn (rtx x, rtx y)
542 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
545 /* X and Y are two things to compare using CODE. Emit the compare insn and
546 return the rtx for the CC register in the proper mode. */
548 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
550 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
551 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
553 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
554 return cc_reg;
557 /* Build the SYMBOL_REF for __tls_get_addr. */
559 static GTY(()) rtx tls_get_addr_libfunc;
562 aarch64_tls_get_addr (void)
564 if (!tls_get_addr_libfunc)
565 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
566 return tls_get_addr_libfunc;
569 /* Return the TLS model to use for ADDR. */
571 static enum tls_model
572 tls_symbolic_operand_type (rtx addr)
574 enum tls_model tls_kind = TLS_MODEL_NONE;
575 rtx sym, addend;
577 if (GET_CODE (addr) == CONST)
579 split_const (addr, &sym, &addend);
580 if (GET_CODE (sym) == SYMBOL_REF)
581 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
583 else if (GET_CODE (addr) == SYMBOL_REF)
584 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
586 return tls_kind;
589 /* We'll allow lo_sums in our legitimate addresses
590 so that combine can take care of combining addresses where
591 necessary, but for generation purposes, we'll generate the address
592 as:
593 RTL Absolute
594 tmp = hi (symbol_ref); adrp x1, foo
595 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
598 PIC TLS
599 adrp x1, :got:foo adrp tmp, :tlsgd:foo
600 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
601 bl __tls_get_addr
604 Load TLS symbol, depending on TLS mechanism and TLS access model.
606 Global Dynamic - Traditional TLS:
607 adrp tmp, :tlsgd:imm
608 add dest, tmp, #:tlsgd_lo12:imm
609 bl __tls_get_addr
611 Global Dynamic - TLS Descriptors:
612 adrp dest, :tlsdesc:imm
613 ldr tmp, [dest, #:tlsdesc_lo12:imm]
614 add dest, dest, #:tlsdesc_lo12:imm
615 blr tmp
616 mrs tp, tpidr_el0
617 add dest, dest, tp
619 Initial Exec:
620 mrs tp, tpidr_el0
621 adrp tmp, :gottprel:imm
622 ldr dest, [tmp, #:gottprel_lo12:imm]
623 add dest, dest, tp
625 Local Exec:
626 mrs tp, tpidr_el0
627 add t0, tp, #:tprel_hi12:imm
628 add t0, #:tprel_lo12_nc:imm
631 static void
632 aarch64_load_symref_appropriately (rtx dest, rtx imm,
633 enum aarch64_symbol_type type)
635 switch (type)
637 case SYMBOL_SMALL_ABSOLUTE:
639 /* In ILP32, the mode of dest can be either SImode or DImode. */
640 rtx tmp_reg = dest;
641 enum machine_mode mode = GET_MODE (dest);
643 gcc_assert (mode == Pmode || mode == ptr_mode);
645 if (can_create_pseudo_p ())
646 tmp_reg = gen_reg_rtx (mode);
648 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
649 emit_insn (gen_add_losym (dest, tmp_reg, imm));
650 return;
653 case SYMBOL_TINY_ABSOLUTE:
654 emit_insn (gen_rtx_SET (Pmode, dest, imm));
655 return;
657 case SYMBOL_SMALL_GOT:
659 /* In ILP32, the mode of dest can be either SImode or DImode,
660 while the got entry is always of SImode size. The mode of
661 dest depends on how dest is used: if dest is assigned to a
662 pointer (e.g. stored to memory), it has SImode; it may have
663 DImode if dest is dereferenced to access memory.
664 This is why we have to handle three different ldr_got_small
665 patterns here (two patterns for ILP32). */
666 rtx tmp_reg = dest;
667 enum machine_mode mode = GET_MODE (dest);
669 if (can_create_pseudo_p ())
670 tmp_reg = gen_reg_rtx (mode);
672 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
673 if (mode == ptr_mode)
675 if (mode == DImode)
676 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
677 else
678 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
680 else
682 gcc_assert (mode == Pmode);
683 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
686 return;
689 case SYMBOL_SMALL_TLSGD:
691 rtx_insn *insns;
692 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
694 start_sequence ();
695 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
696 insns = get_insns ();
697 end_sequence ();
699 RTL_CONST_CALL_P (insns) = 1;
700 emit_libcall_block (insns, dest, result, imm);
701 return;
704 case SYMBOL_SMALL_TLSDESC:
706 enum machine_mode mode = GET_MODE (dest);
707 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
708 rtx tp;
710 gcc_assert (mode == Pmode || mode == ptr_mode);
712 /* In ILP32, the got entry is always of SImode size. Unlike
713 small GOT, the dest is fixed at reg 0. */
714 if (TARGET_ILP32)
715 emit_insn (gen_tlsdesc_small_si (imm));
716 else
717 emit_insn (gen_tlsdesc_small_di (imm));
718 tp = aarch64_load_tp (NULL);
720 if (mode != Pmode)
721 tp = gen_lowpart (mode, tp);
723 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
724 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
725 return;
728 case SYMBOL_SMALL_GOTTPREL:
730 /* In ILP32, the mode of dest can be either SImode or DImode,
731 while the got entry is always of SImode size. The mode of
732 dest depends on how dest is used: if dest is assigned to a
733 pointer (e.g. stored to memory), it has SImode; it may have
734 DImode if dest is dereferenced to access memory.
735 This is why we have to handle three different tlsie_small
736 patterns here (two patterns for ILP32). */
737 enum machine_mode mode = GET_MODE (dest);
738 rtx tmp_reg = gen_reg_rtx (mode);
739 rtx tp = aarch64_load_tp (NULL);
741 if (mode == ptr_mode)
743 if (mode == DImode)
744 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
745 else
747 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
748 tp = gen_lowpart (mode, tp);
751 else
753 gcc_assert (mode == Pmode);
754 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
757 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
758 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
759 return;
762 case SYMBOL_SMALL_TPREL:
764 rtx tp = aarch64_load_tp (NULL);
765 emit_insn (gen_tlsle_small (dest, tp, imm));
766 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
767 return;
770 case SYMBOL_TINY_GOT:
771 emit_insn (gen_ldr_got_tiny (dest, imm));
772 return;
774 default:
775 gcc_unreachable ();
779 /* Emit a move from SRC to DEST. Assume that the move expanders can
780 handle all moves if !can_create_pseudo_p (). The distinction is
781 important because, unlike emit_move_insn, the move expanders know
782 how to force Pmode objects into the constant pool even when the
783 constant pool address is not itself legitimate. */
784 static rtx
785 aarch64_emit_move (rtx dest, rtx src)
787 return (can_create_pseudo_p ()
788 ? emit_move_insn (dest, src)
789 : emit_move_insn_1 (dest, src));
792 /* Split a 128-bit move operation into two 64-bit move operations,
793 taking care to handle partial overlap of register to register
794 copies. Special cases are needed when moving between GP regs and
795 FP regs. SRC can be a register, constant or memory; DST a register
796 or memory. If either operand is memory it must not have any side
797 effects. */
798 void
799 aarch64_split_128bit_move (rtx dst, rtx src)
801 rtx dst_lo, dst_hi;
802 rtx src_lo, src_hi;
804 enum machine_mode mode = GET_MODE (dst);
806 gcc_assert (mode == TImode || mode == TFmode);
807 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
808 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
810 if (REG_P (dst) && REG_P (src))
812 int src_regno = REGNO (src);
813 int dst_regno = REGNO (dst);
815 /* Handle FP <-> GP regs. */
816 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
818 src_lo = gen_lowpart (word_mode, src);
819 src_hi = gen_highpart (word_mode, src);
821 if (mode == TImode)
823 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
824 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
826 else
828 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
829 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
831 return;
833 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
835 dst_lo = gen_lowpart (word_mode, dst);
836 dst_hi = gen_highpart (word_mode, dst);
838 if (mode == TImode)
840 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
841 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
843 else
845 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
846 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
848 return;
852 dst_lo = gen_lowpart (word_mode, dst);
853 dst_hi = gen_highpart (word_mode, dst);
854 src_lo = gen_lowpart (word_mode, src);
855 src_hi = gen_highpart_mode (word_mode, mode, src);
857 /* At most one pairing may overlap. */
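/* e.g. for a TImode copy from (x0,x1) to (x1,x2), dst_lo (x1) overlaps
   src_hi (x1), so the high word must be moved before the low word.  */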
858 if (reg_overlap_mentioned_p (dst_lo, src_hi))
860 aarch64_emit_move (dst_hi, src_hi);
861 aarch64_emit_move (dst_lo, src_lo);
863 else
865 aarch64_emit_move (dst_lo, src_lo);
866 aarch64_emit_move (dst_hi, src_hi);
870 bool
871 aarch64_split_128bit_move_p (rtx dst, rtx src)
873 return (! REG_P (src)
874 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
877 /* Split a complex SIMD combine. */
879 void
880 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
882 enum machine_mode src_mode = GET_MODE (src1);
883 enum machine_mode dst_mode = GET_MODE (dst);
885 gcc_assert (VECTOR_MODE_P (dst_mode));
887 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
889 rtx (*gen) (rtx, rtx, rtx);
891 switch (src_mode)
893 case V8QImode:
894 gen = gen_aarch64_simd_combinev8qi;
895 break;
896 case V4HImode:
897 gen = gen_aarch64_simd_combinev4hi;
898 break;
899 case V2SImode:
900 gen = gen_aarch64_simd_combinev2si;
901 break;
902 case V2SFmode:
903 gen = gen_aarch64_simd_combinev2sf;
904 break;
905 case DImode:
906 gen = gen_aarch64_simd_combinedi;
907 break;
908 case DFmode:
909 gen = gen_aarch64_simd_combinedf;
910 break;
911 default:
912 gcc_unreachable ();
915 emit_insn (gen (dst, src1, src2));
916 return;
920 /* Split a complex SIMD move. */
922 void
923 aarch64_split_simd_move (rtx dst, rtx src)
925 enum machine_mode src_mode = GET_MODE (src);
926 enum machine_mode dst_mode = GET_MODE (dst);
928 gcc_assert (VECTOR_MODE_P (dst_mode));
930 if (REG_P (dst) && REG_P (src))
932 rtx (*gen) (rtx, rtx);
934 gcc_assert (VECTOR_MODE_P (src_mode));
936 switch (src_mode)
938 case V16QImode:
939 gen = gen_aarch64_split_simd_movv16qi;
940 break;
941 case V8HImode:
942 gen = gen_aarch64_split_simd_movv8hi;
943 break;
944 case V4SImode:
945 gen = gen_aarch64_split_simd_movv4si;
946 break;
947 case V2DImode:
948 gen = gen_aarch64_split_simd_movv2di;
949 break;
950 case V4SFmode:
951 gen = gen_aarch64_split_simd_movv4sf;
952 break;
953 case V2DFmode:
954 gen = gen_aarch64_split_simd_movv2df;
955 break;
956 default:
957 gcc_unreachable ();
960 emit_insn (gen (dst, src));
961 return;
965 static rtx
966 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
968 if (can_create_pseudo_p ())
969 return force_reg (mode, value);
970 else
972 x = aarch64_emit_move (x, value);
973 return x;
978 static rtx
979 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
981 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
983 rtx high;
984 /* Load the full offset into a register. This
985 might be improvable in the future. */
986 high = GEN_INT (offset);
987 offset = 0;
988 high = aarch64_force_temporary (mode, temp, high);
989 reg = aarch64_force_temporary (mode, temp,
990 gen_rtx_PLUS (mode, high, reg));
992 return plus_constant (mode, reg, offset);
995 void
996 aarch64_expand_mov_immediate (rtx dest, rtx imm)
998 enum machine_mode mode = GET_MODE (dest);
999 unsigned HOST_WIDE_INT mask;
1000 int i;
1001 bool first;
1002 unsigned HOST_WIDE_INT val;
1003 bool subtargets;
1004 rtx subtarget;
1005 int one_match, zero_match, first_not_ffff_match;
1007 gcc_assert (mode == SImode || mode == DImode);
1009 /* Check on what type of symbol it is. */
1010 if (GET_CODE (imm) == SYMBOL_REF
1011 || GET_CODE (imm) == LABEL_REF
1012 || GET_CODE (imm) == CONST)
1014 rtx mem, base, offset;
1015 enum aarch64_symbol_type sty;
1017 /* If we have (const (plus symbol offset)), separate out the offset
1018 before we start classifying the symbol. */
1019 split_const (imm, &base, &offset);
1021 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1022 switch (sty)
1024 case SYMBOL_FORCE_TO_MEM:
1025 if (offset != const0_rtx
1026 && targetm.cannot_force_const_mem (mode, imm))
1028 gcc_assert (can_create_pseudo_p ());
1029 base = aarch64_force_temporary (mode, dest, base);
1030 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1031 aarch64_emit_move (dest, base);
1032 return;
1034 mem = force_const_mem (ptr_mode, imm);
1035 gcc_assert (mem);
1036 if (mode != ptr_mode)
1037 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1038 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1039 return;
1041 case SYMBOL_SMALL_TLSGD:
1042 case SYMBOL_SMALL_TLSDESC:
1043 case SYMBOL_SMALL_GOTTPREL:
1044 case SYMBOL_SMALL_GOT:
1045 case SYMBOL_TINY_GOT:
1046 if (offset != const0_rtx)
1048 gcc_assert(can_create_pseudo_p ());
1049 base = aarch64_force_temporary (mode, dest, base);
1050 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1051 aarch64_emit_move (dest, base);
1052 return;
1054 /* FALLTHRU */
1056 case SYMBOL_SMALL_TPREL:
1057 case SYMBOL_SMALL_ABSOLUTE:
1058 case SYMBOL_TINY_ABSOLUTE:
1059 aarch64_load_symref_appropriately (dest, imm, sty);
1060 return;
1062 default:
1063 gcc_unreachable ();
1067 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1069 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1070 return;
1073 if (!CONST_INT_P (imm))
1075 if (GET_CODE (imm) == HIGH)
1076 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1077 else
1079 rtx mem = force_const_mem (mode, imm);
1080 gcc_assert (mem);
1081 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1084 return;
1087 if (mode == SImode)
1089 /* We know we can't do this in 1 insn, and we must be able to do it
1090 in two; so don't mess around looking for sequences that don't buy
1091 us anything. */
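/* E.g. 0x12345678 is built as  mov w0, #0x5678  followed by
   movk w0, #0x1234, lsl #16.  */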
1092 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1093 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1094 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1095 return;
1098 /* Remaining cases are all for DImode. */
1100 val = INTVAL (imm);
1101 subtargets = optimize && can_create_pseudo_p ();
1103 one_match = 0;
1104 zero_match = 0;
1105 mask = 0xffff;
1106 first_not_ffff_match = -1;
1108 for (i = 0; i < 64; i += 16, mask <<= 16)
1110 if ((val & mask) == mask)
1111 one_match++;
1112 else
1114 if (first_not_ffff_match < 0)
1115 first_not_ffff_match = i;
1116 if ((val & mask) == 0)
1117 zero_match++;
1121 if (one_match == 2)
1123 /* Set one of the quarters and then insert back into result. */
1124 mask = 0xffffll << first_not_ffff_match;
1125 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1126 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1127 GEN_INT ((val >> first_not_ffff_match)
1128 & 0xffff)));
1129 return;
1132 if (zero_match == 2)
1133 goto simple_sequence;
1135 mask = 0x0ffff0000UL;
1136 for (i = 16; i < 64; i += 16, mask <<= 16)
1138 HOST_WIDE_INT comp = mask & ~(mask - 1);
1140 if (aarch64_uimm12_shift (val - (val & mask)))
1142 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1144 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1145 emit_insn (gen_adddi3 (dest, subtarget,
1146 GEN_INT (val - (val & mask))));
1147 return;
1149 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1151 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1153 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1154 GEN_INT ((val + comp) & mask)));
1155 emit_insn (gen_adddi3 (dest, subtarget,
1156 GEN_INT (val - ((val + comp) & mask))));
1157 return;
1159 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1161 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1163 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1164 GEN_INT ((val - comp) | ~mask)));
1165 emit_insn (gen_adddi3 (dest, subtarget,
1166 GEN_INT (val - ((val - comp) | ~mask))));
1167 return;
1169 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1171 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1173 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1174 GEN_INT (val | ~mask)));
1175 emit_insn (gen_adddi3 (dest, subtarget,
1176 GEN_INT (val - (val | ~mask))));
1177 return;
1181 /* See if we can do it by arithmetically combining two
1182 immediates. */
1183 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1185 int j;
1186 mask = 0xffff;
1188 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1189 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1191 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1192 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1193 GEN_INT (aarch64_bitmasks[i])));
1194 emit_insn (gen_adddi3 (dest, subtarget,
1195 GEN_INT (val - aarch64_bitmasks[i])));
1196 return;
1199 for (j = 0; j < 64; j += 16, mask <<= 16)
1201 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1203 emit_insn (gen_rtx_SET (VOIDmode, dest,
1204 GEN_INT (aarch64_bitmasks[i])));
1205 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1206 GEN_INT ((val >> j) & 0xffff)));
1207 return;
1212 /* See if we can do it by logically combining two immediates. */
1213 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1215 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1217 int j;
1219 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1220 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1222 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1223 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1224 GEN_INT (aarch64_bitmasks[i])));
1225 emit_insn (gen_iordi3 (dest, subtarget,
1226 GEN_INT (aarch64_bitmasks[j])));
1227 return;
1230 else if ((val & aarch64_bitmasks[i]) == val)
1232 int j;
1234 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1235 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1238 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1239 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1240 GEN_INT (aarch64_bitmasks[j])));
1241 emit_insn (gen_anddi3 (dest, subtarget,
1242 GEN_INT (aarch64_bitmasks[i])));
1243 return;
1248 if (one_match > zero_match)
1250 /* Set either first three quarters or all but the third. */
1251 mask = 0xffffll << (16 - first_not_ffff_match);
1252 emit_insn (gen_rtx_SET (VOIDmode, dest,
1253 GEN_INT (val | mask | 0xffffffff00000000ull)));
1255 /* Now insert other two quarters. */
1256 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1257 i < 64; i += 16, mask <<= 16)
1259 if ((val & mask) != mask)
1260 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1261 GEN_INT ((val >> i) & 0xffff)));
1263 return;
1266 simple_sequence:
1267 first = true;
1268 mask = 0xffff;
1269 for (i = 0; i < 64; i += 16, mask <<= 16)
1271 if ((val & mask) != 0)
1273 if (first)
1275 emit_insn (gen_rtx_SET (VOIDmode, dest,
1276 GEN_INT (val & mask)));
1277 first = false;
1279 else
1280 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1281 GEN_INT ((val >> i) & 0xffff)));
1286 static bool
1287 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1288 tree exp ATTRIBUTE_UNUSED)
1290 /* Currently, always true. */
1291 return true;
1294 /* Implement TARGET_PASS_BY_REFERENCE. */
1296 static bool
1297 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1298 enum machine_mode mode,
1299 const_tree type,
1300 bool named ATTRIBUTE_UNUSED)
1302 HOST_WIDE_INT size;
1303 enum machine_mode dummymode;
1304 int nregs;
1306 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1307 size = (mode == BLKmode && type)
1308 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1310 /* Aggregates are passed by reference based on their size. */
1311 if (type && AGGREGATE_TYPE_P (type))
1313 size = int_size_in_bytes (type);
1316 /* Variable sized arguments are always returned by reference. */
1317 if (size < 0)
1318 return true;
1320 /* Can this be a candidate to be passed in fp/simd register(s)? */
1321 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1322 &dummymode, &nregs,
1323 NULL))
1324 return false;
1326 /* Arguments which are variable sized or larger than 2 registers are
1327 passed by reference unless they are a homogeneous floating-point
1328 aggregate. */
1329 return size > 2 * UNITS_PER_WORD;
1332 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1333 static bool
1334 aarch64_return_in_msb (const_tree valtype)
1336 enum machine_mode dummy_mode;
1337 int dummy_int;
1339 /* Never happens in little-endian mode. */
1340 if (!BYTES_BIG_ENDIAN)
1341 return false;
1343 /* Only composite types smaller than or equal to 16 bytes can
1344 be potentially returned in registers. */
1345 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1346 || int_size_in_bytes (valtype) <= 0
1347 || int_size_in_bytes (valtype) > 16)
1348 return false;
1350 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1351 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1352 is always passed/returned in the least significant bits of fp/simd
1353 register(s). */
1354 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1355 &dummy_mode, &dummy_int, NULL))
1356 return false;
1358 return true;
1361 /* Implement TARGET_FUNCTION_VALUE.
1362 Define how to find the value returned by a function. */
1364 static rtx
1365 aarch64_function_value (const_tree type, const_tree func,
1366 bool outgoing ATTRIBUTE_UNUSED)
1368 enum machine_mode mode;
1369 int unsignedp;
1370 int count;
1371 enum machine_mode ag_mode;
1373 mode = TYPE_MODE (type);
1374 if (INTEGRAL_TYPE_P (type))
1375 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1377 if (aarch64_return_in_msb (type))
1379 HOST_WIDE_INT size = int_size_in_bytes (type);
1381 if (size % UNITS_PER_WORD != 0)
1383 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1384 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1388 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1389 &ag_mode, &count, NULL))
1391 if (!aarch64_composite_type_p (type, mode))
1393 gcc_assert (count == 1 && mode == ag_mode);
1394 return gen_rtx_REG (mode, V0_REGNUM);
1396 else
1398 int i;
1399 rtx par;
1401 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1402 for (i = 0; i < count; i++)
1404 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1405 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1406 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1407 XVECEXP (par, 0, i) = tmp;
1409 return par;
1412 else
1413 return gen_rtx_REG (mode, R0_REGNUM);
1416 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1417 Return true if REGNO is the number of a hard register in which the values
1418 of called function may come back. */
1420 static bool
1421 aarch64_function_value_regno_p (const unsigned int regno)
1423 /* Maximum of 16 bytes can be returned in the general registers. Examples
1424 of 16-byte return values are: 128-bit integers and 16-byte small
1425 structures (excluding homogeneous floating-point aggregates). */
1426 if (regno == R0_REGNUM || regno == R1_REGNUM)
1427 return true;
1429 /* Up to four fp/simd registers can return a function value, e.g. a
1430 homogeneous floating-point aggregate having four members. */
1431 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1432 return !TARGET_GENERAL_REGS_ONLY;
1434 return false;
1437 /* Implement TARGET_RETURN_IN_MEMORY.
1439 If the type T of the result of a function is such that
1440 void func (T arg)
1441 would require that arg be passed as a value in a register (or set of
1442 registers) according to the parameter passing rules, then the result
1443 is returned in the same registers as would be used for such an
1444 argument. */
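/* For example, a structure of four doubles is an HFA and is returned in
   v0-v3, a 16-byte structure of two longs comes back in x0/x1, while a
   24-byte plain aggregate is returned in memory.  */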
1446 static bool
1447 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1449 HOST_WIDE_INT size;
1450 enum machine_mode ag_mode;
1451 int count;
1453 if (!AGGREGATE_TYPE_P (type)
1454 && TREE_CODE (type) != COMPLEX_TYPE
1455 && TREE_CODE (type) != VECTOR_TYPE)
1456 /* Simple scalar types are always returned in registers. */
1457 return false;
1459 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1460 type,
1461 &ag_mode,
1462 &count,
1463 NULL))
1464 return false;
1466 /* Types larger than 2 registers are returned in memory. */
1467 size = int_size_in_bytes (type);
1468 return (size < 0 || size > 2 * UNITS_PER_WORD);
1471 static bool
1472 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1473 const_tree type, int *nregs)
1475 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1476 return aarch64_vfp_is_call_or_return_candidate (mode,
1477 type,
1478 &pcum->aapcs_vfp_rmode,
1479 nregs,
1480 NULL);
1483 /* Given MODE and TYPE of a function argument, return the alignment in
1484 bits. The idea is to suppress any stronger alignment requested by
1485 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1486 This is a helper function for local use only. */
1488 static unsigned int
1489 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1491 unsigned int alignment;
1493 if (type)
1495 if (!integer_zerop (TYPE_SIZE (type)))
1497 if (TYPE_MODE (type) == mode)
1498 alignment = TYPE_ALIGN (type);
1499 else
1500 alignment = GET_MODE_ALIGNMENT (mode);
1502 else
1503 alignment = 0;
1505 else
1506 alignment = GET_MODE_ALIGNMENT (mode);
1508 return alignment;
1511 /* Layout a function argument according to the AAPCS64 rules. The rule
1512 numbers refer to the rule numbers in the AAPCS64. */
1514 static void
1515 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1520 int ncrn, nvrn, nregs;
1521 bool allocate_ncrn, allocate_nvrn;
1522 HOST_WIDE_INT size;
1524 /* We need to do this once per argument. */
1525 if (pcum->aapcs_arg_processed)
1526 return;
1528 pcum->aapcs_arg_processed = true;
1530 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1531 size
1532 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1533 UNITS_PER_WORD);
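/* e.g. a 12-byte aggregate gives size == 16 and is therefore treated as
   occupying two registers or two stack words.  */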
1535 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1536 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1537 mode,
1538 type,
1539 &nregs);
1541 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1542 The following code thus handles passing by SIMD/FP registers first. */
1544 nvrn = pcum->aapcs_nvrn;
1546 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1547 and homogeneous short-vector aggregates (HVA). */
1548 if (allocate_nvrn)
1550 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1552 pcum->aapcs_nextnvrn = nvrn + nregs;
1553 if (!aarch64_composite_type_p (type, mode))
1555 gcc_assert (nregs == 1);
1556 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1558 else
1560 rtx par;
1561 int i;
1562 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1563 for (i = 0; i < nregs; i++)
1565 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1566 V0_REGNUM + nvrn + i);
1567 tmp = gen_rtx_EXPR_LIST
1568 (VOIDmode, tmp,
1569 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1570 XVECEXP (par, 0, i) = tmp;
1572 pcum->aapcs_reg = par;
1574 return;
1576 else
1578 /* C.3 NSRN is set to 8. */
1579 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1580 goto on_stack;
1584 ncrn = pcum->aapcs_ncrn;
1585 nregs = size / UNITS_PER_WORD;
1587 /* C6 - C9, though the sign and zero extension semantics are
1588 handled elsewhere. This is the case where the argument fits
1589 entirely in general registers. */
1590 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1592 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1594 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1596 /* C.8 if the argument has an alignment of 16 then the NGRN is
1597 rounded up to the next even number. */
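/* e.g. a 16-byte-aligned quad-word argument arriving with NGRN == 1
   is allocated to x2/x3, leaving x1 unused by this argument.  */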
1598 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1600 ++ncrn;
1601 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1603 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1604 A reg is still generated for it, but the caller should be smart
1605 enough not to use it. */
1606 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1608 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1610 else
1612 rtx par;
1613 int i;
1615 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1616 for (i = 0; i < nregs; i++)
1618 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1619 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1620 GEN_INT (i * UNITS_PER_WORD));
1621 XVECEXP (par, 0, i) = tmp;
1623 pcum->aapcs_reg = par;
1626 pcum->aapcs_nextncrn = ncrn + nregs;
1627 return;
1630 /* C.11 */
1631 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1633 /* The argument is passed on stack; record the needed number of words for
1634 this argument and align the total size if necessary. */
1635 on_stack:
1636 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1637 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1638 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1639 16 / UNITS_PER_WORD);
1640 return;
1643 /* Implement TARGET_FUNCTION_ARG. */
1645 static rtx
1646 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1647 const_tree type, bool named)
1649 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1650 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1652 if (mode == VOIDmode)
1653 return NULL_RTX;
1655 aarch64_layout_arg (pcum_v, mode, type, named);
1656 return pcum->aapcs_reg;
1659 void
1660 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1661 const_tree fntype ATTRIBUTE_UNUSED,
1662 rtx libname ATTRIBUTE_UNUSED,
1663 const_tree fndecl ATTRIBUTE_UNUSED,
1664 unsigned n_named ATTRIBUTE_UNUSED)
1666 pcum->aapcs_ncrn = 0;
1667 pcum->aapcs_nvrn = 0;
1668 pcum->aapcs_nextncrn = 0;
1669 pcum->aapcs_nextnvrn = 0;
1670 pcum->pcs_variant = ARM_PCS_AAPCS64;
1671 pcum->aapcs_reg = NULL_RTX;
1672 pcum->aapcs_arg_processed = false;
1673 pcum->aapcs_stack_words = 0;
1674 pcum->aapcs_stack_size = 0;
1676 return;
1679 static void
1680 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1681 enum machine_mode mode,
1682 const_tree type,
1683 bool named)
1685 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1686 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1688 aarch64_layout_arg (pcum_v, mode, type, named);
1689 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1690 != (pcum->aapcs_stack_words != 0));
1691 pcum->aapcs_arg_processed = false;
1692 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1693 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1694 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1695 pcum->aapcs_stack_words = 0;
1696 pcum->aapcs_reg = NULL_RTX;
1700 bool
1701 aarch64_function_arg_regno_p (unsigned regno)
1703 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1704 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1707 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1708 PARM_BOUNDARY bits of alignment, but will be given anything up
1709 to STACK_BOUNDARY bits if the type requires it. This makes sure
1710 that both before and after the layout of each argument, the Next
1711 Stacked Argument Address (NSAA) will have a minimum alignment of
1712 8 bytes. */
1714 static unsigned int
1715 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1717 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1719 if (alignment < PARM_BOUNDARY)
1720 alignment = PARM_BOUNDARY;
1721 if (alignment > STACK_BOUNDARY)
1722 alignment = STACK_BOUNDARY;
1723 return alignment;
1726 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1728 Return true if an argument passed on the stack should be padded upwards,
1729 i.e. if the least-significant byte of the stack slot has useful data.
1731 Small aggregate types are placed in the lowest memory address.
1733 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1735 bool
1736 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1738 /* On little-endian targets, the least significant byte of every stack
1739 argument is passed at the lowest byte address of the stack slot. */
1740 if (!BYTES_BIG_ENDIAN)
1741 return true;
1743 /* Otherwise, integral, floating-point and pointer types are padded downward:
1744 the least significant byte of a stack argument is passed at the highest
1745 byte address of the stack slot. */
1746 if (type
1747 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1748 || POINTER_TYPE_P (type))
1749 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1750 return false;
1752 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1753 return true;
1756 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1758 It specifies padding for the last (may also be the only)
1759 element of a block move between registers and memory. Assuming
1760 the block is in memory, padding upward means that the last
1761 element is padded after its most significant byte, while in
1762 downward padding, the last element is padded at its least
1763 significant byte side.
1765 Small aggregates and small complex types are always padded
1766 upwards.
1768 We don't need to worry about homogeneous floating-point or
1769 short-vector aggregates; their move is not affected by the
1770 padding direction determined here. Regardless of endianness,
1771 each element of such an aggregate is put in the least
1772 significant bits of a fp/simd register.
1774 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1775 register has useful data, and return the opposite if the most
1776 significant byte does. */
1778 bool
1779 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1780 bool first ATTRIBUTE_UNUSED)
1783 /* Small composite types are always padded upward. */
1784 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1786 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1787 : GET_MODE_SIZE (mode));
1788 if (size < 2 * UNITS_PER_WORD)
1789 return true;
1792 /* Otherwise, use the default padding. */
1793 return !BYTES_BIG_ENDIAN;
1796 static enum machine_mode
1797 aarch64_libgcc_cmp_return_mode (void)
1799 return SImode;
1802 static bool
1803 aarch64_frame_pointer_required (void)
1805 /* In aarch64_override_options_after_change
1806 flag_omit_leaf_frame_pointer turns off the frame pointer by
1807 default. Turn it back on now if we've not got a leaf
1808 function. */
1809 if (flag_omit_leaf_frame_pointer
1810 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1811 return true;
1813 return false;
1816 /* Mark the registers that need to be saved by the callee and calculate
1817 the size of the callee-saved registers area and frame record (both FP
1818 and LR may be omitted). */
1819 static void
1820 aarch64_layout_frame (void)
1822 HOST_WIDE_INT offset = 0;
1823 int regno;
1825 if (reload_completed && cfun->machine->frame.laid_out)
1826 return;
1828 #define SLOT_NOT_REQUIRED (-2)
1829 #define SLOT_REQUIRED (-1)
1831 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
1832 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
1834 /* First mark all the registers that really need to be saved... */
1835 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1836 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1838 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1839 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1841 /* ... that includes the eh data registers (if needed)... */
1842 if (crtl->calls_eh_return)
1843 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1844 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1845 = SLOT_REQUIRED;
1847 /* ... and any callee saved register that dataflow says is live. */
1848 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1849 if (df_regs_ever_live_p (regno)
1850 && !call_used_regs[regno])
1851 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1853 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1854 if (df_regs_ever_live_p (regno)
1855 && !call_used_regs[regno])
1856 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1858 if (frame_pointer_needed)
1860 /* FP and LR are placed in the linkage record. */
1861 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1862 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
1863 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1864 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
1865 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1866 offset += 2 * UNITS_PER_WORD;
1869 /* Now assign stack slots for them. */
1870 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1871 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1873 cfun->machine->frame.reg_offset[regno] = offset;
1874 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1875 cfun->machine->frame.wb_candidate1 = regno;
1876 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
1877 cfun->machine->frame.wb_candidate2 = regno;
1878 offset += UNITS_PER_WORD;
1881 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1882 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1884 cfun->machine->frame.reg_offset[regno] = offset;
1885 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1886 cfun->machine->frame.wb_candidate1 = regno;
1887 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
1888 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
1889 cfun->machine->frame.wb_candidate2 = regno;
1890 offset += UNITS_PER_WORD;
1893 cfun->machine->frame.padding0 =
1894 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1895 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1897 cfun->machine->frame.saved_regs_size = offset;
1899 cfun->machine->frame.hard_fp_offset
1900 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1901 + get_frame_size ()
1902 + cfun->machine->frame.saved_regs_size,
1903 STACK_BOUNDARY / BITS_PER_UNIT);
1905 cfun->machine->frame.frame_size
1906 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1907 + crtl->outgoing_args_size,
1908 STACK_BOUNDARY / BITS_PER_UNIT);
1910 cfun->machine->frame.laid_out = true;
1913 static bool
1914 aarch64_register_saved_on_entry (int regno)
1916 return cfun->machine->frame.reg_offset[regno] >= 0;
1919 static unsigned
1920 aarch64_next_callee_save (unsigned regno, unsigned limit)
1922 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1923 regno ++;
1924 return regno;
1927 static void
1928 aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
1929 HOST_WIDE_INT adjustment)
1931 rtx base_rtx = stack_pointer_rtx;
1932 rtx insn, reg, mem;
1934 reg = gen_rtx_REG (mode, regno);
1935 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
1936 plus_constant (Pmode, base_rtx, -adjustment));
1937 mem = gen_rtx_MEM (mode, mem);
1939 insn = emit_move_insn (mem, reg);
1940 RTX_FRAME_RELATED_P (insn) = 1;
1943 static rtx
1944 aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1945 HOST_WIDE_INT adjustment)
1947 switch (mode)
1949 case DImode:
1950 return gen_storewb_pairdi_di (base, base, reg, reg2,
1951 GEN_INT (-adjustment),
1952 GEN_INT (UNITS_PER_WORD - adjustment));
1953 case DFmode:
1954 return gen_storewb_pairdf_di (base, base, reg, reg2,
1955 GEN_INT (-adjustment),
1956 GEN_INT (UNITS_PER_WORD - adjustment));
1957 default:
1958 gcc_unreachable ();
1962 static void
1963 aarch64_pushwb_pair_reg (enum machine_mode mode, unsigned regno1,
1964 unsigned regno2, HOST_WIDE_INT adjustment)
1966 rtx_insn *insn;
1967 rtx reg1 = gen_rtx_REG (mode, regno1);
1968 rtx reg2 = gen_rtx_REG (mode, regno2);
1970 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
1971 reg2, adjustment));
1972 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
1973 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1974 RTX_FRAME_RELATED_P (insn) = 1;
1977 static rtx
1978 aarch64_gen_loadwb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1979 HOST_WIDE_INT adjustment)
1981 switch (mode)
1983 case DImode:
1984 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
1985 GEN_INT (UNITS_PER_WORD));
1986 case DFmode:
1987 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
1988 GEN_INT (UNITS_PER_WORD));
1989 default:
1990 gcc_unreachable ();
1994 static rtx
1995 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
1996 rtx reg2)
1998 switch (mode)
2000 case DImode:
2001 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2003 case DFmode:
2004 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2006 default:
2007 gcc_unreachable ();
2011 static rtx
2012 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2013 rtx mem2)
2015 switch (mode)
2017 case DImode:
2018 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2020 case DFmode:
2021 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2023 default:
2024 gcc_unreachable ();
2029 static void
2030 aarch64_save_callee_saves (enum machine_mode mode, HOST_WIDE_INT start_offset,
2031 unsigned start, unsigned limit, bool skip_wb)
2033 rtx_insn *insn;
2034 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2035 ? gen_frame_mem : gen_rtx_MEM);
2036 unsigned regno;
2037 unsigned regno2;
2039 for (regno = aarch64_next_callee_save (start, limit);
2040 regno <= limit;
2041 regno = aarch64_next_callee_save (regno + 1, limit))
2043 rtx reg, mem;
2044 HOST_WIDE_INT offset;
2046 if (skip_wb
2047 && (regno == cfun->machine->frame.wb_candidate1
2048 || regno == cfun->machine->frame.wb_candidate2))
2049 continue;
2051 reg = gen_rtx_REG (mode, regno);
2052 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2053 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2054 offset));
2056 regno2 = aarch64_next_callee_save (regno + 1, limit);
2058 if (regno2 <= limit
2059 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2060 == cfun->machine->frame.reg_offset[regno2]))
2063 rtx reg2 = gen_rtx_REG (mode, regno2);
2064 rtx mem2;
2066 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2067 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2068 offset));
2069 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2070 reg2));
2072 /* The first part of a frame-related parallel insn is
2073 always assumed to be relevant to the frame
2074 calculations; subsequent parts are only
2075 frame-related if explicitly marked. */
2076 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2077 regno = regno2;
2079 else
2080 insn = emit_move_insn (mem, reg);
2082 RTX_FRAME_RELATED_P (insn) = 1;
2086 static void
2087 aarch64_restore_callee_saves (enum machine_mode mode,
2088 HOST_WIDE_INT start_offset, unsigned start,
2089 unsigned limit, bool skip_wb, rtx *cfi_ops)
2091 rtx base_rtx = stack_pointer_rtx;
2092 rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2093 ? gen_frame_mem : gen_rtx_MEM);
2094 unsigned regno;
2095 unsigned regno2;
2096 HOST_WIDE_INT offset;
2098 for (regno = aarch64_next_callee_save (start, limit);
2099 regno <= limit;
2100 regno = aarch64_next_callee_save (regno + 1, limit))
2102 rtx reg, mem;
2104 if (skip_wb
2105 && (regno == cfun->machine->frame.wb_candidate1
2106 || regno == cfun->machine->frame.wb_candidate2))
2107 continue;
2109 reg = gen_rtx_REG (mode, regno);
2110 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2111 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2113 regno2 = aarch64_next_callee_save (regno + 1, limit);
2115 if (regno2 <= limit
2116 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2117 == cfun->machine->frame.reg_offset[regno2]))
2119 rtx reg2 = gen_rtx_REG (mode, regno2);
2120 rtx mem2;
2122 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2123 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2124 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2126 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2127 regno = regno2;
2129 else
2130 emit_move_insn (reg, mem);
2131 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2135 /* AArch64 stack frames generated by this compiler look like:
2137 +-------------------------------+
2139 | incoming stack arguments |
2141 +-------------------------------+
2142 | | <-- incoming stack pointer (aligned)
2143 | callee-allocated save area |
2144 | for register varargs |
2146 +-------------------------------+
2147 | local variables | <-- frame_pointer_rtx
2149 +-------------------------------+
2150 | padding0 | \
2151 +-------------------------------+ |
2152 | callee-saved registers | | frame.saved_regs_size
2153 +-------------------------------+ |
2154 | LR' | |
2155 +-------------------------------+ |
2156 | FP' | / <- hard_frame_pointer_rtx (aligned)
2157 +-------------------------------+
2158 | dynamic allocation |
2159 +-------------------------------+
2160 | padding |
2161 +-------------------------------+
2162 | outgoing stack arguments | <-- arg_pointer
2164 +-------------------------------+
2165 | | <-- stack_pointer_rtx (aligned)
2167 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2168 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2169 unchanged. */
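/* Illustrative worked example, not part of this file: for a function
   with 16 bytes of local variables, x29/x30 plus x19/x20 saved
   (32 bytes of callee saves after alignment), no varargs save area
   and 32 bytes of outgoing arguments,

     saved_regs_size = 32
     hard_fp_offset  = 16 + 32 = 48
     frame_size      = 48 + 32 = 80

   so the prologue drops SP by 80 in total, the hard frame pointer is
   established 48 bytes below the incoming SP (32 bytes above the
   final SP), and the outgoing-argument area occupies the lowest
   32 bytes of the frame.  */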
2171 /* Generate the prologue instructions for entry into a function.
2172 Establish the stack frame by decreasing the stack pointer with a
2173 properly calculated size and, if necessary, create a frame record
2174 filled with the values of LR and previous frame pointer. The
2175 current FP is also set up if it is in use. */
2177 void
2178 aarch64_expand_prologue (void)
2180 /* sub sp, sp, #<frame_size>
2181 stp {fp, lr}, [sp, #<frame_size> - 16]
2182 add fp, sp, #<frame_size> - hardfp_offset
2183 stp {cs_reg}, [fp, #-16] etc.
2185 sub sp, sp, <final_adjustment_if_any>
2187 HOST_WIDE_INT frame_size, offset;
2188 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2189 HOST_WIDE_INT hard_fp_offset;
2190 rtx_insn *insn;
2192 aarch64_layout_frame ();
2194 offset = frame_size = cfun->machine->frame.frame_size;
2195 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2196 fp_offset = frame_size - hard_fp_offset;
2198 if (flag_stack_usage_info)
2199 current_function_static_stack_size = frame_size;
2201 /* Store pairs and load pairs have a range of only -512 to 504. */
2202 if (offset >= 512)
2204 /* When the frame is large, an initial decrease of the stack pointer
2205 is done to step over the callee-allocated save area for register
2206 varargs, the local variable area and/or the callee-saved register
2207 area. This allows the pre-indexed write-back store pair
2208 instructions to be used for setting up the stack frame
2209 efficiently. */
2210 offset = hard_fp_offset;
2211 if (offset >= 512)
2212 offset = cfun->machine->frame.saved_regs_size;
2214 frame_size -= (offset + crtl->outgoing_args_size);
2215 fp_offset = 0;
2217 if (frame_size >= 0x1000000)
2219 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2220 emit_move_insn (op0, GEN_INT (-frame_size));
2221 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2223 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2224 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2225 plus_constant (Pmode, stack_pointer_rtx,
2226 -frame_size)));
2227 RTX_FRAME_RELATED_P (insn) = 1;
2229 else if (frame_size > 0)
2231 int hi_ofs = frame_size & 0xfff000;
2232 int lo_ofs = frame_size & 0x000fff;
2234 if (hi_ofs)
2236 insn = emit_insn (gen_add2_insn
2237 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2238 RTX_FRAME_RELATED_P (insn) = 1;
2240 if (lo_ofs)
2242 insn = emit_insn (gen_add2_insn
2243 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2244 RTX_FRAME_RELATED_P (insn) = 1;
2248 else
2249 frame_size = -1;
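/* Illustrative sketch, not part of this file: the hi/lo split used
   just above for the remaining initial adjustment.  AArch64 add/sub
   immediates are 12 bits wide, optionally shifted left by 12, so an
   adjustment of 0x12345 (example value) is emitted as two
   subtractions:

     long adj    = 0x12345;
     long hi_ofs = adj & 0xfff000;   // 0x12000 -> sub sp, sp, #0x12000
     long lo_ofs = adj & 0x000fff;   // 0x345   -> sub sp, sp, #0x345
*/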
2251 if (offset > 0)
2253 bool skip_wb = false;
2255 if (frame_pointer_needed)
2257 skip_wb = true;
2259 if (fp_offset)
2261 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2262 GEN_INT (-offset)));
2263 RTX_FRAME_RELATED_P (insn) = 1;
2265 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2266 R30_REGNUM, false);
2268 else
2269 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2271 /* Set up frame pointer to point to the location of the
2272 previous frame pointer on the stack. */
2273 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2274 stack_pointer_rtx,
2275 GEN_INT (fp_offset)));
2276 RTX_FRAME_RELATED_P (insn) = 1;
2277 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2279 else
2281 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2282 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2284 if (fp_offset
2285 || reg1 == FIRST_PSEUDO_REGISTER
2286 || (reg2 == FIRST_PSEUDO_REGISTER
2287 && offset >= 256))
2289 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2290 GEN_INT (-offset)));
2291 RTX_FRAME_RELATED_P (insn) = 1;
2293 else
2295 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2297 skip_wb = true;
2299 if (reg2 == FIRST_PSEUDO_REGISTER)
2300 aarch64_pushwb_single_reg (mode1, reg1, offset);
2301 else
2302 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2306 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2307 skip_wb);
2308 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2309 skip_wb);
2312 /* When offset >= 512,
2313 sub sp, sp, #<outgoing_args_size> */
2314 if (frame_size > -1)
2316 if (crtl->outgoing_args_size > 0)
2318 insn = emit_insn (gen_add2_insn
2319 (stack_pointer_rtx,
2320 GEN_INT (- crtl->outgoing_args_size)));
2321 RTX_FRAME_RELATED_P (insn) = 1;
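/* Illustrative sketch, not part of this file: for the 80-byte example
   frame described above (frame pointer needed, fp_offset = 32) and
   assuming the usual slot assignment of x29 at offset 0, x30 at 8,
   then x19/x20, the path through this function emits roughly

     sub  sp, sp, #80
     stp  x29, x30, [sp, #32]
     add  x29, sp, #32
     stp  x19, x20, [sp, #48]

   whereas a frame with no outgoing arguments (fp_offset == 0) below
   512 bytes uses the write-back form "stp x29, x30, [sp, #-N]!"
   instead of a separate SP subtraction.  */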
2326 /* Generate the epilogue instructions for returning from a function. */
2327 void
2328 aarch64_expand_epilogue (bool for_sibcall)
2330 HOST_WIDE_INT frame_size, offset;
2331 HOST_WIDE_INT fp_offset;
2332 HOST_WIDE_INT hard_fp_offset;
2333 rtx_insn *insn;
2335 aarch64_layout_frame ();
2337 offset = frame_size = cfun->machine->frame.frame_size;
2338 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2339 fp_offset = frame_size - hard_fp_offset;
2341 /* Store pairs and load pairs have a range of only -512 to 504. */
2342 if (offset >= 512)
2344 offset = hard_fp_offset;
2345 if (offset >= 512)
2346 offset = cfun->machine->frame.saved_regs_size;
2348 frame_size -= (offset + crtl->outgoing_args_size);
2349 fp_offset = 0;
2350 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2352 insn = emit_insn (gen_add2_insn
2353 (stack_pointer_rtx,
2354 GEN_INT (crtl->outgoing_args_size)));
2355 RTX_FRAME_RELATED_P (insn) = 1;
2358 else
2359 frame_size = -1;
2361 /* If there were outgoing arguments or we've done dynamic stack
2362 allocation, then restore the stack pointer from the frame
2363 pointer. This is at most one insn and more efficient than using
2364 GCC's internal mechanism. */
2365 if (frame_pointer_needed
2366 && (crtl->outgoing_args_size || cfun->calls_alloca))
2368 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2369 hard_frame_pointer_rtx,
2370 GEN_INT (0)));
2371 offset = offset - fp_offset;
2374 if (offset > 0)
2376 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2377 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2378 bool skip_wb = true;
2379 rtx cfi_ops = NULL;
2381 if (frame_pointer_needed)
2382 fp_offset = 0;
2383 else if (fp_offset
2384 || reg1 == FIRST_PSEUDO_REGISTER
2385 || (reg2 == FIRST_PSEUDO_REGISTER
2386 && offset >= 256))
2387 skip_wb = false;
2389 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2390 skip_wb, &cfi_ops);
2391 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2392 skip_wb, &cfi_ops);
2394 if (skip_wb)
2396 enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2397 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2399 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2400 if (reg2 == FIRST_PSEUDO_REGISTER)
2402 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2403 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2404 mem = gen_rtx_MEM (mode1, mem);
2405 insn = emit_move_insn (rreg1, mem);
2407 else
2409 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2411 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2412 insn = emit_insn (aarch64_gen_loadwb_pair
2413 (mode1, stack_pointer_rtx, rreg1,
2414 rreg2, offset));
2417 else
2419 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2420 GEN_INT (offset)));
2423 /* Reset the CFA to be SP + FRAME_SIZE. */
2424 rtx new_cfa = stack_pointer_rtx;
2425 if (frame_size > 0)
2426 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2427 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2428 REG_NOTES (insn) = cfi_ops;
2429 RTX_FRAME_RELATED_P (insn) = 1;
2432 if (frame_size > 0)
2434 if (frame_size >= 0x1000000)
2436 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2437 emit_move_insn (op0, GEN_INT (frame_size));
2438 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2440 else
2442 int hi_ofs = frame_size & 0xfff000;
2443 int lo_ofs = frame_size & 0x000fff;
2445 if (hi_ofs && lo_ofs)
2447 insn = emit_insn (gen_add2_insn
2448 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2449 RTX_FRAME_RELATED_P (insn) = 1;
2450 frame_size = lo_ofs;
2452 insn = emit_insn (gen_add2_insn
2453 (stack_pointer_rtx, GEN_INT (frame_size)));
2456 /* Reset the CFA to be SP + 0. */
2457 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2458 RTX_FRAME_RELATED_P (insn) = 1;
2461 /* Stack adjustment for exception handler. */
2462 if (crtl->calls_eh_return)
2464 /* We need to unwind the stack by the offset computed by
2465 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2466 to be SP; letting the CFA move during this adjustment
2467 is just as correct as retaining the CFA from the body
2468 of the function. Therefore, do nothing special. */
2469 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2472 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2473 if (!for_sibcall)
2474 emit_jump_insn (ret_rtx);
2477 /* Return the place to copy the exception unwinding return address to.
2478 This will probably be a stack slot, but could (in theory) be the
2479 return register. */
2480 rtx
2481 aarch64_final_eh_return_addr (void)
2483 HOST_WIDE_INT fp_offset;
2485 aarch64_layout_frame ();
2487 fp_offset = cfun->machine->frame.frame_size
2488 - cfun->machine->frame.hard_fp_offset;
2490 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2491 return gen_rtx_REG (DImode, LR_REGNUM);
2493 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2494 result in a store to save LR introduced by builtin_eh_return () being
2495 incorrectly deleted because the alias is not detected.
2496 So in the calculation of the address to copy the exception unwinding
2497 return address to, we distinguish two cases.
2498 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2499 we return a SP-relative location since all the addresses are SP-relative
2500 in this case. This prevents the store from being optimized away.
2501 If the fp_offset is not 0, then the addresses will be FP-relative and
2502 therefore we return a FP-relative location. */
2504 if (frame_pointer_needed)
2506 if (fp_offset)
2507 return gen_frame_mem (DImode,
2508 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2509 else
2510 return gen_frame_mem (DImode,
2511 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2514 /* If FP is not needed, we calculate the location of LR, which would be
2515 at the top of the saved registers block. */
2517 return gen_frame_mem (DImode,
2518 plus_constant (Pmode,
2519 stack_pointer_rtx,
2520 fp_offset
2521 + cfun->machine->frame.saved_regs_size
2522 - 2 * UNITS_PER_WORD));
2525 /* Possibly output code to build up a constant in a register. For
2526 the benefit of the costs infrastructure, returns the number of
2527 instructions which would be emitted. GENERATE inhibits or
2528 enables code generation. */
2530 static int
2531 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2533 int insns = 0;
2535 if (aarch64_bitmask_imm (val, DImode))
2537 if (generate)
2538 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2539 insns = 1;
2541 else
2543 int i;
2544 int ncount = 0;
2545 int zcount = 0;
2546 HOST_WIDE_INT valp = val >> 16;
2547 HOST_WIDE_INT valm;
2548 HOST_WIDE_INT tval;
2550 for (i = 16; i < 64; i += 16)
2552 valm = (valp & 0xffff);
2554 if (valm != 0)
2555 ++ zcount;
2557 if (valm != 0xffff)
2558 ++ ncount;
2560 valp >>= 16;
2563 /* zcount contains the number of additional MOVK instructions
2564 required if the constant is built up with an initial MOVZ instruction,
2565 while ncount is the number of MOVK instructions required if starting
2566 with a MOVN instruction. Choose the sequence that yields the fewer
2567 instructions, preferring MOVZ instructions when the two counts are
2568 equal. */
2569 if (ncount < zcount)
2571 if (generate)
2572 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2573 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2574 tval = 0xffff;
2575 insns++;
2577 else
2579 if (generate)
2580 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2581 GEN_INT (val & 0xffff));
2582 tval = 0;
2583 insns++;
2586 val >>= 16;
2588 for (i = 16; i < 64; i += 16)
2590 if ((val & 0xffff) != tval)
2592 if (generate)
2593 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2594 GEN_INT (i),
2595 GEN_INT (val & 0xffff)));
2596 insns++;
2598 val >>= 16;
2601 return insns;
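/* Illustrative sketch, not part of this file: the MOVZ-vs-MOVN choice
   made above, in plain C for the example value 0xffffffffffff1234.

     unsigned long long val = 0xffffffffffff1234ull;
     int zcount = 0, ncount = 0;
     for (int i = 16; i < 64; i += 16)
       {
         unsigned chunk = (val >> i) & 0xffff;
         if (chunk != 0)      zcount++;   // MOVKs needed after a MOVZ
         if (chunk != 0xffff) ncount++;   // MOVKs needed after a MOVN
       }
     // zcount == 3, ncount == 0: start with MOVN and no MOVK is
     // needed, so the constant costs a single instruction.
*/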
2604 static void
2605 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2607 HOST_WIDE_INT mdelta = delta;
2608 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2609 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2611 if (mdelta < 0)
2612 mdelta = -mdelta;
2614 if (mdelta >= 4096 * 4096)
2616 (void) aarch64_build_constant (scratchreg, delta, true);
2617 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2619 else if (mdelta > 0)
2621 if (mdelta >= 4096)
2623 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2624 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2625 if (delta < 0)
2626 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2627 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2628 else
2629 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2630 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2632 if (mdelta % 4096 != 0)
2634 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2635 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2636 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
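/* Illustrative sketch, not part of this file: for a delta of 0x12345
   (example value) with x0 as the destination and x16 as the scratch
   register, the code above emits, in effect,

     mov  x16, #0x12             // mdelta / 4096
     add  x0, x0, x16, lsl #12   // adds 0x12000
     add  x0, x0, #0x345         // mdelta % 4096

   while deltas of 4096 * 4096 or more fall back to building the full
   constant in the scratch register first.  */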
2641 /* Output code to add DELTA to the first argument, and then jump
2642 to FUNCTION. Used for C++ multiple inheritance. */
2643 static void
2644 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2645 HOST_WIDE_INT delta,
2646 HOST_WIDE_INT vcall_offset,
2647 tree function)
2649 /* The this pointer is always in x0. Note that this differs from
2650 Arm, where the this pointer may be bumped to r1 if r0 is required
2651 to return a pointer to an aggregate. On AArch64 a result value
2652 pointer will be in x8. */
2653 int this_regno = R0_REGNUM;
2654 rtx this_rtx, temp0, temp1, addr, funexp;
2655 rtx_insn *insn;
2657 reload_completed = 1;
2658 emit_note (NOTE_INSN_PROLOGUE_END);
2660 if (vcall_offset == 0)
2661 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2662 else
2664 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2666 this_rtx = gen_rtx_REG (Pmode, this_regno);
2667 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2668 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2670 addr = this_rtx;
2671 if (delta != 0)
2673 if (delta >= -256 && delta < 256)
2674 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2675 plus_constant (Pmode, this_rtx, delta));
2676 else
2677 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2680 if (Pmode == ptr_mode)
2681 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2682 else
2683 aarch64_emit_move (temp0,
2684 gen_rtx_ZERO_EXTEND (Pmode,
2685 gen_rtx_MEM (ptr_mode, addr)));
2687 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2688 addr = plus_constant (Pmode, temp0, vcall_offset);
2689 else
2691 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2692 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2695 if (Pmode == ptr_mode)
2696 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2697 else
2698 aarch64_emit_move (temp1,
2699 gen_rtx_SIGN_EXTEND (Pmode,
2700 gen_rtx_MEM (ptr_mode, addr)));
2702 emit_insn (gen_add2_insn (this_rtx, temp1));
2705 /* Generate a tail call to the target function. */
2706 if (!TREE_USED (function))
2708 assemble_external (function);
2709 TREE_USED (function) = 1;
2711 funexp = XEXP (DECL_RTL (function), 0);
2712 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2713 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2714 SIBLING_CALL_P (insn) = 1;
2716 insn = get_insns ();
2717 shorten_branches (insn);
2718 final_start_function (insn, file, 1);
2719 final (insn, file, 1);
2720 final_end_function ();
2722 /* Stop pretending to be a post-reload pass. */
2723 reload_completed = 0;
2726 static int
2727 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2729 if (GET_CODE (*x) == SYMBOL_REF)
2730 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2732 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2733 TLS offsets, not real symbol references. */
2734 if (GET_CODE (*x) == UNSPEC
2735 && XINT (*x, 1) == UNSPEC_TLS)
2736 return -1;
2738 return 0;
2741 static bool
2742 aarch64_tls_referenced_p (rtx x)
2744 if (!TARGET_HAVE_TLS)
2745 return false;
2747 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2751 static int
2752 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2754 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2755 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2757 if (*imm1 < *imm2)
2758 return -1;
2759 if (*imm1 > *imm2)
2760 return +1;
2761 return 0;
2765 static void
2766 aarch64_build_bitmask_table (void)
2768 unsigned HOST_WIDE_INT mask, imm;
2769 unsigned int log_e, e, s, r;
2770 unsigned int nimms = 0;
2772 for (log_e = 1; log_e <= 6; log_e++)
2774 e = 1 << log_e;
2775 if (e == 64)
2776 mask = ~(HOST_WIDE_INT) 0;
2777 else
2778 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2779 for (s = 1; s < e; s++)
2781 for (r = 0; r < e; r++)
2783 /* set s consecutive bits to 1 (s < 64) */
2784 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2785 /* rotate right by r */
2786 if (r != 0)
2787 imm = ((imm >> r) | (imm << (e - r))) & mask;
2788 /* replicate the constant depending on SIMD size */
2789 switch (log_e) {
2790 case 1: imm |= (imm << 2);
2791 case 2: imm |= (imm << 4);
2792 case 3: imm |= (imm << 8);
2793 case 4: imm |= (imm << 16);
2794 case 5: imm |= (imm << 32);
2795 case 6:
2796 break;
2797 default:
2798 gcc_unreachable ();
2800 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2801 aarch64_bitmasks[nimms++] = imm;
2806 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2807 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2808 aarch64_bitmasks_cmp);
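/* Illustrative sketch, not part of this file: how one entry of the
   table is formed, for element size e = 8, s = 3 set bits and
   rotation r = 1 (the real table covers e = 2 up to 64).

     unsigned long long imm = (1ull << 3) - 1;         // 0x07
     imm = ((imm >> 1) | (imm << (8 - 1))) & 0xff;     // 0x83
     for (int width = 8; width < 64; width *= 2)       // replicate
       imm |= imm << width;
     // imm == 0x8383838383838383
*/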
2812 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2813 a left shift of 0 or 12 bits. */
2814 bool
2815 aarch64_uimm12_shift (HOST_WIDE_INT val)
2817 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2818 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
2823 /* Return true if val is an immediate that can be loaded into a
2824 register by a MOVZ instruction. */
2825 static bool
2826 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2828 if (GET_MODE_SIZE (mode) > 4)
2830 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2831 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2832 return 1;
2834 else
2836 /* Ignore sign extension. */
2837 val &= (HOST_WIDE_INT) 0xffffffff;
2839 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2840 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2844 /* Return true if val is a valid bitmask immediate. */
2845 bool
2846 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2848 if (GET_MODE_SIZE (mode) < 8)
2850 /* Replicate bit pattern. */
2851 val &= (HOST_WIDE_INT) 0xffffffff;
2852 val |= val << 32;
2854 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2855 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2859 /* Return true if val is an immediate that can be loaded into a
2860 register in a single instruction. */
2861 bool
2862 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2864 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2865 return 1;
2866 return aarch64_bitmask_imm (val, mode);
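/* Illustrative examples, not part of this file, all taken as DImode
   constants: 0x0000ffff00000000 is a MOVZ immediate (16 bits at bit
   position 32), 0xffffffffffff1234 is the complement of a MOVZ
   immediate and so matches MOVN, and 0x5555555555555555 is a bitmask
   immediate; all three are single-instruction moves.  By contrast
   0x0000000012345678 matches none of the tests above and needs a
   multi-instruction sequence (e.g. MOVZ plus MOVK).  */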
2869 static bool
2870 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2872 rtx base, offset;
2874 if (GET_CODE (x) == HIGH)
2875 return true;
2877 split_const (x, &base, &offset);
2878 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2880 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2881 != SYMBOL_FORCE_TO_MEM)
2882 return true;
2883 else
2884 /* Avoid generating a 64-bit relocation in ILP32; leave it
2885 to aarch64_expand_mov_immediate to handle properly. */
2886 return mode != ptr_mode;
2889 return aarch64_tls_referenced_p (x);
2892 /* Return true if register REGNO is a valid index register.
2893 STRICT_P is true if REG_OK_STRICT is in effect. */
2895 bool
2896 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2898 if (!HARD_REGISTER_NUM_P (regno))
2900 if (!strict_p)
2901 return true;
2903 if (!reg_renumber)
2904 return false;
2906 regno = reg_renumber[regno];
2908 return GP_REGNUM_P (regno);
2911 /* Return true if register REGNO is a valid base register.
2912 STRICT_P is true if REG_OK_STRICT is in effect. */
2914 bool
2915 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2917 if (!HARD_REGISTER_NUM_P (regno))
2919 if (!strict_p)
2920 return true;
2922 if (!reg_renumber)
2923 return false;
2925 regno = reg_renumber[regno];
2928 /* The fake registers will be eliminated to either the stack or
2929 hard frame pointer, both of which are usually valid base registers.
2930 Reload deals with the cases where the eliminated form isn't valid. */
2931 return (GP_REGNUM_P (regno)
2932 || regno == SP_REGNUM
2933 || regno == FRAME_POINTER_REGNUM
2934 || regno == ARG_POINTER_REGNUM);
2937 /* Return true if X is a valid base register.
2938 STRICT_P is true if REG_OK_STRICT is in effect. */
2940 static bool
2941 aarch64_base_register_rtx_p (rtx x, bool strict_p)
2943 if (!strict_p && GET_CODE (x) == SUBREG)
2944 x = SUBREG_REG (x);
2946 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
2949 /* Return true if address offset is a valid index. If it is, fill in INFO
2950 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
2952 static bool
2953 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
2954 enum machine_mode mode, bool strict_p)
2956 enum aarch64_address_type type;
2957 rtx index;
2958 int shift;
2960 /* (reg:P) */
2961 if ((REG_P (x) || GET_CODE (x) == SUBREG)
2962 && GET_MODE (x) == Pmode)
2964 type = ADDRESS_REG_REG;
2965 index = x;
2966 shift = 0;
2968 /* (sign_extend:DI (reg:SI)) */
2969 else if ((GET_CODE (x) == SIGN_EXTEND
2970 || GET_CODE (x) == ZERO_EXTEND)
2971 && GET_MODE (x) == DImode
2972 && GET_MODE (XEXP (x, 0)) == SImode)
2974 type = (GET_CODE (x) == SIGN_EXTEND)
2975 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2976 index = XEXP (x, 0);
2977 shift = 0;
2979 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
2980 else if (GET_CODE (x) == MULT
2981 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2982 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2983 && GET_MODE (XEXP (x, 0)) == DImode
2984 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2985 && CONST_INT_P (XEXP (x, 1)))
2987 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
2988 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2989 index = XEXP (XEXP (x, 0), 0);
2990 shift = exact_log2 (INTVAL (XEXP (x, 1)));
2992 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
2993 else if (GET_CODE (x) == ASHIFT
2994 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2995 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2996 && GET_MODE (XEXP (x, 0)) == DImode
2997 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2998 && CONST_INT_P (XEXP (x, 1)))
3000 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3001 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3002 index = XEXP (XEXP (x, 0), 0);
3003 shift = INTVAL (XEXP (x, 1));
3005 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3006 else if ((GET_CODE (x) == SIGN_EXTRACT
3007 || GET_CODE (x) == ZERO_EXTRACT)
3008 && GET_MODE (x) == DImode
3009 && GET_CODE (XEXP (x, 0)) == MULT
3010 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3011 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3013 type = (GET_CODE (x) == SIGN_EXTRACT)
3014 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3015 index = XEXP (XEXP (x, 0), 0);
3016 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3017 if (INTVAL (XEXP (x, 1)) != 32 + shift
3018 || INTVAL (XEXP (x, 2)) != 0)
3019 shift = -1;
3021 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3022 (const_int 0xffffffff<<shift)) */
3023 else if (GET_CODE (x) == AND
3024 && GET_MODE (x) == DImode
3025 && GET_CODE (XEXP (x, 0)) == MULT
3026 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3027 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3028 && CONST_INT_P (XEXP (x, 1)))
3030 type = ADDRESS_REG_UXTW;
3031 index = XEXP (XEXP (x, 0), 0);
3032 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3033 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3034 shift = -1;
3036 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3037 else if ((GET_CODE (x) == SIGN_EXTRACT
3038 || GET_CODE (x) == ZERO_EXTRACT)
3039 && GET_MODE (x) == DImode
3040 && GET_CODE (XEXP (x, 0)) == ASHIFT
3041 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3042 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3044 type = (GET_CODE (x) == SIGN_EXTRACT)
3045 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3046 index = XEXP (XEXP (x, 0), 0);
3047 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3048 if (INTVAL (XEXP (x, 1)) != 32 + shift
3049 || INTVAL (XEXP (x, 2)) != 0)
3050 shift = -1;
3052 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3053 (const_int 0xffffffff<<shift)) */
3054 else if (GET_CODE (x) == AND
3055 && GET_MODE (x) == DImode
3056 && GET_CODE (XEXP (x, 0)) == ASHIFT
3057 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3058 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3059 && CONST_INT_P (XEXP (x, 1)))
3061 type = ADDRESS_REG_UXTW;
3062 index = XEXP (XEXP (x, 0), 0);
3063 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3064 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3065 shift = -1;
3067 /* (mult:P (reg:P) (const_int scale)) */
3068 else if (GET_CODE (x) == MULT
3069 && GET_MODE (x) == Pmode
3070 && GET_MODE (XEXP (x, 0)) == Pmode
3071 && CONST_INT_P (XEXP (x, 1)))
3073 type = ADDRESS_REG_REG;
3074 index = XEXP (x, 0);
3075 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3077 /* (ashift:P (reg:P) (const_int shift)) */
3078 else if (GET_CODE (x) == ASHIFT
3079 && GET_MODE (x) == Pmode
3080 && GET_MODE (XEXP (x, 0)) == Pmode
3081 && CONST_INT_P (XEXP (x, 1)))
3083 type = ADDRESS_REG_REG;
3084 index = XEXP (x, 0);
3085 shift = INTVAL (XEXP (x, 1));
3087 else
3088 return false;
3090 if (GET_CODE (index) == SUBREG)
3091 index = SUBREG_REG (index);
3093 if ((shift == 0 ||
3094 (shift > 0 && shift <= 3
3095 && (1 << shift) == GET_MODE_SIZE (mode)))
3096 && REG_P (index)
3097 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3099 info->type = type;
3100 info->offset = index;
3101 info->shift = shift;
3102 return true;
3105 return false;
3108 bool
3109 aarch64_offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3111 return (offset >= -64 * GET_MODE_SIZE (mode)
3112 && offset < 64 * GET_MODE_SIZE (mode)
3113 && offset % GET_MODE_SIZE (mode) == 0);
3116 static inline bool
3117 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3118 HOST_WIDE_INT offset)
3120 return offset >= -256 && offset < 256;
3123 static inline bool
3124 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3126 return (offset >= 0
3127 && offset < 4096 * GET_MODE_SIZE (mode)
3128 && offset % GET_MODE_SIZE (mode) == 0);
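/* Illustrative examples, not part of this file, for DImode accesses
   (GET_MODE_SIZE == 8): the 7-bit signed scaled range used by
   load/store pairs is [-512, 504] in steps of 8, the 9-bit signed
   unscaled range is [-256, 255], and the 12-bit unsigned scaled
   range is [0, 32760] in steps of 8.  An offset of 264 therefore
   fits the 7-bit and 12-bit scaled forms but not the unscaled one,
   while -264 fits only the 7-bit scaled (pair) form.  */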
3131 /* Return true if X is a valid address for machine mode MODE. If it is,
3132 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3133 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3135 static bool
3136 aarch64_classify_address (struct aarch64_address_info *info,
3137 rtx x, enum machine_mode mode,
3138 RTX_CODE outer_code, bool strict_p)
3140 enum rtx_code code = GET_CODE (x);
3141 rtx op0, op1;
3142 bool allow_reg_index_p =
3143 outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3144 || aarch64_vector_mode_supported_p (mode));
3145 /* Don't support anything other than POST_INC or REG addressing for
3146 AdvSIMD. */
3147 if (aarch64_vect_struct_mode_p (mode)
3148 && (code != POST_INC && code != REG))
3149 return false;
3151 switch (code)
3153 case REG:
3154 case SUBREG:
3155 info->type = ADDRESS_REG_IMM;
3156 info->base = x;
3157 info->offset = const0_rtx;
3158 return aarch64_base_register_rtx_p (x, strict_p);
3160 case PLUS:
3161 op0 = XEXP (x, 0);
3162 op1 = XEXP (x, 1);
3164 if (! strict_p
3165 && REG_P (op0)
3166 && (op0 == virtual_stack_vars_rtx
3167 || op0 == frame_pointer_rtx
3168 || op0 == arg_pointer_rtx)
3169 && CONST_INT_P (op1))
3171 info->type = ADDRESS_REG_IMM;
3172 info->base = op0;
3173 info->offset = op1;
3175 return true;
3178 if (GET_MODE_SIZE (mode) != 0
3179 && CONST_INT_P (op1)
3180 && aarch64_base_register_rtx_p (op0, strict_p))
3182 HOST_WIDE_INT offset = INTVAL (op1);
3184 info->type = ADDRESS_REG_IMM;
3185 info->base = op0;
3186 info->offset = op1;
3188 /* TImode and TFmode values are allowed in both pairs of X
3189 registers and individual Q registers. The available
3190 address modes are:
3191 X,X: 7-bit signed scaled offset
3192 Q: 9-bit signed offset
3193 We conservatively require an offset representable in either mode.
3195 if (mode == TImode || mode == TFmode)
3196 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3197 && offset_9bit_signed_unscaled_p (mode, offset));
3199 if (outer_code == PARALLEL)
3200 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3201 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3202 else
3203 return (offset_9bit_signed_unscaled_p (mode, offset)
3204 || offset_12bit_unsigned_scaled_p (mode, offset));
3207 if (allow_reg_index_p)
3209 /* Look for base + (scaled/extended) index register. */
3210 if (aarch64_base_register_rtx_p (op0, strict_p)
3211 && aarch64_classify_index (info, op1, mode, strict_p))
3213 info->base = op0;
3214 return true;
3216 if (aarch64_base_register_rtx_p (op1, strict_p)
3217 && aarch64_classify_index (info, op0, mode, strict_p))
3219 info->base = op1;
3220 return true;
3224 return false;
3226 case POST_INC:
3227 case POST_DEC:
3228 case PRE_INC:
3229 case PRE_DEC:
3230 info->type = ADDRESS_REG_WB;
3231 info->base = XEXP (x, 0);
3232 info->offset = NULL_RTX;
3233 return aarch64_base_register_rtx_p (info->base, strict_p);
3235 case POST_MODIFY:
3236 case PRE_MODIFY:
3237 info->type = ADDRESS_REG_WB;
3238 info->base = XEXP (x, 0);
3239 if (GET_CODE (XEXP (x, 1)) == PLUS
3240 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3241 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3242 && aarch64_base_register_rtx_p (info->base, strict_p))
3244 HOST_WIDE_INT offset;
3245 info->offset = XEXP (XEXP (x, 1), 1);
3246 offset = INTVAL (info->offset);
3248 /* TImode and TFmode values are allowed in both pairs of X
3249 registers and individual Q registers. The available
3250 address modes are:
3251 X,X: 7-bit signed scaled offset
3252 Q: 9-bit signed offset
3253 We conservatively require an offset representable in either mode.
3255 if (mode == TImode || mode == TFmode)
3256 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3257 && offset_9bit_signed_unscaled_p (mode, offset));
3259 if (outer_code == PARALLEL)
3260 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3261 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3262 else
3263 return offset_9bit_signed_unscaled_p (mode, offset);
3265 return false;
3267 case CONST:
3268 case SYMBOL_REF:
3269 case LABEL_REF:
3270 /* load literal: pc-relative constant pool entry. Only supported
3271 for SI mode or larger. */
3272 info->type = ADDRESS_SYMBOLIC;
3273 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3275 rtx sym, addend;
3277 split_const (x, &sym, &addend);
3278 return (GET_CODE (sym) == LABEL_REF
3279 || (GET_CODE (sym) == SYMBOL_REF
3280 && CONSTANT_POOL_ADDRESS_P (sym)));
3282 return false;
3284 case LO_SUM:
3285 info->type = ADDRESS_LO_SUM;
3286 info->base = XEXP (x, 0);
3287 info->offset = XEXP (x, 1);
3288 if (allow_reg_index_p
3289 && aarch64_base_register_rtx_p (info->base, strict_p))
3291 rtx sym, offs;
3292 split_const (info->offset, &sym, &offs);
3293 if (GET_CODE (sym) == SYMBOL_REF
3294 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3295 == SYMBOL_SMALL_ABSOLUTE))
3297 /* The symbol and offset must be aligned to the access size. */
3298 unsigned int align;
3299 unsigned int ref_size;
3301 if (CONSTANT_POOL_ADDRESS_P (sym))
3302 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3303 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3305 tree exp = SYMBOL_REF_DECL (sym);
3306 align = TYPE_ALIGN (TREE_TYPE (exp));
3307 align = CONSTANT_ALIGNMENT (exp, align);
3309 else if (SYMBOL_REF_DECL (sym))
3310 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3311 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3312 && SYMBOL_REF_BLOCK (sym) != NULL)
3313 align = SYMBOL_REF_BLOCK (sym)->alignment;
3314 else
3315 align = BITS_PER_UNIT;
3317 ref_size = GET_MODE_SIZE (mode);
3318 if (ref_size == 0)
3319 ref_size = GET_MODE_SIZE (DImode);
3321 return ((INTVAL (offs) & (ref_size - 1)) == 0
3322 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3325 return false;
3327 default:
3328 return false;
3332 bool
3333 aarch64_symbolic_address_p (rtx x)
3335 rtx offset;
3337 split_const (x, &x, &offset);
3338 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3341 /* Classify the base of symbolic expression X, given that X appears in
3342 context CONTEXT. */
3344 enum aarch64_symbol_type
3345 aarch64_classify_symbolic_expression (rtx x,
3346 enum aarch64_symbol_context context)
3348 rtx offset;
3350 split_const (x, &x, &offset);
3351 return aarch64_classify_symbol (x, context);
3355 /* Return TRUE if X is a legitimate address for accessing memory in
3356 mode MODE. */
3357 static bool
3358 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3360 struct aarch64_address_info addr;
3362 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3365 /* Return TRUE if X is a legitimate address for accessing memory in
3366 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3367 pair operation. */
3368 bool
3369 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3370 RTX_CODE outer_code, bool strict_p)
3372 struct aarch64_address_info addr;
3374 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3377 /* Return TRUE if rtx X is immediate constant 0.0 */
3378 bool
3379 aarch64_float_const_zero_rtx_p (rtx x)
3381 REAL_VALUE_TYPE r;
3383 if (GET_MODE (x) == VOIDmode)
3384 return false;
3386 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3387 if (REAL_VALUE_MINUS_ZERO (r))
3388 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3389 return REAL_VALUES_EQUAL (r, dconst0);
3392 /* Return the fixed registers used for condition codes. */
3394 static bool
3395 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3397 *p1 = CC_REGNUM;
3398 *p2 = INVALID_REGNUM;
3399 return true;
3402 /* Emit call insn with PAT and do aarch64-specific handling. */
3404 void
3405 aarch64_emit_call_insn (rtx pat)
3407 rtx insn = emit_call_insn (pat);
3409 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3410 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3411 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3414 enum machine_mode
3415 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3417 /* All floating point compares return CCFP if it is an equality
3418 comparison, and CCFPE otherwise. */
3419 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3421 switch (code)
3423 case EQ:
3424 case NE:
3425 case UNORDERED:
3426 case ORDERED:
3427 case UNLT:
3428 case UNLE:
3429 case UNGT:
3430 case UNGE:
3431 case UNEQ:
3432 case LTGT:
3433 return CCFPmode;
3435 case LT:
3436 case LE:
3437 case GT:
3438 case GE:
3439 return CCFPEmode;
3441 default:
3442 gcc_unreachable ();
3446 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3447 && y == const0_rtx
3448 && (code == EQ || code == NE || code == LT || code == GE)
3449 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3450 || GET_CODE (x) == NEG))
3451 return CC_NZmode;
3453 /* A compare with a shifted operand. Because of canonicalization,
3454 the comparison will have to be swapped when we emit the assembly
3455 code. */
3456 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3457 && (REG_P (y) || GET_CODE (y) == SUBREG)
3458 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3459 || GET_CODE (x) == LSHIFTRT
3460 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3461 return CC_SWPmode;
3463 /* Similarly for a negated operand, but we can only do this for
3464 equalities. */
3465 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3466 && (REG_P (y) || GET_CODE (y) == SUBREG)
3467 && (code == EQ || code == NE)
3468 && GET_CODE (x) == NEG)
3469 return CC_Zmode;
3471 /* A compare of a mode narrower than SI mode against zero can be done
3472 by extending the value in the comparison. */
3473 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3474 && y == const0_rtx)
3475 /* Only use sign-extension if we really need it. */
3476 return ((code == GT || code == GE || code == LE || code == LT)
3477 ? CC_SESWPmode : CC_ZESWPmode);
3479 /* For everything else, return CCmode. */
3480 return CCmode;
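/* Illustrative examples, not part of this file: an ordered
   floating-point LT comparison selects CCFPEmode while EQ on the
   same operands selects CCFPmode; comparing (plus:DI x y) against
   zero for EQ/NE/LT/GE selects CC_NZmode, so the N and Z flags of a
   flag-setting ADDS can be reused; and a comparison whose first
   operand is a shift, with a register as the second operand, selects
   CC_SWPmode because the operands must be swapped when the assembly
   is emitted.  */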
3483 int
3484 aarch64_get_condition_code (rtx x)
3486 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3487 enum rtx_code comp_code = GET_CODE (x);
3489 if (GET_MODE_CLASS (mode) != MODE_CC)
3490 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3492 switch (mode)
3494 case CCFPmode:
3495 case CCFPEmode:
3496 switch (comp_code)
3498 case GE: return AARCH64_GE;
3499 case GT: return AARCH64_GT;
3500 case LE: return AARCH64_LS;
3501 case LT: return AARCH64_MI;
3502 case NE: return AARCH64_NE;
3503 case EQ: return AARCH64_EQ;
3504 case ORDERED: return AARCH64_VC;
3505 case UNORDERED: return AARCH64_VS;
3506 case UNLT: return AARCH64_LT;
3507 case UNLE: return AARCH64_LE;
3508 case UNGT: return AARCH64_HI;
3509 case UNGE: return AARCH64_PL;
3510 default: return -1;
3512 break;
3514 case CCmode:
3515 switch (comp_code)
3517 case NE: return AARCH64_NE;
3518 case EQ: return AARCH64_EQ;
3519 case GE: return AARCH64_GE;
3520 case GT: return AARCH64_GT;
3521 case LE: return AARCH64_LE;
3522 case LT: return AARCH64_LT;
3523 case GEU: return AARCH64_CS;
3524 case GTU: return AARCH64_HI;
3525 case LEU: return AARCH64_LS;
3526 case LTU: return AARCH64_CC;
3527 default: return -1;
3529 break;
3531 case CC_SWPmode:
3532 case CC_ZESWPmode:
3533 case CC_SESWPmode:
3534 switch (comp_code)
3536 case NE: return AARCH64_NE;
3537 case EQ: return AARCH64_EQ;
3538 case GE: return AARCH64_LE;
3539 case GT: return AARCH64_LT;
3540 case LE: return AARCH64_GE;
3541 case LT: return AARCH64_GT;
3542 case GEU: return AARCH64_LS;
3543 case GTU: return AARCH64_CC;
3544 case LEU: return AARCH64_CS;
3545 case LTU: return AARCH64_HI;
3546 default: return -1;
3548 break;
3550 case CC_NZmode:
3551 switch (comp_code)
3553 case NE: return AARCH64_NE;
3554 case EQ: return AARCH64_EQ;
3555 case GE: return AARCH64_PL;
3556 case LT: return AARCH64_MI;
3557 default: return -1;
3559 break;
3561 case CC_Zmode:
3562 switch (comp_code)
3564 case NE: return AARCH64_NE;
3565 case EQ: return AARCH64_EQ;
3566 default: return -1;
3568 break;
3570 default:
3571 return -1;
3572 break;
3576 bool
3577 aarch64_const_vec_all_same_in_range_p (rtx x,
3578 HOST_WIDE_INT minval,
3579 HOST_WIDE_INT maxval)
3581 HOST_WIDE_INT firstval;
3582 int count, i;
3584 if (GET_CODE (x) != CONST_VECTOR
3585 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3586 return false;
3588 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3589 if (firstval < minval || firstval > maxval)
3590 return false;
3592 count = CONST_VECTOR_NUNITS (x);
3593 for (i = 1; i < count; i++)
3594 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3595 return false;
3597 return true;
3600 bool
3601 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3603 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3606 static unsigned
3607 bit_count (unsigned HOST_WIDE_INT value)
3609 unsigned count = 0;
3611 while (value)
3613 count++;
3614 value &= value - 1;
3617 return count;
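/* Illustrative trace, not part of this file: bit_count relies on
   value &= value - 1 clearing the lowest set bit on each iteration.
   For value = 0xb (binary 1011):

     1011 -> 1010 -> 1000 -> 0000

   three iterations, so the result is 3.  */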
3620 void
3621 aarch64_print_operand (FILE *f, rtx x, char code)
3623 switch (code)
3625 /* An integer or symbol address without a preceding # sign. */
3626 case 'c':
3627 switch (GET_CODE (x))
3629 case CONST_INT:
3630 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3631 break;
3633 case SYMBOL_REF:
3634 output_addr_const (f, x);
3635 break;
3637 case CONST:
3638 if (GET_CODE (XEXP (x, 0)) == PLUS
3639 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3641 output_addr_const (f, x);
3642 break;
3644 /* Fall through. */
3646 default:
3647 output_operand_lossage ("Unsupported operand for code '%c'", code);
3649 break;
3651 case 'e':
3652 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3654 int n;
3656 if (!CONST_INT_P (x)
3657 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3659 output_operand_lossage ("invalid operand for '%%%c'", code);
3660 return;
3663 switch (n)
3665 case 3:
3666 fputc ('b', f);
3667 break;
3668 case 4:
3669 fputc ('h', f);
3670 break;
3671 case 5:
3672 fputc ('w', f);
3673 break;
3674 default:
3675 output_operand_lossage ("invalid operand for '%%%c'", code);
3676 return;
3679 break;
3681 case 'p':
3683 int n;
3685 /* Print N such that 2^N == X. */
3686 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
3688 output_operand_lossage ("invalid operand for '%%%c'", code);
3689 return;
3692 asm_fprintf (f, "%d", n);
3694 break;
3696 case 'P':
3697 /* Print the number of non-zero bits in X (a const_int). */
3698 if (!CONST_INT_P (x))
3700 output_operand_lossage ("invalid operand for '%%%c'", code);
3701 return;
3704 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3705 break;
3707 case 'H':
3708 /* Print the higher numbered register of a pair (TImode) of regs. */
3709 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
3711 output_operand_lossage ("invalid operand for '%%%c'", code);
3712 return;
3715 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3716 break;
3718 case 'm':
3720 int cond_code;
3721 /* Print a condition (eq, ne, etc). */
3723 /* CONST_TRUE_RTX means always -- that's the default. */
3724 if (x == const_true_rtx)
3725 return;
3727 if (!COMPARISON_P (x))
3729 output_operand_lossage ("invalid operand for '%%%c'", code);
3730 return;
3733 cond_code = aarch64_get_condition_code (x);
3734 gcc_assert (cond_code >= 0);
3735 fputs (aarch64_condition_codes[cond_code], f);
3737 break;
3739 case 'M':
3741 int cond_code;
3742 /* Print the inverse of a condition (eq <-> ne, etc). */
3744 /* CONST_TRUE_RTX means never -- that's the default. */
3745 if (x == const_true_rtx)
3747 fputs ("nv", f);
3748 return;
3751 if (!COMPARISON_P (x))
3753 output_operand_lossage ("invalid operand for '%%%c'", code);
3754 return;
3756 cond_code = aarch64_get_condition_code (x);
3757 gcc_assert (cond_code >= 0);
3758 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3759 (cond_code)], f);
3761 break;
3763 case 'b':
3764 case 'h':
3765 case 's':
3766 case 'd':
3767 case 'q':
3768 /* Print a scalar FP/SIMD register name. */
3769 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3771 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3772 return;
3774 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3775 break;
3777 case 'S':
3778 case 'T':
3779 case 'U':
3780 case 'V':
3781 /* Print the first FP/SIMD register name in a list. */
3782 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3784 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3785 return;
3787 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3788 break;
3790 case 'X':
3791 /* Print bottom 16 bits of integer constant in hex. */
3792 if (!CONST_INT_P (x))
3794 output_operand_lossage ("invalid operand for '%%%c'", code);
3795 return;
3797 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3798 break;
3800 case 'w':
3801 case 'x':
3802 /* Print a general register name or the zero register (32-bit or
3803 64-bit). */
3804 if (x == const0_rtx
3805 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3807 asm_fprintf (f, "%czr", code);
3808 break;
3811 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3813 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3814 break;
3817 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3819 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3820 break;
3823 /* Fall through */
3825 case 0:
3826 /* Print a normal operand; if it's a general register, then we
3827 assume DImode. */
3828 if (x == NULL)
3830 output_operand_lossage ("missing operand");
3831 return;
3834 switch (GET_CODE (x))
3836 case REG:
3837 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3838 break;
3840 case MEM:
3841 aarch64_memory_reference_mode = GET_MODE (x);
3842 output_address (XEXP (x, 0));
3843 break;
3845 case LABEL_REF:
3846 case SYMBOL_REF:
3847 output_addr_const (asm_out_file, x);
3848 break;
3850 case CONST_INT:
3851 asm_fprintf (f, "%wd", INTVAL (x));
3852 break;
3854 case CONST_VECTOR:
3855 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3857 gcc_assert (
3858 aarch64_const_vec_all_same_in_range_p (x,
3859 HOST_WIDE_INT_MIN,
3860 HOST_WIDE_INT_MAX));
3861 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3863 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3865 fputc ('0', f);
3867 else
3868 gcc_unreachable ();
3869 break;
3871 case CONST_DOUBLE:
3872 /* CONST_DOUBLE can represent a double-width integer.
3873 In this case, the mode of x is VOIDmode. */
3874 if (GET_MODE (x) == VOIDmode)
3875 ; /* Do Nothing. */
3876 else if (aarch64_float_const_zero_rtx_p (x))
3878 fputc ('0', f);
3879 break;
3881 else if (aarch64_float_const_representable_p (x))
3883 #define buf_size 20
3884 char float_buf[buf_size] = {'\0'};
3885 REAL_VALUE_TYPE r;
3886 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3887 real_to_decimal_for_mode (float_buf, &r,
3888 buf_size, buf_size,
3889 1, GET_MODE (x));
3890 asm_fprintf (asm_out_file, "%s", float_buf);
3891 break;
3892 #undef buf_size
3894 output_operand_lossage ("invalid constant");
3895 return;
3896 default:
3897 output_operand_lossage ("invalid operand");
3898 return;
3900 break;
3902 case 'A':
3903 if (GET_CODE (x) == HIGH)
3904 x = XEXP (x, 0);
3906 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3908 case SYMBOL_SMALL_GOT:
3909 asm_fprintf (asm_out_file, ":got:");
3910 break;
3912 case SYMBOL_SMALL_TLSGD:
3913 asm_fprintf (asm_out_file, ":tlsgd:");
3914 break;
3916 case SYMBOL_SMALL_TLSDESC:
3917 asm_fprintf (asm_out_file, ":tlsdesc:");
3918 break;
3920 case SYMBOL_SMALL_GOTTPREL:
3921 asm_fprintf (asm_out_file, ":gottprel:");
3922 break;
3924 case SYMBOL_SMALL_TPREL:
3925 asm_fprintf (asm_out_file, ":tprel:");
3926 break;
3928 case SYMBOL_TINY_GOT:
3929 gcc_unreachable ();
3930 break;
3932 default:
3933 break;
3935 output_addr_const (asm_out_file, x);
3936 break;
3938 case 'L':
3939 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3941 case SYMBOL_SMALL_GOT:
3942 asm_fprintf (asm_out_file, ":lo12:");
3943 break;
3945 case SYMBOL_SMALL_TLSGD:
3946 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3947 break;
3949 case SYMBOL_SMALL_TLSDESC:
3950 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3951 break;
3953 case SYMBOL_SMALL_GOTTPREL:
3954 asm_fprintf (asm_out_file, ":gottprel_lo12:");
3955 break;
3957 case SYMBOL_SMALL_TPREL:
3958 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3959 break;
3961 case SYMBOL_TINY_GOT:
3962 asm_fprintf (asm_out_file, ":got:");
3963 break;
3965 default:
3966 break;
3968 output_addr_const (asm_out_file, x);
3969 break;
3971 case 'G':
3973 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3975 case SYMBOL_SMALL_TPREL:
3976 asm_fprintf (asm_out_file, ":tprel_hi12:");
3977 break;
3978 default:
3979 break;
3981 output_addr_const (asm_out_file, x);
3982 break;
3984 default:
3985 output_operand_lossage ("invalid operand prefix '%%%c'", code);
3986 return;
3990 void
3991 aarch64_print_operand_address (FILE *f, rtx x)
3993 struct aarch64_address_info addr;
3995 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
3996 MEM, true))
3997 switch (addr.type)
3999 case ADDRESS_REG_IMM:
4000 if (addr.offset == const0_rtx)
4001 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4002 else
4003 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4004 INTVAL (addr.offset));
4005 return;
4007 case ADDRESS_REG_REG:
4008 if (addr.shift == 0)
4009 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4010 reg_names [REGNO (addr.offset)]);
4011 else
4012 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4013 reg_names [REGNO (addr.offset)], addr.shift);
4014 return;
4016 case ADDRESS_REG_UXTW:
4017 if (addr.shift == 0)
4018 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4019 REGNO (addr.offset) - R0_REGNUM);
4020 else
4021 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4022 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4023 return;
4025 case ADDRESS_REG_SXTW:
4026 if (addr.shift == 0)
4027 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4028 REGNO (addr.offset) - R0_REGNUM);
4029 else
4030 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4031 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4032 return;
4034 case ADDRESS_REG_WB:
4035 switch (GET_CODE (x))
4037 case PRE_INC:
4038 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4039 GET_MODE_SIZE (aarch64_memory_reference_mode));
4040 return;
4041 case POST_INC:
4042 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4043 GET_MODE_SIZE (aarch64_memory_reference_mode));
4044 return;
4045 case PRE_DEC:
4046 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4047 GET_MODE_SIZE (aarch64_memory_reference_mode));
4048 return;
4049 case POST_DEC:
4050 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4051 GET_MODE_SIZE (aarch64_memory_reference_mode));
4052 return;
4053 case PRE_MODIFY:
4054 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4055 INTVAL (addr.offset));
4056 return;
4057 case POST_MODIFY:
4058 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4059 INTVAL (addr.offset));
4060 return;
4061 default:
4062 break;
4064 break;
4066 case ADDRESS_LO_SUM:
4067 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4068 output_addr_const (f, addr.offset);
4069 asm_fprintf (f, "]");
4070 return;
4072 case ADDRESS_SYMBOLIC:
4073 break;
4076 output_addr_const (f, x);
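/* Illustrative examples, not part of this file, for a DImode access
   with example registers x1/x2: a base plus immediate address prints
   as "[x1, 16]", a scaled index such as base plus (mult x2 8) prints
   as "[x1, x2, lsl 3]", and a post-increment of the base prints as
   "[x1], 8" (the increment being the access size).  */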
4079 bool
4080 aarch64_label_mentioned_p (rtx x)
4082 const char *fmt;
4083 int i;
4085 if (GET_CODE (x) == LABEL_REF)
4086 return true;
4088 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4089 referencing instruction, but they are constant offsets, not
4090 symbols. */
4091 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4092 return false;
4094 fmt = GET_RTX_FORMAT (GET_CODE (x));
4095 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4097 if (fmt[i] == 'E')
4099 int j;
4101 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4102 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4103 return 1;
4105 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4106 return 1;
4109 return 0;
4112 /* Implement REGNO_REG_CLASS. */
4114 enum reg_class
4115 aarch64_regno_regclass (unsigned regno)
4117 if (GP_REGNUM_P (regno))
4118 return GENERAL_REGS;
4120 if (regno == SP_REGNUM)
4121 return STACK_REG;
4123 if (regno == FRAME_POINTER_REGNUM
4124 || regno == ARG_POINTER_REGNUM)
4125 return POINTER_REGS;
4127 if (FP_REGNUM_P (regno))
4128 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4130 return NO_REGS;
4133 /* Try a machine-dependent way of reloading an illegitimate address
4134 operand. If we find one, push the reload and return the new rtx. */
4136 rtx
4137 aarch64_legitimize_reload_address (rtx *x_p,
4138 enum machine_mode mode,
4139 int opnum, int type,
4140 int ind_levels ATTRIBUTE_UNUSED)
4142 rtx x = *x_p;
4144 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4145 if (aarch64_vect_struct_mode_p (mode)
4146 && GET_CODE (x) == PLUS
4147 && REG_P (XEXP (x, 0))
4148 && CONST_INT_P (XEXP (x, 1)))
4150 rtx orig_rtx = x;
4151 x = copy_rtx (x);
4152 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4153 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4154 opnum, (enum reload_type) type);
4155 return x;
4158 /* We must recognize output that we have already generated ourselves. */
4159 if (GET_CODE (x) == PLUS
4160 && GET_CODE (XEXP (x, 0)) == PLUS
4161 && REG_P (XEXP (XEXP (x, 0), 0))
4162 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4163 && CONST_INT_P (XEXP (x, 1)))
4165 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4166 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4167 opnum, (enum reload_type) type);
4168 return x;
4171 /* We wish to handle large displacements off a base register by splitting
4172 the addend across an add and the mem insn. This can cut the number of
4173 extra insns needed from 3 to 1. It is only useful for load/store of a
4174 single register with 12 bit offset field. */
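/* For example, with SImode and an address of the form
   (plus (reg X) (const_int 0x12340)), the constant is split as
   high = 0x12000 (a valid ADD immediate) and low = 0x340 (which fits
   the scaled 12-bit offset field), giving
   (plus (plus (reg X) 0x12000) 0x340): one ADD plus the original
   load/store rather than a full constant build.  */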
4175 if (GET_CODE (x) == PLUS
4176 && REG_P (XEXP (x, 0))
4177 && CONST_INT_P (XEXP (x, 1))
4178 && HARD_REGISTER_P (XEXP (x, 0))
4179 && mode != TImode
4180 && mode != TFmode
4181 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4183 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4184 HOST_WIDE_INT low = val & 0xfff;
4185 HOST_WIDE_INT high = val - low;
4186 HOST_WIDE_INT offs;
4187 rtx cst;
4188 enum machine_mode xmode = GET_MODE (x);
4190 /* In ILP32, xmode can be either DImode or SImode. */
4191 gcc_assert (xmode == DImode || xmode == SImode);
4193 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4194 BLKmode alignment. */
4195 if (GET_MODE_SIZE (mode) == 0)
4196 return NULL_RTX;
4198 offs = low % GET_MODE_SIZE (mode);
4200 /* Align misaligned offset by adjusting high part to compensate. */
4201 if (offs != 0)
4203 if (aarch64_uimm12_shift (high + offs))
4205 /* Align down. */
4206 low = low - offs;
4207 high = high + offs;
4209 else
4211 /* Align up. */
4212 offs = GET_MODE_SIZE (mode) - offs;
4213 low = low + offs;
4214 high = high + (low & 0x1000) - offs;
4215 low &= 0xfff;
4219 /* Check for overflow. */
4220 if (high + low != val)
4221 return NULL_RTX;
4223 cst = GEN_INT (high);
4224 if (!aarch64_uimm12_shift (high))
4225 cst = force_const_mem (xmode, cst);
4227 /* Reload high part into base reg, leaving the low part
4228 in the mem instruction.
4229 Note that replacing this gen_rtx_PLUS with plus_constant is
4230 wrong in this case because we rely on the
4231 (plus (plus reg c1) c2) structure being preserved so that
4232 XEXP (*p, 0) in push_reload below uses the correct term. */
4233 x = gen_rtx_PLUS (xmode,
4234 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4235 GEN_INT (low));
4237 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4238 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4239 opnum, (enum reload_type) type);
4240 return x;
4243 return NULL_RTX;
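/* Implement TARGET_SECONDARY_RELOAD.  Q-register to Q-register moves of
   128-bit values need a general-register scratch when SIMD is disabled,
   and 16-byte memory accesses or constants may be steered towards a
   more suitable register class.  */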
4247 static reg_class_t
4248 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4249 reg_class_t rclass,
4250 enum machine_mode mode,
4251 secondary_reload_info *sri)
4253 /* Without the TARGET_SIMD instructions we cannot move a Q register
4254 to a Q register directly. We need a scratch. */
4255 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4256 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4257 && reg_class_subset_p (rclass, FP_REGS))
4259 if (mode == TFmode)
4260 sri->icode = CODE_FOR_aarch64_reload_movtf;
4261 else if (mode == TImode)
4262 sri->icode = CODE_FOR_aarch64_reload_movti;
4263 return NO_REGS;
4266 /* A TFmode or TImode memory access should be handled via an FP_REG
4267 because AArch64 has richer addressing modes for LDR/STR instructions
4268 than for LDP/STP instructions. */
4269 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4270 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4271 return FP_REGS;
4273 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4274 return GENERAL_REGS;
4276 return NO_REGS;
4279 static bool
4280 aarch64_can_eliminate (const int from, const int to)
4282 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4283 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4285 if (frame_pointer_needed)
4287 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4288 return true;
4289 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4290 return false;
4291 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4292 && !cfun->calls_alloca)
4293 return true;
4294 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4295 return true;
4297 return false;
4300 return true;
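/* Implement INITIAL_ELIMINATION_OFFSET.  The offsets below are taken
   from the frame layout computed by aarch64_layout_frame ().  */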
4303 HOST_WIDE_INT
4304 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4306 aarch64_layout_frame ();
4308 if (to == HARD_FRAME_POINTER_REGNUM)
4310 if (from == ARG_POINTER_REGNUM)
4311 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4313 if (from == FRAME_POINTER_REGNUM)
4314 return (cfun->machine->frame.hard_fp_offset
4315 - cfun->machine->frame.saved_varargs_size);
4318 if (to == STACK_POINTER_REGNUM)
4320 if (from == FRAME_POINTER_REGNUM)
4321 return (cfun->machine->frame.frame_size
4322 - cfun->machine->frame.saved_varargs_size);
4325 return cfun->machine->frame.frame_size;
4328 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4329 previous frame. */
4331 rtx
4332 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4334 if (count != 0)
4335 return const0_rtx;
4336 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4340 static void
4341 aarch64_asm_trampoline_template (FILE *f)
4343 if (TARGET_ILP32)
4345 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4346 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4348 else
4350 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4351 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4353 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4354 assemble_aligned_integer (4, const0_rtx);
4355 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4356 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
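/* The trampoline emitted above consists of 16 bytes of code followed by
   two pointer-sized data slots.  aarch64_trampoline_init copies the code
   and then stores the target function address in the first slot and the
   static chain value in the second.  */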
4359 static void
4360 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4362 rtx fnaddr, mem, a_tramp;
4363 const int tramp_code_sz = 16;
4365 /* We don't need to copy the trailing D-words; we fill those in below. */
4366 emit_block_move (m_tramp, assemble_trampoline_template (),
4367 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4368 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4369 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4370 if (GET_MODE (fnaddr) != ptr_mode)
4371 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4372 emit_move_insn (mem, fnaddr);
4374 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4375 emit_move_insn (mem, chain_value);
4377 /* XXX We should really define a "clear_cache" pattern and use
4378 gen_clear_cache(). */
4379 a_tramp = XEXP (m_tramp, 0);
4380 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4381 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4382 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4383 ptr_mode);
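/* Illustrative examples for aarch64_class_max_nregs below: with SIMD
   enabled, a 128-bit vector mode such as V4SImode needs a single
   FP/SIMD register ((16 + 15) / 16 == 1), while a non-vector 16-byte
   mode such as TImode needs two 64-bit registers ((16 + 7) / 8 == 2).  */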
4386 static unsigned char
4387 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4389 switch (regclass)
4391 case CALLER_SAVE_REGS:
4392 case POINTER_REGS:
4393 case GENERAL_REGS:
4394 case ALL_REGS:
4395 case FP_REGS:
4396 case FP_LO_REGS:
4397 return
4398 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4399 (GET_MODE_SIZE (mode) + 7) / 8;
4400 case STACK_REG:
4401 return 1;
4403 case NO_REGS:
4404 return 0;
4406 default:
4407 break;
4409 gcc_unreachable ();
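/* Implement TARGET_PREFERRED_RELOAD_CLASS.  POINTER_REGS is narrowed to
   GENERAL_REGS, reloads into STACK_REG are only allowed from
   pointer-class registers, and integer immediates that MOVI cannot load
   as well as SP-plus-constant expressions are kept out of the FP
   register classes.  */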
4412 static reg_class_t
4413 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4415 if (regclass == POINTER_REGS)
4416 return GENERAL_REGS;
4418 if (regclass == STACK_REG)
4420 if (REG_P(x)
4421 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4422 return regclass;
4424 return NO_REGS;
4427 /* If it's an integer immediate that MOVI can't handle, then
4428 FP_REGS is not an option, so we return NO_REGS instead. */
4429 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4430 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4431 return NO_REGS;
4433 /* Register elimination can result in a request for
4434 SP+constant->FP_REGS. We cannot support such operations, which
4435 use SP as source and an FP_REG as destination, so reject them
4436 right now. */
4437 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4439 rtx lhs = XEXP (x, 0);
4441 /* Look through a possible SUBREG introduced by ILP32. */
4442 if (GET_CODE (lhs) == SUBREG)
4443 lhs = SUBREG_REG (lhs);
4445 gcc_assert (REG_P (lhs));
4446 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4447 POINTER_REGS));
4448 return NO_REGS;
4451 return regclass;
4454 void
4455 aarch64_asm_output_labelref (FILE* f, const char *name)
4457 asm_fprintf (f, "%U%s", name);
4460 static void
4461 aarch64_elf_asm_constructor (rtx symbol, int priority)
4463 if (priority == DEFAULT_INIT_PRIORITY)
4464 default_ctor_section_asm_out_constructor (symbol, priority);
4465 else
4467 section *s;
4468 char buf[18];
4469 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4470 s = get_section (buf, SECTION_WRITE, NULL);
4471 switch_to_section (s);
4472 assemble_align (POINTER_SIZE);
4473 assemble_aligned_integer (POINTER_BYTES, symbol);
4477 static void
4478 aarch64_elf_asm_destructor (rtx symbol, int priority)
4480 if (priority == DEFAULT_INIT_PRIORITY)
4481 default_dtor_section_asm_out_destructor (symbol, priority);
4482 else
4484 section *s;
4485 char buf[18];
4486 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4487 s = get_section (buf, SECTION_WRITE, NULL);
4488 switch_to_section (s);
4489 assemble_align (POINTER_SIZE);
4490 assemble_aligned_integer (POINTER_BYTES, symbol);
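/* Output the assembly for a casesi dispatch sequence: load the table
   entry (scaled by the table element size), form the target address
   relative to the table label with ADR and ADD, then branch with BR.
   For a byte-sized table this emits, roughly (register numbers here are
   only illustrative):
       ldrb    w3, [x0, w1, uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3  */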
4494 const char*
4495 aarch64_output_casesi (rtx *operands)
4497 char buf[100];
4498 char label[100];
4499 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
4500 int index;
4501 static const char *const patterns[4][2] =
4504 "ldrb\t%w3, [%0,%w1,uxtw]",
4505 "add\t%3, %4, %w3, sxtb #2"
4508 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4509 "add\t%3, %4, %w3, sxth #2"
4512 "ldr\t%w3, [%0,%w1,uxtw #2]",
4513 "add\t%3, %4, %w3, sxtw #2"
4515 /* We assume that DImode is only generated when not optimizing and
4516 that we don't really need 64-bit address offsets. That would
4517 imply an object file with 8GB of code in a single function! */
4519 "ldr\t%w3, [%0,%w1,uxtw #2]",
4520 "add\t%3, %4, %w3, sxtw #2"
4524 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4526 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4528 gcc_assert (index >= 0 && index <= 3);
4530 /* Need to implement table size reduction by changing the code below. */
4531 output_asm_insn (patterns[index][0], operands);
4532 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4533 snprintf (buf, sizeof (buf),
4534 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4535 output_asm_insn (buf, operands);
4536 output_asm_insn (patterns[index][1], operands);
4537 output_asm_insn ("br\t%3", operands);
4538 assemble_label (asm_out_file, label);
4539 return "";
4543 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4544 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4545 operator. */
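/* For example, a mask of 0xff with shift 0 describes a UXTB and yields 8,
   as does 0x1fe (0xff << 1) with shift 1; anything that is not an
   8-, 16- or 32-bit mask shifted left by 0..3 yields 0.  */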
4547 int
4548 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4550 if (shift >= 0 && shift <= 3)
4552 int size;
4553 for (size = 8; size <= 32; size *= 2)
4555 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4556 if (mask == bits << shift)
4557 return size;
4560 return 0;
4563 static bool
4564 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4565 const_rtx x ATTRIBUTE_UNUSED)
4567 /* We can't use blocks for constants when we're using a per-function
4568 constant pool. */
4569 return false;
4572 static section *
4573 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4574 rtx x ATTRIBUTE_UNUSED,
4575 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4577 /* Force all constant pool entries into the current function section. */
4578 return function_section (current_function_decl);
4582 /* Costs. */
4584 /* Helper function for rtx cost calculation. Strip a shift expression
4585 from X. Returns the inner operand if successful, or the original
4586 expression on failure. */
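/* For example, both (ashift (reg X) (const_int 3)) and
   (mult (reg X) (const_int 8)) strip to (reg X).  */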
4587 static rtx
4588 aarch64_strip_shift (rtx x)
4590 rtx op = x;
4592 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4593 we can convert both to ROR during final output. */
4594 if ((GET_CODE (op) == ASHIFT
4595 || GET_CODE (op) == ASHIFTRT
4596 || GET_CODE (op) == LSHIFTRT
4597 || GET_CODE (op) == ROTATERT
4598 || GET_CODE (op) == ROTATE)
4599 && CONST_INT_P (XEXP (op, 1)))
4600 return XEXP (op, 0);
4602 if (GET_CODE (op) == MULT
4603 && CONST_INT_P (XEXP (op, 1))
4604 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4605 return XEXP (op, 0);
4607 return x;
4610 /* Helper function for rtx cost calculation. Strip an extend
4611 expression from X. Returns the inner operand if successful, or the
4612 original expression on failure. We deal with a number of possible
4613 canonicalization variations here. */
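/* For example, (zero_extend:DI (reg:SI X)) strips to (reg:SI X), as does
   the extended-register form with a small left shift such as
   (ashift (zero_extend:DI (reg:SI X)) (const_int 2)).  */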
4614 static rtx
4615 aarch64_strip_extend (rtx x)
4617 rtx op = x;
4619 /* Zero and sign extraction of a widened value. */
4620 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4621 && XEXP (op, 2) == const0_rtx
4622 && GET_CODE (XEXP (op, 0)) == MULT
4623 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4624 XEXP (op, 1)))
4625 return XEXP (XEXP (op, 0), 0);
4627 /* It can also be represented (for zero-extend) as an AND with an
4628 immediate. */
4629 if (GET_CODE (op) == AND
4630 && GET_CODE (XEXP (op, 0)) == MULT
4631 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4632 && CONST_INT_P (XEXP (op, 1))
4633 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4634 INTVAL (XEXP (op, 1))) != 0)
4635 return XEXP (XEXP (op, 0), 0);
4637 /* Now handle extended register, as this may also have an optional
4638 left shift by 1..4. */
4639 if (GET_CODE (op) == ASHIFT
4640 && CONST_INT_P (XEXP (op, 1))
4641 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4642 op = XEXP (op, 0);
4644 if (GET_CODE (op) == ZERO_EXTEND
4645 || GET_CODE (op) == SIGN_EXTEND)
4646 op = XEXP (op, 0);
4648 if (op != x)
4649 return op;
4651 return x;
4654 /* Helper function for rtx cost calculation. Calculate the cost of
4655 a MULT, which may be part of a multiply-accumulate rtx. Return
4656 the calculated cost of the expression, recursing manually into
4657 operands where needed. */
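/* For example, (mult (reg X) (const_int 4)) in an integer mode is costed
   as a shift by 2 (LSL), or as the shifted-register form of an ADD/SUB
   when it appears directly under a PLUS or MINUS.  */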
4659 static int
4660 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4662 rtx op0, op1;
4663 const struct cpu_cost_table *extra_cost
4664 = aarch64_tune_params->insn_extra_cost;
4665 int cost = 0;
4666 bool maybe_fma = (outer == PLUS || outer == MINUS);
4667 enum machine_mode mode = GET_MODE (x);
4669 gcc_checking_assert (code == MULT);
4671 op0 = XEXP (x, 0);
4672 op1 = XEXP (x, 1);
4674 if (VECTOR_MODE_P (mode))
4675 mode = GET_MODE_INNER (mode);
4677 /* Integer multiply/fma. */
4678 if (GET_MODE_CLASS (mode) == MODE_INT)
4680 /* The multiply will be canonicalized as a shift, cost it as such. */
4681 if (CONST_INT_P (op1)
4682 && exact_log2 (INTVAL (op1)) > 0)
4684 if (speed)
4686 if (maybe_fma)
4687 /* ADD (shifted register). */
4688 cost += extra_cost->alu.arith_shift;
4689 else
4690 /* LSL (immediate). */
4691 cost += extra_cost->alu.shift;
4694 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4696 return cost;
4699 /* Integer multiplies or FMAs have zero/sign extending variants. */
4700 if ((GET_CODE (op0) == ZERO_EXTEND
4701 && GET_CODE (op1) == ZERO_EXTEND)
4702 || (GET_CODE (op0) == SIGN_EXTEND
4703 && GET_CODE (op1) == SIGN_EXTEND))
4705 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4706 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4708 if (speed)
4710 if (maybe_fma)
4711 /* MADD/SMADDL/UMADDL. */
4712 cost += extra_cost->mult[0].extend_add;
4713 else
4714 /* MUL/SMULL/UMULL. */
4715 cost += extra_cost->mult[0].extend;
4718 return cost;
4721 /* This is either an integer multiply or an FMA. In both cases
4722 we want to recurse and cost the operands. */
4723 cost += rtx_cost (op0, MULT, 0, speed)
4724 + rtx_cost (op1, MULT, 1, speed);
4726 if (speed)
4728 if (maybe_fma)
4729 /* MADD. */
4730 cost += extra_cost->mult[mode == DImode].add;
4731 else
4732 /* MUL. */
4733 cost += extra_cost->mult[mode == DImode].simple;
4736 return cost;
4738 else
4740 if (speed)
4742 /* Floating-point FMA/FMUL can also support negations of the
4743 operands. */
4744 if (GET_CODE (op0) == NEG)
4745 op0 = XEXP (op0, 0);
4746 if (GET_CODE (op1) == NEG)
4747 op1 = XEXP (op1, 0);
4749 if (maybe_fma)
4750 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4751 cost += extra_cost->fp[mode == DFmode].fma;
4752 else
4753 /* FMUL/FNMUL. */
4754 cost += extra_cost->fp[mode == DFmode].mult;
4757 cost += rtx_cost (op0, MULT, 0, speed)
4758 + rtx_cost (op1, MULT, 1, speed);
4759 return cost;
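/* Implement TARGET_ADDRESS_COST.  The cost is assembled from the
   tuning-specific address cost table: a base component chosen by the
   addressing form (immediate offset, pre/post modify, register offset
   or extended register offset), plus a scaling component when the index
   register is shifted.  */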
4763 static int
4764 aarch64_address_cost (rtx x,
4765 enum machine_mode mode,
4766 addr_space_t as ATTRIBUTE_UNUSED,
4767 bool speed)
4769 enum rtx_code c = GET_CODE (x);
4770 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4771 struct aarch64_address_info info;
4772 int cost = 0;
4773 info.shift = 0;
4775 if (!aarch64_classify_address (&info, x, mode, c, false))
4777 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4779 /* This is a CONST or SYMBOL ref which will be split
4780 in a different way depending on the code model in use.
4781 Cost it through the generic infrastructure. */
4782 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4783 /* Divide through by the cost of one instruction to
4784 bring it to the same units as the address costs. */
4785 cost_symbol_ref /= COSTS_N_INSNS (1);
4786 /* The cost is then the cost of preparing the address,
4787 followed by an immediate (possibly 0) offset. */
4788 return cost_symbol_ref + addr_cost->imm_offset;
4790 else
4792 /* This is most likely a jump table from a case
4793 statement. */
4794 return addr_cost->register_offset;
4798 switch (info.type)
4800 case ADDRESS_LO_SUM:
4801 case ADDRESS_SYMBOLIC:
4802 case ADDRESS_REG_IMM:
4803 cost += addr_cost->imm_offset;
4804 break;
4806 case ADDRESS_REG_WB:
4807 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4808 cost += addr_cost->pre_modify;
4809 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4810 cost += addr_cost->post_modify;
4811 else
4812 gcc_unreachable ();
4814 break;
4816 case ADDRESS_REG_REG:
4817 cost += addr_cost->register_offset;
4818 break;
4820 case ADDRESS_REG_UXTW:
4821 case ADDRESS_REG_SXTW:
4822 cost += addr_cost->register_extend;
4823 break;
4825 default:
4826 gcc_unreachable ();
4830 if (info.shift > 0)
4832 /* For the sake of calculating the cost of the shifted register
4833 component, we can treat same sized modes in the same way. */
4834 switch (GET_MODE_BITSIZE (mode))
4836 case 16:
4837 cost += addr_cost->addr_scale_costs.hi;
4838 break;
4840 case 32:
4841 cost += addr_cost->addr_scale_costs.si;
4842 break;
4844 case 64:
4845 cost += addr_cost->addr_scale_costs.di;
4846 break;
4848 /* We can't tell, or this is a 128-bit vector. */
4849 default:
4850 cost += addr_cost->addr_scale_costs.ti;
4851 break;
4855 return cost;
4858 /* Return true if the RTX X in mode MODE is a zero or sign extract
4859 usable in an ADD or SUB (extended register) instruction. */
4860 static bool
4861 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4863 /* Catch add with a sign extract.
4864 This is add_<optab><mode>_multp2. */
4865 if (GET_CODE (x) == SIGN_EXTRACT
4866 || GET_CODE (x) == ZERO_EXTRACT)
4868 rtx op0 = XEXP (x, 0);
4869 rtx op1 = XEXP (x, 1);
4870 rtx op2 = XEXP (x, 2);
4872 if (GET_CODE (op0) == MULT
4873 && CONST_INT_P (op1)
4874 && op2 == const0_rtx
4875 && CONST_INT_P (XEXP (op0, 1))
4876 && aarch64_is_extend_from_extract (mode,
4877 XEXP (op0, 1),
4878 op1))
4880 return true;
4884 return false;
4887 static bool
4888 aarch64_frint_unspec_p (unsigned int u)
4890 switch (u)
4892 case UNSPEC_FRINTZ:
4893 case UNSPEC_FRINTP:
4894 case UNSPEC_FRINTM:
4895 case UNSPEC_FRINTA:
4896 case UNSPEC_FRINTN:
4897 case UNSPEC_FRINTX:
4898 case UNSPEC_FRINTI:
4899 return true;
4901 default:
4902 return false;
4906 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4907 storing it in *COST. Result is true if the total cost of the operation
4908 has now been calculated. */
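/* For example, (if_then_else (ne (reg) (const_int 0)) (label_ref) (pc))
   is costed as a CBZ/CBNZ-style branch, while a MODE_CC comparison
   selecting between two register values is costed as a CSEL.  */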
4909 static bool
4910 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4912 rtx inner;
4913 rtx comparator;
4914 enum rtx_code cmpcode;
4916 if (COMPARISON_P (op0))
4918 inner = XEXP (op0, 0);
4919 comparator = XEXP (op0, 1);
4920 cmpcode = GET_CODE (op0);
4922 else
4924 inner = op0;
4925 comparator = const0_rtx;
4926 cmpcode = NE;
4929 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
4931 /* Conditional branch. */
4932 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4933 return true;
4934 else
4936 if (cmpcode == NE || cmpcode == EQ)
4938 if (comparator == const0_rtx)
4940 /* TBZ/TBNZ/CBZ/CBNZ. */
4941 if (GET_CODE (inner) == ZERO_EXTRACT)
4942 /* TBZ/TBNZ. */
4943 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
4944 0, speed);
4945 else
4946 /* CBZ/CBNZ. */
4947 *cost += rtx_cost (inner, cmpcode, 0, speed);
4949 return true;
4952 else if (cmpcode == LT || cmpcode == GE)
4954 /* TBZ/TBNZ. */
4955 if (comparator == const0_rtx)
4956 return true;
4960 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4962 /* It's a conditional operation based on the status flags,
4963 so it must be some flavor of CSEL. */
4965 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
4966 if (GET_CODE (op1) == NEG
4967 || GET_CODE (op1) == NOT
4968 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
4969 op1 = XEXP (op1, 0);
4971 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
4972 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
4973 return true;
4976 /* We don't know what this is, cost all operands. */
4977 return false;
4980 /* Calculate the cost of calculating X, storing it in *COST. Result
4981 is true if the total cost of the operation has now been calculated. */
4982 static bool
4983 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
4984 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
4986 rtx op0, op1, op2;
4987 const struct cpu_cost_table *extra_cost
4988 = aarch64_tune_params->insn_extra_cost;
4989 enum machine_mode mode = GET_MODE (x);
4991 /* By default, assume that everything has equivalent cost to the
4992 cheapest instruction. Any additional costs are applied as a delta
4993 above this default. */
4994 *cost = COSTS_N_INSNS (1);
4996 /* TODO: The cost infrastructure currently does not handle
4997 vector operations. Assume that all vector operations
4998 are equally expensive. */
4999 if (VECTOR_MODE_P (mode))
5001 if (speed)
5002 *cost += extra_cost->vect.alu;
5003 return true;
5006 switch (code)
5008 case SET:
5009 /* The cost depends entirely on the operands to SET. */
5010 *cost = 0;
5011 op0 = SET_DEST (x);
5012 op1 = SET_SRC (x);
5014 switch (GET_CODE (op0))
5016 case MEM:
5017 if (speed)
5019 rtx address = XEXP (op0, 0);
5020 if (GET_MODE_CLASS (mode) == MODE_INT)
5021 *cost += extra_cost->ldst.store;
5022 else if (mode == SFmode)
5023 *cost += extra_cost->ldst.storef;
5024 else if (mode == DFmode)
5025 *cost += extra_cost->ldst.stored;
5027 *cost +=
5028 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5029 0, speed));
5032 *cost += rtx_cost (op1, SET, 1, speed);
5033 return true;
5035 case SUBREG:
5036 if (! REG_P (SUBREG_REG (op0)))
5037 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5039 /* Fall through. */
5040 case REG:
5041 /* const0_rtx is in general free, but we will use an
5042 instruction to set a register to 0. */
5043 if (REG_P (op1) || op1 == const0_rtx)
5045 /* The cost is 1 per register copied. */
5046 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5047 / UNITS_PER_WORD;
5048 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5050 else
5051 /* Cost is just the cost of the RHS of the set. */
5052 *cost += rtx_cost (op1, SET, 1, speed);
5053 return true;
5055 case ZERO_EXTRACT:
5056 case SIGN_EXTRACT:
5057 /* Bit-field insertion. Strip any redundant widening of
5058 the RHS to meet the width of the target. */
5059 if (GET_CODE (op1) == SUBREG)
5060 op1 = SUBREG_REG (op1);
5061 if ((GET_CODE (op1) == ZERO_EXTEND
5062 || GET_CODE (op1) == SIGN_EXTEND)
5063 && CONST_INT_P (XEXP (op0, 1))
5064 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5065 >= INTVAL (XEXP (op0, 1))))
5066 op1 = XEXP (op1, 0);
5068 if (CONST_INT_P (op1))
5070 /* MOV immediate is assumed to always be cheap. */
5071 *cost = COSTS_N_INSNS (1);
5073 else
5075 /* BFM. */
5076 if (speed)
5077 *cost += extra_cost->alu.bfi;
5078 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5081 return true;
5083 default:
5084 /* We can't make sense of this, assume default cost. */
5085 *cost = COSTS_N_INSNS (1);
5086 return false;
5088 return false;
5090 case CONST_INT:
5091 /* If an instruction can incorporate a constant within the
5092 instruction, the instruction's expression avoids calling
5093 rtx_cost() on the constant. If rtx_cost() is called on a
5094 constant, then it is usually because the constant must be
5095 moved into a register by one or more instructions.
5097 The exception is constant 0, which can be expressed
5098 as XZR/WZR and is therefore free, unless we have
5099 (set (reg) (const0_rtx)), in which case we must cost
5100 the move. However, we can catch that when we cost the SET, so
5101 we don't need to consider it here. */
5102 if (x == const0_rtx)
5103 *cost = 0;
5104 else
5106 /* To a first approximation, the cost of building any other
5107 constant is proportional to the number of instructions
5108 required to build it. This is true whether we
5109 are compiling for SPEED or otherwise. */
5110 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5111 INTVAL (x),
5112 false));
5114 return true;
5116 case CONST_DOUBLE:
5117 if (speed)
5119 /* mov[df,sf]_aarch64. */
5120 if (aarch64_float_const_representable_p (x))
5121 /* FMOV (scalar immediate). */
5122 *cost += extra_cost->fp[mode == DFmode].fpconst;
5123 else if (!aarch64_float_const_zero_rtx_p (x))
5125 /* This will be a load from memory. */
5126 if (mode == DFmode)
5127 *cost += extra_cost->ldst.loadd;
5128 else
5129 *cost += extra_cost->ldst.loadf;
5131 else
5132 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5133 or MOV v0.s[0], wzr - neither of which is modeled by the
5134 cost tables. Just use the default cost. */
5139 return true;
5141 case MEM:
5142 if (speed)
5144 /* For loads we want the base cost of a load, plus an
5145 approximation for the additional cost of the addressing
5146 mode. */
5147 rtx address = XEXP (x, 0);
5148 if (GET_MODE_CLASS (mode) == MODE_INT)
5149 *cost += extra_cost->ldst.load;
5150 else if (mode == SFmode)
5151 *cost += extra_cost->ldst.loadf;
5152 else if (mode == DFmode)
5153 *cost += extra_cost->ldst.loadd;
5155 *cost +=
5156 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5157 0, speed));
5160 return true;
5162 case NEG:
5163 op0 = XEXP (x, 0);
5165 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5167 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5168 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5170 /* CSETM. */
5171 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5172 return true;
5175 /* Cost this as SUB wzr, X. */
5176 op0 = CONST0_RTX (GET_MODE (x));
5177 op1 = XEXP (x, 0);
5178 goto cost_minus;
5181 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5183 /* Support (neg(fma...)) as a single instruction only if
5184 sign of zeros is unimportant. This matches the decision
5185 making in aarch64.md. */
5186 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5188 /* FNMADD. */
5189 *cost = rtx_cost (op0, NEG, 0, speed);
5190 return true;
5192 if (speed)
5193 /* FNEG. */
5194 *cost += extra_cost->fp[mode == DFmode].neg;
5195 return false;
5198 return false;
5200 case CLRSB:
5201 case CLZ:
5202 if (speed)
5203 *cost += extra_cost->alu.clz;
5205 return false;
5207 case COMPARE:
5208 op0 = XEXP (x, 0);
5209 op1 = XEXP (x, 1);
5211 if (op1 == const0_rtx
5212 && GET_CODE (op0) == AND)
5214 x = op0;
5215 goto cost_logic;
5218 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5220 /* TODO: A write to the CC flags possibly costs extra, this
5221 needs encoding in the cost tables. */
5223 /* CC_ZESWPmode supports zero extend for free. */
5224 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5225 op0 = XEXP (op0, 0);
5227 /* ANDS. */
5228 if (GET_CODE (op0) == AND)
5230 x = op0;
5231 goto cost_logic;
5234 if (GET_CODE (op0) == PLUS)
5236 /* ADDS (and CMN alias). */
5237 x = op0;
5238 goto cost_plus;
5241 if (GET_CODE (op0) == MINUS)
5243 /* SUBS. */
5244 x = op0;
5245 goto cost_minus;
5248 if (GET_CODE (op1) == NEG)
5250 /* CMN. */
5251 if (speed)
5252 *cost += extra_cost->alu.arith;
5254 *cost += rtx_cost (op0, COMPARE, 0, speed);
5255 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5256 return true;
5259 /* CMP.
5261 Compare can freely swap the order of operands, and
5262 canonicalization puts the more complex operation first.
5263 But the integer MINUS logic expects the shift/extend
5264 operation in op1. */
5265 if (! (REG_P (op0)
5266 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5268 op0 = XEXP (x, 1);
5269 op1 = XEXP (x, 0);
5271 goto cost_minus;
5274 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5276 /* FCMP. */
5277 if (speed)
5278 *cost += extra_cost->fp[mode == DFmode].compare;
5280 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5282 /* FCMP supports constant 0.0 for no extra cost. */
5283 return true;
5285 return false;
5288 return false;
5290 case MINUS:
5292 op0 = XEXP (x, 0);
5293 op1 = XEXP (x, 1);
5295 cost_minus:
5296 /* Detect valid immediates. */
5297 if ((GET_MODE_CLASS (mode) == MODE_INT
5298 || (GET_MODE_CLASS (mode) == MODE_CC
5299 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5300 && CONST_INT_P (op1)
5301 && aarch64_uimm12_shift (INTVAL (op1)))
5303 *cost += rtx_cost (op0, MINUS, 0, speed);
5305 if (speed)
5306 /* SUB(S) (immediate). */
5307 *cost += extra_cost->alu.arith;
5308 return true;
5312 /* Look for SUB (extended register). */
5313 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5315 if (speed)
5316 *cost += extra_cost->alu.arith_shift;
5318 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5319 (enum rtx_code) GET_CODE (op1),
5320 0, speed);
5321 return true;
5324 rtx new_op1 = aarch64_strip_extend (op1);
5326 /* Cost this as an FMA-alike operation. */
5327 if ((GET_CODE (new_op1) == MULT
5328 || GET_CODE (new_op1) == ASHIFT)
5329 && code != COMPARE)
5331 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5332 (enum rtx_code) code,
5333 speed);
5334 *cost += rtx_cost (op0, MINUS, 0, speed);
5335 return true;
5338 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5340 if (speed)
5342 if (GET_MODE_CLASS (mode) == MODE_INT)
5343 /* SUB(S). */
5344 *cost += extra_cost->alu.arith;
5345 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5346 /* FSUB. */
5347 *cost += extra_cost->fp[mode == DFmode].addsub;
5349 return true;
5352 case PLUS:
5354 rtx new_op0;
5356 op0 = XEXP (x, 0);
5357 op1 = XEXP (x, 1);
5359 cost_plus:
5360 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5361 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5363 /* CSINC. */
5364 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5365 *cost += rtx_cost (op1, PLUS, 1, speed);
5366 return true;
5369 if (GET_MODE_CLASS (mode) == MODE_INT
5370 && CONST_INT_P (op1)
5371 && aarch64_uimm12_shift (INTVAL (op1)))
5373 *cost += rtx_cost (op0, PLUS, 0, speed);
5375 if (speed)
5376 /* ADD (immediate). */
5377 *cost += extra_cost->alu.arith;
5378 return true;
5381 /* Look for ADD (extended register). */
5382 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5384 if (speed)
5385 *cost += extra_cost->alu.arith_shift;
5387 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5388 (enum rtx_code) GET_CODE (op0),
5389 0, speed);
5390 return true;
5393 /* Strip any extend; leave shifts behind, as we will
5394 cost them through mult_cost. */
5395 new_op0 = aarch64_strip_extend (op0);
5397 if (GET_CODE (new_op0) == MULT
5398 || GET_CODE (new_op0) == ASHIFT)
5400 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5401 speed);
5402 *cost += rtx_cost (op1, PLUS, 1, speed);
5403 return true;
5406 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5407 + rtx_cost (op1, PLUS, 1, speed));
5409 if (speed)
5411 if (GET_MODE_CLASS (mode) == MODE_INT)
5412 /* ADD. */
5413 *cost += extra_cost->alu.arith;
5414 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5415 /* FADD. */
5416 *cost += extra_cost->fp[mode == DFmode].addsub;
5418 return true;
5421 case BSWAP:
5422 *cost = COSTS_N_INSNS (1);
5424 if (speed)
5425 *cost += extra_cost->alu.rev;
5427 return false;
5429 case IOR:
5430 if (aarch_rev16_p (x))
5432 *cost = COSTS_N_INSNS (1);
5434 if (speed)
5435 *cost += extra_cost->alu.rev;
5437 return true;
5439 /* Fall through. */
5440 case XOR:
5441 case AND:
5442 cost_logic:
5443 op0 = XEXP (x, 0);
5444 op1 = XEXP (x, 1);
5446 if (code == AND
5447 && GET_CODE (op0) == MULT
5448 && CONST_INT_P (XEXP (op0, 1))
5449 && CONST_INT_P (op1)
5450 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5451 INTVAL (op1)) != 0)
5453 /* This is a UBFM/SBFM. */
5454 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5455 if (speed)
5456 *cost += extra_cost->alu.bfx;
5457 return true;
5460 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5462 /* We possibly get the immediate for free, this is not
5463 modelled. */
5464 if (CONST_INT_P (op1)
5465 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5467 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5469 if (speed)
5470 *cost += extra_cost->alu.logical;
5472 return true;
5474 else
5476 rtx new_op0 = op0;
5478 /* Handle ORN, EON, or BIC. */
5479 if (GET_CODE (op0) == NOT)
5480 op0 = XEXP (op0, 0);
5482 new_op0 = aarch64_strip_shift (op0);
5484 /* If we had a shift on op0 then this is a logical-shift-
5485 by-register/immediate operation. Otherwise, this is just
5486 a logical operation. */
5487 if (speed)
5489 if (new_op0 != op0)
5491 /* Shift by immediate. */
5492 if (CONST_INT_P (XEXP (op0, 1)))
5493 *cost += extra_cost->alu.log_shift;
5494 else
5495 *cost += extra_cost->alu.log_shift_reg;
5497 else
5498 *cost += extra_cost->alu.logical;
5501 /* In both cases we want to cost both operands. */
5502 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5503 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5505 return true;
5508 return false;
5510 case NOT:
5511 /* MVN. */
5512 if (speed)
5513 *cost += extra_cost->alu.logical;
5515 /* The logical instruction could have the shifted register form,
5516 but the cost is the same if the shift is processed as a separate
5517 instruction, so we don't bother with it here. */
5518 return false;
5520 case ZERO_EXTEND:
5522 op0 = XEXP (x, 0);
5523 /* If a value is written in SI mode, then zero extended to DI
5524 mode, the operation will in general be free as a write to
5525 a 'w' register implicitly zeroes the upper bits of an 'x'
5526 register. However, if this is
5528 (set (reg) (zero_extend (reg)))
5530 we must cost the explicit register move. */
5531 if (mode == DImode
5532 && GET_MODE (op0) == SImode
5533 && outer == SET)
5535 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5537 if (!op_cost && speed)
5538 /* MOV. */
5539 *cost += extra_cost->alu.extend;
5540 else
5541 /* Free, the cost is that of the SI mode operation. */
5542 *cost = op_cost;
5544 return true;
5546 else if (MEM_P (XEXP (x, 0)))
5548 /* All loads can zero extend to any size for free. */
5549 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5550 return true;
5553 /* UXTB/UXTH. */
5554 if (speed)
5555 *cost += extra_cost->alu.extend;
5557 return false;
5559 case SIGN_EXTEND:
5560 if (MEM_P (XEXP (x, 0)))
5562 /* LDRSH. */
5563 if (speed)
5565 rtx address = XEXP (XEXP (x, 0), 0);
5566 *cost += extra_cost->ldst.load_sign_extend;
5568 *cost +=
5569 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5570 0, speed));
5572 return true;
5575 if (speed)
5576 *cost += extra_cost->alu.extend;
5577 return false;
5579 case ASHIFT:
5580 op0 = XEXP (x, 0);
5581 op1 = XEXP (x, 1);
5583 if (CONST_INT_P (op1))
5585 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
5586 aliases. */
5587 if (speed)
5588 *cost += extra_cost->alu.shift;
5590 /* We can incorporate zero/sign extend for free. */
5591 if (GET_CODE (op0) == ZERO_EXTEND
5592 || GET_CODE (op0) == SIGN_EXTEND)
5593 op0 = XEXP (op0, 0);
5595 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5596 return true;
5598 else
5600 /* LSLV. */
5601 if (speed)
5602 *cost += extra_cost->alu.shift_reg;
5604 return false; /* All arguments need to be in registers. */
5607 case ROTATE:
5608 case ROTATERT:
5609 case LSHIFTRT:
5610 case ASHIFTRT:
5611 op0 = XEXP (x, 0);
5612 op1 = XEXP (x, 1);
5614 if (CONST_INT_P (op1))
5616 /* ASR (immediate) and friends. */
5617 if (speed)
5618 *cost += extra_cost->alu.shift;
5620 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5621 return true;
5623 else
5626 /* ASR (register) and friends. */
5627 if (speed)
5628 *cost += extra_cost->alu.shift_reg;
5630 return false; /* All arguments need to be in registers. */
5633 case SYMBOL_REF:
5635 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5637 /* LDR. */
5638 if (speed)
5639 *cost += extra_cost->ldst.load;
5641 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5642 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5644 /* ADRP, followed by ADD. */
5645 *cost += COSTS_N_INSNS (1);
5646 if (speed)
5647 *cost += 2 * extra_cost->alu.arith;
5649 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5650 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5652 /* ADR. */
5653 if (speed)
5654 *cost += extra_cost->alu.arith;
5657 if (flag_pic)
5659 /* One extra load instruction, after accessing the GOT. */
5660 *cost += COSTS_N_INSNS (1);
5661 if (speed)
5662 *cost += extra_cost->ldst.load;
5664 return true;
5666 case HIGH:
5667 case LO_SUM:
5668 /* ADRP/ADD (immediate). */
5669 if (speed)
5670 *cost += extra_cost->alu.arith;
5671 return true;
5673 case ZERO_EXTRACT:
5674 case SIGN_EXTRACT:
5675 /* UBFX/SBFX. */
5676 if (speed)
5677 *cost += extra_cost->alu.bfx;
5679 /* We can trust that the immediates used will be correct (there
5680 are no by-register forms), so we need only cost op0. */
5681 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5682 return true;
5684 case MULT:
5685 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5686 /* aarch64_rtx_mult_cost always handles recursion to its
5687 operands. */
5688 return true;
5690 case MOD:
5691 case UMOD:
5692 if (speed)
5694 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5695 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5696 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5697 else if (GET_MODE (x) == DFmode)
5698 *cost += (extra_cost->fp[1].mult
5699 + extra_cost->fp[1].div);
5700 else if (GET_MODE (x) == SFmode)
5701 *cost += (extra_cost->fp[0].mult
5702 + extra_cost->fp[0].div);
5704 return false; /* All arguments need to be in registers. */
5706 case DIV:
5707 case UDIV:
5708 case SQRT:
5709 if (speed)
5711 if (GET_MODE_CLASS (mode) == MODE_INT)
5712 /* There is no integer SQRT, so only DIV and UDIV can get
5713 here. */
5714 *cost += extra_cost->mult[mode == DImode].idiv;
5715 else
5716 *cost += extra_cost->fp[mode == DFmode].div;
5718 return false; /* All arguments need to be in registers. */
5720 case IF_THEN_ELSE:
5721 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5722 XEXP (x, 2), cost, speed);
5724 case EQ:
5725 case NE:
5726 case GT:
5727 case GTU:
5728 case LT:
5729 case LTU:
5730 case GE:
5731 case GEU:
5732 case LE:
5733 case LEU:
5735 return false; /* All arguments must be in registers. */
5737 case FMA:
5738 op0 = XEXP (x, 0);
5739 op1 = XEXP (x, 1);
5740 op2 = XEXP (x, 2);
5742 if (speed)
5743 *cost += extra_cost->fp[mode == DFmode].fma;
5745 /* FMSUB, FNMADD, and FNMSUB are free. */
5746 if (GET_CODE (op0) == NEG)
5747 op0 = XEXP (op0, 0);
5749 if (GET_CODE (op2) == NEG)
5750 op2 = XEXP (op2, 0);
5752 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5753 and the by-element operand as operand 0. */
5754 if (GET_CODE (op1) == NEG)
5755 op1 = XEXP (op1, 0);
5757 /* Catch vector-by-element operations. The by-element operand can
5758 either be (vec_duplicate (vec_select (x))) or just
5759 (vec_select (x)), depending on whether we are multiplying by
5760 a vector or a scalar.
5762 Canonicalization is not very good in these cases: FMA4 will put the
5763 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
5764 if (GET_CODE (op0) == VEC_DUPLICATE)
5765 op0 = XEXP (op0, 0);
5766 else if (GET_CODE (op1) == VEC_DUPLICATE)
5767 op1 = XEXP (op1, 0);
5769 if (GET_CODE (op0) == VEC_SELECT)
5770 op0 = XEXP (op0, 0);
5771 else if (GET_CODE (op1) == VEC_SELECT)
5772 op1 = XEXP (op1, 0);
5774 /* If the remaining parameters are not registers,
5775 get the cost to put them into registers. */
5776 *cost += rtx_cost (op0, FMA, 0, speed);
5777 *cost += rtx_cost (op1, FMA, 1, speed);
5778 *cost += rtx_cost (op2, FMA, 2, speed);
5779 return true;
5781 case FLOAT_EXTEND:
5782 if (speed)
5783 *cost += extra_cost->fp[mode == DFmode].widen;
5784 return false;
5786 case FLOAT_TRUNCATE:
5787 if (speed)
5788 *cost += extra_cost->fp[mode == DFmode].narrow;
5789 return false;
5791 case FIX:
5792 case UNSIGNED_FIX:
5793 x = XEXP (x, 0);
5794 /* Strip the rounding part. They will all be implemented
5795 by the fcvt* family of instructions anyway. */
5796 if (GET_CODE (x) == UNSPEC)
5798 unsigned int uns_code = XINT (x, 1);
5800 if (uns_code == UNSPEC_FRINTA
5801 || uns_code == UNSPEC_FRINTM
5802 || uns_code == UNSPEC_FRINTN
5803 || uns_code == UNSPEC_FRINTP
5804 || uns_code == UNSPEC_FRINTZ)
5805 x = XVECEXP (x, 0, 0);
5808 if (speed)
5809 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5811 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5812 return true;
5814 case ABS:
5815 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5817 /* FABS and FNEG are analogous. */
5818 if (speed)
5819 *cost += extra_cost->fp[mode == DFmode].neg;
5821 else
5823 /* Integer ABS will either be split to
5824 two arithmetic instructions, or will be an ABS
5825 (scalar), which we don't model. */
5826 *cost = COSTS_N_INSNS (2);
5827 if (speed)
5828 *cost += 2 * extra_cost->alu.arith;
5830 return false;
5832 case SMAX:
5833 case SMIN:
5834 if (speed)
5836 /* FMAXNM/FMINNM/FMAX/FMIN.
5837 TODO: This may not be accurate for all implementations, but
5838 we do not model this in the cost tables. */
5839 *cost += extra_cost->fp[mode == DFmode].addsub;
5841 return false;
5843 case UNSPEC:
5844 /* The floating point round to integer frint* instructions. */
5845 if (aarch64_frint_unspec_p (XINT (x, 1)))
5847 if (speed)
5848 *cost += extra_cost->fp[mode == DFmode].roundint;
5850 return false;
5853 if (XINT (x, 1) == UNSPEC_RBIT)
5855 if (speed)
5856 *cost += extra_cost->alu.rev;
5858 return false;
5860 break;
5862 case TRUNCATE:
5864 /* Decompose <su>muldi3_highpart. */
5865 if (/* (truncate:DI */
5866 mode == DImode
5867 /* (lshiftrt:TI */
5868 && GET_MODE (XEXP (x, 0)) == TImode
5869 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5870 /* (mult:TI */
5871 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5872 /* (ANY_EXTEND:TI (reg:DI))
5873 (ANY_EXTEND:TI (reg:DI))) */
5874 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5875 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5876 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5877 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5878 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5879 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5880 /* (const_int 64) */
5881 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5882 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5884 /* UMULH/SMULH. */
5885 if (speed)
5886 *cost += extra_cost->mult[mode == DImode].extend;
5887 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5888 MULT, 0, speed);
5889 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5890 MULT, 1, speed);
5891 return true;
5894 /* Fall through. */
5895 default:
5896 break;
5899 if (dump_file && (dump_flags & TDF_DETAILS))
5900 fprintf (dump_file,
5901 "\nFailed to cost RTX. Assuming default cost.\n");
5903 return true;
5906 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5907 calculated for X. This cost is stored in *COST. Returns true
5908 if the total cost of X was calculated. */
5909 static bool
5910 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5911 int param, int *cost, bool speed)
5913 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5915 if (dump_file && (dump_flags & TDF_DETAILS))
5917 print_rtl_single (dump_file, x);
5918 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5919 speed ? "Hot" : "Cold",
5920 *cost, result ? "final" : "partial");
5923 return result;
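/* Implement TARGET_REGISTER_MOVE_COST.  CALLER_SAVE_REGS and POINTER_REGS
   are costed as GENERAL_REGS, moves involving the stack pointer class go
   via the general registers, and, without SIMD, a 128-bit FP-to-FP move
   is costed as a round trip through a general register.  */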
5926 static int
5927 aarch64_register_move_cost (enum machine_mode mode,
5928 reg_class_t from_i, reg_class_t to_i)
5930 enum reg_class from = (enum reg_class) from_i;
5931 enum reg_class to = (enum reg_class) to_i;
5932 const struct cpu_regmove_cost *regmove_cost
5933 = aarch64_tune_params->regmove_cost;
5935 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
5936 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
5937 to = GENERAL_REGS;
5939 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
5940 from = GENERAL_REGS;
5942 /* Moving between GPR and stack cost is the same as GP2GP. */
5943 if ((from == GENERAL_REGS && to == STACK_REG)
5944 || (to == GENERAL_REGS && from == STACK_REG))
5945 return regmove_cost->GP2GP;
5947 /* To/From the stack register, we move via the gprs. */
5948 if (to == STACK_REG || from == STACK_REG)
5949 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5950 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5952 if (from == GENERAL_REGS && to == GENERAL_REGS)
5953 return regmove_cost->GP2GP;
5954 else if (from == GENERAL_REGS)
5955 return regmove_cost->GP2FP;
5956 else if (to == GENERAL_REGS)
5957 return regmove_cost->FP2GP;
5959 /* When AdvSIMD instructions are disabled it is not possible to move
5960 a 128-bit value directly between Q registers. This is handled in
5961 secondary reload. A general register is used as a scratch to move
5962 the upper DI value and the lower DI value is moved directly,
5963 hence the cost is the sum of three moves. */
5964 if (! TARGET_SIMD && GET_MODE_SIZE (mode) == 16)
5965 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
5967 return regmove_cost->FP2FP;
5970 static int
5971 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
5972 reg_class_t rclass ATTRIBUTE_UNUSED,
5973 bool in ATTRIBUTE_UNUSED)
5975 return aarch64_tune_params->memmov_cost;
5978 /* Return the number of instructions that can be issued per cycle. */
5979 static int
5980 aarch64_sched_issue_rate (void)
5982 return aarch64_tune_params->issue_rate;
5985 /* Vectorizer cost model target hooks. */
5987 /* Implement targetm.vectorize.builtin_vectorization_cost. */
5988 static int
5989 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
5990 tree vectype,
5991 int misalign ATTRIBUTE_UNUSED)
5993 unsigned elements;
5995 switch (type_of_cost)
5997 case scalar_stmt:
5998 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6000 case scalar_load:
6001 return aarch64_tune_params->vec_costs->scalar_load_cost;
6003 case scalar_store:
6004 return aarch64_tune_params->vec_costs->scalar_store_cost;
6006 case vector_stmt:
6007 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6009 case vector_load:
6010 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6012 case vector_store:
6013 return aarch64_tune_params->vec_costs->vec_store_cost;
6015 case vec_to_scalar:
6016 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6018 case scalar_to_vec:
6019 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6021 case unaligned_load:
6022 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6024 case unaligned_store:
6025 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6027 case cond_branch_taken:
6028 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6030 case cond_branch_not_taken:
6031 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6033 case vec_perm:
6034 case vec_promote_demote:
6035 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6037 case vec_construct:
6038 elements = TYPE_VECTOR_SUBPARTS (vectype);
6039 return elements / 2 + 1;
6041 default:
6042 gcc_unreachable ();
6046 /* Implement targetm.vectorize.add_stmt_cost. */
6047 static unsigned
6048 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6049 struct _stmt_vec_info *stmt_info, int misalign,
6050 enum vect_cost_model_location where)
6052 unsigned *cost = (unsigned *) data;
6053 unsigned retval = 0;
6055 if (flag_vect_cost_model)
6057 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6058 int stmt_cost =
6059 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6061 /* Statements in an inner loop relative to the loop being
6062 vectorized are weighted more heavily. The value here is
6063 a function (linear for now) of the loop nest level. */
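/* For example, a statement whose containing loop is nested inside the
   loop being vectorized has its count multiplied by that loop's depth.  */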
6064 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6066 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6067 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6068 unsigned nest_level = loop_depth (loop);
6070 count *= nest_level;
6073 retval = (unsigned) (count * stmt_cost);
6074 cost[where] += retval;
6077 return retval;
6080 static void initialize_aarch64_code_model (void);
6082 /* Parse the architecture extension string. */
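/* For example, given the string "+fp+nosimd" the loop below first sets
   the ISA flags for "fp" and then clears the flags for "simd"; an
   unrecognised name such as "+foo" is reported with error ().  */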
6084 static void
6085 aarch64_parse_extension (char *str)
6087 /* The extension string is parsed left to right. */
6088 const struct aarch64_option_extension *opt = NULL;
6090 /* Flag to say whether we are adding or removing an extension. */
6091 int adding_ext = -1;
6093 while (str != NULL && *str != 0)
6095 char *ext;
6096 size_t len;
6098 str++;
6099 ext = strchr (str, '+');
6101 if (ext != NULL)
6102 len = ext - str;
6103 else
6104 len = strlen (str);
6106 if (len >= 2 && strncmp (str, "no", 2) == 0)
6108 adding_ext = 0;
6109 len -= 2;
6110 str += 2;
6112 else if (len > 0)
6113 adding_ext = 1;
6115 if (len == 0)
6117 error ("missing feature modifier after %qs", "+no");
6118 return;
6121 /* Scan over the extensions table trying to find an exact match. */
6122 for (opt = all_extensions; opt->name != NULL; opt++)
6124 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6126 /* Add or remove the extension. */
6127 if (adding_ext)
6128 aarch64_isa_flags |= opt->flags_on;
6129 else
6130 aarch64_isa_flags &= ~(opt->flags_off);
6131 break;
6135 if (opt->name == NULL)
6137 /* Extension not found in list. */
6138 error ("unknown feature modifier %qs", str);
6139 return;
6142 str = ext;
6145 return;
6148 /* Parse the ARCH string. */
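/* For example, "-march=armv8-a+crc" selects the "armv8-a" entry in
   all_architectures and then hands "+crc" to aarch64_parse_extension.
   (Extension names here are illustrative; the authoritative list lives
   in aarch64-option-extensions.def.)  */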
6150 static void
6151 aarch64_parse_arch (void)
6153 char *ext;
6154 const struct processor *arch;
6155 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6156 size_t len;
6158 strcpy (str, aarch64_arch_string);
6160 ext = strchr (str, '+');
6162 if (ext != NULL)
6163 len = ext - str;
6164 else
6165 len = strlen (str);
6167 if (len == 0)
6169 error ("missing arch name in -march=%qs", str);
6170 return;
6173 /* Loop through the list of supported ARCHs to find a match. */
6174 for (arch = all_architectures; arch->name != NULL; arch++)
6176 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6178 selected_arch = arch;
6179 aarch64_isa_flags = selected_arch->flags;
6181 if (!selected_cpu)
6182 selected_cpu = &all_cores[selected_arch->core];
6184 if (ext != NULL)
6186 /* ARCH string contains at least one extension. */
6187 aarch64_parse_extension (ext);
6190 if (strcmp (selected_arch->arch, selected_cpu->arch))
6192 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6193 selected_cpu->name, selected_arch->name);
6196 return;
6200 /* ARCH name not found in list. */
6201 error ("unknown value %qs for -march", str);
6202 return;
6205 /* Parse the CPU string. */
6207 static void
6208 aarch64_parse_cpu (void)
6210 char *ext;
6211 const struct processor *cpu;
6212 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6213 size_t len;
6215 strcpy (str, aarch64_cpu_string);
6217 ext = strchr (str, '+');
6219 if (ext != NULL)
6220 len = ext - str;
6221 else
6222 len = strlen (str);
6224 if (len == 0)
6226 error ("missing cpu name in -mcpu=%qs", str);
6227 return;
6230 /* Loop through the list of supported CPUs to find a match. */
6231 for (cpu = all_cores; cpu->name != NULL; cpu++)
6233 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6235 selected_cpu = cpu;
6236 selected_tune = cpu;
6237 aarch64_isa_flags = selected_cpu->flags;
6239 if (ext != NULL)
6241 /* CPU string contains at least one extension. */
6242 aarch64_parse_extension (ext);
6245 return;
6249 /* CPU name not found in list. */
6250 error ("unknown value %qs for -mcpu", str);
6251 return;
6254 /* Parse the TUNE string. */
6256 static void
6257 aarch64_parse_tune (void)
6259 const struct processor *cpu;
6260 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6261 strcpy (str, aarch64_tune_string);
6263 /* Loop through the list of supported CPUs to find a match. */
6264 for (cpu = all_cores; cpu->name != NULL; cpu++)
6266 if (strcmp (cpu->name, str) == 0)
6268 selected_tune = cpu;
6269 return;
6273 /* CPU name not found in list. */
6274 error ("unknown value %qs for -mtune", str);
6275 return;
6279 /* Implement TARGET_OPTION_OVERRIDE. */
6281 static void
6282 aarch64_override_options (void)
6284 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6285 If either of -march or -mtune is given, they override their
6286 respective component of -mcpu.
6288 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6289 with -march: if -mcpu is not present on the command line, -march
6290 must set a sensible default CPU. */
6291 if (aarch64_cpu_string)
6293 aarch64_parse_cpu ();
6296 if (aarch64_arch_string)
6298 aarch64_parse_arch ();
6301 if (aarch64_tune_string)
6303 aarch64_parse_tune ();
6306 #ifndef HAVE_AS_MABI_OPTION
6307 /* The compiler may have been configured with 2.23.* binutils, which does
6308 not have support for ILP32. */
6309 if (TARGET_ILP32)
6310 error ("Assembler does not support -mabi=ilp32");
6311 #endif
6313 initialize_aarch64_code_model ();
6315 aarch64_build_bitmask_table ();
6317 /* This target defaults to strict volatile bitfields. */
6318 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6319 flag_strict_volatile_bitfields = 1;
6321 /* If the user did not specify a processor, choose the default
6322 one for them. This will be the CPU set during configuration using
6323 --with-cpu, otherwise it is "generic". */
6324 if (!selected_cpu)
6326 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6327 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6330 gcc_assert (selected_cpu);
6332 /* The selected cpu may be an architecture, so lookup tuning by core ID. */
6333 if (!selected_tune)
6334 selected_tune = &all_cores[selected_cpu->core];
6336 aarch64_tune_flags = selected_tune->flags;
6337 aarch64_tune = selected_tune->core;
6338 aarch64_tune_params = selected_tune->tune;
6340 aarch64_override_options_after_change ();
6343 /* Implement targetm.override_options_after_change. */
6345 static void
6346 aarch64_override_options_after_change (void)
6348 if (flag_omit_frame_pointer)
6349 flag_omit_leaf_frame_pointer = false;
6350 else if (flag_omit_leaf_frame_pointer)
6351 flag_omit_frame_pointer = true;
6354 static struct machine_function *
6355 aarch64_init_machine_status (void)
6357 struct machine_function *machine;
6358 machine = ggc_cleared_alloc<machine_function> ();
6359 return machine;
6362 void
6363 aarch64_init_expanders (void)
6365 init_machine_status = aarch64_init_machine_status;
6368 /* A checking mechanism for the implementation of the various code models. */
6369 static void
6370 initialize_aarch64_code_model (void)
6372 if (flag_pic)
6374 switch (aarch64_cmodel_var)
6376 case AARCH64_CMODEL_TINY:
6377 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6378 break;
6379 case AARCH64_CMODEL_SMALL:
6380 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6381 break;
6382 case AARCH64_CMODEL_LARGE:
6383 sorry ("code model %qs with -f%s", "large",
6384 flag_pic > 1 ? "PIC" : "pic");
6385 default:
6386 gcc_unreachable ();
6389 else
6390 aarch64_cmodel = aarch64_cmodel_var;
6393 /* Return true if SYMBOL_REF X binds locally. */
6395 static bool
6396 aarch64_symbol_binds_local_p (const_rtx x)
6398 return (SYMBOL_REF_DECL (x)
6399 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6400 : SYMBOL_REF_LOCAL_P (x));
6403 /* Return true if SYMBOL_REF X is thread local */
6404 static bool
6405 aarch64_tls_symbol_p (rtx x)
6407 if (! TARGET_HAVE_TLS)
6408 return false;
6410 if (GET_CODE (x) != SYMBOL_REF)
6411 return false;
6413 return SYMBOL_REF_TLS_MODEL (x) != 0;
6416 /* Classify a TLS symbol into one of the TLS kinds. */
6417 enum aarch64_symbol_type
6418 aarch64_classify_tls_symbol (rtx x)
6420 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6422 switch (tls_kind)
6424 case TLS_MODEL_GLOBAL_DYNAMIC:
6425 case TLS_MODEL_LOCAL_DYNAMIC:
6426 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6428 case TLS_MODEL_INITIAL_EXEC:
6429 return SYMBOL_SMALL_GOTTPREL;
6431 case TLS_MODEL_LOCAL_EXEC:
6432 return SYMBOL_SMALL_TPREL;
6434 case TLS_MODEL_EMULATED:
6435 case TLS_MODEL_NONE:
6436 return SYMBOL_FORCE_TO_MEM;
6438 default:
6439 gcc_unreachable ();
6443 /* Return the method that should be used to access SYMBOL_REF or
6444 LABEL_REF X in context CONTEXT. */
6446 enum aarch64_symbol_type
6447 aarch64_classify_symbol (rtx x,
6448 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6450 if (GET_CODE (x) == LABEL_REF)
6452 switch (aarch64_cmodel)
6454 case AARCH64_CMODEL_LARGE:
6455 return SYMBOL_FORCE_TO_MEM;
6457 case AARCH64_CMODEL_TINY_PIC:
6458 case AARCH64_CMODEL_TINY:
6459 return SYMBOL_TINY_ABSOLUTE;
6461 case AARCH64_CMODEL_SMALL_PIC:
6462 case AARCH64_CMODEL_SMALL:
6463 return SYMBOL_SMALL_ABSOLUTE;
6465 default:
6466 gcc_unreachable ();
6470 if (GET_CODE (x) == SYMBOL_REF)
6472 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6473 return SYMBOL_FORCE_TO_MEM;
6475 if (aarch64_tls_symbol_p (x))
6476 return aarch64_classify_tls_symbol (x);
6478 switch (aarch64_cmodel)
6480 case AARCH64_CMODEL_TINY:
6481 if (SYMBOL_REF_WEAK (x))
6482 return SYMBOL_FORCE_TO_MEM;
6483 return SYMBOL_TINY_ABSOLUTE;
6485 case AARCH64_CMODEL_SMALL:
6486 if (SYMBOL_REF_WEAK (x))
6487 return SYMBOL_FORCE_TO_MEM;
6488 return SYMBOL_SMALL_ABSOLUTE;
6490 case AARCH64_CMODEL_TINY_PIC:
6491 if (!aarch64_symbol_binds_local_p (x))
6492 return SYMBOL_TINY_GOT;
6493 return SYMBOL_TINY_ABSOLUTE;
6495 case AARCH64_CMODEL_SMALL_PIC:
6496 if (!aarch64_symbol_binds_local_p (x))
6497 return SYMBOL_SMALL_GOT;
6498 return SYMBOL_SMALL_ABSOLUTE;
6500 default:
6501 gcc_unreachable ();
6505 /* By default push everything into the constant pool. */
6506 return SYMBOL_FORCE_TO_MEM;
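/* Worked example (illustrative): with -mcmodel=small -fpic, a SYMBOL_REF
   that does not bind locally is classified as SYMBOL_SMALL_GOT and will
   be accessed through the GOT, while a locally-binding symbol yields
   SYMBOL_SMALL_ABSOLUTE and is typically materialized with an adrp/add
   pair.  Under -mcmodel=large every symbol ends up as
   SYMBOL_FORCE_TO_MEM, i.e. it is loaded from the literal pool.  */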
6509 bool
6510 aarch64_constant_address_p (rtx x)
6512 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6515 bool
6516 aarch64_legitimate_pic_operand_p (rtx x)
6518 if (GET_CODE (x) == SYMBOL_REF
6519 || (GET_CODE (x) == CONST
6520 && GET_CODE (XEXP (x, 0)) == PLUS
6521 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6522 return false;
6524 return true;
6527 /* Return true if X holds a floating-point constant that is either
6528 +0.0 or representable in quarter-precision (8-bit immediate) form. */
6529 static bool
6530 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6532 if (!CONST_DOUBLE_P (x))
6533 return false;
6535 /* TODO: We could handle moving 0.0 to a TFmode register,
6536 but first we would like to refactor the movtf_aarch64
6537 pattern to be more amenable to splitting moves properly and
6538 to gating correctly on TARGET_SIMD.  For now, reject all
6539 constants that are not destined for SFmode or DFmode registers. */
6540 if (!(mode == SFmode || mode == DFmode))
6541 return false;
6543 if (aarch64_float_const_zero_rtx_p (x))
6544 return true;
6545 return aarch64_float_const_representable_p (x);
6548 static bool
6549 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6551 /* Do not allow vector struct mode constants. We could support
6552 0 and -1 easily, but they need support in aarch64-simd.md. */
6553 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6554 return false;
6556 /* This could probably go away because
6557 we now decompose CONST_INTs according to expand_mov_immediate. */
6558 if ((GET_CODE (x) == CONST_VECTOR
6559 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6560 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6561 return !targetm.cannot_force_const_mem (mode, x);
6563 if (GET_CODE (x) == HIGH
6564 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6565 return true;
6567 return aarch64_constant_address_p (x);
6570 static rtx
6571 aarch64_load_tp (rtx target)
6573 if (!target
6574 || GET_MODE (target) != Pmode
6575 || !register_operand (target, Pmode))
6576 target = gen_reg_rtx (Pmode);
6578 /* Can return in any reg. */
6579 emit_insn (gen_aarch64_load_tp_hard (target));
6580 return target;
6583 /* On AAPCS systems, this is the "struct __va_list". */
6584 static GTY(()) tree va_list_type;
6586 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6587 Return the type to use as __builtin_va_list.
6589 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6591 struct __va_list
6593 void *__stack;
6594 void *__gr_top;
6595 void *__vr_top;
6596 int __gr_offs;
6597 int __vr_offs;
6598 }; */
6600 static tree
6601 aarch64_build_builtin_va_list (void)
6603 tree va_list_name;
6604 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6606 /* Create the type. */
6607 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6608 /* Give it the required name. */
6609 va_list_name = build_decl (BUILTINS_LOCATION,
6610 TYPE_DECL,
6611 get_identifier ("__va_list"),
6612 va_list_type);
6613 DECL_ARTIFICIAL (va_list_name) = 1;
6614 TYPE_NAME (va_list_type) = va_list_name;
6615 TYPE_STUB_DECL (va_list_type) = va_list_name;
6617 /* Create the fields. */
6618 f_stack = build_decl (BUILTINS_LOCATION,
6619 FIELD_DECL, get_identifier ("__stack"),
6620 ptr_type_node);
6621 f_grtop = build_decl (BUILTINS_LOCATION,
6622 FIELD_DECL, get_identifier ("__gr_top"),
6623 ptr_type_node);
6624 f_vrtop = build_decl (BUILTINS_LOCATION,
6625 FIELD_DECL, get_identifier ("__vr_top"),
6626 ptr_type_node);
6627 f_groff = build_decl (BUILTINS_LOCATION,
6628 FIELD_DECL, get_identifier ("__gr_offs"),
6629 integer_type_node);
6630 f_vroff = build_decl (BUILTINS_LOCATION,
6631 FIELD_DECL, get_identifier ("__vr_offs"),
6632 integer_type_node);
6634 DECL_ARTIFICIAL (f_stack) = 1;
6635 DECL_ARTIFICIAL (f_grtop) = 1;
6636 DECL_ARTIFICIAL (f_vrtop) = 1;
6637 DECL_ARTIFICIAL (f_groff) = 1;
6638 DECL_ARTIFICIAL (f_vroff) = 1;
6640 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6641 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6642 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6643 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6644 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6646 TYPE_FIELDS (va_list_type) = f_stack;
6647 DECL_CHAIN (f_stack) = f_grtop;
6648 DECL_CHAIN (f_grtop) = f_vrtop;
6649 DECL_CHAIN (f_vrtop) = f_groff;
6650 DECL_CHAIN (f_groff) = f_vroff;
6652 /* Compute its layout. */
6653 layout_type (va_list_type);
6655 return va_list_type;
6658 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6659 static void
6660 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6662 const CUMULATIVE_ARGS *cum;
6663 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6664 tree stack, grtop, vrtop, groff, vroff;
6665 tree t;
6666 int gr_save_area_size;
6667 int vr_save_area_size;
6668 int vr_offset;
6670 cum = &crtl->args.info;
6671 gr_save_area_size
6672 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6673 vr_save_area_size
6674 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6676 if (TARGET_GENERAL_REGS_ONLY)
6678 if (cum->aapcs_nvrn > 0)
6679 sorry ("%qs and floating point or vector arguments",
6680 "-mgeneral-regs-only");
6681 vr_save_area_size = 0;
6684 f_stack = TYPE_FIELDS (va_list_type_node);
6685 f_grtop = DECL_CHAIN (f_stack);
6686 f_vrtop = DECL_CHAIN (f_grtop);
6687 f_groff = DECL_CHAIN (f_vrtop);
6688 f_vroff = DECL_CHAIN (f_groff);
6690 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6691 NULL_TREE);
6692 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6693 NULL_TREE);
6694 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6695 NULL_TREE);
6696 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6697 NULL_TREE);
6698 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6699 NULL_TREE);
6701 /* Emit code to initialize STACK, which points to the next varargs stack
6702 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6703 by named arguments. STACK is 8-byte aligned. */
6704 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6705 if (cum->aapcs_stack_size > 0)
6706 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6707 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6708 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6710 /* Emit code to initialize GRTOP, the top of the GR save area.
6711 virtual_incoming_args_rtx should have been 16 byte aligned. */
6712 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6713 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6714 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6716 /* Emit code to initialize VRTOP, the top of the VR save area.
6717 This address is gr_save_area_bytes below GRTOP, rounded
6718 down to the next 16-byte boundary. */
6719 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6720 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6721 STACK_BOUNDARY / BITS_PER_UNIT);
6723 if (vr_offset)
6724 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6725 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6726 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6728 /* Emit code to initialize GROFF, the offset from GRTOP of the
6729 next GPR argument. */
6730 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6731 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6732 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6734 /* Likewise emit code to initialize VROFF, the offset from VRTOP
6735 of the next VR argument. */
6736 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6737 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6738 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
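/* Worked example (illustrative, using the AAPCS64 register counts
   NUM_ARG_REGS == 8 and NUM_FP_ARG_REGS == 8): for

     void f (int n, ...);

   only x0 is consumed by the named argument, so the code above records
   gr_save_area_size == 7 * 8 and vr_save_area_size == 8 * 16, giving
   __gr_offs == -56 and __vr_offs == -128, with __gr_top and __vr_top
   pointing just past the corresponding register save areas dumped by
   aarch64_setup_incoming_varargs.  */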
6741 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6743 static tree
6744 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6745 gimple_seq *post_p ATTRIBUTE_UNUSED)
6747 tree addr;
6748 bool indirect_p;
6749 bool is_ha; /* is HFA or HVA. */
6750 bool dw_align; /* double-word align. */
6751 enum machine_mode ag_mode = VOIDmode;
6752 int nregs;
6753 enum machine_mode mode;
6755 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6756 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6757 HOST_WIDE_INT size, rsize, adjust, align;
6758 tree t, u, cond1, cond2;
6760 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6761 if (indirect_p)
6762 type = build_pointer_type (type);
6764 mode = TYPE_MODE (type);
6766 f_stack = TYPE_FIELDS (va_list_type_node);
6767 f_grtop = DECL_CHAIN (f_stack);
6768 f_vrtop = DECL_CHAIN (f_grtop);
6769 f_groff = DECL_CHAIN (f_vrtop);
6770 f_vroff = DECL_CHAIN (f_groff);
6772 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6773 f_stack, NULL_TREE);
6774 size = int_size_in_bytes (type);
6775 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6777 dw_align = false;
6778 adjust = 0;
6779 if (aarch64_vfp_is_call_or_return_candidate (mode,
6780 type,
6781 &ag_mode,
6782 &nregs,
6783 &is_ha))
6785 /* TYPE passed in fp/simd registers. */
6786 if (TARGET_GENERAL_REGS_ONLY)
6787 sorry ("%qs and floating point or vector arguments",
6788 "-mgeneral-regs-only");
6790 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6791 unshare_expr (valist), f_vrtop, NULL_TREE);
6792 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6793 unshare_expr (valist), f_vroff, NULL_TREE);
6795 rsize = nregs * UNITS_PER_VREG;
6797 if (is_ha)
6799 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6800 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6802 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6803 && size < UNITS_PER_VREG)
6805 adjust = UNITS_PER_VREG - size;
6808 else
6810 /* TYPE passed in general registers. */
6811 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6812 unshare_expr (valist), f_grtop, NULL_TREE);
6813 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6814 unshare_expr (valist), f_groff, NULL_TREE);
6815 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6816 nregs = rsize / UNITS_PER_WORD;
6818 if (align > 8)
6819 dw_align = true;
6821 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6822 && size < UNITS_PER_WORD)
6824 adjust = UNITS_PER_WORD - size;
6828 /* Get a local temporary for the field value. */
6829 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6831 /* Emit code to branch if off >= 0. */
6832 t = build2 (GE_EXPR, boolean_type_node, off,
6833 build_int_cst (TREE_TYPE (off), 0));
6834 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6836 if (dw_align)
6838 /* Emit: offs = (offs + 15) & -16. */
6839 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6840 build_int_cst (TREE_TYPE (off), 15));
6841 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6842 build_int_cst (TREE_TYPE (off), -16));
6843 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6845 else
6846 roundup = NULL;
6848 /* Update ap.__[g|v]r_offs */
6849 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6850 build_int_cst (TREE_TYPE (off), rsize));
6851 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6853 /* String up. */
6854 if (roundup)
6855 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6857 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6858 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6859 build_int_cst (TREE_TYPE (f_off), 0));
6860 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6862 /* String up: make sure the assignment happens before the use. */
6863 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6864 COND_EXPR_ELSE (cond1) = t;
6866 /* Prepare the trees handling the argument that is passed on the stack;
6867 the top level node will be stored in ON_STACK. */
6868 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6869 if (align > 8)
6871 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
6872 t = fold_convert (intDI_type_node, arg);
6873 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6874 build_int_cst (TREE_TYPE (t), 15));
6875 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6876 build_int_cst (TREE_TYPE (t), -16));
6877 t = fold_convert (TREE_TYPE (arg), t);
6878 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6880 else
6881 roundup = NULL;
6882 /* Advance ap.__stack */
6883 t = fold_convert (intDI_type_node, arg);
6884 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6885 build_int_cst (TREE_TYPE (t), size + 7));
6886 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6887 build_int_cst (TREE_TYPE (t), -8));
6888 t = fold_convert (TREE_TYPE (arg), t);
6889 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6890 /* String up roundup and advance. */
6891 if (roundup)
6892 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6893 /* String up with arg */
6894 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6895 /* Big-endianness related address adjustment. */
6896 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6897 && size < UNITS_PER_WORD)
6899 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6900 size_int (UNITS_PER_WORD - size));
6901 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6904 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6905 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6907 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
6908 t = off;
6909 if (adjust)
6910 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6911 build_int_cst (TREE_TYPE (off), adjust));
6913 t = fold_convert (sizetype, t);
6914 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6916 if (is_ha)
6918 /* type ha; // treat as "struct {ftype field[n];}"
6919 ... [computing offs]
6920 for (i = 0; i <nregs; ++i, offs += 16)
6921 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6922 return ha; */
6923 int i;
6924 tree tmp_ha, field_t, field_ptr_t;
6926 /* Declare a local variable. */
6927 tmp_ha = create_tmp_var_raw (type, "ha");
6928 gimple_add_tmp_var (tmp_ha);
6930 /* Establish the base type. */
6931 switch (ag_mode)
6933 case SFmode:
6934 field_t = float_type_node;
6935 field_ptr_t = float_ptr_type_node;
6936 break;
6937 case DFmode:
6938 field_t = double_type_node;
6939 field_ptr_t = double_ptr_type_node;
6940 break;
6941 case TFmode:
6942 field_t = long_double_type_node;
6943 field_ptr_t = long_double_ptr_type_node;
6944 break;
6945 /* Half-precision and quad-precision floating point are not fully
6946 supported yet.  Enable the following code once support is complete;
6947 the correct type node for __fp16 * still needs to be found. */
6948 #if 0
6949 case HFmode:
6950 field_t = float_type_node;
6951 field_ptr_t = float_ptr_type_node;
6952 break;
6953 #endif
6954 case V2SImode:
6955 case V4SImode:
6957 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
6958 field_t = build_vector_type_for_mode (innertype, ag_mode);
6959 field_ptr_t = build_pointer_type (field_t);
6961 break;
6962 default:
6963 gcc_assert (0);
6966 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area).  */
6967 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
6968 addr = t;
6969 t = fold_convert (field_ptr_t, addr);
6970 t = build2 (MODIFY_EXPR, field_t,
6971 build1 (INDIRECT_REF, field_t, tmp_ha),
6972 build1 (INDIRECT_REF, field_t, t));
6974 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
6975 for (i = 1; i < nregs; ++i)
6977 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
6978 u = fold_convert (field_ptr_t, addr);
6979 u = build2 (MODIFY_EXPR, field_t,
6980 build2 (MEM_REF, field_t, tmp_ha,
6981 build_int_cst (field_ptr_t,
6982 (i *
6983 int_size_in_bytes (field_t)))),
6984 build1 (INDIRECT_REF, field_t, u));
6985 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
6988 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
6989 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
6992 COND_EXPR_ELSE (cond2) = t;
6993 addr = fold_convert (build_pointer_type (type), cond1);
6994 addr = build_va_arg_indirect_ref (addr);
6996 if (indirect_p)
6997 addr = build_va_arg_indirect_ref (addr);
6999 return addr;
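/* Rough sketch of the generated logic (illustrative): for
   va_arg (ap, double) the expansion tests ap.__vr_offs; while it is
   still negative the value is loaded from ap.__vr_top + ap.__vr_offs
   (plus the big-endian ADJUST where applicable) and __vr_offs is
   advanced by UNITS_PER_VREG, otherwise the value is read from
   ap.__stack, which is then advanced by the 8-byte-rounded size.  The
   IS_HA path above additionally copies each element of a homogeneous
   aggregate out of consecutive 16-byte VR slots into a local
   temporary.  */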
7002 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7004 static void
7005 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7006 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7007 int no_rtl)
7009 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7010 CUMULATIVE_ARGS local_cum;
7011 int gr_saved, vr_saved;
7013 /* The caller has advanced CUM up to, but not beyond, the last named
7014 argument. Advance a local copy of CUM past the last "real" named
7015 argument, to find out how many registers are left over. */
7016 local_cum = *cum;
7017 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7019 /* Find out how many registers we need to save. */
7020 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7021 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7023 if (TARGET_GENERAL_REGS_ONLY)
7025 if (local_cum.aapcs_nvrn > 0)
7026 sorry ("%qs and floating point or vector arguments",
7027 "-mgeneral-regs-only");
7028 vr_saved = 0;
7031 if (!no_rtl)
7033 if (gr_saved > 0)
7035 rtx ptr, mem;
7037 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7038 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7039 - gr_saved * UNITS_PER_WORD);
7040 mem = gen_frame_mem (BLKmode, ptr);
7041 set_mem_alias_set (mem, get_varargs_alias_set ());
7043 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7044 mem, gr_saved);
7046 if (vr_saved > 0)
7048 /* We can't use move_block_from_reg, because it will use
7049 the wrong mode, storing D regs only. */
7050 enum machine_mode mode = TImode;
7051 int off, i;
7053 /* Set OFF to the offset from virtual_incoming_args_rtx of
7054 the first vector register. The VR save area lies below
7055 the GR one, and is aligned to 16 bytes. */
7056 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7057 STACK_BOUNDARY / BITS_PER_UNIT);
7058 off -= vr_saved * UNITS_PER_VREG;
7060 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7062 rtx ptr, mem;
7064 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7065 mem = gen_frame_mem (mode, ptr);
7066 set_mem_alias_set (mem, get_varargs_alias_set ());
7067 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7068 off += UNITS_PER_VREG;
7073 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7074 any complication of having crtl->args.pretend_args_size changed. */
7075 cfun->machine->frame.saved_varargs_size
7076 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7077 STACK_BOUNDARY / BITS_PER_UNIT)
7078 + vr_saved * UNITS_PER_VREG);
7081 static void
7082 aarch64_conditional_register_usage (void)
7084 int i;
7085 if (!TARGET_FLOAT)
7087 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7089 fixed_regs[i] = 1;
7090 call_used_regs[i] = 1;
7095 /* Walk down the type tree of TYPE counting consecutive base elements.
7096 If *MODEP is VOIDmode, then set it to the first valid floating point
7097 type. If a non-floating point type is found, or if a floating point
7098 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7099 otherwise return the count in the sub-tree. */
7100 static int
7101 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
7103 enum machine_mode mode;
7104 HOST_WIDE_INT size;
7106 switch (TREE_CODE (type))
7108 case REAL_TYPE:
7109 mode = TYPE_MODE (type);
7110 if (mode != DFmode && mode != SFmode && mode != TFmode)
7111 return -1;
7113 if (*modep == VOIDmode)
7114 *modep = mode;
7116 if (*modep == mode)
7117 return 1;
7119 break;
7121 case COMPLEX_TYPE:
7122 mode = TYPE_MODE (TREE_TYPE (type));
7123 if (mode != DFmode && mode != SFmode && mode != TFmode)
7124 return -1;
7126 if (*modep == VOIDmode)
7127 *modep = mode;
7129 if (*modep == mode)
7130 return 2;
7132 break;
7134 case VECTOR_TYPE:
7135 /* Use V2SImode and V4SImode as representatives of all 64-bit
7136 and 128-bit vector types. */
7137 size = int_size_in_bytes (type);
7138 switch (size)
7140 case 8:
7141 mode = V2SImode;
7142 break;
7143 case 16:
7144 mode = V4SImode;
7145 break;
7146 default:
7147 return -1;
7150 if (*modep == VOIDmode)
7151 *modep = mode;
7153 /* Vector modes are considered to be opaque: two vectors are
7154 equivalent for the purposes of being homogeneous aggregates
7155 if they are the same size. */
7156 if (*modep == mode)
7157 return 1;
7159 break;
7161 case ARRAY_TYPE:
7163 int count;
7164 tree index = TYPE_DOMAIN (type);
7166 /* Can't handle incomplete types nor sizes that are not
7167 fixed. */
7168 if (!COMPLETE_TYPE_P (type)
7169 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7170 return -1;
7172 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7173 if (count == -1
7174 || !index
7175 || !TYPE_MAX_VALUE (index)
7176 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7177 || !TYPE_MIN_VALUE (index)
7178 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7179 || count < 0)
7180 return -1;
7182 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7183 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7185 /* There must be no padding. */
7186 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7187 return -1;
7189 return count;
7192 case RECORD_TYPE:
7194 int count = 0;
7195 int sub_count;
7196 tree field;
7198 /* Can't handle incomplete types nor sizes that are not
7199 fixed. */
7200 if (!COMPLETE_TYPE_P (type)
7201 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7202 return -1;
7204 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7206 if (TREE_CODE (field) != FIELD_DECL)
7207 continue;
7209 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7210 if (sub_count < 0)
7211 return -1;
7212 count += sub_count;
7215 /* There must be no padding. */
7216 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7217 return -1;
7219 return count;
7222 case UNION_TYPE:
7223 case QUAL_UNION_TYPE:
7225 /* These aren't very interesting except in a degenerate case. */
7226 int count = 0;
7227 int sub_count;
7228 tree field;
7230 /* Can't handle incomplete types nor sizes that are not
7231 fixed. */
7232 if (!COMPLETE_TYPE_P (type)
7233 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7234 return -1;
7236 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7238 if (TREE_CODE (field) != FIELD_DECL)
7239 continue;
7241 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7242 if (sub_count < 0)
7243 return -1;
7244 count = count > sub_count ? count : sub_count;
7247 /* There must be no padding. */
7248 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7249 return -1;
7251 return count;
7254 default:
7255 break;
7258 return -1;
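/* Illustrative classifications (hedged examples of the walk above):

     struct { float x, y, z; }      -> 3, *modep == SFmode   (HFA)
     _Complex double                -> 2, *modep == DFmode
     struct { int32x4_t a, b; }     -> 2, *modep == V4SImode (HVA)
     struct { double d; float f; }  -> -1 (mixed base types)

   A return value of -1 means the type is not a homogeneous
   floating-point/short-vector candidate.  */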
7261 /* Return true if we use LRA instead of reload pass. */
7262 static bool
7263 aarch64_lra_p (void)
7265 return aarch64_lra_flag;
7268 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7269 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7270 array types. The C99 floating-point complex types are also considered
7271 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7272 types, which are GCC extensions and out of the scope of AAPCS64, are
7273 treated as composite types here as well.
7275 Note that MODE itself is not sufficient in determining whether a type
7276 is such a composite type or not. This is because
7277 stor-layout.c:compute_record_mode may have already changed the MODE
7278 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7279 structure with only one field may have its MODE set to the mode of the
7280 field. Also an integer mode whose size matches the size of the
7281 RECORD_TYPE type may be used to substitute the original mode
7282 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7283 solely relied on. */
7285 static bool
7286 aarch64_composite_type_p (const_tree type,
7287 enum machine_mode mode)
7289 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7290 return true;
7292 if (mode == BLKmode
7293 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7294 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7295 return true;
7297 return false;
7300 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7301 type as described in AAPCS64 \S 4.1.2.
7303 See the comment above aarch64_composite_type_p for the notes on MODE. */
7305 static bool
7306 aarch64_short_vector_p (const_tree type,
7307 enum machine_mode mode)
7309 HOST_WIDE_INT size = -1;
7311 if (type && TREE_CODE (type) == VECTOR_TYPE)
7312 size = int_size_in_bytes (type);
7313 else if (!aarch64_composite_type_p (type, mode)
7314 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7315 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7316 size = GET_MODE_SIZE (mode);
7318 return (size == 8 || size == 16) ? true : false;
7321 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7322 shall be passed or returned in simd/fp register(s) (providing these
7323 parameter passing registers are available).
7325 Upon successful return, *COUNT returns the number of needed registers,
7326 *BASE_MODE returns the mode of the individual register and when IS_HA
7327 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7328 floating-point aggregate or a homogeneous short-vector aggregate. */
7330 static bool
7331 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7332 const_tree type,
7333 enum machine_mode *base_mode,
7334 int *count,
7335 bool *is_ha)
7337 enum machine_mode new_mode = VOIDmode;
7338 bool composite_p = aarch64_composite_type_p (type, mode);
7340 if (is_ha != NULL) *is_ha = false;
7342 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7343 || aarch64_short_vector_p (type, mode))
7345 *count = 1;
7346 new_mode = mode;
7348 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7350 if (is_ha != NULL) *is_ha = true;
7351 *count = 2;
7352 new_mode = GET_MODE_INNER (mode);
7354 else if (type && composite_p)
7356 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7358 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7360 if (is_ha != NULL) *is_ha = true;
7361 *count = ag_count;
7363 else
7364 return false;
7366 else
7367 return false;
7369 *base_mode = new_mode;
7370 return true;
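/* For example (illustrative): a plain float argument yields *count == 1
   and *base_mode == SFmode; _Complex float yields *count == 2,
   *base_mode == SFmode and *is_ha set; a struct of four doubles yields
   *count == 4, *base_mode == DFmode and *is_ha set, so it is passed in
   d0-d3 when enough FP/SIMD argument registers remain.  */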
7373 /* Implement TARGET_STRUCT_VALUE_RTX. */
7375 static rtx
7376 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7377 int incoming ATTRIBUTE_UNUSED)
7379 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7382 /* Implements target hook vector_mode_supported_p. */
7383 static bool
7384 aarch64_vector_mode_supported_p (enum machine_mode mode)
7386 if (TARGET_SIMD
7387 && (mode == V4SImode || mode == V8HImode
7388 || mode == V16QImode || mode == V2DImode
7389 || mode == V2SImode || mode == V4HImode
7390 || mode == V8QImode || mode == V2SFmode
7391 || mode == V4SFmode || mode == V2DFmode
7392 || mode == V1DFmode))
7393 return true;
7395 return false;
7398 /* Return appropriate SIMD container
7399 for MODE within a vector of WIDTH bits. */
7400 static enum machine_mode
7401 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7403 gcc_assert (width == 64 || width == 128);
7404 if (TARGET_SIMD)
7406 if (width == 128)
7407 switch (mode)
7409 case DFmode:
7410 return V2DFmode;
7411 case SFmode:
7412 return V4SFmode;
7413 case SImode:
7414 return V4SImode;
7415 case HImode:
7416 return V8HImode;
7417 case QImode:
7418 return V16QImode;
7419 case DImode:
7420 return V2DImode;
7421 default:
7422 break;
7424 else
7425 switch (mode)
7427 case SFmode:
7428 return V2SFmode;
7429 case SImode:
7430 return V2SImode;
7431 case HImode:
7432 return V4HImode;
7433 case QImode:
7434 return V8QImode;
7435 default:
7436 break;
7439 return word_mode;
7442 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7443 static enum machine_mode
7444 aarch64_preferred_simd_mode (enum machine_mode mode)
7446 return aarch64_simd_container_mode (mode, 128);
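/* Illustrative mapping (derived from the table above): with TARGET_SIMD
   the preferred 128-bit containers are

     QImode -> V16QImode   HImode -> V8HImode   SImode -> V4SImode
     DImode -> V2DImode    SFmode -> V4SFmode   DFmode -> V2DFmode

   and the 64-bit containers tried when the vectorizer retries with
   width == 64 are V8QI, V4HI, V2SI and V2SF.  Anything else falls back
   to word_mode.  */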
7449 /* Return the bitmask of possible vector sizes for the vectorizer
7450 to iterate over. */
7451 static unsigned int
7452 aarch64_autovectorize_vector_sizes (void)
7454 return (16 | 8);
7457 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7458 vector types in order to conform to the AAPCS64 (see "Procedure
7459 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7460 qualify for emission with the mangled names defined in that document,
7461 a vector type must not only be of the correct mode but also be
7462 composed of AdvSIMD vector element types (e.g.
7463 __builtin_aarch64_simd_qi); these types are registered by
7464 aarch64_init_simd_builtins (). In other words, vector types defined
7465 in other ways e.g. via vector_size attribute will get default
7466 mangled names. */
7467 typedef struct
7469 enum machine_mode mode;
7470 const char *element_type_name;
7471 const char *mangled_name;
7472 } aarch64_simd_mangle_map_entry;
7474 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7475 /* 64-bit containerized types. */
7476 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7477 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7478 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7479 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7480 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7481 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7482 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7483 { DImode, "__builtin_aarch64_simd_di", "11__Int64x1_t" },
7484 { DImode, "__builtin_aarch64_simd_udi", "12__Uint64x1_t" },
7485 { V1DFmode, "__builtin_aarch64_simd_df", "13__Float64x1_t" },
7486 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7487 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7488 /* 128-bit containerized types. */
7489 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7490 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7491 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7492 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7493 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7494 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7495 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7496 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7497 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7498 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7499 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7500 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7501 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7502 { VOIDmode, NULL, NULL }
7505 /* Implement TARGET_MANGLE_TYPE. */
7507 static const char *
7508 aarch64_mangle_type (const_tree type)
7510 /* The AArch64 ABI documents say that "__va_list" has to be
7511 mangled as if it were in the "std" namespace. */
7512 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7513 return "St9__va_list";
7515 /* Check the mode of the vector type, and the name of the vector
7516 element type, against the table. */
7517 if (TREE_CODE (type) == VECTOR_TYPE)
7519 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7521 while (pos->mode != VOIDmode)
7523 tree elt_type = TREE_TYPE (type);
7525 if (pos->mode == TYPE_MODE (type)
7526 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7527 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7528 pos->element_type_name))
7529 return pos->mangled_name;
7531 pos++;
7535 /* Use the default mangling. */
7536 return NULL;
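/* Example (illustrative): for a C++ function

     void f (int8x8_t);

   where int8x8_t is the arm_neon.h type built from
   __builtin_aarch64_simd_qi in V8QImode, the table above makes the
   parameter mangle as "10__Int8x8_t", giving "_Z1f10__Int8x8_t".  A
   user-defined __attribute__((vector_size(8))) type keeps the default
   vector mangling instead.  */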
7539 /* Return the equivalent letter for size. */
7540 static char
7541 sizetochar (int size)
7543 switch (size)
7545 case 64: return 'd';
7546 case 32: return 's';
7547 case 16: return 'h';
7548 case 8 : return 'b';
7549 default: gcc_unreachable ();
7553 /* Return true iff X is a uniform vector of floating-point
7554 constants that can be represented in quarter-precision form.
7555 Note that since aarch64_float_const_representable_p rejects
7556 both +0.0 and -0.0, this function rejects them too. */
7557 static bool
7558 aarch64_vect_float_const_representable_p (rtx x)
7560 int i = 0;
7561 REAL_VALUE_TYPE r0, ri;
7562 rtx x0, xi;
7564 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7565 return false;
7567 x0 = CONST_VECTOR_ELT (x, 0);
7568 if (!CONST_DOUBLE_P (x0))
7569 return false;
7571 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7573 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7575 xi = CONST_VECTOR_ELT (x, i);
7576 if (!CONST_DOUBLE_P (xi))
7577 return false;
7579 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7580 if (!REAL_VALUES_EQUAL (r0, ri))
7581 return false;
7584 return aarch64_float_const_representable_p (x0);
7587 /* Return true if OP is a valid AdvSIMD immediate for MODE, filling in *INFO if non-null; false otherwise. */
7588 bool
7589 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7590 struct simd_immediate_info *info)
7592 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7593 matches = 1; \
7594 for (i = 0; i < idx; i += (STRIDE)) \
7595 if (!(TEST)) \
7596 matches = 0; \
7597 if (matches) \
7599 immtype = (CLASS); \
7600 elsize = (ELSIZE); \
7601 eshift = (SHIFT); \
7602 emvn = (NEG); \
7603 break; \
7606 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7607 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7608 unsigned char bytes[16];
7609 int immtype = -1, matches;
7610 unsigned int invmask = inverse ? 0xff : 0;
7611 int eshift, emvn;
7613 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7615 if (! (aarch64_simd_imm_zero_p (op, mode)
7616 || aarch64_vect_float_const_representable_p (op)))
7617 return false;
7619 if (info)
7621 info->value = CONST_VECTOR_ELT (op, 0);
7622 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7623 info->mvn = false;
7624 info->shift = 0;
7627 return true;
7630 /* Splat vector constant out into a byte vector. */
7631 for (i = 0; i < n_elts; i++)
7633 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7634 it must be laid out in the vector register in reverse order. */
7635 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7636 unsigned HOST_WIDE_INT elpart;
7637 unsigned int part, parts;
7639 if (CONST_INT_P (el))
7641 elpart = INTVAL (el);
7642 parts = 1;
7644 else if (GET_CODE (el) == CONST_DOUBLE)
7646 elpart = CONST_DOUBLE_LOW (el);
7647 parts = 2;
7649 else
7650 gcc_unreachable ();
7652 for (part = 0; part < parts; part++)
7654 unsigned int byte;
7655 for (byte = 0; byte < innersize; byte++)
7657 bytes[idx++] = (elpart & 0xff) ^ invmask;
7658 elpart >>= BITS_PER_UNIT;
7660 if (GET_CODE (el) == CONST_DOUBLE)
7661 elpart = CONST_DOUBLE_HIGH (el);
7665 /* Sanity check. */
7666 gcc_assert (idx == GET_MODE_SIZE (mode));
7670 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7671 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7673 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7674 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7676 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7677 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7679 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7680 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7682 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7684 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7686 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7687 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7689 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7690 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7692 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7693 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7695 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7696 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7698 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7700 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7702 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7703 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7705 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7706 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7708 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7709 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7711 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7712 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7714 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7716 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7717 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7719 while (0);
7721 if (immtype == -1)
7722 return false;
7724 if (info)
7726 info->element_width = elsize;
7727 info->mvn = emvn != 0;
7728 info->shift = eshift;
7730 unsigned HOST_WIDE_INT imm = 0;
7732 if (immtype >= 12 && immtype <= 15)
7733 info->msl = true;
7735 /* Un-invert bytes of recognized vector, if necessary. */
7736 if (invmask != 0)
7737 for (i = 0; i < idx; i++)
7738 bytes[i] ^= invmask;
7740 if (immtype == 17)
7742 /* FIXME: Broken on 32-bit H_W_I hosts. */
7743 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7745 for (i = 0; i < 8; i++)
7746 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7747 << (i * BITS_PER_UNIT);
7750 info->value = GEN_INT (imm);
7752 else
7754 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7755 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7757 /* Construct 'abcdefgh' because the assembler cannot handle
7758 generic constants. */
7759 if (info->mvn)
7760 imm = ~imm;
7761 imm = (imm >> info->shift) & 0xff;
7762 info->value = GEN_INT (imm);
7766 return true;
7767 #undef CHECK
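/* Worked example (illustrative): for the V4SImode constant with every
   element equal to 0x00ab0000, the byte splat above is the pattern
   00 00 ab 00 repeated, which matches the "CHECK (4, 32, 2, ...)" form:
   immtype 2, element width 32, shift 16.  INFO then describes the value
   0xab shifted left by 16, i.e. something the assembler can emit
   roughly as "movi v0.4s, #0xab, lsl #16".  */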
7770 /* Check if immediate shift constants are within range. */
7771 bool
7772 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7774 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7775 if (left)
7776 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
7777 else
7778 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
7781 /* Return true if X is a uniform vector where all elements
7782 are either the floating-point constant 0.0 or the
7783 integer constant 0. */
7784 bool
7785 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7787 return x == CONST0_RTX (mode);
7790 bool
7791 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7793 HOST_WIDE_INT imm = INTVAL (x);
7794 int i;
7796 for (i = 0; i < 8; i++)
7798 unsigned int byte = imm & 0xff;
7799 if (byte != 0xff && byte != 0)
7800 return false;
7801 imm >>= 8;
7804 return true;
7807 bool
7808 aarch64_mov_operand_p (rtx x,
7809 enum aarch64_symbol_context context,
7810 enum machine_mode mode)
7812 if (GET_CODE (x) == HIGH
7813 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7814 return true;
7816 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7817 return true;
7819 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7820 return true;
7822 return aarch64_classify_symbolic_expression (x, context)
7823 == SYMBOL_TINY_ABSOLUTE;
7826 /* Return a const_int vector of VAL. */
7827 rtx
7828 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7830 int nunits = GET_MODE_NUNITS (mode);
7831 rtvec v = rtvec_alloc (nunits);
7832 int i;
7834 for (i=0; i < nunits; i++)
7835 RTVEC_ELT (v, i) = GEN_INT (val);
7837 return gen_rtx_CONST_VECTOR (mode, v);
7840 /* Check OP is a legal scalar immediate for the MOVI instruction. */
7842 bool
7843 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7845 enum machine_mode vmode;
7847 gcc_assert (!VECTOR_MODE_P (mode));
7848 vmode = aarch64_preferred_simd_mode (mode);
7849 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7850 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7853 /* Construct and return a PARALLEL RTX vector with elements numbering the
7854 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
7855 the vector - from the perspective of the architecture. This does not
7856 line up with GCC's perspective on lane numbers, so we end up with
7857 different masks depending on our target endian-ness. The diagram
7858 below may help. We must draw the distinction when building masks
7859 which select one half of the vector. An instruction selecting
7860 architectural low-lanes for a big-endian target, must be described using
7861 a mask selecting GCC high-lanes.
7863 Big-Endian Little-Endian
7865 GCC 0 1 2 3 3 2 1 0
7866 | x | x | x | x | | x | x | x | x |
7867 Architecture 3 2 1 0 3 2 1 0
7869 Low Mask: { 2, 3 } { 0, 1 }
7870 High Mask: { 0, 1 } { 2, 3 }
7874 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7876 int nunits = GET_MODE_NUNITS (mode);
7877 rtvec v = rtvec_alloc (nunits / 2);
7878 int high_base = nunits / 2;
7879 int low_base = 0;
7880 int base;
7881 rtx t1;
7882 int i;
7884 if (BYTES_BIG_ENDIAN)
7885 base = high ? low_base : high_base;
7886 else
7887 base = high ? high_base : low_base;
7889 for (i = 0; i < nunits / 2; i++)
7890 RTVEC_ELT (v, i) = GEN_INT (base + i);
7892 t1 = gen_rtx_PARALLEL (mode, v);
7893 return t1;
7896 /* Check OP for validity as a PARALLEL RTX vector with elements
7897 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
7898 from the perspective of the architecture. See the diagram above
7899 aarch64_simd_vect_par_cnst_half for more details. */
7901 bool
7902 aarch64_simd_check_vect_par_cnst_half (rtx op, enum machine_mode mode,
7903 bool high)
7905 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
7906 HOST_WIDE_INT count_op = XVECLEN (op, 0);
7907 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
7908 int i = 0;
7910 if (!VECTOR_MODE_P (mode))
7911 return false;
7913 if (count_op != count_ideal)
7914 return false;
7916 for (i = 0; i < count_ideal; i++)
7918 rtx elt_op = XVECEXP (op, 0, i);
7919 rtx elt_ideal = XVECEXP (ideal, 0, i);
7921 if (!CONST_INT_P (elt_op)
7922 || INTVAL (elt_ideal) != INTVAL (elt_op))
7923 return false;
7925 return true;
7928 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
7929 HIGH (exclusive). */
7930 void
7931 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7933 HOST_WIDE_INT lane;
7934 gcc_assert (CONST_INT_P (operand));
7935 lane = INTVAL (operand);
7937 if (lane < low || lane >= high)
7938 error ("lane out of range");
7941 void
7942 aarch64_simd_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7944 gcc_assert (CONST_INT_P (operand));
7945 HOST_WIDE_INT lane = INTVAL (operand);
7947 if (lane < low || lane >= high)
7948 error ("constant out of range");
7951 /* Emit code to place an AdvSIMD pair result in memory locations (with equal
7952 registers). */
7953 void
7954 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
7955 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
7956 rtx op1)
7958 rtx mem = gen_rtx_MEM (mode, destaddr);
7959 rtx tmp1 = gen_reg_rtx (mode);
7960 rtx tmp2 = gen_reg_rtx (mode);
7962 emit_insn (intfn (tmp1, op1, tmp2));
7964 emit_move_insn (mem, tmp1);
7965 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
7966 emit_move_insn (mem, tmp2);
7969 /* Return TRUE if OP is a valid vector addressing mode. */
7970 bool
7971 aarch64_simd_mem_operand_p (rtx op)
7973 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
7974 || REG_P (XEXP (op, 0)));
7977 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
7978 not to early-clobber SRC registers in the process.
7980 We assume that the operands described by SRC and DEST represent a
7981 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
7982 number of components into which the copy has been decomposed. */
7983 void
7984 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
7985 rtx *src, unsigned int count)
7987 unsigned int i;
7989 if (!reg_overlap_mentioned_p (operands[0], operands[1])
7990 || REGNO (operands[0]) < REGNO (operands[1]))
7992 for (i = 0; i < count; i++)
7994 operands[2 * i] = dest[i];
7995 operands[2 * i + 1] = src[i];
7998 else
8000 for (i = 0; i < count; i++)
8002 operands[2 * i] = dest[count - i - 1];
8003 operands[2 * i + 1] = src[count - i - 1];
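/* Example (a sketch): splitting a copy of an OImode value held in
   { q1, q2 } into { q2, q3 } must emit the moves as q3 <- q2 and then
   q2 <- q1, so that q2 is read before it is overwritten; copying in the
   other register direction starts from the low half instead.  That is
   exactly the ordering the two loops above arrange.  */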
8008 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8009 one of the VSTRUCT modes: OI, CI or XI. */
8010 int
8011 aarch64_simd_attr_length_move (rtx_insn *insn)
8013 enum machine_mode mode;
8015 extract_insn_cached (insn);
8017 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8019 mode = GET_MODE (recog_data.operand[0]);
8020 switch (mode)
8022 case OImode:
8023 return 8;
8024 case CImode:
8025 return 12;
8026 case XImode:
8027 return 16;
8028 default:
8029 gcc_unreachable ();
8032 return 4;
8035 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8036 alignment of a vector to 128 bits. */
8037 static HOST_WIDE_INT
8038 aarch64_simd_vector_alignment (const_tree type)
8040 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8041 return MIN (align, 128);
8044 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8045 static bool
8046 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8048 if (is_packed)
8049 return false;
8051 /* We guarantee alignment for vectors up to 128-bits. */
8052 if (tree_int_cst_compare (TYPE_SIZE (type),
8053 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8054 return false;
8056 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8057 return true;
8060 /* If VALS is a vector constant that can be loaded into a register
8061 using DUP, generate instructions to do so and return an RTX to
8062 assign to the register. Otherwise return NULL_RTX. */
8063 static rtx
8064 aarch64_simd_dup_constant (rtx vals)
8066 enum machine_mode mode = GET_MODE (vals);
8067 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8068 int n_elts = GET_MODE_NUNITS (mode);
8069 bool all_same = true;
8070 rtx x;
8071 int i;
8073 if (GET_CODE (vals) != CONST_VECTOR)
8074 return NULL_RTX;
8076 for (i = 1; i < n_elts; ++i)
8078 x = CONST_VECTOR_ELT (vals, i);
8079 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8080 all_same = false;
8083 if (!all_same)
8084 return NULL_RTX;
8086 /* We can load this constant by using DUP and a constant in a
8087 single ARM register. This will be cheaper than a vector
8088 load. */
8089 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8090 return gen_rtx_VEC_DUPLICATE (mode, x);
8094 /* Generate code to load VALS, which is a PARALLEL containing only
8095 constants (for vec_init) or CONST_VECTOR, efficiently into a
8096 register. Returns an RTX to copy into the register, or NULL_RTX
8097 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8098 static rtx
8099 aarch64_simd_make_constant (rtx vals)
8101 enum machine_mode mode = GET_MODE (vals);
8102 rtx const_dup;
8103 rtx const_vec = NULL_RTX;
8104 int n_elts = GET_MODE_NUNITS (mode);
8105 int n_const = 0;
8106 int i;
8108 if (GET_CODE (vals) == CONST_VECTOR)
8109 const_vec = vals;
8110 else if (GET_CODE (vals) == PARALLEL)
8112 /* A CONST_VECTOR must contain only CONST_INTs and
8113 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8114 Only store valid constants in a CONST_VECTOR. */
8115 for (i = 0; i < n_elts; ++i)
8117 rtx x = XVECEXP (vals, 0, i);
8118 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8119 n_const++;
8121 if (n_const == n_elts)
8122 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8124 else
8125 gcc_unreachable ();
8127 if (const_vec != NULL_RTX
8128 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8129 /* Load using MOVI/MVNI. */
8130 return const_vec;
8131 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8132 /* Loaded using DUP. */
8133 return const_dup;
8134 else if (const_vec != NULL_RTX)
8135 /* Load from constant pool. We can not take advantage of single-cycle
8136 LD1 because we need a PC-relative addressing mode. */
8137 return const_vec;
8138 else
8139 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8140 We can not construct an initializer. */
8141 return NULL_RTX;
8144 void
8145 aarch64_expand_vector_init (rtx target, rtx vals)
8147 enum machine_mode mode = GET_MODE (target);
8148 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8149 int n_elts = GET_MODE_NUNITS (mode);
8150 int n_var = 0, one_var = -1;
8151 bool all_same = true;
8152 rtx x, mem;
8153 int i;
8155 x = XVECEXP (vals, 0, 0);
8156 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8157 n_var = 1, one_var = 0;
8159 for (i = 1; i < n_elts; ++i)
8161 x = XVECEXP (vals, 0, i);
8162 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8163 ++n_var, one_var = i;
8165 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8166 all_same = false;
8169 if (n_var == 0)
8171 rtx constant = aarch64_simd_make_constant (vals);
8172 if (constant != NULL_RTX)
8174 emit_move_insn (target, constant);
8175 return;
8179 /* Splat a single non-constant element if we can. */
8180 if (all_same)
8182 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8183 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8184 return;
8187 /* One field is non-constant. Load constant then overwrite varying
8188 field. This is more efficient than using the stack. */
8189 if (n_var == 1)
8191 rtx copy = copy_rtx (vals);
8192 rtx index = GEN_INT (one_var);
8193 enum insn_code icode;
8195 /* Load constant part of vector, substitute neighboring value for
8196 varying element. */
8197 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8198 aarch64_expand_vector_init (target, copy);
8200 /* Insert variable. */
8201 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8202 icode = optab_handler (vec_set_optab, mode);
8203 gcc_assert (icode != CODE_FOR_nothing);
8204 emit_insn (GEN_FCN (icode) (target, x, index));
8205 return;
8208 /* Construct the vector in memory one field at a time
8209 and load the whole vector. */
8210 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8211 for (i = 0; i < n_elts; i++)
8212 emit_move_insn (adjust_address_nv (mem, inner_mode,
8213 i * GET_MODE_SIZE (inner_mode)),
8214 XVECEXP (vals, 0, i));
8215 emit_move_insn (target, mem);
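/* Illustrative examples of the strategies chosen above:

     { 1, 1, 1, 1 }   -> single MOVI (via aarch64_simd_make_constant)
     { x, x, x, x }   -> DUP from a scalar register
     { 1, 2, 3, x }   -> load the constant { 1, 2, 3, 3 } (the varying
                         lane seeded from its neighbour), then insert x
                         into lane 3 with the vec_set pattern
     { x, y, z, w }   -> store the elements to a stack temporary and
                         load the whole vector back.  */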
8219 static unsigned HOST_WIDE_INT
8220 aarch64_shift_truncation_mask (enum machine_mode mode)
8222 return
8223 (aarch64_vector_mode_supported_p (mode)
8224 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8227 #ifndef TLS_SECTION_ASM_FLAG
8228 #define TLS_SECTION_ASM_FLAG 'T'
8229 #endif
8231 void
8232 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8233 tree decl ATTRIBUTE_UNUSED)
8235 char flagchars[10], *f = flagchars;
8237 /* If we have already declared this section, we can use an
8238 abbreviated form to switch back to it -- unless this section is
8239 part of a COMDAT group, in which case GAS requires the full
8240 declaration every time. */
8241 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8242 && (flags & SECTION_DECLARED))
8244 fprintf (asm_out_file, "\t.section\t%s\n", name);
8245 return;
8248 if (!(flags & SECTION_DEBUG))
8249 *f++ = 'a';
8250 if (flags & SECTION_WRITE)
8251 *f++ = 'w';
8252 if (flags & SECTION_CODE)
8253 *f++ = 'x';
8254 if (flags & SECTION_SMALL)
8255 *f++ = 's';
8256 if (flags & SECTION_MERGE)
8257 *f++ = 'M';
8258 if (flags & SECTION_STRINGS)
8259 *f++ = 'S';
8260 if (flags & SECTION_TLS)
8261 *f++ = TLS_SECTION_ASM_FLAG;
8262 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8263 *f++ = 'G';
8264 *f = '\0';
8266 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8268 if (!(flags & SECTION_NOTYPE))
8270 const char *type;
8271 const char *format;
8273 if (flags & SECTION_BSS)
8274 type = "nobits";
8275 else
8276 type = "progbits";
8278 #ifdef TYPE_OPERAND_FMT
8279 format = "," TYPE_OPERAND_FMT;
8280 #else
8281 format = ",@%s";
8282 #endif
8284 fprintf (asm_out_file, format, type);
8286 if (flags & SECTION_ENTSIZE)
8287 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8288 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8290 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8291 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8292 else
8293 fprintf (asm_out_file, ",%s,comdat",
8294 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8298 putc ('\n', asm_out_file);
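/* Example output (illustrative): a writable TLS data section is emitted
   roughly as

     .section  .tdata,"awT",@progbits

   while a COMDAT text section additionally carries the group, e.g.

     .section  .text._Z3foov,"axG",@progbits,_Z3foov,comdat  */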
8301 /* Select a format to encode pointers in exception handling data. */
8302 int
8303 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8305 int type;
8306 switch (aarch64_cmodel)
8308 case AARCH64_CMODEL_TINY:
8309 case AARCH64_CMODEL_TINY_PIC:
8310 case AARCH64_CMODEL_SMALL:
8311 case AARCH64_CMODEL_SMALL_PIC:
8312 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8313 for everything. */
8314 type = DW_EH_PE_sdata4;
8315 break;
8316 default:
8317 /* No assumptions here. 8-byte relocs required. */
8318 type = DW_EH_PE_sdata8;
8319 break;
8321 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8324 /* Emit load exclusive. */
8326 static void
8327 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8328 rtx mem, rtx model_rtx)
8330 rtx (*gen) (rtx, rtx, rtx);
8332 switch (mode)
8334 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8335 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8336 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8337 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8338 default:
8339 gcc_unreachable ();
8342 emit_insn (gen (rval, mem, model_rtx));
8345 /* Emit store exclusive. */
8347 static void
8348 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8349 rtx rval, rtx mem, rtx model_rtx)
8351 rtx (*gen) (rtx, rtx, rtx, rtx);
8353 switch (mode)
8355 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8356 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8357 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8358 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8359 default:
8360 gcc_unreachable ();
8363 emit_insn (gen (bval, rval, mem, model_rtx));
8366 /* Emit INSN as a jump instruction and mark it as unlikely to be taken. */
8368 static void
8369 aarch64_emit_unlikely_jump (rtx insn)
8371 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8373 insn = emit_jump_insn (insn);
8374 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8377 /* Expand a compare and swap pattern. */
8379 void
8380 aarch64_expand_compare_and_swap (rtx operands[])
8382 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8383 enum machine_mode mode, cmp_mode;
8384 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8386 bval = operands[0];
8387 rval = operands[1];
8388 mem = operands[2];
8389 oldval = operands[3];
8390 newval = operands[4];
8391 is_weak = operands[5];
8392 mod_s = operands[6];
8393 mod_f = operands[7];
8394 mode = GET_MODE (mem);
8395 cmp_mode = mode;
8397 /* Normally the succ memory model must be stronger than fail, but in the
8398 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8399 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8401 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8402 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8403 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8405 switch (mode)
8407 case QImode:
8408 case HImode:
8409 /* For short modes, we're going to perform the comparison in SImode,
8410 so do the zero-extension now. */
8411 cmp_mode = SImode;
8412 rval = gen_reg_rtx (SImode);
8413 oldval = convert_modes (SImode, mode, oldval, true);
8414 /* Fall through. */
8416 case SImode:
8417 case DImode:
8418 /* Force the value into a register if needed. */
8419 if (!aarch64_plus_operand (oldval, mode))
8420 oldval = force_reg (cmp_mode, oldval);
8421 break;
8423 default:
8424 gcc_unreachable ();
8427 switch (mode)
8429 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8430 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8431 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8432 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8433 default:
8434 gcc_unreachable ();
8437 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8439 if (mode == QImode || mode == HImode)
8440 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8442 x = gen_rtx_REG (CCmode, CC_REGNUM);
8443 x = gen_rtx_EQ (SImode, x, const0_rtx);
8444 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
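
/* For reference, a minimal standalone sketch (not part of this file; the
   helper name is illustrative only) of the kind of source-level operation
   the expander above services: a strong compare-and-swap with explicit
   success and failure memory orders.  The two memory-model operands arrive
   as operands[6] and operands[7] above and, as noted, a RELEASE/ACQUIRE
   pairing is promoted to ACQ_REL.  */

#include <stdbool.h>

static bool
cas_acq_rel (int *location, int *expected, int desired)
{
  /* Strong CAS: success order ACQ_REL, failure order ACQUIRE.  */
  return __atomic_compare_exchange_n (location, expected, desired,
                                      false /* weak */,
                                      __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
}
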
8447 /* Split a compare and swap pattern. */
8449 void
8450 aarch64_split_compare_and_swap (rtx operands[])
8452 rtx rval, mem, oldval, newval, scratch;
8453 enum machine_mode mode;
8454 bool is_weak;
8455 rtx_code_label *label1, *label2;
8456 rtx x, cond;
8458 rval = operands[0];
8459 mem = operands[1];
8460 oldval = operands[2];
8461 newval = operands[3];
8462 is_weak = (operands[4] != const0_rtx);
8463 scratch = operands[7];
8464 mode = GET_MODE (mem);
8466 label1 = NULL;
8467 if (!is_weak)
8469 label1 = gen_label_rtx ();
8470 emit_label (label1);
8472 label2 = gen_label_rtx ();
8474 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8476 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8477 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8478 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8479 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8480 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8482 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8484 if (!is_weak)
8486 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8487 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8488 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8489 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8491 else
8493 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8494 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8495 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8498 emit_label (label2);
8501 /* Split an atomic operation. */
8503 void
8504 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8505 rtx value, rtx model_rtx, rtx cond)
8507 enum machine_mode mode = GET_MODE (mem);
8508 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8509 rtx_code_label *label;
8510 rtx x;
8512 label = gen_label_rtx ();
8513 emit_label (label);
8515 if (new_out)
8516 new_out = gen_lowpart (wmode, new_out);
8517 if (old_out)
8518 old_out = gen_lowpart (wmode, old_out);
8519 else
8520 old_out = new_out;
8521 value = simplify_gen_subreg (wmode, value, mode, 0);
8523 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8525 switch (code)
8527 case SET:
8528 new_out = value;
8529 break;
8531 case NOT:
8532 x = gen_rtx_AND (wmode, old_out, value);
8533 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8534 x = gen_rtx_NOT (wmode, new_out);
8535 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8536 break;
8538 case MINUS:
8539 if (CONST_INT_P (value))
8541 value = GEN_INT (-INTVAL (value));
8542 code = PLUS;
8544 /* Fall through. */
8546 default:
8547 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8548 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8549 break;
8552 aarch64_emit_store_exclusive (mode, cond, mem,
8553 gen_lowpart (mode, new_out), model_rtx);
8555 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8556 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8557 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8558 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
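
/* Similarly, a standalone sketch (not part of this file; the helper name is
   illustrative only) of a source-level operation that is lowered into the
   load-exclusive / operate / store-exclusive retry loop built by
   aarch64_split_atomic_op above: an atomic fetch-and-add.  */

static int
fetch_add_seq_cst (int *counter, int amount)
{
  /* Returns the value of *COUNTER before the addition.  */
  return __atomic_fetch_add (counter, amount, __ATOMIC_SEQ_CST);
}
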
8561 static void
8562 aarch64_print_extension (void)
8564 const struct aarch64_option_extension *opt = NULL;
8566 for (opt = all_extensions; opt->name != NULL; opt++)
8567 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8568 asm_fprintf (asm_out_file, "+%s", opt->name);
8570 asm_fprintf (asm_out_file, "\n");
8573 static void
8574 aarch64_start_file (void)
8576 if (selected_arch)
8578 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8579 aarch64_print_extension ();
8581 else if (selected_cpu)
8583 const char *truncated_name
8584 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8585 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8586 aarch64_print_extension ();
8588 default_file_start();
8591 /* Target hook for c_mode_for_suffix. */
8592 static enum machine_mode
8593 aarch64_c_mode_for_suffix (char suffix)
8595 if (suffix == 'q')
8596 return TFmode;
8598 return VOIDmode;
8601 /* We can only represent floating point constants which will fit in
8602 "quarter-precision" values. These values are characterised by
8603 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
8606 (-1)^s * (n/16) * 2^r
8608 Where:
8609 's' is the sign bit.
8610 'n' is an integer in the range 16 <= n <= 31.
8611 'r' is an integer in the range -3 <= r <= 4. */
8613 /* Return true iff X can be represented by a quarter-precision
8614 floating point immediate operand.  Note, we cannot represent 0.0. */
8615 bool
8616 aarch64_float_const_representable_p (rtx x)
8618 /* This represents our current view of how many bits
8619 make up the mantissa. */
8620 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8621 int exponent;
8622 unsigned HOST_WIDE_INT mantissa, mask;
8623 REAL_VALUE_TYPE r, m;
8624 bool fail;
8626 if (!CONST_DOUBLE_P (x))
8627 return false;
8629 if (GET_MODE (x) == VOIDmode)
8630 return false;
8632 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8634 /* We cannot represent infinities, NaNs or +/-zero. We won't
8635 know if we have +zero until we analyse the mantissa, but we
8636 can reject the other invalid values. */
8637 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8638 || REAL_VALUE_MINUS_ZERO (r))
8639 return false;
8641 /* Extract exponent. */
8642 r = real_value_abs (&r);
8643 exponent = REAL_EXP (&r);
8645 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8646 highest (sign) bit, with a fixed binary point at bit point_pos.
8647 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
8648 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8649 bits for the mantissa, this can fail (low bits will be lost). */
8650 real_ldexp (&m, &r, point_pos - exponent);
8651 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8653 /* If the low part of the mantissa has bits set we cannot represent
8654 the value. */
8655 if (w.elt (0) != 0)
8656 return false;
8657 /* We have rejected the lower HOST_WIDE_INT, so update our
8658 understanding of how many bits lie in the mantissa and
8659 look only at the high HOST_WIDE_INT. */
8660 mantissa = w.elt (1);
8661 point_pos -= HOST_BITS_PER_WIDE_INT;
8663 /* We can only represent values with a mantissa of the form 1.xxxx. */
8664 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8665 if ((mantissa & mask) != 0)
8666 return false;
8668 /* Having filtered unrepresentable values, we may now remove all
8669 but the highest 5 bits. */
8670 mantissa >>= point_pos - 5;
8672 /* We cannot represent the value 0.0, so reject it. This is handled
8673 elsewhere. */
8674 if (mantissa == 0)
8675 return false;
8677 /* Then, as bit 4 is always set, we can mask it off, leaving
8678 the mantissa in the range [0, 15]. */
8679 mantissa &= ~(1 << 4);
8680 gcc_assert (mantissa <= 15);
8682 /* GCC internally does not use an IEEE754-like encoding, where normalized
8683 significands are in the range [1, 2); GCC uses [0.5, 1) (see real.c).
8684 Our mantissa values are shifted 4 places to the left relative to
8685 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8686 by 5 places to correct for GCC's representation. */
8687 exponent = 5 - exponent;
8689 return (exponent >= 0 && exponent <= 7);
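
/* A standalone, brute-force sketch (not part of this file; the function name
   is illustrative only) of the same acceptance test: enumerate every value
   of the form (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and -3 <= r <= 4 and
   compare.  0.0 never matches the form, consistent with the predicate above
   rejecting it.  */

#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_representable (double x)
{
  for (int n = 16; n <= 31; n++)
    for (int r = -3; r <= 4; r++)
      {
        double candidate = ldexp ((double) n / 16.0, r);
        if (x == candidate || x == -candidate)
          return true;
      }
  return false;
}

/* For example, 1.0 (16/16 * 2^0) and 31.0 (31/16 * 2^4) are accepted, while
   0.1 is not exactly representable in binary and is rejected.  */
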
8692 char*
8693 aarch64_output_simd_mov_immediate (rtx const_vector,
8694 enum machine_mode mode,
8695 unsigned width)
8697 bool is_valid;
8698 static char templ[40];
8699 const char *mnemonic;
8700 const char *shift_op;
8701 unsigned int lane_count = 0;
8702 char element_char;
8704 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8706 /* This will return true to show const_vector is legal for use as an
8707 AdvSIMD MOVI immediate (or, implicitly, an MVNI immediate).  It will
8708 also update INFO to show how the immediate should be generated. */
8709 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8710 gcc_assert (is_valid);
8712 element_char = sizetochar (info.element_width);
8713 lane_count = width / info.element_width;
8715 mode = GET_MODE_INNER (mode);
8716 if (mode == SFmode || mode == DFmode)
8718 gcc_assert (info.shift == 0 && ! info.mvn);
8719 if (aarch64_float_const_zero_rtx_p (info.value))
8720 info.value = GEN_INT (0);
8721 else
8723 #define buf_size 20
8724 REAL_VALUE_TYPE r;
8725 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8726 char float_buf[buf_size] = {'\0'};
8727 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8728 #undef buf_size
8730 if (lane_count == 1)
8731 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8732 else
8733 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8734 lane_count, element_char, float_buf);
8735 return templ;
8739 mnemonic = info.mvn ? "mvni" : "movi";
8740 shift_op = info.msl ? "msl" : "lsl";
8742 if (lane_count == 1)
8743 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8744 mnemonic, UINTVAL (info.value));
8745 else if (info.shift)
8746 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8747 ", %s %d", mnemonic, lane_count, element_char,
8748 UINTVAL (info.value), shift_op, info.shift);
8749 else
8750 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8751 mnemonic, lane_count, element_char, UINTVAL (info.value));
8752 return templ;
8755 char*
8756 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8757 enum machine_mode mode)
8759 enum machine_mode vmode;
8761 gcc_assert (!VECTOR_MODE_P (mode));
8762 vmode = aarch64_simd_container_mode (mode, 64);
8763 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8764 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8767 /* Split operands into moves from op[1] + op[2] into op[0]. */
8769 void
8770 aarch64_split_combinev16qi (rtx operands[3])
8772 unsigned int dest = REGNO (operands[0]);
8773 unsigned int src1 = REGNO (operands[1]);
8774 unsigned int src2 = REGNO (operands[2]);
8775 enum machine_mode halfmode = GET_MODE (operands[1]);
8776 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8777 rtx destlo, desthi;
8779 gcc_assert (halfmode == V16QImode);
8781 if (src1 == dest && src2 == dest + halfregs)
8783 /* No-op move. Can't split to nothing; emit something. */
8784 emit_note (NOTE_INSN_DELETED);
8785 return;
8788 /* Preserve register attributes for variable tracking. */
8789 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8790 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8791 GET_MODE_SIZE (halfmode));
8793 /* Special case of reversed high/low parts. */
8794 if (reg_overlap_mentioned_p (operands[2], destlo)
8795 && reg_overlap_mentioned_p (operands[1], desthi))
8797 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8798 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8799 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8801 else if (!reg_overlap_mentioned_p (operands[2], destlo))
8803 /* Try to avoid unnecessary moves if part of the result
8804 is in the right place already. */
8805 if (src1 != dest)
8806 emit_move_insn (destlo, operands[1]);
8807 if (src2 != dest + halfregs)
8808 emit_move_insn (desthi, operands[2]);
8810 else
8812 if (src2 != dest + halfregs)
8813 emit_move_insn (desthi, operands[2]);
8814 if (src1 != dest)
8815 emit_move_insn (destlo, operands[1]);
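
/* A standalone sketch (not part of this file; the helper name is
   illustrative only) of the three-XOR exchange used above when the two
   halves of the destination are exactly swapped: it exchanges two values
   without needing a scratch register.  The code above applies the same
   identity with vector XORs; note the trick only works for two distinct
   locations (if A and B alias, the value is zeroed).  */

static void
xor_swap (unsigned int *a, unsigned int *b)
{
  *a ^= *b;
  *b ^= *a;   /* *B now holds the original *A.  */
  *a ^= *b;   /* *A now holds the original *B.  */
}
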
8819 /* vec_perm support. */
8821 #define MAX_VECT_LEN 16
8823 struct expand_vec_perm_d
8825 rtx target, op0, op1;
8826 unsigned char perm[MAX_VECT_LEN];
8827 enum machine_mode vmode;
8828 unsigned char nelt;
8829 bool one_vector_p;
8830 bool testing_p;
8833 /* Generate a variable permutation. */
8835 static void
8836 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8838 enum machine_mode vmode = GET_MODE (target);
8839 bool one_vector_p = rtx_equal_p (op0, op1);
8841 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8842 gcc_checking_assert (GET_MODE (op0) == vmode);
8843 gcc_checking_assert (GET_MODE (op1) == vmode);
8844 gcc_checking_assert (GET_MODE (sel) == vmode);
8845 gcc_checking_assert (TARGET_SIMD);
8847 if (one_vector_p)
8849 if (vmode == V8QImode)
8851 /* Expand the argument to a V16QI mode by duplicating it. */
8852 rtx pair = gen_reg_rtx (V16QImode);
8853 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8854 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8856 else
8858 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8861 else
8863 rtx pair;
8865 if (vmode == V8QImode)
8867 pair = gen_reg_rtx (V16QImode);
8868 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8869 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8871 else
8873 pair = gen_reg_rtx (OImode);
8874 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8875 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8880 void
8881 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8883 enum machine_mode vmode = GET_MODE (target);
8884 unsigned int nelt = GET_MODE_NUNITS (vmode);
8885 bool one_vector_p = rtx_equal_p (op0, op1);
8886 rtx mask;
8888 /* The TBL instruction does not use a modulo index, so we must take care
8889 of that ourselves. */
8890 mask = aarch64_simd_gen_const_vector_dup (vmode,
8891 one_vector_p ? nelt - 1 : 2 * nelt - 1);
8892 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8894 /* For big-endian, we also need to reverse the index within the vector
8895 (but not which vector). */
8896 if (BYTES_BIG_ENDIAN)
8898 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
8899 if (!one_vector_p)
8900 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
8901 sel = expand_simple_binop (vmode, XOR, sel, mask,
8902 NULL, 0, OPTAB_LIB_WIDEN);
8904 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
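
/* A standalone sketch (not part of this file; the helper name is
   illustrative only) of the index fix-ups performed above: selector indices
   are reduced modulo the number of selectable elements, because TBL does not
   wrap out-of-range indices itself, and on big-endian each index is
   additionally reversed within its vector (the choice of vector is
   preserved).  */

static unsigned int
fixup_perm_index (unsigned int idx, unsigned int nelt,
                  int one_vector_p, int big_endian_p)
{
  unsigned int mask = one_vector_p ? nelt - 1 : 2 * nelt - 1;

  idx &= mask;          /* The AND against the mask vector.  */
  if (big_endian_p)
    idx ^= nelt - 1;    /* The XOR against an (nelt - 1) vector.  */
  return idx;
}
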
8907 /* Recognize patterns suitable for the TRN instructions. */
8908 static bool
8909 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8911 unsigned int i, odd, mask, nelt = d->nelt;
8912 rtx out, in0, in1, x;
8913 rtx (*gen) (rtx, rtx, rtx);
8914 enum machine_mode vmode = d->vmode;
8916 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8917 return false;
8919 /* Note that these are little-endian tests.
8920 We correct for big-endian later. */
8921 if (d->perm[0] == 0)
8922 odd = 0;
8923 else if (d->perm[0] == 1)
8924 odd = 1;
8925 else
8926 return false;
8927 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8929 for (i = 0; i < nelt; i += 2)
8931 if (d->perm[i] != i + odd)
8932 return false;
8933 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
8934 return false;
8937 /* Success! */
8938 if (d->testing_p)
8939 return true;
8941 in0 = d->op0;
8942 in1 = d->op1;
8943 if (BYTES_BIG_ENDIAN)
8945 x = in0, in0 = in1, in1 = x;
8946 odd = !odd;
8948 out = d->target;
8950 if (odd)
8952 switch (vmode)
8954 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
8955 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
8956 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
8957 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
8958 case V4SImode: gen = gen_aarch64_trn2v4si; break;
8959 case V2SImode: gen = gen_aarch64_trn2v2si; break;
8960 case V2DImode: gen = gen_aarch64_trn2v2di; break;
8961 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
8962 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
8963 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
8964 default:
8965 return false;
8968 else
8970 switch (vmode)
8972 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
8973 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
8974 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
8975 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
8976 case V4SImode: gen = gen_aarch64_trn1v4si; break;
8977 case V2SImode: gen = gen_aarch64_trn1v2si; break;
8978 case V2DImode: gen = gen_aarch64_trn1v2di; break;
8979 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
8980 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
8981 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
8982 default:
8983 return false;
8987 emit_insn (gen (out, in0, in1));
8988 return true;
8991 /* Recognize patterns suitable for the UZP instructions. */
8992 static bool
8993 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
8995 unsigned int i, odd, mask, nelt = d->nelt;
8996 rtx out, in0, in1, x;
8997 rtx (*gen) (rtx, rtx, rtx);
8998 enum machine_mode vmode = d->vmode;
9000 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9001 return false;
9003 /* Note that these are little-endian tests.
9004 We correct for big-endian later. */
9005 if (d->perm[0] == 0)
9006 odd = 0;
9007 else if (d->perm[0] == 1)
9008 odd = 1;
9009 else
9010 return false;
9011 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9013 for (i = 0; i < nelt; i++)
9015 unsigned elt = (i * 2 + odd) & mask;
9016 if (d->perm[i] != elt)
9017 return false;
9020 /* Success! */
9021 if (d->testing_p)
9022 return true;
9024 in0 = d->op0;
9025 in1 = d->op1;
9026 if (BYTES_BIG_ENDIAN)
9028 x = in0, in0 = in1, in1 = x;
9029 odd = !odd;
9031 out = d->target;
9033 if (odd)
9035 switch (vmode)
9037 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9038 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9039 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9040 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9041 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9042 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9043 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9044 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9045 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9046 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9047 default:
9048 return false;
9051 else
9053 switch (vmode)
9055 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9056 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9057 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9058 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9059 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9060 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9061 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9062 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9063 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9064 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9065 default:
9066 return false;
9070 emit_insn (gen (out, in0, in1));
9071 return true;
9074 /* Recognize patterns suitable for the ZIP instructions. */
9075 static bool
9076 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9078 unsigned int i, high, mask, nelt = d->nelt;
9079 rtx out, in0, in1, x;
9080 rtx (*gen) (rtx, rtx, rtx);
9081 enum machine_mode vmode = d->vmode;
9083 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9084 return false;
9086 /* Note that these are little-endian tests.
9087 We correct for big-endian later. */
9088 high = nelt / 2;
9089 if (d->perm[0] == high)
9090 /* Do Nothing. */
9092 else if (d->perm[0] == 0)
9093 high = 0;
9094 else
9095 return false;
9096 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9098 for (i = 0; i < nelt / 2; i++)
9100 unsigned elt = (i + high) & mask;
9101 if (d->perm[i * 2] != elt)
9102 return false;
9103 elt = (elt + nelt) & mask;
9104 if (d->perm[i * 2 + 1] != elt)
9105 return false;
9108 /* Success! */
9109 if (d->testing_p)
9110 return true;
9112 in0 = d->op0;
9113 in1 = d->op1;
9114 if (BYTES_BIG_ENDIAN)
9116 x = in0, in0 = in1, in1 = x;
9117 high = !high;
9119 out = d->target;
9121 if (high)
9123 switch (vmode)
9125 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9126 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9127 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9128 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9129 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9130 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9131 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9132 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9133 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9134 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9135 default:
9136 return false;
9139 else
9141 switch (vmode)
9143 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9144 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9145 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9146 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9147 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9148 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9149 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9150 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9151 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9152 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9153 default:
9154 return false;
9158 emit_insn (gen (out, in0, in1));
9159 return true;
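
/* For reference, standalone sketches (not part of this file; helper names
   are illustrative only) of the little-endian selector shapes accepted by
   the three recognizers above, written out for a two-input permute of
   NELT-element vectors.  SECOND selects trn2/uzp2/zip2 rather than
   trn1/uzp1/zip1.  */

static void
build_trn_perm (unsigned char *perm, unsigned int nelt, int second)
{
  /* TRN: even slots take i + second, odd slots take i + nelt + second.  */
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      perm[i] = i + second;
      perm[i + 1] = i + nelt + second;
    }
}

static void
build_uzp_perm (unsigned char *perm, unsigned int nelt, int second)
{
  /* UZP: slot i takes element 2 * i + second, numbering across both inputs.  */
  for (unsigned int i = 0; i < nelt; i++)
    perm[i] = 2 * i + second;
}

static void
build_zip_perm (unsigned char *perm, unsigned int nelt, int second)
{
  /* ZIP: interleave the low (zip1) or high (zip2) halves of both inputs.  */
  unsigned int base = second ? nelt / 2 : 0;
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      perm[2 * i] = base + i;
      perm[2 * i + 1] = base + i + nelt;
    }
}
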
9162 /* Recognize patterns for the EXT insn. */
9164 static bool
9165 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9167 unsigned int i, nelt = d->nelt;
9168 rtx (*gen) (rtx, rtx, rtx, rtx);
9169 rtx offset;
9171 unsigned int location = d->perm[0]; /* Always < nelt. */
9173 /* Check if the extracted indices are increasing by one. */
9174 for (i = 1; i < nelt; i++)
9176 unsigned int required = location + i;
9177 if (d->one_vector_p)
9179 /* We'll pass the same vector in twice, so allow indices to wrap. */
9180 required &= (nelt - 1);
9182 if (d->perm[i] != required)
9183 return false;
9186 switch (d->vmode)
9188 case V16QImode: gen = gen_aarch64_extv16qi; break;
9189 case V8QImode: gen = gen_aarch64_extv8qi; break;
9190 case V4HImode: gen = gen_aarch64_extv4hi; break;
9191 case V8HImode: gen = gen_aarch64_extv8hi; break;
9192 case V2SImode: gen = gen_aarch64_extv2si; break;
9193 case V4SImode: gen = gen_aarch64_extv4si; break;
9194 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9195 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9196 case V2DImode: gen = gen_aarch64_extv2di; break;
9197 case V2DFmode: gen = gen_aarch64_extv2df; break;
9198 default:
9199 return false;
9202 /* Success! */
9203 if (d->testing_p)
9204 return true;
9206 /* The case where (location == 0) is a no-op for both big- and little-endian,
9207 and is removed by the mid-end at optimization levels -O1 and higher. */
9209 if (BYTES_BIG_ENDIAN && (location != 0))
9211 /* After setup, we want the high elements of the first vector (stored
9212 at the LSB end of the register), and the low elements of the second
9213 vector (stored at the MSB end of the register). So swap. */
9214 rtx temp = d->op0;
9215 d->op0 = d->op1;
9216 d->op1 = temp;
9217 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9218 location = nelt - location;
9221 offset = GEN_INT (location);
9222 emit_insn (gen (d->target, d->op0, d->op1, offset));
9223 return true;
9226 /* Recognize patterns for the REV insns. */
9228 static bool
9229 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9231 unsigned int i, j, diff, nelt = d->nelt;
9232 rtx (*gen) (rtx, rtx);
9234 if (!d->one_vector_p)
9235 return false;
9237 diff = d->perm[0];
9238 switch (diff)
9240 case 7:
9241 switch (d->vmode)
9243 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9244 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9245 default:
9246 return false;
9248 break;
9249 case 3:
9250 switch (d->vmode)
9252 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9253 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9254 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9255 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9256 default:
9257 return false;
9259 break;
9260 case 1:
9261 switch (d->vmode)
9263 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9264 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9265 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9266 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9267 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9268 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9269 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9270 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9271 default:
9272 return false;
9274 break;
9275 default:
9276 return false;
9279 for (i = 0; i < nelt ; i += diff + 1)
9280 for (j = 0; j <= diff; j += 1)
9282 /* This is guaranteed to hold: DIFF is 7, 3 or 1, so each
9283 group of DIFF + 1 elements fits within the vector, and any
9284 other value of DIFF has already been rejected by the
9285 switch above.  Reaching here with a different DIFF would
9286 mean something went wrong earlier. */
9287 gcc_assert (i + j < nelt);
9288 if (d->perm[i + j] != i + diff - j)
9289 return false;
9292 /* Success! */
9293 if (d->testing_p)
9294 return true;
9296 emit_insn (gen (d->target, d->op0));
9297 return true;
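
/* A standalone sketch (not part of this file; the helper name is
   illustrative only) of the selector shape accepted above: elements are
   reversed within groups of DIFF + 1, where DIFF is 7, 3 or 1.  For
   V8QImode and DIFF == 3 this yields { 3, 2, 1, 0, 7, 6, 5, 4 }, i.e. a
   REV32 of byte elements.  */

static void
build_rev_perm (unsigned char *perm, unsigned int nelt, unsigned int diff)
{
  for (unsigned int i = 0; i < nelt; i += diff + 1)
    for (unsigned int j = 0; j <= diff; j++)
      perm[i + j] = i + diff - j;
}

/* Recognize broadcasts of a single element, suitable for the DUP
   instruction.  */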
9300 static bool
9301 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9303 rtx (*gen) (rtx, rtx, rtx);
9304 rtx out = d->target;
9305 rtx in0;
9306 enum machine_mode vmode = d->vmode;
9307 unsigned int i, elt, nelt = d->nelt;
9308 rtx lane;
9310 elt = d->perm[0];
9311 for (i = 1; i < nelt; i++)
9313 if (elt != d->perm[i])
9314 return false;
9317 /* The generic preparation in aarch64_expand_vec_perm_const_1
9318 swaps the operand order and the permute indices if it finds
9319 d->perm[0] to be in the second operand. Thus, we can always
9320 use d->op0 and need not do any extra arithmetic to get the
9321 correct lane number. */
9322 in0 = d->op0;
9323 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
9325 switch (vmode)
9327 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9328 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9329 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9330 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9331 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9332 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9333 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9334 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9335 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9336 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9337 default:
9338 return false;
9341 emit_insn (gen (out, in0, lane));
9342 return true;
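
/* Use a TBL-based variable permutation as a catch-all for V8QImode and
   V16QImode selectors.  */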
9345 static bool
9346 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9348 rtx rperm[MAX_VECT_LEN], sel;
9349 enum machine_mode vmode = d->vmode;
9350 unsigned int i, nelt = d->nelt;
9352 if (d->testing_p)
9353 return true;
9355 /* Generic code will try constant permutation twice. Once with the
9356 original mode and again with the elements lowered to QImode.
9357 So wait and don't do the selector expansion ourselves. */
9358 if (vmode != V8QImode && vmode != V16QImode)
9359 return false;
9361 for (i = 0; i < nelt; ++i)
9363 int nunits = GET_MODE_NUNITS (vmode);
9365 /* If big-endian and two vectors we end up with a weird mixed-endian
9366 mode on NEON. Reverse the index within each word but not the word
9367 itself. */
9368 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9369 : d->perm[i]);
9371 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9372 sel = force_reg (vmode, sel);
9374 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9375 return true;
9378 static bool
9379 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9381 /* The pattern matching functions above are written to look for a small
9382 number to begin the sequence (0, 1, N/2). If we begin with an index
9383 from the second operand, we can swap the operands. */
9384 if (d->perm[0] >= d->nelt)
9386 unsigned i, nelt = d->nelt;
9387 rtx x;
9389 gcc_assert (nelt == (nelt & -nelt));
9390 for (i = 0; i < nelt; ++i)
9391 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
9393 x = d->op0;
9394 d->op0 = d->op1;
9395 d->op1 = x;
9398 if (TARGET_SIMD)
9400 if (aarch64_evpc_rev (d))
9401 return true;
9402 else if (aarch64_evpc_ext (d))
9403 return true;
9404 else if (aarch64_evpc_dup (d))
9405 return true;
9406 else if (aarch64_evpc_zip (d))
9407 return true;
9408 else if (aarch64_evpc_uzp (d))
9409 return true;
9410 else if (aarch64_evpc_trn (d))
9411 return true;
9412 return aarch64_evpc_tbl (d);
9414 return false;
9417 /* Expand a vec_perm_const pattern. */
9419 bool
9420 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9422 struct expand_vec_perm_d d;
9423 int i, nelt, which;
9425 d.target = target;
9426 d.op0 = op0;
9427 d.op1 = op1;
9429 d.vmode = GET_MODE (target);
9430 gcc_assert (VECTOR_MODE_P (d.vmode));
9431 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9432 d.testing_p = false;
9434 for (i = which = 0; i < nelt; ++i)
9436 rtx e = XVECEXP (sel, 0, i);
9437 int ei = INTVAL (e) & (2 * nelt - 1);
9438 which |= (ei < nelt ? 1 : 2);
9439 d.perm[i] = ei;
9442 switch (which)
9444 default:
9445 gcc_unreachable ();
9447 case 3:
9448 d.one_vector_p = false;
9449 if (!rtx_equal_p (op0, op1))
9450 break;
9452 /* The elements of PERM do not suggest that only the first operand
9453 is used, but both operands are identical. Allow easier matching
9454 of the permutation by folding the permutation into the single
9455 input vector. */
9456 /* Fall Through. */
9457 case 2:
9458 for (i = 0; i < nelt; ++i)
9459 d.perm[i] &= nelt - 1;
9460 d.op0 = op1;
9461 d.one_vector_p = true;
9462 break;
9464 case 1:
9465 d.op1 = op0;
9466 d.one_vector_p = true;
9467 break;
9470 return aarch64_expand_vec_perm_const_1 (&d);
9473 static bool
9474 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9475 const unsigned char *sel)
9477 struct expand_vec_perm_d d;
9478 unsigned int i, nelt, which;
9479 bool ret;
9481 d.vmode = vmode;
9482 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9483 d.testing_p = true;
9484 memcpy (d.perm, sel, nelt);
9486 /* Calculate whether all elements are in one vector. */
9487 for (i = which = 0; i < nelt; ++i)
9489 unsigned char e = d.perm[i];
9490 gcc_assert (e < 2 * nelt);
9491 which |= (e < nelt ? 1 : 2);
9494 /* If all elements are from the second vector, reindex as if from the
9495 first vector. */
9496 if (which == 2)
9497 for (i = 0; i < nelt; ++i)
9498 d.perm[i] -= nelt;
9500 /* Check whether the mask can be applied to a single vector. */
9501 d.one_vector_p = (which != 3);
9503 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9504 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9505 if (!d.one_vector_p)
9506 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9508 start_sequence ();
9509 ret = aarch64_expand_vec_perm_const_1 (&d);
9510 end_sequence ();
9512 return ret;
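
/* A standalone sketch (not part of this file; the helper name is
   illustrative only) of the operand classification shared by the two
   routines above: WHICH accumulates bit 0 if any selector index refers to
   the first input and bit 1 if any refers to the second; when only one
   input is referenced, the indices are folded back into [0, NELT - 1] and
   the permute is treated as a single-vector operation.  */

static unsigned int
classify_perm (unsigned char *perm, unsigned int nelt)
{
  unsigned int which = 0;

  for (unsigned int i = 0; i < nelt; i++)
    {
      perm[i] &= 2 * nelt - 1;  /* Indices are taken modulo 2 * NELT.  */
      which |= (perm[i] < nelt ? 1 : 2);
    }

  if (which == 2)               /* Only the second input is referenced.  */
    for (unsigned int i = 0; i < nelt; i++)
      perm[i] -= nelt;          /* Reindex as if from the first input.  */

  return which;
}
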
9515 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9516 bool
9517 aarch64_cannot_change_mode_class (enum machine_mode from,
9518 enum machine_mode to,
9519 enum reg_class rclass)
9521 /* Full-reg subregs are allowed on general regs or any class if they are
9522 the same size. */
9523 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9524 || !reg_classes_intersect_p (FP_REGS, rclass))
9525 return false;
9527 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9528 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9529 2. Scalar to Scalar for integer modes or same size float modes.
9530 3. Vector to Vector modes.
9531 4. On little-endian only, Vector-Structure to Vector modes. */
9532 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9534 if (aarch64_vector_mode_supported_p (from)
9535 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9536 return false;
9538 if (GET_MODE_NUNITS (from) == 1
9539 && GET_MODE_NUNITS (to) == 1
9540 && (GET_MODE_CLASS (from) == MODE_INT
9541 || from == to))
9542 return false;
9544 if (aarch64_vector_mode_supported_p (from)
9545 && aarch64_vector_mode_supported_p (to))
9546 return false;
9548 /* Within a vector structure straddling multiple vector registers
9549 we are in a mixed-endian representation. As such, we can't
9550 easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
9551 switch between vectors and vector structures cheaply. */
9552 if (!BYTES_BIG_ENDIAN)
9553 if ((aarch64_vector_mode_supported_p (from)
9554 && aarch64_vect_struct_mode_p (to))
9555 || (aarch64_vector_mode_supported_p (to)
9556 && aarch64_vect_struct_mode_p (from)))
9557 return false;
9560 return true;
9563 /* Implement MODES_TIEABLE_P. */
9565 bool
9566 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9568 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9569 return true;
9571 /* We specifically want to allow elements of "structure" modes to
9572 be tieable to the structure. This more general condition allows
9573 other rarer situations too. */
9574 if (TARGET_SIMD
9575 && aarch64_vector_mode_p (mode1)
9576 && aarch64_vector_mode_p (mode2))
9577 return true;
9579 return false;
9582 /* Return a new RTX holding the result of moving POINTER forward by
9583 AMOUNT bytes. */
9585 static rtx
9586 aarch64_move_pointer (rtx pointer, int amount)
9588 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9590 return adjust_automodify_address (pointer, GET_MODE (pointer),
9591 next, amount);
9594 /* Return a new RTX holding the result of moving POINTER forward by the
9595 size of the mode it points to. */
9597 static rtx
9598 aarch64_progress_pointer (rtx pointer)
9600 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9602 return aarch64_move_pointer (pointer, amount);
9605 /* Copy one MODE-sized block from SRC to DST, then progress SRC and DST by
9606 the size of MODE. */
9608 static void
9609 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9610 enum machine_mode mode)
9612 rtx reg = gen_reg_rtx (mode);
9614 /* "Cast" the pointers to the correct mode. */
9615 *src = adjust_address (*src, mode, 0);
9616 *dst = adjust_address (*dst, mode, 0);
9617 /* Emit the memcpy. */
9618 emit_move_insn (reg, *src);
9619 emit_move_insn (*dst, reg);
9620 /* Move the pointers forward. */
9621 *src = aarch64_progress_pointer (*src);
9622 *dst = aarch64_progress_pointer (*dst);
9625 /* Expand movmem, as if from a __builtin_memcpy. Return true if
9626 we succeed, otherwise return false. */
9628 bool
9629 aarch64_expand_movmem (rtx *operands)
9631 unsigned int n;
9632 rtx dst = operands[0];
9633 rtx src = operands[1];
9634 rtx base;
9635 bool speed_p = !optimize_function_for_size_p (cfun);
9637 /* When optimizing for size, give a better estimate of the length of a
9638 memcpy call, but use the default otherwise. */
9639 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9641 /* We can't do anything smart if the amount to copy is not constant. */
9642 if (!CONST_INT_P (operands[2]))
9643 return false;
9645 n = UINTVAL (operands[2]);
9647 /* Try to keep the number of instructions low. For cases below 16 bytes we
9648 need to make at most two moves. For cases above 16 bytes it will be one
9649 move for each 16 byte chunk, then at most two additional moves. */
9650 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9651 return false;
9653 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9654 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9656 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9657 src = adjust_automodify_address (src, VOIDmode, base, 0);
9659 /* Simple cases.  Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
9660 then a 1-byte chunk. */
9661 if (n < 4)
9663 if (n >= 2)
9665 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9666 n -= 2;
9669 if (n == 1)
9670 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9672 return true;
9675 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
9676 4-byte chunk, partially overlapping with the previously copied chunk. */
9677 if (n < 8)
9679 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9680 n -= 4;
9681 if (n > 0)
9683 int move = n - 4;
9685 src = aarch64_move_pointer (src, move);
9686 dst = aarch64_move_pointer (dst, move);
9687 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9689 return true;
9692 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
9693 them, then (if applicable) an 8-byte chunk. */
9694 while (n >= 8)
9696 if (n / 16)
9698 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9699 n -= 16;
9701 else
9703 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9704 n -= 8;
9708 /* Finish the final bytes of the copy. We can always do this in one
9709 instruction. We either copy the exact amount we need, or partially
9710 overlap with the previous chunk we copied and copy 4 or 8 bytes.
9711 if (n == 0)
9712 return true;
9713 else if (n == 1)
9714 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9715 else if (n == 2)
9716 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9717 else if (n == 4)
9718 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9719 else
9721 if (n == 3)
9723 src = aarch64_move_pointer (src, -1);
9724 dst = aarch64_move_pointer (dst, -1);
9725 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9727 else
9729 int move = n - 8;
9731 src = aarch64_move_pointer (src, move);
9732 dst = aarch64_move_pointer (dst, move);
9733 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9737 return true;
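
/* A standalone sketch (not part of this file; the helper name is
   illustrative only) of the overlapping-chunk strategy used above, written
   with plain memcpy calls on non-overlapping SRC and DST buffers: copy
   16-byte (then 8-byte) blocks while possible, and finish with a single
   tail copy that may partially overlap bytes already copied, so no more
   than one extra move is needed at the end.  */

#include <string.h>

static void
copy_like_movmem (char *dst, const char *src, unsigned int n)
{
  unsigned int total = n, offset = 0;

  if (n < 4)
    {
      if (n >= 2)
        {
          memcpy (dst, src, 2);
          offset = 2;
        }
      if (n - offset == 1)
        memcpy (dst + offset, src + offset, 1);
      return;
    }

  if (n < 8)
    {
      memcpy (dst, src, 4);
      if (n > 4)
        memcpy (dst + n - 4, src + n - 4, 4);   /* Overlapping tail.  */
      return;
    }

  while (n >= 8)
    {
      unsigned int chunk = n >= 16 ? 16 : 8;
      memcpy (dst + offset, src + offset, chunk);
      offset += chunk;
      n -= chunk;
    }

  if (n == 1 || n == 2 || n == 4)
    memcpy (dst + offset, src + offset, n);             /* Exact tail.  */
  else if (n == 3)
    memcpy (dst + total - 4, src + total - 4, 4);       /* Overlapping 4 bytes.  */
  else if (n != 0)                                      /* n is 5, 6 or 7.  */
    memcpy (dst + total - 8, src + total - 8, 8);       /* Overlapping 8 bytes.  */
}
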
9740 #undef TARGET_ADDRESS_COST
9741 #define TARGET_ADDRESS_COST aarch64_address_cost
9743 /* This hook determines whether unnamed bitfields affect the alignment
9744 of the containing structure. The hook returns true if the structure
9745 should inherit the alignment requirements of an unnamed bitfield's
9746 type. */
9747 #undef TARGET_ALIGN_ANON_BITFIELD
9748 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9750 #undef TARGET_ASM_ALIGNED_DI_OP
9751 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9753 #undef TARGET_ASM_ALIGNED_HI_OP
9754 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9756 #undef TARGET_ASM_ALIGNED_SI_OP
9757 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9759 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9760 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9761 hook_bool_const_tree_hwi_hwi_const_tree_true
9763 #undef TARGET_ASM_FILE_START
9764 #define TARGET_ASM_FILE_START aarch64_start_file
9766 #undef TARGET_ASM_OUTPUT_MI_THUNK
9767 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9769 #undef TARGET_ASM_SELECT_RTX_SECTION
9770 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9772 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9773 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9775 #undef TARGET_BUILD_BUILTIN_VA_LIST
9776 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9778 #undef TARGET_CALLEE_COPIES
9779 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9781 #undef TARGET_CAN_ELIMINATE
9782 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9784 #undef TARGET_CANNOT_FORCE_CONST_MEM
9785 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9787 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9788 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9790 /* Only the least significant bit is used for initialization guard
9791 variables. */
9792 #undef TARGET_CXX_GUARD_MASK_BIT
9793 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9795 #undef TARGET_C_MODE_FOR_SUFFIX
9796 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9798 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9799 #undef TARGET_DEFAULT_TARGET_FLAGS
9800 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9801 #endif
9803 #undef TARGET_CLASS_MAX_NREGS
9804 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9806 #undef TARGET_BUILTIN_DECL
9807 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9809 #undef TARGET_EXPAND_BUILTIN
9810 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9812 #undef TARGET_EXPAND_BUILTIN_VA_START
9813 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9815 #undef TARGET_FOLD_BUILTIN
9816 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9818 #undef TARGET_FUNCTION_ARG
9819 #define TARGET_FUNCTION_ARG aarch64_function_arg
9821 #undef TARGET_FUNCTION_ARG_ADVANCE
9822 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9824 #undef TARGET_FUNCTION_ARG_BOUNDARY
9825 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9827 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9828 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9830 #undef TARGET_FUNCTION_VALUE
9831 #define TARGET_FUNCTION_VALUE aarch64_function_value
9833 #undef TARGET_FUNCTION_VALUE_REGNO_P
9834 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9836 #undef TARGET_FRAME_POINTER_REQUIRED
9837 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9839 #undef TARGET_GIMPLE_FOLD_BUILTIN
9840 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9842 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9843 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9845 #undef TARGET_INIT_BUILTINS
9846 #define TARGET_INIT_BUILTINS aarch64_init_builtins
9848 #undef TARGET_LEGITIMATE_ADDRESS_P
9849 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9851 #undef TARGET_LEGITIMATE_CONSTANT_P
9852 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9854 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9855 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9857 #undef TARGET_LRA_P
9858 #define TARGET_LRA_P aarch64_lra_p
9860 #undef TARGET_MANGLE_TYPE
9861 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9863 #undef TARGET_MEMORY_MOVE_COST
9864 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9866 #undef TARGET_MUST_PASS_IN_STACK
9867 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9869 /* This target hook should return true if accesses to volatile bitfields
9870 should use the narrowest mode possible. It should return false if these
9871 accesses should use the bitfield container type. */
9872 #undef TARGET_NARROW_VOLATILE_BITFIELD
9873 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9875 #undef TARGET_OPTION_OVERRIDE
9876 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9878 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9879 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9880 aarch64_override_options_after_change
9882 #undef TARGET_PASS_BY_REFERENCE
9883 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9885 #undef TARGET_PREFERRED_RELOAD_CLASS
9886 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9888 #undef TARGET_SECONDARY_RELOAD
9889 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9891 #undef TARGET_SHIFT_TRUNCATION_MASK
9892 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9894 #undef TARGET_SETUP_INCOMING_VARARGS
9895 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9897 #undef TARGET_STRUCT_VALUE_RTX
9898 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
9900 #undef TARGET_REGISTER_MOVE_COST
9901 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9903 #undef TARGET_RETURN_IN_MEMORY
9904 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9906 #undef TARGET_RETURN_IN_MSB
9907 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9909 #undef TARGET_RTX_COSTS
9910 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9912 #undef TARGET_SCHED_ISSUE_RATE
9913 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9915 #undef TARGET_TRAMPOLINE_INIT
9916 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9918 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9919 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9921 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9922 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9924 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9925 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9927 #undef TARGET_VECTORIZE_ADD_STMT_COST
9928 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
9930 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
9931 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
9932 aarch64_builtin_vectorization_cost
9934 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
9935 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
9937 #undef TARGET_VECTORIZE_BUILTINS
9938 #define TARGET_VECTORIZE_BUILTINS
9940 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
9941 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
9942 aarch64_builtin_vectorized_function
9944 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
9945 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
9946 aarch64_autovectorize_vector_sizes
9948 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
9949 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
9950 aarch64_atomic_assign_expand_fenv
9952 /* Section anchor support. */
9954 #undef TARGET_MIN_ANCHOR_OFFSET
9955 #define TARGET_MIN_ANCHOR_OFFSET -256
9957 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
9958 byte offset; we can do much more for larger data types, but have no way
9959 to determine the size of the access. We assume accesses are aligned. */
9960 #undef TARGET_MAX_ANCHOR_OFFSET
9961 #define TARGET_MAX_ANCHOR_OFFSET 4095
9963 #undef TARGET_VECTOR_ALIGNMENT
9964 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
9966 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
9967 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
9968 aarch64_simd_vector_alignment_reachable
9970 /* vec_perm support. */
9972 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
9973 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
9974 aarch64_vectorize_vec_perm_const_ok
9977 #undef TARGET_FIXED_CONDITION_CODE_REGS
9978 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
9980 #undef TARGET_FLAGS_REGNUM
9981 #define TARGET_FLAGS_REGNUM CC_REGNUM
9983 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
9984 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
9986 struct gcc_target targetm = TARGET_INITIALIZER;
9988 #include "gt-aarch64.h"